diff --git a/src/parse/ast.rs b/src/parse/ast.rs index d7c282123f..1057ed3f3c 100644 --- a/src/parse/ast.rs +++ b/src/parse/ast.rs @@ -36,25 +36,39 @@ pub enum Expr<'a> { // List literals EmptyList, List(Vec<'a, Loc>>), - // // Lookups - // Var(&'a str), + // Lookups + Var(&'a [&'a str], &'a str), + Variant(&'a [&'a str], &'a str), // // Pattern Matching - // Case(&'a (Loc>, [(Loc>, Loc>)])), + When(&'a [(Loc>, Loc>)]), // Closure(&'a (&'a [Loc>], Loc>)), // /// basically Assign(Vec<(Loc, Loc)>, Loc) // Assign(&'a (&'a [(Loc>, Loc>)], Loc>)), - // // Application - // Call(&'a (Loc>, [Loc>])), - // ApplyVariant(&'a (&'a VariantName, [Loc>])), - // Variant(&'a VariantName), + // Application + /// To apply by name, do Apply(Var(...), ...) + /// To apply a variant by name, do Apply(Variant(...), ...) + Apply(&'a (Loc>, &'a [Loc>])), + Operator(&'a (Loc>, Loc, Loc>)), // Product Types EmptyRecord, - // // Sugar - // If(&'a (Loc>, Loc>, Loc>)), - Operator(&'a (Loc>, Loc, Loc>)), + /// e.g. `(expr).foo.bar` + Field(&'a Expr<'a>, &'a [&'a str]), + /// e.g. `Foo.Bar.baz.qux` + QualifiedField(&'a [&'a str], &'a [&'a str]), + /// e.g. `.foo` + AccessorFunction(&'a str), + + // Conditionals + If(&'a Loc>), + Then(&'a Loc>), + Else(&'a Loc>), + Case(&'a Loc>), + + // Problems + MalformedIdent(&'a str), } #[derive(Clone, Debug, PartialEq)] diff --git a/src/parse/ident.rs b/src/parse/ident.rs index 1b73aadf9e..a723beb5a6 100644 --- a/src/parse/ident.rs +++ b/src/parse/ident.rs @@ -1,27 +1,102 @@ use bumpalo::collections::string::String; +use bumpalo::collections::vec::Vec; use bumpalo::Bump; use parse::ast::Attempting; -use parse::parser::{ - unexpected, unexpected_eof, Fail, FailReason, Keyword, ParseResult, Parser, State, -}; +use parse::parser::{unexpected, unexpected_eof, Fail, ParseResult, Parser, State}; +/// The parser accepts all of these in any position where any one of them could +/// appear. 
This way, canonicalization can give more helpful error messages like +/// "you can't redefine this variant!" if you wrote `Foo = ...` or +/// "you can only define unqualified constants" if you wrote `Foo.bar = ...` +#[derive(Debug, PartialEq, Eq)] +pub enum Ident<'a> { + /// foo or Bar.Baz.foo + Var(MaybeQualified<'a, &'a str>), + /// Foo or Bar.Baz.Foo + Variant(MaybeQualified<'a, &'a str>), + /// foo.bar or Foo.Bar.baz.qux + Field(MaybeQualified<'a, &'a [&'a str]>), + /// .foo + AccessorFunction(&'a str), + /// .Foo or foo. or something like foo.Bar + Malformed(&'a str), +} + +/// An optional qualifier (the `Foo.Bar` in `Foo.Bar.baz`). +/// If module_parts is empty, this is unqualified. +#[derive(Debug, PartialEq, Eq)] +pub struct MaybeQualified<'a, Val> { + pub module_parts: &'a [&'a str], + pub value: Val, +} + +/// Parse an identifier into a string. +/// +/// This is separate from the `ident` Parser because string interpolation +/// wants to use it this way. +/// +/// By design, this does not check for reserved keywords like "if", "else", etc. +/// Sometimes we may want to check for those later in the process, and give +/// more contextually-aware error messages than "unexpected `if`" or the like. #[inline(always)] pub fn parse_into<'a, I>( arena: &'a Bump, chars: &mut I, state: State<'a>, -) -> ParseResult<'a, (&'a str, Option)> +) -> ParseResult<'a, (Ident<'a>, Option)> where I: Iterator, { - let mut buf = String::new_in(arena); + let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.) + let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena); + let mut noncapitalized_parts: Vec<&'a str> = Vec::new_in(arena); + let mut is_accessor_fn; + let mut is_capitalized; - // Identifiers must start with an ASCII letter. - // If this doesn't, it must not be an identifier! + let malformed = |opt_bad_char: Option| { + // Reconstruct the original string that we've been parsing. 
+ let mut full_string = String::new_in(arena); + + full_string.push_str(&capitalized_parts.join(".")); + full_string.push_str(&noncapitalized_parts.join(".")); + + if let Some(bad_char) = opt_bad_char { + full_string.push(bad_char); + } + + // NOTE(review): the two push_str calls above append the joined capitalized and + // noncapitalized parts back to back, with no '.' between the two groups and without + // the in-progress part_buf, so full_string (and the full_string.len() used to advance + // the state below) may not match the text that was actually consumed - confirm. + // + // Consume the remaining chars in the identifier. + let mut next_char = None; + + while let Some(ch) = chars.next() { + // We can't use ch.is_alphanumeric() here because that passes for + // things that are "numeric" but not ASCII digits, like `¾` + if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() { + full_string.push(ch); + } else { + next_char = Some(ch); + + break; + } + } + + Ok(( + (Ident::Malformed(&full_string), next_char), + state.advance_without_indenting(full_string.len())?, + )) + }; + + // Identifiers and accessor functions must start with either a letter or a dot. + // If this starts with neither, it must be something else! match chars.next() { Some(ch) => { - if ch.is_ascii_alphabetic() { - buf.push(ch); + if ch.is_alphabetic() { + part_buf.push(ch); + + is_capitalized = ch.is_uppercase(); + is_accessor_fn = false; + } else if ch == '.' { + is_capitalized = false; + is_accessor_fn = true; } else { return Err(unexpected(ch, 0, state, Attempting::Identifier)); } @@ -29,44 +104,128 @@ where None => { return Err(unexpected_eof(0, Attempting::Identifier, state)); } - } + }; + let mut chars_parsed = 1; let mut next_char = None; while let Some(ch) = chars.next() { - // After the first character, letters, numbers, and '.' are allowed. - if ch.is_ascii_alphanumeric() { - buf.push(ch); + // After the first character, only these are allowed: + // + // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers + // * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric() + // * A dot ('.') + if ch.is_alphabetic() { + if part_buf.is_empty() { + // Capitalization is determined by the first character in the part. 
+ is_capitalized = ch.is_uppercase(); + } + + part_buf.push(ch); + } else if ch.is_ascii_digit() { + // Parts may not start with numbers! + if part_buf.is_empty() { + return malformed(Some(ch)); + } + + part_buf.push(ch); } else if ch == '.' { - panic!("TODO support qualified identifiers. Make sure we don't have consecutive dots, and that module names are capitalized but post-module nothing is capitalized."); + // There are two possible errors here: + // + // 1. Having two consecutive dots is an error. + // 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error. + if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) { + return malformed(Some(ch)); + } + + if is_capitalized { + capitalized_parts.push(&part_buf); + } else { + noncapitalized_parts.push(&part_buf); + } + + // Now that we've recorded the contents of the current buffer, reset it. + part_buf = String::new_in(arena); } else { // This must be the end of the identifier. We're done! + next_char = Some(ch); break; } + + chars_parsed += 1; } - let ident_str = buf.as_str(); + if part_buf.is_empty() { + // We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed! + // + // This condition might also occur if we encounter a malformed accessor like `.|` + // + // If we made it this far and don't have a next_char, then necessarily + // we have consumed a '.' char previously. + return malformed(next_char.or_else(|| Some('.'))); + } - // Make sure we aren't trying to use a reserved keyword as an identifier - match Keyword::from_str(ident_str) { - Some(keyword) => Err(( - Fail { - reason: FailReason::UnexpectedKeyword(keyword), - attempting: Attempting::Identifier, - }, - state, - )), - None => { - let state = state.advance_without_indenting(buf.len())?; + // Record the final parts. 
+ if is_capitalized { + // A capitalized final part after noncapitalized ones (e.g. `foo.Bar`) is malformed. + // The '.' branch of the loop only catches this when another dot follows, so without + // this check `foo.Bar` would be recorded as module part `Bar` qualifying the var `foo`. + if !noncapitalized_parts.is_empty() { + return malformed(None); + } + + capitalized_parts.push(&part_buf); + } else { + noncapitalized_parts.push(&part_buf); + } - Ok(((buf.into_bump_str(), next_char), state)) + let answer = if is_accessor_fn { + // Handle accessor functions first because they have the strictest requirements. + // Accessor functions may have exactly 1 noncapitalized part, and no capitalized parts. + if capitalized_parts.is_empty() && noncapitalized_parts.len() == 1 { + let value = noncapitalized_parts.iter().next().unwrap(); + + Ident::AccessorFunction(value) + } else { + return malformed(None); + } - } + } else { + match noncapitalized_parts.len() { + 0 => { + // We have capitalized parts only, so this must be a variant. + match capitalized_parts.pop() { + Some(value) => Ident::Variant(MaybeQualified { + module_parts: capitalized_parts.into_bump_slice(), + value, + }), + None => { + // We had neither capitalized nor noncapitalized parts, + // yet we made it this far. The only explanation is that this was + // a stray '.' drifting through the cosmos. + return Err(unexpected('.', 1, state, Attempting::Identifier)); + } + } + } + 1 => { + // We have exactly one noncapitalized part, so this must be a var. + let value = noncapitalized_parts.iter().next().unwrap(); + + Ident::Var(MaybeQualified { + module_parts: capitalized_parts.into_bump_slice(), + value, + }) + } + _ => { + // We have multiple noncapitalized parts, so this must be a field. + Ident::Field(MaybeQualified { + module_parts: capitalized_parts.into_bump_slice(), + value: noncapitalized_parts.into_bump_slice(), + }) + } + } + }; + + let state = state.advance_without_indenting(chars_parsed)?; + + Ok(((answer, next_char), state)) } -pub fn ident<'a>() -> impl Parser<'a, &'a str> { +pub fn ident<'a>() -> impl Parser<'a, Ident<'a>> { move |arena: &'a Bump, state: State<'a>| { // Discard next_char; we don't need it. 
let ((string, _), state) = parse_into(arena, &mut state.input.chars(), state)?; @@ -74,3 +233,49 @@ pub fn ident<'a>() -> impl Parser<'a, &'a str> { Ok((string, state)) } } + +// TESTS + +fn test_parse<'a>(input: &'a str) -> Result, Fail> { + let arena = Bump::new(); + let state = State::new(input, Attempting::Expression); + + ident() + .parse(&arena, state) + .map(|(answer, _)| answer) + .map_err(|(err, _)| err) +} + +// NOTE(review): `var`, `variant` and `field` below return slices borrowed from their +// by-value Vec arguments, which are dropped when the helper returns - confirm these +// borrow-check, or take `&'a [&'a str]` parameters instead. +fn var<'a>(module_parts: std::vec::Vec<&'a str>, value: &'a str) -> Ident<'a> { + Ident::Var(MaybeQualified { + module_parts: module_parts.as_slice(), + value, + }) +} + +fn variant<'a>(module_parts: std::vec::Vec<&'a str>, value: &'a str) -> Ident<'a> { + Ident::Variant(MaybeQualified { + module_parts: module_parts.as_slice(), + value, + }) +} + +fn field<'a>(module_parts: std::vec::Vec<&'a str>, value: std::vec::Vec<&'a str>) -> Ident<'a> { + Ident::Field(MaybeQualified { + module_parts: module_parts.as_slice(), + value: value.as_slice(), + }) +} + +fn accessor_fn<'a>(value: &'a str) -> Ident<'a> { + Ident::AccessorFunction(value) +} + +fn malformed<'a>(value: &'a str) -> Ident<'a> { + Ident::Malformed(value) +} + +#[test] +fn parse_var() { + // An unqualified var has no module parts, so pass an empty list as the first argument. + assert_eq!(test_parse("foo"), Ok(var(vec![], "foo"))) +} diff --git a/src/parse/keyword.rs b/src/parse/keyword.rs new file mode 100644 index 0000000000..a13aeef0da --- /dev/null +++ b/src/parse/keyword.rs @@ -0,0 +1,5 @@ +pub static IF: &'static str = "if"; +pub static THEN: &'static str = "then"; +pub static ELSE: &'static str = "else"; +pub static CASE: &'static str = "case"; +pub static WHEN: &'static str = "when"; diff --git a/src/parse/mod.rs b/src/parse/mod.rs index c53e7477e6..94531a71e4 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -1,39 +1,46 @@ pub mod ast; pub mod ident; +pub mod keyword; pub mod module; pub mod number_literal; pub mod parser; pub mod problems; pub mod string_literal; +use bumpalo::collections::vec::Vec; use bumpalo::Bump; use operator::Operator; use parse::ast::{Attempting, 
Expr}; +use parse::ident::{ident, Ident}; use parse::number_literal::number_literal; use parse::parser::{ - and, attempt, loc, map, map_with_arena, one_of3, optional, string, unexpected, unexpected_eof, - ParseResult, Parser, State, + and, attempt, loc, map, map_with_arena, one_of3, one_of4, one_of6, optional, string, + unexpected, unexpected_eof, Either, ParseResult, Parser, State, }; use parse::string_literal::string_literal; +use region::Located; -pub fn expr<'a>() -> impl Parser<'a, Expr<'a>> { +pub fn expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> { // Recursive parsers must not directly invoke functions which return (impl Parser), // as this causes rustc to stack overflow. - parse_expr + move |arena, state| parse_expr(min_indent, arena, state) } -fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> { - map_with_arena( +fn parse_expr<'a>(min_indent: u16, arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> { + let expr_parser = map_with_arena( and( - attempt( - Attempting::Expression, - loc(one_of3( - record_literal(), - number_literal(), - string_literal(), - )), - ), - optional(and(loc(operator()), loc(parse_expr))), + loc(one_of6( + record_literal(), + number_literal(), + string_literal(), + when(min_indent), + conditional(min_indent), + ident_etc(min_indent), + )), + optional(and( + loc(operator()), + loc(move |arena, state| parse_expr(min_indent, arena, state)), + )), ), |arena, (loc_expr1, opt_operator)| match opt_operator { Some((loc_op, loc_expr2)) => { @@ -43,8 +50,89 @@ fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a> } None => loc_expr1.value, }, + ); + + attempt(Attempting::Expression, expr_parser).parse(arena, state) +} + +pub fn loc_function_args<'a>(min_indent: u16) -> impl Parser<'a, &'a [Located>]> { + move |arena, state| { + panic!("TODO stop early if we see an operator after the whitespace - precedence!"); + // 
zero_or_more(after(one_or_more(whitespace(min_indent)), function_arg())) + } +} + +pub fn when<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> { + map(string(keyword::WHEN), |_| { + panic!("TODO implement WHEN"); + }) +} + +pub fn conditional<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> { + one_of4( + cond_help(keyword::IF, Expr::If, min_indent), + cond_help(keyword::THEN, Expr::Then, min_indent), + cond_help(keyword::ELSE, Expr::Else, min_indent), + cond_help(keyword::CASE, Expr::Case, min_indent), + ) +} + +fn cond_help<'a, F>(name: &str, wrap_expr: F, min_indent: u16) -> impl Parser<'a, Expr<'a>> +where + F: Fn(&'a Located>) -> Expr<'a>, +{ + map( + after( + after(string(name), skip1_whitespace(min_indent)), + loc(expr(min_indent)), + ), + wrap_expr, + ) +} + +/// When we parse an ident like `foo ` it could be any of these: +/// +/// 1. A standalone variable with trailing whitespace (e.g. because an operator is next) +/// 2. The beginning of a function call (e.g. `foo bar baz`) +/// 3. The beginning of a definition (e.g. `foo =`) +/// 4. A reserved keyword (e.g. `if ` or `case `), meaning we should do something else. +pub fn ident_etc<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> { + // Optional whitespace, then '='. `zero_or_more` takes a single parser and `after` takes two. + let followed_by_equals = after(zero_or_more(whitespace(min_indent)), char('=')); + + map_with_arena( + and( + loc(ident()), + either(followed_by_equals, loc_function_args(min_indent)), + ), + |arena, (loc_ident, equals_or_loc_args)| { + match equals_or_loc_args { + Either::First(()) => { + // We have now parsed the beginning of a def (e.g. `foo =`) + } + Either::Second(loc_args) => { + // This appears to be a var, keyword, or function application. 
+ let name_expr = match loc_ident.value { + Ident::Var(info) => Expr::Var(info.module_parts, info.value), + Ident::Variant(info) => Expr::Variant(info.module_parts, info.value), + Ident::Field(info) => Expr::QualifiedField(info.module_parts, info.value), + Ident::AccessorFunction(string) => Expr::AccessorFunction(string), + Ident::Malformed(string) => Expr::MalformedIdent(string), + }; + + if loc_args.is_empty() { + name_expr + } else { + let loc_expr = Located { + region: loc_ident.region, + value: name_expr, + }; + + Expr::Apply(arena.alloc((loc_expr, loc_args))) + } + } + } + }, ) - .parse(arena, state) } pub fn operator<'a>() -> impl Parser<'a, Operator> { diff --git a/src/parse/parser.rs b/src/parse/parser.rs index b2eeb8dfd6..2aadc47893 100644 --- a/src/parse/parser.rs +++ b/src/parse/parser.rs @@ -35,6 +35,12 @@ pub struct State<'a> { pub attempting: Attempting, } +#[derive(Debug, PartialEq, Eq)] +pub enum Either { + First(First), + Second(Second), +} + impl<'a> State<'a> { pub fn new(input: &'a str, attempting: Attempting) -> State<'a> { State { @@ -151,7 +157,6 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>) #[derive(Debug, Clone, PartialEq, Eq)] pub enum FailReason { Unexpected(char, Region), - UnexpectedKeyword(Keyword), ConditionFailed, LineTooLong(u32 /* which line was too long */), TooManyLines, @@ -164,28 +169,6 @@ pub struct Fail { pub reason: FailReason, } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum Keyword { - If, - Then, - Else, - Case, - When, -} - -impl Keyword { - pub fn from_str(kw: &str) -> Option { - match kw { - "if" => Some(Keyword::If), - "then" => Some(Keyword::Then), - "else" => Some(Keyword::Else), - "case" => Some(Keyword::Case), - "when" => Some(Keyword::When), - _ => None, - } - } -} - pub trait Parser<'a, Output> { fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>; } @@ -271,6 +254,31 @@ where } } +pub fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>> +where 
+ P: Parser<'a, A>, +{ + move |arena, state| match parser.parse(arena, state) { + Ok((first_output, next_state)) => { + let mut state = next_state; + let mut buf = Vec::with_capacity_in(1, arena); + + buf.push(first_output); + + loop { + match parser.parse(arena, state) { + Ok((next_output, next_state)) => { + state = next_state; + buf.push(next_output); + } + Err((_, old_state)) => return Ok((buf, old_state)), + } + } + } + Err((_, new_state)) => return Ok((Vec::new_in(arena), new_state)), + } +} + pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>> where P: Parser<'a, A>, diff --git a/tests/test_canonicalize.rs b/tests/test_canonicalize.rs index 8d6571200e..a4711e0131 100644 --- a/tests/test_canonicalize.rs +++ b/tests/test_canonicalize.rs @@ -898,5 +898,8 @@ mod test_canonicalize { // } // // TODO test what happens when interpolated strings contain 1+ malformed idents + // // TODO test hex/oct/binary conversion to numbers + // + // TODO test for \t \r and \n in string literals *outside* unicode escape sequence! } diff --git a/tests/test_parse.rs b/tests/test_parse.rs index d751387f71..b6156c87d0 100644 --- a/tests/test_parse.rs +++ b/tests/test_parse.rs @@ -282,10 +282,31 @@ mod test_parse { assert_eq!(Ok(expected), actual); } + #[test] + fn multiple_operators() { + let arena = Bump::new(); + let inner = arena.alloc(( + Located::new(0, 3, 0, 5, Int("42")), + Located::new(0, 5, 0, 6, Plus), + Located::new(0, 6, 0, 9, Int("534")), + )); + let outer = arena.alloc(( + Located::new(0, 0, 0, 2, Int("31")), + Located::new(0, 2, 0, 3, Star), + Located::new(0, 3, 0, 9, Operator(inner)), + )); + let expected = Operator(outer); + let actual = parse_with(&arena, "31*42+534"); + + assert_eq!(Ok(expected), actual); + } + // TODO test hex/oct/binary parsing // // TODO test for \t \r and \n in string literals *outside* unicode escape sequence! 
// + // TODO test for non-ASCII variables + // // TODO verify that when a string literal contains a newline before the // closing " it correctly updates both the line *and* column in the State. }