wip add more stuff

This commit is contained in:
Richard Feldman 2019-09-17 20:06:57 -04:00
parent 9863268793
commit 4713087bb2
7 changed files with 422 additions and 78 deletions

View file

@ -36,25 +36,39 @@ pub enum Expr<'a> {
// List literals // List literals
EmptyList, EmptyList,
List(Vec<'a, Loc<Expr<'a>>>), List(Vec<'a, Loc<Expr<'a>>>),
// // Lookups // Lookups
// Var(&'a str), Var(&'a [&'a str], &'a str),
Variant(&'a [&'a str], &'a str),
// // Pattern Matching // // Pattern Matching
// Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])), When(&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)]),
// Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)), // Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
// /// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>) // /// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
// Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)), // Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
// // Application // Application
// Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])), /// To apply by name, do Apply(Var(...), ...)
// ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])), /// To apply a variant by name, do Apply(Variant(...), ...)
// Variant(&'a VariantName), Apply(&'a (Loc<Expr<'a>>, &'a [Loc<Expr<'a>>])),
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)),
// Product Types // Product Types
EmptyRecord, EmptyRecord,
// // Sugar /// e.g. `(expr).foo.bar`
// If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)), Field(&'a Expr<'a>, &'a [&'a str]),
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)), /// e.g. `Foo.Bar.baz.qux`
QualifiedField(&'a [&'a str], &'a [&'a str]),
/// e.g. `.foo`
AccessorFunction(&'a str),
// Conditionals
If(&'a Loc<Expr<'a>>),
Then(&'a Loc<Expr<'a>>),
Else(&'a Loc<Expr<'a>>),
Case(&'a Loc<Expr<'a>>),
// Problems
MalformedIdent(&'a str),
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug, PartialEq)]

View file

@ -1,27 +1,102 @@
use bumpalo::collections::string::String; use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump; use bumpalo::Bump;
use parse::ast::Attempting; use parse::ast::Attempting;
use parse::parser::{ use parse::parser::{unexpected, unexpected_eof, Fail, ParseResult, Parser, State};
unexpected, unexpected_eof, Fail, FailReason, Keyword, ParseResult, Parser, State,
};
/// An identifier as produced by the identifier parser, classified by shape.
///
/// The parser accepts all of these in any position where any one of them could
/// appear. This way, canonicalization can give more helpful error messages like
/// "you can't redefine this variant!" if you wrote `Foo = ...` or
/// "you can only define unqualified constants" if you wrote `Foo.bar = ...`
#[derive(Debug, PartialEq, Eq)]
pub enum Ident<'a> {
    /// A variable, optionally module-qualified: `foo` or `Bar.Baz.foo`
    Var(MaybeQualified<'a, &'a str>),
    /// A variant, optionally module-qualified: `Foo` or `Bar.Baz.Foo`
    Variant(MaybeQualified<'a, &'a str>),
    /// A field access path, optionally module-qualified:
    /// `foo.bar` or `Foo.Bar.baz.qux`
    Field(MaybeQualified<'a, &'a [&'a str]>),
    /// An accessor function: `.foo`
    AccessorFunction(&'a str),
    /// Text that is not a well-formed identifier:
    /// `.Foo` or `foo.` or something like `foo.Bar`
    Malformed(&'a str),
}
/// An optional qualifier (the `Foo.Bar` in `Foo.Bar.baz`).
/// If module_parts is empty, this is unqualified.
#[derive(Debug, PartialEq, Eq)]
pub struct MaybeQualified<'a, Val> {
    /// The dot-separated parts of the qualifier, in source order
    /// (e.g. `["Foo", "Bar"]` for `Foo.Bar.baz`). Empty when unqualified.
    pub module_parts: &'a [&'a str],
    /// What the qualifier applies to (the `baz` in `Foo.Bar.baz`).
    pub value: Val,
}
/// Parse an identifier into a string.
///
/// This is separate from the `ident` Parser because string interpolation
/// wants to use it this way.
///
/// By design, this does not check for reserved keywords like "if", "else", etc.
/// Sometimes we may want to check for those later in the process, and give
/// more contextually-aware error messages than "unexpected `if`" or the like.
#[inline(always)] #[inline(always)]
pub fn parse_into<'a, I>( pub fn parse_into<'a, I>(
arena: &'a Bump, arena: &'a Bump,
chars: &mut I, chars: &mut I,
state: State<'a>, state: State<'a>,
) -> ParseResult<'a, (&'a str, Option<char>)> ) -> ParseResult<'a, (Ident<'a>, Option<char>)>
where where
I: Iterator<Item = char>, I: Iterator<Item = char>,
{ {
let mut buf = String::new_in(arena); let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
let mut noncapitalized_parts: Vec<&'a str> = Vec::new_in(arena);
let mut is_accessor_fn;
let mut is_capitalized;
// Identifiers must start with an ASCII letter. let malformed = |opt_bad_char: Option<char>| {
// If this doesn't, it must not be an identifier! // Reconstruct the original string that we've been parsing.
let mut full_string = String::new_in(arena);
full_string.push_str(&capitalized_parts.join("."));
full_string.push_str(&noncapitalized_parts.join("."));
if let Some(bad_char) = opt_bad_char {
full_string.push(bad_char);
}
// Consume the remaining chars in the identifier.
let mut next_char = None;
while let Some(ch) = chars.next() {
// We can't use ch.is_alphanumeric() here because that passes for
// things that are "numeric" but not ASCII digits, like `¾`
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
full_string.push(ch);
} else {
next_char = Some(ch);
break;
}
}
Ok((
(Ident::Malformed(&full_string), next_char),
state.advance_without_indenting(full_string.len())?,
))
};
// Identifiers and accessor functions must start with either a letter or a dot.
// If this starts with neither, it must be something else!
match chars.next() { match chars.next() {
Some(ch) => { Some(ch) => {
if ch.is_ascii_alphabetic() { if ch.is_alphabetic() {
buf.push(ch); part_buf.push(ch);
is_capitalized = ch.is_uppercase();
is_accessor_fn = false;
} else if ch == '.' {
is_capitalized = false;
is_accessor_fn = true;
} else { } else {
return Err(unexpected(ch, 0, state, Attempting::Identifier)); return Err(unexpected(ch, 0, state, Attempting::Identifier));
} }
@ -29,44 +104,128 @@ where
None => { None => {
return Err(unexpected_eof(0, Attempting::Identifier, state)); return Err(unexpected_eof(0, Attempting::Identifier, state));
} }
} };
let mut chars_parsed = 1;
let mut next_char = None; let mut next_char = None;
while let Some(ch) = chars.next() { while let Some(ch) = chars.next() {
// After the first character, letters, numbers, and '.' are allowed. // After the first character, only these are allowed:
if ch.is_ascii_alphanumeric() { //
buf.push(ch); // * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A dot ('.')
if ch.is_alphabetic() {
if part_buf.is_empty() {
// Capitalization is determined by the first character in the part.
is_capitalized = ch.is_uppercase();
}
part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(Some(ch));
}
part_buf.push(ch);
} else if ch == '.' { } else if ch == '.' {
panic!("TODO support qualified identifiers. Make sure we don't have consecutive dots, and that module names are capitalized but post-module nothing is capitalized."); // There are two possible errors here:
//
// 1. Having two consecutive dots is an error.
// 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
return malformed(Some(ch));
}
if is_capitalized {
capitalized_parts.push(&part_buf);
} else {
noncapitalized_parts.push(&part_buf);
}
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else { } else {
// This must be the end of the identifier. We're done! // This must be the end of the identifier. We're done!
next_char = Some(ch); next_char = Some(ch);
break; break;
} }
chars_parsed += 1;
} }
let ident_str = buf.as_str(); if part_buf.is_empty() {
// We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed!
//
// This condition might also occur if we encounter a malformed accessor like `.|`
//
// If we made it this far and don't have a next_char, then necessarily
// we have consumed a '.' char previously.
return malformed(next_char.or_else(|| Some('.')));
}
// Make sure we aren't trying to use a reserved keyword as an identifier // Record the final parts.
match Keyword::from_str(ident_str) { if is_capitalized {
Some(keyword) => Err(( capitalized_parts.push(&part_buf);
Fail { } else {
reason: FailReason::UnexpectedKeyword(keyword), noncapitalized_parts.push(&part_buf);
attempting: Attempting::Identifier, }
},
state,
)),
None => {
let state = state.advance_without_indenting(buf.len())?;
Ok(((buf.into_bump_str(), next_char), state)) let answer = if is_accessor_fn {
// Handle accessor functions first because they have the strictest requirements.
// Accessor functions may have exactly 1 noncapitalized part, and no capitalized parts.
if capitalized_parts.is_empty() && noncapitalized_parts.len() == 1 {
let value = noncapitalized_parts.iter().next().unwrap();
Ident::AccessorFunction(value)
} else {
return malformed(None);
} }
} } else {
match noncapitalized_parts.len() {
0 => {
// We have capitalized parts only, so this must be a variant.
match capitalized_parts.pop() {
Some(value) => Ident::Variant(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value,
}),
None => {
// We had neither capitalized nor noncapitalized parts,
// yet we made it this far. The only explanation is that this was
// a stray '.' drifting through the cosmos.
return Err(unexpected('.', 1, state, Attempting::Identifier));
}
}
}
1 => {
// We have exactly one noncapitalized part, so this must be a var.
let value = noncapitalized_parts.iter().next().unwrap();
Ident::Var(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value,
})
}
_ => {
// We have multiple noncapitalized parts, so this must be a field.
Ident::Field(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value: noncapitalized_parts.into_bump_slice(),
})
}
}
};
let state = state.advance_without_indenting(chars_parsed)?;
Ok(((answer, next_char), state))
} }
pub fn ident<'a>() -> impl Parser<'a, &'a str> { pub fn ident<'a>() -> impl Parser<'a, Ident<'a>> {
move |arena: &'a Bump, state: State<'a>| { move |arena: &'a Bump, state: State<'a>| {
// Discard next_char; we don't need it. // Discard next_char; we don't need it.
let ((string, _), state) = parse_into(arena, &mut state.input.chars(), state)?; let ((string, _), state) = parse_into(arena, &mut state.input.chars(), state)?;
@ -74,3 +233,49 @@ pub fn ident<'a>() -> impl Parser<'a, &'a str> {
Ok((string, state)) Ok((string, state))
} }
} }
// TESTS

/// Runs the `ident` parser over `input` and returns only the parsed
/// identifier (or the failure), discarding the resulting parser state.
///
/// The arena is supplied by the caller: the returned `Ident` carries the
/// arena's lifetime, so the arena has to outlive this call.
#[cfg(test)]
fn test_parse<'a>(arena: &'a Bump, input: &'a str) -> Result<Ident<'a>, Fail> {
    let state = State::new(input, Attempting::Expression);

    ident()
        .parse(arena, state)
        .map(|(answer, _)| answer)
        .map_err(|(err, _)| err)
}

/// Builds the expected `Ident::Var` for a test assertion.
///
/// Takes borrowed slices rather than owned vectors, because the resulting
/// `Ident` only stores references and must not outlive what it points at.
#[cfg(test)]
fn var<'a>(module_parts: &'a [&'a str], value: &'a str) -> Ident<'a> {
    Ident::Var(MaybeQualified {
        module_parts,
        value,
    })
}

/// Builds the expected `Ident::Variant` for a test assertion.
#[cfg(test)]
fn variant<'a>(module_parts: &'a [&'a str], value: &'a str) -> Ident<'a> {
    Ident::Variant(MaybeQualified {
        module_parts,
        value,
    })
}

/// Builds the expected `Ident::Field` for a test assertion.
#[cfg(test)]
fn field<'a>(module_parts: &'a [&'a str], value: &'a [&'a str]) -> Ident<'a> {
    Ident::Field(MaybeQualified {
        module_parts,
        value,
    })
}

/// Builds the expected `Ident::AccessorFunction` for a test assertion.
#[cfg(test)]
fn accessor_fn<'a>(value: &'a str) -> Ident<'a> {
    Ident::AccessorFunction(value)
}

/// Builds the expected `Ident::Malformed` for a test assertion.
#[cfg(test)]
fn malformed<'a>(value: &'a str) -> Ident<'a> {
    Ident::Malformed(value)
}

#[test]
fn parse_var() {
    let arena = Bump::new();

    // An unqualified variable has no module parts.
    assert_eq!(test_parse(&arena, "foo"), Ok(var(&[], "foo")));
}

5
src/parse/keyword.rs Normal file
View file

@ -0,0 +1,5 @@
/// The `if` keyword.
pub static IF: &str = "if";
/// The `then` keyword.
pub static THEN: &str = "then";
/// The `else` keyword.
pub static ELSE: &str = "else";
/// The `case` keyword.
pub static CASE: &str = "case";
/// The `when` keyword.
pub static WHEN: &str = "when";

View file

@ -1,39 +1,46 @@
pub mod ast; pub mod ast;
pub mod ident; pub mod ident;
pub mod keyword;
pub mod module; pub mod module;
pub mod number_literal; pub mod number_literal;
pub mod parser; pub mod parser;
pub mod problems; pub mod problems;
pub mod string_literal; pub mod string_literal;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump; use bumpalo::Bump;
use operator::Operator; use operator::Operator;
use parse::ast::{Attempting, Expr}; use parse::ast::{Attempting, Expr};
use parse::ident::{ident, Ident};
use parse::number_literal::number_literal; use parse::number_literal::number_literal;
use parse::parser::{ use parse::parser::{
and, attempt, loc, map, map_with_arena, one_of3, optional, string, unexpected, unexpected_eof, and, attempt, loc, map, map_with_arena, one_of3, one_of4, one_of6, optional, string,
ParseResult, Parser, State, unexpected, unexpected_eof, Either, ParseResult, Parser, State,
}; };
use parse::string_literal::string_literal; use parse::string_literal::string_literal;
use region::Located;
pub fn expr<'a>() -> impl Parser<'a, Expr<'a>> { pub fn expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
// Recursive parsers must not directly invoke functions which return (impl Parser), // Recursive parsers must not directly invoke functions which return (impl Parser),
// as this causes rustc to stack overflow. // as this causes rustc to stack overflow.
parse_expr move |arena, state| parse_expr(min_indent, arena, state)
} }
fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> { fn parse_expr<'a>(min_indent: u16, arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> {
map_with_arena( let expr_parser = map_with_arena(
and( and(
attempt( loc(one_of6(
Attempting::Expression, record_literal(),
loc(one_of3( number_literal(),
record_literal(), string_literal(),
number_literal(), when(min_indent),
string_literal(), conditional(min_indent),
)), ident_etc(min_indent),
), )),
optional(and(loc(operator()), loc(parse_expr))), optional(and(
loc(operator()),
loc(move |arena, state| parse_expr(min_indent, arena, state)),
)),
), ),
|arena, (loc_expr1, opt_operator)| match opt_operator { |arena, (loc_expr1, opt_operator)| match opt_operator {
Some((loc_op, loc_expr2)) => { Some((loc_op, loc_expr2)) => {
@ -43,8 +50,89 @@ fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>
} }
None => loc_expr1.value, None => loc_expr1.value,
}, },
);
attempt(Attempting::Expression, expr_parser).parse(arena, state)
}
pub fn loc_function_args<'a>(min_indent: u16) -> impl Parser<'a, &'a [Located<Expr<'a>>]> {
move |arena, state| {
panic!("TODO stop early if we see an operator after the whitespace - precedence!");
// zero_or_more(after(one_or_more(whitespace(min_indent)), function_arg()))
}
}
/// Parser for a `when` expression.
///
/// Only the keyword itself is recognized so far; once it has been matched,
/// the mapping step panics because the rest is not implemented.
pub fn when<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    let when_keyword = string(keyword::WHEN);

    map(when_keyword, |_| {
        panic!("TODO implement WHEN");
    })
}
/// Parser for a single keyword-led conditional clause.
///
/// Tries, in this order, an `if`, `then`, `else` or `case` clause, each built
/// by `cond_help` and wrapped in the matching `Expr` variant.
pub fn conditional<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    let if_clause = cond_help(keyword::IF, Expr::If, min_indent);
    let then_clause = cond_help(keyword::THEN, Expr::Then, min_indent);
    let else_clause = cond_help(keyword::ELSE, Expr::Else, min_indent);
    let case_clause = cond_help(keyword::CASE, Expr::Case, min_indent);

    one_of4(if_clause, then_clause, else_clause, case_clause)
}
fn cond_help<'a, F>(name: &str, wrap_expr: F, min_indent: u16) -> impl Parser<'a, Expr<'a>>
where
F: Fn(&'a Located<Expr<'a>>) -> Expr<'a>,
{
map(
after(
after(string(name), skip1_whitespace(min_indent)),
loc(expr(min_indent)),
),
wrap_expr,
)
}
/// When we parse an ident like `foo ` it could be any of these:
///
/// 1. A standalone variable with trailing whitespace (e.g. because an operator is next)
/// 2. The beginning of a function call (e.g. `foo bar baz`)
/// 3. The beginning of a defniition (e.g. `foo =`)
/// 4. A reserved keyword (e.g. `if ` or `case `), meaning we should do something else.
pub fn ident_etc<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
let followed_by_equals = after(zero_or_more(whitespace(min_indent), char('=')));
map_with_arena(
and(
loc(ident()),
either(followed_by_equals, loc_function_args(min_indent)),
),
|arena, (loc_ident, equals_or_loc_args)| {
match equals_or_loc_args {
Either::First(()) => {
// We have now parsed the beginning of a def (e.g. `foo =`)
}
Either::Second(loc_args) => {
// This appears to be a var, keyword, or function application.
let name_expr = match loc_ident.value {
Ident::Var(info) => Expr::Var(info.module_parts, info.value),
Ident::Variant(info) => Expr::Variant(info.module_parts, info.value),
Ident::Field(info) => Expr::QualifiedField(info.module_parts, info.value),
Ident::AccessorFunction(string) => Expr::AccessorFunction(string),
Ident::Malformed(string) => Expr::MalformedIdent(string),
};
if loc_args.is_empty() {
name_expr
} else {
let loc_expr = Located {
region: loc_ident.region,
value: name_expr,
};
Expr::Apply(arena.alloc((loc_expr, loc_args)))
}
}
}
},
) )
.parse(arena, state)
} }
pub fn operator<'a>() -> impl Parser<'a, Operator> { pub fn operator<'a>() -> impl Parser<'a, Operator> {

View file

@ -35,6 +35,12 @@ pub struct State<'a> {
pub attempting: Attempting, pub attempting: Attempting,
} }
/// A value that is exactly one of two alternatives, each with its own type.
#[derive(Debug, PartialEq, Eq)]
pub enum Either<First, Second> {
    /// The first alternative.
    First(First),
    /// The second alternative.
    Second(Second),
}
impl<'a> State<'a> { impl<'a> State<'a> {
pub fn new(input: &'a str, attempting: Attempting) -> State<'a> { pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
State { State {
@ -151,7 +157,6 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
pub enum FailReason { pub enum FailReason {
Unexpected(char, Region), Unexpected(char, Region),
UnexpectedKeyword(Keyword),
ConditionFailed, ConditionFailed,
LineTooLong(u32 /* which line was too long */), LineTooLong(u32 /* which line was too long */),
TooManyLines, TooManyLines,
@ -164,28 +169,6 @@ pub struct Fail {
pub reason: FailReason, pub reason: FailReason,
} }
/// The reserved words of the language.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Keyword {
    If,
    Then,
    Else,
    Case,
    When,
}

impl Keyword {
    /// Returns the keyword spelled exactly by `kw`, or `None` if `kw` is not
    /// a keyword. The comparison is case-sensitive and matches the whole string.
    pub fn from_str(kw: &str) -> Option<Keyword> {
        // Spelling of each keyword, paired with the variant it stands for.
        const SPELLINGS: [(&str, Keyword); 5] = [
            ("if", Keyword::If),
            ("then", Keyword::Then),
            ("else", Keyword::Else),
            ("case", Keyword::Case),
            ("when", Keyword::When),
        ];

        SPELLINGS
            .iter()
            .find(|entry| entry.0 == kw)
            .map(|entry| entry.1.clone())
    }
}
pub trait Parser<'a, Output> { pub trait Parser<'a, Output> {
fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>; fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>;
} }
@ -271,6 +254,31 @@ where
} }
} }
/// Runs `parser` repeatedly, collecting every output into an arena-allocated
/// `Vec`, until the parser fails.
///
/// The returned parser never fails. The first failure ends the run: its error
/// is discarded, and the state carried by that failure is returned together
/// with the outputs collected so far (an empty `Vec` if the very first
/// attempt failed).
pub fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
where
    P: Parser<'a, A>,
{
    move |arena, state| {
        // Nothing parsed at all is still a success, just with no outputs.
        let (first_output, mut current_state) = match parser.parse(arena, state) {
            Ok(success) => success,
            Err((_, failed_state)) => return Ok((Vec::new_in(arena), failed_state)),
        };

        let mut outputs = Vec::with_capacity_in(1, arena);
        outputs.push(first_output);

        loop {
            match parser.parse(arena, current_state) {
                Ok((output, next_state)) => {
                    outputs.push(output);
                    current_state = next_state;
                }
                Err((_, failed_state)) => return Ok((outputs, failed_state)),
            }
        }
    }
}
pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>> pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
where where
P: Parser<'a, A>, P: Parser<'a, A>,

View file

@ -898,5 +898,8 @@ mod test_canonicalize {
// } // }
// //
// TODO test what happens when interpolated strings contain 1+ malformed idents // TODO test what happens when interpolated strings contain 1+ malformed idents
//
// TODO test hex/oct/binary conversion to numbers // TODO test hex/oct/binary conversion to numbers
//
// TODO test for \t \r and \n in string literals *outside* unicode escape sequence!
} }

View file

@ -282,10 +282,31 @@ mod test_parse {
assert_eq!(Ok(expected), actual); assert_eq!(Ok(expected), actual);
} }
#[test]
fn multiple_operators() {
    let arena = Bump::new();

    // The expected tree nests to the right: `31 * (42 + 534)`.
    let sum = arena.alloc((
        Located::new(0, 3, 0, 5, Int("42")),
        Located::new(0, 5, 0, 6, Plus),
        Located::new(0, 6, 0, 9, Int("534")),
    ));
    let product = arena.alloc((
        Located::new(0, 0, 0, 2, Int("31")),
        Located::new(0, 2, 0, 3, Star),
        Located::new(0, 3, 0, 9, Operator(sum)),
    ));

    assert_eq!(Ok(Operator(product)), parse_with(&arena, "31*42+534"));
}
// TODO test hex/oct/binary parsing // TODO test hex/oct/binary parsing
// //
// TODO test for \t \r and \n in string literals *outside* unicode escape sequence! // TODO test for \t \r and \n in string literals *outside* unicode escape sequence!
// //
// TODO test for non-ASCII variables
//
// TODO verify that when a string literal contains a newline before the // TODO verify that when a string literal contains a newline before the
// closing " it correctly updates both the line *and* column in the State. // closing " it correctly updates both the line *and* column in the State.
} }