wip add more stuff

This commit is contained in:
Richard Feldman 2019-09-17 20:06:57 -04:00
parent 9863268793
commit 4713087bb2
7 changed files with 422 additions and 78 deletions

View file

@ -36,25 +36,39 @@ pub enum Expr<'a> {
// List literals
EmptyList,
List(Vec<'a, Loc<Expr<'a>>>),
// // Lookups
// Var(&'a str),
// Lookups
Var(&'a [&'a str], &'a str),
Variant(&'a [&'a str], &'a str),
// // Pattern Matching
// Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])),
When(&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)]),
// Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
// /// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
// Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
// // Application
// Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])),
// ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])),
// Variant(&'a VariantName),
// Application
/// To apply by name, do Apply(Var(...), ...)
/// To apply a variant by name, do Apply(Variant(...), ...)
Apply(&'a (Loc<Expr<'a>>, &'a [Loc<Expr<'a>>])),
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)),
// Product Types
EmptyRecord,
// // Sugar
// If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)),
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)),
/// e.g. `(expr).foo.bar`
Field(&'a Expr<'a>, &'a [&'a str]),
/// e.g. `Foo.Bar.baz.qux`
QualifiedField(&'a [&'a str], &'a [&'a str]),
/// e.g. `.foo`
AccessorFunction(&'a str),
// Conditionals
If(&'a Loc<Expr<'a>>),
Then(&'a Loc<Expr<'a>>),
Else(&'a Loc<Expr<'a>>),
Case(&'a Loc<Expr<'a>>),
// Problems
MalformedIdent(&'a str),
}
#[derive(Clone, Debug, PartialEq)]

View file

@ -1,27 +1,102 @@
use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use parse::ast::Attempting;
use parse::parser::{
unexpected, unexpected_eof, Fail, FailReason, Keyword, ParseResult, Parser, State,
};
use parse::parser::{unexpected, unexpected_eof, Fail, ParseResult, Parser, State};
/// The different shapes an identifier can take once parsed.
///
/// The parser accepts all of these in any position where any one of them could
/// appear. This way, canonicalization can give more helpful error messages like
/// "you can't redefine this variant!" if you wrote `Foo = ...` or
/// "you can only define unqualified constants" if you wrote `Foo.bar = ...`
#[derive(Debug, PartialEq, Eq)]
pub enum Ident<'a> {
/// A variable: exactly one noncapitalized part, optionally preceded by
/// capitalized module parts, e.g. `foo` or `Bar.Baz.foo`
Var(MaybeQualified<'a, &'a str>),
/// A variant: capitalized parts only, where the last part is the variant
/// name and any earlier parts are the qualifier, e.g. `Foo` or `Bar.Baz.Foo`
Variant(MaybeQualified<'a, &'a str>),
/// A field access: more than one noncapitalized part, optionally preceded
/// by capitalized module parts, e.g. `foo.bar` or `Foo.Bar.baz.qux`
Field(MaybeQualified<'a, &'a [&'a str]>),
/// An accessor function: a leading dot followed by a single noncapitalized
/// part, e.g. `.foo`
AccessorFunction(&'a str),
/// Something that started out like an identifier but is not a valid one,
/// e.g. `.Foo` or `foo.` or something like `foo.Bar`. Holds the offending text.
Malformed(&'a str),
}
/// A value together with an optional module qualifier.
///
/// In `Foo.Bar.baz`, the qualifier is `Foo.Bar` and the value is `baz`.
/// An empty `module_parts` slice means the value is unqualified.
#[derive(Debug, PartialEq, Eq)]
pub struct MaybeQualified<'a, V> {
    /// The dot-separated qualifier, one entry per part (empty if unqualified).
    pub module_parts: &'a [&'a str],
    /// The thing being qualified.
    pub value: V,
}
/// Parse an identifier into a string.
///
/// This is separate from the `ident` Parser because string interpolation
/// wants to use it this way.
///
/// By design, this does not check for reserved keywords like "if", "else", etc.
/// Sometimes we may want to check for those later in the process, and give
/// more contextually-aware error messages than "unexpected `if`" or the like.
#[inline(always)]
pub fn parse_into<'a, I>(
arena: &'a Bump,
chars: &mut I,
state: State<'a>,
) -> ParseResult<'a, (&'a str, Option<char>)>
) -> ParseResult<'a, (Ident<'a>, Option<char>)>
where
I: Iterator<Item = char>,
{
let mut buf = String::new_in(arena);
let mut part_buf = String::new_in(arena); // The current "part" (parts are dot-separated.)
let mut capitalized_parts: Vec<&'a str> = Vec::new_in(arena);
let mut noncapitalized_parts: Vec<&'a str> = Vec::new_in(arena);
let mut is_accessor_fn;
let mut is_capitalized;
// Identifiers must start with an ASCII letter.
// If this doesn't, it must not be an identifier!
let malformed = |opt_bad_char: Option<char>| {
// Reconstruct the original string that we've been parsing.
let mut full_string = String::new_in(arena);
full_string.push_str(&capitalized_parts.join("."));
full_string.push_str(&noncapitalized_parts.join("."));
if let Some(bad_char) = opt_bad_char {
full_string.push(bad_char);
}
// Consume the remaining chars in the identifier.
let mut next_char = None;
while let Some(ch) = chars.next() {
// We can't use ch.is_alphanumeric() here because that passes for
// things that are "numeric" but not ASCII digits, like `¾`
if ch == '.' || ch.is_alphabetic() || ch.is_ascii_digit() {
full_string.push(ch);
} else {
next_char = Some(ch);
break;
}
}
Ok((
(Ident::Malformed(&full_string), next_char),
state.advance_without_indenting(full_string.len())?,
))
};
// Identifiers and accessor functions must start with either a letter or a dot.
// If this starts with neither, it must be something else!
match chars.next() {
Some(ch) => {
if ch.is_ascii_alphabetic() {
buf.push(ch);
if ch.is_alphabetic() {
part_buf.push(ch);
is_capitalized = ch.is_uppercase();
is_accessor_fn = false;
} else if ch == '.' {
is_capitalized = false;
is_accessor_fn = true;
} else {
return Err(unexpected(ch, 0, state, Attempting::Identifier));
}
@ -29,44 +104,128 @@ where
None => {
return Err(unexpected_eof(0, Attempting::Identifier, state));
}
}
};
let mut chars_parsed = 1;
let mut next_char = None;
while let Some(ch) = chars.next() {
// After the first character, letters, numbers, and '.' are allowed.
if ch.is_ascii_alphanumeric() {
buf.push(ch);
// After the first character, only these are allowed:
//
// * Unicode alphabetic chars - you might name a variable `鹏` if that's clear to your readers
// * ASCII digits - e.g. `1` but not `¾`, both of which pass .is_numeric()
// * A dot ('.')
if ch.is_alphabetic() {
if part_buf.is_empty() {
// Capitalization is determined by the first character in the part.
is_capitalized = ch.is_uppercase();
}
part_buf.push(ch);
} else if ch.is_ascii_digit() {
// Parts may not start with numbers!
if part_buf.is_empty() {
return malformed(Some(ch));
}
part_buf.push(ch);
} else if ch == '.' {
panic!("TODO support qualified identifiers. Make sure we don't have consecutive dots, and that module names are capitalized but post-module nothing is capitalized.");
// There are two possible errors here:
//
// 1. Having two consecutive dots is an error.
// 2. Having capitalized parts after noncapitalized (e.g. `foo.Bar`) is an error.
if part_buf.is_empty() || (is_capitalized && !noncapitalized_parts.is_empty()) {
return malformed(Some(ch));
}
if is_capitalized {
capitalized_parts.push(&part_buf);
} else {
noncapitalized_parts.push(&part_buf);
}
// Now that we've recorded the contents of the current buffer, reset it.
part_buf = String::new_in(arena);
} else {
// This must be the end of the identifier. We're done!
next_char = Some(ch);
break;
}
chars_parsed += 1;
}
let ident_str = buf.as_str();
if part_buf.is_empty() {
// We probably had a trailing dot, e.g. `Foo.bar.` - this is malformed!
//
// This condition might also occur if we encounter a malformed accessor like `.|`
//
// If we made it this far and don't have a next_char, then necessarily
// we have consumed a '.' char previously.
return malformed(next_char.or_else(|| Some('.')));
}
// Make sure we aren't trying to use a reserved keyword as an identifier
match Keyword::from_str(ident_str) {
Some(keyword) => Err((
Fail {
reason: FailReason::UnexpectedKeyword(keyword),
attempting: Attempting::Identifier,
},
state,
)),
// Record the final parts.
if is_capitalized {
capitalized_parts.push(&part_buf);
} else {
noncapitalized_parts.push(&part_buf);
}
let answer = if is_accessor_fn {
// Handle accessor functions first because they have the strictest requirements.
// Accessor functions may have exactly 1 noncapitalized part, and no capitalized parts.
if capitalized_parts.is_empty() && noncapitalized_parts.len() == 1 {
let value = noncapitalized_parts.iter().next().unwrap();
Ident::AccessorFunction(value)
} else {
return malformed(None);
}
} else {
match noncapitalized_parts.len() {
0 => {
// We have capitalized parts only, so this must be a variant.
match capitalized_parts.pop() {
Some(value) => Ident::Variant(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value,
}),
None => {
let state = state.advance_without_indenting(buf.len())?;
// We had neither capitalized nor noncapitalized parts,
// yet we made it this far. The only explanation is that this was
// a stray '.' drifting through the cosmos.
return Err(unexpected('.', 1, state, Attempting::Identifier));
}
}
}
1 => {
// We have exactly one noncapitalized part, so this must be a var.
let value = noncapitalized_parts.iter().next().unwrap();
Ok(((buf.into_bump_str(), next_char), state))
Ident::Var(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value,
})
}
_ => {
// We have multiple noncapitalized parts, so this must be a field.
Ident::Field(MaybeQualified {
module_parts: capitalized_parts.into_bump_slice(),
value: noncapitalized_parts.into_bump_slice(),
})
}
}
};
let state = state.advance_without_indenting(chars_parsed)?;
Ok(((answer, next_char), state))
}
pub fn ident<'a>() -> impl Parser<'a, &'a str> {
pub fn ident<'a>() -> impl Parser<'a, Ident<'a>> {
move |arena: &'a Bump, state: State<'a>| {
// Discard next_char; we don't need it.
let ((string, _), state) = parse_into(arena, &mut state.input.chars(), state)?;
@ -74,3 +233,49 @@ pub fn ident<'a>() -> impl Parser<'a, &'a str> {
Ok((string, state))
}
}
// TESTS
fn test_parse<'a>(input: &'a str) -> Result<Ident<'a>, Fail> {
let arena = Bump::new();
let state = State::new(input, Attempting::Expression);
ident()
.parse(&arena, state)
.map(|(answer, _)| answer)
.map_err(|(err, _)| err)
}
fn var<'a>(module_parts: std::vec::Vec<&'a str>, value: &'a str) -> Ident<'a> {
Ident::Var(MaybeQualified {
module_parts: module_parts.as_slice(),
value,
})
}
fn variant<'a>(module_parts: std::vec::Vec<&'a str>, value: &'a str) -> Ident<'a> {
Ident::Variant(MaybeQualified {
module_parts: module_parts.as_slice(),
value,
})
}
fn field<'a>(module_parts: std::vec::Vec<&'a str>, value: std::vec::Vec<&'a str>) -> Ident<'a> {
Ident::Field(MaybeQualified {
module_parts: module_parts.as_slice(),
value: value.as_slice(),
})
}
fn accessor_fn<'a>(value: &'a str) -> Ident<'a> {
Ident::AccessorFunction(value)
}
fn malformed<'a>(value: &'a str) -> Ident<'a> {
Ident::Malformed(value)
}
#[test]
fn parse_var() {
assert_eq!(test_parse("foo"), Ok(var("foo")))
}

5
src/parse/keyword.rs Normal file
View file

@ -0,0 +1,5 @@
// Reserved keywords, spelled exactly as they appear in source text.

/// The `if` keyword.
pub static IF: &str = "if";
/// The `then` keyword.
pub static THEN: &str = "then";
/// The `else` keyword.
pub static ELSE: &str = "else";
/// The `case` keyword.
pub static CASE: &str = "case";
/// The `when` keyword.
pub static WHEN: &str = "when";

View file

@ -1,39 +1,46 @@
pub mod ast;
pub mod ident;
pub mod keyword;
pub mod module;
pub mod number_literal;
pub mod parser;
pub mod problems;
pub mod string_literal;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use operator::Operator;
use parse::ast::{Attempting, Expr};
use parse::ident::{ident, Ident};
use parse::number_literal::number_literal;
use parse::parser::{
and, attempt, loc, map, map_with_arena, one_of3, optional, string, unexpected, unexpected_eof,
ParseResult, Parser, State,
and, attempt, loc, map, map_with_arena, one_of3, one_of4, one_of6, optional, string,
unexpected, unexpected_eof, Either, ParseResult, Parser, State,
};
use parse::string_literal::string_literal;
use region::Located;
pub fn expr<'a>() -> impl Parser<'a, Expr<'a>> {
pub fn expr<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
// Recursive parsers must not directly invoke functions which return (impl Parser),
// as this causes rustc to stack overflow.
parse_expr
move |arena, state| parse_expr(min_indent, arena, state)
}
fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> {
map_with_arena(
fn parse_expr<'a>(min_indent: u16, arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>> {
let expr_parser = map_with_arena(
and(
attempt(
Attempting::Expression,
loc(one_of3(
loc(one_of6(
record_literal(),
number_literal(),
string_literal(),
when(min_indent),
conditional(min_indent),
ident_etc(min_indent),
)),
optional(and(
loc(operator()),
loc(move |arena, state| parse_expr(min_indent, arena, state)),
)),
),
optional(and(loc(operator()), loc(parse_expr))),
),
|arena, (loc_expr1, opt_operator)| match opt_operator {
Some((loc_op, loc_expr2)) => {
@ -43,8 +50,89 @@ fn parse_expr<'a>(arena: &'a Bump, state: State<'a>) -> ParseResult<'a, Expr<'a>
}
None => loc_expr1.value,
},
);
attempt(Attempting::Expression, expr_parser).parse(arena, state)
}
/// Parser for the arguments of a function application, each with its location.
///
/// Not implemented yet: the returned parser panics as soon as it is run.
/// `min_indent`, `arena` and `state` are currently unused.
pub fn loc_function_args<'a>(min_indent: u16) -> impl Parser<'a, &'a [Located<Expr<'a>>]> {
move |arena, state| {
panic!("TODO stop early if we see an operator after the whitespace - precedence!");
// Sketch of the intended implementation (relies on a `function_arg` parser
// that is not visible here):
// zero_or_more(after(one_or_more(whitespace(min_indent)), function_arg()))
}
}
/// Parser for a `when` expression.
///
/// Not implemented yet: the mapping closure panics unconditionally, so
/// (presumably once the `when` keyword itself has matched - confirm against
/// `map`) running this parser aborts. `min_indent` is currently unused.
pub fn when<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
map(string(keyword::WHEN), |_| {
panic!("TODO implement WHEN");
})
}
/// Parser for the keyword-led conditional pieces: `if`, `then`, `else` and `case`.
///
/// Each piece is built by `cond_help` from its keyword and the matching `Expr`
/// constructor, and the four are combined with `one_of4` in that order.
pub fn conditional<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
    let if_parser = cond_help(keyword::IF, Expr::If, min_indent);
    let then_parser = cond_help(keyword::THEN, Expr::Then, min_indent);
    let else_parser = cond_help(keyword::ELSE, Expr::Else, min_indent);
    let case_parser = cond_help(keyword::CASE, Expr::Case, min_indent);

    one_of4(if_parser, then_parser, else_parser, case_parser)
}
/// Builds a parser for one keyword-led conditional piece.
///
/// The parser is assembled from `string(name)`, then `skip1_whitespace(min_indent)`,
/// then a located `expr(min_indent)`; the result is passed through `wrap_expr`
/// to produce the final `Expr` (e.g. `Expr::If`).
/// NOTE(review): this presumably relies on `after` keeping only the output of
/// its second parser - confirm against its definition.
///
/// `name` is `&'static str` because the returned `impl Parser<'a, _>` holds on
/// to it: an anonymous borrow could not be captured by the returned parser.
/// Every keyword passed in comes from the `keyword` module's statics.
fn cond_help<'a, F>(name: &'static str, wrap_expr: F, min_indent: u16) -> impl Parser<'a, Expr<'a>>
where
    F: Fn(&'a Located<Expr<'a>>) -> Expr<'a>,
{
    map(
        after(
            after(string(name), skip1_whitespace(min_indent)),
            loc(expr(min_indent)),
        ),
        wrap_expr,
    )
}
/// When we parse an ident like `foo ` it could be any of these:
///
/// 1. A standalone variable with trailing whitespace (e.g. because an operator is next)
/// 2. The beginning of a function call (e.g. `foo bar baz`)
/// 3. The beginning of a definition (e.g. `foo =`)
/// 4. A reserved keyword (e.g. `if ` or `case `), meaning we should do something else.
pub fn ident_etc<'a>(min_indent: u16) -> impl Parser<'a, Expr<'a>> {
// NOTE(review): the parentheses look misplaced here. As written, `zero_or_more`
// receives two arguments and `after` receives one, whereas elsewhere in this
// change `zero_or_more` takes a single parser and `after` is called with two.
let followed_by_equals = after(zero_or_more(whitespace(min_indent), char('=')));
map_with_arena(
and(
loc(ident()),
either(followed_by_equals, loc_function_args(min_indent)),
),
|arena, (loc_ident, equals_or_loc_args)| {
match equals_or_loc_args {
Either::First(()) => {
// We have now parsed the beginning of a def (e.g. `foo =`)
//
// NOTE(review): this arm is still empty, so it evaluates to `()` while
// the other arm evaluates to an `Expr` - defs are not handled yet.
}
Either::Second(loc_args) => {
// This appears to be a var, keyword, or function application.
// Translate the parsed identifier into the corresponding expression.
let name_expr = match loc_ident.value {
Ident::Var(info) => Expr::Var(info.module_parts, info.value),
Ident::Variant(info) => Expr::Variant(info.module_parts, info.value),
Ident::Field(info) => Expr::QualifiedField(info.module_parts, info.value),
Ident::AccessorFunction(string) => Expr::AccessorFunction(string),
Ident::Malformed(string) => Expr::MalformedIdent(string),
};
// No arguments means this is just the name on its own; otherwise it is
// an application of the name to the arguments.
if loc_args.is_empty() {
name_expr
} else {
// The applied expression keeps the region of the identifier itself.
let loc_expr = Located {
region: loc_ident.region,
value: name_expr,
};
Expr::Apply(arena.alloc((loc_expr, loc_args)))
}
}
}
},
)
// NOTE(review): `arena` and `state` are not in scope here - this function only
// receives `min_indent` and is declared to return a parser, not a parse result.
.parse(arena, state)
}
pub fn operator<'a>() -> impl Parser<'a, Operator> {

View file

@ -35,6 +35,12 @@ pub struct State<'a> {
pub attempting: Attempting,
}
/// A value that is one of two alternatives.
///
/// Used as the output of a parser that accepts either of two sub-parsers,
/// recording which one produced the result.
#[derive(Debug, PartialEq, Eq)]
pub enum Either<A, B> {
    /// The first alternative.
    First(A),
    /// The second alternative.
    Second(B),
}
impl<'a> State<'a> {
pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
State {
@ -151,7 +157,6 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FailReason {
Unexpected(char, Region),
UnexpectedKeyword(Keyword),
ConditionFailed,
LineTooLong(u32 /* which line was too long */),
TooManyLines,
@ -164,28 +169,6 @@ pub struct Fail {
pub reason: FailReason,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Keyword {
If,
Then,
Else,
Case,
When,
}
impl Keyword {
pub fn from_str(kw: &str) -> Option<Keyword> {
match kw {
"if" => Some(Keyword::If),
"then" => Some(Keyword::Then),
"else" => Some(Keyword::Else),
"case" => Some(Keyword::Case),
"when" => Some(Keyword::When),
_ => None,
}
}
}
pub trait Parser<'a, Output> {
fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>;
}
@ -271,6 +254,31 @@ where
}
}
/// Runs `parser` repeatedly, collecting every output into an arena-backed `Vec`.
///
/// This never fails: as soon as `parser` returns an error, the outputs gathered
/// so far (possibly none) are returned along with the state carried by that error.
pub fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
where
    P: Parser<'a, A>,
{
    move |arena, state| {
        let mut outputs = Vec::new_in(arena);
        let mut current = state;

        loop {
            match parser.parse(arena, current) {
                Ok((output, next_state)) => {
                    outputs.push(output);
                    current = next_state;
                }
                // The first failure ends the repetition; it is not an error.
                Err((_, failed_state)) => return Ok((outputs, failed_state)),
            }
        }
    }
}
pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
where
P: Parser<'a, A>,

View file

@ -898,5 +898,8 @@ mod test_canonicalize {
// }
//
// TODO test what happens when interpolated strings contain 1+ malformed idents
//
// TODO test hex/oct/binary conversion to numbers
//
// TODO test for \t \r and \n in string literals *outside* unicode escape sequence!
}

View file

@ -282,10 +282,31 @@ mod test_parse {
assert_eq!(Ok(expected), actual);
}
/// Two chained operators: the expected tree for `31*42+534` nests the second
/// operation (`42+534`) inside the right-hand operand of the first.
#[test]
fn multiple_operators() {
    let arena = Bump::new();

    // Right-hand side: `42+534`, spanning columns 3..9.
    let rhs = arena.alloc((
        Located::new(0, 3, 0, 5, Int("42")),
        Located::new(0, 5, 0, 6, Plus),
        Located::new(0, 6, 0, 9, Int("534")),
    ));

    // Whole expression: `31 * <rhs>`.
    let whole = arena.alloc((
        Located::new(0, 0, 0, 2, Int("31")),
        Located::new(0, 2, 0, 3, Star),
        Located::new(0, 3, 0, 9, Operator(rhs)),
    ));

    let parsed = parse_with(&arena, "31*42+534");

    assert_eq!(Ok(Operator(whole)), parsed);
}
// TODO test hex/oct/binary parsing
//
// TODO test for \t \r and \n in string literals *outside* unicode escape sequence!
//
// TODO test for non-ASCII variables
//
// TODO verify that when a string literal contains a newline before the
// closing " it correctly updates both the line *and* column in the State.
}