mirror of
https://github.com/roc-lang/roc.git
synced 2025-10-03 00:24:34 +00:00
Record raw strings during parse step
This commit is contained in:
parent
fa9e074488
commit
d54cf81f7b
40 changed files with 4111 additions and 7400 deletions
|
@ -1,9 +1,8 @@
|
|||
use bumpalo::collections::vec::Vec;
|
||||
use operator::Operator;
|
||||
use parse::problems::Problem;
|
||||
use region::Loc;
|
||||
use std::fmt::{self, Display, Formatter};
|
||||
|
||||
pub type Ident = str;
|
||||
pub type VariantName = str;
|
||||
|
||||
/// A parsed expression. This uses lifetimes extensively for two reasons:
|
||||
|
@ -23,50 +22,45 @@ pub type VariantName = str;
|
|||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum Expr<'a> {
|
||||
// Number Literals
|
||||
Int(i64),
|
||||
Float(f64),
|
||||
Float(&'a str),
|
||||
Int(&'a str),
|
||||
HexInt(&'a str),
|
||||
OctalInt(&'a str),
|
||||
BinaryInt(&'a str),
|
||||
|
||||
// String Literals
|
||||
EmptyStr,
|
||||
Str(&'a str),
|
||||
/// basically InterpolatedStr(Vec<(String, Loc<Expr>)>, String)
|
||||
InterpolatedStr(&'a (&'a [(&'a str, Loc<Expr<'a>>)], &'a str)),
|
||||
BlockStr(&'a [&'a str]),
|
||||
|
||||
// List literals
|
||||
EmptyList,
|
||||
List(&'a [Loc<Expr<'a>>]),
|
||||
List(Vec<'a, Loc<Expr<'a>>>),
|
||||
// // Lookups
|
||||
// Var(&'a str),
|
||||
|
||||
// Lookups
|
||||
Var(&'a Ident),
|
||||
// // Pattern Matching
|
||||
// Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])),
|
||||
// Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
|
||||
// /// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
|
||||
// Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
|
||||
|
||||
// Pattern Matching
|
||||
Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])),
|
||||
Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
|
||||
/// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
|
||||
Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
|
||||
|
||||
// Application
|
||||
Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])),
|
||||
ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])),
|
||||
Variant(&'a VariantName),
|
||||
// // Application
|
||||
// Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])),
|
||||
// ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])),
|
||||
// Variant(&'a VariantName),
|
||||
|
||||
// Product Types
|
||||
EmptyRecord,
|
||||
|
||||
// Sugar
|
||||
If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)),
|
||||
// // Sugar
|
||||
// If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)),
|
||||
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)),
|
||||
|
||||
// Runtime errors
|
||||
MalformedStr(Box<[Loc<Problem>]>),
|
||||
MalformedInt(Problem),
|
||||
MalformedFloat(Problem),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum Pattern<'a> {
|
||||
// Identifier
|
||||
Identifier(&'a Ident),
|
||||
Identifier(&'a str),
|
||||
|
||||
// Variant
|
||||
Variant(&'a VariantName),
|
||||
|
@ -82,33 +76,35 @@ pub enum Pattern<'a> {
|
|||
|
||||
#[test]
|
||||
fn expr_size() {
|
||||
// The size of the Expr data structure should be exactly 3 machine words.
|
||||
// The size of the Expr data structure should be exactly 5 machine words.
|
||||
// This test helps avoid regressions which accidentally increase its size!
|
||||
//
|
||||
// Worth noting that going up to 4 machine words is probably not a big deal;
|
||||
// a 64-byte cache line will only fit 2 of these regardless.
|
||||
assert_eq!(
|
||||
std::mem::size_of::<Expr>(),
|
||||
// TODO [move this comment to an issue] We should be able to get this
|
||||
// down to 2, which would mean we could fit 4 of these nodes in a single
|
||||
// 64-byte cache line instead of only being able to fit 2.
|
||||
// 64-byte cache line instead of only being able to fit 1.
|
||||
//
|
||||
// Doing this would require, among other things:
|
||||
// 1. Making a str replacement where the length is stored as u32 instead of usize,
|
||||
// to leave room for the tagged union's u8 tag.
|
||||
// (Alternatively could store it as (&'a &'a str), but ew.)
|
||||
// 2. Figuring out why &'a (Foo, Bar) by default takes up 24 bytes in Rust.
|
||||
// 2. Similarly, making a slice replacement like that str replacement, and
|
||||
// also where it doesn't share the bytes with anything else - so its
|
||||
// elements can be consumed without having to clone them (unlike a slice).
|
||||
// That's the only reason we're using Vec right now instead of slices -
|
||||
// if we used slices, we'd have to clone their elements during canonicalization
|
||||
// just to iterate over them and canonicalize them normally.
|
||||
// 3. Figuring out why (&'a (Foo, Bar)) by default takes up 24 bytes in Rust.
|
||||
// I assume it's because the struct is being stored inline instead of
|
||||
// as a pointer, but in this case we actually do want the pointer!
|
||||
// We want to have the lifetime and we want to avoid using the unsafe keyword,
|
||||
// but we also want this to only store 1 pointer in the AST node.
|
||||
// Hopefully there's a way!
|
||||
//
|
||||
// It's also possible that going up to 4 machine words might yield even
|
||||
// better performance, due to more data structures being inlinable,
|
||||
// and therefore having fewer pointers to chase. This seems worth
|
||||
// investigating as well.
|
||||
std::mem::size_of::<usize>() * 3
|
||||
// It's also possible that 4 machine words might yield better performance
|
||||
// than 2, due to more data structures being inlinable, and therefore
|
||||
// having fewer pointers to chase. This seems worth investigating as well.
|
||||
std::mem::size_of::<usize>() * 5
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -151,6 +147,7 @@ pub enum Attempting {
|
|||
List,
|
||||
Keyword,
|
||||
StringLiteral,
|
||||
RecordLiteral,
|
||||
InterpolatedString,
|
||||
NumberLiteral,
|
||||
UnicodeEscape,
|
||||
|
@ -165,7 +162,15 @@ impl<'a> Display for Expr<'a> {
|
|||
|
||||
match self {
|
||||
EmptyStr => write!(f, "\"\""),
|
||||
_ => panic!("TODO"),
|
||||
Str(string) => write!(f, "\"{}\"", string),
|
||||
BlockStr(lines) => write!(f, "\"\"\"{}\"\"\"", lines.join("\n")),
|
||||
Int(string) => string.fmt(f),
|
||||
Float(string) => string.fmt(f),
|
||||
HexInt(string) => write!(f, "0x{}", string),
|
||||
BinaryInt(string) => write!(f, "0b{}", string),
|
||||
OctalInt(string) => write!(f, "0o{}", string),
|
||||
EmptyRecord => write!(f, "{}", "{}"),
|
||||
other => panic!("TODO implement Display for AST variant {:?}", other),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,18 +1,76 @@
|
|||
pub mod ast;
|
||||
pub mod ident;
|
||||
pub mod module;
|
||||
pub mod number_literal;
|
||||
pub mod parser;
|
||||
pub mod problems;
|
||||
pub mod string_literal;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use operator::Operator;
|
||||
use parse::ast::{Attempting, Expr};
|
||||
use parse::number_literal::number_literal;
|
||||
use parse::parser::{attempt, one_of2, Parser};
|
||||
use parse::parser::{
|
||||
and, attempt, lazy, loc, map, map_with_arena, one_of3, optional, string, unexpected,
|
||||
unexpected_eof, val, Parser, State,
|
||||
};
|
||||
use parse::string_literal::string_literal;
|
||||
|
||||
pub fn expr<'a>() -> impl Parser<'a, Expr<'a>> {
|
||||
attempt(
|
||||
Attempting::Expression,
|
||||
one_of2(number_literal(), string_literal()),
|
||||
map_with_arena(
|
||||
and(
|
||||
attempt(
|
||||
Attempting::Expression,
|
||||
loc(one_of3(
|
||||
record_literal(),
|
||||
number_literal(),
|
||||
string_literal(),
|
||||
)),
|
||||
),
|
||||
optional(and(loc(operator()), loc(val(Expr::Str("blah"))))),
|
||||
),
|
||||
|arena, (loc_expr1, opt_operator)| match opt_operator {
|
||||
Some((loc_op, loc_expr2)) => {
|
||||
let tuple = arena.alloc((loc_expr1, loc_op, loc_expr2));
|
||||
|
||||
Expr::Operator(tuple)
|
||||
}
|
||||
None => loc_expr1.value,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
pub fn operator<'a>() -> impl Parser<'a, Operator> {
|
||||
val(Operator::Plus)
|
||||
// one_of3(
|
||||
// map(string("+"), |_| Operator::Plus),
|
||||
// map(string("-"), |_| Operator::Minus),
|
||||
// map(string("*"), |_| Operator::Star),
|
||||
// )
|
||||
}
|
||||
|
||||
pub fn record_literal<'a>() -> impl Parser<'a, Expr<'a>> {
|
||||
move |_arena: &'a Bump, state: State<'a>| {
|
||||
let mut chars = state.input.chars();
|
||||
|
||||
match chars.next() {
|
||||
Some('{') => (),
|
||||
Some(other_char) => {
|
||||
return Err(unexpected(other_char, 0, state, Attempting::RecordLiteral));
|
||||
}
|
||||
None => {
|
||||
return Err(unexpected_eof(0, Attempting::RecordLiteral, state));
|
||||
}
|
||||
}
|
||||
|
||||
match chars.next() {
|
||||
Some('}') => {
|
||||
let next_state = state.advance_without_indenting(2)?;
|
||||
|
||||
Ok((Expr::EmptyRecord, next_state))
|
||||
}
|
||||
Some(other_char) => Err(unexpected(other_char, 0, state, Attempting::RecordLiteral)),
|
||||
None => Err(unexpected_eof(0, Attempting::RecordLiteral, state)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
16
src/parse/module.rs
Normal file
16
src/parse/module.rs
Normal file
|
@ -0,0 +1,16 @@
|
|||
use ident::Ident;
|
||||
use parse::ast::{Expr, Pattern};
|
||||
|
||||
pub struct Module<'a> {
|
||||
pub name: Ident,
|
||||
pub exposes: Vec<Ident>,
|
||||
pub uses: Vec<Ident>,
|
||||
pub decls: Vec<Decl<'a>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub enum Decl<'a> {
|
||||
Def(Pattern<'a>, Expr<'a>, Expr<'a>),
|
||||
// TODO Alias
|
||||
// TODO SumType
|
||||
}
|
|
@ -1,19 +1,16 @@
|
|||
use bumpalo::collections::string::String;
|
||||
use bumpalo::Bump;
|
||||
use parse::ast::{Attempting, Expr};
|
||||
use parse::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
|
||||
use parse::problems::Problem;
|
||||
use std::char;
|
||||
|
||||
pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
|
||||
move |arena: &'a Bump, state: State<'a>| {
|
||||
move |_arena, state: State<'a>| {
|
||||
let mut chars = state.input.chars();
|
||||
|
||||
match chars.next() {
|
||||
Some(first_ch) => {
|
||||
// Number literals must start with either a '-' or a digit.
|
||||
if first_ch == '-' || first_ch.is_ascii_digit() {
|
||||
parse_number_literal(first_ch, &mut chars, arena, state)
|
||||
parse_number_literal(first_ch, &mut chars, state)
|
||||
} else {
|
||||
Err(unexpected(
|
||||
first_ch,
|
||||
|
@ -32,61 +29,69 @@ pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
|
|||
fn parse_number_literal<'a, I>(
|
||||
first_ch: char,
|
||||
chars: &mut I,
|
||||
arena: &'a Bump,
|
||||
state: State<'a>,
|
||||
) -> ParseResult<'a, Expr<'a>>
|
||||
where
|
||||
I: Iterator<Item = char>,
|
||||
{
|
||||
let mut before_decimal = String::with_capacity_in(1, arena);
|
||||
let mut after_decimal = String::new_in(arena);
|
||||
let mut has_decimal_point = false;
|
||||
let mut chars_skipped = 0;
|
||||
use self::LiteralType::*;
|
||||
|
||||
// Put the first character into the buffer, even if all we've parsed so
|
||||
// far is a minus sign.
|
||||
//
|
||||
// We have to let i64::parse handle the minus sign (if it's there), because
|
||||
// otherwise if we ask it to parse i64::MIN.to_string() as a positive i64,
|
||||
// it errors because that positive number doesn't fit in an i64!
|
||||
before_decimal.push(first_ch);
|
||||
let mut typ = Int;
|
||||
|
||||
// We already parsed 1 character (which may have been a minus sign).
|
||||
let mut chars_parsed = 1;
|
||||
|
||||
while let Some(next_ch) = chars.next() {
|
||||
match next_ch {
|
||||
digit if next_ch.is_ascii_digit() => {
|
||||
if has_decimal_point {
|
||||
after_decimal.push(digit);
|
||||
} else {
|
||||
before_decimal.push(digit);
|
||||
}
|
||||
}
|
||||
'_' => {
|
||||
// Underscores are allowed, and disregarded.
|
||||
chars_skipped += 1;
|
||||
}
|
||||
'.' => {
|
||||
if has_decimal_point {
|
||||
// You only get one decimal point!
|
||||
let len = before_decimal.len() + after_decimal.len() + chars_skipped;
|
||||
chars_parsed += 1;
|
||||
|
||||
return Err(unexpected('.', len, state, Attempting::NumberLiteral));
|
||||
} else {
|
||||
chars_skipped += 1;
|
||||
has_decimal_point = true;
|
||||
}
|
||||
}
|
||||
invalid_char => {
|
||||
if before_decimal.is_empty() {
|
||||
// No digits! We likely parsed a minus sign that's actually an operator.
|
||||
let len = before_decimal.len() + after_decimal.len() + chars_skipped;
|
||||
return Err(unexpected(
|
||||
invalid_char,
|
||||
len,
|
||||
state,
|
||||
Attempting::NumberLiteral,
|
||||
));
|
||||
}
|
||||
let err_unexpected = || {
|
||||
Err(unexpected(
|
||||
next_ch,
|
||||
chars_parsed,
|
||||
state.clone(),
|
||||
Attempting::NumberLiteral,
|
||||
))
|
||||
};
|
||||
|
||||
// Returns true iff so far we have parsed the given char and no other chars.
|
||||
let so_far_parsed = |ch| chars_parsed == 2 && first_ch == ch;
|
||||
|
||||
// We don't support negative escaped ints (e.g. 0x01 is supported but -0x01 is not).
|
||||
// If you want that, do something like (negate 0x01).
|
||||
//
|
||||
// I'm open to changing this policy (that is, allowing support for
|
||||
// negative escaped ints), but it'll complicate parsing logic and seems
|
||||
// nonessential, so I'm leaving it out for now.
|
||||
if next_ch == '.' {
|
||||
if typ == Float {
|
||||
// You only get one decimal point!
|
||||
return err_unexpected();
|
||||
} else {
|
||||
typ = Float;
|
||||
}
|
||||
} else if next_ch == 'x' {
|
||||
if so_far_parsed('0') {
|
||||
typ = Hex;
|
||||
} else {
|
||||
return err_unexpected();
|
||||
}
|
||||
} else if next_ch == 'b' {
|
||||
if so_far_parsed('0') {
|
||||
typ = Binary;
|
||||
} else {
|
||||
return err_unexpected();
|
||||
}
|
||||
} else if next_ch == 'o' {
|
||||
if so_far_parsed('0') {
|
||||
typ = Octal;
|
||||
} else {
|
||||
return err_unexpected();
|
||||
}
|
||||
} else if !next_ch.is_ascii_digit() && next_ch != '_' {
|
||||
if so_far_parsed('-') {
|
||||
// No digits! We likely parsed a minus sign that's actually an operator.
|
||||
return err_unexpected();
|
||||
} else {
|
||||
// We hit an invalid number literal character; we're done!
|
||||
break;
|
||||
}
|
||||
|
@ -96,41 +101,25 @@ where
|
|||
// At this point we have a number, and will definitely succeed.
|
||||
// If the number is malformed (outside the supported range),
|
||||
// we'll succeed with an appropriate Expr which records that.
|
||||
let expr = if has_decimal_point {
|
||||
let mut f64_buf = String::with_capacity_in(
|
||||
before_decimal.len()
|
||||
// +1 for the decimal point itself
|
||||
+ 1
|
||||
+ after_decimal.len(),
|
||||
arena,
|
||||
);
|
||||
|
||||
f64_buf.push_str(&before_decimal);
|
||||
f64_buf.push('.');
|
||||
f64_buf.push_str(&after_decimal);
|
||||
|
||||
// TODO [convert this comment to an issue] - we can get better
|
||||
// performance here by inlining string.parse() for the f64 case,
|
||||
// since we've already done the work of validating that each char
|
||||
// is a digit, plus we also already separately parsed the minus
|
||||
// sign and dot.
|
||||
match f64_buf.parse::<f64>() {
|
||||
Ok(float) if float.is_finite() => Expr::Float(float),
|
||||
_ => Expr::MalformedFloat(Problem::OutsideSupportedRange),
|
||||
}
|
||||
} else {
|
||||
// TODO [convert this comment to an issue] - we can get better
|
||||
// performance here by inlining string.parse() for the i64 case,
|
||||
// since we've already done the work of validating that each char
|
||||
// is a digit.
|
||||
match before_decimal.parse::<i64>() {
|
||||
Ok(int_val) => Expr::Int(int_val),
|
||||
Err(_) => Expr::MalformedInt(Problem::OutsideSupportedRange),
|
||||
}
|
||||
let expr = match typ {
|
||||
Int => Expr::Int(&state.input[0..chars_parsed]),
|
||||
Float => Expr::Float(&state.input[0..chars_parsed]),
|
||||
// For these we trim off the 0x/0o/0b part
|
||||
Hex => Expr::HexInt(&state.input[2..chars_parsed - 1]),
|
||||
Binary => Expr::BinaryInt(&state.input[2..chars_parsed - 1]),
|
||||
Octal => Expr::OctalInt(&state.input[2..chars_parsed - 1]),
|
||||
};
|
||||
|
||||
let total_chars_parsed = before_decimal.len() + chars_skipped;
|
||||
let state = state.advance_without_indenting(total_chars_parsed)?;
|
||||
let next_state = state.advance_without_indenting(chars_parsed)?;
|
||||
|
||||
Ok((expr, state))
|
||||
Ok((expr, next_state))
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
enum LiteralType {
|
||||
Int,
|
||||
Float,
|
||||
Hex,
|
||||
Octal,
|
||||
Binary,
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
use bumpalo::collections::vec::Vec;
|
||||
use bumpalo::Bump;
|
||||
use parse::ast::Attempting;
|
||||
use region::Region;
|
||||
use region::{Located, Region};
|
||||
use std::char;
|
||||
|
||||
// Strategy:
|
||||
|
@ -190,6 +190,21 @@ pub trait Parser<'a, Output> {
|
|||
fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>;
|
||||
}
|
||||
|
||||
pub struct BoxedParser<'a, Output> {
|
||||
parser: &'a (dyn Parser<'a, Output> + 'a),
|
||||
}
|
||||
|
||||
impl<'a, Output> BoxedParser<'a, Output> {
|
||||
fn new<P>(arena: &'a Bump, parser: P) -> Self
|
||||
where
|
||||
P: Parser<'a, Output> + 'a,
|
||||
{
|
||||
BoxedParser {
|
||||
parser: arena.alloc(parser),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, F, Output> Parser<'a, Output> for F
|
||||
where
|
||||
F: Fn(&'a Bump, State<'a>) -> ParseResult<'a, Output>,
|
||||
|
@ -199,6 +214,22 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
pub fn val<'a, Val>(value: Val) -> impl Parser<'a, Val>
|
||||
where
|
||||
Val: Clone,
|
||||
{
|
||||
move |_, state| Ok((value.clone(), state))
|
||||
}
|
||||
|
||||
/// Needed for recursive parsers
|
||||
pub fn lazy<'a, F, P, Val>(get_parser: F) -> impl Parser<'a, Val>
|
||||
where
|
||||
F: Fn() -> P,
|
||||
P: Parser<'a, Val>,
|
||||
{
|
||||
move |arena, state| get_parser().parse(arena, state)
|
||||
}
|
||||
|
||||
pub fn map<'a, P, F, Before, After>(parser: P, transform: F) -> impl Parser<'a, After>
|
||||
where
|
||||
P: Parser<'a, Before>,
|
||||
|
@ -211,6 +242,18 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
pub fn map_with_arena<'a, P, F, Before, After>(parser: P, transform: F) -> impl Parser<'a, After>
|
||||
where
|
||||
P: Parser<'a, Before>,
|
||||
F: Fn(&'a Bump, Before) -> After,
|
||||
{
|
||||
move |arena, state| {
|
||||
parser
|
||||
.parse(arena, state)
|
||||
.map(|(output, next_state)| (transform(arena, output), next_state))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn attempt<'a, P, Val>(attempting: Attempting, parser: P) -> impl Parser<'a, Val>
|
||||
where
|
||||
P: Parser<'a, Val>,
|
||||
|
@ -226,6 +269,32 @@ where
|
|||
}
|
||||
}
|
||||
|
||||
pub fn loc<'a, P, Val>(parser: P) -> impl Parser<'a, Located<Val>>
|
||||
where
|
||||
P: Parser<'a, Val>,
|
||||
{
|
||||
move |arena, state: State<'a>| {
|
||||
let start_col = state.column;
|
||||
let start_line = state.line;
|
||||
|
||||
match parser.parse(arena, state) {
|
||||
Ok((value, state)) => {
|
||||
let end_col = state.column;
|
||||
let end_line = state.line;
|
||||
let region = Region {
|
||||
start_col,
|
||||
start_line,
|
||||
end_col,
|
||||
end_line,
|
||||
};
|
||||
|
||||
Ok((Located { region, value }, state))
|
||||
}
|
||||
Err((fail, state)) => Err((fail, state)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
|
||||
where
|
||||
P: Parser<'a, A>,
|
||||
|
@ -317,6 +386,7 @@ pub fn string<'a>(string: &'static str) -> impl Parser<'a, ()> {
|
|||
let input = state.input;
|
||||
let len = string.len();
|
||||
|
||||
// TODO do this comparison in one SIMD instruction (on supported systems)
|
||||
match input.get(0..len) {
|
||||
Some(next_str) if next_str == string => Ok(((), state.advance_without_indenting(len)?)),
|
||||
_ => Err(unexpected_eof(len, Attempting::Keyword, state)),
|
||||
|
@ -378,6 +448,46 @@ where
|
|||
// satisfies(any, |ch| ch.is_whitespace())
|
||||
// }
|
||||
|
||||
pub fn and<'a, P1, P2, A, B>(p1: P1, p2: P2) -> impl Parser<'a, (A, B)>
|
||||
where
|
||||
P1: Parser<'a, A>,
|
||||
P2: Parser<'a, B>,
|
||||
{
|
||||
move |arena: &'a Bump, state: State<'a>| {
|
||||
let original_attempting = state.attempting;
|
||||
|
||||
match p1.parse(arena, state) {
|
||||
Ok((out1, state)) => match p2.parse(arena, state) {
|
||||
Ok((out2, state)) => Ok(((out1, out2), state)),
|
||||
Err((fail, state)) => Err((
|
||||
Fail {
|
||||
attempting: original_attempting,
|
||||
..fail
|
||||
},
|
||||
state,
|
||||
)),
|
||||
},
|
||||
Err((fail, state)) => Err((
|
||||
Fail {
|
||||
attempting: original_attempting,
|
||||
..fail
|
||||
},
|
||||
state,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn optional<'a, P, T>(parser: P) -> impl Parser<'a, Option<T>>
|
||||
where
|
||||
P: Parser<'a, T>,
|
||||
{
|
||||
move |arena: &'a Bump, state: State<'a>| match parser.parse(arena, state) {
|
||||
Ok((out1, state)) => Ok((Some(out1), state)),
|
||||
Err((_, state)) => Ok((None, state)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn one_of2<'a, P1, P2, A>(p1: P1, p2: P2) -> impl Parser<'a, A>
|
||||
where
|
||||
P1: Parser<'a, A>,
|
||||
|
|
|
@ -1,18 +1,11 @@
|
|||
use bumpalo::collections::string::String;
|
||||
use bumpalo::collections::vec::Vec;
|
||||
use bumpalo::Bump;
|
||||
use parse::ast::{Attempting, Expr};
|
||||
use parse::ident;
|
||||
use parse::parser::{unexpected, unexpected_eof, Fail, Parser, State};
|
||||
use parse::problems::{Problem, Problems};
|
||||
use region::{Loc, Region};
|
||||
use parse::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
|
||||
use std::char;
|
||||
use std::iter::Peekable;
|
||||
|
||||
pub fn string_literal<'a>() -> impl Parser<'a, Expr<'a>> {
|
||||
move |arena: &'a Bump, state: State<'a>| {
|
||||
let mut problems = std::vec::Vec::new();
|
||||
let mut chars = state.input.chars().peekable();
|
||||
let mut chars = state.input.chars();
|
||||
|
||||
// String literals must start with a quote.
|
||||
// If this doesn't, it must not be a string literal!
|
||||
|
@ -26,464 +19,75 @@ pub fn string_literal<'a>() -> impl Parser<'a, Expr<'a>> {
|
|||
}
|
||||
}
|
||||
|
||||
// If we have precisely an empty string here, don't bother allocating
|
||||
// a buffer; instead, return EmptyStr immediately.
|
||||
if chars.peek() == Some(&'"') {
|
||||
return Ok((
|
||||
Expr::EmptyStr,
|
||||
// 2 because `""` has length 2
|
||||
state.advance_without_indenting(2)?,
|
||||
));
|
||||
}
|
||||
|
||||
// Stores the accumulated string characters
|
||||
let mut buf = String::new_in(arena);
|
||||
|
||||
// This caches the total string length of interpolated_pairs. Every
|
||||
// time we add a new pair to interpolated_pairs, we increment this
|
||||
// by the sum of whatever we parsed in order to obtain that pair.
|
||||
let mut buf_col_offset: usize = 0;
|
||||
|
||||
// Stores interpolated identifiers, if any.
|
||||
let mut interpolated_pairs = Vec::new_in(arena);
|
||||
// At the parsing stage we keep the entire raw string, because the formatter
|
||||
// needs the raw string. (For example, so it can "remember" whether you
|
||||
// wrote \u{...} or the actual unicode character itself.)
|
||||
//
|
||||
// Later, in canonicalization, we'll do things like resolving
|
||||
// unicode escapes and string interpolation.
|
||||
//
|
||||
// Since we're keeping the entire raw string, all we need to track is
|
||||
// how many characters we've parsed. So far, that's 1 (the opening `"`).
|
||||
let mut parsed_chars = 1;
|
||||
let mut prev_ch = '"';
|
||||
|
||||
while let Some(ch) = chars.next() {
|
||||
match ch {
|
||||
// If it's a backslash, escape things.
|
||||
'\\' => match chars.next() {
|
||||
Some(next_ch) => {
|
||||
if let Some(ident) = handle_escaped_char(
|
||||
arena,
|
||||
&state,
|
||||
next_ch,
|
||||
&mut chars,
|
||||
&mut buf,
|
||||
&mut problems,
|
||||
)? {
|
||||
let expr = Expr::Var(ident);
|
||||
parsed_chars += 1;
|
||||
|
||||
// +2 for `\(` and then another +1 for `)` at the end
|
||||
let parsed_length = buf.len() + 2 + ident.len() + 1;
|
||||
|
||||
// It's okay if casting fails in this section, because
|
||||
// we're going to check for line length overflow at the
|
||||
// end anyway. That will render this region useless,
|
||||
// but the user wasn't going to see this region
|
||||
// anyway if the line length overflowed.
|
||||
let start_line = state.line;
|
||||
|
||||
// Subtract ident length and another 1 for the `)`
|
||||
let start_col = state.column
|
||||
+ buf_col_offset as u16
|
||||
+ (parsed_length - ident.len() - 1) as u16;
|
||||
let ident_region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line: start_line,
|
||||
end_col: start_col + ident.len() as u16 - 1,
|
||||
};
|
||||
let loc_expr = Loc {
|
||||
region: ident_region,
|
||||
value: expr,
|
||||
};
|
||||
|
||||
// Push the accumulated string into the pairs list,
|
||||
// along with the ident that came after it.
|
||||
interpolated_pairs.push((buf.into_bump_str(), loc_expr));
|
||||
|
||||
// Reset the buffer so we start working on a new string.
|
||||
buf = String::new_in(arena);
|
||||
|
||||
// Advance the cached offset of how many chars we've parsed,
|
||||
// so the next time we see an interpolated ident, we can
|
||||
// correctly calculate its region.
|
||||
buf_col_offset += parsed_length;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// We ran out of characters before finding a closed quote;
|
||||
// let the loop finish normally, so we end up returning
|
||||
// the error that the string was not terminated.
|
||||
//
|
||||
// (There's the separate problem of a trailing backslash,
|
||||
// but often that will get fixed in the course of
|
||||
// addressing the missing closed quote.)
|
||||
()
|
||||
}
|
||||
},
|
||||
'"' => {
|
||||
// We found a closed quote; this is the end of the string!
|
||||
let len_with_quotes = buf.len() + 2;
|
||||
let expr = if problems.is_empty() {
|
||||
let final_str = buf.into_bump_str();
|
||||
|
||||
if interpolated_pairs.is_empty() {
|
||||
Expr::Str(final_str)
|
||||
} else {
|
||||
let tuple_ref =
|
||||
arena.alloc((interpolated_pairs.into_bump_slice(), final_str));
|
||||
|
||||
Expr::InterpolatedStr(tuple_ref)
|
||||
}
|
||||
// Potentially end the string (unless this is an escaped `"`!)
|
||||
if ch == '"' && prev_ch != '\\' {
|
||||
let expr = if parsed_chars == 2 {
|
||||
if let Some('"') = chars.next() {
|
||||
// If the first three chars were all `"`, then this
|
||||
// literal begins with `"""` and is a block string.
|
||||
return parse_block_string(arena, state, &mut chars);
|
||||
} else {
|
||||
Expr::MalformedStr(problems.into_boxed_slice())
|
||||
};
|
||||
Expr::EmptyStr
|
||||
}
|
||||
} else {
|
||||
// Start at 1 so we omit the opening `"`.
|
||||
// Subtract 1 from parsed_chars so we omit the closing `"`.
|
||||
Expr::Str(&state.input[1..(parsed_chars - 1)])
|
||||
};
|
||||
|
||||
let next_state = state.advance_without_indenting(len_with_quotes)?;
|
||||
let next_state = state.advance_without_indenting(parsed_chars)?;
|
||||
|
||||
return Ok((expr, next_state));
|
||||
}
|
||||
'\t' => {
|
||||
// Report the problem and continue. Tabs are syntax errors,
|
||||
// but maybe the rest of the string is fine!
|
||||
problems.push(loc_char(Problem::Tab, &state, buf.len()));
|
||||
}
|
||||
'\r' => {
|
||||
// Carriage returns aren't allowed in string literals,
|
||||
// but maybe the rest of the string is fine!
|
||||
problems.push(loc_char(Problem::CarriageReturn, &state, buf.len()));
|
||||
}
|
||||
'\n' => {
|
||||
// We hit a newline before a close quote.
|
||||
// We can't safely assume where the string was supposed
|
||||
// to end, so this is an unrecoverable error.
|
||||
return Err(unexpected('\n', 0, state, Attempting::StringLiteral));
|
||||
}
|
||||
normal_char => buf.push(normal_char),
|
||||
return Ok((expr, next_state));
|
||||
} else if ch == '\n' {
|
||||
// This is a single-line string, which cannot have newlines!
|
||||
// Treat this as an unclosed string literal, and consume
|
||||
// all remaining chars. This will mask all other errors, but
|
||||
// it should make it easiest to debug; the file will be a giant
|
||||
// error starting from where the open quote appeared.
|
||||
return Err(unexpected(
|
||||
'\n',
|
||||
state.input.len() - 1,
|
||||
state,
|
||||
Attempting::StringLiteral,
|
||||
));
|
||||
} else {
|
||||
prev_ch = ch;
|
||||
}
|
||||
}
|
||||
|
||||
// We ran out of characters before finding a closed quote
|
||||
Err(unexpected_eof(
|
||||
buf.len(),
|
||||
parsed_chars,
|
||||
Attempting::StringLiteral,
|
||||
state.clone(),
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
fn loc_char<'a, V>(value: V, state: &State<'a>, buf_len: usize) -> Loc<V> {
|
||||
let start_line = state.line;
|
||||
let start_col = state.column + buf_len as u16;
|
||||
let end_line = start_line;
|
||||
// All invalid chars should have a length of 1
|
||||
let end_col = state.column + 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
Loc { region, value }
|
||||
}
|
||||
|
||||
fn loc_escaped_char<'a, V>(value: V, state: &State<'a>, buf_len: usize) -> Loc<V> {
|
||||
let start_line = state.line;
|
||||
let start_col = state.column + buf_len as u16;
|
||||
let end_line = start_line;
|
||||
// escapes should all be 2 chars long
|
||||
let end_col = state.column + 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
Loc { region, value }
|
||||
}
|
||||
|
||||
fn loc_escaped_unicode<'a, V>(
|
||||
value: V,
|
||||
state: &State<'a>,
|
||||
buf_len: usize,
|
||||
hex_str_len: usize,
|
||||
) -> Loc<V> {
|
||||
let start_line = state.line;
|
||||
// +1 due to the `"` which precedes buf.
|
||||
let start_col = state.column + buf_len as u16 + 1;
|
||||
let end_line = start_line;
|
||||
// +3 due to the `\u{` and another + 1 due to the `}`
|
||||
// -1 to prevent overshooting because end col is inclusive.
|
||||
let end_col = start_col + 3 + hex_str_len as u16 + 1 - 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
Loc { region, value }
|
||||
}
|
||||
|
||||
/// Handles `ch`, the character that follows a backslash inside a string literal.
///
/// Recognized escapes (`\\`, `\"`, `\t`, `\n`, `\r`, `\0`) push the character they
/// stand for onto `buf`. `\u` is delegated to `handle_escaped_unicode`, and `\(`
/// starts an interpolation whose identifier is returned as `Ok(Some(ident))`.
/// Every other path that does not fail returns `Ok(None)`.
///
/// A literal tab, a literal carriage return, or an unsupported escape character is
/// recorded in `problems` and parsing continues. A literal newline is recorded in
/// `problems` and then aborts with `Err(unexpected_eof(..))`.
#[inline(always)]
|
||||
fn handle_escaped_char<'a, I>(
|
||||
arena: &'a Bump,
|
||||
state: &State<'a>,
|
||||
ch: char,
|
||||
chars: &mut Peekable<I>,
|
||||
buf: &mut String<'a>,
|
||||
problems: &mut Problems,
|
||||
) -> Result<Option<&'a str>, (Fail, State<'a>)>
|
||||
// NOTE(review): the `parse_block_string` signature below looks like it was
// interleaved here by the diff rendering. The `where` clause and the body that
// follow use `ch`, `buf` and `problems`, which are parameters of
// `handle_escaped_char`, not of `parse_block_string`. Confirm against the real file.
fn parse_block_string<'a, I>(
|
||||
_arena: &'a Bump,
|
||||
_state: State<'a>,
|
||||
_chars: &mut I,
|
||||
) -> ParseResult<'a, Expr<'a>>
|
||||
where
|
||||
I: Iterator<Item = char>,
|
||||
{
|
||||
match ch {
|
||||
// Simple escapes: push the character the escape stands for.
'\\' => buf.push('\\'),
|
||||
'"' => buf.push('"'),
|
||||
't' => buf.push('\t'),
|
||||
'n' => buf.push('\n'),
|
||||
'r' => buf.push('\r'),
|
||||
'0' => buf.push('\0'), // We explicitly support null characters, as we
|
||||
// can't be sure we won't receive them from Rust.
|
||||
// `\u`: the helper parses the rest of the escape; its error is propagated by `?`.
'u' => handle_escaped_unicode(arena, &state, chars, buf, problems)?,
|
||||
// `\(`: string interpolation. Return the identifier instead of touching `buf`.
'(' => {
|
||||
let ident = parse_interpolated_ident(arena, state, chars)?;
|
||||
|
||||
return Ok(Some(ident));
|
||||
}
|
||||
'\t' => {
|
||||
// Report and continue.
|
||||
// Tabs are syntax errors, but maybe the rest of the string is fine!
|
||||
problems.push(loc_escaped_char(Problem::Tab, &state, buf.len()));
|
||||
}
|
||||
'\r' => {
|
||||
// Report and continue.
|
||||
// Carriage returns aren't allowed in string literals,
|
||||
// but maybe the rest of the string is fine!
|
||||
problems.push(loc_escaped_char(Problem::CarriageReturn, &state, buf.len()));
|
||||
}
|
||||
'\n' => {
|
||||
// Report and bail out.
|
||||
// We can't safely assume where the string was supposed to end.
|
||||
problems.push(loc_escaped_char(
|
||||
Problem::NewlineInLiteral,
|
||||
&state,
|
||||
buf.len(),
|
||||
));
|
||||
|
||||
// NOTE(review): `Attempting::UnicodeEscape` is passed here even though this
// branch is not a `\u` escape. Confirm that this is intentional.
return Err(unexpected_eof(
|
||||
buf.len(),
|
||||
Attempting::UnicodeEscape,
|
||||
state.clone(),
|
||||
));
|
||||
}
|
||||
_ => {
|
||||
// Report and continue.
|
||||
// An unsupported escaped char (e.g. \q) shouldn't halt parsing.
|
||||
problems.push(loc_escaped_char(
|
||||
Problem::UnsupportedEscapedChar,
|
||||
&state,
|
||||
buf.len(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// No interpolation was found, so there is nothing to hand back to the caller.
Ok(None)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn handle_escaped_unicode<'a, I>(
|
||||
arena: &'a Bump,
|
||||
state: &State<'a>,
|
||||
chars: &mut Peekable<I>,
|
||||
buf: &mut String<'a>,
|
||||
problems: &mut Problems,
|
||||
) -> Result<(), (Fail, State<'a>)>
|
||||
where
|
||||
I: Iterator<Item = char>,
|
||||
{
|
||||
// \u{00A0} is how you specify a Unicode code point,
|
||||
// so we should always see a '{' next.
|
||||
if chars.next() != Some('{') {
|
||||
let start_line = state.line;
|
||||
// +1 due to the `"` which precedes buf
|
||||
let start_col = state.column + 1 + buf.len() as u16;
|
||||
let end_line = start_line;
|
||||
|
||||
// All we parsed was `\u`, so end on the column after `\`'s column.
|
||||
let end_col = start_col + 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
problems.push(Loc {
|
||||
region,
|
||||
value: Problem::NoUnicodeDigits,
|
||||
});
|
||||
|
||||
// The rest of the string literal might be fine. Keep parsing!
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Record the point in the string literal where we started parsing `\u`
|
||||
let start_of_unicode = buf.len();
|
||||
|
||||
// Stores the accumulated unicode digits
|
||||
let mut hex_str = String::new_in(arena);
|
||||
|
||||
while let Some(hex_char) = chars.next() {
|
||||
match hex_char {
|
||||
'}' => {
|
||||
// Done! Validate and add it to the buffer.
|
||||
match u32::from_str_radix(&hex_str, 16) {
|
||||
Ok(code_pt) => {
|
||||
if code_pt > 0x10FFFF {
|
||||
let start_line = state.line;
|
||||
// +1 due to the `"` which precedes buf
|
||||
// +3 due to the `\u{` which precedes the hex digits
|
||||
let start_col = state.column + 1 + buf.len() as u16 + 3;
|
||||
let end_line = start_line;
|
||||
|
||||
// We want to underline only the number. That's the error!
|
||||
// -1 because we want to end on the last digit, not
|
||||
// overshoot it.
|
||||
let end_col = start_col + hex_str.len() as u16 - 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
problems.push(Loc {
|
||||
region,
|
||||
value: Problem::UnicodeCodePointTooLarge,
|
||||
});
|
||||
} else {
|
||||
// If it all checked out, add it to
|
||||
// the main buffer.
|
||||
match char::from_u32(code_pt) {
|
||||
Some(ch) => buf.push(ch),
|
||||
None => {
|
||||
problems.push(loc_escaped_unicode(
|
||||
Problem::InvalidUnicodeCodePoint,
|
||||
&state,
|
||||
start_of_unicode,
|
||||
hex_str.len(),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
let problem = if hex_str.is_empty() {
|
||||
Problem::NoUnicodeDigits
|
||||
} else {
|
||||
Problem::NonHexCharsInUnicodeCodePoint
|
||||
};
|
||||
|
||||
problems.push(loc_escaped_unicode(
|
||||
problem,
|
||||
&state,
|
||||
start_of_unicode,
|
||||
hex_str.len(),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// We are now done processing the unicode portion of the string,
|
||||
// so exit the loop without further advancing the iterator.
|
||||
return Ok(());
|
||||
}
|
||||
'\t' => {
|
||||
// Report and continue.
|
||||
// Tabs are syntax errors, but maybe the rest of the string is fine!
|
||||
problems.push(loc_escaped_unicode(
|
||||
Problem::Tab,
|
||||
&state,
|
||||
start_of_unicode,
|
||||
hex_str.len(),
|
||||
));
|
||||
}
|
||||
'\r' => {
|
||||
// Report and continue.
|
||||
// Carriage returns aren't allowed in string literals,
|
||||
// but maybe the rest of the string is fine!
|
||||
problems.push(loc_escaped_unicode(
|
||||
Problem::CarriageReturn,
|
||||
&state,
|
||||
start_of_unicode,
|
||||
hex_str.len(),
|
||||
));
|
||||
}
|
||||
'\n' => {
|
||||
// Report and bail out.
|
||||
// We can't safely assume where the string was supposed to end.
|
||||
problems.push(loc_escaped_unicode(
|
||||
Problem::NewlineInLiteral,
|
||||
&state,
|
||||
start_of_unicode,
|
||||
hex_str.len(),
|
||||
));
|
||||
|
||||
return Err(unexpected_eof(
|
||||
buf.len(),
|
||||
Attempting::UnicodeEscape,
|
||||
state.clone(),
|
||||
));
|
||||
}
|
||||
normal_char => hex_str.push(normal_char),
|
||||
}
|
||||
|
||||
// If we're about to hit the end of the string, and we didn't already
|
||||
// complete parsing a valid unicode escape sequence, this is a malformed
|
||||
// escape sequence - it wasn't terminated!
|
||||
if chars.peek() == Some(&'"') {
|
||||
// Record a problem and exit the loop early, so the string literal
|
||||
// parsing logic can consume the quote and do its job as normal.
|
||||
let start_line = state.line;
|
||||
// +1 due to the `"` which precedes buf.
|
||||
let start_col = state.column + buf.len() as u16 + 1;
|
||||
let end_line = start_line;
|
||||
// +3 due to the `\u{`
|
||||
// -1 to prevent overshooting because end col is inclusive.
|
||||
let end_col = start_col + 3 + hex_str.len() as u16 - 1;
|
||||
|
||||
let region = Region {
|
||||
start_line,
|
||||
start_col,
|
||||
end_line,
|
||||
end_col,
|
||||
};
|
||||
|
||||
problems.push(Loc {
|
||||
region,
|
||||
value: Problem::MalformedEscapedUnicode,
|
||||
});
|
||||
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Parses the identifier of a string interpolation and checks that it is
/// terminated by `)`.
///
/// The identifier is read with `ident::parse_into`, starting from a clone of
/// `state`; any error from that call is propagated by `?`. On success the parsed
/// identifier is returned. If the character after the identifier is not `)`, or
/// the input ends, an `Err` tagged `Attempting::InterpolatedString` is returned.
#[inline(always)]
|
||||
fn parse_interpolated_ident<'a, I>(
|
||||
arena: &'a Bump,
|
||||
state: &State<'a>,
|
||||
chars: &mut Peekable<I>,
|
||||
) -> Result<&'a str, (Fail, State<'a>)>
|
||||
where
|
||||
I: Iterator<Item = char>,
|
||||
{
|
||||
// This will return Err on invalid identifiers like "if"
|
||||
// `state` is shadowed here by the state returned from `ident::parse_into`.
let ((string, next_char), state) = ident::parse_into(arena, chars, state.clone())?;
|
||||
|
||||
// Make sure we got a closing ) to end the interpolation.
|
||||
match next_char {
|
||||
Some(')') => Ok(string),
|
||||
Some(ch) => Err(unexpected(ch, 0, state, Attempting::InterpolatedString)),
|
||||
None => Err(unexpected_eof(0, Attempting::InterpolatedString, state)),
|
||||
}
|
||||
// NOTE(review): the remaining statements look like the body of
// `parse_block_string`, merged in here by the diff rendering. They do not use
// anything from `parse_interpolated_ident`. Confirm against the real file.
// So far we have consumed the `"""` and that's it.
|
||||
let _parsed_chars = 3;
|
||||
panic!("TODO parse block string, advance state, etc");
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue