Record raw strings during parse step

This commit is contained in:
Richard Feldman 2019-09-16 00:25:31 -04:00
parent fa9e074488
commit d54cf81f7b
40 changed files with 4111 additions and 7400 deletions

View file

@ -1,9 +1,8 @@
use bumpalo::collections::vec::Vec;
use operator::Operator;
use parse::problems::Problem;
use region::Loc;
use std::fmt::{self, Display, Formatter};
pub type Ident = str;
pub type VariantName = str;
/// A parsed expression. This uses lifetimes extensively for two reasons:
@ -23,50 +22,45 @@ pub type VariantName = str;
#[derive(Clone, Debug, PartialEq)]
pub enum Expr<'a> {
// Number Literals
Int(i64),
Float(f64),
Float(&'a str),
Int(&'a str),
HexInt(&'a str),
OctalInt(&'a str),
BinaryInt(&'a str),
// String Literals
EmptyStr,
Str(&'a str),
/// basically InterpolatedStr(Vec<(String, Loc<Expr>)>, String)
InterpolatedStr(&'a (&'a [(&'a str, Loc<Expr<'a>>)], &'a str)),
BlockStr(&'a [&'a str]),
// List literals
EmptyList,
List(&'a [Loc<Expr<'a>>]),
List(Vec<'a, Loc<Expr<'a>>>),
// // Lookups
// Var(&'a str),
// Lookups
Var(&'a Ident),
// // Pattern Matching
// Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])),
// Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
// /// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
// Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
// Pattern Matching
Case(&'a (Loc<Expr<'a>>, [(Loc<Pattern<'a>>, Loc<Expr<'a>>)])),
Closure(&'a (&'a [Loc<Pattern<'a>>], Loc<Expr<'a>>)),
/// basically Assign(Vec<(Loc<Pattern>, Loc<Expr>)>, Loc<Expr>)
Assign(&'a (&'a [(Loc<Pattern<'a>>, Loc<Expr<'a>>)], Loc<Expr<'a>>)),
// Application
Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])),
ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])),
Variant(&'a VariantName),
// // Application
// Call(&'a (Loc<Expr<'a>>, [Loc<Expr<'a>>])),
// ApplyVariant(&'a (&'a VariantName, [Loc<Expr<'a>>])),
// Variant(&'a VariantName),
// Product Types
EmptyRecord,
// Sugar
If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)),
// // Sugar
// If(&'a (Loc<Expr<'a>>, Loc<Expr<'a>>, Loc<Expr<'a>>)),
Operator(&'a (Loc<Expr<'a>>, Loc<Operator>, Loc<Expr<'a>>)),
// Runtime errors
MalformedStr(Box<[Loc<Problem>]>),
MalformedInt(Problem),
MalformedFloat(Problem),
}
#[derive(Clone, Debug, PartialEq)]
pub enum Pattern<'a> {
// Identifier
Identifier(&'a Ident),
Identifier(&'a str),
// Variant
Variant(&'a VariantName),
@ -82,33 +76,35 @@ pub enum Pattern<'a> {
#[test]
fn expr_size() {
// The size of the Expr data structure should be exactly 3 machine words.
// The size of the Expr data structure should be exactly 5 machine words.
// This test helps avoid regressions which accidentally increase its size!
//
// Worth noting that going up to 4 machine words is probably not a big deal;
// a 64-byte cache line will only fit 2 of these regardless.
assert_eq!(
std::mem::size_of::<Expr>(),
// TODO [move this comment to an issue] We should be able to get this
// down to 2, which would mean we could fit 4 of these nodes in a single
// 64-byte cache line instead of only being able to fit 2.
// 64-byte cache line instead of only being able to fit 1.
//
// Doing this would require, among other things:
// 1. Making a str replacement where the length is stored as u32 instead of usize,
// to leave room for the tagged union's u8 tag.
// (Alternatively could store it as (&'a &'a str), but ew.)
// 2. Figuring out why &'a (Foo, Bar) by default takes up 24 bytes in Rust.
// 2. Similarly, making a slice replacement like that str replacement, and
// also where it doesn't share the bytes with anything else - so its
// elements can be consumed without having to clone them (unlike a slice).
// That's the only reason we're using Vec right now instead of slices -
// if we used slices, we'd have to clone their elements during canonicalization
// just to iterate over them and canonicalize them normally.
// 3. Figuring out why (&'a (Foo, Bar)) by default takes up 24 bytes in Rust.
// I assume it's because the struct is being stored inline instead of
// as a pointer, but in this case we actually do want the pointer!
// We want to have the lifetime and we want to avoid using the unsafe keyword,
// but we also want this to only store 1 pointer in the AST node.
// Hopefully there's a way!
//
// It's also possible that going up to 4 machine words might yield even
// better performance, due to more data structures being inlinable,
// and therefore having fewer pointers to chase. This seems worth
// investigating as well.
std::mem::size_of::<usize>() * 3
// It's also possible that 4 machine words might yield better performance
// than 2, due to more data structures being inlinable, and therefore
// having fewer pointers to chase. This seems worth investigating as well.
std::mem::size_of::<usize>() * 5
);
}
@ -151,6 +147,7 @@ pub enum Attempting {
List,
Keyword,
StringLiteral,
RecordLiteral,
InterpolatedString,
NumberLiteral,
UnicodeEscape,
@ -165,7 +162,15 @@ impl<'a> Display for Expr<'a> {
match self {
EmptyStr => write!(f, "\"\""),
_ => panic!("TODO"),
Str(string) => write!(f, "\"{}\"", string),
BlockStr(lines) => write!(f, "\"\"\"{}\"\"\"", lines.join("\n")),
Int(string) => string.fmt(f),
Float(string) => string.fmt(f),
HexInt(string) => write!(f, "0x{}", string),
BinaryInt(string) => write!(f, "0b{}", string),
OctalInt(string) => write!(f, "0o{}", string),
EmptyRecord => write!(f, "{}", "{}"),
other => panic!("TODO implement Display for AST variant {:?}", other),
}
}
}

View file

@ -1,18 +1,76 @@
pub mod ast;
pub mod ident;
pub mod module;
pub mod number_literal;
pub mod parser;
pub mod problems;
pub mod string_literal;
use bumpalo::Bump;
use operator::Operator;
use parse::ast::{Attempting, Expr};
use parse::number_literal::number_literal;
use parse::parser::{attempt, one_of2, Parser};
use parse::parser::{
and, attempt, lazy, loc, map, map_with_arena, one_of3, optional, string, unexpected,
unexpected_eof, val, Parser, State,
};
use parse::string_literal::string_literal;
pub fn expr<'a>() -> impl Parser<'a, Expr<'a>> {
attempt(
Attempting::Expression,
one_of2(number_literal(), string_literal()),
map_with_arena(
and(
attempt(
Attempting::Expression,
loc(one_of3(
record_literal(),
number_literal(),
string_literal(),
)),
),
optional(and(loc(operator()), loc(val(Expr::Str("blah"))))),
),
|arena, (loc_expr1, opt_operator)| match opt_operator {
Some((loc_op, loc_expr2)) => {
let tuple = arena.alloc((loc_expr1, loc_op, loc_expr2));
Expr::Operator(tuple)
}
None => loc_expr1.value,
},
)
}
/// Parser for a binary operator.
///
/// NOTE: this is currently a placeholder. It is built from `val`, so it
/// yields `Operator::Plus` no matter what the input contains. The
/// commented-out code below sketches the intended shape: matching the
/// literal strings `+`, `-` and `*` and mapping each to its `Operator`.
pub fn operator<'a>() -> impl Parser<'a, Operator> {
val(Operator::Plus)
// one_of3(
// map(string("+"), |_| Operator::Plus),
// map(string("-"), |_| Operator::Minus),
// map(string("*"), |_| Operator::Star),
// )
}
/// Parser for a record literal.
///
/// Only the empty record `{}` is recognised: the input must begin with `{`
/// immediately followed by `}`. On success the parser yields
/// `Expr::EmptyRecord` and advances the state by 2 characters.
///
/// If either position holds a different character, the result is an
/// `unexpected` failure for that character; if the input ends first, the
/// result is an `unexpected_eof` failure. Both are reported while attempting
/// `Attempting::RecordLiteral`.
pub fn record_literal<'a>() -> impl Parser<'a, Expr<'a>> {
    move |_arena: &'a Bump, state: State<'a>| {
        let mut remaining = state.input.chars();

        // The literal has to open with `{`.
        let opener = remaining.next();
        if opener != Some('{') {
            return match opener {
                Some(ch) => Err(unexpected(ch, 0, state, Attempting::RecordLiteral)),
                None => Err(unexpected_eof(0, Attempting::RecordLiteral, state)),
            };
        }

        // ...and the very next character has to close it.
        let closer = remaining.next();
        if closer == Some('}') {
            // 2 because `{}` has length 2
            let next_state = state.advance_without_indenting(2)?;

            return Ok((Expr::EmptyRecord, next_state));
        }

        match closer {
            Some(ch) => Err(unexpected(ch, 0, state, Attempting::RecordLiteral)),
            None => Err(unexpected_eof(0, Attempting::RecordLiteral, state)),
        }
    }
}

16
src/parse/module.rs Normal file
View file

@ -0,0 +1,16 @@
use ident::Ident;
use parse::ast::{Expr, Pattern};
/// A module: its name, the identifiers it exposes and uses, and the
/// declarations it contains.
pub struct Module<'a> {
/// The module's name.
pub name: Ident,
/// Identifiers the module exposes.
/// NOTE(review): presumably the names visible to other modules - confirm.
pub exposes: Vec<Ident>,
/// Identifiers the module uses.
/// NOTE(review): presumably names brought in from other modules - confirm.
pub uses: Vec<Ident>,
/// The declarations that make up the module.
pub decls: Vec<Decl<'a>>,
}
/// A single declaration, as stored in `Module::decls`.
#[derive(Clone, Debug, PartialEq)]
pub enum Decl<'a> {
/// A definition made of a pattern and two expressions.
/// NOTE(review): the role of each expression is not stated here -
/// presumably the defined value and some related expression; confirm
/// against the code that constructs `Def`.
Def(Pattern<'a>, Expr<'a>, Expr<'a>),
// TODO Alias
// TODO SumType
}

View file

@ -1,19 +1,16 @@
use bumpalo::collections::string::String;
use bumpalo::Bump;
use parse::ast::{Attempting, Expr};
use parse::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
use parse::problems::Problem;
use std::char;
pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
move |arena: &'a Bump, state: State<'a>| {
move |_arena, state: State<'a>| {
let mut chars = state.input.chars();
match chars.next() {
Some(first_ch) => {
// Number literals must start with either an '-' or a digit.
if first_ch == '-' || first_ch.is_ascii_digit() {
parse_number_literal(first_ch, &mut chars, arena, state)
parse_number_literal(first_ch, &mut chars, state)
} else {
Err(unexpected(
first_ch,
@ -32,61 +29,69 @@ pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
fn parse_number_literal<'a, I>(
first_ch: char,
chars: &mut I,
arena: &'a Bump,
state: State<'a>,
) -> ParseResult<'a, Expr<'a>>
where
I: Iterator<Item = char>,
{
let mut before_decimal = String::with_capacity_in(1, arena);
let mut after_decimal = String::new_in(arena);
let mut has_decimal_point = false;
let mut chars_skipped = 0;
use self::LiteralType::*;
// Put the first character into the buffer, even if all we've parsed so
// far is a minus sign.
//
// We have to let i64::parse handle the minus sign (if it's there), because
// otherwise if we ask it to parse i64::MIN.to_string() as a positive i64,
// it errors because that positive number doesn't fit in an i64!
before_decimal.push(first_ch);
let mut typ = Int;
// We already parsed 1 character (which may have been a minus sign).
let mut chars_parsed = 1;
while let Some(next_ch) = chars.next() {
match next_ch {
digit if next_ch.is_ascii_digit() => {
if has_decimal_point {
after_decimal.push(digit);
} else {
before_decimal.push(digit);
}
}
'_' => {
// Underscores are allowed, and disregarded.
chars_skipped += 1;
}
'.' => {
if has_decimal_point {
// You only get one decimal point!
let len = before_decimal.len() + after_decimal.len() + chars_skipped;
chars_parsed += 1;
return Err(unexpected('.', len, state, Attempting::NumberLiteral));
} else {
chars_skipped += 1;
has_decimal_point = true;
}
}
invalid_char => {
if before_decimal.is_empty() {
// No digits! We likely parsed a minus sign that's actually an operator.
let len = before_decimal.len() + after_decimal.len() + chars_skipped;
return Err(unexpected(
invalid_char,
len,
state,
Attempting::NumberLiteral,
));
}
let err_unexpected = || {
Err(unexpected(
next_ch,
chars_parsed,
state.clone(),
Attempting::NumberLiteral,
))
};
// Returns true iff so far we have parsed the given char and no other chars.
let so_far_parsed = |ch| chars_parsed == 2 && first_ch == ch;
// We don't support negative escaped ints (e.g. 0x01 is supported but -0x01 is not).
// If you want that, do something like (negate 0x01).
//
// I'm open to changing this policy (that is, allowing support for
// negative escaped ints), but it'll complicate parsing logic and seems
// nonessential, so I'm leaving it out for now.
if next_ch == '.' {
if typ == Float {
// You only get one decimal point!
return err_unexpected();
} else {
typ = Float;
}
} else if next_ch == 'x' {
if so_far_parsed('0') {
typ = Hex;
} else {
return err_unexpected();
}
} else if next_ch == 'b' {
if so_far_parsed('0') {
typ = Binary;
} else {
return err_unexpected();
}
} else if next_ch == 'o' {
if so_far_parsed('0') {
typ = Octal;
} else {
return err_unexpected();
}
} else if !next_ch.is_ascii_digit() && next_ch != '_' {
if so_far_parsed('-') {
// No digits! We likely parsed a minus sign that's actually an operator.
return err_unexpected();
} else {
// We hit an invalid number literal character; we're done!
break;
}
@ -96,41 +101,25 @@ where
// At this point we have a number, and will definitely succeed.
// If the number is malformed (outside the supported range),
// we'll succeed with an appropriate Expr which records that.
let expr = if has_decimal_point {
let mut f64_buf = String::with_capacity_in(
before_decimal.len()
// +1 for the decimal point itself
+ 1
+ after_decimal.len(),
arena,
);
f64_buf.push_str(&before_decimal);
f64_buf.push('.');
f64_buf.push_str(&after_decimal);
// TODO [convert this comment to an issue] - we can get better
// performance here by inlining string.parse() for the f64 case,
// since we've already done the work of validating that each char
// is a digit, plus we also already separately parsed the minus
// sign and dot.
match f64_buf.parse::<f64>() {
Ok(float) if float.is_finite() => Expr::Float(float),
_ => Expr::MalformedFloat(Problem::OutsideSupportedRange),
}
} else {
// TODO [convert this comment to an issue] - we can get better
// performance here by inlining string.parse() for the i64 case,
// since we've already done the work of validating that each char
// is a digit.
match before_decimal.parse::<i64>() {
Ok(int_val) => Expr::Int(int_val),
Err(_) => Expr::MalformedInt(Problem::OutsideSupportedRange),
}
let expr = match typ {
Int => Expr::Int(&state.input[0..chars_parsed]),
Float => Expr::Float(&state.input[0..chars_parsed]),
// For these we trim off the 0x/0o/0b part
Hex => Expr::HexInt(&state.input[2..chars_parsed - 1]),
Binary => Expr::BinaryInt(&state.input[2..chars_parsed - 1]),
Octal => Expr::OctalInt(&state.input[2..chars_parsed - 1]),
};
let total_chars_parsed = before_decimal.len() + chars_skipped;
let state = state.advance_without_indenting(total_chars_parsed)?;
let next_state = state.advance_without_indenting(chars_parsed)?;
Ok((expr, state))
Ok((expr, next_state))
}
/// The kind of number literal being parsed. `parse_number_literal` tracks
/// this while scanning and uses it to pick which `Expr` variant to build.
#[derive(Debug, PartialEq, Eq)]
enum LiteralType {
/// The starting kind: no `.`, `x`, `o` or `b` marker has been seen.
Int,
/// A `.` was encountered.
Float,
/// An `x` was encountered right after a lone leading `0`.
Hex,
/// An `o` was encountered right after a lone leading `0`.
Octal,
/// A `b` was encountered right after a lone leading `0`.
Binary,
}

View file

@ -1,7 +1,7 @@
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use parse::ast::Attempting;
use region::Region;
use region::{Located, Region};
use std::char;
// Strategy:
@ -190,6 +190,21 @@ pub trait Parser<'a, Output> {
fn parse(&self, &'a Bump, State<'a>) -> ParseResult<'a, Output>;
}
/// A parser held behind a `dyn Parser` reference, so the concrete parser
/// type does not appear in the type of the holder.
pub struct BoxedParser<'a, Output> {
    parser: &'a (dyn Parser<'a, Output> + 'a),
}

impl<'a, Output> BoxedParser<'a, Output> {
    /// Moves `parser` into `arena` and wraps the resulting reference.
    ///
    /// The allocation lives as long as the arena borrow (`'a`), which is why
    /// the parser itself must also be valid for `'a`.
    fn new<P>(arena: &'a Bump, parser: P) -> Self
    where
        P: Parser<'a, Output> + 'a,
    {
        // Erase the concrete parser type at the point of allocation.
        let parser: &'a (dyn Parser<'a, Output> + 'a) = arena.alloc(parser);

        BoxedParser { parser }
    }
}
impl<'a, F, Output> Parser<'a, Output> for F
where
F: Fn(&'a Bump, State<'a>) -> ParseResult<'a, Output>,
@ -199,6 +214,22 @@ where
}
}
/// A parser that always succeeds with a clone of `value`.
///
/// The incoming state is handed back untouched, so no input is consumed.
pub fn val<'a, Val>(value: Val) -> impl Parser<'a, Val>
where
    Val: Clone,
{
    move |_arena, state| {
        // Clone on every run: the parser may be invoked more than once.
        let output = value.clone();

        Ok((output, state))
    }
}
/// Defers building a parser until it is actually run.
///
/// Each time the returned parser is invoked, `get_parser` is called to build
/// a fresh parser, which is then immediately run on the given arena and
/// state. Needed for recursive parsers.
pub fn lazy<'a, F, P, Val>(get_parser: F) -> impl Parser<'a, Val>
where
    F: Fn() -> P,
    P: Parser<'a, Val>,
{
    move |arena, state| {
        let parser = get_parser();

        parser.parse(arena, state)
    }
}
pub fn map<'a, P, F, Before, After>(parser: P, transform: F) -> impl Parser<'a, After>
where
P: Parser<'a, Before>,
@ -211,6 +242,18 @@ where
}
}
/// Runs `parser` and, if it succeeds, converts its output with `transform`.
///
/// Unlike a plain map, `transform` also receives the arena, so it can
/// allocate while building the new output. Failures from `parser` are
/// passed through unchanged.
pub fn map_with_arena<'a, P, F, Before, After>(parser: P, transform: F) -> impl Parser<'a, After>
where
    P: Parser<'a, Before>,
    F: Fn(&'a Bump, Before) -> After,
{
    move |arena, state| match parser.parse(arena, state) {
        Ok((output, next_state)) => Ok((transform(arena, output), next_state)),
        Err(failure) => Err(failure),
    }
}
pub fn attempt<'a, P, Val>(attempting: Attempting, parser: P) -> impl Parser<'a, Val>
where
P: Parser<'a, Val>,
@ -226,6 +269,32 @@ where
}
}
/// Wraps the output of `parser` in a `Located`, recording the region it
/// covered.
///
/// The region starts at the line and column of the state before `parser`
/// runs, and ends at the line and column of the state it returns. Failures
/// are passed through unchanged.
pub fn loc<'a, P, Val>(parser: P) -> impl Parser<'a, Located<Val>>
where
    P: Parser<'a, Val>,
{
    move |arena, state: State<'a>| {
        // Capture the starting position first; `state` is consumed below.
        let (start_line, start_col) = (state.line, state.column);

        parser.parse(arena, state).map(|(value, next_state)| {
            let region = Region {
                start_line,
                start_col,
                end_line: next_state.line,
                end_col: next_state.column,
            };

            (Located { region, value }, next_state)
        })
    }
}
pub fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<'a, A>>
where
P: Parser<'a, A>,
@ -317,6 +386,7 @@ pub fn string<'a>(string: &'static str) -> impl Parser<'a, ()> {
let input = state.input;
let len = string.len();
// TODO do this comparison in one SIMD instruction (on supported systems)
match input.get(0..len) {
Some(next_str) if next_str == string => Ok(((), state.advance_without_indenting(len)?)),
_ => Err(unexpected_eof(len, Attempting::Keyword, state)),
@ -378,6 +448,46 @@ where
// satisfies(any, |ch| ch.is_whitespace())
// }
/// Runs `p1`, then runs `p2` on the state `p1` produced, and yields both
/// outputs as a pair.
///
/// If either parser fails, its failure is returned with `attempting` reset
/// to whatever the state was attempting before `p1` ran; the rest of the
/// failure and the accompanying state are passed along as-is.
pub fn and<'a, P1, P2, A, B>(p1: P1, p2: P2) -> impl Parser<'a, (A, B)>
where
    P1: Parser<'a, A>,
    P2: Parser<'a, B>,
{
    move |arena: &'a Bump, state: State<'a>| {
        // Read this before `state` is moved into the first parser.
        let original_attempting = state.attempting;

        p1.parse(arena, state)
            .and_then(|(out1, state)| {
                p2.parse(arena, state)
                    .map(|(out2, state)| ((out1, out2), state))
            })
            // Whichever parser failed, report it against the original attempt.
            .map_err(|(fail, state)| {
                (
                    Fail {
                        attempting: original_attempting,
                        ..fail
                    },
                    state,
                )
            })
    }
}
/// Makes `parser` optional: this parser never fails.
///
/// If `parser` succeeds, its output is wrapped in `Some`. If it fails, the
/// failure is discarded and `None` is produced, continuing from the state
/// that accompanied the failure.
pub fn optional<'a, P, T>(parser: P) -> impl Parser<'a, Option<T>>
where
    P: Parser<'a, T>,
{
    move |arena: &'a Bump, state: State<'a>| {
        let (output, next_state) = match parser.parse(arena, state) {
            Ok((value, after)) => (Some(value), after),
            // Drop the failure itself; only the state it carried is kept.
            Err((_, after)) => (None, after),
        };

        Ok((output, next_state))
    }
}
pub fn one_of2<'a, P1, P2, A>(p1: P1, p2: P2) -> impl Parser<'a, A>
where
P1: Parser<'a, A>,

View file

@ -1,18 +1,11 @@
use bumpalo::collections::string::String;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use parse::ast::{Attempting, Expr};
use parse::ident;
use parse::parser::{unexpected, unexpected_eof, Fail, Parser, State};
use parse::problems::{Problem, Problems};
use region::{Loc, Region};
use parse::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
use std::char;
use std::iter::Peekable;
pub fn string_literal<'a>() -> impl Parser<'a, Expr<'a>> {
move |arena: &'a Bump, state: State<'a>| {
let mut problems = std::vec::Vec::new();
let mut chars = state.input.chars().peekable();
let mut chars = state.input.chars();
// String literals must start with a quote.
// If this doesn't, it must not be a string literal!
@ -26,464 +19,75 @@ pub fn string_literal<'a>() -> impl Parser<'a, Expr<'a>> {
}
}
// If we have precisely an empty string here, don't bother allocating
// a buffer; instead, return EmptyStr immediately.
if chars.peek() == Some(&'"') {
return Ok((
Expr::EmptyStr,
// 2 because `""` has length 2
state.advance_without_indenting(2)?,
));
}
// Stores the accumulated string characters
let mut buf = String::new_in(arena);
// This caches the total string length of interpolated_pairs. Every
// time we add a new pair to interpolated_pairs, we increment this
// by the sum of whatever we parsed in order to obtain that pair.
let mut buf_col_offset: usize = 0;
// Stores interpolated identifiers, if any.
let mut interpolated_pairs = Vec::new_in(arena);
// At the parsing stage we keep the entire raw string, because the formatter
// needs the raw string. (For example, so it can "remember" whether you
// wrote \u{...} or the actual unicode character itself.)
//
// Later, in canonicalization, we'll do things like resolving
// unicode escapes and string interpolation.
//
// Since we're keeping the entire raw string, all we need to track is
// how many characters we've parsed. So far, that's 1 (the opening `"`).
let mut parsed_chars = 1;
let mut prev_ch = '"';
while let Some(ch) = chars.next() {
match ch {
// If it's a backslash, escape things.
'\\' => match chars.next() {
Some(next_ch) => {
if let Some(ident) = handle_escaped_char(
arena,
&state,
next_ch,
&mut chars,
&mut buf,
&mut problems,
)? {
let expr = Expr::Var(ident);
parsed_chars += 1;
// +2 for `\(` and then another +1 for `)` at the end
let parsed_length = buf.len() + 2 + ident.len() + 1;
// It's okay if casting fails in this section, because
// we're going to check for line length overflow at the
// end anyway. That will render this region useless,
// but the user wasn't going to see this region
// anyway if the line length overflowed.
let start_line = state.line;
// Subtract ident length and another 1 for the `)`
let start_col = state.column
+ buf_col_offset as u16
+ (parsed_length - ident.len() - 1) as u16;
let ident_region = Region {
start_line,
start_col,
end_line: start_line,
end_col: start_col + ident.len() as u16 - 1,
};
let loc_expr = Loc {
region: ident_region,
value: expr,
};
// Push the accumulated string into the pairs list,
// along with the ident that came after it.
interpolated_pairs.push((buf.into_bump_str(), loc_expr));
// Reset the buffer so we start working on a new string.
buf = String::new_in(arena);
// Advance the cached offset of how many chars we've parsed,
// so the next time we see an interpolated ident, we can
// correctly calculate its region.
buf_col_offset += parsed_length;
}
}
None => {
// We ran out of characters before finding a closed quote;
// let the loop finish normally, so we end up returning
// the error that the string was not terminated.
//
// (There's the separate problem of a trailing backslash,
// but often that will get fixed in the course of
// addressing the missing closed quote.)
()
}
},
'"' => {
// We found a closed quote; this is the end of the string!
let len_with_quotes = buf.len() + 2;
let expr = if problems.is_empty() {
let final_str = buf.into_bump_str();
if interpolated_pairs.is_empty() {
Expr::Str(final_str)
} else {
let tuple_ref =
arena.alloc((interpolated_pairs.into_bump_slice(), final_str));
Expr::InterpolatedStr(tuple_ref)
}
// Potentially end the string (unless this is an escaped `"`!)
if ch == '"' && prev_ch != '\\' {
let expr = if parsed_chars == 2 {
if let Some('"') = chars.next() {
// If the first three chars were all `"`, then this
// literal begins with `"""` and is a block string.
return parse_block_string(arena, state, &mut chars);
} else {
Expr::MalformedStr(problems.into_boxed_slice())
};
Expr::EmptyStr
}
} else {
// Start at 1 so we omit the opening `"`.
// Subtract 1 from parsed_chars so we omit the closing `"`.
Expr::Str(&state.input[1..(parsed_chars - 1)])
};
let next_state = state.advance_without_indenting(len_with_quotes)?;
let next_state = state.advance_without_indenting(parsed_chars)?;
return Ok((expr, next_state));
}
'\t' => {
// Report the problem and continue. Tabs are syntax errors,
// but maybe the rest of the string is fine!
problems.push(loc_char(Problem::Tab, &state, buf.len()));
}
'\r' => {
// Carriage returns aren't allowed in string literals,
// but maybe the rest of the string is fine!
problems.push(loc_char(Problem::CarriageReturn, &state, buf.len()));
}
'\n' => {
// We hit a newline before a close quote.
// We can't safely assume where the string was supposed
// to end, so this is an unrecoverable error.
return Err(unexpected('\n', 0, state, Attempting::StringLiteral));
}
normal_char => buf.push(normal_char),
return Ok((expr, next_state));
} else if ch == '\n' {
// This is a single-line string, which cannot have newlines!
// Treat this as an unclosed string literal, and consume
// all remaining chars. This will mask all other errors, but
// it should make it easiest to debug; the file will be a giant
// error starting from where the open quote appeared.
return Err(unexpected(
'\n',
state.input.len() - 1,
state,
Attempting::StringLiteral,
));
} else {
prev_ch = ch;
}
}
// We ran out of characters before finding a closed quote
Err(unexpected_eof(
buf.len(),
parsed_chars,
Attempting::StringLiteral,
state.clone(),
))
}
}
fn loc_char<'a, V>(value: V, state: &State<'a>, buf_len: usize) -> Loc<V> {
let start_line = state.line;
let start_col = state.column + buf_len as u16;
let end_line = start_line;
// All invalid chars should have a length of 1
let end_col = state.column + 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
Loc { region, value }
}
fn loc_escaped_char<'a, V>(value: V, state: &State<'a>, buf_len: usize) -> Loc<V> {
let start_line = state.line;
let start_col = state.column + buf_len as u16;
let end_line = start_line;
// escapes should all be 2 chars long
let end_col = state.column + 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
Loc { region, value }
}
fn loc_escaped_unicode<'a, V>(
value: V,
state: &State<'a>,
buf_len: usize,
hex_str_len: usize,
) -> Loc<V> {
let start_line = state.line;
// +1 due to the `"` which precedes buf.
let start_col = state.column + buf_len as u16 + 1;
let end_line = start_line;
// +3 due to the `\u{` and another + 1 due to the `}`
// -1 to prevent overshooting because end col is inclusive.
let end_col = start_col + 3 + hex_str_len as u16 + 1 - 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
Loc { region, value }
}
#[inline(always)]
fn handle_escaped_char<'a, I>(
arena: &'a Bump,
state: &State<'a>,
ch: char,
chars: &mut Peekable<I>,
buf: &mut String<'a>,
problems: &mut Problems,
) -> Result<Option<&'a str>, (Fail, State<'a>)>
fn parse_block_string<'a, I>(
_arena: &'a Bump,
_state: State<'a>,
_chars: &mut I,
) -> ParseResult<'a, Expr<'a>>
where
I: Iterator<Item = char>,
{
match ch {
'\\' => buf.push('\\'),
'"' => buf.push('"'),
't' => buf.push('\t'),
'n' => buf.push('\n'),
'r' => buf.push('\r'),
'0' => buf.push('\0'), // We explicitly support null characters, as we
// can't be sure we won't receive them from Rust.
'u' => handle_escaped_unicode(arena, &state, chars, buf, problems)?,
'(' => {
let ident = parse_interpolated_ident(arena, state, chars)?;
return Ok(Some(ident));
}
'\t' => {
// Report and continue.
// Tabs are syntax errors, but maybe the rest of the string is fine!
problems.push(loc_escaped_char(Problem::Tab, &state, buf.len()));
}
'\r' => {
// Report and continue.
// Carriage returns aren't allowed in string literals,
// but maybe the rest of the string is fine!
problems.push(loc_escaped_char(Problem::CarriageReturn, &state, buf.len()));
}
'\n' => {
// Report and bail out.
// We can't safely assume where the string was supposed to end.
problems.push(loc_escaped_char(
Problem::NewlineInLiteral,
&state,
buf.len(),
));
return Err(unexpected_eof(
buf.len(),
Attempting::UnicodeEscape,
state.clone(),
));
}
_ => {
// Report and continue.
// An unsupported escaped char (e.g. \q) shouldn't halt parsing.
problems.push(loc_escaped_char(
Problem::UnsupportedEscapedChar,
&state,
buf.len(),
));
}
}
Ok(None)
}
#[inline(always)]
fn handle_escaped_unicode<'a, I>(
arena: &'a Bump,
state: &State<'a>,
chars: &mut Peekable<I>,
buf: &mut String<'a>,
problems: &mut Problems,
) -> Result<(), (Fail, State<'a>)>
where
I: Iterator<Item = char>,
{
// \u{00A0} is how you specify a Unicode code point,
// so we should always see a '{' next.
if chars.next() != Some('{') {
let start_line = state.line;
// +1 due to the `"` which precedes buf
let start_col = state.column + 1 + buf.len() as u16;
let end_line = start_line;
// All we parsed was `\u`, so end on the column after `\`'s column.
let end_col = start_col + 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
problems.push(Loc {
region,
value: Problem::NoUnicodeDigits,
});
// The rest of the string literal might be fine. Keep parsing!
return Ok(());
}
// Record the point in the string literal where we started parsing `\u`
let start_of_unicode = buf.len();
// Stores the accumulated unicode digits
let mut hex_str = String::new_in(arena);
while let Some(hex_char) = chars.next() {
match hex_char {
'}' => {
// Done! Validate and add it to the buffer.
match u32::from_str_radix(&hex_str, 16) {
Ok(code_pt) => {
if code_pt > 0x10FFFF {
let start_line = state.line;
// +1 due to the `"` which precedes buf
// +3 due to the `\u{` which precedes the hex digits
let start_col = state.column + 1 + buf.len() as u16 + 3;
let end_line = start_line;
// We want to underline only the number. That's the error!
// -1 because we want to end on the last digit, not
// overshoot it.
let end_col = start_col + hex_str.len() as u16 - 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
problems.push(Loc {
region,
value: Problem::UnicodeCodePointTooLarge,
});
} else {
// If it all checked out, add it to
// the main buffer.
match char::from_u32(code_pt) {
Some(ch) => buf.push(ch),
None => {
problems.push(loc_escaped_unicode(
Problem::InvalidUnicodeCodePoint,
&state,
start_of_unicode,
hex_str.len(),
));
}
}
}
}
Err(_) => {
let problem = if hex_str.is_empty() {
Problem::NoUnicodeDigits
} else {
Problem::NonHexCharsInUnicodeCodePoint
};
problems.push(loc_escaped_unicode(
problem,
&state,
start_of_unicode,
hex_str.len(),
));
}
}
// We are now done processing the unicode portion of the string,
// so exit the loop without further advancing the iterator.
return Ok(());
}
'\t' => {
// Report and continue.
// Tabs are syntax errors, but maybe the rest of the string is fine!
problems.push(loc_escaped_unicode(
Problem::Tab,
&state,
start_of_unicode,
hex_str.len(),
));
}
'\r' => {
// Report and continue.
// Carriage returns aren't allowed in string literals,
// but maybe the rest of the string is fine!
problems.push(loc_escaped_unicode(
Problem::CarriageReturn,
&state,
start_of_unicode,
hex_str.len(),
));
}
'\n' => {
// Report and bail out.
// We can't safely assume where the string was supposed to end.
problems.push(loc_escaped_unicode(
Problem::NewlineInLiteral,
&state,
start_of_unicode,
hex_str.len(),
));
return Err(unexpected_eof(
buf.len(),
Attempting::UnicodeEscape,
state.clone(),
));
}
normal_char => hex_str.push(normal_char),
}
// If we're about to hit the end of the string, and we didn't already
// complete parsing a valid unicode escape sequence, this is a malformed
// escape sequence - it wasn't terminated!
if chars.peek() == Some(&'"') {
// Record a problem and exit the loop early, so the string literal
// parsing logic can consume the quote and do its job as normal.
let start_line = state.line;
// +1 due to the `"` which precedes buf.
let start_col = state.column + buf.len() as u16 + 1;
let end_line = start_line;
// +3 due to the `\u{`
// -1 to prevent overshooting because end col is inclusive.
let end_col = start_col + 3 + hex_str.len() as u16 - 1;
let region = Region {
start_line,
start_col,
end_line,
end_col,
};
problems.push(Loc {
region,
value: Problem::MalformedEscapedUnicode,
});
return Ok(());
}
}
Ok(())
}
#[inline(always)]
fn parse_interpolated_ident<'a, I>(
arena: &'a Bump,
state: &State<'a>,
chars: &mut Peekable<I>,
) -> Result<&'a str, (Fail, State<'a>)>
where
I: Iterator<Item = char>,
{
// This will return Err on invalid identifiers like "if"
let ((string, next_char), state) = ident::parse_into(arena, chars, state.clone())?;
// Make sure we got a closing ) to end the interpolation.
match next_char {
Some(')') => Ok(string),
Some(ch) => Err(unexpected(ch, 0, state, Attempting::InterpolatedString)),
None => Err(unexpected_eof(0, Attempting::InterpolatedString, state)),
}
// So far we have consumed the `"""` and that's it.
let _parsed_chars = 3;
panic!("TODO parse block string, advance state, etc");
}