number parsing with new errors

This commit is contained in:
Folkert 2021-02-26 15:06:24 +01:00
parent d4de440943
commit bb9a2525b5
8 changed files with 187 additions and 263 deletions

View file

@ -1,189 +1,131 @@
use crate::ast::{Attempting, Base, Expr};
use crate::parser::{
parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, Progress, State, SyntaxError,
};
use bumpalo::Bump;
use crate::ast::{Base, Expr};
use crate::parser::{parse_utf8, Number, ParseResult, Parser, Progress, State, SyntaxError};
use std::char;
use std::str::from_utf8_unchecked;
pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>, SyntaxError<'a>> {
pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>, Number> {
move |arena, state: State<'a>| {
let bytes = &mut state.bytes.iter();
match bytes.next() {
Some(&first_byte) => {
// Number literals must start with either an '-' or a digit.
if first_byte == b'-' || (first_byte as char).is_ascii_digit() {
parse_number_literal(first_byte as char, bytes, arena, state)
} else {
Err(unexpected(arena, 1, Attempting::NumberLiteral, state))
}
match state.bytes.get(0) {
Some(first_byte) if *first_byte == b'-' => {
// drop the minus
parse_number_base(true, &state.bytes[1..], state)
}
Some(first_byte) if (*first_byte as char).is_ascii_digit() => {
parse_number_base(false, &state.bytes, state)
}
_ => {
// this is not a number at all
Err((Progress::NoProgress, Number::End, state))
}
None => Err(unexpected_eof(arena, state, 0)),
}
}
}
#[inline(always)]
fn parse_number_literal<'a, I>(
first_ch: char,
bytes: &mut I,
arena: &'a Bump,
fn parse_number_base<'a>(
is_negated: bool,
bytes: &'a [u8],
state: State<'a>,
) -> ParseResult<'a, Expr<'a>, SyntaxError<'a>>
where
I: Iterator<Item = &'a u8>,
{
use self::LiteralType::*;
let mut typ = Num;
// We already parsed 1 character (which may have been a minus sign).
let mut bytes_parsed = 1;
let mut prev_byte = first_ch as u8;
let mut has_parsed_digits = first_ch.is_ascii_digit();
for &next_byte in bytes {
let err_unexpected = || {
Err(unexpected(
arena,
bytes_parsed,
Attempting::NumberLiteral,
state.clone(),
))
};
let is_potentially_non_base10 = || {
(bytes_parsed == 1 && first_ch == '0')
|| (bytes_parsed == 2 && first_ch == '-' && prev_byte == b'0')
};
match next_byte as char {
'.' => {
if typ == Float {
// You only get one decimal point!
return err_unexpected();
} else {
typ = Float;
}
}
'x' => {
if is_potentially_non_base10() {
typ = Hex;
} else {
return err_unexpected();
}
}
'b' if typ == Num => {
// We have to check for typ == Num because otherwise we get a false
// positive here when parsing a hex literal that happens to have
// a 'b' in it, e.g. 0xbbbb
if is_potentially_non_base10() {
typ = Binary;
} else {
return err_unexpected();
}
}
'o' => {
if is_potentially_non_base10() {
typ = Octal;
} else {
return err_unexpected();
}
}
'_' => {
// Underscores are ignored.
}
next_ch => {
if next_ch.is_ascii_digit() {
has_parsed_digits = true;
} else {
if !has_parsed_digits {
// No digits! We likely parsed a minus sign
// that's actually a unary negation operator.
return err_unexpected();
}
// ASCII alphabetic chars (like 'a' and 'f') are
// allowed in Hex int literals. We verify them in
// canonicalization, so if there's a problem, we can
// give a more helpful error (e.g. "the character 'f'
// is not allowed in Octal literals" or
// "the character 'g' is outside the range of valid
// Hex literals") while still allowing the formatter
// to format them normally.
if !next_ch.is_ascii_alphabetic() {
// We hit an invalid number literal character; we're done!
break;
}
}
}
}
// Since we only consume characters in the ASCII range for number literals,
// this will always be exactly 1. There's no need to call next_ch.utf8_len().
bytes_parsed += 1;
prev_byte = next_byte;
}
// At this point we have a number, and will definitely succeed.
// If the number is malformed (outside the supported range),
// we'll succeed with an appropriate Expr which records that.
match typ {
Num => Ok((
Progress::from_consumed(bytes_parsed),
// SAFETY: it's safe to use from_utf8_unchecked here, because we've
// already validated that this range contains only ASCII digits
Expr::Num(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
state.advance_without_indenting(arena, bytes_parsed)?,
)),
Float => Ok((
Progress::from_consumed(bytes_parsed),
// SAFETY: it's safe to use from_utf8_unchecked here, because we've
// already validated that this range contains only ASCII digits
Expr::Float(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
state.advance_without_indenting(arena, bytes_parsed)?,
)),
// For these we trim off the 0x/0o/0b part
Hex => from_base(Base::Hex, first_ch, bytes_parsed, arena, state),
Octal => from_base(Base::Octal, first_ch, bytes_parsed, arena, state),
Binary => from_base(Base::Binary, first_ch, bytes_parsed, arena, state),
) -> ParseResult<'a, Expr<'a>, Number> {
match bytes.get(0..2) {
Some(b"0b") => chomp_number_base(Base::Binary, is_negated, &bytes[2..], state),
Some(b"0o") => chomp_number_base(Base::Octal, is_negated, &bytes[2..], state),
Some(b"0x") => chomp_number_base(Base::Hex, is_negated, &bytes[2..], state),
_ => chomp_number_dec(is_negated, bytes, state),
}
}
#[derive(Debug, PartialEq, Eq)]
enum LiteralType {
Num,
Float,
Hex,
Octal,
Binary,
}
fn from_base<'a>(
fn chomp_number_base<'a>(
base: Base,
first_ch: char,
bytes_parsed: usize,
arena: &'a Bump,
is_negative: bool,
bytes: &'a [u8],
state: State<'a>,
) -> ParseResult<'a, Expr<'a>, SyntaxError<'a>> {
let is_negative = first_ch == '-';
let bytes = if is_negative {
&state.bytes[3..bytes_parsed]
} else {
&state.bytes[2..bytes_parsed]
};
) -> ParseResult<'a, Expr<'a>, Number> {
let (_is_float, mut chomped) = chomp_number(bytes);
chomped += 2 + (is_negative as usize);
match parse_utf8(bytes) {
Ok(string) => Ok((
Progress::from_consumed(bytes_parsed),
Expr::NonBase10Int {
is_negative,
string,
base,
},
state.advance_without_indenting(arena, bytes_parsed)?,
)),
Err(reason) => state.fail(arena, Progress::from_consumed(bytes_parsed), reason),
match parse_utf8(&bytes[0..chomped]) {
Ok(string) => match state.advance_without_indenting(chomped) {
Ok(new) => {
// all is well
Ok((
Progress::MadeProgress,
Expr::NonBase10Int {
is_negative,
string,
base: Base::Binary,
},
new,
))
}
Err((_, SyntaxError::LineTooLong(_), new)) => {
// the only error we care about in this context
Err((Progress::MadeProgress, Number::LineTooLong, new))
}
Err(_) => unreachable!("we know advancing will succeed if there is space on the line"),
},
Err(_) => unreachable!("no invalid utf8 could have been chomped"),
}
}
fn chomp_number_dec<'a>(
is_negative: bool,
bytes: &'a [u8],
state: State<'a>,
) -> ParseResult<'a, Expr<'a>, Number> {
let (is_float, mut chomped) = chomp_number(bytes);
chomped += is_negative as usize;
let string = unsafe { from_utf8_unchecked(&state.bytes[0..chomped]) };
match state.advance_without_indenting(chomped) {
Ok(new) => {
// all is well
Ok((
Progress::MadeProgress,
if is_float {
Expr::Float(string)
} else {
Expr::Num(string)
},
new,
))
}
Err((_, SyntaxError::LineTooLong(_), new)) => {
// the only error we care about in this context
Err((Progress::MadeProgress, Number::LineTooLong, new))
}
Err(_) => unreachable!("we know advancing will succeed if there is space on the line"),
}
}
fn chomp_number<'a>(mut bytes: &'a [u8]) -> (bool, usize) {
let start_bytes_len = bytes.len();
let mut is_float = false;
while let Some(byte) = bytes.get(0) {
match byte {
b'.' => {
// skip, fix multiple `.`s in canonicalization
is_float = true;
bytes = &bytes[1..];
}
b'_' => {
// skip
bytes = &bytes[1..];
}
_ if byte.is_ascii_digit() || byte.is_ascii_alphabetic() => {
// valid digits (alphabetic in hex digits, and the `e` in `12e26` scientific notation
bytes = &bytes[1..];
}
_ => {
// not a valid digit; we're done
return (is_float, start_bytes_len - bytes.len());
}
}
}
// if the above loop exits, we must be dealing with an empty slice
// therefore we parsed all of the bytes in the input
(is_float, start_bytes_len)
}