Lazily validate that input bytes are valid UTF-8

This commit is contained in:
Richard Feldman 2020-07-25 22:12:42 -04:00
parent 15f087c93e
commit 9f9ce327d4
21 changed files with 709 additions and 626 deletions

View file

@ -1,14 +1,16 @@
use crate::ast::Attempting;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use encode_unicode::CharExt;
use roc_region::all::{Located, Region};
use std::{char, u16};
use std::str::from_utf8;
use std::{char, mem, u16};
/// A position in a source file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct State<'a> {
/// The raw input string.
pub input: &'a str,
/// The raw input bytes from the file.
pub bytes: &'a [u8],
/// Current line of the input
pub line: u32,
@ -39,15 +41,15 @@ pub enum Either<First, Second> {
}
impl<'a> State<'a> {
pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
pub fn new(bytes: &'a [u8], attempting: Attempting) -> State<'a> {
State {
input,
bytes,
line: 0,
column: 0,
indent_col: 0,
is_indenting: true,
attempting,
original_len: input.len(),
original_len: bytes.len(),
}
}
@ -69,7 +71,7 @@ impl<'a> State<'a> {
///
/// So if the parser has consumed 8 bytes, this function will return 8.
pub fn bytes_consumed(&self) -> usize {
self.original_len - self.input.len()
self.original_len - self.bytes.len()
}
/// Increments the line, then resets column, indent_col, and is_indenting.
@ -77,7 +79,7 @@ impl<'a> State<'a> {
pub fn newline(&self) -> Result<Self, (Fail, Self)> {
match self.line.checked_add(1) {
Some(line) => Ok(State {
input: &self.input[1..],
bytes: &self.bytes[1..],
line,
column: 0,
indent_col: 0,
@ -103,7 +105,7 @@ impl<'a> State<'a> {
match (self.column as usize).checked_add(quantity) {
Some(column_usize) if column_usize <= u16::MAX as usize => {
Ok(State {
input: &self.input[quantity..],
bytes: &self.bytes[quantity..],
line: self.line,
column: column_usize as u16,
indent_col: self.indent_col,
@ -141,7 +143,7 @@ impl<'a> State<'a> {
};
Ok(State {
input: &self.input[spaces..],
bytes: &self.bytes[spaces..],
line: self.line,
column: column_usize as u16,
indent_col,
@ -169,6 +171,17 @@ impl<'a> State<'a> {
end_line: self.line,
}
}
/// Return a failing ParseResult for the given FailReason
pub fn fail<T>(self, reason: FailReason) -> Result<(T, Self), (Fail, Self)> {
Err((
Fail {
reason,
attempting: self.attempting,
},
self,
))
}
}
#[test]
@ -182,13 +195,14 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FailReason {
Unexpected(char, Region),
Unexpected(Region),
OutdentedTooFar,
ConditionFailed,
LineTooLong(u32 /* which line was too long */),
TooManyLines,
Eof(Region),
InvalidPattern,
BadUtf8,
ReservedKeyword(Region),
ArgumentsBeforeEquals(Region),
}
@ -332,13 +346,12 @@ pub fn unexpected_eof(
}
pub fn unexpected(
ch: char,
chars_consumed: usize,
state: State<'_>,
attempting: Attempting,
) -> (Fail, State<'_>) {
checked_unexpected(chars_consumed, state, |region| Fail {
reason: FailReason::Unexpected(ch, region),
reason: FailReason::Unexpected(region),
attempting,
})
}
@ -385,9 +398,9 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
// (for example) the LineTooLong initially occurs in the middle of
// a one_of chain, which would otherwise prevent it from propagating.
let column = u16::MAX;
let input = state.input.get(0..state.input.len()).unwrap();
let bytes = state.bytes.get(0..state.bytes.len()).unwrap();
let state = State {
input,
bytes,
line: state.line,
indent_col: state.indent_col,
is_indenting: state.is_indenting,
@ -399,28 +412,69 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
(fail, state)
}
/// A single char.
pub fn char<'a>(expected: char) -> impl Parser<'a, ()> {
move |_arena, state: State<'a>| match state.input.chars().next() {
Some(actual) if expected == actual => Ok(((), state.advance_without_indenting(1)?)),
Some(other_ch) => Err(unexpected(other_ch, 0, state, Attempting::Keyword)),
/// A single ASCII char.
pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
// Make sure this really is an ASCII char!
debug_assert!(expected.len_utf8() == 1);
move |_arena, state: State<'a>| match state.bytes.first() {
Some(&actual) if expected == actual as char => {
Ok(((), state.advance_without_indenting(1)?))
}
Some(_) => Err(unexpected(0, state, Attempting::Keyword)),
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
}
}
/// A hardcoded keyword string with no newlines in it.
pub fn string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// We can't have newlines because we don't attempt to advance the row
// in the state, only the column.
debug_assert!(!keyword.contains('\n'));
/// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8.
pub fn utf8_char<'a>() -> impl Parser<'a, char> {
move |_arena, state: State<'a>| {
if !state.bytes.is_empty() {
match char::from_utf8_slice_start(state.bytes) {
Ok((ch, bytes_parsed)) => {
return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
}
Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
}
} else {
Err(unexpected_eof(0, state.attempting, state))
}
}
}
/// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8, but it will *not* advance the state.
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
match char::from_utf8_slice_start(state.bytes) {
Ok((ch, _)) => Ok(ch),
Err(_) => Err(Fail {
reason: dbg!(FailReason::BadUtf8),
attempting: state.attempting,
}),
}
}
/// A hardcoded string consisting only of ASCII characters.
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// Verify that this really is exclusively ASCII characters.
// The `unsafe` block in this function relies upon this assumption!
//
// Also, this can't have newlines because we don't attempt to advance
// the row in the state, only the column.
debug_assert!(keyword.chars().all(|ch| ch.len_utf8() == 1 && ch != '\n'));
move |_arena, state: State<'a>| {
let input = state.input;
let len = keyword.len();
// TODO do this comparison in one SIMD instruction (on supported systems)
match input.get(0..len) {
Some(next_str) if next_str == keyword => {
match state.bytes.get(0..len) {
// SAFETY: Roc language keywords are statically known to contain only
// ASCII characters, which means their &str will be 100% u8 values in
// memory, and thus can be safely interpreted as &[u8]
Some(next_str)
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
{
Ok(((), state.advance_without_indenting(len)?))
}
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
@ -686,7 +740,7 @@ macro_rules! collection {
// We could change the AST to add extra storage specifically to
// support empty literals containing newlines or comments, but this
// does not seem worth even the tiniest regression in compiler performance.
zero_or_more!($crate::parser::char(' ')),
zero_or_more!($crate::parser::ascii_char(' ')),
skip_second!(
$crate::parser::sep_by0(
$delimiter,
@ -912,6 +966,7 @@ macro_rules! record_field {
use $crate::ast::AssignedField::*;
use $crate::blankspace::{space0, space0_before};
use $crate::ident::lowercase_ident;
use $crate::parser::ascii_char;
use $crate::parser::Either::*;
// You must have a field name, e.g. "email"
@ -922,8 +977,8 @@ macro_rules! record_field {
// Having a value is optional; both `{ email }` and `{ email: blah }` work.
// (This is true in both literals and types.)
let (opt_loc_val, state) = $crate::parser::optional(either!(
skip_first!(char(':'), space0_before($val_parser, $min_indent)),
skip_first!(char('?'), space0_before($val_parser, $min_indent))
skip_first!(ascii_char(':'), space0_before($val_parser, $min_indent)),
skip_first!(ascii_char('?'), space0_before($val_parser, $min_indent))
))
.parse(arena, state)?;
@ -952,10 +1007,10 @@ macro_rules! record_field {
macro_rules! record_without_update {
($val_parser:expr, $min_indent:expr) => {
collection!(
char('{'),
ascii_char('{'),
loc!(record_field!($val_parser, $min_indent)),
char(','),
char('}'),
ascii_char(','),
ascii_char('}'),
$min_indent
)
};
@ -965,7 +1020,7 @@ macro_rules! record_without_update {
macro_rules! record {
($val_parser:expr, $min_indent:expr) => {
skip_first!(
$crate::parser::char('{'),
$crate::parser::ascii_char('{'),
and!(
// You can optionally have an identifier followed by an '&' to
// make this a record update, e.g. { Foo.user & username: "blah" }.
@ -981,7 +1036,7 @@ macro_rules! record {
)),
$min_indent
),
$crate::parser::char('&')
$crate::parser::ascii_char('&')
)),
loc!(skip_first!(
// We specifically allow space characters inside here, so that
@ -995,16 +1050,16 @@ macro_rules! record {
// We could change the AST to add extra storage specifically to
// support empty literals containing newlines or comments, but this
// does not seem worth even the tiniest regression in compiler performance.
zero_or_more!($crate::parser::char(' ')),
zero_or_more!($crate::parser::ascii_char(' ')),
skip_second!(
$crate::parser::sep_by0(
$crate::parser::char(','),
$crate::parser::ascii_char(','),
$crate::blankspace::space0_around(
loc!(record_field!($val_parser, $min_indent)),
$min_indent
)
),
$crate::parser::char('}')
$crate::parser::ascii_char('}')
)
))
)
@ -1067,3 +1122,10 @@ where
{
attempt!(attempting, parser)
}
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
match from_utf8(bytes) {
Ok(string) => Ok(string),
Err(_) => Err(dbg!(FailReason::BadUtf8)),
}
}