Lazily validate that input bytes are valid UTF-8

2025-09-28 22:34:45 +00:00 · 2020-07-25 22:12:42 -04:00 · 2020-07-25 22:12:42 -04:00 · 9f9ce327d4
commit 9f9ce327d4
parent 15f087c93e
21 changed files with 709 additions and 626 deletions
--- a/compiler/parse/src/parser.rs
+++ b/compiler/parse/src/parser.rs
@ -1,14 +1,16 @@
 use crate::ast::Attempting;
 use bumpalo::collections::vec::Vec;
 use bumpalo::Bump;
+use encode_unicode::CharExt;
 use roc_region::all::{Located, Region};
-use std::{char, u16};
+use std::str::from_utf8;
+use std::{char, mem, u16};

 /// A position in a source file.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct State<'a> {
-    /// The raw input string.
-    pub input: &'a str,
+    /// The raw input bytes from the file.
+    pub bytes: &'a [u8],

    /// Current line of the input
    pub line: u32,
@ -39,15 +41,15 @@ pub enum Either<First, Second> {
 }

 impl<'a> State<'a> {
-    pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
+    pub fn new(bytes: &'a [u8], attempting: Attempting) -> State<'a> {
        State {
-            input,
+            bytes,
            line: 0,
            column: 0,
            indent_col: 0,
            is_indenting: true,
            attempting,
-            original_len: input.len(),
+            original_len: bytes.len(),
        }
    }

@ -69,7 +71,7 @@ impl<'a> State<'a> {
    ///
    /// So if the parser has consumed 8 bytes, this function will return 8.
    pub fn bytes_consumed(&self) -> usize {
-        self.original_len - self.input.len()
+        self.original_len - self.bytes.len()
    }

    /// Increments the line, then resets column, indent_col, and is_indenting.
@ -77,7 +79,7 @@ impl<'a> State<'a> {
    pub fn newline(&self) -> Result<Self, (Fail, Self)> {
        match self.line.checked_add(1) {
            Some(line) => Ok(State {
-                input: &self.input[1..],
+                bytes: &self.bytes[1..],
                line,
                column: 0,
                indent_col: 0,
@ -103,7 +105,7 @@ impl<'a> State<'a> {
        match (self.column as usize).checked_add(quantity) {
            Some(column_usize) if column_usize <= u16::MAX as usize => {
                Ok(State {
-                    input: &self.input[quantity..],
+                    bytes: &self.bytes[quantity..],
                    line: self.line,
                    column: column_usize as u16,
                    indent_col: self.indent_col,
@ -141,7 +143,7 @@ impl<'a> State<'a> {
                };

                Ok(State {
-                    input: &self.input[spaces..],
+                    bytes: &self.bytes[spaces..],
                    line: self.line,
                    column: column_usize as u16,
                    indent_col,
@ -169,6 +171,17 @@ impl<'a> State<'a> {
            end_line: self.line,
        }
    }
+
+    /// Return a failing ParseResult for the given FailReason
+    pub fn fail<T>(self, reason: FailReason) -> Result<(T, Self), (Fail, Self)> {
+        Err((
+            Fail {
+                reason,
+                attempting: self.attempting,
+            },
+            self,
+        ))
+    }
 }

 #[test]
@ -182,13 +195,14 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)

 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum FailReason {
-    Unexpected(char, Region),
+    Unexpected(Region),
    OutdentedTooFar,
    ConditionFailed,
    LineTooLong(u32 /* which line was too long */),
    TooManyLines,
    Eof(Region),
    InvalidPattern,
+    BadUtf8,
    ReservedKeyword(Region),
    ArgumentsBeforeEquals(Region),
 }
@ -332,13 +346,12 @@ pub fn unexpected_eof(
 }

 pub fn unexpected(
-    ch: char,
    chars_consumed: usize,
    state: State<'_>,
    attempting: Attempting,
 ) -> (Fail, State<'_>) {
    checked_unexpected(chars_consumed, state, |region| Fail {
-        reason: FailReason::Unexpected(ch, region),
+        reason: FailReason::Unexpected(region),
        attempting,
    })
 }
@ -385,9 +398,9 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
    // (for example) the LineTooLong initially occurs in the middle of
    // a one_of chain, which would otherwise prevent it from propagating.
    let column = u16::MAX;
-    let input = state.input.get(0..state.input.len()).unwrap();
+    let bytes = state.bytes.get(0..state.bytes.len()).unwrap();
    let state = State {
-        input,
+        bytes,
        line: state.line,
        indent_col: state.indent_col,
        is_indenting: state.is_indenting,
@ -399,28 +412,69 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
    (fail, state)
 }

-/// A single char.
-pub fn char<'a>(expected: char) -> impl Parser<'a, ()> {
-    move |_arena, state: State<'a>| match state.input.chars().next() {
-        Some(actual) if expected == actual => Ok(((), state.advance_without_indenting(1)?)),
-        Some(other_ch) => Err(unexpected(other_ch, 0, state, Attempting::Keyword)),
+/// A single ASCII char.
+pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
+    // Make sure this really is an ASCII char!
+    debug_assert!(expected.len_utf8() == 1);
+
+    move |_arena, state: State<'a>| match state.bytes.first() {
+        Some(&actual) if expected == actual as char => {
+            Ok(((), state.advance_without_indenting(1)?))
+        }
+        Some(_) => Err(unexpected(0, state, Attempting::Keyword)),
        _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
    }
 }

-/// A hardcoded keyword string with no newlines in it.
-pub fn string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
-    // We can't have newlines because we don't attempt to advance the row
-    // in the state, only the column.
-    debug_assert!(!keyword.contains('\n'));
+/// A single UTF-8-encoded char. This will both parse *and* validate that the
+/// char is valid UTF-8.
+pub fn utf8_char<'a>() -> impl Parser<'a, char> {
+    move |_arena, state: State<'a>| {
+        if !state.bytes.is_empty() {
+            match char::from_utf8_slice_start(state.bytes) {
+                Ok((ch, bytes_parsed)) => {
+                    return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
+                }
+                Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
+            }
+        } else {
+            Err(unexpected_eof(0, state.attempting, state))
+        }
+    }
+}
+
+/// A single UTF-8-encoded char. This will both parse *and* validate that the
+/// char is valid UTF-8, but it will *not* advance the state.
+pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
+    match char::from_utf8_slice_start(state.bytes) {
+        Ok((ch, _)) => Ok(ch),
+        Err(_) => Err(Fail {
+            reason: dbg!(FailReason::BadUtf8),
+            attempting: state.attempting,
+        }),
+    }
+}
+
+/// A hardcoded string consisting only of ASCII characters.
+pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
+    // Verify that this really is exclusively ASCII characters.
+    // The `unsafe` block in this function relies upon this assumption!
+    //
+    // Also, this can't have newlines because we don't attempt to advance
+    // the row in the state, only the column.
+    debug_assert!(keyword.chars().all(|ch| ch.len_utf8() == 1 && ch != '\n'));

    move |_arena, state: State<'a>| {
-        let input = state.input;
        let len = keyword.len();

        // TODO do this comparison in one SIMD instruction (on supported systems)
-        match input.get(0..len) {
-            Some(next_str) if next_str == keyword => {
+        match state.bytes.get(0..len) {
+            // SAFETY: Roc language keywords are statically known to contain only
+            // ASCII characters, which means their &str will be 100% u8 values in
+            // memory, and thus can be safely interpreted as &[u8]
+            Some(next_str)
+                if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
+            {
                Ok(((), state.advance_without_indenting(len)?))
            }
            _ => Err(unexpected_eof(0, Attempting::Keyword, state)),
@ -686,7 +740,7 @@ macro_rules! collection {
                // We could change the AST to add extra storage specifically to
                // support empty literals containing newlines or comments, but this
                // does not seem worth even the tiniest regression in compiler performance.
-                zero_or_more!($crate::parser::char(' ')),
+                zero_or_more!($crate::parser::ascii_char(' ')),
                skip_second!(
                    $crate::parser::sep_by0(
                        $delimiter,
@ -912,6 +966,7 @@ macro_rules! record_field {
            use $crate::ast::AssignedField::*;
            use $crate::blankspace::{space0, space0_before};
            use $crate::ident::lowercase_ident;
+            use $crate::parser::ascii_char;
            use $crate::parser::Either::*;

            // You must have a field name, e.g. "email"
@ -922,8 +977,8 @@ macro_rules! record_field {
            // Having a value is optional; both `{ email }` and `{ email: blah }` work.
            // (This is true in both literals and types.)
            let (opt_loc_val, state) = $crate::parser::optional(either!(
-                skip_first!(char(':'), space0_before($val_parser, $min_indent)),
-                skip_first!(char('?'), space0_before($val_parser, $min_indent))
+                skip_first!(ascii_char(':'), space0_before($val_parser, $min_indent)),
+                skip_first!(ascii_char('?'), space0_before($val_parser, $min_indent))
            ))
            .parse(arena, state)?;

@ -952,10 +1007,10 @@ macro_rules! record_field {
 macro_rules! record_without_update {
    ($val_parser:expr, $min_indent:expr) => {
        collection!(
-            char('{'),
+            ascii_char('{'),
            loc!(record_field!($val_parser, $min_indent)),
-            char(','),
-            char('}'),
+            ascii_char(','),
+            ascii_char('}'),
            $min_indent
        )
    };
@ -965,7 +1020,7 @@ macro_rules! record_without_update {
 macro_rules! record {
    ($val_parser:expr, $min_indent:expr) => {
        skip_first!(
-            $crate::parser::char('{'),
+            $crate::parser::ascii_char('{'),
            and!(
                // You can optionally have an identifier followed by an '&' to
                // make this a record update, e.g. { Foo.user & username: "blah" }.
@ -981,7 +1036,7 @@ macro_rules! record {
                        )),
                        $min_indent
                    ),
-                    $crate::parser::char('&')
+                    $crate::parser::ascii_char('&')
                )),
                loc!(skip_first!(
                    // We specifically allow space characters inside here, so that
@ -995,16 +1050,16 @@ macro_rules! record {
                    // We could change the AST to add extra storage specifically to
                    // support empty literals containing newlines or comments, but this
                    // does not seem worth even the tiniest regression in compiler performance.
-                    zero_or_more!($crate::parser::char(' ')),
+                    zero_or_more!($crate::parser::ascii_char(' ')),
                    skip_second!(
                        $crate::parser::sep_by0(
-                            $crate::parser::char(','),
+                            $crate::parser::ascii_char(','),
                            $crate::blankspace::space0_around(
                                loc!(record_field!($val_parser, $min_indent)),
                                $min_indent
                            )
                        ),
-                        $crate::parser::char('}')
+                        $crate::parser::ascii_char('}')
                    )
                ))
            )
@ -1067,3 +1122,10 @@ where
 {
    attempt!(attempting, parser)
 }
+
+pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
+    match from_utf8(bytes) {
+        Ok(string) => Ok(string),
+        Err(_) => Err(dbg!(FailReason::BadUtf8)),
+    }
+}