Lazily validate that input bytes are valid UTF-8

2025-10-01 07:41:12 +00:00 · 2020-07-25 22:12:42 -04:00 · 2020-07-25 22:12:42 -04:00 · 9f9ce327d4
commit 9f9ce327d4
parent 15f087c93e
21 changed files with 709 additions and 626 deletions
--- a/compiler/parse/src/number_literal.rs
+++ b/compiler/parse/src/number_literal.rs
@@ -1,23 +1,19 @@
 use crate::ast::{Attempting, Base, Expr};
-use crate::parser::{unexpected, unexpected_eof, ParseResult, Parser, State};
+use crate::parser::{parse_utf8, unexpected, unexpected_eof, ParseResult, Parser, State};
 use std::char;
+use std::str::from_utf8_unchecked;

 pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
    move |_arena, state: State<'a>| {
-        let mut chars = state.input.chars();
+        let bytes = &mut state.bytes.iter();

-        match chars.next() {
-            Some(first_ch) => {
+        match bytes.next() {
+            Some(&first_byte) => {
                // Number literals must start with either an '-' or a digit.
-                if first_ch == '-' || first_ch.is_ascii_digit() {
-                    parse_number_literal(first_ch, &mut chars, state)
+                if first_byte == '-' as u8 || (first_byte as char).is_ascii_digit() {
+                    parse_number_literal(first_byte as char, bytes, state)
                } else {
-                    Err(unexpected(
-                        first_ch,
-                        first_ch.len_utf8(),
-                        state,
-                        Attempting::NumberLiteral,
-                    ))
+                    Err(unexpected(1, state, Attempting::NumberLiteral))
                }
            }
            None => Err(unexpected_eof(0, state.attempting, state)),
@@ -28,11 +24,11 @@ pub fn number_literal<'a>() -> impl Parser<'a, Expr<'a>> {
 #[inline(always)]
 fn parse_number_literal<'a, I>(
    first_ch: char,
-    chars: &mut I,
+    bytes: &mut I,
    state: State<'a>,
 ) -> ParseResult<'a, Expr<'a>>
 where
-    I: Iterator<Item = char>,
+    I: Iterator<Item = &'a u8>,
 {
    use self::LiteralType::*;

@@ -40,13 +36,12 @@ where

    // We already parsed 1 character (which may have been a minus sign).
    let mut bytes_parsed = 1;
-    let mut prev_ch = first_ch;
+    let mut prev_byte = first_ch as u8;
    let mut has_parsed_digits = first_ch.is_ascii_digit();

-    for next_ch in chars {
+    for &next_byte in bytes {
        let err_unexpected = || {
            Err(unexpected(
-                next_ch,
                bytes_parsed,
                state.clone(),
                Attempting::NumberLiteral,
@@ -55,91 +50,91 @@ where

        let is_potentially_non_base10 = || {
            (bytes_parsed == 1 && first_ch == '0')
-                || (bytes_parsed == 2 && first_ch == '-' && prev_ch == '0')
+                || (bytes_parsed == 2 && first_ch == '-' && prev_byte == '0' as u8)
        };

-        if next_ch == '.' {
-            if typ == Float {
-                // You only get one decimal point!
-                return err_unexpected();
-            } else {
-                typ = Float;
+        match next_byte as char {
+            '.' => {
+                if typ == Float {
+                    // You only get one decimal point!
+                    return err_unexpected();
+                } else {
+                    typ = Float;
+                }
            }
-        } else if next_ch == 'x' {
-            if is_potentially_non_base10() {
-                typ = Hex;
-            } else {
-                return err_unexpected();
+            'x' => {
+                if is_potentially_non_base10() {
+                    typ = Hex;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch == 'b' && typ == Num {
-            // We have to check for typ == Num because otherwise we get a false
-            // positive here when parsing a hex literal that happens to have
-            // a 'b' in it, e.g. 0xbbbb
-            if is_potentially_non_base10() {
-                typ = Binary;
-            } else {
-                return err_unexpected();
+            'b' if typ == Num => {
+                // We have to check for typ == Num because otherwise we get a false
+                // positive here when parsing a hex literal that happens to have
+                // a 'b' in it, e.g. 0xbbbb
+                if is_potentially_non_base10() {
+                    typ = Binary;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch == 'o' {
-            if is_potentially_non_base10() {
-                typ = Octal;
-            } else {
-                return err_unexpected();
+            'o' => {
+                if is_potentially_non_base10() {
+                    typ = Octal;
+                } else {
+                    return err_unexpected();
+                }
            }
-        } else if next_ch.is_ascii_digit() {
-            has_parsed_digits = true;
-        } else if next_ch != '_' &&
+            next_ch if next_ch.is_ascii_digit() => {
+                has_parsed_digits = true;
+            }
+            next_ch
+                if next_ch != '_' &&
            // ASCII alphabetic chars (like 'a' and 'f') are allowed in Hex int literals.
            // We parse them in any int literal, so we can give a more helpful error
            // in canonicalization (e.g. "the character 'f' is not allowed in Octal literals"
            // or "the character 'g' is outside the range of valid Hex literals")
-            !next_ch.is_ascii_alphabetic()
-        {
-            if has_parsed_digits {
-                // We hit an invalid number literal character; we're done!
-                break;
-            } else {
-                // No digits! We likely parsed a minus sign that's actually an operator.
-                return err_unexpected();
+            !next_ch.is_ascii_alphabetic() =>
+            {
+                if has_parsed_digits {
+                    // We hit an invalid number literal character; we're done!
+                    break;
+                } else {
+                    // No digits! We likely parsed a minus sign that's actually an operator.
+                    return err_unexpected();
+                }
            }
+            _ => {}
        }

        // Since we only consume characters in the ASCII range for number literals,
        // this will always be exactly 1. There's no need to call next_ch.utf8_len().
        bytes_parsed += 1;
-        prev_ch = next_ch;
+        prev_byte = next_byte;
    }

-    let from_base = |base| {
-        let is_negative = first_ch == '-';
-        let string = if is_negative {
-            &state.input[3..bytes_parsed]
-        } else {
-            &state.input[2..bytes_parsed]
-        };
-
-        Expr::NonBase10Int {
-            is_negative,
-            string,
-            base,
-        }
-    };
-
    // At this point we have a number, and will definitely succeed.
    // If the number is malformed (outside the supported range),
    // we'll succeed with an appropriate Expr which records that.
-    let expr = match typ {
-        Num => Expr::Num(&state.input[0..bytes_parsed]),
-        Float => Expr::Float(&state.input[0..bytes_parsed]),
+    match typ {
+        Num => Ok((
+            // SAFETY: from_utf8_unchecked is sound here because every byte in this range
+            // is ASCII (a leading '-' or digit, then digits, '_', or ASCII letters)
+            Expr::Num(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
+            state.advance_without_indenting(bytes_parsed)?,
+        )),
+        Float => Ok((
+            // SAFETY: from_utf8_unchecked is sound here because every byte in this range
+            // is ASCII (a leading '-' or digit, then digits, '.', '_', or ASCII letters)
+            Expr::Float(unsafe { from_utf8_unchecked(&state.bytes[0..bytes_parsed]) }),
+            state.advance_without_indenting(bytes_parsed)?,
+        )),
        // For these we trim off the 0x/0o/0b part
-        Hex => from_base(Base::Hex),
-        Octal => from_base(Base::Octal),
-        Binary => from_base(Base::Binary),
-    };
-
-    let next_state = state.advance_without_indenting(bytes_parsed)?;
-
-    Ok((expr, next_state))
+        Hex => from_base(Base::Hex, first_ch, bytes_parsed, state),
+        Octal => from_base(Base::Octal, first_ch, bytes_parsed, state),
+        Binary => from_base(Base::Binary, first_ch, bytes_parsed, state),
+    }
 }

 #[derive(Debug, PartialEq, Eq)]
@@ -150,3 +145,29 @@ enum LiteralType {
    Octal,
    Binary,
 }
+
+fn from_base<'a>( // Builds the Expr::NonBase10Int result for a hex/octal/binary literal.
+    base: Base,
+    first_ch: char, // first character of the literal; '-' means it is negative
+    bytes_parsed: usize, // number of leading bytes of state.bytes that the literal spans
+    state: State<'a>,
+) -> ParseResult<'a, Expr<'a>> {
+    let is_negative = first_ch == '-';
+    let bytes = if is_negative {
+        &state.bytes[3..bytes_parsed] // skip the '-' and the 2-byte 0x/0o/0b prefix
+    } else {
+        &state.bytes[2..bytes_parsed] // skip the 2-byte 0x/0o/0b prefix
+    };
+
+    match parse_utf8(bytes) { // NOTE(review): presumably validates the digits as UTF-8 — confirm in crate::parser
+        Ok(string) => Ok((
+            Expr::NonBase10Int {
+                is_negative,
+                string,
+                base,
+            },
+            state.advance_without_indenting(bytes_parsed)?, // advance past the whole literal, prefix included
+        )),
+        Err(reason) => state.fail(reason), // hand the parse_utf8 error to state.fail
+    }
+}