mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-28 22:34:45 +00:00
Lazily validate that input bytes are valid UTF-8
This commit is contained in:
parent
15f087c93e
commit
9f9ce327d4
21 changed files with 709 additions and 626 deletions
|
@ -1,14 +1,16 @@
|
|||
use crate::ast::Attempting;
|
||||
use bumpalo::collections::vec::Vec;
|
||||
use bumpalo::Bump;
|
||||
use encode_unicode::CharExt;
|
||||
use roc_region::all::{Located, Region};
|
||||
use std::{char, u16};
|
||||
use std::str::from_utf8;
|
||||
use std::{char, mem, u16};
|
||||
|
||||
/// A position in a source file.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct State<'a> {
|
||||
/// The raw input string.
|
||||
pub input: &'a str,
|
||||
/// The raw input bytes from the file.
|
||||
pub bytes: &'a [u8],
|
||||
|
||||
/// Current line of the input
|
||||
pub line: u32,
|
||||
|
@ -39,15 +41,15 @@ pub enum Either<First, Second> {
|
|||
}
|
||||
|
||||
impl<'a> State<'a> {
|
||||
pub fn new(input: &'a str, attempting: Attempting) -> State<'a> {
|
||||
pub fn new(bytes: &'a [u8], attempting: Attempting) -> State<'a> {
|
||||
State {
|
||||
input,
|
||||
bytes,
|
||||
line: 0,
|
||||
column: 0,
|
||||
indent_col: 0,
|
||||
is_indenting: true,
|
||||
attempting,
|
||||
original_len: input.len(),
|
||||
original_len: bytes.len(),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -69,7 +71,7 @@ impl<'a> State<'a> {
|
|||
///
|
||||
/// So if the parser has consumed 8 bytes, this function will return 8.
|
||||
pub fn bytes_consumed(&self) -> usize {
|
||||
self.original_len - self.input.len()
|
||||
self.original_len - self.bytes.len()
|
||||
}
|
||||
|
||||
/// Increments the line, then resets column, indent_col, and is_indenting.
|
||||
|
@ -77,7 +79,7 @@ impl<'a> State<'a> {
|
|||
pub fn newline(&self) -> Result<Self, (Fail, Self)> {
|
||||
match self.line.checked_add(1) {
|
||||
Some(line) => Ok(State {
|
||||
input: &self.input[1..],
|
||||
bytes: &self.bytes[1..],
|
||||
line,
|
||||
column: 0,
|
||||
indent_col: 0,
|
||||
|
@ -103,7 +105,7 @@ impl<'a> State<'a> {
|
|||
match (self.column as usize).checked_add(quantity) {
|
||||
Some(column_usize) if column_usize <= u16::MAX as usize => {
|
||||
Ok(State {
|
||||
input: &self.input[quantity..],
|
||||
bytes: &self.bytes[quantity..],
|
||||
line: self.line,
|
||||
column: column_usize as u16,
|
||||
indent_col: self.indent_col,
|
||||
|
@ -141,7 +143,7 @@ impl<'a> State<'a> {
|
|||
};
|
||||
|
||||
Ok(State {
|
||||
input: &self.input[spaces..],
|
||||
bytes: &self.bytes[spaces..],
|
||||
line: self.line,
|
||||
column: column_usize as u16,
|
||||
indent_col,
|
||||
|
@ -169,6 +171,17 @@ impl<'a> State<'a> {
|
|||
end_line: self.line,
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a failing ParseResult for the given FailReason
|
||||
pub fn fail<T>(self, reason: FailReason) -> Result<(T, Self), (Fail, Self)> {
|
||||
Err((
|
||||
Fail {
|
||||
reason,
|
||||
attempting: self.attempting,
|
||||
},
|
||||
self,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
@ -182,13 +195,14 @@ pub type ParseResult<'a, Output> = Result<(Output, State<'a>), (Fail, State<'a>)
|
|||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum FailReason {
|
||||
Unexpected(char, Region),
|
||||
Unexpected(Region),
|
||||
OutdentedTooFar,
|
||||
ConditionFailed,
|
||||
LineTooLong(u32 /* which line was too long */),
|
||||
TooManyLines,
|
||||
Eof(Region),
|
||||
InvalidPattern,
|
||||
BadUtf8,
|
||||
ReservedKeyword(Region),
|
||||
ArgumentsBeforeEquals(Region),
|
||||
}
|
||||
|
@ -332,13 +346,12 @@ pub fn unexpected_eof(
|
|||
}
|
||||
|
||||
pub fn unexpected(
|
||||
ch: char,
|
||||
chars_consumed: usize,
|
||||
state: State<'_>,
|
||||
attempting: Attempting,
|
||||
) -> (Fail, State<'_>) {
|
||||
checked_unexpected(chars_consumed, state, |region| Fail {
|
||||
reason: FailReason::Unexpected(ch, region),
|
||||
reason: FailReason::Unexpected(region),
|
||||
attempting,
|
||||
})
|
||||
}
|
||||
|
@ -385,9 +398,9 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
|
|||
// (for example) the LineTooLong initially occurs in the middle of
|
||||
// a one_of chain, which would otherwise prevent it from propagating.
|
||||
let column = u16::MAX;
|
||||
let input = state.input.get(0..state.input.len()).unwrap();
|
||||
let bytes = state.bytes.get(0..state.bytes.len()).unwrap();
|
||||
let state = State {
|
||||
input,
|
||||
bytes,
|
||||
line: state.line,
|
||||
indent_col: state.indent_col,
|
||||
is_indenting: state.is_indenting,
|
||||
|
@ -399,28 +412,69 @@ fn line_too_long(attempting: Attempting, state: State<'_>) -> (Fail, State<'_>)
|
|||
(fail, state)
|
||||
}
|
||||
|
||||
/// A single char.
|
||||
pub fn char<'a>(expected: char) -> impl Parser<'a, ()> {
|
||||
move |_arena, state: State<'a>| match state.input.chars().next() {
|
||||
Some(actual) if expected == actual => Ok(((), state.advance_without_indenting(1)?)),
|
||||
Some(other_ch) => Err(unexpected(other_ch, 0, state, Attempting::Keyword)),
|
||||
/// A single ASCII char.
|
||||
pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
|
||||
// Make sure this really is an ASCII char!
|
||||
debug_assert!(expected.len_utf8() == 1);
|
||||
|
||||
move |_arena, state: State<'a>| match state.bytes.first() {
|
||||
Some(&actual) if expected == actual as char => {
|
||||
Ok(((), state.advance_without_indenting(1)?))
|
||||
}
|
||||
Some(_) => Err(unexpected(0, state, Attempting::Keyword)),
|
||||
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
|
||||
}
|
||||
}
|
||||
|
||||
/// A hardcoded keyword string with no newlines in it.
|
||||
pub fn string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
|
||||
// We can't have newlines because we don't attempt to advance the row
|
||||
// in the state, only the column.
|
||||
debug_assert!(!keyword.contains('\n'));
|
||||
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
||||
/// char is valid UTF-8.
|
||||
pub fn utf8_char<'a>() -> impl Parser<'a, char> {
|
||||
move |_arena, state: State<'a>| {
|
||||
if !state.bytes.is_empty() {
|
||||
match char::from_utf8_slice_start(state.bytes) {
|
||||
Ok((ch, bytes_parsed)) => {
|
||||
return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
|
||||
}
|
||||
Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
|
||||
}
|
||||
} else {
|
||||
Err(unexpected_eof(0, state.attempting, state))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single UTF-8-encoded char. This will both parse *and* validate that the
|
||||
/// char is valid UTF-8, but it will *not* advance the state.
|
||||
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
|
||||
match char::from_utf8_slice_start(state.bytes) {
|
||||
Ok((ch, _)) => Ok(ch),
|
||||
Err(_) => Err(Fail {
|
||||
reason: dbg!(FailReason::BadUtf8),
|
||||
attempting: state.attempting,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
/// A hardcoded string consisting only of ASCII characters.
|
||||
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
|
||||
// Verify that this really is exclusively ASCII characters.
|
||||
// The `unsafe` block in this function relies upon this assumption!
|
||||
//
|
||||
// Also, this can't have newlines because we don't attempt to advance
|
||||
// the row in the state, only the column.
|
||||
debug_assert!(keyword.chars().all(|ch| ch.len_utf8() == 1 && ch != '\n'));
|
||||
|
||||
move |_arena, state: State<'a>| {
|
||||
let input = state.input;
|
||||
let len = keyword.len();
|
||||
|
||||
// TODO do this comparison in one SIMD instruction (on supported systems)
|
||||
match input.get(0..len) {
|
||||
Some(next_str) if next_str == keyword => {
|
||||
match state.bytes.get(0..len) {
|
||||
// SAFETY: Roc language keywords are statically known to contain only
|
||||
// ASCII characters, which means their &str will be 100% u8 values in
|
||||
// memory, and thus can be safely interpreted as &[u8]
|
||||
Some(next_str)
|
||||
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
|
||||
{
|
||||
Ok(((), state.advance_without_indenting(len)?))
|
||||
}
|
||||
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
|
||||
|
@ -686,7 +740,7 @@ macro_rules! collection {
|
|||
// We could change the AST to add extra storage specifically to
|
||||
// support empty literals containing newlines or comments, but this
|
||||
// does not seem worth even the tiniest regression in compiler performance.
|
||||
zero_or_more!($crate::parser::char(' ')),
|
||||
zero_or_more!($crate::parser::ascii_char(' ')),
|
||||
skip_second!(
|
||||
$crate::parser::sep_by0(
|
||||
$delimiter,
|
||||
|
@ -912,6 +966,7 @@ macro_rules! record_field {
|
|||
use $crate::ast::AssignedField::*;
|
||||
use $crate::blankspace::{space0, space0_before};
|
||||
use $crate::ident::lowercase_ident;
|
||||
use $crate::parser::ascii_char;
|
||||
use $crate::parser::Either::*;
|
||||
|
||||
// You must have a field name, e.g. "email"
|
||||
|
@ -922,8 +977,8 @@ macro_rules! record_field {
|
|||
// Having a value is optional; both `{ email }` and `{ email: blah }` work.
|
||||
// (This is true in both literals and types.)
|
||||
let (opt_loc_val, state) = $crate::parser::optional(either!(
|
||||
skip_first!(char(':'), space0_before($val_parser, $min_indent)),
|
||||
skip_first!(char('?'), space0_before($val_parser, $min_indent))
|
||||
skip_first!(ascii_char(':'), space0_before($val_parser, $min_indent)),
|
||||
skip_first!(ascii_char('?'), space0_before($val_parser, $min_indent))
|
||||
))
|
||||
.parse(arena, state)?;
|
||||
|
||||
|
@ -952,10 +1007,10 @@ macro_rules! record_field {
|
|||
macro_rules! record_without_update {
|
||||
($val_parser:expr, $min_indent:expr) => {
|
||||
collection!(
|
||||
char('{'),
|
||||
ascii_char('{'),
|
||||
loc!(record_field!($val_parser, $min_indent)),
|
||||
char(','),
|
||||
char('}'),
|
||||
ascii_char(','),
|
||||
ascii_char('}'),
|
||||
$min_indent
|
||||
)
|
||||
};
|
||||
|
@ -965,7 +1020,7 @@ macro_rules! record_without_update {
|
|||
macro_rules! record {
|
||||
($val_parser:expr, $min_indent:expr) => {
|
||||
skip_first!(
|
||||
$crate::parser::char('{'),
|
||||
$crate::parser::ascii_char('{'),
|
||||
and!(
|
||||
// You can optionally have an identifier followed by an '&' to
|
||||
// make this a record update, e.g. { Foo.user & username: "blah" }.
|
||||
|
@ -981,7 +1036,7 @@ macro_rules! record {
|
|||
)),
|
||||
$min_indent
|
||||
),
|
||||
$crate::parser::char('&')
|
||||
$crate::parser::ascii_char('&')
|
||||
)),
|
||||
loc!(skip_first!(
|
||||
// We specifically allow space characters inside here, so that
|
||||
|
@ -995,16 +1050,16 @@ macro_rules! record {
|
|||
// We could change the AST to add extra storage specifically to
|
||||
// support empty literals containing newlines or comments, but this
|
||||
// does not seem worth even the tiniest regression in compiler performance.
|
||||
zero_or_more!($crate::parser::char(' ')),
|
||||
zero_or_more!($crate::parser::ascii_char(' ')),
|
||||
skip_second!(
|
||||
$crate::parser::sep_by0(
|
||||
$crate::parser::char(','),
|
||||
$crate::parser::ascii_char(','),
|
||||
$crate::blankspace::space0_around(
|
||||
loc!(record_field!($val_parser, $min_indent)),
|
||||
$min_indent
|
||||
)
|
||||
),
|
||||
$crate::parser::char('}')
|
||||
$crate::parser::ascii_char('}')
|
||||
)
|
||||
))
|
||||
)
|
||||
|
@ -1067,3 +1122,10 @@ where
|
|||
{
|
||||
attempt!(attempting, parser)
|
||||
}
|
||||
|
||||
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
|
||||
match from_utf8(bytes) {
|
||||
Ok(string) => Ok(string),
|
||||
Err(_) => Err(dbg!(FailReason::BadUtf8)),
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue