Fix remaining UTF-8 parsing issues

This commit is contained in:
Richard Feldman 2020-07-26 21:38:29 -04:00
parent eaaeda728a
commit 273528db77
5 changed files with 358 additions and 267 deletions

View file

@ -3,11 +3,12 @@ use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use encode_unicode::CharExt;
use roc_region::all::{Located, Region};
use std::fmt;
use std::str::from_utf8;
use std::{char, mem, u16};
/// A position in a source file.
#[derive(Debug, Clone, PartialEq, Eq)]
#[derive(Clone, PartialEq, Eq)]
pub struct State<'a> {
/// The raw input bytes from the file.
pub bytes: &'a [u8],
@ -101,7 +102,7 @@ impl<'a> State<'a> {
/// This assumes we are *not* advancing with spaces, or at least that
/// any spaces on the line were preceded by non-spaces - which would mean
/// they weren't eligible to indent anyway.
pub fn advance_without_indenting(&self, quantity: usize) -> Result<Self, (Fail, Self)> {
pub fn advance_without_indenting(self, quantity: usize) -> Result<Self, (Fail, Self)> {
match (self.column as usize).checked_add(quantity) {
Some(column_usize) if column_usize <= u16::MAX as usize => {
Ok(State {
@ -184,6 +185,24 @@ impl<'a> State<'a> {
}
}
impl<'a> fmt::Debug for State<'a> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "State {{")?;
match from_utf8(self.bytes) {
Ok(string) => write!(f, "\n\tbytes: [utf8] {:?}", string)?,
Err(_) => write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?,
}
write!(f, "\n\t(line, col): ({}, {}),", self.line, self.column)?;
write!(f, "\n\tindent_col: {}", self.indent_col)?;
write!(f, "\n\tis_indenting: {:?}", self.is_indenting)?;
write!(f, "\n\tattempting: {:?}", self.attempting)?;
write!(f, "\n\toriginal_len: {}", self.original_len)?;
write!(f, "\n}}")
}
}
#[test]
fn state_size() {
// State should always be under 8 machine words, so it fits in a typical
@ -428,14 +447,14 @@ pub fn ascii_char<'a>(expected: char) -> impl Parser<'a, ()> {
/// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8.
pub fn utf8_char<'a>() -> impl Parser<'a, char> {
pub fn utf8_char2<'a>() -> impl Parser<'a, char> {
move |_arena, state: State<'a>| {
if !state.bytes.is_empty() {
match char::from_utf8_slice_start(state.bytes) {
Ok((ch, bytes_parsed)) => {
return Ok((ch, state.advance_without_indenting(bytes_parsed)?))
}
Err(_) => return state.fail(dbg!(FailReason::BadUtf8)),
Err(_) => return state.fail(FailReason::BadUtf8),
}
} else {
Err(unexpected_eof(0, state.attempting, state))
@ -445,17 +464,40 @@ pub fn utf8_char<'a>() -> impl Parser<'a, char> {
/// A single UTF-8-encoded char. This will both parse *and* validate that the
/// char is valid UTF-8, but it will *not* advance the state.
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<char, Fail> {
match char::from_utf8_slice_start(state.bytes) {
Ok((ch, _)) => Ok(ch),
Err(_) => Err(Fail {
reason: dbg!(FailReason::BadUtf8),
attempting: state.attempting,
}),
pub fn peek_utf8_char<'a>(state: &State<'a>) -> Result<(char, usize), FailReason> {
if !state.bytes.is_empty() {
match char::from_utf8_slice_start(state.bytes) {
Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
Err(_) => Err(FailReason::BadUtf8),
}
} else {
Err(FailReason::Eof(
Region::zero(), /* TODO get a better region */
))
}
}
/// A hardcoded string consisting only of ASCII characters.
/// A single UTF-8-encoded char, with an offset. This will both parse *and*
/// validate that the char is valid UTF-8, but it will *not* advance the state.
pub fn peek_utf8_char_at<'a>(
state: &State<'a>,
offset: usize,
) -> Result<(char, usize), FailReason> {
if state.bytes.len() > offset {
let bytes = &state.bytes[offset..];
match char::from_utf8_slice_start(bytes) {
Ok((ch, len_utf8)) => Ok((ch, len_utf8)),
Err(_) => Err(FailReason::BadUtf8),
}
} else {
Err(FailReason::Eof(
Region::zero(), /* TODO get a better region */
))
}
}
/// A hardcoded string with no newlines, consisting only of ASCII characters
pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// Verify that this really is exclusively ASCII characters.
// The `unsafe` block in this function relies upon this assumption!
@ -472,10 +514,12 @@ pub fn ascii_string<'a>(keyword: &'static str) -> impl Parser<'a, ()> {
// SAFETY: Roc language keywords are statically known to contain only
// ASCII characters, which means their &str will be 100% u8 values in
// memory, and thus can be safely interpreted as &[u8]
Some(next_str)
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } =>
{
Ok(((), state.advance_without_indenting(len)?))
Some(next_str) => {
if next_str == unsafe { mem::transmute::<&'static str, &'a [u8]>(keyword) } {
Ok(((), state.advance_without_indenting(len)?))
} else {
Err(unexpected(len, state, Attempting::Keyword))
}
}
_ => Err(unexpected_eof(0, Attempting::Keyword, state)),
}
@ -1126,6 +1170,6 @@ where
pub fn parse_utf8<'a>(bytes: &'a [u8]) -> Result<&'a str, FailReason> {
match from_utf8(bytes) {
Ok(string) => Ok(string),
Err(_) => Err(dbg!(FailReason::BadUtf8)),
Err(_) => Err(FailReason::BadUtf8),
}
}