Give parser fuzzing some TLC

* The header + expr fuzzers can both be run again (header fuzzer had regressed).
* I ran the expr fuzzer for ~60 seconds with no additional panics uncovered
* "tab_crash" hit supposedly unreachable code in blankspace.rs - and I took the liberty of dramatically simplifying all that code, rather than just trying to fix the bug
* Other failures were straightforward error cases that should have been handled (and passed up the chain) instead of panicking
This commit is contained in:
Joshua Warner 2022-12-07 21:45:02 -08:00
parent 521afce1f4
commit 5f29402297
No known key found for this signature in database
GPG key ID: 89AD497003F93FDD
15 changed files with 176 additions and 475 deletions

View file

@ -37,10 +37,7 @@ where
E: 'a + SpaceProblem,
{
parser::map_with_arena(
and(
space0_e(indent_before_problem),
and(parser, space0_no_after_indent_check()),
),
and(space0_e(indent_before_problem), and(parser, spaces())),
spaces_around_help,
)
}
@ -164,474 +161,142 @@ where
}
}
/// Returns the length of the run of ASCII space bytes (`b' '`) at the start of `bytes`.
///
/// Only the space character counts: tabs, newlines and every other byte end the run.
/// Returns `0` for an empty slice or when the first byte is not a space.
fn eat_whitespace(bytes: &[u8]) -> usize {
    bytes.iter().take_while(|&&byte| byte == b' ').count()
}
/// Returns how many bytes at the start of `bytes` precede the first byte below `b' '`.
///
/// Despite the name, the scan stops at *any* byte smaller than the space character
/// (so `\n`, `\r`, `\t` and the other ASCII control bytes all terminate it), not only
/// at a newline. Bytes `>= b' '`, including non-ASCII bytes, are counted.
/// If no such byte exists the whole slice length is returned.
fn eat_until_newline(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&byte| byte < b' ')
        .unwrap_or(bytes.len())
}
pub fn space0_e<'a, E>(
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
where
E: 'a + SpaceProblem,
{
spaces_help_help(indent_problem)
move |arena, state: State<'a>, min_indent: u32| {
let start = state.pos();
match spaces().parse(arena, state, min_indent) {
Ok((progress, spaces, state)) => {
if progress == NoProgress || state.column() >= min_indent {
Ok((progress, spaces, state))
} else {
Err((progress, indent_problem(start)))
}
}
Err((progress, err)) => Err((progress, err)),
}
}
}
#[inline(always)]
fn spaces_help_help<'a, E>(
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
fn spaces<'a, E>() -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
where
E: 'a + SpaceProblem,
{
move |arena, state: State<'a>, min_indent: u32| match fast_eat_spaces(&state) {
FastSpaceState::HasTab(position) => Err((
MadeProgress,
E::space_problem(BadInputError::HasTab, position),
)),
FastSpaceState::Good {
newlines,
consumed,
column,
} => {
if consumed == 0 {
Ok((NoProgress, &[] as &[_], state))
} else if column < min_indent {
Err((MadeProgress, indent_problem(state.pos())))
} else {
let comments_and_newlines = Vec::with_capacity_in(newlines, arena);
let spaces = eat_spaces(state, comments_and_newlines);
Ok((
MadeProgress,
spaces.comments_and_newlines.into_bump_slice(),
spaces.state,
))
move |arena, mut state: State<'a>, _min_indent: u32| {
let mut newlines = Vec::new_in(arena);
let mut progress = NoProgress;
loop {
let whitespace = eat_whitespace(state.bytes());
if whitespace > 0 {
state.advance_mut(whitespace);
progress = MadeProgress;
}
}
}
}
#[inline(always)]
fn space0_no_after_indent_check<'a, E>() -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
where
E: 'a + SpaceProblem,
{
move |arena, state: State<'a>, _min_indent: u32| match fast_eat_spaces(&state) {
FastSpaceState::HasTab(position) => Err((
MadeProgress,
E::space_problem(BadInputError::HasTab, position),
)),
FastSpaceState::Good {
newlines,
consumed,
column: _,
} => {
if consumed == 0 {
Ok((NoProgress, &[] as &[_], state))
} else {
let comments_and_newlines = Vec::with_capacity_in(newlines, arena);
let spaces = eat_spaces(state, comments_and_newlines);
match state.bytes().first() {
Some(b'#') => {
state.advance_mut(1);
Ok((
MadeProgress,
spaces.comments_and_newlines.into_bump_slice(),
spaces.state,
))
}
}
}
}
/// Outcome of the read-only pre-scan performed by `fast_eat_spaces`.
enum FastSpaceState {
/// The scan ended without encountering a tab byte.
Good {
/// Number of `\n` bytes seen during the scan.
newlines: usize,
/// Number of bytes scanned past the starting offset.
consumed: usize,
/// Distance in bytes from the most recent line start to where the scan stopped.
column: u32,
},
/// A tab byte was found; carries the position built from its byte index.
HasTab(Position),
}
/// Scans forward over spaces, line breaks and `#` line comments without modifying `state`.
///
/// The scan starts at `state.pos().offset` inside `state.original_bytes()` and stops at the
/// first byte that is not a space, `\n`, `\r`, `\t` or part of a `#` comment.
///
/// Returns `FastSpaceState::HasTab` as soon as a tab byte is met; otherwise returns
/// `FastSpaceState::Good` with the number of `\n` bytes seen, the number of bytes scanned,
/// and the byte distance from the last line start to the stopping point.
fn fast_eat_spaces(state: &State) -> FastSpaceState {
use FastSpaceState::*;
let mut newlines = 0;
let mut line_start = state.line_start.offset as usize;
let base_offset = state.pos().offset as usize;
let mut index = base_offset;
let bytes = state.original_bytes();
let length = bytes.len();
'outer: while index < length {
match bytes[index] {
b' ' => {
index += 1;
}
b'\n' => {
newlines += 1;
index += 1;
line_start = index;
}
// A carriage return resets the line start but is not counted in `newlines`.
b'\r' => {
index += 1;
line_start = index;
}
b'\t' => {
return HasTab(Position::new(index as u32));
}
// Start of a line comment: skip ahead to the next `\r`, `\n` or `\t` and let the
// outer loop decide what to do with that byte.
b'#' => {
index += 1;
// try to use SIMD instructions explicitly
// run with RUSTFLAGS="-C target-cpu=native" to enable
#[cfg(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
))]
{
use std::arch::x86_64::*;
// a bytestring with the three characters we're looking for (the rest is ignored)
let needle = b"\r\n\t=============";
let needle = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) };
while index < length {
let remaining = length - index;
// NOTE: this `length` shadows the outer `length` (total byte count) within the loop body.
let length = if remaining < 16 { remaining as i32 } else { 16 };
// the source bytes we'll be looking at
// NOTE(review): this always loads 16 bytes, even when `remaining < 16` - confirm the
// buffer is guaranteed to be readable past its end, otherwise this reads out of bounds.
let haystack =
unsafe { _mm_loadu_si128(bytes.as_ptr().add(index) as *const _) };
// use first 3 characters of needle, first `length` characters of haystack
// finds the first index where one of the `needle` characters occurs
// or 16 when none of the needle characters occur
let first_special_char = unsafe {
_mm_cmpestri(needle, 3, haystack, length, _SIDD_CMP_EQUAL_ANY)
};
// we've made `first_special_char` characters of progress
index += usize::min(first_special_char as usize, remaining);
// if we found a special char, let the outer loop handle it
if first_special_char != 16 {
continue 'outer;
}
}
}
#[cfg(not(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
)))]
{
// Scalar fallback: advance one byte at a time until a line break or tab is found.
while index < length {
match bytes[index] {
b'\n' | b'\t' | b'\r' => {
continue 'outer;
}
_ => {
index += 1;
}
}
}
}
}
// Any other byte ends the blank-space run.
_ => break,
}
}
Good {
newlines,
consumed: index - base_offset,
column: (index - line_start) as u32,
}
}
/// Value returned by `eat_spaces` and `eat_line_comment`: the parser state after
/// consuming blank space, together with the comments and newlines collected so far.
struct SpaceState<'a> {
/// Parser state after the consumed input.
state: State<'a>,
/// Comments and newlines gathered while consuming, in the order they were pushed.
comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
}
/// Consumes spaces and line breaks from the front of `state`, recording each `\n`
/// as a `CommentOrNewline::Newline` in `comments_and_newlines`.
///
/// When a `#` is reached, it is consumed and the rest of the work is handed to
/// `eat_line_comment`. When any other non-blank byte is reached, the current indent is
/// marked (only if something has been collected) and scanning stops.
///
/// # Panics
///
/// Panics via `unreachable!()` if a tab byte is encountered.
fn eat_spaces<'a>(
    mut state: State<'a>,
    mut comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
) -> SpaceState<'a> {
    for &byte in state.bytes() {
        state = match byte {
            b' ' => state.advance(1),
            b'\n' => {
                let after_newline = state.advance_newline();
                comments_and_newlines.push(CommentOrNewline::Newline);
                after_newline
            }
            // A carriage return moves to a new line but records no entry.
            b'\r' => state.advance_newline(),
            // Treated as impossible here: `fast_eat_spaces` reports tabs before this runs.
            b'\t' => unreachable!(),
            b'#' => return eat_line_comment(state.advance(1), comments_and_newlines),
            // Nothing collected yet, so there is no new indent to mark.
            _ if comments_and_newlines.is_empty() => break,
            _ => {
                state = state.mark_current_indent();
                break;
            }
        };
    }

    SpaceState {
        state,
        comments_and_newlines,
    }
}
fn eat_line_comment<'a>(
mut state: State<'a>,
mut comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
) -> SpaceState<'a> {
let mut index = state.pos().offset as usize;
let bytes = state.original_bytes();
let length = bytes.len();
'outer: loop {
let is_doc_comment = if let Some(b'#') = bytes.get(index) {
match bytes.get(index + 1) {
Some(b' ') => {
state = state.advance(2);
index += 2;
true
}
Some(b'\n') => {
// consume the second # and the \n
state = state.advance(1);
state = state.advance_newline();
index += 2;
comments_and_newlines.push(CommentOrNewline::DocComment(""));
for c in state.bytes() {
match c {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
};
}
None => {
// consume the second #
state = state.advance(1);
return SpaceState {
state,
comments_and_newlines,
};
}
Some(_) => false,
}
} else {
false
};
let loop_start = index;
#[cfg(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
))]
{
use std::arch::x86_64::*;
// a bytestring with the three characters we're looking for (the rest is ignored)
let needle = b"\r\n\t=============";
let needle = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) };
while index < length {
let remaining = length - index;
let chunk = if remaining < 16 { remaining as i32 } else { 16 };
// the source bytes we'll be looking at
let haystack = unsafe { _mm_loadu_si128(bytes.as_ptr().add(index) as *const _) };
// use first 3 characters of needle, first chunk` characters of haystack
// finds the first index where one of the `needle` characters occurs
// or 16 when none of the needle characters occur
let first_special_char =
unsafe { _mm_cmpestri(needle, 3, haystack, chunk, _SIDD_CMP_EQUAL_ANY) };
// we've made `first_special_char` characters of progress
let progress = usize::min(first_special_char as usize, remaining);
index += progress;
state = state.advance(progress);
if first_special_char != 16 {
match bytes[index] {
b'\t' => unreachable!(),
b'\n' => {
let comment =
unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
state = state.advance_newline();
index += 1;
while index < length {
match bytes[index] {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
};
}
b'\r' => {
state = state.advance_newline();
index += 1;
}
odd_character => {
unreachable!(
"unexpected_character {} {}",
odd_character, odd_character as char
)
}
}
}
}
}
#[cfg(not(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
)))]
while index < length {
match bytes[index] {
b'\t' => unreachable!(),
b'\n' => {
let comment =
unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
let is_doc_comment = state.bytes().first() == Some(&b'#')
&& (state.bytes().get(1) == Some(&b' ')
|| state.bytes().get(1) == Some(&b'\n')
|| state.bytes().get(1) == None);
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
state = state.advance_newline();
index += 1;
while index < length {
match bytes[index] {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
state.advance_mut(1);
if state.bytes().first() == Some(&b' ') {
state.advance_mut(1);
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
let len = eat_until_newline(state.bytes());
// We already checked that the string is valid UTF-8
debug_assert!(std::str::from_utf8(&state.bytes()[..len]).is_ok());
let text = unsafe { std::str::from_utf8_unchecked(&state.bytes()[..len]) };
let comment = if is_doc_comment {
CommentOrNewline::DocComment(text)
} else {
CommentOrNewline::LineComment(text)
};
newlines.push(comment);
state.advance_mut(len);
if state.bytes().first() == Some(&b'\n') {
state = state.advance_newline();
}
progress = MadeProgress;
}
b'\r' => {
Some(b'\r') => {
if state.bytes().get(1) == Some(&b'\n') {
newlines.push(CommentOrNewline::Newline);
state.advance_mut(1);
state = state.advance_newline();
progress = MadeProgress;
} else {
return Err((
progress,
E::space_problem(
BadInputError::HasMisplacedCarriageReturn,
state.pos(),
),
));
}
}
Some(b'\n') => {
newlines.push(CommentOrNewline::Newline);
state = state.advance_newline();
progress = MadeProgress;
}
Some(b'\t') => {
return Err((
progress,
E::space_problem(BadInputError::HasTab, state.pos()),
));
}
Some(x) if *x < b' ' => {
return Err((
progress,
E::space_problem(BadInputError::HasAsciiControl, state.pos()),
));
}
_ => {
state = state.advance(1);
if !newlines.is_empty() {
state = state.mark_current_indent();
}
break;
}
}
index += 1;
}
// We made it to the end of the bytes. This means there's a comment without a trailing newline.
let comment = unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
return SpaceState {
state,
comments_and_newlines,
};
Ok((progress, newlines.into_bump_slice(), state))
}
}

View file

@ -1114,7 +1114,15 @@ fn finish_parsing_alias_or_opaque<'a>(
Ok(good) => {
type_arguments.push(Loc::at(argument.region, good));
}
Err(_) => panic!(),
Err(()) => {
return Err((
MadeProgress,
EExpr::Pattern(
arena.alloc(EPattern::NotAPattern(state.pos())),
state.pos(),
),
));
}
}
}
@ -1577,8 +1585,8 @@ fn parse_expr_operator<'a>(
}
}
}
Err((NoProgress, expr)) => {
todo!("{:?} {:?}", expr, state)
Err((NoProgress, _e)) => {
return Err((MadeProgress, EExpr::TrailingOperator(state.pos())));
}
},
}
@ -1722,10 +1730,17 @@ fn parse_expr_end<'a>(
expr_state.consume_spaces(arena);
let call = to_call(arena, expr_state.arguments, expr_state.expr);
let loc_pattern = Loc::at(
call.region,
expr_to_pattern_help(arena, &call.value).unwrap(),
);
let pattern = expr_to_pattern_help(arena, &call.value).map_err(|()| {
(
MadeProgress,
EExpr::Pattern(
arena.alloc(EPattern::NotAPattern(state.pos())),
state.pos(),
),
)
})?;
let loc_pattern = Loc::at(call.region, pattern);
patterns.insert(0, loc_pattern);

View file

@ -64,7 +64,7 @@ pub enum SyntaxError<'a> {
Space(BadInputError),
NotEndOfFile(Position),
}
pub trait SpaceProblem {
pub trait SpaceProblem: std::fmt::Debug {
fn space_problem(e: BadInputError, pos: Position) -> Self;
}
@ -265,6 +265,8 @@ pub enum EGeneratesWith {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BadInputError {
HasTab,
HasMisplacedCarriageReturn,
HasAsciiControl,
///
TooManyLines,
///
@ -272,15 +274,6 @@ pub enum BadInputError {
BadUtf8,
}
pub fn bad_input_to_syntax_error<'a>(bad_input: BadInputError) -> SyntaxError<'a> {
use crate::parser::BadInputError::*;
match bad_input {
HasTab => SyntaxError::NotYetImplemented("call error on tabs".to_string()),
TooManyLines => SyntaxError::TooManyLines,
BadUtf8 => SyntaxError::BadUtf8,
}
}
impl<'a, T> SourceError<'a, T> {
pub fn new(problem: T, state: &State<'a>) -> Self {
Self {
@ -323,6 +316,8 @@ impl<'a> SyntaxError<'a> {
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum EExpr<'a> {
TrailingOperator(Position),
Start(Position),
End(Position),
BadExprEnd(Position),
@ -560,6 +555,7 @@ pub enum EPattern<'a> {
Record(PRecord<'a>, Position),
List(PList<'a>, Position),
Underscore(Position),
NotAPattern(Position),
Start(Position),
End(Position),
@ -773,7 +769,7 @@ pub struct FileError<'a, T> {
pub trait Parser<'a, Output, Error> {
fn parse(
&self,
alloc: &'a Bump,
arena: &'a Bump,
state: State<'a>,
min_indent: u32,
) -> ParseResult<'a, Output, Error>;

View file

@ -98,7 +98,7 @@ impl<'a> State<'a> {
self.offset += 1;
self.line_start = self.pos();
// WARNING! COULD CAUSE BUGS IF WE FORGET TO CALL mark_current_ident LATER!
// WARNING! COULD CAUSE BUGS IF WE FORGET TO CALL mark_current_indent LATER!
// We really need to be stricter about this.
self.line_start_after_whitespace = self.line_start;

View file

@ -41,3 +41,15 @@ pub fn parse_defs_with<'a>(arena: &'a Bump, input: &'a str) -> Result<Defs<'a>,
Err(tuple) => Err(tuple.1),
}
}
/// Parses a module header from `input`, allocating into `arena`.
///
/// Leading and trailing whitespace is trimmed from `input` before parsing.
///
/// # Returns
///
/// The parsed header on success. The second element of the parser's result tuple
/// (presumably the remaining parser state) is discarded.
///
/// # Errors
///
/// Returns `SyntaxError::Header` wrapping the `problem` field of the parser's failure.
pub fn parse_header_with<'a>(
    arena: &'a Bump,
    input: &'a str,
) -> Result<ast::Module<'a>, SyntaxError<'a>> {
    let state = State::new(input.trim().as_bytes());

    // `state` is not used again, so it is moved into the parser rather than cloned.
    crate::module::parse_header(arena, state)
        .map(|(header, _)| header)
        .map_err(|fail| SyntaxError::Header(fail.problem))
}