roc/crates/compiler/parse/src/blankspace.rs
Joshua Warner 07be8ec000
Refactor Parser trait to pass min_indent
This removes the need to explicitly pass thru min_indent when using the parser combinators.

My ultimate goal here is to evolve the current parser closer toward a purely combinator-based parser,
at which point we can more easily transition smoothly to a formal(ish) grammar, or expand the meanings of combinators
to include things like:
* Incremental (re)parsing
* Unified parsing and formatting code
* Better error recovery
* Using the main parser directly for syntax highlighting
2022-10-31 13:31:47 -07:00

598 lines
19 KiB
Rust

use crate::ast::CommentOrNewline;
use crate::ast::Spaceable;
use crate::parser::SpaceProblem;
use crate::parser::{self, and, backtrackable, BadInputError, Parser, Progress::*};
use crate::state::State;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;
use roc_region::all::Loc;
use roc_region::all::Position;
pub fn space0_around_ee<'a, P, S, E>(
parser: P,
indent_before_problem: fn(Position) -> E,
indent_after_problem: fn(Position) -> E,
) -> impl Parser<'a, Loc<S>, E>
where
S: Spaceable<'a>,
S: 'a,
P: Parser<'a, Loc<S>, E>,
P: 'a,
E: 'a + SpaceProblem,
{
parser::map_with_arena(
and(
space0_e(indent_before_problem),
and(parser, space0_e(indent_after_problem)),
),
spaces_around_help,
)
}
pub fn space0_before_optional_after<'a, P, S, E>(
parser: P,
indent_before_problem: fn(Position) -> E,
indent_after_problem: fn(Position) -> E,
) -> impl Parser<'a, Loc<S>, E>
where
S: Spaceable<'a>,
S: 'a,
P: Parser<'a, Loc<S>, E>,
P: 'a,
E: 'a + SpaceProblem,
{
parser::map_with_arena(
and(
space0_e(indent_before_problem),
and(
parser,
one_of![
backtrackable(space0_e(indent_after_problem)),
succeed!(&[] as &[_]),
],
),
),
spaces_around_help,
)
}
fn spaces_around_help<'a, S>(
arena: &'a Bump,
tuples: (
&'a [CommentOrNewline<'a>],
(Loc<S>, &'a [CommentOrNewline<'a>]),
),
) -> Loc<S>
where
S: Spaceable<'a>,
S: 'a,
{
let (spaces_before, (loc_val, spaces_after)) = tuples;
if spaces_before.is_empty() {
if spaces_after.is_empty() {
loc_val
} else {
arena
.alloc(loc_val.value)
.with_spaces_after(spaces_after, loc_val.region)
}
} else if spaces_after.is_empty() {
arena
.alloc(loc_val.value)
.with_spaces_before(spaces_before, loc_val.region)
} else {
let wrapped_expr = arena
.alloc(loc_val.value)
.with_spaces_after(spaces_after, loc_val.region);
arena
.alloc(wrapped_expr.value)
.with_spaces_before(spaces_before, wrapped_expr.region)
}
}
pub fn space0_before_e<'a, P, S, E>(
parser: P,
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, Loc<S>, E>
where
S: Spaceable<'a>,
S: 'a,
P: Parser<'a, Loc<S>, E>,
P: 'a,
E: 'a + SpaceProblem,
{
parser::map_with_arena(
and!(space0_e(indent_problem), parser),
|arena: &'a Bump, (space_list, loc_expr): (&'a [CommentOrNewline<'a>], Loc<S>)| {
if space_list.is_empty() {
loc_expr
} else {
arena
.alloc(loc_expr.value)
.with_spaces_before(space_list, loc_expr.region)
}
},
)
}
pub fn space0_after_e<'a, P, S, E>(
parser: P,
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, Loc<S>, E>
where
S: Spaceable<'a>,
S: 'a,
P: Parser<'a, Loc<S>, E>,
P: 'a,
E: 'a + SpaceProblem,
{
parser::map_with_arena(
and!(parser, space0_e(indent_problem)),
|arena: &'a Bump, (loc_expr, space_list): (Loc<S>, &'a [CommentOrNewline<'a>])| {
if space_list.is_empty() {
loc_expr
} else {
arena
.alloc(loc_expr.value)
.with_spaces_after(space_list, loc_expr.region)
}
},
)
}
pub fn check_indent<'a, E>(indent_problem: fn(Position) -> E) -> impl Parser<'a, (), E>
where
E: 'a,
{
move |_, state: State<'a>, min_indent: u32| {
if state.column() >= min_indent {
Ok((NoProgress, (), state))
} else {
Err((NoProgress, indent_problem(state.pos()), state))
}
}
}
pub fn space0_e<'a, E>(
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
where
E: 'a + SpaceProblem,
{
spaces_help_help(indent_problem)
}
#[inline(always)]
fn spaces_help_help<'a, E>(
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
where
E: 'a + SpaceProblem,
{
move |arena, state: State<'a>, min_indent: u32| match fast_eat_spaces(&state) {
FastSpaceState::HasTab(position) => Err((
MadeProgress,
E::space_problem(BadInputError::HasTab, position),
state,
)),
FastSpaceState::Good {
newlines,
consumed,
column,
} => {
if consumed == 0 {
Ok((NoProgress, &[] as &[_], state))
} else if column < min_indent {
Err((MadeProgress, indent_problem(state.pos()), state))
} else {
let comments_and_newlines = Vec::with_capacity_in(newlines, arena);
let spaces = eat_spaces(state, comments_and_newlines);
Ok((
MadeProgress,
spaces.comments_and_newlines.into_bump_slice(),
spaces.state,
))
}
}
}
}
enum FastSpaceState {
Good {
newlines: usize,
consumed: usize,
column: u32,
},
HasTab(Position),
}
fn fast_eat_spaces(state: &State) -> FastSpaceState {
use FastSpaceState::*;
let mut newlines = 0;
let mut line_start = state.line_start.offset as usize;
let base_offset = state.pos().offset as usize;
let mut index = base_offset;
let bytes = state.original_bytes();
let length = bytes.len();
'outer: while index < length {
match bytes[index] {
b' ' => {
index += 1;
}
b'\n' => {
newlines += 1;
index += 1;
line_start = index;
}
b'\r' => {
index += 1;
line_start = index;
}
b'\t' => {
return HasTab(Position::new(index as u32));
}
b'#' => {
index += 1;
// try to use SIMD instructions explicitly
// run with RUSTFLAGS="-C target-cpu=native" to enable
#[cfg(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
))]
{
use std::arch::x86_64::*;
// a bytestring with the three characters we're looking for (the rest is ignored)
let needle = b"\r\n\t=============";
let needle = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) };
while index < length {
let remaining = length - index;
let length = if remaining < 16 { remaining as i32 } else { 16 };
// the source bytes we'll be looking at
let haystack =
unsafe { _mm_loadu_si128(bytes.as_ptr().add(index) as *const _) };
// use first 3 characters of needle, first `length` characters of haystack
// finds the first index where one of the `needle` characters occurs
// or 16 when none of the needle characters occur
let first_special_char = unsafe {
_mm_cmpestri(needle, 3, haystack, length, _SIDD_CMP_EQUAL_ANY)
};
// we've made `first_special_char` characters of progress
index += usize::min(first_special_char as usize, remaining);
// if we found a special char, let the outer loop handle it
if first_special_char != 16 {
continue 'outer;
}
}
}
#[cfg(not(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
)))]
{
while index < length {
match bytes[index] {
b'\n' | b'\t' | b'\r' => {
continue 'outer;
}
_ => {
index += 1;
}
}
}
}
}
_ => break,
}
}
Good {
newlines,
consumed: index - base_offset,
column: (index - line_start) as u32,
}
}
struct SpaceState<'a> {
state: State<'a>,
comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
}
fn eat_spaces<'a>(
mut state: State<'a>,
mut comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
) -> SpaceState<'a> {
for c in state.bytes() {
match c {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
return eat_line_comment(state, comments_and_newlines);
}
_ => {
if !comments_and_newlines.is_empty() {
state = state.mark_current_indent();
}
break;
}
}
}
SpaceState {
state,
comments_and_newlines,
}
}
fn eat_line_comment<'a>(
mut state: State<'a>,
mut comments_and_newlines: Vec<'a, CommentOrNewline<'a>>,
) -> SpaceState<'a> {
let mut index = state.pos().offset as usize;
let bytes = state.original_bytes();
let length = bytes.len();
'outer: loop {
let is_doc_comment = if let Some(b'#') = bytes.get(index) {
match bytes.get(index + 1) {
Some(b' ') => {
state = state.advance(2);
index += 2;
true
}
Some(b'\n') => {
// consume the second # and the \n
state = state.advance(1);
state = state.advance_newline();
index += 2;
comments_and_newlines.push(CommentOrNewline::DocComment(""));
for c in state.bytes() {
match c {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
};
}
None => {
// consume the second #
state = state.advance(1);
return SpaceState {
state,
comments_and_newlines,
};
}
Some(_) => false,
}
} else {
false
};
let loop_start = index;
#[cfg(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
))]
{
use std::arch::x86_64::*;
// a bytestring with the three characters we're looking for (the rest is ignored)
let needle = b"\r\n\t=============";
let needle = unsafe { _mm_loadu_si128(needle.as_ptr() as *const _) };
while index < length {
let remaining = length - index;
let chunk = if remaining < 16 { remaining as i32 } else { 16 };
// the source bytes we'll be looking at
let haystack = unsafe { _mm_loadu_si128(bytes.as_ptr().add(index) as *const _) };
// use first 3 characters of needle, first chunk` characters of haystack
// finds the first index where one of the `needle` characters occurs
// or 16 when none of the needle characters occur
let first_special_char =
unsafe { _mm_cmpestri(needle, 3, haystack, chunk, _SIDD_CMP_EQUAL_ANY) };
// we've made `first_special_char` characters of progress
let progress = usize::min(first_special_char as usize, remaining);
index += progress;
state = state.advance(progress);
if first_special_char != 16 {
match bytes[index] {
b'\t' => unreachable!(),
b'\n' => {
let comment =
unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
state = state.advance_newline();
index += 1;
while index < length {
match bytes[index] {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
};
}
b'\r' => {
state = state.advance_newline();
index += 1;
}
odd_character => {
unreachable!(
"unexpected_character {} {}",
odd_character, odd_character as char
)
}
}
}
}
}
#[cfg(not(all(
target_arch = "x86_64",
target_feature = "sse2",
target_feature = "sse4.2"
)))]
while index < length {
match bytes[index] {
b'\t' => unreachable!(),
b'\n' => {
let comment =
unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
state = state.advance_newline();
index += 1;
while index < length {
match bytes[index] {
b' ' => {
state = state.advance(1);
}
b'\n' => {
state = state.advance_newline();
comments_and_newlines.push(CommentOrNewline::Newline);
}
b'\r' => {
state = state.advance_newline();
}
b'\t' => unreachable!(),
b'#' => {
state = state.advance(1);
index += 1;
continue 'outer;
}
_ => {
state = state.mark_current_indent();
break;
}
}
index += 1;
}
return SpaceState {
state,
comments_and_newlines,
};
}
b'\r' => {
state = state.advance_newline();
}
_ => {
state = state.advance(1);
}
}
index += 1;
}
// We made it to the end of the bytes. This means there's a comment without a trailing newline.
let comment = unsafe { std::str::from_utf8_unchecked(&bytes[loop_start..index]) };
if is_doc_comment {
comments_and_newlines.push(CommentOrNewline::DocComment(comment));
} else {
comments_and_newlines.push(CommentOrNewline::LineComment(comment));
}
return SpaceState {
state,
comments_and_newlines,
};
}
}