roc/compiler/parse/src/state.rs
2022-01-01 18:20:05 -08:00

174 lines
5.1 KiB
Rust

use crate::parser::Progress::*;
use crate::parser::{BadInputError, Progress};
use bumpalo::Bump;
use roc_region::all::{Position, Region};
use std::fmt;
/// A position in a source file.
///
/// Holds the not-yet-consumed input together with the bookkeeping needed to
/// report where in the original input the parser currently is.
#[derive(Clone)]
pub struct State<'a> {
/// The raw input bytes from the file that have not been consumed yet.
/// Beware: bytes[0] always points to the current byte the parser is examining.
bytes: &'a [u8],
/// The complete input exactly as it was handed to `State::new`.
/// In this file it is only read by `column()` to build its assertion message.
original_bytes: &'a [u8],
/// Length of the original input in bytes
/// (`pos()` subtracts the remaining length from this to get the current offset).
input_len: usize,
/// Position at which the current line starts; `column()` checks the tracked
/// column against the distance between this and the current position.
line_start: Position,
/// Current column within the current line.
pub xyzlcol: JustColumn,
/// Current indentation level, in columns
/// (so no indent is col 1 - this saves an arithmetic operation.)
/// NOTE(review): `State::new` initializes this to 0, which seems at odds
/// with "no indent is col 1" - confirm which convention is intended.
pub indent_column: u16,
}
/// Wrapper around a single `u16` column value.
///
/// `State` stores its current column in one of these (see `State::xyzlcol`).
#[derive(Clone, Copy)]
pub struct JustColumn {
/// The column number itself.
pub column: u16,
}
impl<'a> State<'a> {
    /// Creates a state positioned at the first byte of `bytes`.
    ///
    /// The column and the indentation both start at 0, and the current line
    /// is considered to start at offset 0.
    pub fn new(bytes: &'a [u8]) -> State<'a> {
        State {
            bytes,
            original_bytes: bytes,
            input_len: bytes.len(),
            line_start: Position::zero(),
            xyzlcol: JustColumn { column: 0 },
            indent_column: 0,
        }
    }

    /// Returns the input that has not been consumed yet.
    pub fn bytes(&self) -> &'a [u8] {
        self.bytes
    }

    /// Returns the current column.
    ///
    /// # Panics
    ///
    /// Panics if the tracked column does not equal the distance between the
    /// current position and the start of the current line. The panic message
    /// shows the input before and after the current position.
    pub fn column(&self) -> u16 {
        assert_eq!(
            self.xyzlcol.column as u32,
            self.pos().offset - self.line_start.offset,
            "between {:?} and {:?}",
            std::str::from_utf8(&self.original_bytes[..self.pos().offset as usize]).unwrap(),
            std::str::from_utf8(&self.original_bytes[self.pos().offset as usize..]).unwrap(),
        );
        self.xyzlcol.column
    }

    /// Returns a copy of this state advanced by `offset` bytes on the
    /// current line.
    ///
    /// The tracked column is moved forward by the same amount so that it
    /// stays in agreement with the position, as `column()` requires. Column
    /// arithmetic wraps at `u16::MAX`; use `advance_without_indenting_e` when
    /// an overly long line has to be reported as an error instead.
    ///
    /// # Panics
    ///
    /// Panics if `offset` is larger than the number of remaining bytes.
    #[must_use]
    pub fn advance(&self, offset: usize) -> State<'a> {
        let mut state = self.clone();
        // NOTE(review): the check below was disabled in the original; it suggests
        // callers are not expected to skip over a newline with this method.
        // debug_assert!(!state.bytes[..offset].iter().any(|b| *b == b'\n'));
        state.bytes = &state.bytes[offset..];
        // Keep the column in step with the position; truncating and wrapping
        // is deliberate so that this method never panics on long lines.
        state.xyzlcol.column = state.xyzlcol.column.wrapping_add(offset as u16);
        state
    }

    /// Returns a copy of this state advanced by one byte, with the byte after
    /// it treated as the start of a new line.
    ///
    /// The line start is moved to the new position and the column is reset to
    /// 0, so that `column()` keeps agreeing with the position.
    ///
    /// # Panics
    ///
    /// Panics if there is no remaining input.
    #[must_use]
    pub fn advance_newline(&self) -> State<'a> {
        let mut state = self.clone();
        state.bytes = &state.bytes[1..];
        state.line_start = state.pos();
        state.xyzlcol.column = 0;
        state
    }

    /// Returns the current position
    /// (the number of bytes of the original input consumed so far).
    pub const fn pos(&self) -> Position {
        Position::new((self.input_len - self.bytes.len()) as u32)
    }

    /// Returns whether the parser has reached the end of the input
    pub const fn has_reached_end(&self) -> bool {
        self.bytes.is_empty()
    }

    /// Use advance_spaces to advance with indenting.
    /// This assumes we are *not* advancing with spaces, or at least that
    /// any spaces on the line were preceded by non-spaces - which would mean
    /// they weren't eligible to indent anyway.
    ///
    /// Behaves like `advance_without_indenting_ee`, except that on failure
    /// `to_error` is called with `BadInputError::LineTooLong` and the current
    /// position.
    pub fn advance_without_indenting_e<TE, E>(
        self,
        quantity: usize,
        to_error: TE,
    ) -> Result<Self, (Progress, E, Self)>
    where
        TE: Fn(BadInputError, Position) -> E,
    {
        self.advance_without_indenting_ee(quantity, |p| to_error(BadInputError::LineTooLong, p))
    }

    /// Consumes `quantity` bytes and moves the column forward by the same
    /// amount, leaving every other field as it was.
    ///
    /// # Errors
    ///
    /// If the new column would not fit in a `u16`, returns `NoProgress`
    /// together with `to_error(self.pos())` and the unchanged state.
    ///
    /// # Panics
    ///
    /// Panics if `quantity` is larger than the number of remaining bytes.
    pub fn advance_without_indenting_ee<TE, E>(
        self,
        quantity: usize,
        to_error: TE,
    ) -> Result<Self, (Progress, E, Self)>
    where
        TE: Fn(Position) -> E,
    {
        match (self.xyzlcol.column as usize).checked_add(quantity) {
            Some(column_usize) if column_usize <= u16::MAX as usize => {
                Ok(State {
                    bytes: &self.bytes[quantity..],
                    xyzlcol: JustColumn {
                        column: column_usize as u16,
                    },
                    // All remaining fields (including the indentation) carry over unchanged.
                    ..self
                })
            }
            _ => Err((NoProgress, to_error(self.pos()), self)),
        }
    }

    /// Returns a Region corresponding to the current state, but
    /// with the end column advanced by the given amount. This is
    /// useful when parsing something "manually" (using input.chars())
    /// and thus wanting a Region while not having access to loc().
    pub fn len_region(&self, length: u16) -> Region {
        Region::new(self.pos(), self.pos().bump_column(length))
    }

    /// Return a failing ParseResult for the given FailReason
    ///
    /// Always returns `Err((progress, reason, self))`; `_arena` is unused.
    pub fn fail<T, X>(
        self,
        _arena: &'a Bump,
        progress: Progress,
        reason: X,
    ) -> Result<(Progress, T, Self), (Progress, X, Self)> {
        Err((progress, reason, self))
    }
}
impl<'a> fmt::Debug for State<'a> {
    /// Writes a multi-line dump of the state: the remaining input (shown as a
    /// string when it is valid UTF-8, otherwise as raw bytes), followed by
    /// the current column and the indentation column.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let column = self.xyzlcol.column;
        let indent = self.indent_column;

        write!(f, "State {{")?;

        // Prefer the readable string form and fall back to the byte slice.
        if let Ok(text) = std::str::from_utf8(self.bytes) {
            write!(f, "\n\tbytes: [utf8] {:?}", text)?;
        } else {
            write!(f, "\n\tbytes: [invalid utf8] {:?}", self.bytes)?;
        }

        write!(f, "\n\t(col): {},", column)?;
        write!(f, "\n\tindent_column: {}", indent)?;
        write!(f, "\n}}")
    }
}
#[test]
fn state_size() {
    // Guard against `State` growing: it has to stay within 8 machine words
    // so that it fits in a typical cache line.
    let word = std::mem::size_of::<usize>();
    let limit = word * 8;
    let actual = std::mem::size_of::<State>();
    assert!(actual <= limit, "{:?} <= {:?}", actual, limit);
}