use crate::ast::{EscapedChar, StrLiteral, StrSegment}; use crate::expr; use crate::parser::Progress::{self, *}; use crate::parser::{ allocated, loc, reset_min_indent, specialize_ref, word1, BadInputError, EString, Parser, }; use crate::state::State; use bumpalo::collections::vec::Vec; use bumpalo::Bump; /// One or more ASCII hex digits. (Useful when parsing unicode escape codes, /// which must consist entirely of ASCII hex digits.) fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> { move |arena, mut state: State<'a>, _min_indent: u32| { let mut buf = bumpalo::collections::String::new_in(arena); for &byte in state.bytes().iter() { if (byte as char).is_ascii_hexdigit() { buf.push(byte as char); } else if buf.is_empty() { // We didn't find any hex digits! return Err((NoProgress, EString::CodePtEnd(state.pos()), state)); } else { state.advance_mut(buf.len()); return Ok((MadeProgress, buf.into_bump_str(), state)); } } Err((NoProgress, EString::CodePtEnd(state.pos()), state)) } } pub fn parse_single_quote<'a>() -> impl Parser<'a, &'a str, EString<'a>> { move |arena: &'a Bump, mut state: State<'a>, _min_indent: u32| { if state.consume_mut("\'") { // we will be parsing a single-quote-string } else { return Err((NoProgress, EString::Open(state.pos()), state)); } // Handle back slaches in byte literal // - starts with a backslash and used as an escape character. ex: '\n', '\t' // - single quote floating (un closed single quote) should be an error match state.bytes().first() { Some(b'\\') => { state.advance_mut(1); match state.bytes().first() { Some(&ch) => { state.advance_mut(1); if (ch == b'n' || ch == b'r' || ch == b't' || ch == b'\'' || ch == b'\\') && (state.bytes().first() == Some(&b'\'')) { state.advance_mut(1); let test = match ch { b'n' => '\n', b't' => '\t', b'r' => '\r', // since we checked the current char between the single quotes we // know they are valid UTF-8, allowing us to use 'from_u32_unchecked' _ => unsafe { char::from_u32_unchecked(ch as u32) }, }; return Ok((MadeProgress, &*arena.alloc_str(&test.to_string()), state)); } // invalid error, backslah escaping something we do not recognize return Err((NoProgress, EString::CodePtEnd(state.pos()), state)); } None => { // no close quote found return Err((NoProgress, EString::CodePtEnd(state.pos()), state)); } } } Some(_) => { // do nothing for other characters, handled below } None => return Err((NoProgress, EString::CodePtEnd(state.pos()), state)), } let mut bytes = state.bytes().iter(); let mut end_index = 1; // Copy paste problem in mono loop { match bytes.next() { Some(b'\'') => { break; } Some(_) => end_index += 1, None => { return Err((NoProgress, EString::Open(state.pos()), state)); } } } if end_index == 1 { // no progress was made // this case is a double single quote, ex: '' // not supporting empty single quotes return Err((NoProgress, EString::Open(state.pos()), state)); } if end_index > (std::mem::size_of::() + 1) { // bad case: too big to fit into u32 return Err((NoProgress, EString::Open(state.pos()), state)); } // happy case -> we have some bytes that will fit into a u32 // ending up w/ a slice of bytes that we want to convert into an integer let raw_bytes = &state.bytes()[0..end_index - 1]; state.advance_mut(end_index); match std::str::from_utf8(raw_bytes) { Ok(string) => Ok((MadeProgress, string, state)), Err(_) => { // invalid UTF-8 return Err((NoProgress, EString::CodePtEnd(state.pos()), state)); } } } } fn consume_indent<'a>( mut state: State<'a>, mut indent: u32, ) -> Result, State<'a>)> { while indent > 0 { match state.bytes().first() { Some(b' ') => { state.advance_mut(1); indent -= 1; } None | Some(b'\n') => { break; } Some(_) => { return Err(( MadeProgress, EString::MultilineInsufficientIndent(state.pos()), state, )); } } } Ok(state) } fn utf8<'a>( state: State<'a>, string_bytes: &'a [u8], ) -> Result<&'a str, (Progress, EString<'a>, State<'a>)> { std::str::from_utf8(string_bytes).map_err(|_| { // Note Based on where this `utf8` function is used, the fact that we know the whole string // in the parser is valid utf8, and barring bugs in the parser itself // (e.g. where we accidentally split a multibyte utf8 char), this error _should_ actually be unreachable. ( MadeProgress, EString::Space(BadInputError::BadUtf8, state.pos()), state, ) }) } pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> { use StrLiteral::*; move |arena: &'a Bump, mut state: State<'a>, min_indent: u32| { let is_multiline; let indent = state.column(); let start_state; if state.consume_mut("\"\"\"") { start_state = state.clone(); // we will be parsing a multi-line string is_multiline = true; if state.consume_mut("\n") { state = consume_indent(state, indent)?; } } else if state.consume_mut("\"") { start_state = state.clone(); // we will be parsing a single-line string is_multiline = false; } else { return Err((NoProgress, EString::Open(state.pos()), state)); } let mut bytes = state.bytes().iter(); let mut segment_parsed_bytes = 0; let mut segments = Vec::new_in(arena); macro_rules! escaped_char { ($ch:expr) => { // Record the escaped char. segments.push(StrSegment::EscapedChar($ch)); // Advance past the segment we just added state.advance_mut(segment_parsed_bytes); // Reset the segment segment_parsed_bytes = 0; }; } macro_rules! end_segment { ($transform:expr) => { // Don't push anything if the string would be empty. if segment_parsed_bytes > 1 { // This function is always called after we just parsed // something which signalled that we should end the // current segment - so use segment_parsed_bytes - 1 here, // to exclude that char we just parsed. let string_bytes = &state.bytes()[0..(segment_parsed_bytes - 1)]; match std::str::from_utf8(string_bytes) { Ok(string) => { state.advance_mut(string.len()); segments.push($transform(string)); } Err(_) => { return Err(( MadeProgress, EString::Space(BadInputError::BadUtf8, state.pos()), state, )); } } } // Depending on where this macro is used, in some // places this is unused. #[allow(unused_assignments)] { // This function is always called after we just parsed // something which signalled that we should end the // current segment. segment_parsed_bytes = 1; } }; } while let Some(&byte) = bytes.next() { // This is for the byte we just grabbed from the iterator. segment_parsed_bytes += 1; match byte { b'"' => { if segment_parsed_bytes == 1 && segments.is_empty() { // special case of the empty string if is_multiline { if bytes.as_slice().starts_with(b"\"\"") { return Ok((MadeProgress, Block(&[]), state.advance(3))); } else { // this quote is in a block string continue; } } else { // This is the end of the string! // Advance 1 for the close quote return Ok((MadeProgress, PlainLine(""), state.advance(1))); } } else { // the string is non-empty, which means we need to convert any previous segments // and the current segment into a string literal if is_multiline { if bytes.as_slice().starts_with(b"\"\"") { end_segment!(StrSegment::Plaintext); let expr = if segments.len() == 1 { // We had exactly one segment, so this is a candidate // to be StrLiteral::Plaintext match segments.pop().unwrap() { StrSegment::Plaintext(string) => { StrLiteral::PlainLine(string) } other => StrLiteral::Line(arena.alloc([other])), } } else { Block(arena.alloc([segments.into_bump_slice()])) }; return Ok((MadeProgress, expr, state.advance(3))); } else { // this quote is in a block string continue; } } else { end_segment!(StrSegment::Plaintext); let expr = if segments.len() == 1 { // We had exactly one segment, so this is a candidate // to be StrLiteral::Plaintext match segments.pop().unwrap() { StrSegment::Plaintext(string) => StrLiteral::PlainLine(string), other => StrLiteral::Line(arena.alloc([other])), } } else { Line(segments.into_bump_slice()) }; // Advance the state 1 to account for the closing `"` return Ok((MadeProgress, expr, state.advance(1))); } }; } b'\n' => { if is_multiline { let without_newline = &state.bytes()[0..(segment_parsed_bytes - 1)]; let with_newline = &state.bytes()[0..segment_parsed_bytes]; state.advance_mut(segment_parsed_bytes); state = consume_indent(state, indent)?; bytes = state.bytes().iter(); if state.bytes().starts_with(b"\"\"\"") { // ending the string; don't use the last newline segments .push(StrSegment::Plaintext(utf8(state.clone(), without_newline)?)); } else { segments .push(StrSegment::Plaintext(utf8(state.clone(), with_newline)?)); } segment_parsed_bytes = 0; continue; } else { // This is a single-line string, which cannot have newlines! // Treat this as an unclosed string literal, and consume // all remaining chars. This will mask all other errors, but // it should make it easiest to debug; the file will be a giant // error starting from where the open quote appeared. return Err(( MadeProgress, EString::EndlessSingle(start_state.pos()), start_state, )); } } b'\\' => { // We're about to begin an escaped segment of some sort! // // Record the current segment so we can begin a new one. // End it right before the `\` char we just parsed. end_segment!(StrSegment::Plaintext); // This is for the byte we're about to parse. segment_parsed_bytes += 1; // This is the start of a new escape. Look at the next byte // to figure out what type of escape it is. match bytes.next() { Some(b'(') => { // Advance past the `\(` before using the expr parser state.advance_mut(2); let original_byte_count = state.bytes().len(); // This is an interpolated variable. // Parse an arbitrary expression, then give a // canonicalization error if that expression variant // is not allowed inside a string interpolation. let (_progress, loc_expr, new_state) = skip_second!( specialize_ref( EString::Format, loc(allocated(reset_min_indent(expr::expr_help()))) ), word1(b')', EString::FormatEnd) ) .parse(arena, state, min_indent)?; // Advance the iterator past the expr we just parsed. for _ in 0..(original_byte_count - new_state.bytes().len()) { bytes.next(); } segments.push(StrSegment::Interpolated(loc_expr)); // Reset the segment segment_parsed_bytes = 0; state = new_state; } Some(b'u') => { // Advance past the `\u` before using the expr parser state.advance_mut(2); let original_byte_count = state.bytes().len(); // Parse the hex digits, surrounded by parens, then // give a canonicalization error if the digits form // an invalid unicode code point. let (_progress, loc_digits, new_state) = between!( word1(b'(', EString::CodePtOpen), loc(ascii_hex_digits()), word1(b')', EString::CodePtEnd) ) .parse(arena, state, min_indent)?; // Advance the iterator past the expr we just parsed. for _ in 0..(original_byte_count - new_state.bytes().len()) { bytes.next(); } segments.push(StrSegment::Unicode(loc_digits)); // Reset the segment segment_parsed_bytes = 0; state = new_state; } Some(b'\\') => { escaped_char!(EscapedChar::Backslash); } Some(b'"') => { escaped_char!(EscapedChar::Quote); } Some(b'r') => { escaped_char!(EscapedChar::CarriageReturn); } Some(b't') => { escaped_char!(EscapedChar::Tab); } Some(b'n') => { escaped_char!(EscapedChar::Newline); } _ => { // Invalid escape! A backslash must be followed // by either an open paren or else one of the // escapable characters (\n, \t, \", \\, etc) return Err((MadeProgress, EString::UnknownEscape(state.pos()), state)); } } } _ => { // All other characters need no special handling. } } } // We ran out of characters before finding a closed quote Err(( MadeProgress, if is_multiline { EString::EndlessMulti(start_state.pos()) } else { EString::EndlessSingle(start_state.pos()) }, start_state, )) } }