roc/compiler/parse/src/string_literal.rs

use crate::ast::{Attempting, EscapedChar, StrLiteral, StrSegment};
use crate::expr;
use crate::parser::{
    allocated, ascii_char, ascii_hex_digits, loc, parse_utf8, unexpected, unexpected_eof, Fail,
    FailReason, ParseResult, Parser, State,
};
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;

pub fn parse<'a>() -> impl Parser<'a, StrLiteral<'a>> {
    use StrLiteral::*;

    move |arena: &'a Bump, mut state: State<'a>| {
        let mut bytes = state.bytes.iter();
        // String literals must start with a quote.
        // If this doesn't, it must not be a string literal!
        match bytes.next() {
            Some(&byte) => {
                if byte != b'"' {
                    return Err(unexpected(0, state, Attempting::StrLiteral));
                }
            }
            None => {
                return Err(unexpected_eof(0, Attempting::StrLiteral, state));
            }
        }

        // Advance past the opening quotation mark.
        state = state.advance_without_indenting(1)?;

        // At the parsing stage we keep the entire raw string, because the formatter
        // needs the raw string. (For example, so it can "remember" whether you
        // wrote \u{...} or the actual unicode character itself.)
        //
        // Since we're keeping the entire raw string, all we need to track is
        // how many characters we've parsed. So far, that's 1 (the opening `"`).
        let mut segment_parsed_bytes = 0;
        let mut segments = Vec::new_in(arena);

        macro_rules! escaped_char {
            ($ch:expr) => {
                // Record the escaped char.
                segments.push(StrSegment::EscapedChar($ch));

                // Advance past the segment we just added
                state = state.advance_without_indenting(segment_parsed_bytes)?;

                // Reset the segment
                segment_parsed_bytes = 0;
            };
        }

        macro_rules! end_segment {
            ($transform:expr) => {
                // Don't push anything if the string would be empty.
                if segment_parsed_bytes > 1 {
                    // This function is always called after we just parsed
                    // something which signalled that we should end the
                    // current segment - so use segment_parsed_bytes - 1 here,
                    // to exclude that char we just parsed.
                    let string_bytes = &state.bytes[0..(segment_parsed_bytes - 1)];

                    match parse_utf8(string_bytes) {
                        Ok(string) => {
                            state = state.advance_without_indenting(string.len())?;

                            segments.push($transform(string));
                        }
                        Err(reason) => {
                            return state.fail(reason);
                        }
                    }
                }

                // Depending on where this macro is used, in some
                // places this is unused.
                #[allow(unused_assignments)]
                {
                    // This function is always called after we just parsed
                    // something which signalled that we should end the
                    // current segment.
                    segment_parsed_bytes = 1;
                }
            };
        }

        while let Some(&byte) = bytes.next() {
            // This is for the byte we just grabbed from the iterator.
            segment_parsed_bytes += 1;

            match byte {
                b'"' => {
                    // This is the end of the string!
                    if segment_parsed_bytes == 1 && segments.is_empty() {
                        match bytes.next() {
                            Some(b'"') => {
                                // If the very first three chars were all `"`,
                                // then this literal begins with `"""`
                                // and is a block string.
                                return parse_block_string(arena, state, &mut bytes);
                            }
                            _ => {
                                // Advance 1 for the close quote
                                return Ok((PlainLine(""), state.advance_without_indenting(1)?));
                            }
                        }
                    } else {
                        end_segment!(StrSegment::Plaintext);

                        let expr = if segments.len() == 1 {
                            // We had exactly one segment, so this is a candidate
                            // to be StrLiteral::Plaintext
                            match segments.pop().unwrap() {
                                StrSegment::Plaintext(string) => StrLiteral::PlainLine(string),
                                other => {
                                    let vec = bumpalo::vec![in arena; other];

                                    StrLiteral::Line(vec.into_bump_slice())
                                }
                            }
                        } else {
                            Line(segments.into_bump_slice())
                        };

                        // Advance the state 1 to account for the closing `"`
                        return Ok((expr, state.advance_without_indenting(1)?));
                    };
                }
                b'\n' => {
                    // This is a single-line string, which cannot have newlines!
                    // Treat this as an unclosed string literal, and consume
                    // all remaining chars. This will mask all other errors, but
                    // it should make it easiest to debug; the file will be a giant
                    // error starting from where the open quote appeared.
                    return Err(unexpected(
                        state.bytes.len() - 1,
                        state,
                        Attempting::StrLiteral,
                    ));
                }
                b'\\' => {
                    // We're about to begin an escaped segment of some sort!
                    //
                    // Record the current segment so we can begin a new one.
                    // End it right before the `\` char we just parsed.
                    end_segment!(StrSegment::Plaintext);

                    // This is for the byte we're about to parse.
                    segment_parsed_bytes += 1;

                    // This is the start of a new escape. Look at the next byte
                    // to figure out what type of escape it is.
                    match bytes.next() {
                        Some(b'(') => {
                            // Advance past the `\(` before using the expr parser
                            state = state.advance_without_indenting(2)?;

                            let original_byte_count = state.bytes.len();

                            // This is an interpolated variable.
                            // Parse an arbitrary expression, then give a
                            // canonicalization error if that expression variant
                            // is not allowed inside a string interpolation.
                            let (loc_expr, new_state) =
                                skip_second!(loc(allocated(expr::expr(0))), ascii_char(b')'))
                                    .parse(arena, state)?;

                            // Advance the iterator past the expr we just parsed.
                            for _ in 0..(original_byte_count - new_state.bytes.len()) {
                                bytes.next();
                            }

                            segments.push(StrSegment::Interpolated(loc_expr));

                            // Reset the segment
                            segment_parsed_bytes = 0;
                            state = new_state;
                        }
                        Some(b'u') => {
                            // Advance past the `\u` before using the expr parser
                            state = state.advance_without_indenting(2)?;

                            let original_byte_count = state.bytes.len();

                            // Parse the hex digits, surrounded by parens, then
                            // give a canonicalization error if the digits form
                            // an invalid unicode code point.
                            let (loc_digits, new_state) = between!(
                                ascii_char(b'('),
                                loc(ascii_hex_digits()),
                                ascii_char(b')')
                            )
                            .parse(arena, state)?;

                            // Advance the iterator past the expr we just parsed.
                            for _ in 0..(original_byte_count - new_state.bytes.len()) {
                                bytes.next();
                            }

                            segments.push(StrSegment::Unicode(loc_digits));

                            // Reset the segment
                            segment_parsed_bytes = 0;
                            state = new_state;
                        }
                        Some(b'\\') => {
                            escaped_char!(EscapedChar::Backslash);
                        }
                        Some(b'"') => {
                            escaped_char!(EscapedChar::Quote);
                        }
                        Some(b'r') => {
                            escaped_char!(EscapedChar::CarriageReturn);
                        }
                        Some(b't') => {
                            escaped_char!(EscapedChar::Tab);
                        }
                        Some(b'n') => {
                            escaped_char!(EscapedChar::Newline);
                        }
                        _ => {
                            // Invalid escape! A backslash must be followed
                            // by either an open paren or else one of the
                            // escapable characters (\n, \t, \", \\, etc)
                            return Err(unexpected(
                                state.bytes.len() - 1,
                                state,
                                Attempting::StrLiteral,
                            ));
                        }
                    }
                }
                _ => {
                    // All other characters need no special handling.
                }
            }
        }

        // We ran out of characters before finding a closed quote
        Err(unexpected_eof(
            state.bytes.len(),
            Attempting::StrLiteral,
            state.clone(),
        ))
    }
}

fn parse_block_string<'a, I>(
    arena: &'a Bump,
    state: State<'a>,
    bytes: &mut I,
) -> ParseResult<'a, StrLiteral<'a>>
where
    I: Iterator<Item = &'a u8>,
{
    // So far we have consumed the `"""` and that's it.
    let mut parsed_chars = 3;
    let mut prev_byte = b'"';
    let mut quotes_seen = 0;

    // start at 3 to omit the opening `"`.
    let mut line_start = 3;

    let mut lines: Vec<'a, &'a str> = Vec::new_in(arena);

    for byte in bytes {
        parsed_chars += 1;

        // Potentially end the string (unless this is an escaped `"`!)
        match byte {
            b'"' if prev_byte != b'\\' => {
                if quotes_seen == 2 {
                    // three consecutive qoutes, end string

                    // Subtract 3 from parsed_chars so we omit the closing `"`.
                    let line_bytes = &state.bytes[line_start..(parsed_chars - 3)];

                    return match parse_utf8(line_bytes) {
                        Ok(line) => {
                            // state = state.advance_without_indenting(parsed_chars)?;

                            // lines.push(line);

                            // Ok((StrLiteral::Block(lines.into_bump_slice()), state))
                            Err((
                                Fail {
                                    attempting: state.attempting,
                                    reason: FailReason::NotYetImplemented(format!(
                                        "TODO parse this line in a block string: {:?}",
                                        line
                                    )),
                                },
                                state,
                            ))
                        }
                        Err(reason) => state.fail(reason),
                    };
                }
                quotes_seen += 1;
            }
            b'\n' => {
                // note this includes the newline
                let line_bytes = &state.bytes[line_start..parsed_chars];

                match parse_utf8(line_bytes) {
                    Ok(line) => {
                        lines.push(line);

                        quotes_seen = 0;
                        line_start = parsed_chars;
                    }
                    Err(reason) => {
                        return state.fail(reason);
                    }
                }
            }
            _ => {
                quotes_seen = 0;
            }
        }

        prev_byte = *byte;
    }

    // We ran out of characters before finding 3 closing quotes
    Err(unexpected_eof(
        parsed_chars,
        // TODO custom BlockStrLiteral?
        Attempting::StrLiteral,
        state,
    ))
}