roc/crates/compiler/parse/src/string_literal.rs

use crate::ast::{EscapedChar, SingleQuoteLiteral, StrLiteral, StrSegment};
use crate::expr;
use crate::parser::Progress::{self, *};
use crate::parser::{
    allocated, loc, reset_min_indent, specialize_ref, then, word1, BadInputError, ESingleQuote,
    EString, Parser,
};
use crate::state::State;
use bumpalo::collections::vec::Vec;
use bumpalo::Bump;

/// One or more ASCII hex digits. (Useful when parsing unicode escape codes,
/// which must consist entirely of ASCII hex digits.)
///
/// Consumes the longest run of hex digits at the start of the input and
/// returns it as an arena-allocated string. A run that extends all the way to
/// the end of the input is accepted like any other, so that the caller is the
/// one that reports whatever should have followed the digits.
///
/// Fails with `NoProgress` and `EString::CodePtEnd` when the input does not
/// begin with a hex digit (which includes empty input).
fn ascii_hex_digits<'a>() -> impl Parser<'a, &'a str, EString<'a>> {
    move |arena, mut state: State<'a>, _min_indent: u32| {
        // Measure the run up front instead of deciding inside the loop; this
        // way running out of input is not confused with "found no digits".
        let digit_count = state
            .bytes()
            .iter()
            .take_while(|byte| byte.is_ascii_hexdigit())
            .count();

        if digit_count == 0 {
            // We didn't find any hex digits!
            return Err((NoProgress, EString::CodePtEnd(state.pos())));
        }

        let mut buf = bumpalo::collections::String::new_in(arena);

        for &byte in &state.bytes()[..digit_count] {
            buf.push(byte as char);
        }

        state.advance_mut(digit_count);

        Ok((MadeProgress, buf.into_bump_str(), state))
    }
}

/// Skips up to `indent` leading spaces at the current position.
///
/// Stops early, without error, at a newline or at the end of the input.
/// Any other byte found before `indent` spaces have been skipped is reported
/// as `EString::MultilineInsufficientIndent` at the position of that byte.
fn consume_indent(mut state: State, indent: u32) -> Result<State, (Progress, EString)> {
    for _ in 0..indent {
        match state.bytes().first().copied() {
            // One more column of indentation accounted for.
            Some(b' ') => state.advance_mut(1),
            // A blank (or final) line may be shorter than the indent.
            None | Some(b'\n') => break,
            Some(_) => {
                let problem = EString::MultilineInsufficientIndent(state.pos());

                return Err((MadeProgress, problem));
            }
        }
    }

    Ok(state)
}

/// Interprets `string_bytes` as UTF-8 text.
///
/// On failure, reports `BadInputError::BadUtf8` at the position of `state`.
fn utf8<'a>(state: State<'a>, string_bytes: &'a [u8]) -> Result<&'a str, (Progress, EString<'a>)> {
    match std::str::from_utf8(string_bytes) {
        Ok(text) => Ok(text),
        // Given where this helper is called from, and given that the source
        // as a whole is already known to be valid UTF-8, this branch should
        // only be reachable through a parser bug (e.g. a slice boundary that
        // accidentally splits a multibyte character).
        Err(_) => Err((
            MadeProgress,
            EString::Space(BadInputError::BadUtf8, state.pos()),
        )),
    }
}

/// The result of parsing a quoted literal, before the caller has decided
/// which kind of quoting it is willing to accept.
pub enum StrLikeLiteral<'a> {
    /// A literal delimited by single quotes (`'…'`).
    SingleQuote(SingleQuoteLiteral<'a>),
    /// A literal delimited by double quotes (`"…"`) or triple double quotes
    /// (`"""…"""`).
    Str(StrLiteral<'a>),
}

/// Parses a string literal, accepting only the double-quoted forms.
///
/// Runs `parse_str_like_literal` and unwraps its `Str` result. A single-quote
/// literal is rejected with `EString::ExpectedDoubleQuoteGotSingleQuote`,
/// positioned at the start of the literal and carrying the progress reported
/// by the underlying parser.
pub fn parse_str_literal<'a>() -> impl Parser<'a, StrLiteral<'a>, EString<'a>> {
    then(
        loc!(parse_str_like_literal()),
        |_arena, state, progress, loc_literal| {
            // Only needed for the error case, but cheap to grab up front.
            let literal_start = loc_literal.region.start();

            match loc_literal.value {
                StrLikeLiteral::Str(literal) => Ok((progress, literal, state)),
                StrLikeLiteral::SingleQuote(_) => Err((
                    progress,
                    EString::ExpectedDoubleQuoteGotSingleQuote(literal_start),
                )),
            }
        },
    )
}

/// Parses a quoted literal: a single-line `"…"` string, a multiline
/// `"""…"""` block string, or a single-quote `'…'` literal.
///
/// The opening delimiter selects the mode. The body is then scanned byte by
/// byte and split into [`StrSegment`]s: plain text, escaped characters
/// (`\\`, `\"`, `\'`, `\r`, `\t`, `\n`), unicode escapes (`\u(…)`) and
/// interpolations (`\(…)`).
///
/// Fails with `NoProgress` and `EString::Open` if the input does not start
/// with a quote. Errors detected directly in this function after the opening
/// quote are reported with `MadeProgress`; errors from the nested parsers are
/// propagated unchanged via `?`.
pub fn parse_str_like_literal<'a>() -> impl Parser<'a, StrLikeLiteral<'a>, EString<'a>> {
    move |arena: &'a Bump, mut state: State<'a>, min_indent: u32| {
        let is_multiline;
        let is_single_quote;

        // Column of the opening quote. In a multiline string, up to this many
        // leading spaces are stripped from each subsequent line.
        let indent = state.column();

        // The state just past the opening delimiter; only used to position
        // error reports.
        let start_state;

        if state.consume_mut("\"\"\"") {
            start_state = state.clone();

            // we will be parsing a multi-line string
            is_multiline = true;
            is_single_quote = false;

            // A newline directly after the opening `"""` is not part of the
            // string's contents.
            if state.consume_mut("\n") {
                state = consume_indent(state, indent)?;
            }
        } else if state.consume_mut("\"") {
            start_state = state.clone();

            // we will be parsing a single-line string
            is_multiline = false;
            is_single_quote = false;
        } else if state.consume_mut("'") {
            start_state = state.clone();

            // we will be parsing a single-quote literal
            is_multiline = false;
            is_single_quote = true;
        } else {
            return Err((NoProgress, EString::Open(state.pos())));
        }

        // `bytes` runs ahead of `state`: `state` stays at the start of the
        // segment currently being scanned, and `segment_parsed_bytes` counts
        // how many bytes of that segment the iterator has yielded so far.
        let mut bytes = state.bytes().iter();

        let mut segment_parsed_bytes = 0;
        let mut segments = Vec::new_in(arena);

        // Records a two-byte escape sequence (backslash + one char) as an
        // `EscapedChar` segment and starts a fresh segment after it.
        macro_rules! escaped_char {
            ($ch:expr) => {
                // Record the escaped char.
                segments.push(StrSegment::EscapedChar($ch));

                // Advance past the segment we just added
                state.advance_mut(segment_parsed_bytes);

                // Reset the segment
                segment_parsed_bytes = 0;
            };
        }

        // Flushes the text scanned so far (excluding the byte that triggered
        // the flush) as a segment built by `$transform`, and advances `state`
        // past that text.
        macro_rules! end_segment {
            ($transform:expr) => {
                // Don't push anything if the string would be empty.
                if segment_parsed_bytes > 1 {
                    // This macro is always invoked after we just parsed
                    // something which signalled that we should end the
                    // current segment - so use segment_parsed_bytes - 1 here,
                    // to exclude that char we just parsed.
                    let string_bytes = &state.bytes()[0..(segment_parsed_bytes - 1)];

                    match std::str::from_utf8(string_bytes) {
                        Ok(string) => {
                            state.advance_mut(string.len());

                            segments.push($transform(string));
                        }
                        Err(_) => {
                            return Err((
                                MadeProgress,
                                EString::Space(BadInputError::BadUtf8, state.pos()),
                            ));
                        }
                    }
                }

                // Depending on where this macro is used, in some
                // places this is unused.
                #[allow(unused_assignments)]
                {
                    // This macro is always invoked after we just parsed
                    // something which signalled that we should end the
                    // current segment. Only that one byte remains counted.
                    segment_parsed_bytes = 1;
                }
            };
        }

        while let Some(&byte) = bytes.next() {
            // This is for the byte we just grabbed from the iterator.
            segment_parsed_bytes += 1;

            match byte {
                b'"' if !is_single_quote => {
                    if segment_parsed_bytes == 1 && segments.is_empty() {
                        // special case of the empty string
                        if is_multiline {
                            if bytes.as_slice().starts_with(b"\"\"") {
                                // Advance 3 for the closing `"""`
                                return Ok((
                                    MadeProgress,
                                    StrLikeLiteral::Str(StrLiteral::Block(&[])),
                                    state.advance(3),
                                ));
                            } else {
                                // this quote is in a block string
                                continue;
                            }
                        } else {
                            // This is the end of the string!
                            // Advance 1 for the close quote
                            return Ok((
                                MadeProgress,
                                StrLikeLiteral::Str(StrLiteral::PlainLine("")),
                                state.advance(1),
                            ));
                        }
                    } else {
                        // the string is non-empty, which means we need to convert any previous segments
                        // and the current segment into a string literal
                        if is_multiline {
                            if bytes.as_slice().starts_with(b"\"\"") {
                                end_segment!(StrSegment::Plaintext);

                                let expr = if segments.len() == 1 {
                                    // We had exactly one segment, so this is a candidate
                                    // to be StrLiteral::PlainLine
                                    match segments.pop().unwrap() {
                                        StrSegment::Plaintext(string) => {
                                            StrLiteral::PlainLine(string)
                                        }
                                        other => StrLiteral::Line(arena.alloc([other])),
                                    }
                                } else {
                                    StrLiteral::Block(arena.alloc([segments.into_bump_slice()]))
                                };

                                // Advance the state 3 to account for the closing `"""`
                                return Ok((
                                    MadeProgress,
                                    StrLikeLiteral::Str(expr),
                                    state.advance(3),
                                ));
                            } else {
                                // this quote is in a block string
                                continue;
                            }
                        } else {
                            end_segment!(StrSegment::Plaintext);

                            let expr = if segments.len() == 1 {
                                // We had exactly one segment, so this is a candidate
                                // to be StrLiteral::PlainLine
                                match segments.pop().unwrap() {
                                    StrSegment::Plaintext(string) => StrLiteral::PlainLine(string),
                                    other => StrLiteral::Line(arena.alloc([other])),
                                }
                            } else {
                                StrLiteral::Line(segments.into_bump_slice())
                            };

                            // Advance the state 1 to account for the closing `"`
                            return Ok((MadeProgress, StrLikeLiteral::Str(expr), state.advance(1)));
                        }
                    };
                }
                b'\'' if is_single_quote => {
                    end_segment!(StrSegment::Plaintext);

                    let expr = if segments.len() == 1 {
                        // We had exactly one segment, so this is a candidate
                        // to be SingleQuoteLiteral::PlainLine
                        match segments.pop().unwrap() {
                            StrSegment::Plaintext(string) => SingleQuoteLiteral::PlainLine(string),
                            other => {
                                // Segments that cannot be converted into a
                                // single-quote segment are rejected here.
                                let o = other.try_into().map_err(|e| {
                                    (
                                        MadeProgress,
                                        EString::InvalidSingleQuote(e, start_state.pos()),
                                    )
                                })?;
                                SingleQuoteLiteral::Line(arena.alloc([o]))
                            }
                        }
                    } else {
                        // Zero segments or several: convert each one, failing
                        // on the first that cannot be converted.
                        let mut new_segments = Vec::with_capacity_in(segments.len(), arena);
                        for segment in segments {
                            let segment = segment.try_into().map_err(|e| {
                                (
                                    MadeProgress,
                                    EString::InvalidSingleQuote(e, start_state.pos()),
                                )
                            })?;
                            new_segments.push(segment);
                        }

                        SingleQuoteLiteral::Line(new_segments.into_bump_slice())
                    };

                    // Validate that the string is a valid char literal.
                    // Note that currently, we accept anything that:
                    // * Is between 1 and 5 bytes long
                    //   -> utf-8 encoding is trivial to extend to 5 bytes, even though 4 is the technical max
                    //   -> TODO: do we want to change this?
                    // * Decodes as valid UTF-8
                    //   -> Might be a single code point, or multiple code points
                    //   -> TODO: do we want to change this?

                    // Simply by decoding this, it's guaranteed to be valid utf-8
                    let text = expr.to_str_in(arena);

                    if text.len() > 5 {
                        return Err((
                            MadeProgress,
                            EString::InvalidSingleQuote(ESingleQuote::TooLong, start_state.pos()),
                        ));
                    }

                    if text.is_empty() {
                        return Err((
                            MadeProgress,
                            EString::InvalidSingleQuote(ESingleQuote::Empty, start_state.pos()),
                        ));
                    }

                    // Advance the state 1 to account for the closing `'`
                    return Ok((
                        MadeProgress,
                        StrLikeLiteral::SingleQuote(expr),
                        state.advance(1),
                    ));
                }
                b'\n' => {
                    if is_multiline {
                        // Capture the current line both ways before moving
                        // `state`; which one gets used depends on what
                        // follows the indentation on the next line.
                        let without_newline = &state.bytes()[0..(segment_parsed_bytes - 1)];
                        let with_newline = &state.bytes()[0..segment_parsed_bytes];

                        state.advance_mut(segment_parsed_bytes);
                        state = consume_indent(state, indent)?;
                        // Resynchronize the iterator with `state`, which has
                        // now skipped the next line's indentation.
                        bytes = state.bytes().iter();

                        if state.bytes().starts_with(b"\"\"\"") {
                            // ending the string; don't use the last newline
                            if !without_newline.is_empty() {
                                segments.push(StrSegment::Plaintext(utf8(
                                    state.clone(),
                                    without_newline,
                                )?));
                            }
                        } else {
                            segments
                                .push(StrSegment::Plaintext(utf8(state.clone(), with_newline)?));
                        }

                        segment_parsed_bytes = 0;

                        continue;
                    } else {
                        // This is a single-line string, which cannot have newlines!
                        // Treat this as an unclosed string literal, and consume
                        // all remaining chars. This will mask all other errors, but
                        // it should make it easiest to debug; the file will be a giant
                        // error starting from where the open quote appeared.
                        return Err((MadeProgress, EString::EndlessSingleLine(start_state.pos())));
                    }
                }
                b'\\' => {
                    // We're about to begin an escaped segment of some sort!
                    //
                    // Record the current segment so we can begin a new one.
                    // End it right before the `\` char we just parsed.
                    end_segment!(StrSegment::Plaintext);

                    // This is for the byte we're about to parse.
                    segment_parsed_bytes += 1;

                    // This is the start of a new escape. Look at the next byte
                    // to figure out what type of escape it is.
                    match bytes.next() {
                        Some(b'(') => {
                            // Advance past the `\(` before using the expr parser
                            state.advance_mut(2);

                            let original_byte_count = state.bytes().len();

                            // This is an interpolated variable.
                            // Parse an arbitrary expression, then give a
                            // canonicalization error if that expression variant
                            // is not allowed inside a string interpolation.
                            let (_progress, loc_expr, new_state) = skip_second!(
                                specialize_ref(
                                    EString::Format,
                                    loc(allocated(reset_min_indent(expr::expr_help())))
                                ),
                                word1(b')', EString::FormatEnd)
                            )
                            .parse(arena, state, min_indent)?;

                            // Advance the iterator past the expr we just parsed.
                            for _ in 0..(original_byte_count - new_state.bytes().len()) {
                                bytes.next();
                            }

                            segments.push(StrSegment::Interpolated(loc_expr));

                            // Reset the segment
                            segment_parsed_bytes = 0;
                            state = new_state;
                        }
                        Some(b'u') => {
                            // Advance past the `\u` before parsing the code point
                            state.advance_mut(2);

                            let original_byte_count = state.bytes().len();

                            // Parse the hex digits, surrounded by parens, then
                            // give a canonicalization error if the digits form
                            // an invalid unicode code point.
                            let (_progress, loc_digits, new_state) = between!(
                                word1(b'(', EString::CodePtOpen),
                                loc(ascii_hex_digits()),
                                word1(b')', EString::CodePtEnd)
                            )
                            .parse(arena, state, min_indent)?;

                            // Advance the iterator past the code point we just parsed.
                            for _ in 0..(original_byte_count - new_state.bytes().len()) {
                                bytes.next();
                            }

                            segments.push(StrSegment::Unicode(loc_digits));

                            // Reset the segment
                            segment_parsed_bytes = 0;
                            state = new_state;
                        }
                        Some(b'\\') => {
                            escaped_char!(EscapedChar::Backslash);
                        }
                        Some(b'"') => {
                            escaped_char!(EscapedChar::DoubleQuote);
                        }
                        Some(b'\'') => {
                            escaped_char!(EscapedChar::SingleQuote);
                        }
                        Some(b'r') => {
                            escaped_char!(EscapedChar::CarriageReturn);
                        }
                        Some(b't') => {
                            escaped_char!(EscapedChar::Tab);
                        }
                        Some(b'n') => {
                            escaped_char!(EscapedChar::Newline);
                        }
                        _ => {
                            // Invalid escape! A backslash must be followed
                            // by either an open paren or else one of the
                            // escapable characters (\n, \t, \", \\, etc)
                            //
                            // `state` is positioned at the backslash here.
                            return Err((MadeProgress, EString::UnknownEscape(state.pos())));
                        }
                    }
                }
                _ => {
                    // All other characters need no special handling.
                }
            }
        }

        // We ran out of characters before finding a closed quote
        Err((
            MadeProgress,
            if is_single_quote {
                EString::EndlessSingleQuote(start_state.pos())
            } else if is_multiline {
                EString::EndlessMultiLine(start_state.pos())
            } else {
                EString::EndlessSingleLine(start_state.pos())
            },
        ))
    }
}