Update to rustc-literal-escaper 0.0.4 for a better API (no unreachable code paths) and faster string parsing

2025-11-24 13:10:28 +00:00 · 2025-03-07 11:17:39 +00:00 · 2025-03-07 11:17:39 +00:00 · 13a46eab7d
commit 13a46eab7d
parent e2c3647c6a
7 changed files with 137 additions and 165 deletions
--- a/crates/parser/src/lexed_str.rs
+++ b/crates/parser/src/lexed_str.rs
@ -11,7 +11,8 @@
 use std::ops;

 use rustc_literal_escaper::{
-    EscapeError, Mode, unescape_byte, unescape_char, unescape_mixed, unescape_unicode,
+    EscapeError, Mode, unescape_byte, unescape_byte_str, unescape_c_str, unescape_char,
+    unescape_str,
 };

 use crate::{
@ -151,14 +152,14 @@ impl<'a> Converter<'a> {
        self.res
    }

-    fn push(&mut self, kind: SyntaxKind, len: usize, err: Option<&str>) {
+    fn push(&mut self, kind: SyntaxKind, len: usize, errors: Vec<String>) {
        self.res.push(kind, self.offset);
        self.offset += len;

-        if let Some(err) = err {
-            let token = self.res.len() as u32;
-            let msg = err.to_owned();
-            self.res.error.push(LexError { msg, token });
+        for msg in errors {
+            if !msg.is_empty() {
+                self.res.error.push(LexError { msg, token: self.res.len() as u32 });
+            }
        }
    }

@ -167,14 +168,16 @@ impl<'a> Converter<'a> {
        // We drop some useful information here (see patterns with double dots `..`)
        // Storing that info in `SyntaxKind` is not possible due to its layout requirements of
        // being `u16` that come from `rowan::SyntaxKind`.
-        let mut err = "";
+        let mut errors: Vec<String> = vec![];

        let syntax_kind = {
            match kind {
                rustc_lexer::TokenKind::LineComment { doc_style: _ } => COMMENT,
                rustc_lexer::TokenKind::BlockComment { doc_style: _, terminated } => {
                    if !terminated {
-                        err = "Missing trailing `*/` symbols to terminate the block comment";
+                        errors.push(
+                            "Missing trailing `*/` symbols to terminate the block comment".into(),
+                        );
                    }
                    COMMENT
                }
@ -184,9 +187,9 @@ impl<'a> Converter<'a> {
                    invalid_infostring,
                } => {
                    if *has_invalid_preceding_whitespace {
-                        err = "invalid preceding whitespace for frontmatter opening"
+                        errors.push("invalid preceding whitespace for frontmatter opening".into());
                    } else if *invalid_infostring {
-                        err = "invalid infostring for frontmatter"
+                        errors.push("invalid infostring for frontmatter".into());
                    }
                    FRONTMATTER
                }
@ -198,7 +201,7 @@ impl<'a> Converter<'a> {
                    SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
                }
                rustc_lexer::TokenKind::InvalidIdent => {
-                    err = "Ident contains invalid characters";
+                    errors.push("Ident contains invalid characters".into());
                    IDENT
                }

@ -206,7 +209,7 @@ impl<'a> Converter<'a> {

                rustc_lexer::TokenKind::GuardedStrPrefix if self.edition.at_least_2024() => {
                    // FIXME: rustc does something better for recovery.
-                    err = "Invalid string literal (reserved syntax)";
+                    errors.push("Invalid string literal (reserved syntax)".into());
                    ERROR
                }
                rustc_lexer::TokenKind::GuardedStrPrefix => {
@ -222,12 +225,12 @@ impl<'a> Converter<'a> {

                rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
                    if *starts_with_number {
-                        err = "Lifetime name cannot start with a number";
+                        errors.push("Lifetime name cannot start with a number".into());
                    }
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::UnknownPrefixLifetime => {
-                    err = "Unknown lifetime prefix";
+                    errors.push("Unknown lifetime prefix".into());
                    LIFETIME_IDENT
                }
                rustc_lexer::TokenKind::RawLifetime => LIFETIME_IDENT,
@ -262,119 +265,128 @@ impl<'a> Converter<'a> {
                rustc_lexer::TokenKind::Unknown => ERROR,
                rustc_lexer::TokenKind::UnknownPrefix if token_text == "builtin" => IDENT,
                rustc_lexer::TokenKind::UnknownPrefix => {
-                    err = "unknown literal prefix";
+                    errors.push("unknown literal prefix".into());
                    IDENT
                }
                rustc_lexer::TokenKind::Eof => EOF,
            }
        };

-        let err = if err.is_empty() { None } else { Some(err) };
-        self.push(syntax_kind, token_text.len(), err);
+        self.push(syntax_kind, token_text.len(), errors);
    }

    fn extend_literal(&mut self, len: usize, kind: &rustc_lexer::LiteralKind) {
-        let mut err = "";
+        let invalid_raw_msg = String::from("Invalid raw string literal");
+
+        let mut errors = vec![];
+        let mut no_end_quote = |c: char, kind: &str| {
+            errors.push(format!("Missing trailing `{c}` symbol to terminate the {kind} literal"));
+        };

        let syntax_kind = match *kind {
            rustc_lexer::LiteralKind::Int { empty_int, base: _ } => {
                if empty_int {
-                    err = "Missing digits after the integer base prefix";
+                    errors.push("Missing digits after the integer base prefix".into());
                }
                INT_NUMBER
            }
            rustc_lexer::LiteralKind::Float { empty_exponent, base: _ } => {
                if empty_exponent {
-                    err = "Missing digits after the exponent symbol";
+                    errors.push("Missing digits after the exponent symbol".into());
                }
                FLOAT_NUMBER
            }
            rustc_lexer::LiteralKind::Char { terminated } => {
                if !terminated {
-                    err = "Missing trailing `'` symbol to terminate the character literal";
+                    no_end_quote('\'', "character");
                } else {
                    let text = &self.res.text[self.offset + 1..][..len - 1];
-                    let i = text.rfind('\'').unwrap();
-                    let text = &text[..i];
+                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_char(text) {
-                        err = error_to_diagnostic_message(e, Mode::Char);
+                        errors.push(err_to_msg(e, Mode::Char));
                    }
                }
                CHAR
            }
            rustc_lexer::LiteralKind::Byte { terminated } => {
                if !terminated {
-                    err = "Missing trailing `'` symbol to terminate the byte literal";
+                    no_end_quote('\'', "byte");
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
-                    let i = text.rfind('\'').unwrap();
-                    let text = &text[..i];
+                    let text = &text[..text.rfind('\'').unwrap()];
                    if let Err(e) = unescape_byte(text) {
-                        err = error_to_diagnostic_message(e, Mode::Byte);
+                        errors.push(err_to_msg(e, Mode::Byte));
                    }
                }
-
                BYTE
            }
            rustc_lexer::LiteralKind::Str { terminated } => {
                if !terminated {
-                    err = "Missing trailing `\"` symbol to terminate the string literal";
+                    no_end_quote('"', "string");
                } else {
                    let text = &self.res.text[self.offset + 1..][..len - 1];
-                    let i = text.rfind('"').unwrap();
-                    let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::Str);
+                    let text = &text[..text.rfind('"').unwrap()];
+                    unescape_str(text, |_, res| {
+                        if let Err(e) = res {
+                            errors.push(err_to_msg(e, Mode::Str));
+                        }
+                    });
                }
                STRING
            }
            rustc_lexer::LiteralKind::ByteStr { terminated } => {
                if !terminated {
-                    err = "Missing trailing `\"` symbol to terminate the byte string literal";
+                    no_end_quote('"', "byte string");
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
-                    let i = text.rfind('"').unwrap();
-                    let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::ByteStr);
+                    let text = &text[..text.rfind('"').unwrap()];
+                    unescape_byte_str(text, |_, res| {
+                        if let Err(e) = res {
+                            errors.push(err_to_msg(e, Mode::ByteStr));
+                        }
+                    });
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::CStr { terminated } => {
                if !terminated {
-                    err = "Missing trailing `\"` symbol to terminate the string literal";
+                    no_end_quote('"', "C string")
                } else {
                    let text = &self.res.text[self.offset + 2..][..len - 2];
-                    let i = text.rfind('"').unwrap();
-                    let text = &text[..i];
-                    err = unescape_string_error_message(text, Mode::CStr);
+                    let text = &text[..text.rfind('"').unwrap()];
+                    unescape_c_str(text, |_, res| {
+                        if let Err(e) = res {
+                            errors.push(err_to_msg(e, Mode::CStr));
+                        }
+                    });
                }
                C_STRING
            }
            rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                if n_hashes.is_none() {
-                    err = "Invalid raw string literal";
+                    errors.push(invalid_raw_msg);
                }
                STRING
            }
            rustc_lexer::LiteralKind::RawByteStr { n_hashes } => {
                if n_hashes.is_none() {
-                    err = "Invalid raw string literal";
+                    errors.push(invalid_raw_msg);
                }
                BYTE_STRING
            }
            rustc_lexer::LiteralKind::RawCStr { n_hashes } => {
                if n_hashes.is_none() {
-                    err = "Invalid raw string literal";
+                    errors.push(invalid_raw_msg);
                }
                C_STRING
            }
        };

-        let err = if err.is_empty() { None } else { Some(err) };
-        self.push(syntax_kind, len, err);
+        self.push(syntax_kind, len, errors);
    }
 }

-fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
+fn err_to_msg(error: EscapeError, mode: Mode) -> String {
    match error {
        EscapeError::ZeroChars => "empty character literal",
        EscapeError::MoreThanOneChar => "character literal may only contain one codepoint",
@ -410,28 +422,5 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str {
        EscapeError::UnskippedWhitespaceWarning => "",
        EscapeError::MultipleSkippedLinesWarning => "",
    }
-}
-
-fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str {
-    let mut error_message = "";
-    match mode {
-        Mode::CStr => {
-            unescape_mixed(text, mode, &mut |_, res| {
-                if let Err(e) = res {
-                    error_message = error_to_diagnostic_message(e, mode);
-                }
-            });
-        }
-        Mode::ByteStr | Mode::Str => {
-            unescape_unicode(text, mode, &mut |_, res| {
-                if let Err(e) = res {
-                    error_message = error_to_diagnostic_message(e, mode);
-                }
-            });
-        }
-        _ => {
-            // Other Modes are not supported yet or do not apply
-        }
-    }
-    error_message
+    .into()
 }