Remove unnecessary string cloning from the parser (#9884)

Closes https://github.com/astral-sh/ruff/issues/9869.
2025-07-12 07:35:07 +00:00 · 2024-02-09 16:03:27 -05:00 · 2024-02-09 16:03:27 -05:00 · 6f0e4ad332
commit 6f0e4ad332
parent 7ca515c0aa
11 changed files with 227 additions and 119 deletions
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }

 anyhow = { workspace = true }
 bitflags = { workspace = true }
+bstr = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { workspace = true, default-features = false }
 memchr = { workspace = true }
-unicode-ident = { workspace = true }
-unicode_names2 = { workspace = true }
 rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
+unicode-ident = { workspace = true }
+unicode_names2 = { workspace = true }

 [dev-dependencies]
 insta = { workspace = true }
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};

 use crate::lexer::LexResult;

-mod function;
-// Skip flattening lexer to distinguish from full ruff_python_parser
 mod context;
+mod function;
 mod invalid;
+// Skip flattening lexer to distinguish from full ruff_python_parser
 pub mod lexer;
 mod parser;
 mod soft_keywords;
--- a/crates/ruff_python_parser/src/python.lalrpop
+++ b/crates/ruff_python_parser/src/python.lalrpop
@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
 StringLiteral: StringType = {
    <location:@L> <string:string> <end_location:@R> =>? {
        let (source, kind, triple_quoted) = string;
-        Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
+        Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
    }
 };

@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
    FStringReplacementField,
    <location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
        let (source, is_raw, _) = fstring_middle;
-        Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
+        Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
    }
 };

--- a/crates/ruff_python_parser/src/python.rs
+++ b/crates/ruff_python_parser/src/python.rs
@ -1,5 +1,5 @@
 // auto-generated: "lalrpop 0.20.0"
-// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
+// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
@ -36369,7 +36369,7 @@ fn __action217<
 {
    {
        let (source, kind, triple_quoted) = string;
-        Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
+        Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
    }
 }

@ -36419,7 +36419,7 @@ fn __action220<
 {
    {
        let (source, is_raw, _) = fstring_middle;
-        Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
+        Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
    }
 }

--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@ -1,7 +1,9 @@
 //! Parsing of string literals, bytes literals, and implicit string concatenation.

+use bstr::ByteSlice;
+
 use ruff_python_ast::{self as ast, Expr};
-use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
+use ruff_text_size::{Ranged, TextRange, TextSize};

 use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::token::{StringKind, Tok};
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
    }
 }

-struct StringParser<'a> {
-    rest: &'a str,
+enum EscapedChar {
+    Literal(char),
+    Escape(char),
+}
+
+struct StringParser {
+    source: Box<str>,
+    cursor: usize,
    kind: StringKind,
-    location: TextSize,
+    offset: TextSize,
    range: TextRange,
 }

-impl<'a> StringParser<'a> {
-    fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
+impl StringParser {
+    fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
        Self {
-            rest: source,
+            source,
+            cursor: 0,
            kind,
-            location: start,
+            offset,
            range,
        }
    }

    #[inline]
-    fn skip_bytes(&mut self, bytes: usize) -> &'a str {
-        let skipped_str = &self.rest[..bytes];
-        self.rest = &self.rest[bytes..];
-        self.location += skipped_str.text_len();
+    fn skip_bytes(&mut self, bytes: usize) -> &str {
+        let skipped_str = &self.source[self.cursor..self.cursor + bytes];
+        self.cursor += bytes;
        skipped_str
    }

    #[inline]
    fn get_pos(&self) -> TextSize {
-        self.location
+        self.offset + TextSize::try_from(self.cursor).unwrap()
    }

    /// Returns the next byte in the string, if there is one.
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
    /// When the next byte is a part of a multi-byte character.
    #[inline]
    fn next_byte(&mut self) -> Option<u8> {
-        self.rest.as_bytes().first().map(|&byte| {
-            self.rest = &self.rest[1..];
-            self.location += TextSize::new(1);
+        self.source[self.cursor..].as_bytes().first().map(|&byte| {
+            self.cursor += 1;
            byte
        })
    }

    #[inline]
    fn next_char(&mut self) -> Option<char> {
-        self.rest.chars().next().map(|c| {
-            self.rest = &self.rest[c.len_utf8()..];
-            self.location += c.text_len();
+        self.source[self.cursor..].chars().next().map(|c| {
+            self.cursor += c.len_utf8();
            c
        })
    }

    #[inline]
    fn peek_byte(&self) -> Option<u8> {
-        self.rest.as_bytes().first().copied()
+        self.source[self.cursor..].as_bytes().first().copied()
    }

    fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
        };

        let start_pos = self.get_pos();
-        let Some(close_idx) = self.rest.find('}') else {
+        let Some(close_idx) = self.source[self.cursor..].find('}') else {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
                self.get_pos(),
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
            .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
    }

-    fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
+    /// Parse an escaped character, returning `None` if the escape is a line continuation.
+    fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
        let Some(first_char) = self.next_char() else {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
            'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
            'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
            // Special cases where the escape sequence is not a single character
-            '\n' => return Ok(()),
+            '\n' => return Ok(None),
            '\r' => {
                if self.peek_byte() == Some(b'\n') {
                    self.next_byte();
                }

-                return Ok(());
+                return Ok(None);
            }
            _ => {
                if self.kind.is_any_bytes() && !first_char.is_ascii() {
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
                    ));
                }

-                string.push('\\');
-
-                first_char
+                return Ok(Some(EscapedChar::Escape(first_char)));
            }
        };

-        string.push(new_char);
-
-        Ok(())
+        Ok(Some(EscapedChar::Literal(new_char)))
    }

-    fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
-        let mut value = String::with_capacity(self.rest.len());
-        while let Some(ch) = self.next_char() {
-            match ch {
+    fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
+        // Fast-path: if there are no escape sequences or curly braces, return the literal.
+        let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
+            return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
+                value: self.source,
+                range: self.range,
+            }));
+        };
+
+        let mut value = String::with_capacity(self.source.len());
+        loop {
+            // Add the characters before the escape sequence (or curly brace) to the string.
+            let before_with_slash_or_brace = self.skip_bytes(index + 1);
+            let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
+            value.push_str(before);
+
+            // Add the escaped character to the string.
+            match &self.source.as_bytes()[self.cursor - 1] {
+                // If there are any curly braces inside a `FStringMiddle` token,
+                // then they were escaped (i.e. `{{` or `}}`). This means that
+                // we need to increase the location by 2 instead of 1.
+                b'{' => {
+                    self.offset += TextSize::from(1);
+                    value.push('{');
+                }
+                b'}' => {
+                    self.offset += TextSize::from(1);
+                    value.push('}');
+                }
                // We can encounter a `\` as the last character in a `FStringMiddle`
                // token which is valid in this context. For example,
                //
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
                // This is still an invalid escape sequence, but we don't want to
                // raise a syntax error as is done by the CPython parser. It might
                // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
-                '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
-                    self.parse_escaped_char(&mut value)?;
-                }
-                // If there are any curly braces inside a `FStringMiddle` token,
-                // then they were escaped (i.e. `{{` or `}}`). This means that
-                // we need increase the location by 2 instead of 1.
-                ch @ ('{' | '}') => {
-                    self.location += ch.text_len();
-                    value.push(ch);
-                }
-                ch => value.push(ch),
-            }
-        }
-        Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
-            value,
-            range: self.range,
-        }))
-    }
-
-    fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
-        let mut content = String::with_capacity(self.rest.len());
-        while let Some(ch) = self.next_char() {
-            match ch {
-                '\\' if !self.kind.is_raw() => {
-                    self.parse_escaped_char(&mut content)?;
+                b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
+                    match self.parse_escaped_char()? {
+                        None => {}
+                        Some(EscapedChar::Literal(c)) => value.push(c),
+                        Some(EscapedChar::Escape(c)) => {
+                            value.push('\\');
+                            value.push(c);
+                        }
+                    }
                }
                ch => {
-                    if !ch.is_ascii() {
-                        return Err(LexicalError::new(
-                            LexicalErrorType::OtherError(
-                                "bytes can only contain ASCII literal characters"
-                                    .to_string()
-                                    .into_boxed_str(),
-                            ),
-                            self.get_pos(),
-                        ));
-                    }
-                    content.push(ch);
+                    value.push(char::from(*ch));
                }
            }
+
+            let Some(next_index) =
+                memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
+            else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.push_str(rest);
+                break;
+            };
+
+            index = next_index;
        }
-        Ok(StringType::Bytes(ast::BytesLiteral {
-            value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
+
+        Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
+            value: value.into_boxed_str(),
            range: self.range,
        }))
    }

-    fn parse_string(&mut self) -> Result<StringType, LexicalError> {
-        let mut value = String::with_capacity(self.rest.len());
-        if self.kind.is_raw() {
-            value.push_str(self.skip_bytes(self.rest.len()));
-        } else {
-            loop {
-                let Some(escape_idx) = self.rest.find('\\') else {
-                    value.push_str(self.skip_bytes(self.rest.len()));
-                    break;
-                };
-
-                let before_with_slash = self.skip_bytes(escape_idx + 1);
-                let before = &before_with_slash[..before_with_slash.len() - 1];
-
-                value.push_str(before);
-                self.parse_escaped_char(&mut value)?;
-            }
+    fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
+        if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
+            return Err(LexicalError::new(
+                LexicalErrorType::OtherError(
+                    "bytes can only contain ASCII literal characters"
+                        .to_string()
+                        .into_boxed_str(),
+                ),
+                self.offset + TextSize::try_from(index).unwrap(),
+            ));
        }
+
+        if self.kind.is_raw() {
+            // For raw strings, no escaping is necessary.
+            return Ok(StringType::Bytes(ast::BytesLiteral {
+                value: self.source.into_boxed_bytes(),
+                range: self.range,
+            }));
+        }
+
+        let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
+            // If the string doesn't contain any escape sequences, return the owned string.
+            return Ok(StringType::Bytes(ast::BytesLiteral {
+                value: self.source.into_boxed_bytes(),
+                range: self.range,
+            }));
+        };
+
+        // If the string contains escape sequences, we need to parse them.
+        let mut value = Vec::with_capacity(self.source.len());
+        loop {
+            // Add the characters before the escape sequence to the string.
+            let before_with_slash = self.skip_bytes(escape + 1);
+            let before = &before_with_slash[..before_with_slash.len() - 1];
+            value.extend_from_slice(before.as_bytes());
+
+            // Add the escaped character to the string.
+            match self.parse_escaped_char()? {
+                None => {}
+                Some(EscapedChar::Literal(c)) => value.push(c as u8),
+                Some(EscapedChar::Escape(c)) => {
+                    value.push(b'\\');
+                    value.push(c as u8);
+                }
+            }
+
+            let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
+            else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.extend_from_slice(rest.as_bytes());
+                break;
+            };
+
+            // Update the position of the next escape sequence.
+            escape = next_escape;
+        }
+
+        Ok(StringType::Bytes(ast::BytesLiteral {
+            value: value.into_boxed_slice(),
+            range: self.range,
+        }))
+    }
+
+    fn parse_string(mut self) -> Result<StringType, LexicalError> {
+        if self.kind.is_raw() {
+            // For raw strings, no escaping is necessary.
+            return Ok(StringType::Str(ast::StringLiteral {
+                value: self.source,
+                unicode: self.kind.is_unicode(),
+                range: self.range,
+            }));
+        }
+
+        let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
+            // If the string doesn't contain any escape sequences, return the owned string.
+            return Ok(StringType::Str(ast::StringLiteral {
+                value: self.source,
+                unicode: self.kind.is_unicode(),
+                range: self.range,
+            }));
+        };
+
+        // If the string contains escape sequences, we need to parse them.
+        let mut value = String::with_capacity(self.source.len());
+
+        loop {
+            // Add the characters before the escape sequence to the string.
+            let before_with_slash = self.skip_bytes(escape + 1);
+            let before = &before_with_slash[..before_with_slash.len() - 1];
+            value.push_str(before);
+
+            // Add the escaped character to the string.
+            match self.parse_escaped_char()? {
+                None => {}
+                Some(EscapedChar::Literal(c)) => value.push(c),
+                Some(EscapedChar::Escape(c)) => {
+                    value.push('\\');
+                    value.push(c);
+                }
+            }
+
+            let Some(next_escape) = self.source[self.cursor..].find('\\') else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.push_str(rest);
+                break;
+            };
+
+            // Update the position of the next escape sequence.
+            escape = next_escape;
+        }
+
        Ok(StringType::Str(ast::StringLiteral {
            value: value.into_boxed_str(),
            unicode: self.kind.is_unicode(),
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
        }))
    }

-    fn parse(&mut self) -> Result<StringType, LexicalError> {
+    fn parse(self) -> Result<StringType, LexicalError> {
        if self.kind.is_any_bytes() {
            self.parse_bytes()
        } else {
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
 }

 pub(crate) fn parse_string_literal(
-    source: &str,
+    source: Box<str>,
    kind: StringKind,
    triple_quoted: bool,
    range: TextRange,
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
 }

 pub(crate) fn parse_fstring_literal_element(
-    source: &str,
+    source: Box<str>,
    is_raw: bool,
    range: TextRange,
 ) -> Result<ast::FStringElement, LexicalError> {
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
    if has_bytes && byte_literal_count < strings.len() {
        return Err(LexicalError::new(
            LexicalErrorType::OtherError(
-                "cannot mix bytes and nonbytes literals"
+                "cannot mix bytes and non-bytes literals"
                    .to_string()
                    .into_boxed_str(),
            ),