Remove unnecessary string cloning from the parser (#9884)

Closes https://github.com/astral-sh/ruff/issues/9869.
2025-08-04 02:38:25 +00:00 · 2024-02-09 16:03:27 -05:00 · 2024-02-09 16:03:27 -05:00 · 6f0e4ad332
commit 6f0e4ad332
parent 7ca515c0aa
11 changed files with 227 additions and 119 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"

 [[package]]
 name = "bstr"
-version = "1.6.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
+checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
 dependencies = [
 "memchr",
- "regex-automata 0.3.9",
+ "regex-automata 0.4.3",
 "serde",
 ]

@ -1921,12 +1921,6 @@ dependencies = [
 "regex-syntax 0.6.29",
 ]

-[[package]]
-name = "regex-automata"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
-
 [[package]]
 name = "regex-automata"
 version = "0.4.3"
@ -2342,6 +2336,7 @@ version = "0.0.0"
 dependencies = [
 "anyhow",
 "bitflags 2.4.1",
+ "bstr",
 "insta",
 "is-macro",
 "itertools 0.12.1",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
 assert_cmd = { version = "2.0.13" }
 bincode = { version = "1.3.3" }
 bitflags = { version = "2.4.1" }
+bstr = { version = "1.9.0" }
 cachedir = { version = "0.3.1" }
 chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
 clap = { version = "4.4.18", features = ["derive"] }
--- a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs
+++ b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs
@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
 pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
    let is_bind_all_interface = match string {
        StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
-        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
+        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
+            &**value == "0.0.0.0"
+        }
        StringLike::BytesLiteral(_) => return,
    };

--- a/crates/ruff_linter/src/rules/flynt/helpers.rs
+++ b/crates/ruff_linter/src/rules/flynt/helpers.rs
@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
 /// Convert a string to a [`ast::FStringElement::Literal`].
 pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
    ast::FStringElement::Literal(ast::FStringLiteralElement {
-        value: s.to_owned(),
+        value: s.to_string().into_boxed_str(),
        range: TextRange::default(),
    })
 }
@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
    match expr {
        Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
            Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
-                value: value.to_string(),
+                value: value.to_string().into_boxed_str(),
                range: *range,
            }))
        }
--- a/crates/ruff_python_ast/src/comparable.rs
+++ b/crates/ruff_python_ast/src/comparable.rs
@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
 impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
    fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
        Self {
-            value: bytes_literal.value.as_slice(),
+            value: &bytes_literal.value,
        }
    }
 }
--- a/crates/ruff_python_ast/src/nodes.rs
+++ b/crates/ruff_python_ast/src/nodes.rs
@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
 #[derive(Clone, Debug, PartialEq)]
 pub struct FStringLiteralElement {
    pub range: TextRange,
-    pub value: String,
+    pub value: Box<str>,
 }

 impl Ranged for FStringLiteralElement {
@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
    type Target = str;

    fn deref(&self) -> &Self::Target {
-        self.value.as_str()
+        &self.value
    }
 }

@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
 #[derive(Clone, Debug, Default, PartialEq)]
 pub struct BytesLiteral {
    pub range: TextRange,
-    pub value: Vec<u8>,
+    pub value: Box<[u8]>,
 }

 impl Ranged for BytesLiteral {
@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
-        self.value.as_slice()
+        &self.value
    }
 }

--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }

 anyhow = { workspace = true }
 bitflags = { workspace = true }
+bstr = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { workspace = true, default-features = false }
 memchr = { workspace = true }
-unicode-ident = { workspace = true }
-unicode_names2 = { workspace = true }
 rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
+unicode-ident = { workspace = true }
+unicode_names2 = { workspace = true }

 [dev-dependencies]
 insta = { workspace = true }
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};

 use crate::lexer::LexResult;

-mod function;
-// Skip flattening lexer to distinguish from full ruff_python_parser
 mod context;
+mod function;
 mod invalid;
+// Skip flattening lexer to distinguish from full ruff_python_parser
 pub mod lexer;
 mod parser;
 mod soft_keywords;
--- a/crates/ruff_python_parser/src/python.lalrpop
+++ b/crates/ruff_python_parser/src/python.lalrpop
@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
 StringLiteral: StringType = {
    <location:@L> <string:string> <end_location:@R> =>? {
        let (source, kind, triple_quoted) = string;
-        Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
+        Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
    }
 };

@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
    FStringReplacementField,
    <location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
        let (source, is_raw, _) = fstring_middle;
-        Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
+        Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
    }
 };

--- a/crates/ruff_python_parser/src/python.rs
+++ b/crates/ruff_python_parser/src/python.rs
@ -1,5 +1,5 @@
 // auto-generated: "lalrpop 0.20.0"
-// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
+// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
 use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
 use crate::{
@ -36369,7 +36369,7 @@ fn __action217<
 {
    {
        let (source, kind, triple_quoted) = string;
-        Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
+        Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
    }
 }

@ -36419,7 +36419,7 @@ fn __action220<
 {
    {
        let (source, is_raw, _) = fstring_middle;
-        Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
+        Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
    }
 }

--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@ -1,7 +1,9 @@
 //! Parsing of string literals, bytes literals, and implicit string concatenation.

+use bstr::ByteSlice;
+
 use ruff_python_ast::{self as ast, Expr};
-use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
+use ruff_text_size::{Ranged, TextRange, TextSize};

 use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::token::{StringKind, Tok};
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
    }
 }

-struct StringParser<'a> {
-    rest: &'a str,
+enum EscapedChar {
+    Literal(char),
+    Escape(char),
+}
+
+struct StringParser {
+    source: Box<str>,
+    cursor: usize,
    kind: StringKind,
-    location: TextSize,
+    offset: TextSize,
    range: TextRange,
 }

-impl<'a> StringParser<'a> {
-    fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
+impl StringParser {
+    fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
        Self {
-            rest: source,
+            source,
+            cursor: 0,
            kind,
-            location: start,
+            offset,
            range,
        }
    }

    #[inline]
-    fn skip_bytes(&mut self, bytes: usize) -> &'a str {
-        let skipped_str = &self.rest[..bytes];
-        self.rest = &self.rest[bytes..];
-        self.location += skipped_str.text_len();
+    fn skip_bytes(&mut self, bytes: usize) -> &str {
+        let skipped_str = &self.source[self.cursor..self.cursor + bytes];
+        self.cursor += bytes;
        skipped_str
    }

    #[inline]
    fn get_pos(&self) -> TextSize {
-        self.location
+        self.offset + TextSize::try_from(self.cursor).unwrap()
    }

    /// Returns the next byte in the string, if there is one.
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
    /// When the next byte is a part of a multi-byte character.
    #[inline]
    fn next_byte(&mut self) -> Option<u8> {
-        self.rest.as_bytes().first().map(|&byte| {
-            self.rest = &self.rest[1..];
-            self.location += TextSize::new(1);
+        self.source[self.cursor..].as_bytes().first().map(|&byte| {
+            self.cursor += 1;
            byte
        })
    }

    #[inline]
    fn next_char(&mut self) -> Option<char> {
-        self.rest.chars().next().map(|c| {
-            self.rest = &self.rest[c.len_utf8()..];
-            self.location += c.text_len();
+        self.source[self.cursor..].chars().next().map(|c| {
+            self.cursor += c.len_utf8();
            c
        })
    }

    #[inline]
    fn peek_byte(&self) -> Option<u8> {
-        self.rest.as_bytes().first().copied()
+        self.source[self.cursor..].as_bytes().first().copied()
    }

    fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
        };

        let start_pos = self.get_pos();
-        let Some(close_idx) = self.rest.find('}') else {
+        let Some(close_idx) = self.source[self.cursor..].find('}') else {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
                self.get_pos(),
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
            .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
    }

-    fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
+    /// Parse an escaped character, returning the new character.
+    fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
        let Some(first_char) = self.next_char() else {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
            'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
            'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
            // Special cases where the escape sequence is not a single character
-            '\n' => return Ok(()),
+            '\n' => return Ok(None),
            '\r' => {
                if self.peek_byte() == Some(b'\n') {
                    self.next_byte();
                }

-                return Ok(());
+                return Ok(None);
            }
            _ => {
                if self.kind.is_any_bytes() && !first_char.is_ascii() {
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
                    ));
                }

-                string.push('\\');
-
-                first_char
+                return Ok(Some(EscapedChar::Escape(first_char)));
            }
        };

-        string.push(new_char);
-
-        Ok(())
+        Ok(Some(EscapedChar::Literal(new_char)))
    }

-    fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
-        let mut value = String::with_capacity(self.rest.len());
-        while let Some(ch) = self.next_char() {
-            match ch {
+    fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
+        // Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
+        let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
+            return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
+                value: self.source,
+                range: self.range,
+            }));
+        };
+
+        let mut value = String::with_capacity(self.source.len());
+        loop {
+            // Add the characters before the escape sequence (or curly brace) to the string.
+            let before_with_slash_or_brace = self.skip_bytes(index + 1);
+            let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
+            value.push_str(before);
+
+            // Add the escaped character to the string.
+            match &self.source.as_bytes()[self.cursor - 1] {
+                // If there are any curly braces inside a `FStringMiddle` token,
+                // then they were escaped (i.e. `{{` or `}}`). This means that
+                // we need increase the location by 2 instead of 1.
+                b'{' => {
+                    self.offset += TextSize::from(1);
+                    value.push('{');
+                }
+                b'}' => {
+                    self.offset += TextSize::from(1);
+                    value.push('}');
+                }
                // We can encounter a `\` as the last character in a `FStringMiddle`
                // token which is valid in this context. For example,
                //
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
                // This is still an invalid escape sequence, but we don't want to
                // raise a syntax error as is done by the CPython parser. It might
                // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
-                '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
-                    self.parse_escaped_char(&mut value)?;
-                }
-                // If there are any curly braces inside a `FStringMiddle` token,
-                // then they were escaped (i.e. `{{` or `}}`). This means that
-                // we need increase the location by 2 instead of 1.
-                ch @ ('{' | '}') => {
-                    self.location += ch.text_len();
-                    value.push(ch);
-                }
-                ch => value.push(ch),
+                b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
+                    match self.parse_escaped_char()? {
+                        None => {}
+                        Some(EscapedChar::Literal(c)) => value.push(c),
+                        Some(EscapedChar::Escape(c)) => {
+                            value.push('\\');
+                            value.push(c);
                        }
                    }
+                }
+                ch => {
+                    value.push(char::from(*ch));
+                }
+            }
+
+            let Some(next_index) =
+                memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
+            else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.push_str(rest);
+                break;
+            };
+
+            index = next_index;
+        }
+
        Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
-            value,
+            value: value.into_boxed_str(),
            range: self.range,
        }))
    }

-    fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
-        let mut content = String::with_capacity(self.rest.len());
-        while let Some(ch) = self.next_char() {
-            match ch {
-                '\\' if !self.kind.is_raw() => {
-                    self.parse_escaped_char(&mut content)?;
-                }
-                ch => {
-                    if !ch.is_ascii() {
+    fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
+        if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
            return Err(LexicalError::new(
                LexicalErrorType::OtherError(
                    "bytes can only contain ASCII literal characters"
                        .to_string()
                        .into_boxed_str(),
                ),
-                            self.get_pos(),
+                self.offset + TextSize::try_from(index).unwrap(),
            ));
        }
-                    content.push(ch);
+
+        if self.kind.is_raw() {
+            // For raw strings, no escaping is necessary.
+            return Ok(StringType::Bytes(ast::BytesLiteral {
+                value: self.source.into_boxed_bytes(),
+                range: self.range,
+            }));
+        }
+
+        let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
+            // If the string doesn't contain any escape sequences, return the owned string.
+            return Ok(StringType::Bytes(ast::BytesLiteral {
+                value: self.source.into_boxed_bytes(),
+                range: self.range,
+            }));
+        };
+
+        // If the string contains escape sequences, we need to parse them.
+        let mut value = Vec::with_capacity(self.source.len());
+        loop {
+            // Add the characters before the escape sequence to the string.
+            let before_with_slash = self.skip_bytes(escape + 1);
+            let before = &before_with_slash[..before_with_slash.len() - 1];
+            value.extend_from_slice(before.as_bytes());
+
+            // Add the escaped character to the string.
+            match self.parse_escaped_char()? {
+                None => {}
+                Some(EscapedChar::Literal(c)) => value.push(c as u8),
+                Some(EscapedChar::Escape(c)) => {
+                    value.push(b'\\');
+                    value.push(c as u8);
                }
            }
+
+            let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
+            else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.extend_from_slice(rest.as_bytes());
+                break;
+            };
+
+            // Update the position of the next escape sequence.
+            escape = next_escape;
        }
+
        Ok(StringType::Bytes(ast::BytesLiteral {
-            value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
+            value: value.into_boxed_slice(),
            range: self.range,
        }))
    }

-    fn parse_string(&mut self) -> Result<StringType, LexicalError> {
-        let mut value = String::with_capacity(self.rest.len());
+    fn parse_string(mut self) -> Result<StringType, LexicalError> {
        if self.kind.is_raw() {
-            value.push_str(self.skip_bytes(self.rest.len()));
-        } else {
+            // For raw strings, no escaping is necessary.
+            return Ok(StringType::Str(ast::StringLiteral {
+                value: self.source,
+                unicode: self.kind.is_unicode(),
+                range: self.range,
+            }));
+        }
+
+        let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
+            // If the string doesn't contain any escape sequences, return the owned string.
+            return Ok(StringType::Str(ast::StringLiteral {
+                value: self.source,
+                unicode: self.kind.is_unicode(),
+                range: self.range,
+            }));
+        };
+
+        // If the string contains escape sequences, we need to parse them.
+        let mut value = String::with_capacity(self.source.len());
+
        loop {
-                let Some(escape_idx) = self.rest.find('\\') else {
-                    value.push_str(self.skip_bytes(self.rest.len()));
+            // Add the characters before the escape sequence to the string.
+            let before_with_slash = self.skip_bytes(escape + 1);
+            let before = &before_with_slash[..before_with_slash.len() - 1];
+            value.push_str(before);
+
+            // Add the escaped character to the string.
+            match self.parse_escaped_char()? {
+                None => {}
+                Some(EscapedChar::Literal(c)) => value.push(c),
+                Some(EscapedChar::Escape(c)) => {
+                    value.push('\\');
+                    value.push(c);
+                }
+            }
+
+            let Some(next_escape) = self.source[self.cursor..].find('\\') else {
+                // Add the rest of the string to the value.
+                let rest = &self.source[self.cursor..];
+                value.push_str(rest);
                break;
            };

-                let before_with_slash = self.skip_bytes(escape_idx + 1);
-                let before = &before_with_slash[..before_with_slash.len() - 1];
+            // Update the position of the next escape sequence.
+            escape = next_escape;
+        }

-                value.push_str(before);
-                self.parse_escaped_char(&mut value)?;
-            }
-        }
        Ok(StringType::Str(ast::StringLiteral {
            value: value.into_boxed_str(),
            unicode: self.kind.is_unicode(),
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
        }))
    }

-    fn parse(&mut self) -> Result<StringType, LexicalError> {
+    fn parse(self) -> Result<StringType, LexicalError> {
        if self.kind.is_any_bytes() {
            self.parse_bytes()
        } else {
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
 }

 pub(crate) fn parse_string_literal(
-    source: &str,
+    source: Box<str>,
    kind: StringKind,
    triple_quoted: bool,
    range: TextRange,
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
 }

 pub(crate) fn parse_fstring_literal_element(
-    source: &str,
+    source: Box<str>,
    is_raw: bool,
    range: TextRange,
 ) -> Result<ast::FStringElement, LexicalError> {
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
    if has_bytes && byte_literal_count < strings.len() {
        return Err(LexicalError::new(
            LexicalErrorType::OtherError(
-                "cannot mix bytes and nonbytes literals"
+                "cannot mix bytes and non-bytes literals"
                    .to_string()
                    .into_boxed_str(),
            ),