perf(parser): use faster string parser methods (#8227)

## Summary

This makes use of `memchr` and other methods to parse strings (hopefully) faster. It might also be worth converting the `parse_fstring_middle` helper to use similar techniques, but I did not implement that in this PR.

## Test Plan

This was tested using the existing tests, all of which passed.
2025-09-30 13:51:37 +00:00 · 2023-10-28 17:50:54 -05:00 · 2023-10-28 17:50:54 -05:00 · 2f5734d1ac
commit 2f5734d1ac
parent c39ea6ef05
1 changed files with 132 additions and 99 deletions
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@ -6,9 +6,6 @@ use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::token::{StringKind, Tok};
 // unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
 const MAX_UNICODE_NAME: usize = 88;
 pub(crate) struct StringConstantWithRange {
    value: StringConstant,
    range: TextRange,
@ -57,7 +54,7 @@ impl StringType {
 }
 struct StringParser<'a> {
-    chars: std::str::Chars<'a>,
+    rest: &'a str,
    kind: StringKind,
    location: TextSize,
 }
@ -65,22 +62,18 @@ struct StringParser<'a> {
 impl<'a> StringParser<'a> {
    fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self {
        Self {
-            chars: source.chars(),
+            rest: source,
            kind,
            location: start,
        }
    }
    #[inline]
-    fn next_char(&mut self) -> Option<char> {
+    fn skip_bytes(&mut self, bytes: usize) -> &'a str {
-        let c = self.chars.next()?;
+        let skipped_str = &self.rest[..bytes];
-        self.location += c.text_len();
+        self.rest = &self.rest[bytes..];
-        Some(c)
+        self.location += skipped_str.text_len();
-    }
+        skipped_str
    #[inline]
    fn peek(&mut self) -> Option<char> {
        self.chars.clone().next()
    }
    #[inline]
@ -93,6 +86,34 @@ impl<'a> StringParser<'a> {
        TextRange::new(start_location, self.location)
    }
    /// Consumes the next byte of the remaining input and returns it, or
    /// returns `None` when no input is left. On success `location` is
    /// advanced by one.
    ///
    /// # Panics
    ///
    /// When the next byte is a part of a multi-byte character, because
    /// re-slicing `rest` at offset 1 would then split that character.
    #[inline]
    fn next_byte(&mut self) -> Option<u8> {
        let byte = *self.rest.as_bytes().first()?;
        self.rest = &self.rest[1..];
        self.location += TextSize::new(1);
        Some(byte)
    }
    /// Consumes the next character of the remaining input and returns it, or
    /// returns `None` when no input is left. On success `location` is
    /// advanced by the text length of the consumed character.
    #[inline]
    fn next_char(&mut self) -> Option<char> {
        let remaining = self.rest;
        let mut chars = remaining.chars();
        let c = chars.next()?;
        // Whatever the iterator has not yielded yet is exactly the input
        // that follows `c`.
        self.rest = chars.as_str();
        self.location += c.text_len();
        Some(c)
    }
    /// Returns the next byte of the remaining input without consuming it,
    /// or `None` when no input is left.
    #[inline]
    fn peek_byte(&self) -> Option<u8> {
        self.rest.bytes().next()
    }
    fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
        let mut p: u32 = 0u32;
        let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
@ -110,57 +131,58 @@ impl<'a> StringParser<'a> {
            _ => std::char::from_u32(p).ok_or(unicode_error),
        }
    }
    fn parse_octet(&mut self, o: u8) -> char {
        let mut radix_bytes = [o, 0, 0];
        let mut len = 1;
-    fn parse_octet(&mut self, first: char) -> char {
+        while len < 3 {
-        let mut octet_content = String::new();
+            let Some(b'0'..=b'8') = self.peek_byte() else {
        octet_content.push(first);
        while octet_content.len() < 3 {
            if let Some('0'..='7') = self.peek() {
                octet_content.push(self.next_char().unwrap());
            } else {
                break;
            };
            radix_bytes[len] = self.next_byte().unwrap();
            len += 1;
        }
-        }
+
-        let value = u32::from_str_radix(&octet_content, 8).unwrap();
+        // SAFETY: radix_bytes is always going to be in the ASCII range.
        #[allow(unsafe_code)]
        let radix_str = unsafe { std::str::from_utf8_unchecked(&radix_bytes[..len]) };
        let value = u32::from_str_radix(radix_str, 8).unwrap();
        char::from_u32(value).unwrap()
    }
    fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
        let start_pos = self.get_pos();
-        match self.next_char() {
+
-            Some('{') => {}
+        let Some('{') = self.next_char() else {
-            _ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)),
+            return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
-        }
+        };
        let start_pos = self.get_pos();
-        let mut name = String::new();
+        let Some(close_idx) = self.rest.find('}') else {
        loop {
            match self.next_char() {
                Some('}') => break,
                Some(c) => name.push(c),
                None => {
            return Err(LexicalError::new(
                LexicalErrorType::StringError,
                self.get_pos(),
                    ))
                }
            }
        }
        if name.len() > MAX_UNICODE_NAME {
            return Err(LexicalError::new(
                LexicalErrorType::UnicodeError,
                self.get_pos(),
            ));
-        }
+        };
-        unicode_names2::character(&name)
+        let name_and_ending = self.skip_bytes(close_idx + 1);
        let name = &name_and_ending[..name_and_ending.len() - 1];
        unicode_names2::character(name)
            .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
    }
-    fn parse_escaped_char(&mut self) -> Result<String, LexicalError> {
+    fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
-        match self.next_char() {
+        let Some(first_char) = self.next_char() else {
-            Some(c) => {
+            return Err(LexicalError {
-                let char = match c {
+                error: LexicalErrorType::StringError,
                location: self.get_pos(),
            });
        };
        let new_char = match first_char {
            '\\' => '\\',
            '\'' => '\'',
            '\"' => '"',
@ -171,21 +193,22 @@ impl<'a> StringParser<'a> {
            'r' => '\r',
            't' => '\t',
            'v' => '\x0b',
-                    o @ '0'..='7' => self.parse_octet(o),
+            o @ '0'..='7' => self.parse_octet(o as u8),
            'x' => self.parse_unicode_literal(2)?,
            'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
            'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
            'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
            // Special cases where the escape sequence is not a single character
-                    '\n' => return Ok(String::new()),
+            '\n' => return Ok(()),
            '\r' => {
-                        if self.peek() == Some('\n') {
+                if self.peek_byte() == Some(b'\n') {
-                            self.next_char();
+                    self.next_byte();
                }
-                        return Ok(String::new());
+
                return Ok(());
            }
-                    c => {
+            _ => {
-                        if self.kind.is_any_bytes() && !c.is_ascii() {
+                if self.kind.is_any_bytes() && !first_char.is_ascii() {
                    return Err(LexicalError {
                        error: LexicalErrorType::OtherError(
                            "bytes can only contain ASCII literal characters".to_owned(),
@ -193,16 +216,16 @@ impl<'a> StringParser<'a> {
                        location: self.get_pos(),
                    });
                }
-                        return Ok(format!("\\{c}"));
+
                string.push('\\');
                first_char
            }
        };
-                Ok(char.to_string())
+
-            }
+        string.push(new_char);
-            None => Err(LexicalError {
+
-                error: LexicalErrorType::StringError,
+        Ok(())
                location: self.get_pos(),
            }),
        }
    }
    fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> {
@ -230,8 +253,8 @@ impl<'a> StringParser<'a> {
                // This is still an invalid escape sequence, but we don't want to
                // raise a syntax error as is done by the CPython parser. It might
                // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
-                '\\' if !self.kind.is_raw() && self.peek().is_some() => {
+                '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
-                    value.push_str(&self.parse_escaped_char()?);
+                    self.parse_escaped_char(&mut value)?;
                }
                // If there are any curly braces inside a `FStringMiddle` token,
                // then they were escaped (i.e. `{{` or `}}`). This means that
@ -255,7 +278,7 @@ impl<'a> StringParser<'a> {
        while let Some(ch) = self.next_char() {
            match ch {
                '\\' if !self.kind.is_raw() => {
-                    content.push_str(&self.parse_escaped_char()?);
+                    self.parse_escaped_char(&mut content)?;
                }
                ch => {
                    if !ch.is_ascii() {
@ -278,16 +301,26 @@ impl<'a> StringParser<'a> {
    }
    fn parse_string(&mut self) -> Result<StringType, LexicalError> {
        let mut value = String::new();
        let start_location = self.get_pos();
-        while let Some(ch) = self.next_char() {
+        let mut value = String::new();
-            match ch {
+
-                '\\' if !self.kind.is_raw() => {
+        if self.kind.is_raw() {
-                    value.push_str(&self.parse_escaped_char()?);
+            value.push_str(self.skip_bytes(self.rest.len()));
-                }
+        } else {
-                ch => value.push(ch),
+            loop {
                let Some(escape_idx) = self.rest.find('\\') else {
                    value.push_str(self.skip_bytes(self.rest.len()));
                    break;
                };
                let before_with_slash = self.skip_bytes(escape_idx + 1);
                let before = &before_with_slash[..before_with_slash.len() - 1];
                value.push_str(before);
                self.parse_escaped_char(&mut value)?;
            }
        }
        Ok(StringType::Str(StringConstantWithRange {
            value: StringConstant {
                value,