Perf: Skip string normalization when possible (#10116)

2025-07-24 05:26:23 +00:00 · 2024-02-26 18:35:29 +01:00 · 2024-02-26 18:35:29 +01:00 · 8dc22d5793
commit 8dc22d5793
parent 15b87ea8be
3 changed files with 170 additions and 88 deletions
--- a/crates/ruff_python_formatter/src/other/f_string.rs
+++ b/crates/ruff_python_formatter/src/other/f_string.rs
@ -59,16 +59,16 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
            return result;
        }

-        let quotes = normalizer.choose_quotes(&string, &locator);
+        let quote_selection = normalizer.choose_quotes(&string, &locator);

        let context = FStringContext::new(
            string.prefix(),
-            quotes,
+            quote_selection.quotes(),
            FStringLayout::from_f_string(self.value, &locator),
        );

        // Starting prefix and quote
-        write!(f, [string.prefix(), quotes])?;
+        write!(f, [string.prefix(), quote_selection.quotes()])?;

        f.join()
            .entries(
@ -80,7 +80,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
            .finish()?;

        // Ending quote
-        quotes.fmt(f)
+        quote_selection.quotes().fmt(f)
    }
 }

--- a/crates/ruff_python_formatter/src/other/f_string_element.rs
+++ b/crates/ruff_python_formatter/src/other/f_string_element.rs
@ -59,6 +59,7 @@ impl Format<PyFormatContext<'_>> for FormatFStringLiteralElement<'_> {
        let literal_content = f.context().locator().slice(self.element.range());
        let normalized = normalize_string(
            literal_content,
+            0,
            self.context.quotes(),
            self.context.prefix(),
            is_hex_codes_in_unicode_sequences_enabled(f.context()),
--- a/crates/ruff_python_formatter/src/string/normalize.rs
+++ b/crates/ruff_python_formatter/src/string/normalize.rs
@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::iter::FusedIterator;

 use ruff_formatter::FormatContext;
 use ruff_source_file::Locator;
@ -44,68 +45,8 @@ impl StringNormalizer {
        self
    }

-    /// Computes the strings preferred quotes.
-    pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
-        // Per PEP 8, always prefer double quotes for triple-quoted strings.
-        // Except when using quote-style-preserve.
-        let preferred_style = if string.quotes().triple {
-            // ... unless we're formatting a code snippet inside a docstring,
-            // then we specifically want to invert our quote style to avoid
-            // writing out invalid Python.
-            //
-            // It's worth pointing out that we can actually wind up being
-            // somewhat out of sync with PEP8 in this case. Consider this
-            // example:
-            //
-            //     def foo():
-            //         '''
-            //         Something.
-            //
-            //         >>> """tricksy"""
-            //         '''
-            //         pass
-            //
-            // Ideally, this would be reformatted as:
-            //
-            //     def foo():
-            //         """
-            //         Something.
-            //
-            //         >>> '''tricksy'''
-            //         """
-            //         pass
-            //
-            // But the logic here results in the original quoting being
-            // preserved. This is because the quoting style of the outer
-            // docstring is determined, in part, by looking at its contents. In
-            // this case, it notices that it contains a `"""` and thus infers
-            // that using `'''` would overall read better because it avoids
-            // the need to escape the interior `"""`. Except... in this case,
-            // the `"""` is actually part of a code snippet that could get
-            // reformatted to using a different quoting style itself.
-            //
-            // Fixing this would, I believe, require some fairly seismic
-            // changes to how formatting strings works. Namely, we would need
-            // to look for code snippets before normalizing the docstring, and
-            // then figure out the quoting style more holistically by looking
-            // at the various kinds of quotes used in the code snippets and
-            // what reformatting them might look like.
-            //
-            // Overall this is a bit of a corner case and just inverting the
-            // style from what the parent ultimately decided upon works, even
-            // if it doesn't have perfect alignment with PEP8.
-            if let Some(quote) = self.parent_docstring_quote_char {
-                QuoteStyle::from(quote.invert())
-            } else if self.preferred_quote_style.is_preserve() {
-                QuoteStyle::Preserve
-            } else {
-                QuoteStyle::Double
-            }
-        } else {
-            self.preferred_quote_style
-        };
-
-        let quoting = if let FStringState::InsideExpressionElement(context) = self.f_string_state {
+    fn quoting(&self, string: &StringPart) -> Quoting {
+        if let FStringState::InsideExpressionElement(context) = self.f_string_state {
            // If we're inside an f-string, we need to make sure to preserve the
            // existing quotes unless we're inside a triple-quoted f-string and
            // the inner string itself isn't triple-quoted. For example:
@ -129,22 +70,110 @@ impl StringNormalizer {
            }
        } else {
            self.quoting
-        };
+        }
+    }

-        match quoting {
+    /// Computes the strings preferred quotes.
+    pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> QuoteSelection {
+        let raw_content = locator.slice(string.content_range());
+        let first_quote_or_normalized_char_offset = raw_content
+            .bytes()
+            .position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));
+
+        let quotes = match self.quoting(string) {
            Quoting::Preserve => string.quotes(),
            Quoting::CanChange => {
-                if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
-                    let raw_content = locator.slice(string.content_range());
-                    if string.prefix().is_raw_string() {
-                        choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
+                // Per PEP 8, always prefer double quotes for triple-quoted strings.
+                // Except when using quote-style-preserve.
+                let preferred_style = if string.quotes().triple {
+                    // ... unless we're formatting a code snippet inside a docstring,
+                    // then we specifically want to invert our quote style to avoid
+                    // writing out invalid Python.
+                    //
+                    // It's worth pointing out that we can actually wind up being
+                    // somewhat out of sync with PEP8 in this case. Consider this
+                    // example:
+                    //
+                    //     def foo():
+                    //         '''
+                    //         Something.
+                    //
+                    //         >>> """tricksy"""
+                    //         '''
+                    //         pass
+                    //
+                    // Ideally, this would be reformatted as:
+                    //
+                    //     def foo():
+                    //         """
+                    //         Something.
+                    //
+                    //         >>> '''tricksy'''
+                    //         """
+                    //         pass
+                    //
+                    // But the logic here results in the original quoting being
+                    // preserved. This is because the quoting style of the outer
+                    // docstring is determined, in part, by looking at its contents. In
+                    // this case, it notices that it contains a `"""` and thus infers
+                    // that using `'''` would overall read better because it avoids
+                    // the need to escape the interior `"""`. Except... in this case,
+                    // the `"""` is actually part of a code snippet that could get
+                    // reformatted to using a different quoting style itself.
+                    //
+                    // Fixing this would, I believe, require some fairly seismic
+                    // changes to how formatting strings works. Namely, we would need
+                    // to look for code snippets before normalizing the docstring, and
+                    // then figure out the quoting style more holistically by looking
+                    // at the various kinds of quotes used in the code snippets and
+                    // what reformatting them might look like.
+                    //
+                    // Overall this is a bit of a corner case and just inverting the
+                    // style from what the parent ultimately decided upon works, even
+                    // if it doesn't have perfect alignment with PEP8.
+                    if let Some(quote) = self.parent_docstring_quote_char {
+                        QuoteStyle::from(quote.invert())
+                    } else if self.preferred_quote_style.is_preserve() {
+                        QuoteStyle::Preserve
                    } else {
-                        choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
+                        QuoteStyle::Double
+                    }
+                } else {
+                    self.preferred_quote_style
+                };
+
+                if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
+                    if let Some(first_quote_or_normalized_char_offset) =
+                        first_quote_or_normalized_char_offset
+                    {
+                        if string.prefix().is_raw_string() {
+                            choose_quotes_for_raw_string(
+                                &raw_content[first_quote_or_normalized_char_offset..],
+                                string.quotes(),
+                                preferred_quote,
+                            )
+                        } else {
+                            choose_quotes_impl(
+                                &raw_content[first_quote_or_normalized_char_offset..],
+                                string.quotes(),
+                                preferred_quote,
+                            )
+                        }
+                    } else {
+                        StringQuotes {
+                            quote_char: preferred_quote,
+                            triple: string.quotes().is_triple(),
+                        }
                    }
                } else {
                    string.quotes()
                }
            }
+        };
+
+        QuoteSelection {
+            quotes,
+            first_quote_or_normalized_char_offset,
        }
    }

@ -156,25 +185,48 @@ impl StringNormalizer {
    ) -> NormalizedString<'a> {
        let raw_content = locator.slice(string.content_range());

-        let quotes = self.choose_quotes(string, locator);
+        let quote_selection = self.choose_quotes(string, locator);

-        let normalized = normalize_string(
-            raw_content,
-            quotes,
-            string.prefix(),
-            self.normalize_hex,
-            self.format_fstring,
-        );
+        let normalized = if let Some(first_quote_or_escape_offset) =
+            quote_selection.first_quote_or_normalized_char_offset
+        {
+            normalize_string(
+                raw_content,
+                first_quote_or_escape_offset,
+                quote_selection.quotes,
+                string.prefix(),
+                self.normalize_hex,
+                // TODO: Remove the `b'{'` in `choose_quotes` when promoting the
+                // `format_fstring` preview style
+                self.format_fstring,
+            )
+        } else {
+            Cow::Borrowed(raw_content)
+        };

        NormalizedString {
            prefix: string.prefix(),
            content_range: string.content_range(),
            text: normalized,
-            quotes,
+            quotes: quote_selection.quotes,
        }
    }
 }

+#[derive(Debug)]
+pub(crate) struct QuoteSelection {
+    quotes: StringQuotes,
+
+    /// Offset to the first quote character or character that needs special handling in [`normalize_string`].
+    first_quote_or_normalized_char_offset: Option<usize>,
+}
+
+impl QuoteSelection {
+    pub(crate) fn quotes(&self) -> StringQuotes {
+        self.quotes
+    }
+}
+
 #[derive(Debug)]
 pub(crate) struct NormalizedString<'a> {
    prefix: crate::string::StringPrefix,
@ -399,6 +451,7 @@ fn choose_quotes_impl(
 /// Returns the normalized string and whether it contains new lines.
 pub(crate) fn normalize_string(
    input: &str,
+    start_offset: usize,
    quotes: StringQuotes,
    prefix: StringPrefix,
    normalize_hex: bool,
@ -415,7 +468,7 @@ pub(crate) fn normalize_string(
    let preferred_quote = quote.as_char();
    let opposite_quote = quote.invert().as_char();

-    let mut chars = input.char_indices().peekable();
+    let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();

    let is_raw = prefix.is_raw_string();
    let is_fstring = !format_fstring && prefix.is_fstring();
@ -454,13 +507,11 @@ pub(crate) fn normalize_string(
                        // Skip over escaped backslashes
                        chars.next();
                    } else if normalize_hex {
+                        // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
+                        let escape_start_len = '\\'.len_utf8() + next.len_utf8();
                        if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
-                            .and_then(|escape| {
-                                escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
-                            })
+                            .and_then(|escape| escape.normalize(&input[index + escape_start_len..]))
                        {
-                            // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
-                            let escape_start_len = '\\'.len_utf8() + next.len_utf8();
                            let escape_start_offset = index + escape_start_len;
                            if let Cow::Owned(normalised) = &normalised {
                                output.push_str(&input[last_index..escape_start_offset]);
@ -510,6 +561,35 @@ pub(crate) fn normalize_string(
    normalized
 }

+#[derive(Clone, Debug)]
+struct CharIndicesWithOffset<'str> {
+    chars: std::str::Chars<'str>,
+    next_offset: usize,
+}
+
+impl<'str> CharIndicesWithOffset<'str> {
+    fn new(input: &'str str, start_offset: usize) -> Self {
+        Self {
+            chars: input[start_offset..].chars(),
+            next_offset: start_offset,
+        }
+    }
+}
+
+impl<'str> Iterator for CharIndicesWithOffset<'str> {
+    type Item = (usize, char);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.chars.next().map(|c| {
+            let index = self.next_offset;
+            self.next_offset += c.len_utf8();
+            (index, c)
+        })
+    }
+}
+
+impl FusedIterator for CharIndicesWithOffset<'_> {}
+
 #[derive(Copy, Clone, Debug, PartialEq, Eq)]
 enum UnicodeEscape {
    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
@ -651,6 +731,7 @@ mod tests {

        let normalized = normalize_string(
            input,
+            0,
            StringQuotes {
                triple: false,
                quote_char: QuoteChar::Double,