Perf: Skip string normalization when possible (#10116)

This commit is contained in:
Micha Reiser 2024-02-26 18:35:29 +01:00 committed by GitHub
parent 15b87ea8be
commit 8dc22d5793
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 170 additions and 88 deletions

View file

@ -59,16 +59,16 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
return result;
}
let quotes = normalizer.choose_quotes(&string, &locator);
let quote_selection = normalizer.choose_quotes(&string, &locator);
let context = FStringContext::new(
string.prefix(),
quotes,
quote_selection.quotes(),
FStringLayout::from_f_string(self.value, &locator),
);
// Starting prefix and quote
write!(f, [string.prefix(), quotes])?;
write!(f, [string.prefix(), quote_selection.quotes()])?;
f.join()
.entries(
@ -80,7 +80,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
.finish()?;
// Ending quote
quotes.fmt(f)
quote_selection.quotes().fmt(f)
}
}

View file

@ -59,6 +59,7 @@ impl Format<PyFormatContext<'_>> for FormatFStringLiteralElement<'_> {
let literal_content = f.context().locator().slice(self.element.range());
let normalized = normalize_string(
literal_content,
0,
self.context.quotes(),
self.context.prefix(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),

View file

@ -1,4 +1,5 @@
use std::borrow::Cow;
use std::iter::FusedIterator;
use ruff_formatter::FormatContext;
use ruff_source_file::Locator;
@ -44,68 +45,8 @@ impl StringNormalizer {
self
}
/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};
let quoting = if let FStringState::InsideExpressionElement(context) = self.f_string_state {
fn quoting(&self, string: &StringPart) -> Quoting {
if let FStringState::InsideExpressionElement(context) = self.f_string_state {
// If we're inside an f-string, we need to make sure to preserve the
// existing quotes unless we're inside a triple-quoted f-string and
// the inner string itself isn't triple-quoted. For example:
@ -129,22 +70,110 @@ impl StringNormalizer {
}
} else {
self.quoting
};
}
}
match quoting {
/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> QuoteSelection {
let raw_content = locator.slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));
let quotes = match self.quoting(string) {
Quoting::Preserve => string.quotes(),
Quoting::CanChange => {
if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
let raw_content = locator.slice(string.content_range());
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string.quotes().triple {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
//
// It's worth pointing out that we can actually wind up being
// somewhat out of sync with PEP8 in this case. Consider this
// example:
//
// def foo():
// '''
// Something.
//
// >>> """tricksy"""
// '''
// pass
//
// Ideally, this would be reformatted as:
//
// def foo():
// """
// Something.
//
// >>> '''tricksy'''
// """
// pass
//
// But the logic here results in the original quoting being
// preserved. This is because the quoting style of the outer
// docstring is determined, in part, by looking at its contents. In
// this case, it notices that it contains a `"""` and thus infers
// that using `'''` would overall read better because it avoids
// the need to escape the interior `"""`. Except... in this case,
// the `"""` is actually part of a code snippet that could get
// reformatted to using a different quoting style itself.
//
// Fixing this would, I believe, require some fairly seismic
// changes to how formatting strings works. Namely, we would need
// to look for code snippets before normalizing the docstring, and
// then figure out the quoting style more holistically by looking
// at the various kinds of quotes used in the code snippets and
// what reformatting them might look like.
//
// Overall this is a bit of a corner case and just inverting the
// style from what the parent ultimately decided upon works, even
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.parent_docstring_quote_char {
QuoteStyle::from(quote.invert())
} else if self.preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};
if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
if let Some(first_quote_or_normalized_char_offset) =
first_quote_or_normalized_char_offset
{
if string.prefix().is_raw_string() {
choose_quotes_for_raw_string(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
} else {
choose_quotes_impl(
&raw_content[first_quote_or_normalized_char_offset..],
string.quotes(),
preferred_quote,
)
}
} else {
StringQuotes {
quote_char: preferred_quote,
triple: string.quotes().is_triple(),
}
}
} else {
string.quotes()
}
}
};
QuoteSelection {
quotes,
first_quote_or_normalized_char_offset,
}
}
@ -156,25 +185,48 @@ impl StringNormalizer {
) -> NormalizedString<'a> {
let raw_content = locator.slice(string.content_range());
let quotes = self.choose_quotes(string, locator);
let quote_selection = self.choose_quotes(string, locator);
let normalized = normalize_string(
raw_content,
quotes,
string.prefix(),
self.normalize_hex,
self.format_fstring,
);
let normalized = if let Some(first_quote_or_escape_offset) =
quote_selection.first_quote_or_normalized_char_offset
{
normalize_string(
raw_content,
first_quote_or_escape_offset,
quote_selection.quotes,
string.prefix(),
self.normalize_hex,
// TODO: Remove the `b'{'` in `choose_quotes` when promoting the
// `format_fstring` preview style
self.format_fstring,
)
} else {
Cow::Borrowed(raw_content)
};
NormalizedString {
prefix: string.prefix(),
content_range: string.content_range(),
text: normalized,
quotes,
quotes: quote_selection.quotes,
}
}
}
#[derive(Debug)]
pub(crate) struct QuoteSelection {
quotes: StringQuotes,
/// Offset to the first quote character or character that needs special handling in [`normalize_string`].
first_quote_or_normalized_char_offset: Option<usize>,
}
impl QuoteSelection {
pub(crate) fn quotes(&self) -> StringQuotes {
self.quotes
}
}
#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
prefix: crate::string::StringPrefix,
@ -399,6 +451,7 @@ fn choose_quotes_impl(
/// Returns the normalized string and whether it contains new lines.
pub(crate) fn normalize_string(
input: &str,
start_offset: usize,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
@ -415,7 +468,7 @@ pub(crate) fn normalize_string(
let preferred_quote = quote.as_char();
let opposite_quote = quote.invert().as_char();
let mut chars = input.char_indices().peekable();
let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();
let is_raw = prefix.is_raw_string();
let is_fstring = !format_fstring && prefix.is_fstring();
@ -454,13 +507,11 @@ pub(crate) fn normalize_string(
// Skip over escaped backslashes
chars.next();
} else if normalize_hex {
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
.and_then(|escape| {
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
})
.and_then(|escape| escape.normalize(&input[index + escape_start_len..]))
{
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
output.push_str(&input[last_index..escape_start_offset]);
@ -510,6 +561,35 @@ pub(crate) fn normalize_string(
normalized
}
#[derive(Clone, Debug)]
struct CharIndicesWithOffset<'str> {
chars: std::str::Chars<'str>,
next_offset: usize,
}
impl<'str> CharIndicesWithOffset<'str> {
fn new(input: &'str str, start_offset: usize) -> Self {
Self {
chars: input[start_offset..].chars(),
next_offset: start_offset,
}
}
}
impl<'str> Iterator for CharIndicesWithOffset<'str> {
type Item = (usize, char);
fn next(&mut self) -> Option<Self::Item> {
self.chars.next().map(|c| {
let index = self.next_offset;
self.next_offset += c.len_utf8();
(index, c)
})
}
}
impl FusedIterator for CharIndicesWithOffset<'_> {}
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
@ -651,6 +731,7 @@ mod tests {
let normalized = normalize_string(
input,
0,
StringQuotes {
triple: false,
quote_char: QuoteChar::Double,