formatter: Introduce QuoteMetadata (#13858)

This commit is contained in:
Micha Reiser 2024-10-21 21:23:46 +02:00 committed by GitHub
parent 9e3cf14dde
commit e9dd92107c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -14,7 +14,7 @@ use crate::QuoteStyle;
pub(crate) struct StringNormalizer<'a, 'src> { pub(crate) struct StringNormalizer<'a, 'src> {
quoting: Quoting, quoting: Quoting,
preferred_quote_style: QuoteStyle, preferred_quote_style: Option<QuoteStyle>,
context: &'a PyFormatContext<'src>, context: &'a PyFormatContext<'src>,
} }
@ -22,13 +22,13 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
pub(crate) fn from_context(context: &'a PyFormatContext<'src>) -> Self { pub(crate) fn from_context(context: &'a PyFormatContext<'src>) -> Self {
Self { Self {
quoting: Quoting::default(), quoting: Quoting::default(),
preferred_quote_style: context.options().quote_style(), preferred_quote_style: None,
context, context,
} }
} }
pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self { pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self {
self.preferred_quote_style = quote_style; self.preferred_quote_style = Some(quote_style);
self self
} }
@ -38,7 +38,9 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
} }
fn quoting(&self, string: StringLikePart) -> Quoting { fn quoting(&self, string: StringLikePart) -> Quoting {
if let FStringState::InsideExpressionElement(context) = self.context.f_string_state() { match (self.quoting, self.context.f_string_state()) {
(Quoting::Preserve, _) => Quoting::Preserve,
// If we're inside an f-string, we need to make sure to preserve the // If we're inside an f-string, we need to make sure to preserve the
// existing quotes unless we're inside a triple-quoted f-string and // existing quotes unless we're inside a triple-quoted f-string and
// the inner string itself isn't triple-quoted. For example: // the inner string itself isn't triple-quoted. For example:
@ -53,32 +55,36 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
// The reason to preserve the quotes is based on the assumption that // The reason to preserve the quotes is based on the assumption that
// the original f-string is valid in terms of quoting, and we don't // the original f-string is valid in terms of quoting, and we don't
// want to change that to make it invalid. // want to change that to make it invalid.
if (context.f_string().flags().is_triple_quoted() && !string.flags().is_triple_quoted()) (Quoting::CanChange, FStringState::InsideExpressionElement(context)) => {
|| self.context.options().target_version().supports_pep_701() if (context.f_string().flags().is_triple_quoted()
{ && !string.flags().is_triple_quoted())
self.quoting || self.context.options().target_version().supports_pep_701()
} else { {
Quoting::Preserve Quoting::CanChange
} else {
Quoting::Preserve
}
} }
} else {
self.quoting (Quoting::CanChange, _) => Quoting::CanChange,
} }
} }
/// Computes the strings preferred quotes. /// Determines the preferred quote style for `string`.
pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection { /// The formatter should use the preferred quote style unless
let raw_content = self.context.locator().slice(string.content_range()); /// it can't because the string contains the preferred quotes OR
let first_quote_or_normalized_char_offset = raw_content /// it leads to more escaping.
.bytes() pub(super) fn preferred_quote_style(&self, string: StringLikePart) -> QuoteStyle {
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{')); match self.quoting(string) {
let string_flags = string.flags(); Quoting::Preserve => QuoteStyle::Preserve,
let new_kind = match self.quoting(string) {
Quoting::Preserve => string_flags,
Quoting::CanChange => { Quoting::CanChange => {
let preferred_quote_style = self
.preferred_quote_style
.unwrap_or(self.context.options().quote_style());
// Per PEP 8, always prefer double quotes for triple-quoted strings. // Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve. // Except when using quote-style-preserve.
let preferred_style = if string_flags.is_triple_quoted() { if string.flags().is_triple_quoted() {
// ... unless we're formatting a code snippet inside a docstring, // ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid // then we specifically want to invert our quote style to avoid
// writing out invalid Python. // writing out invalid Python.
@ -126,39 +132,48 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
// if it doesn't have perfect alignment with PEP8. // if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.context.docstring() { if let Some(quote) = self.context.docstring() {
QuoteStyle::from(quote.opposite()) QuoteStyle::from(quote.opposite())
} else if self.preferred_quote_style.is_preserve() { } else if preferred_quote_style.is_preserve() {
QuoteStyle::Preserve QuoteStyle::Preserve
} else { } else {
QuoteStyle::Double QuoteStyle::Double
} }
} else { } else {
self.preferred_quote_style preferred_quote_style
};
if let Ok(preferred_quote) = Quote::try_from(preferred_style) {
if let Some(first_quote_or_normalized_char_offset) =
first_quote_or_normalized_char_offset
{
if string_flags.is_raw_string() {
choose_quotes_for_raw_string(
&raw_content[first_quote_or_normalized_char_offset..],
string_flags,
preferred_quote,
)
} else {
choose_quotes_impl(
&raw_content[first_quote_or_normalized_char_offset..],
string_flags,
preferred_quote,
)
}
} else {
string_flags.with_quote_style(preferred_quote)
}
} else {
string_flags
} }
} }
}
}
/// Computes the strings preferred quotes.
pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection {
let raw_content = self.context.locator().slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));
let string_flags = string.flags();
let preferred_style = self.preferred_quote_style(string);
let new_kind = match (
Quote::try_from(preferred_style),
first_quote_or_normalized_char_offset,
) {
// The string contains no quotes so it's safe to use the preferred quote style
(Ok(preferred_quote), None) => string_flags.with_quote_style(preferred_quote),
// The preferred quote style is single or double quotes, and the string contains a quote or
// another character that may require escaping
(Ok(preferred_quote), Some(first_quote_or_normalized_char_offset)) => {
let quote = QuoteMetadata::from_str(
&raw_content[first_quote_or_normalized_char_offset..],
string.flags(),
preferred_quote,
)
.choose(preferred_quote);
string_flags.with_quote_style(quote)
}
// The preferred quote style is to preserve the quotes, so let's do that.
(Err(()), _) => string_flags,
}; };
QuoteSelection { QuoteSelection {
@ -209,119 +224,93 @@ impl QuoteSelection {
} }
} }
#[derive(Debug)] #[derive(Clone, Debug)]
pub(crate) struct NormalizedString<'a> { pub(crate) struct QuoteMetadata {
/// Holds data about the quotes and prefix of the string kind: QuoteMetadataKind,
flags: AnyStringFlags,
/// The range of the string's content in the source (minus prefix and quotes). /// The quote style in the source.
content_range: TextRange, source_style: Quote,
/// The normalized text
text: Cow<'a, str>,
} }
impl<'a> NormalizedString<'a> { /// Tracks information about the used quotes in a string which is used
pub(crate) fn text(&self) -> &Cow<'a, str> { /// to choose the quotes for a part.
&self.text impl QuoteMetadata {
} pub(crate) fn from_str(text: &str, flags: AnyStringFlags, preferred_quote: Quote) -> Self {
let kind = if flags.is_raw_string() {
QuoteMetadataKind::raw(text, preferred_quote, flags.is_triple_quoted())
} else if flags.is_triple_quoted() {
QuoteMetadataKind::triple_quoted(text, preferred_quote)
} else {
QuoteMetadataKind::regular(text)
};
pub(crate) fn flags(&self) -> AnyStringFlags { Self {
self.flags kind,
} source_style: flags.quote_style(),
}
impl Ranged for NormalizedString<'_> {
fn range(&self) -> TextRange {
self.content_range
}
}
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
let quotes = StringQuotes::from(self.flags);
ruff_formatter::write!(f, [self.flags.prefix(), quotes])?;
match &self.text {
Cow::Borrowed(_) => {
source_text_slice(self.range()).fmt(f)?;
}
Cow::Owned(normalized) => {
text(normalized).fmt(f)?;
}
} }
quotes.fmt(f)
} }
}
/// Choose the appropriate quote style for a raw string. pub(super) fn choose(&self, preferred_quote: Quote) -> Quote {
/// match self.kind {
/// The preferred quote style is chosen unless the string contains unescaped quotes of the QuoteMetadataKind::Raw { contains_preferred } => {
/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote if contains_preferred {
/// style is double quotes. self.source_style
fn choose_quotes_for_raw_string( } else {
input: &str, preferred_quote
flags: AnyStringFlags,
preferred_quote: Quote,
) -> AnyStringFlags {
let preferred_quote_char = preferred_quote.as_char();
let mut chars = input.chars().peekable();
let contains_unescaped_configured_quotes = loop {
match chars.next() {
Some('\\') => {
// Ignore escaped characters
chars.next();
}
// `"` or `'`
Some(c) if c == preferred_quote_char => {
if !flags.is_triple_quoted() {
break true;
}
match chars.peek() {
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
// about where the closing triple quotes start
None => break true,
Some(next) if *next == preferred_quote_char => {
// `""` or `''`
chars.next();
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
// `"""` or `'''` respectively inside the string
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) {
break true;
}
}
_ => {}
} }
} }
Some(_) => continue, QuoteMetadataKind::Triple { contains_preferred } => {
None => break false, if contains_preferred {
self.source_style
} else {
preferred_quote
}
}
QuoteMetadataKind::Regular {
single_quotes,
double_quotes,
} => match single_quotes.cmp(&double_quotes) {
Ordering::Less => Quote::Single,
Ordering::Equal => preferred_quote,
Ordering::Greater => Quote::Double,
},
} }
};
if contains_unescaped_configured_quotes {
flags
} else {
flags.with_quote_style(preferred_quote)
} }
} }
/// Choose the appropriate quote style for a string. #[derive(Copy, Clone, Debug)]
/// enum QuoteMetadataKind {
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style /// A raw string.
/// would require fewer escapes. ///
/// /// For raw strings it's only possible to change the quotes if the preferred quote style
/// For triple quoted strings, the preferred quote style is always used, unless the string contains /// isn't used inside the string.
/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be Raw { contains_preferred: bool },
/// used unless the string contains `"""`).
fn choose_quotes_impl( /// Regular (non raw) triple quoted string.
input: &str, ///
flags: AnyStringFlags, /// For triple quoted strings it's only possible to change the quotes if no
preferred_quote: Quote, /// triple of the preferred quotes is used inside the string.
) -> AnyStringFlags { Triple { contains_preferred: bool },
let quote = if flags.is_triple_quoted() {
/// A single quoted string that uses either double or single quotes.
///
/// For regular strings it's desired to pick the quote style that requires the least escaping.
/// E.g. pick single quotes for `'A "dog"'` because using single quotes would require escaping
/// the two `"`.
Regular {
single_quotes: u32,
double_quotes: u32,
},
}
impl QuoteMetadataKind {
/// For triple quoted strings, the preferred quote style can't be used if the string contains
/// a tripled of the quote character (e.g., if double quotes are preferred, double quotes will be
/// used unless the string contains `"""`).
fn triple_quoted(content: &str, preferred_quote: Quote) -> Self {
// True if the string contains a triple quote sequence of the configured quote style. // True if the string contains a triple quote sequence of the configured quote style.
let mut uses_triple_quotes = false; let mut uses_triple_quotes = false;
let mut chars = input.chars().peekable(); let mut chars = content.chars().peekable();
while let Some(c) = chars.next() { while let Some(c) = chars.next() {
let preferred_quote_char = preferred_quote.as_char(); let preferred_quote_char = preferred_quote.as_char();
@ -369,18 +358,18 @@ fn choose_quotes_impl(
} }
} }
if uses_triple_quotes { Self::Triple {
// String contains a triple quote sequence of the configured quote style. contains_preferred: uses_triple_quotes,
// Keep the existing quote style.
flags.quote_style()
} else {
preferred_quote
} }
} else { }
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style
/// would require fewer escapes.
fn regular(text: &str) -> Self {
let mut single_quotes = 0u32; let mut single_quotes = 0u32;
let mut double_quotes = 0u32; let mut double_quotes = 0u32;
for c in input.chars() { for c in text.chars() {
match c { match c {
'\'' => { '\'' => {
single_quotes += 1; single_quotes += 1;
@ -394,25 +383,105 @@ fn choose_quotes_impl(
} }
} }
match single_quotes.cmp(&double_quotes) { Self::Regular {
Ordering::Less => Quote::Single, single_quotes,
Ordering::Equal => preferred_quote, double_quotes,
Ordering::Greater => Quote::Double,
} }
}; }
flags.with_quote_style(quote) /// Computes if a raw string uses the preferred quote. If it does, then it's not possible
/// to change the quote style because it would require escaping which isn't possible in raw strings.
fn raw(text: &str, preferred: Quote, triple_quoted: bool) -> Self {
let mut chars = text.chars().peekable();
let preferred_quote_char = preferred.as_char();
let contains_unescaped_configured_quotes = loop {
match chars.next() {
Some('\\') => {
// Ignore escaped characters
chars.next();
}
// `"` or `'`
Some(c) if c == preferred_quote_char => {
if !triple_quoted {
break true;
}
match chars.peek() {
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
// about where the closing triple quotes start
None => break true,
Some(next) if *next == preferred_quote_char => {
// `""` or `''`
chars.next();
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
// `"""` or `'''` respectively inside the string
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char)
{
break true;
}
}
_ => {}
}
}
Some(_) => continue,
None => break false,
}
};
Self::Raw {
contains_preferred: contains_unescaped_configured_quotes,
}
}
}
#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
/// Holds data about the quotes and prefix of the string
flags: AnyStringFlags,
/// The range of the string's content in the source (minus prefix and quotes).
content_range: TextRange,
/// The normalized text
text: Cow<'a, str>,
}
impl<'a> NormalizedString<'a> {
pub(crate) fn text(&self) -> &Cow<'a, str> {
&self.text
}
pub(crate) fn flags(&self) -> AnyStringFlags {
self.flags
}
}
impl Ranged for NormalizedString<'_> {
fn range(&self) -> TextRange {
self.content_range
}
}
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
let quotes = StringQuotes::from(self.flags);
ruff_formatter::write!(f, [self.flags.prefix(), quotes])?;
match &self.text {
Cow::Borrowed(_) => source_text_slice(self.range()).fmt(f)?,
Cow::Owned(normalized) => text(normalized).fmt(f)?,
}
quotes.fmt(f)
}
} }
/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input`
/// with the provided [`StringQuotes`] style.
///
/// Returns the normalized string and whether it contains new lines.
pub(crate) fn normalize_string( pub(crate) fn normalize_string(
input: &str, input: &str,
start_offset: usize, start_offset: usize,
flags: AnyStringFlags, new_flags: AnyStringFlags,
format_fstring: bool, format_f_string: bool,
) -> Cow<str> { ) -> Cow<str> {
// The normalized string if `input` is not yet normalized. // The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized. // `output` must remain empty if `input` is already normalized.
@ -421,18 +490,19 @@ pub(crate) fn normalize_string(
// If `last_index` is `0` at the end, then the input is already normalized and can be returned as is. // If `last_index` is `0` at the end, then the input is already normalized and can be returned as is.
let mut last_index = 0; let mut last_index = 0;
let quote = flags.quote_style(); let quote = new_flags.quote_style();
let preferred_quote = quote.as_char(); let preferred_quote = quote.as_char();
let opposite_quote = quote.opposite().as_char(); let opposite_quote = quote.opposite().as_char();
let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable(); let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();
let is_raw = flags.is_raw_string(); let is_raw = new_flags.is_raw_string();
let is_fstring = !format_fstring && flags.is_f_string();
let is_fstring = !format_f_string && new_flags.is_f_string();
let mut formatted_value_nesting = 0u32; let mut formatted_value_nesting = 0u32;
while let Some((index, c)) = chars.next() { while let Some((index, c)) = chars.next() {
if is_fstring && matches!(c, '{' | '}') { if matches!(c, '{' | '}') && is_fstring {
if chars.peek().copied().is_some_and(|(_, next)| next == c) { if chars.peek().copied().is_some_and(|(_, next)| next == c) {
// Skip over the second character of the double braces // Skip over the second character of the double braces
chars.next(); chars.next();
@ -444,6 +514,7 @@ pub(crate) fn normalize_string(
} }
continue; continue;
} }
if c == '\r' { if c == '\r' {
output.push_str(&input[last_index..index]); output.push_str(&input[last_index..index]);
@ -466,8 +537,10 @@ pub(crate) fn normalize_string(
} else { } else {
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`) // Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8(); let escape_start_len = '\\'.len_utf8() + next.len_utf8();
if let Some(normalised) = UnicodeEscape::new(next, !flags.is_byte_string()) if let Some(normalised) =
.and_then(|escape| escape.normalize(&input[index + escape_start_len..])) UnicodeEscape::new(next, !new_flags.is_byte_string()).and_then(
|escape| escape.normalize(&input[index + escape_start_len..]),
)
{ {
let escape_start_offset = index + escape_start_len; let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised { if let Cow::Owned(normalised) = &normalised {
@ -485,7 +558,7 @@ pub(crate) fn normalize_string(
} }
} }
if !flags.is_triple_quoted() { if !new_flags.is_triple_quoted() {
#[allow(clippy::if_same_then_else)] #[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 { if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote // Remove the escape by ending before the backslash and starting again with the quote
@ -498,7 +571,7 @@ pub(crate) fn normalize_string(
} }
} }
} }
} else if !flags.is_triple_quoted() } else if !new_flags.is_triple_quoted()
&& c == preferred_quote && c == preferred_quote
&& formatted_value_nesting == 0 && formatted_value_nesting == 0
{ {
@ -511,14 +584,12 @@ pub(crate) fn normalize_string(
} }
} }
let normalized = if last_index == 0 { if last_index == 0 {
Cow::Borrowed(input) Cow::Borrowed(input)
} else { } else {
output.push_str(&input[last_index..]); output.push_str(&input[last_index..]);
Cow::Owned(output) Cow::Owned(output)
}; }
normalized
} }
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@ -671,14 +742,14 @@ impl UnicodeEscape {
mod tests { mod tests {
use std::borrow::Cow; use std::borrow::Cow;
use super::UnicodeEscape;
use crate::string::normalize_string;
use ruff_python_ast::{ use ruff_python_ast::{
str::Quote, str::Quote,
str_prefix::{AnyStringPrefix, ByteStringPrefix}, str_prefix::{AnyStringPrefix, ByteStringPrefix},
AnyStringFlags, AnyStringFlags,
}; };
use super::{normalize_string, UnicodeEscape};
#[test] #[test]
fn normalize_32_escape() { fn normalize_32_escape() {
let escape_sequence = UnicodeEscape::new('U', true).unwrap(); let escape_sequence = UnicodeEscape::new('U', true).unwrap();