formatter: Introduce QuoteMetadata (#13858)

This commit is contained in:
Micha Reiser 2024-10-21 21:23:46 +02:00 committed by GitHub
parent 9e3cf14dde
commit e9dd92107c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -14,7 +14,7 @@ use crate::QuoteStyle;
pub(crate) struct StringNormalizer<'a, 'src> {
quoting: Quoting,
preferred_quote_style: QuoteStyle,
preferred_quote_style: Option<QuoteStyle>,
context: &'a PyFormatContext<'src>,
}
@ -22,13 +22,13 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
pub(crate) fn from_context(context: &'a PyFormatContext<'src>) -> Self {
Self {
quoting: Quoting::default(),
preferred_quote_style: context.options().quote_style(),
preferred_quote_style: None,
context,
}
}
pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self {
self.preferred_quote_style = quote_style;
self.preferred_quote_style = Some(quote_style);
self
}
@ -38,7 +38,9 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
}
fn quoting(&self, string: StringLikePart) -> Quoting {
if let FStringState::InsideExpressionElement(context) = self.context.f_string_state() {
match (self.quoting, self.context.f_string_state()) {
(Quoting::Preserve, _) => Quoting::Preserve,
// If we're inside an f-string, we need to make sure to preserve the
// existing quotes unless we're inside a triple-quoted f-string and
// the inner string itself isn't triple-quoted. For example:
@ -53,32 +55,36 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
// The reason to preserve the quotes is based on the assumption that
// the original f-string is valid in terms of quoting, and we don't
// want to change that to make it invalid.
if (context.f_string().flags().is_triple_quoted() && !string.flags().is_triple_quoted())
|| self.context.options().target_version().supports_pep_701()
{
self.quoting
} else {
Quoting::Preserve
(Quoting::CanChange, FStringState::InsideExpressionElement(context)) => {
if (context.f_string().flags().is_triple_quoted()
&& !string.flags().is_triple_quoted())
|| self.context.options().target_version().supports_pep_701()
{
Quoting::CanChange
} else {
Quoting::Preserve
}
}
} else {
self.quoting
(Quoting::CanChange, _) => Quoting::CanChange,
}
}
/// Computes the string's preferred quotes.
pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection {
let raw_content = self.context.locator().slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));
let string_flags = string.flags();
let new_kind = match self.quoting(string) {
Quoting::Preserve => string_flags,
/// Determines the preferred quote style for `string`.
/// The formatter should use the preferred quote style unless
/// it can't because the string contains the preferred quotes OR
/// it leads to more escaping.
pub(super) fn preferred_quote_style(&self, string: StringLikePart) -> QuoteStyle {
match self.quoting(string) {
Quoting::Preserve => QuoteStyle::Preserve,
Quoting::CanChange => {
let preferred_quote_style = self
.preferred_quote_style
.unwrap_or(self.context.options().quote_style());
// Per PEP 8, always prefer double quotes for triple-quoted strings.
// Except when using quote-style-preserve.
let preferred_style = if string_flags.is_triple_quoted() {
if string.flags().is_triple_quoted() {
// ... unless we're formatting a code snippet inside a docstring,
// then we specifically want to invert our quote style to avoid
// writing out invalid Python.
@ -126,39 +132,48 @@ impl<'a, 'src> StringNormalizer<'a, 'src> {
// if it doesn't have perfect alignment with PEP8.
if let Some(quote) = self.context.docstring() {
QuoteStyle::from(quote.opposite())
} else if self.preferred_quote_style.is_preserve() {
} else if preferred_quote_style.is_preserve() {
QuoteStyle::Preserve
} else {
QuoteStyle::Double
}
} else {
self.preferred_quote_style
};
if let Ok(preferred_quote) = Quote::try_from(preferred_style) {
if let Some(first_quote_or_normalized_char_offset) =
first_quote_or_normalized_char_offset
{
if string_flags.is_raw_string() {
choose_quotes_for_raw_string(
&raw_content[first_quote_or_normalized_char_offset..],
string_flags,
preferred_quote,
)
} else {
choose_quotes_impl(
&raw_content[first_quote_or_normalized_char_offset..],
string_flags,
preferred_quote,
)
}
} else {
string_flags.with_quote_style(preferred_quote)
}
} else {
string_flags
preferred_quote_style
}
}
}
}
/// Computes the string's preferred quotes.
pub(crate) fn choose_quotes(&self, string: StringLikePart) -> QuoteSelection {
let raw_content = self.context.locator().slice(string.content_range());
let first_quote_or_normalized_char_offset = raw_content
.bytes()
.position(|b| matches!(b, b'\\' | b'"' | b'\'' | b'\r' | b'{'));
let string_flags = string.flags();
let preferred_style = self.preferred_quote_style(string);
let new_kind = match (
Quote::try_from(preferred_style),
first_quote_or_normalized_char_offset,
) {
// The string contains no quotes so it's safe to use the preferred quote style
(Ok(preferred_quote), None) => string_flags.with_quote_style(preferred_quote),
// The preferred quote style is single or double quotes, and the string contains a quote or
// another character that may require escaping
(Ok(preferred_quote), Some(first_quote_or_normalized_char_offset)) => {
let quote = QuoteMetadata::from_str(
&raw_content[first_quote_or_normalized_char_offset..],
string.flags(),
preferred_quote,
)
.choose(preferred_quote);
string_flags.with_quote_style(quote)
}
// The preferred quote style is to preserve the quotes, so let's do that.
(Err(()), _) => string_flags,
};
QuoteSelection {
@ -209,119 +224,93 @@ impl QuoteSelection {
}
}
#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
/// Holds data about the quotes and prefix of the string
flags: AnyStringFlags,
#[derive(Clone, Debug)]
pub(crate) struct QuoteMetadata {
kind: QuoteMetadataKind,
/// The range of the string's content in the source (minus prefix and quotes).
content_range: TextRange,
/// The normalized text
text: Cow<'a, str>,
/// The quote style in the source.
source_style: Quote,
}
impl<'a> NormalizedString<'a> {
pub(crate) fn text(&self) -> &Cow<'a, str> {
&self.text
}
/// Tracks information about the used quotes in a string which is used
/// to choose the quotes for a part.
impl QuoteMetadata {
pub(crate) fn from_str(text: &str, flags: AnyStringFlags, preferred_quote: Quote) -> Self {
let kind = if flags.is_raw_string() {
QuoteMetadataKind::raw(text, preferred_quote, flags.is_triple_quoted())
} else if flags.is_triple_quoted() {
QuoteMetadataKind::triple_quoted(text, preferred_quote)
} else {
QuoteMetadataKind::regular(text)
};
pub(crate) fn flags(&self) -> AnyStringFlags {
self.flags
}
}
impl Ranged for NormalizedString<'_> {
fn range(&self) -> TextRange {
self.content_range
}
}
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
let quotes = StringQuotes::from(self.flags);
ruff_formatter::write!(f, [self.flags.prefix(), quotes])?;
match &self.text {
Cow::Borrowed(_) => {
source_text_slice(self.range()).fmt(f)?;
}
Cow::Owned(normalized) => {
text(normalized).fmt(f)?;
}
Self {
kind,
source_style: flags.quote_style(),
}
quotes.fmt(f)
}
}
/// Choose the appropriate quote style for a raw string.
///
/// The preferred quote style is chosen unless the string contains unescaped quotes of the
/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote
/// style is double quotes.
fn choose_quotes_for_raw_string(
input: &str,
flags: AnyStringFlags,
preferred_quote: Quote,
) -> AnyStringFlags {
let preferred_quote_char = preferred_quote.as_char();
let mut chars = input.chars().peekable();
let contains_unescaped_configured_quotes = loop {
match chars.next() {
Some('\\') => {
// Ignore escaped characters
chars.next();
}
// `"` or `'`
Some(c) if c == preferred_quote_char => {
if !flags.is_triple_quoted() {
break true;
}
match chars.peek() {
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
// about where the closing triple quotes start
None => break true,
Some(next) if *next == preferred_quote_char => {
// `""` or `''`
chars.next();
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
// `"""` or `'''` respectively inside the string
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) {
break true;
}
}
_ => {}
pub(super) fn choose(&self, preferred_quote: Quote) -> Quote {
match self.kind {
QuoteMetadataKind::Raw { contains_preferred } => {
if contains_preferred {
self.source_style
} else {
preferred_quote
}
}
Some(_) => continue,
None => break false,
QuoteMetadataKind::Triple { contains_preferred } => {
if contains_preferred {
self.source_style
} else {
preferred_quote
}
}
QuoteMetadataKind::Regular {
single_quotes,
double_quotes,
} => match single_quotes.cmp(&double_quotes) {
Ordering::Less => Quote::Single,
Ordering::Equal => preferred_quote,
Ordering::Greater => Quote::Double,
},
}
};
if contains_unescaped_configured_quotes {
flags
} else {
flags.with_quote_style(preferred_quote)
}
}
/// Choose the appropriate quote style for a string.
///
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style
/// would require fewer escapes.
///
/// For triple quoted strings, the preferred quote style is always used, unless the string contains
/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be
/// used unless the string contains `"""`).
fn choose_quotes_impl(
input: &str,
flags: AnyStringFlags,
preferred_quote: Quote,
) -> AnyStringFlags {
let quote = if flags.is_triple_quoted() {
/// Summary of how quotes are used inside a string's content, specialized by the kind of string
/// (raw, triple quoted, or regular).
#[derive(Copy, Clone, Debug)]
enum QuoteMetadataKind {
/// A raw string.
///
/// For raw strings it's only possible to change the quotes if the preferred quote style
/// isn't used inside the string, because quotes can't be escaped in raw strings.
Raw { contains_preferred: bool },
/// Regular (non raw) triple quoted string.
///
/// For triple quoted strings it's only possible to change the quotes if no
/// triple of the preferred quotes is used inside the string.
Triple { contains_preferred: bool },
/// A single quoted string that uses either double or single quotes.
///
/// For regular strings it's desired to pick the quote style that requires the least escaping.
/// E.g. pick single quotes for `'A "dog"'` because using double quotes would require escaping
/// the two `"`.
Regular {
/// Number of `'` characters counted in the string's content.
single_quotes: u32,
/// Number of `"` characters counted in the string's content.
double_quotes: u32,
},
}
impl QuoteMetadataKind {
/// For triple quoted strings, the preferred quote style can't be used if the string contains
/// a triple of the quote character (e.g., if double quotes are preferred, double quotes will be
/// used unless the string contains `"""`).
fn triple_quoted(content: &str, preferred_quote: Quote) -> Self {
// True if the string contains a triple quote sequence of the configured quote style.
let mut uses_triple_quotes = false;
let mut chars = input.chars().peekable();
let mut chars = content.chars().peekable();
while let Some(c) = chars.next() {
let preferred_quote_char = preferred_quote.as_char();
@ -369,18 +358,18 @@ fn choose_quotes_impl(
}
}
if uses_triple_quotes {
// String contains a triple quote sequence of the configured quote style.
// Keep the existing quote style.
flags.quote_style()
} else {
preferred_quote
Self::Triple {
contains_preferred: uses_triple_quotes,
}
} else {
}
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style
/// would require fewer escapes.
fn regular(text: &str) -> Self {
let mut single_quotes = 0u32;
let mut double_quotes = 0u32;
for c in input.chars() {
for c in text.chars() {
match c {
'\'' => {
single_quotes += 1;
@ -394,25 +383,105 @@ fn choose_quotes_impl(
}
}
match single_quotes.cmp(&double_quotes) {
Ordering::Less => Quote::Single,
Ordering::Equal => preferred_quote,
Ordering::Greater => Quote::Double,
Self::Regular {
single_quotes,
double_quotes,
}
};
}
flags.with_quote_style(quote)
/// Computes if a raw string uses the preferred quote. If it does, then it's not possible
/// to change the quote style because it would require escaping which isn't possible in raw strings.
///
/// `text` is scanned left to right. A backslash always consumes the character that follows it,
/// so a quote directly after a backslash is never counted.
///
/// * If `triple_quoted` is `false`, any other occurrence of the preferred quote character
///   sets `contains_preferred`.
/// * If `triple_quoted` is `true`, the preferred quote only sets `contains_preferred` when it
///   is the last character of `text`, when two consecutive preferred quotes end `text`, or when
///   three preferred quotes appear in a row.
fn raw(text: &str, preferred: Quote, triple_quoted: bool) -> Self {
let mut chars = text.chars().peekable();
let preferred_quote_char = preferred.as_char();
// The loop evaluates to `true` as soon as a blocking use of the preferred quote is found
// and to `false` once the input is exhausted without finding one.
let contains_unescaped_configured_quotes = loop {
match chars.next() {
Some('\\') => {
// Ignore escaped characters: skip whatever follows the backslash.
chars.next();
}
// `"` or `'`
Some(c) if c == preferred_quote_char => {
// A single-quoted raw string can't contain its own quote character unescaped.
if !triple_quoted {
break true;
}
match chars.peek() {
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
// about where the closing triple quotes start
None => break true,
Some(next) if *next == preferred_quote_char => {
// `""` or `''`
chars.next();
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
// `"""` or `'''` respectively inside the string
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char)
{
break true;
}
}
// A lone preferred quote followed by another character is harmless inside a
// triple quoted string.
_ => {}
}
}
Some(_) => continue,
None => break false,
}
};
Self::Raw {
contains_preferred: contains_unescaped_configured_quotes,
}
}
}
/// A string's content after normalization, together with the flags (prefix and quotes) and
/// the source range needed to write it out.
#[derive(Debug)]
pub(crate) struct NormalizedString<'a> {
/// Holds data about the quotes and prefix of the string
flags: AnyStringFlags,
/// The range of the string's content in the source (minus prefix and quotes).
content_range: TextRange,
/// The normalized text.
///
/// When this is `Cow::Borrowed`, the `Format` implementation writes the source slice at
/// `content_range` rather than this value.
text: Cow<'a, str>,
}
impl<'a> NormalizedString<'a> {
/// Returns the normalized text of the string (without prefix and quotes).
pub(crate) fn text(&self) -> &Cow<'a, str> {
&self.text
}
/// Returns the flags describing the string's prefix and quotes.
pub(crate) fn flags(&self) -> AnyStringFlags {
self.flags
}
}
impl Ranged for NormalizedString<'_> {
/// Returns the range of the string's content in the source, which excludes the prefix
/// and the quotes.
fn range(&self) -> TextRange {
self.content_range
}
}
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
/// Writes the string as prefix, opening quotes, content, closing quotes.
///
/// The quotes and prefix are derived from `self.flags`.
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
let quotes = StringQuotes::from(self.flags);
ruff_formatter::write!(f, [self.flags.prefix(), quotes])?;
match &self.text {
// Borrowed text is written as a slice of the source at the content range
// (presumably because it is unchanged from the source; confirm at the construction site).
Cow::Borrowed(_) => source_text_slice(self.range()).fmt(f)?,
// Owned text differs from the source and is written from the normalized buffer.
Cow::Owned(normalized) => text(normalized).fmt(f)?,
}
quotes.fmt(f)
}
}
/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input`
/// with the provided [`StringQuotes`] style.
///
/// Returns the normalized string. The input is returned borrowed if it is already normalized.
pub(crate) fn normalize_string(
input: &str,
start_offset: usize,
flags: AnyStringFlags,
format_fstring: bool,
new_flags: AnyStringFlags,
format_f_string: bool,
) -> Cow<str> {
// The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized.
@ -421,18 +490,19 @@ pub(crate) fn normalize_string(
// If `last_index` is `0` at the end, then the input is already normalized and can be returned as is.
let mut last_index = 0;
let quote = flags.quote_style();
let quote = new_flags.quote_style();
let preferred_quote = quote.as_char();
let opposite_quote = quote.opposite().as_char();
let mut chars = CharIndicesWithOffset::new(input, start_offset).peekable();
let is_raw = flags.is_raw_string();
let is_fstring = !format_fstring && flags.is_f_string();
let is_raw = new_flags.is_raw_string();
let is_fstring = !format_f_string && new_flags.is_f_string();
let mut formatted_value_nesting = 0u32;
while let Some((index, c)) = chars.next() {
if is_fstring && matches!(c, '{' | '}') {
if matches!(c, '{' | '}') && is_fstring {
if chars.peek().copied().is_some_and(|(_, next)| next == c) {
// Skip over the second character of the double braces
chars.next();
@ -444,6 +514,7 @@ pub(crate) fn normalize_string(
}
continue;
}
if c == '\r' {
output.push_str(&input[last_index..index]);
@ -466,8 +537,10 @@ pub(crate) fn normalize_string(
} else {
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
if let Some(normalised) = UnicodeEscape::new(next, !flags.is_byte_string())
.and_then(|escape| escape.normalize(&input[index + escape_start_len..]))
if let Some(normalised) =
UnicodeEscape::new(next, !new_flags.is_byte_string()).and_then(
|escape| escape.normalize(&input[index + escape_start_len..]),
)
{
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
@ -485,7 +558,7 @@ pub(crate) fn normalize_string(
}
}
if !flags.is_triple_quoted() {
if !new_flags.is_triple_quoted() {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
@ -498,7 +571,7 @@ pub(crate) fn normalize_string(
}
}
}
} else if !flags.is_triple_quoted()
} else if !new_flags.is_triple_quoted()
&& c == preferred_quote
&& formatted_value_nesting == 0
{
@ -511,14 +584,12 @@ pub(crate) fn normalize_string(
}
}
let normalized = if last_index == 0 {
if last_index == 0 {
Cow::Borrowed(input)
} else {
output.push_str(&input[last_index..]);
Cow::Owned(output)
};
normalized
}
}
#[derive(Clone, Debug)]
@ -671,14 +742,14 @@ impl UnicodeEscape {
mod tests {
use std::borrow::Cow;
use super::UnicodeEscape;
use crate::string::normalize_string;
use ruff_python_ast::{
str::Quote,
str_prefix::{AnyStringPrefix, ByteStringPrefix},
AnyStringFlags,
};
use super::{normalize_string, UnicodeEscape};
#[test]
fn normalize_32_escape() {
let escape_sequence = UnicodeEscape::new('U', true).unwrap();