ruff/crates/ruff_python_formatter/tests/normalizer.rs
Brent Westbrook 23c98849fc
Preserve quotes in generated f-strings (#15794)
## Summary

This is another follow-up to #15726 and #15778, extending the
quote-preserving behavior to f-strings and deleting the now-unused
`Generator::quote` field.

## Details
I also made one unrelated change to `rules/flynt/helpers.rs` to remove a
`to_string` call for making a `Box<str>` and tweaked some arguments to
some of the `Generator::unparse_f_string` methods to make the code
easier to follow, in my opinion. Happy to revert especially the latter
of these if needed.

Unfortunately this still does not fix the issue in #9660, which appears
to be more of an escaping issue than a quote-preservation issue. After
#15726, the result is now `a = f'# {"".join([])}' if 1 else ""` instead
of `a = f"# {''.join([])}" if 1 else ""` (single quotes on the outside
now), but we still don't have the desired behavior of double quotes
everywhere on Python 3.12+. I added a test for this but split it off
into another branch since it ended up being unaddressed here, but my
`dbg!` statements showed the correct preferred quotes going into
[`UnicodeEscape::with_preferred_quote`](https://github.com/astral-sh/ruff/blob/main/crates/ruff_python_literal/src/escape.rs#L54).

## Test Plan

Existing rule and `Generator` tests.

---------

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
2025-01-29 13:28:22 -05:00

259 lines
11 KiB
Rust

use std::sync::LazyLock;
use {
itertools::Either::{Left, Right},
regex::Regex,
};
use ruff_python_ast::visitor::transformer::Transformer;
use ruff_python_ast::{
self as ast, BytesLiteralFlags, Expr, FStringElement, FStringFlags, FStringLiteralElement,
FStringPart, Stmt, StringFlags,
};
use ruff_python_ast::{visitor::transformer, StringLiteralFlags};
use ruff_text_size::{Ranged, TextRange};
/// A struct to normalize AST nodes for the purpose of comparing formatted representations for
/// semantic equivalence.
///
/// Vis-à-vis comparing ASTs, comparing these normalized representations does the following:
/// - Ignores non-abstraction information that we've encoded into the AST, e.g., the difference
///   between `class C: ...` and `class C(): ...`, which is part of our AST but not `CPython`'s.
/// - Joins implicitly concatenated string, bytes and f-string literals into a single literal
///   when none of the parts is triple-quoted or raw, because the formatter may join them too.
/// - Normalizes strings. The formatter can re-indent docstrings, so we need to compare string
///   contents ignoring whitespace. (Black does the same.)
/// - The formatter can also reformat code snippets when they're Python code, which can of
///   course change the string in arbitrary ways. Black itself does not reformat code snippets,
///   so we carve our own path here by stripping everything that looks like code snippets from
///   string literals.
/// - Ignores nested tuples in deletions. (Black does the same.)
///
/// The type carries no state; all of the work happens in its `Transformer` implementation.
pub(crate) struct Normalizer;
impl Normalizer {
    /// Normalizes a whole parsed module in place.
    ///
    /// A statement-list module has its body walked statement by statement, while an
    /// expression-only module has its single root expression visited.
    // NOTE(review): presumably not every test target that includes this file calls this
    // method, hence the lint suppression; confirm before removing it.
    #[allow(dead_code)]
    pub(crate) fn visit_module(&self, module: &mut ast::Mod) {
        match module {
            ast::Mod::Expression(root) => self.visit_expr(&mut root.body),
            ast::Mod::Module(root) => self.visit_body(&mut root.body),
        }
    }
}
impl Transformer for Normalizer {
/// Normalizes a single statement and then recurses into its children.
///
/// For `del` statements, every target that is a tuple is replaced by the tuple's elements
/// (one level deep), so that `del a, b` and `del (a, b)` normalize to the same target list.
/// All other statements are passed through to the default walker unchanged.
fn visit_stmt(&self, stmt: &mut Stmt) {
    if let Stmt::Delete(delete) = stmt {
        // Treat `del a, b` and `del (a, b)` equivalently.
        //
        // The target list is rebuilt from scratch, so move the old vector out instead of
        // deep-cloning every target expression just to overwrite the field afterwards.
        delete.targets = std::mem::take(&mut delete.targets)
            .into_iter()
            .flat_map(|target| {
                if let Expr::Tuple(tuple) = target {
                    Left(tuple.elts.into_iter())
                } else {
                    Right(std::iter::once(target))
                }
            })
            .collect();
    }
    transformer::walk_stmt(self, stmt);
}
fn visit_expr(&self, expr: &mut Expr) {
// Ruff supports joining implicitly concatenated strings. The code below implements this
// at an AST level by joining the string literals in the AST if they can be joined (it doesn't mean that
// they'll be joined in the formatted output but they could).
// Comparable expression handles some of this by comparing the concatenated string
// but not joining here doesn't play nicely with other string normalizations done in the
// Normalizer.
match expr {
Expr::StringLiteral(string) => {
if string.value.is_implicit_concatenated() {
let can_join = string.value.iter().all(|literal| {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
});
if can_join {
string.value = ast::StringLiteralValue::single(ast::StringLiteral {
value: Box::from(string.value.to_str()),
range: string.range,
flags: StringLiteralFlags::empty(),
});
}
}
}
Expr::BytesLiteral(bytes) => {
if bytes.value.is_implicit_concatenated() {
let can_join = bytes.value.iter().all(|literal| {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
});
if can_join {
bytes.value = ast::BytesLiteralValue::single(ast::BytesLiteral {
value: bytes.value.bytes().collect(),
range: bytes.range,
flags: BytesLiteralFlags::empty(),
});
}
}
}
Expr::FString(fstring) => {
if fstring.value.is_implicit_concatenated() {
let can_join = fstring.value.iter().all(|part| match part {
FStringPart::Literal(literal) => {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
}
FStringPart::FString(string) => {
!string.flags.is_triple_quoted() && !string.flags.prefix().is_raw()
}
});
if can_join {
#[derive(Default)]
struct Collector {
elements: Vec<FStringElement>,
}
impl Collector {
// The logic for concatenating adjacent string literals
// occurs here, implicitly: when we encounter a sequence
// of string literals, the first gets pushed to the
// `elements` vector, while subsequent strings
// are concatenated onto this top string.
fn push_literal(&mut self, literal: &str, range: TextRange) {
if let Some(FStringElement::Literal(existing_literal)) =
self.elements.last_mut()
{
let value = std::mem::take(&mut existing_literal.value);
let mut value = value.into_string();
value.push_str(literal);
existing_literal.value = value.into_boxed_str();
existing_literal.range =
TextRange::new(existing_literal.start(), range.end());
} else {
self.elements.push(FStringElement::Literal(
FStringLiteralElement {
range,
value: literal.into(),
},
));
}
}
fn push_expression(
&mut self,
expression: ast::FStringExpressionElement,
) {
self.elements.push(FStringElement::Expression(expression));
}
}
let mut collector = Collector::default();
for part in &fstring.value {
match part {
ast::FStringPart::Literal(string_literal) => {
collector
.push_literal(&string_literal.value, string_literal.range);
}
ast::FStringPart::FString(fstring) => {
for element in &fstring.elements {
match element {
ast::FStringElement::Literal(literal) => {
collector
.push_literal(&literal.value, literal.range);
}
ast::FStringElement::Expression(expression) => {
collector.push_expression(expression.clone());
}
}
}
}
}
}
fstring.value = ast::FStringValue::single(ast::FString {
elements: collector.elements.into(),
range: fstring.range,
flags: FStringFlags::empty(),
});
}
}
}
_ => {}
}
transformer::walk_expr(self, expr);
}
/// Normalizes the contents of a string literal in place.
///
/// Anything that looks like an embedded code snippet (doctest prompts, everything after an
/// reStructuredText `::` marker, Markdown fences) is replaced by a fixed placeholder, and the
/// remaining text is compared modulo per-line and surrounding whitespace.
fn visit_string_literal(&self, string_literal: &mut ast::StringLiteral) {
    static STRIP_DOC_TESTS: LazyLock<Regex> = LazyLock::new(|| {
        Regex::new(
            r"(?mx)
(
# strip doctest PS1 prompt lines
^\s*>>>\s.*(\n|$)
|
# strip doctest PS2 prompt lines
# Also handles the case of an empty ... line.
^\s*\.\.\.((\n|$)|\s.*(\n|$))
)+
",
        )
        .unwrap()
    });
    static STRIP_RST_BLOCKS: LazyLock<Regex> = LazyLock::new(|| {
        // Reliably recognizing a reStructuredText block with a plain regex is tricky
        // (likely impossible), so this only looks for the start of a block and drops
        // everything that follows it. A blunt tool, but sufficient here.
        Regex::new(r"::(?s:.*)").unwrap()
    });
    static STRIP_MARKDOWN_BLOCKS: LazyLock<Regex> = LazyLock::new(|| {
        // Matches more than just well-formed Markdown fences, which is acceptable.
        Regex::new(r"(```|~~~)\p{any}*(```|~~~|$)").unwrap()
    });

    // Step (1): replace everything that looks like a code snippet with a placeholder, since
    // snippets may be reformatted beyond recognition when they are Python code. The passes
    // run in this exact order, each one on the output of the previous one.
    let snippet_filters: [(&Regex, &str); 3] = [
        (
            &*STRIP_DOC_TESTS,
            "<DOCTEST-CODE-SNIPPET: Removed by normalizer>\n",
        ),
        (
            &*STRIP_RST_BLOCKS,
            "<RSTBLOCK-CODE-SNIPPET: Removed by normalizer>\n",
        ),
        (
            &*STRIP_MARKDOWN_BLOCKS,
            "<MARKDOWN-CODE-SNIPPET: Removed by normalizer>\n",
        ),
    ];
    let mut text = string_literal.value.to_string();
    for (pattern, placeholder) in snippet_filters {
        text = pattern.replace_all(&text, placeholder).into_owned();
    }

    // Step (2): strip leading and trailing whitespace from every line.
    let stripped_lines: Vec<&str> = text.lines().map(str::trim).collect();
    // Step (3): re-join the lines and drop blank lines at the start and end of the string.
    string_literal.value = Box::from(stripped_lines.join("\n").trim());
}
}