// ruff/crates/ruff_python_formatter/tests/normalizer.rs

use std::sync::LazyLock;
use {
itertools::Either::{Left, Right},
regex::Regex,
};
use ruff_python_ast::{
self as ast, BytesLiteralFlags, Expr, FStringFlags, FStringPart, InterpolatedStringElement,
InterpolatedStringLiteralElement, Stmt, StringFlags,
};
use ruff_python_ast::{AtomicNodeIndex, visitor::transformer::Transformer};
use ruff_python_ast::{StringLiteralFlags, visitor::transformer};
use ruff_text_size::{Ranged, TextRange};

/// A struct to normalize AST nodes for the purpose of comparing formatted representations for
/// semantic equivalence.
///
/// Vis-à-vis comparing ASTs, comparing these normalized representations does the following:
/// - Ignores non-abstraction information that we've encoded into the AST, e.g., the difference
/// between `class C: ...` and `class C(): ...`, which is part of our AST but not `CPython`'s.
/// - Normalizes strings. The formatter can re-indent docstrings, so we need to compare string
/// contents ignoring whitespace. (Black does the same.)
/// - The formatter can also reformat code snippets when they're Python code, which can of
/// course change the string in arbitrary ways. Black itself does not reformat code snippets,
/// so we carve our own path here by stripping everything that looks like code snippets from
/// string literals.
/// - Ignores nested tuples in deletions. (Black does the same.)
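///
/// For example (illustrative, not exhaustive): `del (a, b)` normalizes to the same
/// representation as `del a, b`, and the implicitly concatenated `"foo" "bar"` normalizes to
/// the same representation as the single literal `"foobar"`.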
pub(crate) struct Normalizer;

impl Normalizer {
/// Transform an AST module into a normalized representation.
#[allow(dead_code)]
pub(crate) fn visit_module(&self, module: &mut ast::Mod) {
match module {
ast::Mod::Module(module) => {
self.visit_body(&mut module.body);
}
ast::Mod::Expression(expression) => {
self.visit_expr(&mut expression.body);
}
}
}
}
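
// A rough usage sketch for these tests (the `parse` and `format` helpers named below are
// illustrative stand-ins, not this crate's exact test harness):
//
//     let mut expected = parse(source);        // `ast::Mod` for the original source
//     let mut actual = parse(format(source));  // `ast::Mod` for the formatted output
//     Normalizer.visit_module(&mut expected);
//     Normalizer.visit_module(&mut actual);
//     // ...then compare the two normalized modules (e.g. via `ComparableMod`).
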
impl Transformer for Normalizer {
fn visit_stmt(&self, stmt: &mut Stmt) {
if let Stmt::Delete(delete) = stmt {
// Treat `del a, b` and `del (a, b)` equivalently.
delete.targets = delete
.targets
.clone()
.into_iter()
.flat_map(|target| {
if let Expr::Tuple(tuple) = target {
Left(tuple.elts.into_iter())
} else {
Right(std::iter::once(target))
}
})
.collect();
}
transformer::walk_stmt(self, stmt);
    }

fn visit_expr(&self, expr: &mut Expr) {
        // Ruff supports joining implicitly concatenated strings. The code below implements this
        // at the AST level by joining the string literals in the AST whenever they can be joined
        // (which doesn't mean they'll be joined in the formatted output, only that they could be).
        // The comparable-expression representation handles some of this by comparing the
        // concatenated string, but not joining here doesn't play nicely with the other string
        // normalizations done in the `Normalizer`.
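        //
        // For example (illustrative): `"foo" "bar"` is joined into the single literal `"foobar"`,
        // while `r"foo" "bar"` and triple-quoted parts are left alone (see the `can_join` checks
        // below).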
match expr {
Expr::StringLiteral(string) => {
if string.value.is_implicit_concatenated() {
let can_join = string.value.iter().all(|literal| {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
});
if can_join {
string.value = ast::StringLiteralValue::single(ast::StringLiteral {
value: Box::from(string.value.to_str()),
range: string.range,
flags: StringLiteralFlags::empty(),
node_index: AtomicNodeIndex::dummy(),
});
}
}
}
Expr::BytesLiteral(bytes) => {
if bytes.value.is_implicit_concatenated() {
let can_join = bytes.value.iter().all(|literal| {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
});
if can_join {
bytes.value = ast::BytesLiteralValue::single(ast::BytesLiteral {
value: bytes.value.bytes().collect(),
range: bytes.range,
flags: BytesLiteralFlags::empty(),
node_index: AtomicNodeIndex::dummy(),
});
}
}
}
Expr::FString(fstring) => {
if fstring.value.is_implicit_concatenated() {
let can_join = fstring.value.iter().all(|part| match part {
FStringPart::Literal(literal) => {
!literal.flags.is_triple_quoted() && !literal.flags.prefix().is_raw()
}
FStringPart::FString(string) => {
!string.flags.is_triple_quoted() && !string.flags.prefix().is_raw()
}
});
if can_join {
#[derive(Default)]
struct Collector {
elements: Vec<InterpolatedStringElement>,
}
impl Collector {
// The logic for concatenating adjacent string literals
// occurs here, implicitly: when we encounter a sequence
// of string literals, the first gets pushed to the
// `elements` vector, while subsequent strings
// are concatenated onto this top string.
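                            //
                            // For example (illustrative): for `f"a" f"b{x}c" "d"`, the collector
                            // ends up with a literal `"ab"`, the interpolation `x`, and a
                            // literal `"cd"`.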
fn push_literal(&mut self, literal: &str, range: TextRange) {
if let Some(InterpolatedStringElement::Literal(existing_literal)) =
self.elements.last_mut()
{
let value = std::mem::take(&mut existing_literal.value);
let mut value = value.into_string();
value.push_str(literal);
existing_literal.value = value.into_boxed_str();
existing_literal.range =
TextRange::new(existing_literal.start(), range.end());
} else {
self.elements.push(InterpolatedStringElement::Literal(
InterpolatedStringLiteralElement {
range,
value: literal.into(),
node_index: AtomicNodeIndex::dummy(),
},
));
}
}
fn push_expression(&mut self, expression: ast::InterpolatedElement) {
self.elements
.push(InterpolatedStringElement::Interpolation(expression));
}
}
let mut collector = Collector::default();
for part in &fstring.value {
match part {
ast::FStringPart::Literal(string_literal) => {
collector
.push_literal(&string_literal.value, string_literal.range);
}
ast::FStringPart::FString(fstring) => {
for element in &fstring.elements {
match element {
ast::InterpolatedStringElement::Literal(literal) => {
collector
.push_literal(&literal.value, literal.range);
}
ast::InterpolatedStringElement::Interpolation(
expression,
) => {
collector.push_expression(expression.clone());
}
}
}
}
}
}
fstring.value = ast::FStringValue::single(ast::FString {
elements: collector.elements.into(),
range: fstring.range,
flags: FStringFlags::empty(),
node_index: AtomicNodeIndex::dummy(),
});
}
}
}
_ => {}
}
transformer::walk_expr(self, expr);
    }

fn visit_interpolated_string_element(
&self,
interpolated_string_element: &mut InterpolatedStringElement,
) {
let InterpolatedStringElement::Interpolation(interpolation) = interpolated_string_element
else {
return;
};
let Some(debug) = &mut interpolation.debug_text else {
return;
};
// Changing the newlines to the configured newline is okay because Python normalizes all newlines to `\n`
debug.leading = debug.leading.replace("\r\n", "\n").replace('\r', "\n");
debug.trailing = debug.trailing.replace("\r\n", "\n").replace('\r', "\n");
    }

fn visit_string_literal(&self, string_literal: &mut ast::StringLiteral) {
static STRIP_DOC_TESTS: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?mx)
(
# strip doctest PS1 prompt lines
^\s*>>>\s.*(\n|$)
|
# strip doctest PS2 prompt lines
# Also handles the case of an empty ... line.
^\s*\.\.\.((\n|$)|\s.*(\n|$))
)+
",
)
.unwrap()
});
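        // For example (illustrative), `STRIP_DOC_TESTS` above matches a run of doctest prompt
        // lines such as:
        //
        //     >>> f(1, 2)
        //     ...     3)
        //
        // and the run is replaced wholesale by the doctest placeholder below.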
static STRIP_RST_BLOCKS: LazyLock<Regex> = LazyLock::new(|| {
// This is kind of unfortunate, but it's pretty tricky (likely
// impossible) to detect a reStructuredText block with a simple
// regex. So we just look for the start of a block and remove
// everything after it. Talk about a hammer.
Regex::new(r"::(?s:.*)").unwrap()
});
static STRIP_MARKDOWN_BLOCKS: LazyLock<Regex> = LazyLock::new(|| {
// This covers more than valid Markdown blocks, but that's OK.
Regex::new(r"(```|~~~)\p{any}*(```|~~~|$)").unwrap()
});
// Start by (1) stripping everything that looks like a code
// snippet, since code snippets may be completely reformatted if
// they are Python code.
string_literal.value = STRIP_DOC_TESTS
.replace_all(
&string_literal.value,
"<DOCTEST-CODE-SNIPPET: Removed by normalizer>\n",
)
.into_owned()
.into_boxed_str();
string_literal.value = STRIP_RST_BLOCKS
.replace_all(
&string_literal.value,
"<RSTBLOCK-CODE-SNIPPET: Removed by normalizer>\n",
)
.into_owned()
.into_boxed_str();
string_literal.value = STRIP_MARKDOWN_BLOCKS
.replace_all(
&string_literal.value,
"<MARKDOWN-CODE-SNIPPET: Removed by normalizer>\n",
)
.into_owned()
.into_boxed_str();
// Normalize a string by (2) stripping any leading and trailing space from each
// line, and (3) removing any blank lines from the start and end of the string.
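        //
        // For example (illustrative): "  a\n\n   b  \n" becomes "a\n\nb".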
string_literal.value = string_literal
.value
.lines()
.map(str::trim)
.collect::<Vec<_>>()
.join("\n")
.trim()
.to_owned()
.into_boxed_str();
}
}