mirror of
https://github.com/astral-sh/ruff.git
synced 2025-07-19 11:05:24 +00:00

(This is not possible to actually use until https://github.com/astral-sh/ruff/pull/8854 is merged.) This commit slots in support for formatting Markdown fenced code blocks[1]. With the refactoring done for reStructuredText previously, this ended up being pretty easy to add. Markdown code blocks are also quite a bit easier to parse and recognize correctly. One point of contention in #8860 is whether to assume that unlabeled Markdown code fences are Python or not by default. In this PR, we make such an assumption. This follows what `rustdoc` does. The mitigation here is that if an unlabeled code block isn't Python, then it probably won't parse as Python. And we'll end up skipping it. So in the vast majority of cases, the worst thing that can happen is a little bit of wasted work. Closes #8860 [1]: https://spec.commonmark.org/0.30/#fenced-code-blocks
122 lines
4.7 KiB
Rust
122 lines
4.7 KiB
Rust
use {
|
|
itertools::Either::{Left, Right},
|
|
once_cell::sync::Lazy,
|
|
regex::Regex,
|
|
};
|
|
|
|
use ruff_python_ast::visitor::transformer;
|
|
use ruff_python_ast::visitor::transformer::Transformer;
|
|
use ruff_python_ast::{self as ast, Expr, Stmt};
|
|
|
|
/// A struct to normalize AST nodes for the purpose of comparing formatted representations for
|
|
/// semantic equivalence.
|
|
///
|
|
/// Vis-à-vis comparing ASTs, comparing these normalized representations does the following:
|
|
/// - Ignores non-abstraction information that we've encoded into the AST, e.g., the difference
|
|
/// between `class C: ...` and `class C(): ...`, which is part of our AST but not `CPython`'s.
|
|
/// - Normalize strings. The formatter can re-indent docstrings, so we need to compare string
|
|
/// contents ignoring whitespace. (Black does the same.)
|
|
/// - The formatter can also reformat code snippets when they're Python code, which can of
|
|
/// course change the string in arbitrary ways. Black itself does not reformat code snippets,
|
|
/// so we carve our own path here by stripping everything that looks like code snippets from
|
|
/// string literals.
|
|
/// - Ignores nested tuples in deletions. (Black does the same.)
|
|
pub(crate) struct Normalizer;
|
|
|
|
impl Normalizer {
|
|
/// Transform an AST module into a normalized representation.
|
|
#[allow(dead_code)]
|
|
pub(crate) fn visit_module(&self, module: &mut ast::Mod) {
|
|
match module {
|
|
ast::Mod::Module(module) => {
|
|
self.visit_body(&mut module.body);
|
|
}
|
|
ast::Mod::Expression(expression) => {
|
|
self.visit_expr(&mut expression.body);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Transformer for Normalizer {
|
|
fn visit_stmt(&self, stmt: &mut Stmt) {
|
|
if let Stmt::Delete(delete) = stmt {
|
|
// Treat `del a, b` and `del (a, b)` equivalently.
|
|
delete.targets = delete
|
|
.targets
|
|
.clone()
|
|
.into_iter()
|
|
.flat_map(|target| {
|
|
if let Expr::Tuple(tuple) = target {
|
|
Left(tuple.elts.into_iter())
|
|
} else {
|
|
Right(std::iter::once(target))
|
|
}
|
|
})
|
|
.collect();
|
|
}
|
|
|
|
transformer::walk_stmt(self, stmt);
|
|
}
|
|
|
|
fn visit_string_literal(&self, string_literal: &mut ast::StringLiteral) {
|
|
static STRIP_DOC_TESTS: Lazy<Regex> = Lazy::new(|| {
|
|
Regex::new(
|
|
r#"(?mx)
|
|
(
|
|
# strip doctest PS1 prompt lines
|
|
^\s*>>>\s.*(\n|$)
|
|
|
|
|
# strip doctest PS2 prompt lines
|
|
# Also handles the case of an empty ... line.
|
|
^\s*\.\.\.((\n|$)|\s.*(\n|$))
|
|
)+
|
|
"#,
|
|
)
|
|
.unwrap()
|
|
});
|
|
static STRIP_RST_BLOCKS: Lazy<Regex> = Lazy::new(|| {
|
|
// This is kind of unfortunate, but it's pretty tricky (likely
|
|
// impossible) to detect a reStructuredText block with a simple
|
|
// regex. So we just look for the start of a block and remove
|
|
// everything after it. Talk about a hammer.
|
|
Regex::new(r#"::(?s:.*)"#).unwrap()
|
|
});
|
|
static STRIP_MARKDOWN_BLOCKS: Lazy<Regex> = Lazy::new(|| {
|
|
// This covers more than valid Markdown blocks, but that's OK.
|
|
Regex::new(r#"(```|~~~)\p{any}*(```|~~~|$)"#).unwrap()
|
|
});
|
|
|
|
// Start by (1) stripping everything that looks like a code
|
|
// snippet, since code snippets may be completely reformatted if
|
|
// they are Python code.
|
|
string_literal.value = STRIP_DOC_TESTS
|
|
.replace_all(
|
|
&string_literal.value,
|
|
"<DOCTEST-CODE-SNIPPET: Removed by normalizer>\n",
|
|
)
|
|
.into_owned();
|
|
string_literal.value = STRIP_RST_BLOCKS
|
|
.replace_all(
|
|
&string_literal.value,
|
|
"<RSTBLOCK-CODE-SNIPPET: Removed by normalizer>\n",
|
|
)
|
|
.into_owned();
|
|
string_literal.value = STRIP_MARKDOWN_BLOCKS
|
|
.replace_all(
|
|
&string_literal.value,
|
|
"<MARKDOWN-CODE-SNIPPET: Removed by normalizer>\n",
|
|
)
|
|
.into_owned();
|
|
// Normalize a string by (2) stripping any leading and trailing space from each
|
|
// line, and (3) removing any blank lines from the start and end of the string.
|
|
string_literal.value = string_literal
|
|
.value
|
|
.lines()
|
|
.map(str::trim)
|
|
.collect::<Vec<_>>()
|
|
.join("\n")
|
|
.trim()
|
|
.to_owned();
|
|
}
|
|
}
|