mirror of
https://github.com/astral-sh/ruff.git
synced 2025-09-27 04:19:18 +00:00

## Summary Enable using the new `Mode::Jupyter` for the tokenizer/parser to parse Jupyter line magic tokens. The individual call to the lexer i.e., `lex_starts_at` done by various rules should consider the context of the source code (is this content from a Jupyter Notebook?). Thus, a new field `source_type` (of type `PySourceType`) is added to `Checker` which is being passed around as an argument to the relevant functions. This is then used to determine the `Mode` for the lexer. ## Test Plan Add new test cases to make sure that the magic statement is considered while generating the diagnostic and autofix: * For `I001`, if there's a magic statement in between two import blocks, they should be sorted independently fixes: #6090
493 lines
16 KiB
Rust
493 lines
16 KiB
Rust
//! Minimize a failing python file, a process known as [shrinking](https://www.educative.io/answers/what-is-shrinking)
|
|
//!
|
|
//! See `--help` (or the [Args] docs) for usage.
|
|
//!
|
|
//! ## Example
|
|
//!
|
|
//! Shrink a file with unstable formatting
|
|
//! ```shell
|
|
//! cargo run --bin ruff-minimizer -- target/cpython/Lib/test/inspect_fodder.py target/minirepo/a.py "Unstable formatting" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
|
|
//! ```
|
|
//! This could emit
|
|
//! ```python
|
|
//! class WhichComments: # before f return 1 # end f # after f # before asyncf - line 108
|
|
//! async def asyncf(self): return 2
|
|
//! # end asyncf # end of WhichComments - line 114# a closing parenthesis with the opening paren being in another line
|
|
//! (
|
|
//! );
|
|
//! ```
|
|
//!
|
|
//! Shrink a file with a syntax error
|
|
//! ```shell
|
|
//! cargo run --bin ruff_shrinking -- target/checkouts/jhnnsrs:mikro-napari/mikro_napari/models/representation.py target/minirepo/code.py "invalid syntax" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
|
|
//! ```
|
|
//! This could emit
|
|
//! ```python
|
|
//! class RepresentationQtModel():
|
|
//! data[:,] = rep.data
|
|
//! ```
|
|
|
|
#![allow(clippy::print_stdout, clippy::print_stderr)]
|
|
|
|
use anyhow::{Context, Result};
|
|
use clap::Parser;
|
|
use fs_err as fs;
|
|
use regex::Regex;
|
|
use ruff_python_ast::statement_visitor::{walk_body, walk_stmt, StatementVisitor};
|
|
use ruff_python_ast::visitor::{walk_expr, Visitor};
|
|
use ruff_python_ast::{Expr, Ranged, Stmt, Suite};
|
|
use ruff_python_parser::Mode;
|
|
use ruff_text_size::TextRange;
|
|
use std::collections::HashMap;
|
|
use std::path::{Path, PathBuf};
|
|
use std::process::{Command, ExitCode};
|
|
use std::str;
|
|
use std::time::Instant;
|
|
use tracing::debug;
|
|
|
|
const STRATEGIES: &[&dyn Strategy] = &[
|
|
(&StrategyRemoveModuleMember),
|
|
(&StrategyRemoveStatement),
|
|
(&StrategyRemoveExpression),
|
|
(&StrategyRemoveLine),
|
|
(&StrategyRemoveNewline),
|
|
(&StrategyRemoveToken),
|
|
(&StrategyRemoveChar),
|
|
];
|
|
|
|
/// Each strategy is a way of producing possible minimizations
|
|
trait Strategy {
|
|
fn name(&self) -> &'static str;
|
|
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>>;
|
|
}
|
|
|
|
/// vtable surrogate trait that rust wants
|
|
trait ExactSizeStringIter: Iterator<Item = String> + ExactSizeIterator {}
|
|
|
|
impl<T> ExactSizeStringIter for T where T: Iterator<Item = String> + ExactSizeIterator {}
|
|
|
|
/// Remove a top level member from a module. Generally the most effective strategy since most
|
|
/// top level items will be unrelated to the error.
|
|
struct StrategyRemoveModuleMember;
|
|
|
|
impl Strategy for StrategyRemoveModuleMember {
|
|
fn name(&self) -> &'static str {
|
|
"remove module member"
|
|
}
|
|
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let iter = ast.iter().map(|stmt| {
|
|
// trim the newlines the range misses
|
|
input[..stmt.range().start().to_usize()]
|
|
.trim_end()
|
|
.to_string()
|
|
+ input[stmt.range().end().to_usize()..].trim_start()
|
|
});
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Finds the ranges of all statements.
|
|
#[derive(Default)]
|
|
struct StatementCollector {
|
|
/// The ranges of all statements
|
|
ranges: Vec<TextRange>,
|
|
}
|
|
|
|
impl StatementVisitor<'_> for StatementCollector {
|
|
fn visit_body(&mut self, body: &[Stmt]) {
|
|
if let (Some(first), Some(last)) = (body.first(), body.last()) {
|
|
if !(first == last && matches!(first, Stmt::Pass(_))) {
|
|
self.ranges.push(TextRange::new(first.start(), last.end()));
|
|
}
|
|
}
|
|
walk_body(self, body);
|
|
}
|
|
|
|
fn visit_stmt(&mut self, stmt: &Stmt) {
|
|
if !matches!(stmt, Stmt::Pass(_)) {
|
|
self.ranges.push(stmt.range());
|
|
}
|
|
walk_stmt(self, stmt);
|
|
}
|
|
}
|
|
|
|
/// Try to remove each statement or replace it statement with `pass`
|
|
fn strategy_statement<'a>(
|
|
input: &'a str,
|
|
ast: &'a Suite,
|
|
pass_dummy: bool,
|
|
) -> Box<dyn ExactSizeStringIter + 'a> {
|
|
let mut visitor = StatementCollector::default();
|
|
visitor.visit_body(ast);
|
|
|
|
// Remove the largest statements first
|
|
let mut ranges = visitor.ranges;
|
|
ranges.sort_by_key(|range| range.len());
|
|
ranges.reverse();
|
|
|
|
let iter = ranges.into_iter().map(move |range| {
|
|
let mut without_stmt = String::new();
|
|
// trim the newlines the range misses
|
|
without_stmt.push_str(input[..range.start().to_usize()].trim_end());
|
|
if pass_dummy {
|
|
without_stmt.push_str("pass");
|
|
}
|
|
without_stmt.push_str(&input[range.end().to_usize()..]);
|
|
without_stmt
|
|
});
|
|
Box::new(iter)
|
|
}
|
|
|
|
struct StrategyRemoveStatement;
|
|
|
|
impl Strategy for StrategyRemoveStatement {
|
|
fn name(&self) -> &'static str {
|
|
"remove statement"
|
|
}
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
Ok(strategy_statement(input, ast, false))
|
|
}
|
|
}
|
|
|
|
/// Finds the ranges of all expressions.
|
|
#[derive(Default)]
|
|
struct ExpressionCollector {
|
|
/// The ranges of all expressions
|
|
ranges: Vec<TextRange>,
|
|
}
|
|
|
|
impl Visitor<'_> for ExpressionCollector {
|
|
fn visit_expr(&mut self, expr: &Expr) {
|
|
self.ranges.push(expr.range());
|
|
walk_expr(self, expr);
|
|
}
|
|
}
|
|
|
|
struct StrategyRemoveExpression;
|
|
|
|
impl Strategy for StrategyRemoveExpression {
|
|
fn name(&self) -> &'static str {
|
|
"remove expression"
|
|
}
|
|
|
|
/// Try to remove each expression
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let mut visitor = ExpressionCollector::default();
|
|
visitor.visit_body(ast);
|
|
let iter = visitor.ranges.into_iter().map(move |range| {
|
|
input[TextRange::up_to(range.start())].to_string() + &input[range.end().to_usize()..]
|
|
});
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Remove each line (physical lines, not logical lines).
|
|
struct StrategyRemoveLine;
|
|
|
|
impl Strategy for StrategyRemoveLine {
|
|
fn name(&self) -> &'static str {
|
|
"remove line"
|
|
}
|
|
|
|
/// Returns the number of permutations and each permutation
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
_ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let lines: Vec<_> = input.lines().collect();
|
|
let iter = (0..lines.len()).map(move |removed_line| {
|
|
let mut result = String::new();
|
|
result.push_str(&lines[..removed_line].join("\n"));
|
|
if removed_line > 0 {
|
|
result.push('\n');
|
|
}
|
|
result.push_str(&lines[removed_line + 1..].join("\n"));
|
|
result
|
|
});
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Try removing newline characters
|
|
struct StrategyRemoveNewline;
|
|
|
|
impl Strategy for StrategyRemoveNewline {
|
|
fn name(&self) -> &'static str {
|
|
"remove newline"
|
|
}
|
|
|
|
/// Returns the number of permutations and each permutation
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
_ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let newline_positions: Vec<_> = input
|
|
.char_indices()
|
|
.filter_map(|(pos, char)| {
|
|
// Don't remove newlines after `:`. Indexing is save because pos > 0 is checked and
|
|
// pos - 1 is behind a position we know exists and we're indexing into bytes instead
|
|
// of chars
|
|
if char == '\n' && pos > 0 && input.as_bytes()[pos - 1] != b':' {
|
|
Some(pos)
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
.collect();
|
|
let iter = newline_positions.into_iter().map(move |newline_position| {
|
|
// trim to remove the indentation
|
|
input[..newline_position].to_string()
|
|
+ input[newline_position + '\n'.len_utf8()..].trim_start()
|
|
});
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Try removing each python token. This is really slow and runs at the end
|
|
struct StrategyRemoveToken;
|
|
|
|
impl Strategy for StrategyRemoveToken {
|
|
fn name(&self) -> &'static str {
|
|
"remove token"
|
|
}
|
|
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
_ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let token_ranges: Vec<_> = ruff_python_parser::tokenize(input, Mode::Module)
|
|
.into_iter()
|
|
// At this point we know we have valid python code
|
|
.map(Result::unwrap)
|
|
.filter(|token| token.1.len().to_usize() > 0)
|
|
.map(|token| token.1)
|
|
.collect();
|
|
|
|
let iter = token_ranges.into_iter().map(move |range| {
|
|
input[..range.start().to_usize()].to_string() + &input[range.end().to_usize()..]
|
|
});
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Try removing each individual character in the file. This is really slow and runs at the end
|
|
struct StrategyRemoveChar;
|
|
|
|
impl Strategy for StrategyRemoveChar {
|
|
fn name(&self) -> &'static str {
|
|
"remove character"
|
|
}
|
|
|
|
fn candidates<'a>(
|
|
&self,
|
|
input: &'a str,
|
|
_ast: &'a Suite,
|
|
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
|
|
let char_indices: Vec<_> = input.char_indices().collect();
|
|
let iter = char_indices
|
|
.into_iter()
|
|
.map(move |(pos, char)| input[..pos].to_string() + &input[pos + char.len_utf8()..]);
|
|
Ok(Box::new(iter))
|
|
}
|
|
}
|
|
|
|
/// Returns strategy, posing in the iteration (so they can be skipped in the next attempt) and
|
|
/// minimized code.
|
|
fn minimization_step(
|
|
input: &str,
|
|
location: &Path,
|
|
command_args: &[String],
|
|
pattern: &Regex,
|
|
last_strategy_and_idx: Option<(&'static dyn Strategy, usize)>,
|
|
) -> Result<Option<(&'static dyn Strategy, usize, String)>> {
|
|
let tokens = ruff_python_parser::tokenize(input, Mode::Module);
|
|
let ast = ruff_python_parser::parse_program_tokens(tokens, "input.py", false)
|
|
.context("not valid python")?;
|
|
|
|
// Try the last succeeding strategy first, skipping all that failed last time
|
|
if let Some((last_strategy, last_idx)) = last_strategy_and_idx {
|
|
let iter = last_strategy.candidates(input, &ast)?;
|
|
println!(
|
|
"Trying {} ({last_idx} skipped) {} candidates",
|
|
iter.len() - last_idx,
|
|
last_strategy.name()
|
|
);
|
|
for (idx, entry) in iter.enumerate().skip(last_idx) {
|
|
if is_failing(&entry, location, command_args, pattern)? {
|
|
// This one is still failing in the right way
|
|
return Ok(Some((last_strategy, idx, entry)));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Try all strategies in order, including the last successful one without skipping inputs
|
|
for strategy in STRATEGIES {
|
|
let iter = strategy.candidates(input, &ast)?;
|
|
println!("Trying {} {} candidates", iter.len(), strategy.name());
|
|
for (idx, entry) in iter.enumerate() {
|
|
if is_failing(&entry, location, command_args, pattern)? {
|
|
// This one is still failing in the right way
|
|
return Ok(Some((*strategy, idx, entry)));
|
|
}
|
|
}
|
|
}
|
|
|
|
// None of the minimizations worked
|
|
Ok(None)
|
|
}
|
|
|
|
/// Does the candidate still produce the expected error?
|
|
fn is_failing(
|
|
input: &str,
|
|
location: &Path,
|
|
command_args: &[String],
|
|
pattern: &Regex,
|
|
) -> Result<bool> {
|
|
fs::write(location, input).context("Invalid location")?;
|
|
|
|
let output = Command::new(&command_args[0])
|
|
.args(&command_args[1..])
|
|
.output()
|
|
.context("Failed to launch command")?;
|
|
|
|
let stdout = str::from_utf8(&output.stdout).context("stdout was not utf8")?;
|
|
let stderr = str::from_utf8(&output.stderr).context("stderr was not utf8")?;
|
|
if pattern.is_match(stdout) {
|
|
debug!("stdout matches");
|
|
Ok(true)
|
|
} else if pattern.is_match(stderr) {
|
|
debug!("stderr matches");
|
|
Ok(true)
|
|
} else {
|
|
Ok(false)
|
|
}
|
|
}
|
|
|
|
/// You specify an input file that fails. The minimizer will write minimization candidates to the
|
|
/// the file given as second argument. It will run the command and if the output still matches the
|
|
/// error pattern, the candidate will be considered a successful minimization step, otherwise it's
|
|
/// rolled back.
|
|
///
|
|
/// ## Example
|
|
/// ```shell
|
|
/// cargo run --bin ruff_shrinking -- target/checkouts/jhnnsrs:mikro-napari/mikro_napari/models/representation.py target/minirepo/code.py "invalid syntax" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
|
|
/// ```
|
|
/// This could emit (if it wasn't fixed):
|
|
/// ```python
|
|
/// class RepresentationQtModel():
|
|
/// data[:,] = rep.data
|
|
/// ```
|
|
#[derive(Parser)]
|
|
struct Args {
|
|
/// The input file that fails
|
|
input_file: PathBuf,
|
|
/// The minimization attempt is written to this location
|
|
output_file: PathBuf,
|
|
/// Continue this path of the minimization if either stderr or stdout match this regex
|
|
error_pattern: String,
|
|
/// The command to run to test if the smaller version still emits the same error
|
|
///
|
|
/// TODO(konstin): Move this to some form of trailing args so we don't need shlex
|
|
command: String,
|
|
}
|
|
|
|
fn run() -> Result<()> {
|
|
// e.g. `RUST_LOG=ruff_shrinking=debug`
|
|
tracing_subscriber::fmt::init();
|
|
|
|
let args: Args = Args::parse();
|
|
let pattern = Regex::new(&args.error_pattern).context("Invalid error_pattern")?;
|
|
let command_args = shlex::split(&args.command).context("Couldn't split command input")?;
|
|
|
|
let loop_start = Instant::now();
|
|
let mut stats = HashMap::new();
|
|
|
|
// Normalize line endings for the remove newline dependent rules
|
|
let mut input = fs::read_to_string(args.input_file)?.replace('\r', "");
|
|
|
|
// This can happen e.g. when main changed between collecting the errors list and running this
|
|
// script
|
|
if !is_failing(&input, &args.output_file, &command_args, &pattern)? {
|
|
println!("Input doesn't match");
|
|
fs::write(&args.output_file, "")?;
|
|
return Ok(());
|
|
}
|
|
|
|
let mut num_iterations = 0;
|
|
let mut last_strategy_and_idx = None;
|
|
loop {
|
|
let start = Instant::now();
|
|
num_iterations += 1;
|
|
let smaller_failure = minimization_step(
|
|
&input,
|
|
&args.output_file,
|
|
&command_args,
|
|
&pattern,
|
|
last_strategy_and_idx,
|
|
)?;
|
|
let duration = start.elapsed();
|
|
if let Some((strategy, idx, smaller_failure)) = smaller_failure {
|
|
println!(
|
|
"Match found with {} {idx} in {:.2}s, {} bytes remaining",
|
|
strategy.name(),
|
|
duration.as_secs_f32(),
|
|
smaller_failure.len()
|
|
);
|
|
*stats.entry(strategy.name()).or_insert(0) += 1;
|
|
input = smaller_failure;
|
|
last_strategy_and_idx = Some((strategy, idx));
|
|
} else {
|
|
// The last minimization failed, write back the original content
|
|
fs::write(&args.output_file, input.as_bytes())?;
|
|
println!(
|
|
"Last iteration in {:.2}s, {} bytes remaining",
|
|
duration.as_secs_f32(),
|
|
input.as_bytes().len()
|
|
);
|
|
break;
|
|
}
|
|
}
|
|
|
|
println!("Strategies taken: {stats:?}");
|
|
println!(
|
|
"Done with {num_iterations} iterations in {:.2}s. Find your minimized example in {}:\n---\n{}\n---\n",
|
|
loop_start.elapsed().as_secs_f32(),
|
|
args.output_file.display(),
|
|
input
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn main() -> ExitCode {
|
|
if let Err(e) = run() {
|
|
eprintln!("💥 Minimizer failed");
|
|
for cause in e.chain() {
|
|
eprintln!(" Cause: {cause}");
|
|
}
|
|
ExitCode::FAILURE
|
|
} else {
|
|
ExitCode::SUCCESS
|
|
}
|
|
}
|