ruff/crates/ruff_shrinking/src/main.rs
Dhruv Manilawala 32fa05765a
Use Jupyter mode while parsing Notebook files (#5552)
## Summary

Enable using the new `Mode::Jupyter` for the tokenizer/parser to parse
Jupyter line magic tokens.

The individual call to the lexer i.e., `lex_starts_at` done by various
rules should consider the context of the source code (is this content
from a Jupyter Notebook?). Thus, a new field `source_type` (of type
`PySourceType`) is added to `Checker` which is being passed around as an
argument to the relevant functions. This is then used to determine the
`Mode` for the lexer.

## Test Plan

Add new test cases to make sure that the magic statement is considered
while generating the diagnostic and autofix:
* For `I001`, if there's a magic statement in between two import blocks,
they should be sorted independently

fixes: #6090
2023-08-05 00:32:07 +00:00

493 lines
16 KiB
Rust

//! Minimize a failing python file, a process known as [shrinking](https://www.educative.io/answers/what-is-shrinking)
//!
//! See `--help` (or the [Args] docs) for usage.
//!
//! ## Example
//!
//! Shrink a file with unstable formatting
//! ```shell
//! cargo run --bin ruff_shrinking -- target/cpython/Lib/test/inspect_fodder.py target/minirepo/a.py "Unstable formatting" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
//! ```
//! This could emit
//! ```python
//! class WhichComments: # before f return 1 # end f # after f # before asyncf - line 108
//! async def asyncf(self): return 2
//! # end asyncf # end of WhichComments - line 114# a closing parenthesis with the opening paren being in another line
//! (
//! );
//! ```
//!
//! Shrink a file with a syntax error
//! ```shell
//! cargo run --bin ruff_shrinking -- target/checkouts/jhnnsrs:mikro-napari/mikro_napari/models/representation.py target/minirepo/code.py "invalid syntax" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
//! ```
//! This could emit
//! ```python
//! class RepresentationQtModel():
//! data[:,] = rep.data
//! ```
#![allow(clippy::print_stdout, clippy::print_stderr)]
use anyhow::{Context, Result};
use clap::Parser;
use fs_err as fs;
use regex::Regex;
use ruff_python_ast::statement_visitor::{walk_body, walk_stmt, StatementVisitor};
use ruff_python_ast::visitor::{walk_expr, Visitor};
use ruff_python_ast::{Expr, Ranged, Stmt, Suite};
use ruff_python_parser::Mode;
use ruff_text_size::TextRange;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::process::{Command, ExitCode};
use std::str;
use std::time::Instant;
use tracing::debug;
/// Every available minimization strategy, listed in the order in which this table is iterated.
const STRATEGIES: &[&dyn Strategy] = &[
    &StrategyRemoveModuleMember,
    &StrategyRemoveStatement,
    &StrategyRemoveExpression,
    &StrategyRemoveLine,
    &StrategyRemoveNewline,
    &StrategyRemoveToken,
    &StrategyRemoveChar,
];
/// Each strategy is a way of producing possible minimizations
trait Strategy {
/// Human-readable name of the strategy, used in the progress output and the statistics.
fn name(&self) -> &'static str;
/// Produces the minimization candidates for `input`.
///
/// `ast` is the parsed form of `input`. Every item yielded by the returned iterator is a
/// complete replacement for `input`; the iterator knows its exact length up front.
fn candidates<'a>(
&self,
input: &'a str,
ast: &'a Suite,
) -> Result<Box<dyn ExactSizeStringIter + 'a>>;
}
/// Bundles `Iterator<Item = String>` and `ExactSizeIterator` into one trait so that the
/// combination can be named behind a single `dyn` trait object.
trait ExactSizeStringIter: Iterator<Item = String> + ExactSizeIterator {}

/// Blanket implementation: any exact-size iterator over `String`s qualifies.
impl<I: Iterator<Item = String> + ExactSizeIterator> ExactSizeStringIter for I {}
/// Remove a top level member from a module. Generally the most effective strategy since most
/// top level items will be unrelated to the error.
struct StrategyRemoveModuleMember;

impl Strategy for StrategyRemoveModuleMember {
    fn name(&self) -> &'static str {
        "remove module member"
    }

    /// Yields one candidate per top-level statement of `ast`, each being `input` with that
    /// statement cut out.
    fn candidates<'a>(
        &self,
        input: &'a str,
        ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let iter = ast.iter().map(move |stmt| {
            let range = stmt.range();
            // Trim the newlines (and any other whitespace) the statement range misses.
            let before = input[..range.start().to_usize()].trim_end();
            let after = input[range.end().to_usize()..].trim_start();

            let mut candidate = String::with_capacity(before.len() + after.len() + 1);
            candidate.push_str(before);
            // Both sides have been trimmed, so without a separator the member before and the
            // member after the removed one would be glued onto one line (`a\nb\nc` -> `ac`).
            if !before.is_empty() && !after.is_empty() {
                candidate.push('\n');
            }
            candidate.push_str(after);
            candidate
        });
        Ok(Box::new(iter))
    }
}
/// Collects the source ranges of statements and of whole statement bodies.
#[derive(Default)]
struct StatementCollector {
    /// Every range recorded so far, in visiting order
    ranges: Vec<TextRange>,
}

impl StatementVisitor<'_> for StatementCollector {
    /// Records the span from the first to the last statement of `body`, unless the body is a
    /// lone `pass`, then descends into the body.
    fn visit_body(&mut self, body: &[Stmt]) {
        if let Some((head, tail)) = body.first().zip(body.last()) {
            let lone_pass = head == tail && matches!(head, Stmt::Pass(_));
            if !lone_pass {
                self.ranges.push(TextRange::new(head.start(), tail.end()));
            }
        }
        walk_body(self, body);
    }

    /// Records the range of every statement that is not `pass`, then descends into it.
    fn visit_stmt(&mut self, stmt: &Stmt) {
        match stmt {
            Stmt::Pass(_) => {}
            other => self.ranges.push(other.range()),
        }
        walk_stmt(self, stmt);
    }
}
/// Try to remove each statement, or replace it with `pass` when `pass_dummy` is set.
///
/// Candidates are ordered from the largest collected range to the smallest.
fn strategy_statement<'a>(
    input: &'a str,
    ast: &'a Suite,
    pass_dummy: bool,
) -> Box<dyn ExactSizeStringIter + 'a> {
    let mut collector = StatementCollector::default();
    collector.visit_body(ast);

    // Sort ascending by length here and walk the result back to front below, so that the
    // largest statements are removed first.
    let mut by_size = collector.ranges;
    by_size.sort_by_key(|range| range.len());

    let placeholder = if pass_dummy { "pass" } else { "" };
    let candidates = by_size.into_iter().rev().map(move |range| {
        // trim the newlines the range misses
        let head = input[..range.start().to_usize()].trim_end();
        let tail = &input[range.end().to_usize()..];
        [head, placeholder, tail].concat()
    });
    Box::new(candidates)
}
/// Remove statements (and whole statement bodies) without leaving a placeholder behind.
struct StrategyRemoveStatement;

impl Strategy for StrategyRemoveStatement {
    fn name(&self) -> &'static str {
        "remove statement"
    }

    /// Delegates to `strategy_statement` with the `pass` placeholder disabled.
    fn candidates<'a>(
        &self,
        input: &'a str,
        ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let pass_dummy = false;
        let candidates = strategy_statement(input, ast, pass_dummy);
        Ok(candidates)
    }
}
/// Collects the source range of every expression it visits.
#[derive(Default)]
struct ExpressionCollector {
    /// Ranges of the visited expressions, in visiting order
    ranges: Vec<TextRange>,
}

impl Visitor<'_> for ExpressionCollector {
    /// Records the range of `expr` before descending into its sub-expressions.
    fn visit_expr(&mut self, expr: &Expr) {
        let span = expr.range();
        self.ranges.push(span);
        walk_expr(self, expr);
    }
}
/// Cut individual expressions out of the source text.
struct StrategyRemoveExpression;

impl Strategy for StrategyRemoveExpression {
    fn name(&self) -> &'static str {
        "remove expression"
    }

    /// Try to remove each expression
    fn candidates<'a>(
        &self,
        input: &'a str,
        ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let mut collector = ExpressionCollector::default();
        collector.visit_body(ast);

        let candidates = collector.ranges.into_iter().map(move |range| {
            let head = &input[TextRange::up_to(range.start())];
            let tail = &input[range.end().to_usize()..];
            [head, tail].concat()
        });
        Ok(Box::new(candidates))
    }
}
/// Remove each line (physical lines, not logical lines).
struct StrategyRemoveLine;

impl Strategy for StrategyRemoveLine {
    fn name(&self) -> &'static str {
        "remove line"
    }

    /// Yields one candidate per physical line, each with that line left out and the remaining
    /// lines joined by `\n`.
    fn candidates<'a>(
        &self,
        input: &'a str,
        _ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let lines: Vec<&str> = input.lines().collect();
        let line_count = lines.len();

        let candidates = (0..line_count).map(move |skipped| {
            // `rest` always starts with the line that is being dropped
            let (kept_before, rest) = lines.split_at(skipped);
            let mut candidate = kept_before.join("\n");
            if skipped > 0 {
                candidate.push('\n');
            }
            candidate.push_str(&rest[1..].join("\n"));
            candidate
        });
        Ok(Box::new(candidates))
    }
}
/// Try removing newline characters
struct StrategyRemoveNewline;

impl Strategy for StrategyRemoveNewline {
    fn name(&self) -> &'static str {
        "remove newline"
    }

    /// Yields one candidate per eligible newline, each with that newline and the whitespace
    /// following it removed.
    fn candidates<'a>(
        &self,
        input: &'a str,
        _ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let bytes = input.as_bytes();
        let newline_positions: Vec<usize> = input
            .char_indices()
            // Don't remove newlines after `:`. Indexing is safe because `pos > 0` is checked
            // first, and `pos - 1` is a byte offset (not a char offset) that lies before a
            // position known to exist.
            .filter(|&(pos, ch)| ch == '\n' && pos > 0 && bytes[pos - 1] != b':')
            .map(|(pos, _)| pos)
            .collect();

        let candidates = newline_positions.into_iter().map(move |pos| {
            // `tail` starts with the newline itself
            let (head, tail) = input.split_at(pos);
            let mut joined = String::from(head);
            // trim to remove the indentation
            joined.push_str(tail['\n'.len_utf8()..].trim_start());
            joined
        });
        Ok(Box::new(candidates))
    }
}
/// Try removing each python token. This is really slow and runs at the end
struct StrategyRemoveToken;
impl Strategy for StrategyRemoveToken {
fn name(&self) -> &'static str {
"remove token"
}
fn candidates<'a>(
&self,
input: &'a str,
_ast: &'a Suite,
) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
let token_ranges: Vec<_> = ruff_python_parser::tokenize(input, Mode::Module)
.into_iter()
// At this point we know we have valid python code
.map(Result::unwrap)
.filter(|token| token.1.len().to_usize() > 0)
.map(|token| token.1)
.collect();
let iter = token_ranges.into_iter().map(move |range| {
input[..range.start().to_usize()].to_string() + &input[range.end().to_usize()..]
});
Ok(Box::new(iter))
}
}
/// Try removing each individual character in the file. This is really slow and runs at the end
struct StrategyRemoveChar;

impl Strategy for StrategyRemoveChar {
    fn name(&self) -> &'static str {
        "remove character"
    }

    /// Yields one candidate per character of `input`, each with that character left out.
    fn candidates<'a>(
        &self,
        input: &'a str,
        _ast: &'a Suite,
    ) -> Result<Box<dyn ExactSizeStringIter + 'a>> {
        let positions: Vec<(usize, char)> = input.char_indices().collect();

        let candidates = positions.into_iter().map(move |(start, ch)| {
            // Skip the full UTF-8 encoding of the character, not just one byte
            let end = start + ch.len_utf8();
            let mut shortened = String::with_capacity(input.len() - ch.len_utf8());
            shortened.push_str(&input[..start]);
            shortened.push_str(&input[end..]);
            shortened
        });
        Ok(Box::new(candidates))
    }
}
/// Performs a single minimization step on `input`.
///
/// Each candidate is written to `location` and checked with [`is_failing`]. The first candidate
/// that still reproduces the failure wins.
///
/// Returns the strategy, the position in the iteration (so the candidates before it can be
/// skipped in the next attempt) and the minimized code, or `None` if no candidate of any
/// strategy still fails.
///
/// # Errors
///
/// Fails if `input` is not valid python, if a strategy cannot produce its candidates, or if
/// checking a candidate fails.
fn minimization_step(
    input: &str,
    location: &Path,
    command_args: &[String],
    pattern: &Regex,
    last_strategy_and_idx: Option<(&'static dyn Strategy, usize)>,
) -> Result<Option<(&'static dyn Strategy, usize, String)>> {
    let tokens = ruff_python_parser::tokenize(input, Mode::Module);
    let ast = ruff_python_parser::parse_program_tokens(tokens, "input.py", false)
        .context("not valid python")?;

    // Try the last succeeding strategy first, skipping all that failed last time
    if let Some((last_strategy, last_idx)) = last_strategy_and_idx {
        let iter = last_strategy.candidates(input, &ast)?;
        println!(
            "Trying {} ({last_idx} skipped) {} candidates",
            // `last_idx` was recorded for the previous, larger input, so the current input may
            // yield fewer candidates than that; a plain subtraction could underflow.
            iter.len().saturating_sub(last_idx),
            last_strategy.name()
        );
        // Enumerate before skipping so that `idx` stays the absolute candidate position
        for (idx, entry) in iter.enumerate().skip(last_idx) {
            if is_failing(&entry, location, command_args, pattern)? {
                // This one is still failing in the right way
                return Ok(Some((last_strategy, idx, entry)));
            }
        }
    }

    // Try all strategies in order, including the last successful one without skipping inputs
    for strategy in STRATEGIES {
        let iter = strategy.candidates(input, &ast)?;
        println!("Trying {} {} candidates", iter.len(), strategy.name());
        for (idx, entry) in iter.enumerate() {
            if is_failing(&entry, location, command_args, pattern)? {
                // This one is still failing in the right way
                return Ok(Some((*strategy, idx, entry)));
            }
        }
    }

    // None of the minimizations worked
    Ok(None)
}
/// Does the candidate still produce the expected error?
fn is_failing(
input: &str,
location: &Path,
command_args: &[String],
pattern: &Regex,
) -> Result<bool> {
fs::write(location, input).context("Invalid location")?;
let output = Command::new(&command_args[0])
.args(&command_args[1..])
.output()
.context("Failed to launch command")?;
let stdout = str::from_utf8(&output.stdout).context("stdout was not utf8")?;
let stderr = str::from_utf8(&output.stderr).context("stderr was not utf8")?;
if pattern.is_match(stdout) {
debug!("stdout matches");
Ok(true)
} else if pattern.is_match(stderr) {
debug!("stderr matches");
Ok(true)
} else {
Ok(false)
}
}
/// You specify an input file that fails. The minimizer will write minimization candidates to the
/// file given as second argument. It will run the command and if the output still matches the
/// error pattern, the candidate will be considered a successful minimization step, otherwise it's
/// rolled back.
///
/// ## Example
/// ```shell
/// cargo run --bin ruff_shrinking -- target/checkouts/jhnnsrs:mikro-napari/mikro_napari/models/representation.py target/minirepo/code.py "invalid syntax" "target/debug/ruff_dev format-dev --stability-check target/minirepo"
/// ```
/// This could emit (if it wasn't fixed):
/// ```python
/// class RepresentationQtModel():
/// data[:,] = rep.data
/// ```
#[derive(Parser)]
struct Args {
/// The input file that fails
input_file: PathBuf,
/// The minimization attempt is written to this location
output_file: PathBuf,
/// Continue this path of the minimization if either stderr or stdout match this regex
error_pattern: String,
/// The command to run to test if the smaller version still emits the same error
///
/// TODO(konstin): Move this to some form of trailing args so we don't need shlex
command: String,
}
fn run() -> Result<()> {
// e.g. `RUST_LOG=ruff_shrinking=debug`
tracing_subscriber::fmt::init();
let args: Args = Args::parse();
let pattern = Regex::new(&args.error_pattern).context("Invalid error_pattern")?;
let command_args = shlex::split(&args.command).context("Couldn't split command input")?;
let loop_start = Instant::now();
let mut stats = HashMap::new();
// Normalize line endings for the remove newline dependent rules
let mut input = fs::read_to_string(args.input_file)?.replace('\r', "");
// This can happen e.g. when main changed between collecting the errors list and running this
// script
if !is_failing(&input, &args.output_file, &command_args, &pattern)? {
println!("Input doesn't match");
fs::write(&args.output_file, "")?;
return Ok(());
}
let mut num_iterations = 0;
let mut last_strategy_and_idx = None;
loop {
let start = Instant::now();
num_iterations += 1;
let smaller_failure = minimization_step(
&input,
&args.output_file,
&command_args,
&pattern,
last_strategy_and_idx,
)?;
let duration = start.elapsed();
if let Some((strategy, idx, smaller_failure)) = smaller_failure {
println!(
"Match found with {} {idx} in {:.2}s, {} bytes remaining",
strategy.name(),
duration.as_secs_f32(),
smaller_failure.len()
);
*stats.entry(strategy.name()).or_insert(0) += 1;
input = smaller_failure;
last_strategy_and_idx = Some((strategy, idx));
} else {
// The last minimization failed, write back the original content
fs::write(&args.output_file, input.as_bytes())?;
println!(
"Last iteration in {:.2}s, {} bytes remaining",
duration.as_secs_f32(),
input.as_bytes().len()
);
break;
}
}
println!("Strategies taken: {stats:?}");
println!(
"Done with {num_iterations} iterations in {:.2}s. Find your minimized example in {}:\n---\n{}\n---\n",
loop_start.elapsed().as_secs_f32(),
args.output_file.display(),
input
);
Ok(())
}
/// Entry point: runs the minimizer and, on failure, prints the error chain to stderr and
/// exits with a failure code.
fn main() -> ExitCode {
    match run() {
        Ok(()) => ExitCode::SUCCESS,
        Err(error) => {
            eprintln!("💥 Minimizer failed");
            for cause in error.chain() {
                eprintln!(" Cause: {cause}");
            }
            ExitCode::FAILURE
        }
    }
}