mirror of
https://github.com/astral-sh/ruff.git
synced 2025-07-24 05:25:17 +00:00
Switch from jaccard index to similarity index (#5679)
## Summary The similarity index, the fraction of unchanged lines, is easier to understand than the jaccard index, the fraction between intersection and union. ## Test Plan I ran this on django and git a 0.945 index, meaning 5.5% of lines are currently reformatted when compared to black
This commit is contained in:
parent
4b58a9c092
commit
212fd86bf0
2 changed files with 33 additions and 16 deletions
|
@ -48,16 +48,27 @@ fn ruff_check_paths(dirs: &[PathBuf]) -> anyhow::Result<Vec<Result<DirEntry, ign
|
||||||
Ok(paths)
|
Ok(paths)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Collects statistics over the formatted files, currently only computes the Jaccard index
|
/// Collects statistics over the formatted files to compute the Jaccard index or the similarity
|
||||||
|
/// index.
|
||||||
|
///
|
||||||
|
/// If we define `B` as the black formatted input and `R` as the ruff formatted output, then
|
||||||
|
/// * `B∩R`: Unchanged lines, neutral in the diff
|
||||||
|
/// * `B\R`: Black only lines, minus in the diff
|
||||||
|
/// * `R\B`: Ruff only lines, plus in the diff
|
||||||
///
|
///
|
||||||
/// The [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) can be defined as
|
/// The [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) can be defined as
|
||||||
/// ```text
|
/// ```text
|
||||||
/// J(A, B) = |A∩B| / (|A\B| + |B\A| + |A∩B|)
|
/// J(B, R) = |B∩R| / (|B\R| + |R\B| + |B∩R|)
|
||||||
/// ```
|
/// ```
|
||||||
/// where in our case `A` is the black formatted input, `B` is the ruff formatted output and the
|
/// which you can read as number unchanged lines in the diff divided by all lines in the diff. If
|
||||||
/// intersection are the lines in the diff that didn't change. We don't just track intersection and
|
/// the input is not black formatted, this only becomes a measure for the changes made to the
|
||||||
/// union so we can make statistics about size changes. If the input is not black formatted, this
|
/// codebase during the initial formatting.
|
||||||
/// only becomes a measure for the changes made to the codebase during the initial formatting.
|
///
|
||||||
|
/// Another measure is the similarity index, the percentage of unchanged lines. We compute it as
|
||||||
|
/// ```text
|
||||||
|
/// Sim(B, R) = |B∩R| / (|B\R| + |B∩R|)
|
||||||
|
/// ```
|
||||||
|
/// which you can alternatively read as all lines in the input
|
||||||
#[derive(Default, Debug, Copy, Clone)]
|
#[derive(Default, Debug, Copy, Clone)]
|
||||||
pub(crate) struct Statistics {
|
pub(crate) struct Statistics {
|
||||||
/// The size of `A\B`, the number of lines only in the input, which we assume to be black
|
/// The size of `A\B`, the number of lines only in the input, which we assume to be black
|
||||||
|
@ -92,10 +103,16 @@ impl Statistics {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::cast_precision_loss)]
|
/// We currently prefer the the similarity index, but i'd like to keep this around
|
||||||
|
#[allow(clippy::cast_precision_loss, unused)]
|
||||||
pub(crate) fn jaccard_index(&self) -> f32 {
|
pub(crate) fn jaccard_index(&self) -> f32 {
|
||||||
self.intersection as f32 / (self.black_input + self.ruff_output + self.intersection) as f32
|
self.intersection as f32 / (self.black_input + self.ruff_output + self.intersection) as f32
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::cast_precision_loss)]
|
||||||
|
pub(crate) fn similarity_index(&self) -> f32 {
|
||||||
|
self.intersection as f32 / (self.black_input + self.intersection) as f32
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Add<Statistics> for Statistics {
|
impl Add<Statistics> for Statistics {
|
||||||
|
@ -171,10 +188,10 @@ pub(crate) fn main(args: &Args) -> anyhow::Result<ExitCode> {
|
||||||
{
|
{
|
||||||
print!("{}", result.display(args.format));
|
print!("{}", result.display(args.format));
|
||||||
println!(
|
println!(
|
||||||
"Found {} stability errors in {} files (jaccard index {:.3}) in {:.2}s",
|
"Found {} stability errors in {} files (similarity index {:.3}) in {:.2}s",
|
||||||
error_count,
|
error_count,
|
||||||
result.file_count,
|
result.file_count,
|
||||||
result.statistics.jaccard_index(),
|
result.statistics.similarity_index(),
|
||||||
result.duration.as_secs_f32(),
|
result.duration.as_secs_f32(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -253,10 +270,10 @@ fn format_dev_multi_project(args: &Args) -> bool {
|
||||||
total_files += result.file_count;
|
total_files += result.file_count;
|
||||||
|
|
||||||
bar.println(format!(
|
bar.println(format!(
|
||||||
"Finished {} with {} files (jaccard index {:.3}) in {:.2}s",
|
"Finished {} with {} files (similarity index {:.3}) in {:.2}s",
|
||||||
path.display(),
|
path.display(),
|
||||||
result.file_count,
|
result.file_count,
|
||||||
result.statistics.jaccard_index(),
|
result.statistics.similarity_index(),
|
||||||
result.duration.as_secs_f32(),
|
result.duration.as_secs_f32(),
|
||||||
));
|
));
|
||||||
bar.println(result.display(args.format).to_string().trim_end());
|
bar.println(result.display(args.format).to_string().trim_end());
|
||||||
|
|
|
@ -249,11 +249,11 @@ It's possible to format an entire project:
|
||||||
cargo run --bin ruff_dev -- format-dev --write my_project
|
cargo run --bin ruff_dev -- format-dev --write my_project
|
||||||
```
|
```
|
||||||
|
|
||||||
This will format all files that `ruff check` would lint and computes the
|
This will format all files that `ruff check` would lint and computes the similarity index, the
|
||||||
[Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index), a measure for how close the original
|
fraction of changed lines. The similarity index is 1 if there were no changes at all, while 0 means
|
||||||
and formatted versions are. The Jaccard index is 1 if there were no changes at all, while 0 means
|
we changed every single line. If you run this on a black formatted projects, this tells you how
|
||||||
every line was changed. If you run this on a black formatted projects, this tells you how similar
|
similar the ruff formatter is to black for the given project, with our goal being as close to 1 as
|
||||||
the ruff formatter is to black for the given project, with our goal being as close to 1 as possible.
|
possible.
|
||||||
|
|
||||||
There are three common problems with the formatter: The second formatting pass looks different than
|
There are three common problems with the formatter: The second formatting pass looks different than
|
||||||
the first (formatter instability or lack of idempotency), we print invalid syntax (e.g. missing
|
the first (formatter instability or lack of idempotency), we print invalid syntax (e.g. missing
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue