Add NotebookIndex to the cache (#6863)

## Summary

This PR updates the `FileCache` to include an optional `NotebookIndex`
to support caching for Jupyter Notebooks.

We only require the index to compute the diagnostics and thus we don't
really need to store the entire `Notebook` on the `Diagnostics` struct.
This means we only need the index to be stored in the cache to
reconstruct the `Diagnostics`.

## Test Plan

Update an existing test case to run over the fixtures under
`ruff_notebook` crate where there are multiple Jupyter Notebook.

Locally, the following commands were run in order:
1. Remove the cache: `rm -rf .ruff_cache`
2. Run without cache: `cargo run --bin ruff -- check --isolated
crates/ruff_notebook/resources/test/fixtures/jupyter/unused_variable.ipynb
--no-cache`
3. Run with cache: `cargo run --bin ruff -- check --isolated
crates/ruff_notebook/resources/test/fixtures/jupyter/unused_variable.ipynb`
4. Check whether the `.ruff_cache` directory was created or not
5. Run with cache again and verify: `cargo run --bin ruff -- check
--isolated
crates/ruff_notebook/resources/test/fixtures/jupyter/unused_variable.ipynb`

## Benchmarks

https://github.com/astral-sh/ruff/pull/6863#issuecomment-1715675186

fixes: #6671
This commit is contained in:
Dhruv Manilawala 2023-09-12 18:29:03 +05:30 committed by GitHub
parent e7b7e4a18d
commit ee0f1270cf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 84 additions and 58 deletions

1
Cargo.lock generated
View file

@ -2183,6 +2183,7 @@ dependencies = [
"similar",
"strum",
"tempfile",
"test-case",
"thiserror",
"tikv-jemallocator",
"tracing",

View file

@ -4,7 +4,7 @@ use std::num::NonZeroUsize;
use colored::Colorize;
use ruff_notebook::{Notebook, NotebookIndex};
use ruff_notebook::NotebookIndex;
use ruff_source_file::OneIndexed;
use crate::fs::relativize_path;
@ -65,7 +65,7 @@ impl Emitter for GroupedEmitter {
writer,
"{}",
DisplayGroupedMessage {
jupyter_index: context.notebook(message.filename()).map(Notebook::index),
notebook_index: context.notebook_index(message.filename()),
message,
show_fix_status: self.show_fix_status,
show_source: self.show_source,
@ -92,7 +92,7 @@ struct DisplayGroupedMessage<'a> {
show_source: bool,
row_length: NonZeroUsize,
column_length: NonZeroUsize,
jupyter_index: Option<&'a NotebookIndex>,
notebook_index: Option<&'a NotebookIndex>,
}
impl Display for DisplayGroupedMessage<'_> {
@ -110,7 +110,7 @@ impl Display for DisplayGroupedMessage<'_> {
)?;
// Check if we're working on a jupyter notebook and translate positions with cell accordingly
let (row, col) = if let Some(jupyter_index) = self.jupyter_index {
let (row, col) = if let Some(jupyter_index) = self.notebook_index {
write!(
f,
"cell {cell}{sep}",
@ -150,7 +150,7 @@ impl Display for DisplayGroupedMessage<'_> {
"{}",
MessageCodeFrame {
message,
jupyter_index: self.jupyter_index
notebook_index: self.notebook_index
}
)?;
}

View file

@ -14,7 +14,7 @@ pub use json_lines::JsonLinesEmitter;
pub use junit::JunitEmitter;
pub use pylint::PylintEmitter;
use ruff_diagnostics::{Diagnostic, DiagnosticKind, Fix};
use ruff_notebook::Notebook;
use ruff_notebook::NotebookIndex;
use ruff_source_file::{SourceFile, SourceLocation};
use ruff_text_size::{Ranged, TextRange, TextSize};
pub use text::TextEmitter;
@ -127,21 +127,21 @@ pub trait Emitter {
/// Context passed to [`Emitter`].
pub struct EmitterContext<'a> {
notebooks: &'a FxHashMap<String, Notebook>,
notebook_indexes: &'a FxHashMap<String, NotebookIndex>,
}
impl<'a> EmitterContext<'a> {
pub fn new(notebooks: &'a FxHashMap<String, Notebook>) -> Self {
Self { notebooks }
pub fn new(notebook_indexes: &'a FxHashMap<String, NotebookIndex>) -> Self {
Self { notebook_indexes }
}
/// Tests if the file with `name` is a jupyter notebook.
pub fn is_notebook(&self, name: &str) -> bool {
self.notebooks.contains_key(name)
self.notebook_indexes.contains_key(name)
}
pub fn notebook(&self, name: &str) -> Option<&Notebook> {
self.notebooks.get(name)
pub fn notebook_index(&self, name: &str) -> Option<&NotebookIndex> {
self.notebook_indexes.get(name)
}
}
@ -225,8 +225,8 @@ def fibonacci(n):
emitter: &mut dyn Emitter,
messages: &[Message],
) -> String {
let source_kinds = FxHashMap::default();
let context = EmitterContext::new(&source_kinds);
let notebook_indexes = FxHashMap::default();
let context = EmitterContext::new(&notebook_indexes);
let mut output: Vec<u8> = Vec::new();
emitter.emit(&mut output, messages, &context).unwrap();

View file

@ -7,7 +7,7 @@ use annotate_snippets::snippet::{Annotation, AnnotationType, Slice, Snippet, Sou
use bitflags::bitflags;
use colored::Colorize;
use ruff_notebook::{Notebook, NotebookIndex};
use ruff_notebook::NotebookIndex;
use ruff_source_file::{OneIndexed, SourceLocation};
use ruff_text_size::{Ranged, TextRange, TextSize};
@ -71,14 +71,14 @@ impl Emitter for TextEmitter {
)?;
let start_location = message.compute_start_location();
let jupyter_index = context.notebook(message.filename()).map(Notebook::index);
let notebook_index = context.notebook_index(message.filename());
// Check if we're working on a jupyter notebook and translate positions with cell accordingly
let diagnostic_location = if let Some(jupyter_index) = jupyter_index {
let diagnostic_location = if let Some(notebook_index) = notebook_index {
write!(
writer,
"cell {cell}{sep}",
cell = jupyter_index
cell = notebook_index
.cell(start_location.row.get())
.unwrap_or_default(),
sep = ":".cyan(),
@ -86,7 +86,7 @@ impl Emitter for TextEmitter {
SourceLocation {
row: OneIndexed::new(
jupyter_index
notebook_index
.cell_row(start_location.row.get())
.unwrap_or(1) as usize,
)
@ -115,7 +115,7 @@ impl Emitter for TextEmitter {
"{}",
MessageCodeFrame {
message,
jupyter_index
notebook_index
}
)?;
}
@ -161,7 +161,7 @@ impl Display for RuleCodeAndBody<'_> {
pub(super) struct MessageCodeFrame<'a> {
pub(crate) message: &'a Message,
pub(crate) jupyter_index: Option<&'a NotebookIndex>,
pub(crate) notebook_index: Option<&'a NotebookIndex>,
}
impl Display for MessageCodeFrame<'_> {
@ -186,14 +186,12 @@ impl Display for MessageCodeFrame<'_> {
let content_start_index = source_code.line_index(range.start());
let mut start_index = content_start_index.saturating_sub(2);
// If we're working on a jupyter notebook, skip the lines which are
// If we're working with a Jupyter Notebook, skip the lines which are
// outside of the cell containing the diagnostic.
if let Some(jupyter_index) = self.jupyter_index {
let content_start_cell = jupyter_index
.cell(content_start_index.get())
.unwrap_or_default();
if let Some(index) = self.notebook_index {
let content_start_cell = index.cell(content_start_index.get()).unwrap_or_default();
while start_index < content_start_index {
if jupyter_index.cell(start_index.get()).unwrap_or_default() == content_start_cell {
if index.cell(start_index.get()).unwrap_or_default() == content_start_cell {
break;
}
start_index = start_index.saturating_add(1);
@ -213,14 +211,12 @@ impl Display for MessageCodeFrame<'_> {
.saturating_add(2)
.min(OneIndexed::from_zero_indexed(source_code.line_count()));
// If we're working on a jupyter notebook, skip the lines which are
// If we're working with a Jupyter Notebook, skip the lines which are
// outside of the cell containing the diagnostic.
if let Some(jupyter_index) = self.jupyter_index {
let content_end_cell = jupyter_index
.cell(content_end_index.get())
.unwrap_or_default();
if let Some(index) = self.notebook_index {
let content_end_cell = index.cell(content_end_index.get()).unwrap_or_default();
while end_index > content_end_index {
if jupyter_index.cell(end_index.get()).unwrap_or_default() == content_end_cell {
if index.cell(end_index.get()).unwrap_or_default() == content_end_cell {
break;
}
end_index = end_index.saturating_sub(1);
@ -256,10 +252,10 @@ impl Display for MessageCodeFrame<'_> {
title: None,
slices: vec![Slice {
source: &source.text,
line_start: self.jupyter_index.map_or_else(
line_start: self.notebook_index.map_or_else(
|| start_index.get(),
|jupyter_index| {
jupyter_index
|notebook_index| {
notebook_index
.cell_row(start_index.get())
.unwrap_or_default() as usize
},

View file

@ -297,7 +297,7 @@ pub(crate) fn print_jupyter_messages(
messages,
&EmitterContext::new(&FxHashMap::from_iter([(
path.file_name().unwrap().to_string_lossy().to_string(),
notebook.clone(),
notebook.index().clone(),
)])),
)
.unwrap();

View file

@ -75,6 +75,7 @@ colored = { workspace = true, features = ["no-color"]}
insta = { workspace = true, features = ["filters"] }
insta-cmd = { version = "0.4.0" }
tempfile = "3.6.0"
test-case = { workspace = true }
ureq = { version = "2.6.2", features = [] }
[target.'cfg(target_os = "windows")'.dependencies]

View file

@ -8,6 +8,7 @@ use std::sync::Mutex;
use std::time::{Duration, SystemTime};
use anyhow::{Context, Result};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use ruff::message::Message;
@ -15,6 +16,7 @@ use ruff::settings::Settings;
use ruff::warn_user;
use ruff_cache::{CacheKey, CacheKeyHasher};
use ruff_diagnostics::{DiagnosticKind, Fix};
use ruff_notebook::NotebookIndex;
use ruff_python_ast::imports::ImportMap;
use ruff_source_file::SourceFileBuilder;
use ruff_text_size::{TextRange, TextSize};
@ -193,6 +195,7 @@ impl Cache {
key: T,
messages: &[Message],
imports: &ImportMap,
notebook_index: Option<&NotebookIndex>,
) {
let source = if let Some(msg) = messages.first() {
msg.file.source_text().to_owned()
@ -226,6 +229,7 @@ impl Cache {
imports: imports.clone(),
messages,
source,
notebook_index: notebook_index.cloned(),
};
self.new_files.lock().unwrap().insert(path, file);
}
@ -263,6 +267,8 @@ pub(crate) struct FileCache {
///
/// This will be empty if `messages` is empty.
source: String,
/// Notebook index if this file is a Jupyter Notebook.
notebook_index: Option<NotebookIndex>,
}
impl FileCache {
@ -283,7 +289,12 @@ impl FileCache {
})
.collect()
};
Diagnostics::new(messages, self.imports.clone())
let notebook_indexes = if let Some(notebook_index) = self.notebook_index.as_ref() {
FxHashMap::from_iter([(path.to_string_lossy().to_string(), notebook_index.clone())])
} else {
FxHashMap::default()
};
Diagnostics::new(messages, self.imports.clone(), notebook_indexes)
}
}
@ -350,16 +361,19 @@ mod tests {
use anyhow::Result;
use ruff_python_ast::imports::ImportMap;
#[test]
fn same_results() {
use test_case::test_case;
#[test_case("../ruff/resources/test/fixtures", "ruff_tests/cache_same_results_ruff"; "ruff_fixtures")]
#[test_case("../ruff_notebook/resources/test/fixtures", "ruff_tests/cache_same_results_ruff_notebook"; "ruff_notebook_fixtures")]
fn same_results(package_root: &str, cache_dir_path: &str) {
let mut cache_dir = temp_dir();
cache_dir.push("ruff_tests/cache_same_results");
cache_dir.push(cache_dir_path);
let _ = fs::remove_dir_all(&cache_dir);
cache::init(&cache_dir).unwrap();
let settings = AllSettings::default();
let package_root = fs::canonicalize("../ruff/resources/test/fixtures").unwrap();
let package_root = fs::canonicalize(package_root).unwrap();
let cache = Cache::open(&cache_dir, package_root.clone(), &settings.lib);
assert_eq!(cache.new_files.lock().unwrap().len(), 0);
@ -444,9 +458,6 @@ mod tests {
.unwrap();
}
// Not stored in the cache.
expected_diagnostics.notebooks.clear();
got_diagnostics.notebooks.clear();
assert_eq!(expected_diagnostics, got_diagnostics);
}
@ -614,6 +625,7 @@ mod tests {
imports: ImportMap::new(),
messages: Vec::new(),
source: String::new(),
notebook_index: None,
},
);

View file

@ -11,6 +11,7 @@ use itertools::Itertools;
use log::{debug, error, warn};
#[cfg(not(target_family = "wasm"))]
use rayon::prelude::*;
use rustc_hash::FxHashMap;
use ruff::message::Message;
use ruff::registry::Rule;
@ -156,6 +157,7 @@ pub(crate) fn check(
TextSize::default(),
)],
ImportMap::default(),
FxHashMap::default(),
)
} else {
warn!(

View file

@ -26,7 +26,7 @@ use ruff::source_kind::SourceKind;
use ruff::{fs, IOError, SyntaxError};
use ruff_diagnostics::Diagnostic;
use ruff_macros::CacheKey;
use ruff_notebook::{Cell, Notebook, NotebookError};
use ruff_notebook::{Cell, Notebook, NotebookError, NotebookIndex};
use ruff_python_ast::imports::ImportMap;
use ruff_python_ast::{PySourceType, SourceType, TomlSourceType};
use ruff_source_file::{LineIndex, SourceCode, SourceFileBuilder};
@ -64,16 +64,20 @@ pub(crate) struct Diagnostics {
pub(crate) messages: Vec<Message>,
pub(crate) fixed: FxHashMap<String, FixTable>,
pub(crate) imports: ImportMap,
pub(crate) notebooks: FxHashMap<String, Notebook>,
pub(crate) notebook_indexes: FxHashMap<String, NotebookIndex>,
}
impl Diagnostics {
pub(crate) fn new(messages: Vec<Message>, imports: ImportMap) -> Self {
pub(crate) fn new(
messages: Vec<Message>,
imports: ImportMap,
notebook_indexes: FxHashMap<String, NotebookIndex>,
) -> Self {
Self {
messages,
fixed: FxHashMap::default(),
imports,
notebooks: FxHashMap::default(),
notebook_indexes,
}
}
@ -94,6 +98,7 @@ impl Diagnostics {
TextSize::default(),
)],
ImportMap::default(),
FxHashMap::default(),
)
} else {
match path {
@ -130,7 +135,7 @@ impl AddAssign for Diagnostics {
}
}
}
self.notebooks.extend(other.notebooks);
self.notebook_indexes.extend(other.notebook_indexes);
}
}
@ -341,7 +346,13 @@ pub(crate) fn lint_path(
if let Some((cache, relative_path, key)) = caching {
// We don't cache parsing errors.
if parse_error.is_none() {
cache.update(relative_path.to_owned(), key, &messages, &imports);
cache.update(
relative_path.to_owned(),
key,
&messages,
&imports,
source_kind.as_ipy_notebook().map(Notebook::index),
);
}
}
@ -359,12 +370,13 @@ pub(crate) fn lint_path(
);
}
let notebooks = if let SourceKind::IpyNotebook(notebook) = source_kind {
let notebook_indexes = if let SourceKind::IpyNotebook(notebook) = source_kind {
FxHashMap::from_iter([(
path.to_str()
.ok_or_else(|| anyhow!("Unable to parse filename: {:?}", path))?
.to_string(),
notebook,
// Index needs to be computed always to store in cache.
notebook.index().clone(),
)])
} else {
FxHashMap::default()
@ -374,7 +386,7 @@ pub(crate) fn lint_path(
messages,
fixed: FxHashMap::from_iter([(fs::relativize_path(path), fixed)]),
imports,
notebooks,
notebook_indexes,
})
}
@ -498,7 +510,7 @@ pub(crate) fn lint_stdin(
fixed,
)]),
imports,
notebooks: FxHashMap::default(),
notebook_indexes: FxHashMap::default(),
})
}

View file

@ -177,7 +177,7 @@ impl Printer {
return Ok(());
}
let context = EmitterContext::new(&diagnostics.notebooks);
let context = EmitterContext::new(&diagnostics.notebook_indexes);
match self.format {
SerializationFormat::Json => {
@ -364,7 +364,7 @@ impl Printer {
writeln!(writer)?;
}
let context = EmitterContext::new(&diagnostics.notebooks);
let context = EmitterContext::new(&diagnostics.notebook_indexes);
TextEmitter::default()
.with_show_fix_status(show_fix_status(self.autofix_level))
.with_show_source(self.flags.intersects(Flags::SHOW_SOURCE))

View file

@ -1,8 +1,10 @@
use serde::{Deserialize, Serialize};
/// Jupyter Notebook indexing table
///
/// When we lint a jupyter notebook, we have to translate the row/column based on
/// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column.
#[derive(Clone, Debug, Eq, PartialEq)]
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct NotebookIndex {
/// Enter a row (1-based), get back the cell (1-based)
pub(super) row_to_cell: Vec<u32>,