Reduce notebook memory footprint (#21319)

This commit is contained in:
Micha Reiser 2025-11-11 10:43:37 +01:00 committed by GitHub
parent 33b942c7ad
commit 36cce347fd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 84 additions and 71 deletions

View file

@ -112,16 +112,16 @@ impl std::fmt::Display for Diff<'_> {
// `None`, indicating a regular script file, all the lines will be in one "cell" under the
// `None` key.
let cells = if let Some(notebook_index) = &self.notebook_index {
let mut last_cell = OneIndexed::MIN;
let mut last_cell_index = OneIndexed::MIN;
let mut cells: Vec<(Option<OneIndexed>, TextSize)> = Vec::new();
for (row, cell) in notebook_index.iter() {
if cell != last_cell {
let offset = source_code.line_start(row);
cells.push((Some(last_cell), offset));
last_cell = cell;
for cell in notebook_index.iter() {
if cell.cell_index() != last_cell_index {
let offset = source_code.line_start(cell.start_row());
cells.push((Some(last_cell_index), offset));
last_cell_index = cell.cell_index();
}
}
cells.push((Some(last_cell), source_text.text_len()));
cells.push((Some(last_cell_index), source_text.text_len()));
cells
} else {
vec![(None, source_text.text_len())]

View file

@ -8,37 +8,40 @@ use ruff_source_file::{LineColumn, OneIndexed, SourceLocation};
/// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct NotebookIndex {
/// Enter a row (1-based), get back the cell (1-based)
pub(super) row_to_cell: Vec<OneIndexed>,
/// Enter a row (1-based), get back the row in cell (1-based)
pub(super) row_to_row_in_cell: Vec<OneIndexed>,
/// Stores the starting row and the absolute cell index for every Python (valid) cell.
///
/// The index in this vector corresponds to the Python cell index (valid cell index).
pub(super) cell_starts: Vec<CellStart>,
}
impl NotebookIndex {
pub fn new(row_to_cell: Vec<OneIndexed>, row_to_row_in_cell: Vec<OneIndexed>) -> Self {
Self {
row_to_cell,
row_to_row_in_cell,
fn find_cell(&self, row: OneIndexed) -> Option<CellStart> {
match self
.cell_starts
.binary_search_by_key(&row, |start| start.start_row)
{
Ok(cell_index) => Some(self.cell_starts[cell_index]),
Err(insertion_point) => Some(self.cell_starts[insertion_point.checked_sub(1)?]),
}
}
/// Returns the cell number (1-based) for the given row (1-based).
/// Returns the (raw) cell number (1-based) for the given row (1-based).
pub fn cell(&self, row: OneIndexed) -> Option<OneIndexed> {
self.row_to_cell.get(row.to_zero_indexed()).copied()
self.find_cell(row).map(|start| start.raw_cell_index)
}
/// Returns the row number (1-based) in the cell (1-based) for the
/// given row (1-based).
pub fn cell_row(&self, row: OneIndexed) -> Option<OneIndexed> {
self.row_to_row_in_cell.get(row.to_zero_indexed()).copied()
self.find_cell(row)
.map(|start| OneIndexed::from_zero_indexed(row.get() - start.start_row.get()))
}
/// Returns an iterator over the row:cell-number pairs (both 1-based).
pub fn iter(&self) -> impl Iterator<Item = (OneIndexed, OneIndexed)> {
self.row_to_cell
.iter()
.enumerate()
.map(|(row, cell)| (OneIndexed::from_zero_indexed(row), *cell))
/// Returns an iterator over the starting rows of each cell (1-based).
///
/// This yields one entry per Python cell (skipping over Markdown cells).
pub fn iter(&self) -> impl Iterator<Item = CellStart> + '_ {
self.cell_starts.iter().copied()
}
/// Translates the given [`LineColumn`] based on the indexing table.
@ -67,3 +70,23 @@ impl NotebookIndex {
}
}
}
/// Marks where a single code cell begins in the concatenated notebook source.
///
/// Pairs the cell's starting row with the cell's position in the notebook's
/// full cell list, so a row can be mapped back to the cell that contains it.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct CellStart {
/// The row in the concatenated notebook source code at which
/// this cell starts.
///
/// One-indexed, like every other row handled by the notebook index.
pub(super) start_row: OneIndexed,
/// The absolute index of this cell in the notebook.
///
/// This is the cell's position among all cells of the notebook (not only the
/// code cells that are indexed), stored as a one-indexed value.
pub(super) raw_cell_index: OneIndexed,
}
impl CellStart {
    /// Returns the one-indexed row, within the concatenated notebook source,
    /// at which this cell begins.
    pub fn start_row(&self) -> OneIndexed {
        let Self { start_row, .. } = *self;
        start_row
    }

    /// Returns the one-indexed absolute position of this cell in the notebook
    /// (the raw cell index).
    pub fn cell_index(&self) -> OneIndexed {
        let Self { raw_cell_index, .. } = *self;
        raw_cell_index
    }
}

View file

@ -18,7 +18,7 @@ use ruff_text_size::TextSize;
use crate::cell::CellOffsets;
use crate::index::NotebookIndex;
use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue};
use crate::{CellMetadata, RawNotebookMetadata, schema};
use crate::{CellMetadata, CellStart, RawNotebookMetadata, schema};
/// Run round-trip source code generation on a given Jupyter notebook file path.
pub fn round_trip(path: &Path) -> anyhow::Result<String> {
@ -320,11 +320,19 @@ impl Notebook {
/// The index building is expensive as it needs to go through the content of
/// every valid code cell.
fn build_index(&self) -> NotebookIndex {
let mut row_to_cell = Vec::new();
let mut row_to_row_in_cell = Vec::new();
let mut cell_starts = Vec::with_capacity(self.valid_code_cells.len());
let mut current_row = OneIndexed::MIN;
for &cell_index in &self.valid_code_cells {
let line_count = match &self.raw.cells[cell_index as usize].source() {
let raw_cell_index = cell_index as usize;
// Record the starting row of this cell
cell_starts.push(CellStart {
start_row: current_row,
raw_cell_index: OneIndexed::from_zero_indexed(raw_cell_index),
});
let line_count = match &self.raw.cells[raw_cell_index].source() {
SourceValue::String(string) => {
if string.is_empty() {
1
@ -342,17 +350,11 @@ impl Notebook {
}
}
};
row_to_cell.extend(std::iter::repeat_n(
OneIndexed::from_zero_indexed(cell_index as usize),
line_count,
));
row_to_row_in_cell.extend((0..line_count).map(OneIndexed::from_zero_indexed));
current_row = current_row.saturating_add(line_count);
}
NotebookIndex {
row_to_cell,
row_to_row_in_cell,
}
NotebookIndex { cell_starts }
}
/// Return the notebook content.
@ -456,7 +458,7 @@ mod tests {
use ruff_source_file::OneIndexed;
use crate::{Cell, Notebook, NotebookError, NotebookIndex};
use crate::{Cell, CellStart, Notebook, NotebookError, NotebookIndex};
/// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory.
fn notebook_path(path: impl AsRef<Path>) -> std::path::PathBuf {
@ -548,39 +550,27 @@ print("after empty cells")
assert_eq!(
notebook.index(),
&NotebookIndex {
row_to_cell: vec![
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(4),
OneIndexed::from_zero_indexed(6),
OneIndexed::from_zero_indexed(6),
OneIndexed::from_zero_indexed(7)
],
row_to_row_in_cell: vec![
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(1),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(3),
OneIndexed::from_zero_indexed(4),
OneIndexed::from_zero_indexed(5),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(1),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(3),
OneIndexed::from_zero_indexed(4),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(1),
OneIndexed::from_zero_indexed(0)
cell_starts: vec![
CellStart {
start_row: OneIndexed::MIN,
raw_cell_index: OneIndexed::MIN
},
CellStart {
start_row: OneIndexed::from_zero_indexed(6),
raw_cell_index: OneIndexed::from_zero_indexed(2)
},
CellStart {
start_row: OneIndexed::from_zero_indexed(11),
raw_cell_index: OneIndexed::from_zero_indexed(4)
},
CellStart {
start_row: OneIndexed::from_zero_indexed(12),
raw_cell_index: OneIndexed::from_zero_indexed(6)
},
CellStart {
start_row: OneIndexed::from_zero_indexed(14),
raw_cell_index: OneIndexed::from_zero_indexed(7)
}
],
}
);