Reduce notebook memory footprint (#21319)

This commit is contained in:
Micha Reiser 2025-11-11 10:43:37 +01:00 committed by GitHub
parent 33b942c7ad
commit 36cce347fd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 84 additions and 71 deletions

View file

@ -112,16 +112,16 @@ impl std::fmt::Display for Diff<'_> {
// `None`, indicating a regular script file, all the lines will be in one "cell" under the // `None`, indicating a regular script file, all the lines will be in one "cell" under the
// `None` key. // `None` key.
let cells = if let Some(notebook_index) = &self.notebook_index { let cells = if let Some(notebook_index) = &self.notebook_index {
let mut last_cell = OneIndexed::MIN; let mut last_cell_index = OneIndexed::MIN;
let mut cells: Vec<(Option<OneIndexed>, TextSize)> = Vec::new(); let mut cells: Vec<(Option<OneIndexed>, TextSize)> = Vec::new();
for (row, cell) in notebook_index.iter() { for cell in notebook_index.iter() {
if cell != last_cell { if cell.cell_index() != last_cell_index {
let offset = source_code.line_start(row); let offset = source_code.line_start(cell.start_row());
cells.push((Some(last_cell), offset)); cells.push((Some(last_cell_index), offset));
last_cell = cell; last_cell_index = cell.cell_index();
} }
} }
cells.push((Some(last_cell), source_text.text_len())); cells.push((Some(last_cell_index), source_text.text_len()));
cells cells
} else { } else {
vec![(None, source_text.text_len())] vec![(None, source_text.text_len())]

View file

@ -8,37 +8,40 @@ use ruff_source_file::{LineColumn, OneIndexed, SourceLocation};
/// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column. /// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column.
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)] #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub struct NotebookIndex { pub struct NotebookIndex {
/// Enter a row (1-based), get back the cell (1-based) /// Stores the starting row and the absolute cell index for every Python (valid) cell.
pub(super) row_to_cell: Vec<OneIndexed>, ///
/// Enter a row (1-based), get back the row in cell (1-based) /// The index in this vector corresponds to the Python cell index (valid cell index).
pub(super) row_to_row_in_cell: Vec<OneIndexed>, pub(super) cell_starts: Vec<CellStart>,
} }
impl NotebookIndex { impl NotebookIndex {
pub fn new(row_to_cell: Vec<OneIndexed>, row_to_row_in_cell: Vec<OneIndexed>) -> Self { fn find_cell(&self, row: OneIndexed) -> Option<CellStart> {
Self { match self
row_to_cell, .cell_starts
row_to_row_in_cell, .binary_search_by_key(&row, |start| start.start_row)
{
Ok(cell_index) => Some(self.cell_starts[cell_index]),
Err(insertion_point) => Some(self.cell_starts[insertion_point.checked_sub(1)?]),
} }
} }
/// Returns the cell number (1-based) for the given row (1-based). /// Returns the (raw) cell number (1-based) for the given row (1-based).
pub fn cell(&self, row: OneIndexed) -> Option<OneIndexed> { pub fn cell(&self, row: OneIndexed) -> Option<OneIndexed> {
self.row_to_cell.get(row.to_zero_indexed()).copied() self.find_cell(row).map(|start| start.raw_cell_index)
} }
/// Returns the row number (1-based) in the cell (1-based) for the /// Returns the row number (1-based) in the cell (1-based) for the
/// given row (1-based). /// given row (1-based).
pub fn cell_row(&self, row: OneIndexed) -> Option<OneIndexed> { pub fn cell_row(&self, row: OneIndexed) -> Option<OneIndexed> {
self.row_to_row_in_cell.get(row.to_zero_indexed()).copied() self.find_cell(row)
.map(|start| OneIndexed::from_zero_indexed(row.get() - start.start_row.get()))
} }
/// Returns an iterator over the row:cell-number pairs (both 1-based). /// Returns an iterator over the starting rows of each cell (1-based).
pub fn iter(&self) -> impl Iterator<Item = (OneIndexed, OneIndexed)> { ///
self.row_to_cell /// This yields one entry per Python cell (skipping over Markdown cells).
.iter() pub fn iter(&self) -> impl Iterator<Item = CellStart> + '_ {
.enumerate() self.cell_starts.iter().copied()
.map(|(row, cell)| (OneIndexed::from_zero_indexed(row), *cell))
} }
/// Translates the given [`LineColumn`] based on the indexing table. /// Translates the given [`LineColumn`] based on the indexing table.
@ -67,3 +70,23 @@ impl NotebookIndex {
} }
} }
} }
/// The start of a single code cell within the concatenated notebook source.
///
/// Pairs the row at which the cell begins with the cell's index in the
/// notebook; both values are 1-based ([`OneIndexed`]).
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub struct CellStart {
/// The row in the concatenated notebook source code at which
/// this cell starts.
pub(super) start_row: OneIndexed,
/// The absolute index of this cell in the notebook.
pub(super) raw_cell_index: OneIndexed,
}
impl CellStart {
/// Returns the row (1-based) in the concatenated notebook source at which
/// this cell starts.
pub fn start_row(&self) -> OneIndexed {
self.start_row
}
/// Returns the absolute index (1-based) of this cell in the notebook, i.e.
/// the value stored in `raw_cell_index`.
pub fn cell_index(&self) -> OneIndexed {
self.raw_cell_index
}
}

View file

@ -18,7 +18,7 @@ use ruff_text_size::TextSize;
use crate::cell::CellOffsets; use crate::cell::CellOffsets;
use crate::index::NotebookIndex; use crate::index::NotebookIndex;
use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue}; use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue};
use crate::{CellMetadata, RawNotebookMetadata, schema}; use crate::{CellMetadata, CellStart, RawNotebookMetadata, schema};
/// Run round-trip source code generation on a given Jupyter notebook file path. /// Run round-trip source code generation on a given Jupyter notebook file path.
pub fn round_trip(path: &Path) -> anyhow::Result<String> { pub fn round_trip(path: &Path) -> anyhow::Result<String> {
@ -320,11 +320,19 @@ impl Notebook {
/// The index building is expensive as it needs to go through the content of /// The index building is expensive as it needs to go through the content of
/// every valid code cell. /// every valid code cell.
fn build_index(&self) -> NotebookIndex { fn build_index(&self) -> NotebookIndex {
let mut row_to_cell = Vec::new(); let mut cell_starts = Vec::with_capacity(self.valid_code_cells.len());
let mut row_to_row_in_cell = Vec::new();
let mut current_row = OneIndexed::MIN;
for &cell_index in &self.valid_code_cells { for &cell_index in &self.valid_code_cells {
let line_count = match &self.raw.cells[cell_index as usize].source() { let raw_cell_index = cell_index as usize;
// Record the starting row of this cell
cell_starts.push(CellStart {
start_row: current_row,
raw_cell_index: OneIndexed::from_zero_indexed(raw_cell_index),
});
let line_count = match &self.raw.cells[raw_cell_index].source() {
SourceValue::String(string) => { SourceValue::String(string) => {
if string.is_empty() { if string.is_empty() {
1 1
@ -342,17 +350,11 @@ impl Notebook {
} }
} }
}; };
row_to_cell.extend(std::iter::repeat_n(
OneIndexed::from_zero_indexed(cell_index as usize), current_row = current_row.saturating_add(line_count);
line_count,
));
row_to_row_in_cell.extend((0..line_count).map(OneIndexed::from_zero_indexed));
} }
NotebookIndex { NotebookIndex { cell_starts }
row_to_cell,
row_to_row_in_cell,
}
} }
/// Return the notebook content. /// Return the notebook content.
@ -456,7 +458,7 @@ mod tests {
use ruff_source_file::OneIndexed; use ruff_source_file::OneIndexed;
use crate::{Cell, Notebook, NotebookError, NotebookIndex}; use crate::{Cell, CellStart, Notebook, NotebookError, NotebookIndex};
/// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory. /// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory.
fn notebook_path(path: impl AsRef<Path>) -> std::path::PathBuf { fn notebook_path(path: impl AsRef<Path>) -> std::path::PathBuf {
@ -548,39 +550,27 @@ print("after empty cells")
assert_eq!( assert_eq!(
notebook.index(), notebook.index(),
&NotebookIndex { &NotebookIndex {
row_to_cell: vec![ cell_starts: vec![
OneIndexed::from_zero_indexed(0), CellStart {
OneIndexed::from_zero_indexed(0), start_row: OneIndexed::MIN,
OneIndexed::from_zero_indexed(0), raw_cell_index: OneIndexed::MIN
OneIndexed::from_zero_indexed(0), },
OneIndexed::from_zero_indexed(0), CellStart {
OneIndexed::from_zero_indexed(0), start_row: OneIndexed::from_zero_indexed(6),
OneIndexed::from_zero_indexed(2), raw_cell_index: OneIndexed::from_zero_indexed(2)
OneIndexed::from_zero_indexed(2), },
OneIndexed::from_zero_indexed(2), CellStart {
OneIndexed::from_zero_indexed(2), start_row: OneIndexed::from_zero_indexed(11),
OneIndexed::from_zero_indexed(2), raw_cell_index: OneIndexed::from_zero_indexed(4)
OneIndexed::from_zero_indexed(4), },
OneIndexed::from_zero_indexed(6), CellStart {
OneIndexed::from_zero_indexed(6), start_row: OneIndexed::from_zero_indexed(12),
OneIndexed::from_zero_indexed(7) raw_cell_index: OneIndexed::from_zero_indexed(6)
], },
row_to_row_in_cell: vec![ CellStart {
OneIndexed::from_zero_indexed(0), start_row: OneIndexed::from_zero_indexed(14),
OneIndexed::from_zero_indexed(1), raw_cell_index: OneIndexed::from_zero_indexed(7)
OneIndexed::from_zero_indexed(2), }
OneIndexed::from_zero_indexed(3),
OneIndexed::from_zero_indexed(4),
OneIndexed::from_zero_indexed(5),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(1),
OneIndexed::from_zero_indexed(2),
OneIndexed::from_zero_indexed(3),
OneIndexed::from_zero_indexed(4),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(0),
OneIndexed::from_zero_indexed(1),
OneIndexed::from_zero_indexed(0)
], ],
} }
); );