Avoid cloning source code multiple times (#6629)

## Summary

In working on https://github.com/astral-sh/ruff/pull/6628, I noticed
that we clone the source code contents, potentially multiple times,
prior to linting. The issue is that `SourceKind::Python` takes a
`String`, so we first have to provide it with a `String`. In the stdin
case, that means cloning. However, on top of this, we then have to clone
`source_kind.contents()` because `SourceKind` gets mutated. So for
stdin, we end up cloning twice. For non-stdin, we end up cloning once,
but unnecessarily (since the _contents_ don't get mutated, only the
kind).

This PR removes the `String` from `source_kind`, instead requiring that
we parse it out elsewhere. It reduces the number of clones down to 1 for
Jupyter Notebooks, and zero otherwise.
This commit is contained in: (branch list not captured in this export)
Authored by Charlie Marsh on 2023-08-18 09:32:18 -04:00; committed by GitHub.
parent 0cea4975fc
commit 2aeb27334d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 164 additions and 100 deletions

View file

@@ -33,7 +33,7 @@ pub fn round_trip(path: &Path) -> anyhow::Result<String> {
err err
) )
})?; })?;
let code = notebook.content().to_string(); let code = notebook.source_code().to_string();
notebook.update_cell_content(&code); notebook.update_cell_content(&code);
let mut writer = Vec::new(); let mut writer = Vec::new();
notebook.write_inner(&mut writer)?; notebook.write_inner(&mut writer)?;
@@ -103,7 +103,7 @@ pub struct Notebook {
/// separated by a newline and a trailing newline. The trailing newline /// separated by a newline and a trailing newline. The trailing newline
/// is added to make sure that each cell ends with a newline which will /// is added to make sure that each cell ends with a newline which will
/// be removed when updating the cell content. /// be removed when updating the cell content.
content: String, source_code: String,
/// The index of the notebook. This is used to map between the concatenated /// The index of the notebook. This is used to map between the concatenated
/// source code and the original notebook. /// source code and the original notebook.
index: OnceCell<JupyterIndex>, index: OnceCell<JupyterIndex>,
@@ -132,8 +132,8 @@ impl Notebook {
} }
/// Read the Jupyter Notebook from its JSON string. /// Read the Jupyter Notebook from its JSON string.
pub fn from_contents(contents: &str) -> Result<Self, Box<Diagnostic>> { pub fn from_source_code(source_code: &str) -> Result<Self, Box<Diagnostic>> {
Self::from_reader(Cursor::new(contents)) Self::from_reader(Cursor::new(source_code))
} }
/// Read a Jupyter Notebook from a [`Read`] implementor. /// Read a Jupyter Notebook from a [`Read`] implementor.
@@ -268,7 +268,7 @@ impl Notebook {
// The additional newline at the end is to maintain consistency for // The additional newline at the end is to maintain consistency for
// all cells. These newlines will be removed before updating the // all cells. These newlines will be removed before updating the
// source code with the transformed content. Refer `update_cell_content`. // source code with the transformed content. Refer `update_cell_content`.
content: contents.join("\n") + "\n", source_code: contents.join("\n") + "\n",
cell_offsets, cell_offsets,
valid_code_cells, valid_code_cells,
trailing_newline, trailing_newline,
@@ -404,8 +404,8 @@ impl Notebook {
/// Return the notebook content. /// Return the notebook content.
/// ///
/// This is the concatenation of all Python code cells. /// This is the concatenation of all Python code cells.
pub(crate) fn content(&self) -> &str { pub fn source_code(&self) -> &str {
&self.content &self.source_code
} }
/// Return the Jupyter notebook index. /// Return the Jupyter notebook index.
@@ -429,7 +429,7 @@ impl Notebook {
// it depends on the offsets to extract the cell content. // it depends on the offsets to extract the cell content.
self.update_cell_offsets(source_map); self.update_cell_offsets(source_map);
self.update_cell_content(transformed); self.update_cell_content(transformed);
self.content = transformed.to_string(); self.source_code = transformed.to_string();
} }
/// Return a slice of [`Cell`] in the Jupyter notebook. /// Return a slice of [`Cell`] in the Jupyter notebook.
@@ -482,8 +482,8 @@ mod tests {
/// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory. /// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory.
fn read_jupyter_cell(path: impl AsRef<Path>) -> Result<Cell> { fn read_jupyter_cell(path: impl AsRef<Path>) -> Result<Cell> {
let path = test_resource_path("fixtures/jupyter/cell").join(path); let path = test_resource_path("fixtures/jupyter/cell").join(path);
let contents = std::fs::read_to_string(path)?; let source_code = std::fs::read_to_string(path)?;
Ok(serde_json::from_str(&contents)?) Ok(serde_json::from_str(&source_code)?)
} }
#[test] #[test]
@@ -536,7 +536,7 @@ mod tests {
fn test_concat_notebook() -> Result<()> { fn test_concat_notebook() -> Result<()> {
let notebook = read_jupyter_notebook(Path::new("valid.ipynb"))?; let notebook = read_jupyter_notebook(Path::new("valid.ipynb"))?;
assert_eq!( assert_eq!(
notebook.content, notebook.source_code,
r#"def unused_variable(): r#"def unused_variable():
x = 1 x = 1
y = 2 y = 2

View file

@@ -2,19 +2,11 @@ use crate::jupyter::Notebook;
#[derive(Clone, Debug, PartialEq, is_macro::Is)] #[derive(Clone, Debug, PartialEq, is_macro::Is)]
pub enum SourceKind { pub enum SourceKind {
Python(String), Python,
Jupyter(Notebook), Jupyter(Notebook),
} }
impl SourceKind { impl SourceKind {
/// Return the source content.
pub fn content(&self) -> &str {
match self {
SourceKind::Python(content) => content,
SourceKind::Jupyter(notebook) => notebook.content(),
}
}
/// Return the [`Notebook`] if the source kind is [`SourceKind::Jupyter`]. /// Return the [`Notebook`] if the source kind is [`SourceKind::Jupyter`].
pub fn notebook(&self) -> Option<&Notebook> { pub fn notebook(&self) -> Option<&Notebook> {
if let Self::Jupyter(notebook) = self { if let Self::Jupyter(notebook) = self {

View file

@@ -6,14 +6,13 @@ use std::path::Path;
#[cfg(not(fuzzing))] #[cfg(not(fuzzing))]
use anyhow::Result; use anyhow::Result;
use itertools::Itertools; use itertools::Itertools;
use ruff_python_parser::lexer::LexResult;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use ruff_diagnostics::{AutofixKind, Diagnostic}; use ruff_diagnostics::{AutofixKind, Diagnostic};
use ruff_python_ast::PySourceType; use ruff_python_ast::PySourceType;
use ruff_python_codegen::Stylist; use ruff_python_codegen::Stylist;
use ruff_python_index::Indexer; use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
use ruff_python_parser::AsMode; use ruff_python_parser::AsMode;
use ruff_python_trivia::textwrap::dedent; use ruff_python_trivia::textwrap::dedent;
use ruff_source_file::{Locator, SourceFileBuilder}; use ruff_source_file::{Locator, SourceFileBuilder};
@@ -53,7 +52,8 @@ pub(crate) fn test_path(path: impl AsRef<Path>, settings: &Settings) -> Result<V
let path = test_resource_path("fixtures").join(path); let path = test_resource_path("fixtures").join(path);
let contents = std::fs::read_to_string(&path)?; let contents = std::fs::read_to_string(&path)?;
Ok(test_contents( Ok(test_contents(
&mut SourceKind::Python(contents), &contents,
&mut SourceKind::Python,
&path, &path,
settings, settings,
)) ))
@@ -65,14 +65,16 @@ pub(crate) fn test_notebook_path(
expected: impl AsRef<Path>, expected: impl AsRef<Path>,
settings: &Settings, settings: &Settings,
) -> Result<(Vec<Message>, SourceKind, SourceKind)> { ) -> Result<(Vec<Message>, SourceKind, SourceKind)> {
let mut source_kind = SourceKind::Jupyter(read_jupyter_notebook(path.as_ref())?); let notebook = read_jupyter_notebook(path.as_ref())?;
let contents = notebook.source_code().to_string();
let mut source_kind = SourceKind::Jupyter(notebook);
let original_source_kind = source_kind.clone(); let original_source_kind = source_kind.clone();
let messages = test_contents(&mut source_kind, path.as_ref(), settings); let messages = test_contents(&contents, &mut source_kind, path.as_ref(), settings);
let expected_notebook = read_jupyter_notebook(expected.as_ref())?; let expected_notebook = read_jupyter_notebook(expected.as_ref())?;
if let SourceKind::Jupyter(notebook) = &source_kind { if let SourceKind::Jupyter(notebook) = &source_kind {
assert_eq!(notebook.cell_offsets(), expected_notebook.cell_offsets()); assert_eq!(notebook.cell_offsets(), expected_notebook.cell_offsets());
assert_eq!(notebook.index(), expected_notebook.index()); assert_eq!(notebook.index(), expected_notebook.index());
assert_eq!(notebook.content(), expected_notebook.content()); assert_eq!(notebook.source_code(), expected_notebook.source_code());
}; };
Ok((messages, original_source_kind, source_kind)) Ok((messages, original_source_kind, source_kind))
} }
@@ -81,11 +83,7 @@ pub(crate) fn test_notebook_path(
pub fn test_snippet(contents: &str, settings: &Settings) -> Vec<Message> { pub fn test_snippet(contents: &str, settings: &Settings) -> Vec<Message> {
let path = Path::new("<filename>"); let path = Path::new("<filename>");
let contents = dedent(contents); let contents = dedent(contents);
test_contents( test_contents(&contents, &mut SourceKind::Python, path, settings)
&mut SourceKind::Python(contents.to_string()),
path,
settings,
)
} }
thread_local! { thread_local! {
@@ -102,11 +100,15 @@ pub(crate) fn max_iterations() -> usize {
/// A convenient wrapper around [`check_path`], that additionally /// A convenient wrapper around [`check_path`], that additionally
/// asserts that autofixes converge after a fixed number of iterations. /// asserts that autofixes converge after a fixed number of iterations.
fn test_contents(source_kind: &mut SourceKind, path: &Path, settings: &Settings) -> Vec<Message> { fn test_contents(
let contents = source_kind.content().to_string(); contents: &str,
source_kind: &mut SourceKind,
path: &Path,
settings: &Settings,
) -> Vec<Message> {
let source_type = PySourceType::from(path); let source_type = PySourceType::from(path);
let tokens: Vec<LexResult> = ruff_python_parser::tokenize(&contents, source_type.as_mode()); let tokens: Vec<LexResult> = ruff_python_parser::tokenize(contents, source_type.as_mode());
let locator = Locator::new(&contents); let locator = Locator::new(contents);
let stylist = Stylist::from_tokens(&tokens, &locator); let stylist = Stylist::from_tokens(&tokens, &locator);
let indexer = Indexer::from_tokens(&tokens, &locator); let indexer = Indexer::from_tokens(&tokens, &locator);
let directives = directives::extract_directives( let directives = directives::extract_directives(
@@ -225,7 +227,7 @@ Source with applied fixes:
let source_code = SourceFileBuilder::new( let source_code = SourceFileBuilder::new(
path.file_name().unwrap().to_string_lossy().as_ref(), path.file_name().unwrap().to_string_lossy().as_ref(),
contents.as_str(), contents,
) )
.finish(); .finish();

View file

@@ -1,5 +1,6 @@
#![cfg_attr(target_family = "wasm", allow(dead_code))] #![cfg_attr(target_family = "wasm", allow(dead_code))]
use std::borrow::Cow;
use std::fs::write; use std::fs::write;
use std::io; use std::io;
use std::io::Write; use std::io::Write;
@@ -75,6 +76,31 @@ impl Diagnostics {
source_kind: FxHashMap::default(), source_kind: FxHashMap::default(),
} }
} }
/// Generate [`Diagnostics`] based on an [`io::Error`].
pub(crate) fn from_io_error(err: &io::Error, path: &Path, settings: &Settings) -> Self {
if settings.rules.enabled(Rule::IOError) {
let io_err = Diagnostic::new(
IOError {
message: err.to_string(),
},
TextRange::default(),
);
let dummy = SourceFileBuilder::new(path.to_string_lossy().as_ref(), "").finish();
Self::new(
vec![Message::from_diagnostic(io_err, dummy, TextSize::default())],
ImportMap::default(),
)
} else {
warn!(
"{}{}{} {err}",
"Failed to lint ".bold(),
fs::relativize_path(path).bold(),
":".bold()
);
Self::default()
}
}
} }
impl AddAssign for Diagnostics { impl AddAssign for Diagnostics {
@@ -131,11 +157,11 @@ fn notebook_from_path(path: &Path) -> Result<Notebook, Box<Diagnostics>> {
/// Parse a Jupyter Notebook from a JSON string. /// Parse a Jupyter Notebook from a JSON string.
/// ///
/// Returns either an indexed Python Jupyter Notebook or a diagnostic (which is empty if we skip). /// Returns either an indexed Python Jupyter Notebook or a diagnostic (which is empty if we skip).
fn notebook_from_contents( fn notebook_from_source_code(
contents: &str, source_code: &str,
path: Option<&Path>, path: Option<&Path>,
) -> Result<Notebook, Box<Diagnostics>> { ) -> Result<Notebook, Box<Diagnostics>> {
let notebook = match Notebook::from_contents(contents) { let notebook = match Notebook::from_source_code(source_code) {
Ok(notebook) => { Ok(notebook) => {
if !notebook.is_python_notebook() { if !notebook.is_python_notebook() {
// Not a python notebook, this could e.g. be an R notebook which we want to just skip. // Not a python notebook, this could e.g. be an R notebook which we want to just skip.
@@ -202,31 +228,6 @@ pub(crate) fn lint_path(
debug!("Checking: {}", path.display()); debug!("Checking: {}", path.display());
// In case of an io error we want to exit early
let io_error_diagnostics = |err: io::Error, path: &Path| -> Diagnostics {
if settings.lib.rules.enabled(Rule::IOError) {
let io_err = Diagnostic::new(
IOError {
message: err.to_string(),
},
TextRange::default(),
);
let dummy = SourceFileBuilder::new(path.to_string_lossy().as_ref(), "").finish();
Diagnostics::new(
vec![Message::from_diagnostic(io_err, dummy, TextSize::default())],
ImportMap::default(),
)
} else {
warn!(
"{}{}{} {err}",
"Failed to lint ".bold(),
fs::relativize_path(path).bold(),
":".bold()
);
Diagnostics::default()
}
};
// We have to special case this here since the Python tokenizer doesn't work with TOML. // We have to special case this here since the Python tokenizer doesn't work with TOML.
if is_project_toml(path) { if is_project_toml(path) {
let messages = if settings let messages = if settings
@@ -238,7 +239,7 @@ pub(crate) fn lint_path(
let contents = match std::fs::read_to_string(path) { let contents = match std::fs::read_to_string(path) {
Ok(contents) => contents, Ok(contents) => contents,
Err(err) => { Err(err) => {
return Ok(io_error_diagnostics(err, path)); return Ok(Diagnostics::from_io_error(&err, path, &settings.lib));
} }
}; };
let source_file = SourceFileBuilder::new(path.to_string_lossy(), contents).finish(); let source_file = SourceFileBuilder::new(path.to_string_lossy(), contents).finish();
@@ -252,26 +253,20 @@ pub(crate) fn lint_path(
}); });
} }
let source_type = PySourceType::from(path); // Extract the sources from the file.
let LintSources {
// Read the file from disk source_type,
let mut source_kind = if source_type.is_jupyter() { mut source_kind,
match notebook_from_path(path) { contents,
Ok(notebook) => SourceKind::Jupyter(notebook), } = match LintSources::try_from_path(path) {
Err(diagnostic) => return Ok(*diagnostic), Ok(sources) => sources,
Err(SourceExtractionError::Io(err)) => {
return Ok(Diagnostics::from_io_error(&err, path, &settings.lib));
} }
} else { Err(SourceExtractionError::Diagnostics(diagnostics)) => {
// This is tested by ruff_cli integration test `unreadable_file` return Ok(*diagnostics);
let contents = match std::fs::read_to_string(path) {
Ok(contents) => contents,
Err(err) => {
return Ok(io_error_diagnostics(err, path));
} }
}; };
SourceKind::Python(contents)
};
let contents = source_kind.content().to_string();
// Lint the file. // Lint the file.
let ( let (
@@ -297,7 +292,7 @@ pub(crate) fn lint_path(
if !fixed.is_empty() { if !fixed.is_empty() {
match autofix { match autofix {
flags::FixMode::Apply => match &source_kind { flags::FixMode::Apply => match &source_kind {
SourceKind::Python(_) => { SourceKind::Python => {
write(path, transformed.as_bytes())?; write(path, transformed.as_bytes())?;
} }
SourceKind::Jupyter(notebook) => { SourceKind::Jupyter(notebook) => {
@@ -306,9 +301,9 @@ pub(crate) fn lint_path(
}, },
flags::FixMode::Diff => { flags::FixMode::Diff => {
match &source_kind { match &source_kind {
SourceKind::Python(_) => { SourceKind::Python => {
let mut stdout = io::stdout().lock(); let mut stdout = io::stdout().lock();
TextDiff::from_lines(contents.as_str(), &transformed) TextDiff::from_lines(&contents, &transformed)
.unified_diff() .unified_diff()
.header(&fs::relativize_path(path), &fs::relativize_path(path)) .header(&fs::relativize_path(path), &fs::relativize_path(path))
.to_writer(&mut stdout)?; .to_writer(&mut stdout)?;
@@ -441,20 +436,22 @@ pub(crate) fn lint_stdin(
noqa: flags::Noqa, noqa: flags::Noqa,
autofix: flags::FixMode, autofix: flags::FixMode,
) -> Result<Diagnostics> { ) -> Result<Diagnostics> {
let source_type = path.map(PySourceType::from).unwrap_or_default(); // Extract the sources from the file.
let LintSources {
let mut source_kind = if source_type.is_jupyter() { source_type,
// SAFETY: Jupyter isn't the default type, so we must have a path. mut source_kind,
match notebook_from_contents(contents, path) { contents,
Ok(notebook) => SourceKind::Jupyter(notebook), } = match LintSources::try_from_source_code(contents, path) {
Err(diagnostic) => return Ok(*diagnostic), Ok(sources) => sources,
Err(SourceExtractionError::Io(err)) => {
// SAFETY: An `io::Error` can only occur if we're reading from a path.
return Ok(Diagnostics::from_io_error(&err, path.unwrap(), settings));
}
Err(SourceExtractionError::Diagnostics(diagnostics)) => {
return Ok(*diagnostics);
} }
} else {
SourceKind::Python(contents.to_string())
}; };
let contents = source_kind.content().to_string();
// Lint the inputs. // Lint the inputs.
let ( let (
LinterResult { LinterResult {
@@ -484,7 +481,7 @@ pub(crate) fn lint_stdin(
flags::FixMode::Diff => { flags::FixMode::Diff => {
// But only write a diff if it's non-empty. // But only write a diff if it's non-empty.
if !fixed.is_empty() { if !fixed.is_empty() {
let text_diff = TextDiff::from_lines(contents.as_str(), &transformed); let text_diff = TextDiff::from_lines(&contents, &transformed);
let mut unified_diff = text_diff.unified_diff(); let mut unified_diff = text_diff.unified_diff();
if let Some(path) = path { if let Some(path) = path {
unified_diff unified_diff
@@ -555,11 +552,84 @@ pub(crate) fn lint_stdin(
}) })
} }
#[derive(Debug)]
struct LintSources<'a> {
/// The "type" of source code, e.g. `.py`, `.pyi`, `.ipynb`, etc.
source_type: PySourceType,
/// The "kind" of source, e.g. Python file, Jupyter Notebook, etc.
source_kind: SourceKind,
/// The contents of the source code.
contents: Cow<'a, str>,
}
impl<'a> LintSources<'a> {
/// Extract the lint [`LintSources`] from the given file path.
fn try_from_path(path: &Path) -> Result<LintSources, SourceExtractionError> {
let source_type = PySourceType::from(path);
// Read the file from disk.
if source_type.is_jupyter() {
let notebook = notebook_from_path(path).map_err(SourceExtractionError::Diagnostics)?;
let contents = notebook.source_code().to_string();
let source_kind = SourceKind::Jupyter(notebook);
Ok(LintSources {
source_type,
source_kind,
contents: Cow::Owned(contents),
})
} else {
// This is tested by ruff_cli integration test `unreadable_file`
let contents = std::fs::read_to_string(path).map_err(SourceExtractionError::Io)?;
Ok(LintSources {
source_type,
source_kind: SourceKind::Python,
contents: Cow::Owned(contents),
})
}
}
/// Extract the lint [`LintSources`] from the raw string contents, optionally accompanied by a
/// file path indicating the path to the file from which the contents were read. If provided,
/// the file path should be used for diagnostics, but not for reading the file from disk.
fn try_from_source_code(
source_code: &'a str,
path: Option<&Path>,
) -> Result<LintSources<'a>, SourceExtractionError> {
let source_type = path.map(PySourceType::from).unwrap_or_default();
if source_type.is_jupyter() {
let notebook = notebook_from_source_code(source_code, path)
.map_err(SourceExtractionError::Diagnostics)?;
let contents = notebook.source_code().to_string();
let source_kind = SourceKind::Jupyter(notebook);
Ok(LintSources {
source_type,
source_kind,
contents: Cow::Owned(contents),
})
} else {
Ok(LintSources {
source_type,
source_kind: SourceKind::Python,
contents: Cow::Borrowed(source_code),
})
}
}
}
#[derive(Debug)]
enum SourceExtractionError {
/// The extraction failed due to an [`io::Error`].
Io(io::Error),
/// The extraction failed, and generated [`Diagnostics`] to report.
Diagnostics(Box<Diagnostics>),
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::path::Path; use std::path::Path;
use crate::diagnostics::{notebook_from_contents, notebook_from_path, Diagnostics}; use crate::diagnostics::{notebook_from_path, notebook_from_source_code, Diagnostics};
#[test] #[test]
fn test_r() { fn test_r() {
@@ -573,7 +643,7 @@ mod tests {
let contents = std::fs::read_to_string(path).unwrap(); let contents = std::fs::read_to_string(path).unwrap();
// No diagnostics is used as skip signal. // No diagnostics is used as skip signal.
assert_eq!( assert_eq!(
notebook_from_contents(&contents, Some(path)).unwrap_err(), notebook_from_source_code(&contents, Some(path)).unwrap_err(),
Box::<Diagnostics>::default() Box::<Diagnostics>::default()
); );
} }