Remove lexing and parsing from the linter benchmark (#9264)

## Summary

This PR adds helper enums (`ParseSource`, `TokenSource`, and `AstSource`) to the linter entry points so that pre-computed tokens and a parsed AST can be passed in during benchmarking, removing lexing and parsing from the overall linter benchmark measurement. We already exclude parsing from the formatter benchmark, and we have separate benchmarks for the lexer and the parser, so this should make it much easier to measure linter performance changes.
Charlie Marsh 2023-12-23 16:43:11 -05:00 committed by GitHub
parent 6d0c9c4e95
commit 9d6444138b
8 changed files with 144 additions and 30 deletions
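Pieced together from the hunks below, the benchmark-side flow is: lex and parse once up front, then hand the precomputed results to `lint_only` via `ParseSource::Precomputed`, so that only linting is measured. A minimal sketch assuming the signatures shown in this diff (the `lint_precomputed` wrapper is illustrative, not part of the change):

```rust
use std::path::Path;

use ruff_linter::linter::{lint_only, ParseSource};
use ruff_linter::settings::{flags, LinterSettings};
use ruff_linter::source_kind::SourceKind;
use ruff_python_ast::PySourceType;
use ruff_python_parser::{lexer, parse_program_tokens, Mode};

// Illustrative wrapper: lex and parse once, then lint against the precomputed results.
fn lint_precomputed(path: &Path, code: &str, settings: &LinterSettings) {
    // Tokenize the source (outside any measured region).
    let tokens = lexer::lex(code, Mode::Module).collect::<Vec<_>>();

    // Parse the tokens into an AST; the token vector is cloned because parsing consumes it.
    let ast = parse_program_tokens(tokens.clone(), code, &path.to_string_lossy(), false)
        .expect("fixture should contain valid Python");

    // Lint without re-lexing or re-parsing.
    let _result = lint_only(
        path,
        None,
        settings,
        flags::Noqa::Enabled,
        &SourceKind::Python(code.to_string()),
        PySourceType::from(path),
        ParseSource::Precomputed {
            tokens: &tokens,
            ast: &ast,
        },
    );
}
```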


@ -2,7 +2,7 @@ use ruff_benchmark::criterion::{
criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion, Throughput,
};
use ruff_benchmark::{TestCase, TestFile, TestFileDownloadError};
use ruff_linter::linter::lint_only;
use ruff_linter::linter::{lint_only, ParseSource};
use ruff_linter::rule_selector::PreviewOptions;
use ruff_linter::settings::rule_table::RuleTable;
use ruff_linter::settings::types::PreviewMode;
@ -10,6 +10,7 @@ use ruff_linter::settings::{flags, LinterSettings};
use ruff_linter::source_kind::SourceKind;
use ruff_linter::{registry::Rule, RuleSelector};
use ruff_python_ast::PySourceType;
use ruff_python_parser::{lexer, parse_program_tokens, Mode};
#[cfg(target_os = "windows")]
#[global_allocator]
@ -53,7 +54,13 @@ fn benchmark_linter(mut group: BenchmarkGroup, settings: &LinterSettings) {
BenchmarkId::from_parameter(case.name()),
&case,
|b, case| {
let kind = SourceKind::Python(case.code().to_string());
// Tokenize the source.
let tokens = lexer::lex(case.code(), Mode::Module).collect::<Vec<_>>();
// Parse the source.
let ast =
parse_program_tokens(tokens.clone(), case.code(), case.name(), false).unwrap();
b.iter(|| {
let path = case.path();
let result = lint_only(
@ -61,8 +68,12 @@ fn benchmark_linter(mut group: BenchmarkGroup, settings: &LinterSettings) {
None,
settings,
flags::Noqa::Enabled,
&kind,
&SourceKind::Python(case.code().to_string()),
PySourceType::from(path.as_path()),
ParseSource::Precomputed {
tokens: &tokens,
ast: &ast,
},
);
// Assert that file contains no parse errors


@ -12,7 +12,7 @@ use rustc_hash::FxHashMap;
use crate::cache::{Cache, FileCacheKey, LintCacheData};
use ruff_diagnostics::Diagnostic;
use ruff_linter::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult};
use ruff_linter::linter::{lint_fix, lint_only, FixTable, FixerResult, LinterResult, ParseSource};
use ruff_linter::logging::DisplayParseError;
use ruff_linter::message::Message;
use ruff_linter::pyproject_toml::lint_pyproject_toml;
@ -303,12 +303,28 @@ pub(crate) fn lint_path(
(result, fixed)
} else {
// If we fail to fix, lint the original source code.
let result = lint_only(path, package, settings, noqa, &source_kind, source_type);
let result = lint_only(
path,
package,
settings,
noqa,
&source_kind,
source_type,
ParseSource::None,
);
let fixed = FxHashMap::default();
(result, fixed)
}
} else {
let result = lint_only(path, package, settings, noqa, &source_kind, source_type);
let result = lint_only(
path,
package,
settings,
noqa,
&source_kind,
source_type,
ParseSource::None,
);
let fixed = FxHashMap::default();
(result, fixed)
};
@ -444,6 +460,7 @@ pub(crate) fn lint_stdin(
noqa,
&source_kind,
source_type,
ParseSource::None,
);
let fixed = FxHashMap::default();
@ -462,6 +479,7 @@ pub(crate) fn lint_stdin(
noqa,
&source_kind,
source_type,
ParseSource::None,
);
let fixed = FxHashMap::default();
(result, fixed)


@ -11,7 +11,7 @@ use rustc_hash::FxHashMap;
use ruff_diagnostics::Diagnostic;
use ruff_notebook::Notebook;
use ruff_python_ast::imports::ImportMap;
use ruff_python_ast::PySourceType;
use ruff_python_ast::{PySourceType, Suite};
use ruff_python_codegen::Stylist;
use ruff_python_index::Indexer;
use ruff_python_parser::lexer::LexResult;
@ -73,7 +73,6 @@ pub struct FixerResult<'a> {
pub fn check_path(
path: &Path,
package: Option<&Path>,
tokens: Vec<LexResult>,
locator: &Locator,
stylist: &Stylist,
indexer: &Indexer,
@ -82,6 +81,7 @@ pub fn check_path(
noqa: flags::Noqa,
source_kind: &SourceKind,
source_type: PySourceType,
tokens: TokenSource,
) -> LinterResult<(Vec<Diagnostic>, Option<ImportMap>)> {
// Aggregate all diagnostics.
let mut diagnostics = vec![];
@ -144,12 +144,8 @@ pub fn check_path(
.iter_enabled()
.any(|rule_code| rule_code.lint_source().is_imports());
if use_ast || use_imports || use_doc_lines {
match ruff_python_parser::parse_program_tokens(
tokens,
source_kind.source_code(),
&path.to_string_lossy(),
source_type.is_ipynb(),
) {
// Parse, if the AST wasn't pre-provided.
match tokens.into_ast_source(source_kind, source_type, path) {
Ok(python_ast) => {
let cell_offsets = source_kind.as_ipy_notebook().map(Notebook::cell_offsets);
if use_ast {
@ -325,7 +321,6 @@ pub fn add_noqa_to_path(
} = check_path(
path,
package,
tokens,
&locator,
&stylist,
&indexer,
@ -334,6 +329,7 @@ pub fn add_noqa_to_path(
flags::Noqa::Disabled,
source_kind,
source_type,
TokenSource::Tokens(tokens),
);
// Log any parse errors.
@ -365,10 +361,10 @@ pub fn lint_only(
noqa: flags::Noqa,
source_kind: &SourceKind,
source_type: PySourceType,
data: ParseSource,
) -> LinterResult<(Vec<Message>, Option<ImportMap>)> {
// Tokenize once.
let tokens: Vec<LexResult> =
ruff_python_parser::tokenize(source_kind.source_code(), source_type.as_mode());
let tokens = data.into_token_source(source_kind, source_type);
// Map row and column locations to byte slices (lazily).
let locator = Locator::new(source_kind.source_code());
@ -391,7 +387,6 @@ pub fn lint_only(
let result = check_path(
path,
package,
tokens,
&locator,
&stylist,
&indexer,
@ -400,6 +395,7 @@ pub fn lint_only(
noqa,
source_kind,
source_type,
tokens,
);
result.map(|(diagnostics, imports)| {
@ -487,7 +483,6 @@ pub fn lint_fix<'a>(
let result = check_path(
path,
package,
tokens,
&locator,
&stylist,
&indexer,
@ -496,6 +491,7 @@ pub fn lint_fix<'a>(
noqa,
&transformed,
source_type,
TokenSource::Tokens(tokens),
);
if iterations == 0 {
@ -632,6 +628,95 @@ This indicates a bug in Ruff. If you could open an issue at:
}
}
#[derive(Debug, Clone)]
pub enum ParseSource<'a> {
/// Extract the tokens and AST from the given source code.
None,
/// Use the precomputed tokens and AST.
Precomputed {
tokens: &'a [LexResult],
ast: &'a Suite,
},
}
impl<'a> ParseSource<'a> {
/// Convert to a [`TokenSource`], tokenizing if necessary.
fn into_token_source(
self,
source_kind: &SourceKind,
source_type: PySourceType,
) -> TokenSource<'a> {
match self {
Self::None => TokenSource::Tokens(ruff_python_parser::tokenize(
source_kind.source_code(),
source_type.as_mode(),
)),
Self::Precomputed { tokens, ast } => TokenSource::Precomputed { tokens, ast },
}
}
}
#[derive(Debug, Clone)]
pub enum TokenSource<'a> {
/// Use the precomputed tokens to generate the AST.
Tokens(Vec<LexResult>),
/// Use the precomputed tokens and AST.
Precomputed {
tokens: &'a [LexResult],
ast: &'a Suite,
},
}
impl Deref for TokenSource<'_> {
type Target = [LexResult];
fn deref(&self) -> &Self::Target {
match self {
Self::Tokens(tokens) => tokens,
Self::Precomputed { tokens, .. } => tokens,
}
}
}
impl<'a> TokenSource<'a> {
/// Convert to an [`AstSource`], parsing if necessary.
fn into_ast_source(
self,
source_kind: &SourceKind,
source_type: PySourceType,
path: &Path,
) -> Result<AstSource<'a>, ParseError> {
match self {
Self::Tokens(tokens) => Ok(AstSource::Ast(ruff_python_parser::parse_program_tokens(
tokens,
source_kind.source_code(),
&path.to_string_lossy(),
source_type.is_ipynb(),
)?)),
Self::Precomputed { ast, .. } => Ok(AstSource::Precomputed(ast)),
}
}
}
#[derive(Debug, Clone)]
pub enum AstSource<'a> {
/// Extract the AST from the given source code.
Ast(Suite),
/// Use the precomputed AST.
Precomputed(&'a Suite),
}
impl Deref for AstSource<'_> {
type Target = Suite;
fn deref(&self) -> &Self::Target {
match self {
Self::Ast(ast) => ast,
Self::Precomputed(ast) => ast,
}
}
}
#[cfg(test)]
mod tests {
use std::path::Path;
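
Taken together, `ParseSource`, `TokenSource`, and `AstSource` implement a borrow-precomputed-or-compute-owned pattern: each stage either reuses a reference handed in by the caller or produces an owned value on demand, and `Deref` lets downstream code treat both cases uniformly. A std-only sketch of that pattern (simplified stand-in types, not the actual ruff definitions):

```rust
use std::ops::Deref;

/// Either a freshly computed, owned value or a reference to one the caller precomputed.
enum MaybeComputed<'a, T> {
    Computed(T),
    Precomputed(&'a T),
}

impl<T> Deref for MaybeComputed<'_, T> {
    type Target = T;

    fn deref(&self) -> &T {
        match self {
            Self::Computed(value) => value,
            Self::Precomputed(value) => value,
        }
    }
}

fn main() {
    let precomputed = vec![1, 2, 3];
    // Reuse the caller's work (the benchmark's case)...
    let borrowed = MaybeComputed::Precomputed(&precomputed);
    // ...or compute on demand (the CLI's case).
    let owned = MaybeComputed::Computed(vec![4, 5, 6]);
    // Deref makes the two cases indistinguishable to the consumer.
    assert_eq!(borrowed.len(), 3);
    assert_eq!(owned.len(), 3);
}
```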


@ -23,7 +23,7 @@ mod tests {
use ruff_source_file::Locator;
use ruff_text_size::Ranged;
use crate::linter::{check_path, LinterResult};
use crate::linter::{check_path, LinterResult, TokenSource};
use crate::registry::{AsRule, Linter, Rule};
use crate::rules::pyflakes;
use crate::settings::types::PreviewMode;
@ -560,7 +560,6 @@ mod tests {
} = check_path(
Path::new("<filename>"),
None,
tokens,
&locator,
&stylist,
&indexer,
@ -569,6 +568,7 @@ mod tests {
flags::Noqa::Enabled,
&source_kind,
source_type,
TokenSource::Tokens(tokens),
);
diagnostics.sort_by_key(Ranged::start);
let actual = diagnostics


@ -21,7 +21,7 @@ use ruff_text_size::Ranged;
use crate::directives;
use crate::fix::{fix_file, FixResult};
use crate::linter::{check_path, LinterResult};
use crate::linter::{check_path, LinterResult, TokenSource};
use crate::message::{Emitter, EmitterContext, Message, TextEmitter};
use crate::packaging::detect_package_root;
use crate::registry::AsRule;
@ -129,7 +129,6 @@ pub(crate) fn test_contents<'a>(
path,
path.parent()
.and_then(|parent| detect_package_root(parent, &settings.namespace_packages)),
tokens,
&locator,
&stylist,
&indexer,
@ -138,6 +137,7 @@ pub(crate) fn test_contents<'a>(
flags::Noqa::Enabled,
source_kind,
source_type,
TokenSource::Tokens(tokens),
);
let source_has_errors = error.is_some();
@ -195,7 +195,6 @@ pub(crate) fn test_contents<'a>(
} = check_path(
path,
None,
tokens,
&locator,
&stylist,
&indexer,
@ -204,6 +203,7 @@ pub(crate) fn test_contents<'a>(
flags::Noqa::Enabled,
&transformed,
source_type,
TokenSource::Tokens(tokens),
);
if let Some(fixed_error) = fixed_error {


@ -1293,7 +1293,7 @@ impl FusedIterator for Lexer<'_> {}
/// [lexer] implementation.
///
/// [lexer]: crate::lexer
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
pub error: LexicalErrorType,
@ -1309,7 +1309,7 @@ impl LexicalError {
}
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed, the places it is used seem to be able
// to use the `UnicodeError` variant instead.


@ -409,7 +409,7 @@ pub(crate) fn concatenated_strings(
// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq)]
struct FStringError {
/// The type of error that occurred.
pub(crate) error: FStringErrorType,
@ -427,7 +427,7 @@ impl From<FStringError> for LexicalError {
}
/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Debug, PartialEq)]
#[derive(Debug, Clone, PartialEq)]
pub enum FStringErrorType {
/// Expected a right brace after an opened left brace.
UnclosedLbrace,
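
The `Clone` derives in these last two hunks appear to support the precomputation above: the benchmark holds one lexed token vector and clones it (`tokens.clone()`) so it can both parse up front and hand the tokens to the linter. Since `LexResult` carries a `LexicalError` in its `Err` variant (and f-string errors convert into lexical errors via the `From` impl shown above), cloning a `Vec<LexResult>` needs the whole error chain to be `Clone`. A simplified, std-only illustration of the constraint (stand-in types, not the actual definitions):

```rust
/// Stand-in for the parser's lexical error type; without `Clone`, the
/// `tokens.clone()` below would not compile.
#[derive(Debug, Clone, PartialEq)]
struct LexicalError {
    message: String,
}

/// Stand-in for `LexResult`: a token (here just a string and an offset) or an error.
type LexResult = Result<(String, usize), LexicalError>;

fn main() {
    let tokens: Vec<LexResult> = vec![
        Ok(("def".to_string(), 0)),
        Err(LexicalError { message: "unexpected character".to_string() }),
    ];
    // Cloning the token vector requires every error inside it to be `Clone`.
    let reused = tokens.clone();
    assert_eq!(tokens, reused);
}
```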


@ -7,7 +7,7 @@ use wasm_bindgen::prelude::*;
use ruff_formatter::{FormatResult, Formatted, IndentStyle};
use ruff_linter::directives;
use ruff_linter::line_width::{IndentWidth, LineLength};
use ruff_linter::linter::{check_path, LinterResult};
use ruff_linter::linter::{check_path, LinterResult, TokenSource};
use ruff_linter::registry::AsRule;
use ruff_linter::settings::types::PythonVersion;
use ruff_linter::settings::{flags, DEFAULT_SELECTORS, DUMMY_VARIABLE_RGX};
@ -182,7 +182,6 @@ impl Workspace {
} = check_path(
Path::new("<filename>"),
None,
tokens,
&locator,
&stylist,
&indexer,
@ -191,6 +190,7 @@ impl Workspace {
flags::Noqa::Enabled,
&source_kind,
source_type,
TokenSource::Tokens(tokens),
);
let source_code = locator.to_source_code();