# Use separate structs for expression and statement tracking (#6351)

## Summary

This PR fixes the performance degradation introduced in
https://github.com/astral-sh/ruff/pull/6345. Instead of using the
generic `Nodes` struct for both, we now use separate `Statements` and
`Expressions` structs. Importantly, for expressions we can avoid tracking
a bunch of state that we still need for statements: we don't need the
reference-to-ID map (we just have no use case for it -- I'd actually like
to remove it from statements too, but branch detection relies on it right
now), we don't need to track depth, etc.

In my testing, this entirely removes the regression with all rules enabled,
and gets us down to 2ms slower on the default rule set (in a crude hyperfine
benchmark, so within the margin of error IMO).

No behavioral changes.
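
To make the difference concrete, here's a minimal, self-contained sketch of the shape of the change (using `&str` as a stand-in for the `Stmt`/`Expr` AST node types, so it's an illustration rather than the code from the diff below): statements keep a reverse node-to-ID map and a depth, which branch detection still needs, while expressions only record a parent pointer.

```rust
use std::collections::HashMap;

// Hypothetical, simplified stand-ins for the real arenas in
// `ruff_python_semantic`; `&str` plays the role of `&Stmt` / `&Expr`.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
struct StatementId(u32);

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct ExpressionId(u32);

struct StatementWithParent<'a> {
    statement: &'a str,
    parent: Option<StatementId>,
    depth: u32, // retained: branch detection still needs ancestry depth
}

struct ExpressionWithParent<'a> {
    expression: &'a str,
    parent: Option<ExpressionId>, // the only extra state tracked per expression
}

#[derive(Default)]
struct Statements<'a> {
    statements: Vec<StatementWithParent<'a>>,
    // Reverse lookup survives only for statements (used by branch detection).
    statement_to_id: HashMap<*const str, StatementId>,
}

impl<'a> Statements<'a> {
    fn insert(&mut self, statement: &'a str, parent: Option<StatementId>) -> StatementId {
        let id = StatementId(self.statements.len() as u32);
        self.statement_to_id.insert(statement as *const str, id);
        let depth = parent.map_or(0, |p| self.statements[p.0 as usize].depth + 1);
        self.statements.push(StatementWithParent { statement, parent, depth });
        id
    }
}

#[derive(Default)]
struct Expressions<'a> {
    // No reverse map and no depth: recording an expression is just a push.
    expressions: Vec<ExpressionWithParent<'a>>,
}

impl<'a> Expressions<'a> {
    fn insert(&mut self, expression: &'a str, parent: Option<ExpressionId>) -> ExpressionId {
        let id = ExpressionId(self.expressions.len() as u32);
        self.expressions.push(ExpressionWithParent { expression, parent });
        id
    }
}

fn main() {
    let mut statements = Statements::default();
    let module = statements.insert("if x:\n    y = f(x)", None);
    let assign = statements.insert("y = f(x)", Some(module));
    assert_eq!(statements.statements[assign.0 as usize].depth, 1);

    let mut expressions = Expressions::default();
    let call = expressions.insert("f(x)", None);
    let _arg = expressions.insert("x", Some(call));
}
```

The real `Expressions` arena in the diff below drops both the `FxHashMap` and the `depth` field, which is where the savings come from.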
Authored by Charlie Marsh on 2023-08-07 11:27:42 -04:00, committed by GitHub
parent 61d3977f95
commit b21abe0a57
11 changed files with 223 additions and 157 deletions

View file

@@ -1,11 +1,12 @@
use anyhow::Result;
use ruff_text_size::TextRange;
use rustc_hash::FxHashMap;
use std::borrow::Cow;
use anyhow::Result;
use rustc_hash::FxHashMap;
use ruff_diagnostics::{AutofixKind, Diagnostic, Fix, Violation};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::{AnyImport, Imported, NodeId, ResolvedReferenceId, Scope};
use ruff_python_semantic::{AnyImport, Imported, ResolvedReferenceId, Scope, StatementId};
use ruff_text_size::TextRange;
use crate::autofix;
use crate::checkers::ast::Checker;
@@ -70,8 +71,8 @@ pub(crate) fn runtime_import_in_type_checking_block(
diagnostics: &mut Vec<Diagnostic>,
) {
// Collect all runtime imports by statement.
let mut errors_by_statement: FxHashMap<NodeId, Vec<ImportBinding>> = FxHashMap::default();
let mut ignores_by_statement: FxHashMap<NodeId, Vec<ImportBinding>> = FxHashMap::default();
let mut errors_by_statement: FxHashMap<StatementId, Vec<ImportBinding>> = FxHashMap::default();
let mut ignores_by_statement: FxHashMap<StatementId, Vec<ImportBinding>> = FxHashMap::default();
for binding_id in scope.binding_ids() {
let binding = checker.semantic().binding(binding_id);
@@ -192,7 +193,11 @@ struct ImportBinding<'a> {
}
/// Generate a [`Fix`] to remove runtime imports from a type-checking block.
fn fix_imports(checker: &Checker, statement_id: NodeId, imports: &[ImportBinding]) -> Result<Fix> {
fn fix_imports(
checker: &Checker,
statement_id: StatementId,
imports: &[ImportBinding],
) -> Result<Fix> {
let statement = checker.semantic().statement(statement_id);
let parent = checker.semantic().parent_statement(statement);

View file

@@ -5,7 +5,7 @@ use rustc_hash::FxHashMap;
use ruff_diagnostics::{AutofixKind, Diagnostic, DiagnosticKind, Fix, Violation};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::{AnyImport, Binding, Imported, NodeId, ResolvedReferenceId, Scope};
use ruff_python_semantic::{AnyImport, Binding, Imported, ResolvedReferenceId, Scope, StatementId};
use ruff_text_size::TextRange;
use crate::autofix;
@@ -190,9 +190,9 @@ pub(crate) fn typing_only_runtime_import(
diagnostics: &mut Vec<Diagnostic>,
) {
// Collect all typing-only imports by statement and import type.
let mut errors_by_statement: FxHashMap<(NodeId, ImportType), Vec<ImportBinding>> =
let mut errors_by_statement: FxHashMap<(StatementId, ImportType), Vec<ImportBinding>> =
FxHashMap::default();
let mut ignores_by_statement: FxHashMap<(NodeId, ImportType), Vec<ImportBinding>> =
let mut ignores_by_statement: FxHashMap<(StatementId, ImportType), Vec<ImportBinding>> =
FxHashMap::default();
for binding_id in scope.binding_ids() {
@@ -402,7 +402,11 @@ fn is_exempt(name: &str, exempt_modules: &[&str]) -> bool {
}
/// Generate a [`Fix`] to remove typing-only imports from a runtime context.
fn fix_imports(checker: &Checker, statement_id: NodeId, imports: &[ImportBinding]) -> Result<Fix> {
fn fix_imports(
checker: &Checker,
statement_id: StatementId,
imports: &[ImportBinding],
) -> Result<Fix> {
let statement = checker.semantic().statement(statement_id);
let parent = checker.semantic().parent_statement(statement);

View file

@@ -1,10 +1,11 @@
use std::borrow::Cow;
use anyhow::Result;
use rustc_hash::FxHashMap;
use std::borrow::Cow;
use ruff_diagnostics::{AutofixKind, Diagnostic, Fix, Violation};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_semantic::{AnyImport, Exceptions, Imported, NodeId, Scope};
use ruff_python_semantic::{AnyImport, Exceptions, Imported, Scope, StatementId};
use ruff_text_size::TextRange;
use crate::autofix;
@@ -98,8 +99,9 @@ impl Violation for UnusedImport {
pub(crate) fn unused_import(checker: &Checker, scope: &Scope, diagnostics: &mut Vec<Diagnostic>) {
// Collect all unused imports by statement.
let mut unused: FxHashMap<(NodeId, Exceptions), Vec<ImportBinding>> = FxHashMap::default();
let mut ignored: FxHashMap<(NodeId, Exceptions), Vec<ImportBinding>> = FxHashMap::default();
let mut unused: FxHashMap<(StatementId, Exceptions), Vec<ImportBinding>> = FxHashMap::default();
let mut ignored: FxHashMap<(StatementId, Exceptions), Vec<ImportBinding>> =
FxHashMap::default();
for binding_id in scope.binding_ids() {
let binding = checker.semantic().binding(binding_id);
@@ -225,7 +227,11 @@ struct ImportBinding<'a> {
}
/// Generate a [`Fix`] to remove unused imports from a statement.
fn fix_imports(checker: &Checker, statement_id: NodeId, imports: &[ImportBinding]) -> Result<Fix> {
fn fix_imports(
checker: &Checker,
statement_id: StatementId,
imports: &[ImportBinding],
) -> Result<Fix> {
let statement = checker.semantic().statement(statement_id);
let parent = checker.semantic().parent_statement(statement);

View file

@@ -1,13 +1,13 @@
use itertools::Itertools;
use ruff_python_ast::{self as ast, PySourceType, Ranged, Stmt};
use ruff_python_parser::{lexer, AsMode, Tok};
use ruff_text_size::{TextRange, TextSize};
use ruff_diagnostics::{AutofixKind, Diagnostic, Edit, Fix, Violation};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::helpers::contains_effect;
use ruff_python_ast::{self as ast, PySourceType, Ranged, Stmt};
use ruff_python_parser::{lexer, AsMode, Tok};
use ruff_python_semantic::Scope;
use ruff_source_file::Locator;
use ruff_text_size::{TextRange, TextSize};
use crate::autofix::edits::delete_stmt;
use crate::checkers::ast::Checker;

View file

@@ -3,15 +3,15 @@ use std::iter;
use ruff_python_ast::{self as ast, ExceptHandler, Stmt};
use crate::node::{NodeId, Nodes};
use crate::statements::{StatementId, Statements};
/// Return the common ancestor of `left` and `right` below `stop`, or `None`.
fn common_ancestor(
left: NodeId,
right: NodeId,
stop: Option<NodeId>,
node_tree: &Nodes<Stmt>,
) -> Option<NodeId> {
left: StatementId,
right: StatementId,
stop: Option<StatementId>,
node_tree: &Statements,
) -> Option<StatementId> {
if stop.is_some_and(|stop| left == stop || right == stop) {
return None;
}
@@ -83,13 +83,13 @@ fn alternatives(stmt: &Stmt) -> Vec<Vec<&Stmt>> {
/// Return `true` if `stmt` is a descendent of any of the nodes in `ancestors`.
fn descendant_of<'a>(
stmt: NodeId,
stmt: StatementId,
ancestors: &[&'a Stmt],
stop: NodeId,
node_tree: &Nodes<'a, Stmt>,
stop: StatementId,
node_tree: &Statements<'a>,
) -> bool {
ancestors.iter().any(|ancestor| {
node_tree.node_id(ancestor).is_some_and(|ancestor| {
node_tree.statement_id(ancestor).is_some_and(|ancestor| {
common_ancestor(stmt, ancestor, Some(stop), node_tree).is_some()
})
})
@@ -97,7 +97,7 @@ fn descendant_of<'a>(
/// Return `true` if `left` and `right` are on different branches of an `if` or
/// `try` statement.
pub fn different_forks(left: NodeId, right: NodeId, node_tree: &Nodes<Stmt>) -> bool {
pub fn different_forks(left: StatementId, right: StatementId, node_tree: &Statements) -> bool {
if let Some(ancestor) = common_ancestor(left, right, None, node_tree) {
for items in alternatives(node_tree[ancestor]) {
let l = descendant_of(left, &items, ancestor, node_tree);

View file

@@ -11,8 +11,8 @@ use ruff_text_size::TextRange;
use crate::context::ExecutionContext;
use crate::model::SemanticModel;
use crate::node::NodeId;
use crate::reference::ResolvedReferenceId;
use crate::statements::StatementId;
use crate::ScopeId;
#[derive(Debug, Clone)]
@@ -24,7 +24,7 @@ pub struct Binding<'a> {
/// The context in which the [`Binding`] was created.
pub context: ExecutionContext,
/// The statement in which the [`Binding`] was defined.
pub source: Option<NodeId>,
pub source: Option<StatementId>,
/// The references to the [`Binding`].
pub references: Vec<ResolvedReferenceId>,
/// The exceptions that were handled when the [`Binding`] was defined.

View file

@@ -0,0 +1,58 @@
use std::ops::Index;
use ruff_index::{newtype_index, IndexVec};
use ruff_python_ast::Expr;
/// Id uniquely identifying an expression in a program.
///
/// Using a `u32` is sufficient because Ruff only supports parsing documents with a size of max
/// `u32::max` and it is impossible to have more nodes than characters in the file. We use a
/// `NonZeroU32` to take advantage of memory layout optimizations.
#[newtype_index]
#[derive(Ord, PartialOrd)]
pub struct ExpressionId;
/// An [`Expr`] AST node in a program, along with a pointer to its parent expression (if any).
#[derive(Debug)]
struct ExpressionWithParent<'a> {
/// A pointer to the AST node.
node: &'a Expr,
/// The ID of the parent of this node, if any.
parent: Option<ExpressionId>,
}
/// The nodes of a program indexed by [`ExpressionId`]
#[derive(Debug, Default)]
pub struct Expressions<'a> {
nodes: IndexVec<ExpressionId, ExpressionWithParent<'a>>,
}
impl<'a> Expressions<'a> {
/// Inserts a new expression into the node tree and returns its unique id.
pub(crate) fn insert(&mut self, node: &'a Expr, parent: Option<ExpressionId>) -> ExpressionId {
self.nodes.push(ExpressionWithParent { node, parent })
}
/// Return the [`ExpressionId`] of the parent node.
#[inline]
pub fn parent_id(&self, node_id: ExpressionId) -> Option<ExpressionId> {
self.nodes[node_id].parent
}
/// Returns an iterator over all [`ExpressionId`] ancestors, starting from the given [`ExpressionId`].
pub(crate) fn ancestor_ids(
&self,
node_id: ExpressionId,
) -> impl Iterator<Item = ExpressionId> + '_ {
std::iter::successors(Some(node_id), |&node_id| self.nodes[node_id].parent)
}
}
impl<'a> Index<ExpressionId> for Expressions<'a> {
type Output = &'a Expr;
#[inline]
fn index(&self, index: ExpressionId) -> &Self::Output {
&self.nodes[index].node
}
}

View file

@@ -2,19 +2,21 @@ pub mod analyze;
mod binding;
mod context;
mod definition;
mod expressions;
mod globals;
mod model;
mod node;
mod reference;
mod scope;
mod star_import;
mod statements;
pub use binding::*;
pub use context::*;
pub use definition::*;
pub use expressions::*;
pub use globals::*;
pub use model::*;
pub use node::*;
pub use reference::*;
pub use scope::*;
pub use star_import::*;
pub use statements::*;

View file

@@ -17,13 +17,15 @@ use crate::binding::{
};
use crate::context::ExecutionContext;
use crate::definition::{Definition, DefinitionId, Definitions, Member, Module};
use crate::expressions::{ExpressionId, Expressions};
use crate::globals::{Globals, GlobalsArena};
use crate::node::{NodeId, Nodes};
use crate::reference::{
ResolvedReference, ResolvedReferenceId, ResolvedReferences, UnresolvedReferences,
ResolvedReference, ResolvedReferenceId, ResolvedReferences, UnresolvedReference,
UnresolvedReferenceFlags, UnresolvedReferences,
};
use crate::scope::{Scope, ScopeId, ScopeKind, Scopes};
use crate::{Imported, UnresolvedReference, UnresolvedReferenceFlags};
use crate::statements::{StatementId, Statements};
use crate::Imported;
/// A semantic model for a Python module, to enable querying the module's semantic information.
pub struct SemanticModel<'a> {
@@ -31,16 +33,16 @@ pub struct SemanticModel<'a> {
module_path: Option<&'a [String]>,
/// Stack of all visited statements.
statements: Nodes<'a, Stmt>,
statements: Statements<'a>,
/// The identifier of the current statement.
statement_id: Option<NodeId>,
statement_id: Option<StatementId>,
/// Stack of all visited expressions.
expressions: Nodes<'a, Expr>,
expressions: Expressions<'a>,
/// The identifier of the current expression.
expression_id: Option<NodeId>,
expression_id: Option<ExpressionId>,
/// Stack of all scopes, along with the identifier of the current scope.
pub scopes: Scopes<'a>,
@@ -132,9 +134,9 @@ impl<'a> SemanticModel<'a> {
Self {
typing_modules,
module_path: module.path(),
statements: Nodes::<Stmt>::default(),
statements: Statements::default(),
statement_id: None,
expressions: Nodes::<Expr>::default(),
expressions: Expressions::default(),
expression_id: None,
scopes: Scopes::default(),
scope_id: ScopeId::global(),
@@ -919,20 +921,20 @@ impl<'a> SemanticModel<'a> {
None
}
/// Return the [`Nodes`] vector of all statements.
pub const fn statements(&self) -> &Nodes<'a, Stmt> {
/// Return the [`Statements`] vector of all statements.
pub const fn statements(&self) -> &Statements<'a> {
&self.statements
}
/// Return the [`NodeId`] corresponding to the given [`Stmt`].
/// Return the [`StatementId`] corresponding to the given [`Stmt`].
#[inline]
pub fn statement_id(&self, statement: &Stmt) -> Option<NodeId> {
self.statements.node_id(statement)
pub fn statement_id(&self, statement: &Stmt) -> Option<StatementId> {
self.statements.statement_id(statement)
}
/// Return the [`Stmt]` corresponding to the given [`NodeId`].
/// Return the [`Stmt]` corresponding to the given [`StatementId`].
#[inline]
pub fn statement(&self, statement_id: NodeId) -> &'a Stmt {
pub fn statement(&self, statement_id: StatementId) -> &'a Stmt {
self.statements[statement_id]
}
@@ -1519,8 +1521,8 @@ impl SemanticModelFlags {
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Snapshot {
scope_id: ScopeId,
stmt_id: Option<NodeId>,
expr_id: Option<NodeId>,
stmt_id: Option<StatementId>,
expr_id: Option<ExpressionId>,
definition_id: DefinitionId,
flags: SemanticModelFlags,
}

View file

@@ -1,105 +0,0 @@
use std::ops::{Index, IndexMut};
use ruff_index::{newtype_index, IndexVec};
use rustc_hash::FxHashMap;
use ruff_python_ast::types::RefEquality;
/// Id uniquely identifying an AST node in a program.
///
/// Using a `u32` is sufficient because Ruff only supports parsing documents with a size of max `u32::max`
/// and it is impossible to have more nodes than characters in the file. We use a `NonZeroU32` to
/// take advantage of memory layout optimizations.
#[newtype_index]
#[derive(Ord, PartialOrd)]
pub struct NodeId;
/// A [`Node`] represents an AST node in a program, along with a pointer to its parent (if any).
#[derive(Debug)]
struct Node<'a, T> {
/// A pointer to the AST node.
node: &'a T,
/// The ID of the parent of this node, if any.
parent: Option<NodeId>,
/// The depth of this node in the tree.
depth: u32,
}
/// The nodes of a program indexed by [`NodeId`]
#[derive(Debug)]
pub struct Nodes<'a, T> {
nodes: IndexVec<NodeId, Node<'a, T>>,
node_to_id: FxHashMap<RefEquality<'a, T>, NodeId>,
}
impl<'a, T> Default for Nodes<'a, T> {
fn default() -> Self {
Self {
nodes: IndexVec::default(),
node_to_id: FxHashMap::default(),
}
}
}
impl<'a, T> Nodes<'a, T> {
/// Inserts a new node into the node tree and returns its unique id.
///
/// Panics if a node with the same pointer already exists.
pub(crate) fn insert(&mut self, node: &'a T, parent: Option<NodeId>) -> NodeId {
let next_id = self.nodes.next_index();
if let Some(existing_id) = self.node_to_id.insert(RefEquality(node), next_id) {
panic!("Node already exists with id {existing_id:?}");
}
self.nodes.push(Node {
node,
parent,
depth: parent.map_or(0, |parent| self.nodes[parent].depth + 1),
})
}
/// Returns the [`NodeId`] of the given node.
#[inline]
pub fn node_id(&self, node: &'a T) -> Option<NodeId> {
self.node_to_id.get(&RefEquality(node)).copied()
}
/// Return the [`NodeId`] of the parent node.
#[inline]
pub fn parent_id(&self, node_id: NodeId) -> Option<NodeId> {
self.nodes[node_id].parent
}
/// Return the parent of the given node.
pub fn parent(&self, node: &'a T) -> Option<&'a T> {
let node_id = self.node_to_id.get(&RefEquality(node))?;
let parent_id = self.nodes[*node_id].parent?;
Some(self[parent_id])
}
/// Return the depth of the node.
#[inline]
pub(crate) fn depth(&self, node_id: NodeId) -> u32 {
self.nodes[node_id].depth
}
/// Returns an iterator over all [`NodeId`] ancestors, starting from the given [`NodeId`].
pub(crate) fn ancestor_ids(&self, node_id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
std::iter::successors(Some(node_id), |&node_id| self.nodes[node_id].parent)
}
}
impl<'a, T> Index<NodeId> for Nodes<'a, T> {
type Output = &'a T;
#[inline]
fn index(&self, index: NodeId) -> &Self::Output {
&self.nodes[index].node
}
}
impl<'a, T> IndexMut<NodeId> for Nodes<'a, T> {
#[inline]
fn index_mut(&mut self, index: NodeId) -> &mut Self::Output {
&mut self.nodes[index].node
}
}

View file

@@ -0,0 +1,94 @@
use std::ops::Index;
use ruff_index::{newtype_index, IndexVec};
use ruff_python_ast::Stmt;
use rustc_hash::FxHashMap;
use ruff_python_ast::types::RefEquality;
/// Id uniquely identifying a statement AST node.
///
/// Using a `u32` is sufficient because Ruff only supports parsing documents with a size of max
/// `u32::max` and it is impossible to have more nodes than characters in the file. We use a
/// `NonZeroU32` to take advantage of memory layout optimizations.
#[newtype_index]
#[derive(Ord, PartialOrd)]
pub struct StatementId;
/// A [`Stmt`] AST node, along with a pointer to its parent statement (if any).
#[derive(Debug)]
struct StatementWithParent<'a> {
/// A pointer to the AST node.
statement: &'a Stmt,
/// The ID of the parent of this node, if any.
parent: Option<StatementId>,
/// The depth of this node in the tree.
depth: u32,
}
/// The statements of a program indexed by [`StatementId`]
#[derive(Debug, Default)]
pub struct Statements<'a> {
statements: IndexVec<StatementId, StatementWithParent<'a>>,
statement_to_id: FxHashMap<RefEquality<'a, Stmt>, StatementId>,
}
impl<'a> Statements<'a> {
/// Inserts a new statement into the statement vector and returns its unique ID.
///
/// Panics if a statement with the same pointer already exists.
pub(crate) fn insert(
&mut self,
statement: &'a Stmt,
parent: Option<StatementId>,
) -> StatementId {
let next_id = self.statements.next_index();
if let Some(existing_id) = self.statement_to_id.insert(RefEquality(statement), next_id) {
panic!("Statements already exists with ID: {existing_id:?}");
}
self.statements.push(StatementWithParent {
statement,
parent,
depth: parent.map_or(0, |parent| self.statements[parent].depth + 1),
})
}
/// Returns the [`StatementId`] of the given statement.
#[inline]
pub fn statement_id(&self, statement: &'a Stmt) -> Option<StatementId> {
self.statement_to_id.get(&RefEquality(statement)).copied()
}
/// Return the [`StatementId`] of the parent statement.
#[inline]
pub fn parent_id(&self, statement_id: StatementId) -> Option<StatementId> {
self.statements[statement_id].parent
}
/// Return the parent of the given statement.
pub fn parent(&self, statement: &'a Stmt) -> Option<&'a Stmt> {
let statement_id = self.statement_to_id.get(&RefEquality(statement))?;
let parent_id = self.statements[*statement_id].parent?;
Some(self[parent_id])
}
/// Return the depth of the statement.
#[inline]
pub(crate) fn depth(&self, id: StatementId) -> u32 {
self.statements[id].depth
}
/// Returns an iterator over all [`StatementId`] ancestors, starting from the given [`StatementId`].
pub(crate) fn ancestor_ids(&self, id: StatementId) -> impl Iterator<Item = StatementId> + '_ {
std::iter::successors(Some(id), |&id| self.statements[id].parent)
}
}
impl<'a> Index<StatementId> for Statements<'a> {
type Output = &'a Stmt;
#[inline]
fn index(&self, index: StatementId) -> &Self::Output {
&self.statements[index].statement
}
}