ruff/crates/ruff_python_ast/src/source_code/indexer.rs
Charlie Marsh 4782675bf9
Remove lexer-based comment range detection (#5785)
## Summary

I'm doing some unrelated profiling, and I noticed that this method is
actually measurable on the CPython benchmark -- it's > 1% of execution
time. We don't need to lex here, we already know the ranges of all
comments, so we can just do a simple binary search for overlap, which
brings the method down to 0%.

## Test Plan

`cargo test`
2023-07-16 01:03:27 +00:00

284 lines
9.2 KiB
Rust

//! Struct used to index source code, to enable efficient lookup of tokens that
//! are omitted from the AST (e.g., commented lines).
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::lexer::LexResult;
use rustpython_parser::{StringKind, Tok};
use crate::source_code::comment_ranges::{CommentRanges, CommentRangesBuilder};
use crate::source_code::Locator;
/// Index of source-code information that is omitted from the AST: comment ranges, the starts of
/// continuation lines, and the ranges of triple-quoted strings and f-strings.
pub struct Indexer {
/// The ranges of the comments in the source document.
comment_ranges: CommentRanges,
/// Stores the start offset of continuation lines.
continuation_lines: Vec<TextSize>,
/// The range of all triple quoted strings in the source document. The ranges are sorted by their
/// [`TextRange::start`] position in increasing order. No two ranges are overlapping.
triple_quoted_string_ranges: Vec<TextRange>,
/// The range of all f-strings in the source document. The ranges are sorted by their
/// [`TextRange::start`] position in increasing order. No two ranges are overlapping.
f_string_ranges: Vec<TextRange>,
}
impl Indexer {
    /// Builds an [`Indexer`] from the lexed `tokens` of the source code held by `locator`.
    ///
    /// Lex errors in `tokens` are skipped; only successfully lexed tokens are indexed.
    ///
    /// # Panics
    ///
    /// Panics if the length of the source code does not fit into a [`TextSize`].
    pub fn from_tokens(tokens: &[LexResult], locator: &Locator) -> Self {
        assert!(TextSize::try_from(locator.contents().len()).is_ok());

        let mut comment_ranges_builder = CommentRangesBuilder::default();
        let mut continuation_lines = Vec::new();
        let mut triple_quoted_string_ranges = Vec::new();
        let mut f_string_ranges = Vec::new();

        // End offset and kind of the previously visited token.
        let mut prev_end = TextSize::default();
        let mut prev_token: Option<&Tok> = None;
        // Start offset of the line that is currently being scanned.
        let mut line_start = TextSize::default();

        for (tok, range) in tokens.iter().flatten() {
            let trivia = &locator.contents()[TextRange::new(prev_end, range.start())];

            // Get the trivia between the previous and the current token and detect any newlines.
            // This is necessary because `RustPython` doesn't emit `[Tok::Newline]` tokens
            // between any two tokens that form a continuation. That's why we have to extract the
            // newlines "manually".
            for (index, text) in trivia.match_indices(['\n', '\r']) {
                // Treat `\r\n` as a single newline: skip the `\r`, the `\n` is matched next.
                if text == "\r" && trivia.as_bytes().get(index + 1) == Some(&b'\n') {
                    continue;
                }

                // Newlines after a newline never form a continuation.
                if !matches!(prev_token, Some(Tok::Newline | Tok::NonLogicalNewline)) {
                    continuation_lines.push(line_start);
                }

                // The cast cannot truncate because of the length assertion at the top of the
                // function: `index + 1` never exceeds the length of the source.
                #[allow(clippy::cast_possible_truncation)]
                {
                    line_start = prev_end + TextSize::new((index + 1) as u32);
                }
            }

            comment_ranges_builder.visit_token(tok, *range);

            match tok {
                Tok::Newline | Tok::NonLogicalNewline => {
                    line_start = range.end();
                }
                Tok::String {
                    triple_quoted,
                    kind,
                    ..
                } => {
                    // Being triple-quoted and being an f-string are independent properties, so
                    // a triple-quoted f-string must be recorded in both lists.
                    if *triple_quoted {
                        triple_quoted_string_ranges.push(*range);
                    }
                    if matches!(kind, StringKind::FString | StringKind::RawFString) {
                        f_string_ranges.push(*range);
                    }
                }
                _ => {}
            }

            prev_token = Some(tok);
            prev_end = range.end();
        }

        Self {
            comment_ranges: comment_ranges_builder.finish(),
            continuation_lines,
            triple_quoted_string_ranges,
            f_string_ranges,
        }
    }

    /// Returns the byte offset ranges of comments
    pub const fn comment_ranges(&self) -> &CommentRanges {
        &self.comment_ranges
    }

    /// Returns the line start positions of continuations (backslash).
    pub fn continuation_line_starts(&self) -> &[TextSize] {
        &self.continuation_lines
    }

    /// Returns `true` if the given offset is part of a continuation line.
    ///
    /// The start of the line containing `offset` is resolved through `locator` and looked up in
    /// the recorded continuation line starts.
    pub fn is_continuation(&self, offset: TextSize, locator: &Locator) -> bool {
        let line_start = locator.line_start(offset);
        self.continuation_lines.binary_search(&line_start).is_ok()
    }

    /// Return the [`TextRange`] of the triple-quoted-string containing a given offset.
    ///
    /// Returns `None` if `offset` is not inside any triple-quoted string.
    pub fn triple_quoted_string_range(&self, offset: TextSize) -> Option<TextRange> {
        Self::range_containing(&self.triple_quoted_string_ranges, offset)
    }

    /// Return the [`TextRange`] of the f-string containing a given offset.
    ///
    /// Returns `None` if `offset` is not inside any f-string.
    pub fn f_string_range(&self, offset: TextSize) -> Option<TextRange> {
        Self::range_containing(&self.f_string_ranges, offset)
    }

    /// Binary-searches `ranges` for the range that contains `offset` (as decided by
    /// [`TextRange::contains`]).
    ///
    /// `ranges` must be sorted by start offset and must not overlap, otherwise the search result
    /// is unspecified.
    fn range_containing(ranges: &[TextRange], offset: TextSize) -> Option<TextRange> {
        ranges
            .binary_search_by(|range| {
                if offset < range.start() {
                    std::cmp::Ordering::Greater
                } else if range.contains(offset) {
                    std::cmp::Ordering::Equal
                } else {
                    std::cmp::Ordering::Less
                }
            })
            .ok()
            .map(|index| ranges[index])
    }
}
#[cfg(test)]
mod tests {
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::lexer::LexResult;
use rustpython_parser::{lexer, Mode};
use crate::source_code::{Indexer, Locator};
// Checks which line starts `Indexer` reports as continuation lines.
//
// NOTE(review): the expected values are offsets into `contents` (after `.trim()`), so they
// depend on the exact whitespace and blank lines inside each raw string literal. Re-derive
// them if a literal is ever reformatted.
#[test]
fn continuation() {
// A single statement on one line has no continuation.
let contents = r#"x = 1"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(&lxr, &Locator::new(contents));
assert_eq!(indexer.continuation_line_starts(), &[]);
// A comment followed by separate statements has no continuation either.
let contents = r#"
# Hello, world!
x = 1
y = 2
"#
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(&lxr, &Locator::new(contents));
assert_eq!(indexer.continuation_line_starts(), &[]);
// Lines ending in a backslash: at the top level, after `if True:`, and inside parentheses.
let contents = r#"
x = \
1
if True:
z = \
\
2
(
"abc" # Foo
"def" \
"ghi"
)
"#
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.continuation_line_starts(),
[
// row 1
TextSize::from(0),
// row 5
TextSize::from(22),
// row 6
TextSize::from(32),
// row 11
TextSize::from(71),
]
);
// Semicolon-separated statements: only the two lines ending in a backslash are expected.
let contents = r"
x = 1; import sys
import os
if True:
x = 1; import sys
import os
if True:
x = 1; \
import os
x = 1; \
import os
"
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.continuation_line_starts(),
[
// row 9
TextSize::from(84),
// row 12
TextSize::from(116)
]
);
}
// Checks the ranges that `Indexer` records for triple-quoted strings.
//
// NOTE(review): the expected ranges are offsets into `contents` (not trimmed here), so they
// depend on the exact whitespace inside each raw string literal. Re-derive them if a literal
// is ever reformatted.
#[test]
fn string_ranges() {
// A single-quoted string is not recorded.
let contents = r#""this is a single-quoted string""#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(indexer.triple_quoted_string_ranges, []);
// One triple-quoted string yields one range.
let contents = r#"
"""
this is a multiline string
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.triple_quoted_string_ranges,
[TextRange::new(TextSize::from(13), TextSize::from(71))]
);
// `'''` delimiters nested inside a `"""` string still yield a single range.
let contents = r#"
"""
'''this is a multiline string with multiple delimiter types'''
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.triple_quoted_string_ranges,
[TextRange::new(TextSize::from(13), TextSize::from(107))]
);
// Two separate triple-quoted strings yield two ranges, in source order.
let contents = r#"
"""
this is one
multiline string
"""
"""
and this is
another
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.triple_quoted_string_ranges,
&[
TextRange::new(TextSize::from(13), TextSize::from(85)),
TextRange::new(TextSize::from(98), TextSize::from(161))
]
);
}
}