ruff/crates/ruff_python_index/src/indexer.rs
Dhruv Manilawala 549cc1e437
Build CommentRanges outside the parser (#11792)
## Summary

This PR updates the parser so that it no longer builds the `CommentRanges`;
instead, they are built by the linter and the formatter when
required.

For the linter, it'll be built and owned by the `Indexer` while for the
formatter it'll be built from the `Tokens` struct and passed as an
argument.

## Test Plan

`cargo insta test`
2024-06-09 09:55:17 +00:00

540 lines
17 KiB
Rust

//! Struct used to index source code, to enable efficient lookup of tokens that
//! are omitted from the AST (e.g., commented lines).
use ruff_python_ast::Stmt;
use ruff_python_parser::{TokenKind, Tokens};
use ruff_python_trivia::{
has_leading_content, has_trailing_content, is_python_whitespace, CommentRanges,
};
use ruff_source_file::Locator;
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::fstring_ranges::{FStringRanges, FStringRangesBuilder};
use crate::multiline_ranges::{MultilineRanges, MultilineRangesBuilder};
/// Index over a source document, built from its token stream by [`Indexer::from_tokens`].
///
/// It records the start offsets of continuation lines together with the ranges of f-strings,
/// multiline strings, and comments.
pub struct Indexer {
/// Stores the start offset of continuation lines.
continuation_lines: Vec<TextSize>,
/// The ranges of all f-strings in the source document.
fstring_ranges: FStringRanges,
/// The range of all multiline strings in the source document.
multiline_ranges: MultilineRanges,
/// The range of all comments in the source document.
comment_ranges: CommentRanges,
}
impl Indexer {
/// Builds an [`Indexer`] from the token stream of a source file.
///
/// Walks the tokens yielded by `tokens.up_to_first_unknown()` and, for each one:
/// - records the start of the current line for every line break found in the text *between*
///   the previous token and this one (these are the continuation lines),
/// - feeds the token to the f-string and multiline-string range builders,
/// - collects the token's range if it is a comment.
///
/// Comments among the tokens after the last visited one are collected as well.
///
/// # Panics
///
/// Panics if the length of the source does not fit into a [`TextSize`].
pub fn from_tokens(tokens: &Tokens, locator: &Locator<'_>) -> Self {
    // Every offset computed below must be representable as a `TextSize`.
    assert!(TextSize::try_from(locator.contents().len()).is_ok());

    let mut fstrings = FStringRangesBuilder::default();
    let mut multiline_strings = MultilineRangesBuilder::default();
    let mut continuation_lines = Vec::new();
    let mut comments = Vec::new();

    // End offset of the token visited in the previous iteration.
    let mut previous_token_end = TextSize::default();
    // Start offset of the line that is currently being scanned.
    let mut current_line_start = TextSize::default();

    for token in tokens.up_to_first_unknown() {
        // No newline token is emitted between two tokens that are joined by a continuation,
        // so the line breaks have to be dug out of the text separating the tokens.
        let gap = locator.slice(TextRange::new(previous_token_end, token.start()));

        for (position, line_break) in gap.match_indices(['\n', '\r']) {
            // The `\r` of a `\r\n` pair is skipped; the pair is handled once, at its `\n`.
            let is_cr_of_crlf =
                line_break == "\r" && gap.as_bytes().get(position + 1) == Some(&b'\n');
            if !is_cr_of_crlf {
                continuation_lines.push(current_line_start);

                // The cast cannot truncate because of the length assertion at the top of
                // the function.
                #[allow(clippy::cast_possible_truncation)]
                let line_break_end = TextSize::new((position + 1) as u32);
                current_line_start = previous_token_end + line_break_end;
            }
        }

        fstrings.visit_token(token);
        multiline_strings.visit_token(token);

        let kind = token.kind();
        if matches!(kind, TokenKind::Newline | TokenKind::NonLogicalNewline) {
            current_line_start = token.end();
        } else if matches!(kind, TokenKind::String) {
            // A string token can span multiple lines, so look up the start of the line that
            // contains its closing delimiter.
            current_line_start = locator.line_start(token.end());
        } else if matches!(kind, TokenKind::Comment) {
            comments.push(token.range());
        }

        previous_token_end = token.end();
    }

    // TODO(dhruvmanila): This is temporary until Ruff becomes error resilient. To understand
    // why this is required, refer to https://github.com/astral-sh/ruff/pull/11457#issuecomment-2144990269
    // which was released at the time of this writing. Now we can't just revert that behavior,
    // so we need to visit the remaining tokens if there are any for the comment ranges.
    for token in tokens.after(previous_token_end) {
        if matches!(token.kind(), TokenKind::Comment) {
            comments.push(token.range());
        }
    }

    Self {
        continuation_lines,
        fstring_ranges: fstrings.finish(),
        multiline_ranges: multiline_strings.finish(),
        comment_ranges: CommentRanges::new(comments),
    }
}
/// Returns the byte offset ranges of comments.
pub const fn comment_ranges(&self) -> &CommentRanges {
&self.comment_ranges
}
/// Returns the byte offset ranges of f-strings.
pub const fn fstring_ranges(&self) -> &FStringRanges {
&self.fstring_ranges
}
/// Returns the byte offset ranges of multiline strings.
pub const fn multiline_ranges(&self) -> &MultilineRanges {
&self.multiline_ranges
}
/// Gives access to the recorded start offsets of continuation lines (lines ending in a
/// backslash).
pub fn continuation_line_starts(&self) -> &[TextSize] {
    self.continuation_lines.as_slice()
}
/// Reports whether `offset` lies on a line that was recorded as a continuation line.
///
/// The start of the line containing `offset` is looked up in the list of continuation-line
/// starts using a binary search.
pub fn is_continuation(&self, offset: TextSize, locator: &Locator) -> bool {
    self.continuation_lines
        .binary_search(&locator.line_start(offset))
        .is_ok()
}
/// Given an offset at the end of a line (including its newline), return the offset of the
/// continuation character at the end of that line.
///
/// Returns `None` if there is no byte before `offset` (start of the source, hence no preceding
/// line), if that byte is not a line terminator, or if the line ending there is not recorded as
/// a continuation line.
fn find_continuation(&self, offset: TextSize, locator: &Locator) -> Option<TextSize> {
    let source = locator.contents().as_bytes();

    // Offset zero has no preceding byte and therefore no preceding line. Bailing out here also
    // avoids indexing into an empty source and underflowing the subtractions below.
    let newline_pos = usize::from(offset).checked_sub(1)?;

    // Measure the line terminator that ends right before `offset`.
    let newline_len: u32 = match *source.get(newline_pos)? {
        // `\n` may be the second half of a `\r\n` pair.
        b'\n' if newline_pos > 0 && source[newline_pos - 1] == b'\r' => 2,
        b'\n' | b'\r' => 1,
        // No preceding line.
        _ => return None,
    };

    // Offset of the line's end, excluding its terminator.
    let line_end = offset - TextSize::from(newline_len);

    // On a continuation line, the character right before the terminator is the backslash.
    self.is_continuation(line_end, locator)
        .then(|| line_end - TextSize::from(1))
}
/// Locates the earliest backslash in the run of continuation lines that directly precedes the
/// node starting at `offset`.
///
/// Returns `None` when anything other than whitespace sits between the start of the line and
/// `offset`, or when the preceding line does not end in a continuation.
///
/// For example, given:
/// ```python
/// x = 1; \
/// y = 2
/// ```
///
/// passing the offset of `y` yields the offset of the backslash that ends the first line.
///
/// The same holds when several continuations are stacked:
/// ```python
/// x = 1; \
/// \
/// y = 2;
/// ```
///
/// Here, too, the offset of `y` maps to the backslash at the end of the first line.
pub fn preceded_by_continuations(
    &self,
    offset: TextSize,
    locator: &Locator,
) -> Option<TextSize> {
    // `true` if the source between `start` and `end` consists solely of Python whitespace.
    let only_whitespace_between = |start: TextSize, end: TextSize| {
        locator
            .slice(TextRange::new(start, end))
            .chars()
            .all(is_python_whitespace)
    };

    // A continuation can only lead into `offset` if `offset` is the first non-whitespace
    // position on its line.
    let line_start = locator.line_start(offset);
    if !only_whitespace_between(line_start, offset) {
        return None;
    }

    let mut earliest = self.find_continuation(line_start, locator)?;

    // Keep walking upwards for as long as the line holding the current backslash contains
    // nothing but that backslash and is itself preceded by a continuation.
    loop {
        let line_start = locator.line_start(earliest);
        if !only_whitespace_between(line_start, earliest) {
            break;
        }
        match self.find_continuation(line_start, locator) {
            Some(previous) => earliest = previous,
            None => break,
        }
    }

    Some(earliest)
}
/// Reports whether a [`Stmt`] looks like it follows other statements on a multi-statement
/// line: either there is content before it on its own line, or it is reached through one or
/// more continuation lines.
pub fn preceded_by_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
    let start = stmt.start();
    if has_leading_content(start, locator) {
        return true;
    }
    self.preceded_by_continuations(start, locator).is_some()
}
/// Reports whether a [`Stmt`] looks like it is followed by other statements on a
/// multi-statement line, i.e., whether there is trailing content after its end.
pub fn followed_by_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
    let stmt_end = stmt.end();
    has_trailing_content(stmt_end, locator)
}
/// Reports whether a [`Stmt`] looks like it shares a multi-statement line with other
/// statements, either after it or before it.
pub fn in_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
    if self.followed_by_multi_statement_line(stmt, locator) {
        return true;
    }
    self.preceded_by_multi_statement_line(stmt, locator)
}
}
#[cfg(test)]
mod tests {
use ruff_python_parser::parse_module;
use ruff_source_file::Locator;
use ruff_text_size::{TextRange, TextSize};
use crate::Indexer;
/// Parses `contents` as a module and builds an [`Indexer`] from the resulting tokens.
///
/// Panics if `contents` fails to parse.
fn new_indexer(contents: &str) -> Indexer {
    let module = parse_module(contents).unwrap();
    Indexer::from_tokens(module.tokens(), &Locator::new(contents))
}
/// Checks the offsets reported by `continuation_line_starts` for a series of fixtures.
///
/// NOTE(review): the expected offsets and the `row N` labels depend on the exact blank lines
/// and indentation inside each fixture literal -- confirm the literals' whitespace is intact
/// before trusting or editing the numbers.
#[test]
fn continuation() {
// A single statement: no continuation lines are expected.
let contents = r"x = 1";
assert_eq!(new_indexer(contents).continuation_line_starts(), &[]);
// A comment followed by plain statements: still no continuation lines.
let contents = r"
# Hello, world!
x = 1
y = 2
"
.trim();
assert_eq!(new_indexer(contents).continuation_line_starts(), &[]);
// Backslash continuations in an assignment, inside an `if` body, and between implicitly
// concatenated strings within parentheses.
let contents = r#"
x = \
1
if True:
z = \
\
2
(
"abc" # Foo
"def" \
"ghi"
)
"#
.trim();
assert_eq!(
new_indexer(contents).continuation_line_starts(),
[
// row 1
TextSize::from(0),
// row 5
TextSize::from(22),
// row 6
TextSize::from(32),
// row 11
TextSize::from(71),
]
);
// Statements separated by `;`, with and without a trailing backslash: only the lines that
// end in a backslash are expected.
let contents = r"
x = 1; import sys
import os
if True:
x = 1; import sys
import os
if True:
x = 1; \
import os
x = 1; \
import os
"
.trim();
assert_eq!(
new_indexer(contents).continuation_line_starts(),
[
// row 9
TextSize::from(84),
// row 12
TextSize::from(116)
]
);
// Backslash continuations inside f-string replacement fields, including a nested f-string.
let contents = r"
f'foo { 'str1' \
'str2' \
'str3'
f'nested { 'str4'
'str5' \
'str6'
}'
}'
"
.trim();
assert_eq!(
new_indexer(contents).continuation_line_starts(),
[
// row 1
TextSize::new(0),
// row 2
TextSize::new(17),
// row 5
TextSize::new(63),
]
);
// Several backslash lines between the operands of a parenthesized expression.
let contents = r"
x = (
1
\
\
\
\
+ 2)
"
.trim();
assert_eq!(
new_indexer(contents).continuation_line_starts(),
[
// row 3
TextSize::new(12),
// row 4
TextSize::new(18),
// row 5
TextSize::new(24),
// row 7
TextSize::new(31),
]
);
}
/// Checks the ranges yielded by `fstring_ranges().values()` for a plain f-string, f-strings
/// nested inside one another, and two implicitly concatenated f-strings.
#[test]
fn test_f_string_ranges() {
let contents = r#"
f"normal f-string"
f"start {f"inner {f"another"}"} end"
f"implicit " f"concatenation"
"#
.trim();
// Six ranges are expected; the nested f-strings each get their own entry.
assert_eq!(
new_indexer(contents)
.fstring_ranges()
.values()
.copied()
.collect::<Vec<_>>(),
&[
TextRange::new(TextSize::from(0), TextSize::from(18)),
TextRange::new(TextSize::from(19), TextSize::from(55)),
TextRange::new(TextSize::from(28), TextSize::from(49)),
TextRange::new(TextSize::from(37), TextSize::from(47)),
TextRange::new(TextSize::from(56), TextSize::from(68)),
TextRange::new(TextSize::from(69), TextSize::from(85)),
]
);
}
/// Checks the ranges yielded by `fstring_ranges().values()` for triple-quoted f-strings,
/// including one that contains a nested triple-quoted f-string.
#[test]
fn test_triple_quoted_f_string_ranges() {
let contents = r#"
f"""
this is one
multiline f-string
"""
f'''
and this is
another
'''
f"""
this is a {f"""nested multiline
f-string"""}
"""
"#
.trim();
// Four ranges are expected: the three outer f-strings plus the nested one.
assert_eq!(
new_indexer(contents)
.fstring_ranges()
.values()
.copied()
.collect::<Vec<_>>(),
&[
TextRange::new(TextSize::from(0), TextSize::from(39)),
TextRange::new(TextSize::from(40), TextSize::from(68)),
TextRange::new(TextSize::from(69), TextSize::from(122)),
TextRange::new(TextSize::from(85), TextSize::from(117)),
]
);
}
/// Checks `fstring_ranges().innermost(offset)` and `fstring_ranges().outermost(offset)` for a
/// table of offsets that fall inside plain, nested, and multi-line f-strings.
#[test]
fn test_fstring_innermost_outermost() {
let contents = r#"
f"no nested f-string"
if True:
f"first {f"second {f"third"} second"} first"
foo = "normal string"
f"implicit " f"concatenation"
f"first line {
foo + f"second line {bar}"
} third line"
f"""this is a
multi-line {f"""nested
f-string"""}
the end"""
"#
.trim();
let indexer = new_indexer(contents);
// For reference, the ranges of the f-strings in the above code are as
// follows where the ones inside parentheses are nested f-strings:
//
// [0..21, (36..80, 45..72, 55..63), 108..120, 121..137, (139..198, 164..184), (200..260, 226..248)]
//
// Each entry below is `(offset, expected innermost range, expected outermost range)`.
for (offset, innermost_range, outermost_range) in [
// Inside a normal f-string
(
TextSize::new(130),
TextRange::new(TextSize::new(121), TextSize::new(137)),
TextRange::new(TextSize::new(121), TextSize::new(137)),
),
// Left boundary
(
TextSize::new(121),
TextRange::new(TextSize::new(121), TextSize::new(137)),
TextRange::new(TextSize::new(121), TextSize::new(137)),
),
// Right boundary
(
TextSize::new(136), // End offsets are exclusive
TextRange::new(TextSize::new(121), TextSize::new(137)),
TextRange::new(TextSize::new(121), TextSize::new(137)),
),
// "first" left
(
TextSize::new(40),
TextRange::new(TextSize::new(36), TextSize::new(80)),
TextRange::new(TextSize::new(36), TextSize::new(80)),
),
// "second" left
(
TextSize::new(50),
TextRange::new(TextSize::new(45), TextSize::new(72)),
TextRange::new(TextSize::new(36), TextSize::new(80)),
),
// "third"
(
TextSize::new(60),
TextRange::new(TextSize::new(55), TextSize::new(63)),
TextRange::new(TextSize::new(36), TextSize::new(80)),
),
// "second" right
(
TextSize::new(70),
TextRange::new(TextSize::new(45), TextSize::new(72)),
TextRange::new(TextSize::new(36), TextSize::new(80)),
),
// "first" right
(
TextSize::new(75),
TextRange::new(TextSize::new(36), TextSize::new(80)),
TextRange::new(TextSize::new(36), TextSize::new(80)),
),
// Single-quoted f-strings spanning across multiple lines
(
TextSize::new(160),
TextRange::new(TextSize::new(139), TextSize::new(198)),
TextRange::new(TextSize::new(139), TextSize::new(198)),
),
(
TextSize::new(170),
TextRange::new(TextSize::new(164), TextSize::new(184)),
TextRange::new(TextSize::new(139), TextSize::new(198)),
),
// Multi-line f-strings
(
TextSize::new(220),
TextRange::new(TextSize::new(200), TextSize::new(260)),
TextRange::new(TextSize::new(200), TextSize::new(260)),
),
(
TextSize::new(240),
TextRange::new(TextSize::new(226), TextSize::new(248)),
TextRange::new(TextSize::new(200), TextSize::new(260)),
),
] {
// Both lookups must succeed for every offset in the table.
assert_eq!(
indexer.fstring_ranges().innermost(offset).unwrap(),
innermost_range
);
assert_eq!(
indexer.fstring_ranges().outermost(offset).unwrap(),
outermost_range
);
}
}
}