mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-22 03:14:41 +00:00

## Summary This PR updates the linter, specifically the token-based rules, to work on the tokens that come after a syntax error. For context, the token-based rules only diagnose the tokens up to the first lexical error. This PR builds up an error resilience by introducing a `TokenIterWithContext` which updates the `nesting` level and tries to reflect it with what the lexer is seeing. This isn't 100% accurate because if the parser recovered from an unclosed parenthesis in the middle of the line, the context won't reduce the nesting level until it sees the newline token at the end of the line. resolves: #11915 ## Test Plan * Add test cases for a bunch of rules that are affected by this change. * Run the fuzzer for a long time, making sure to fix any other bugs.
530 lines
16 KiB
Rust
530 lines
16 KiB
Rust
//! Struct used to index source code, to enable efficient lookup of tokens that
|
|
//! are omitted from the AST (e.g., commented lines).
|
|
|
|
use ruff_python_ast::Stmt;
|
|
use ruff_python_parser::{TokenKind, Tokens};
|
|
use ruff_python_trivia::{
|
|
has_leading_content, has_trailing_content, is_python_whitespace, CommentRanges,
|
|
};
|
|
use ruff_source_file::Locator;
|
|
use ruff_text_size::{Ranged, TextRange, TextSize};
|
|
|
|
use crate::fstring_ranges::{FStringRanges, FStringRangesBuilder};
|
|
use crate::multiline_ranges::{MultilineRanges, MultilineRangesBuilder};
|
|
|
|
pub struct Indexer {
|
|
/// Stores the start offset of continuation lines.
|
|
continuation_lines: Vec<TextSize>,
|
|
|
|
/// The range of all f-string in the source document.
|
|
fstring_ranges: FStringRanges,
|
|
|
|
/// The range of all multiline strings in the source document.
|
|
multiline_ranges: MultilineRanges,
|
|
|
|
/// The range of all comments in the source document.
|
|
comment_ranges: CommentRanges,
|
|
}
|
|
|
|
impl Indexer {
|
|
pub fn from_tokens(tokens: &Tokens, locator: &Locator<'_>) -> Self {
|
|
assert!(TextSize::try_from(locator.contents().len()).is_ok());
|
|
|
|
let mut fstring_ranges_builder = FStringRangesBuilder::default();
|
|
let mut multiline_ranges_builder = MultilineRangesBuilder::default();
|
|
let mut continuation_lines = Vec::new();
|
|
let mut comment_ranges = Vec::new();
|
|
|
|
// Token, end
|
|
let mut prev_end = TextSize::default();
|
|
let mut line_start = TextSize::default();
|
|
|
|
for token in tokens {
|
|
let trivia = locator.slice(TextRange::new(prev_end, token.start()));
|
|
|
|
// Get the trivia between the previous and the current token and detect any newlines.
|
|
// This is necessary because `RustPython` doesn't emit `[Tok::Newline]` tokens
|
|
// between any two tokens that form a continuation. That's why we have to extract the
|
|
// newlines "manually".
|
|
for (index, text) in trivia.match_indices(['\n', '\r']) {
|
|
if text == "\r" && trivia.as_bytes().get(index + 1) == Some(&b'\n') {
|
|
continue;
|
|
}
|
|
continuation_lines.push(line_start);
|
|
|
|
// SAFETY: Safe because of the len assertion at the top of the function.
|
|
#[allow(clippy::cast_possible_truncation)]
|
|
{
|
|
line_start = prev_end + TextSize::new((index + 1) as u32);
|
|
}
|
|
}
|
|
|
|
fstring_ranges_builder.visit_token(token);
|
|
multiline_ranges_builder.visit_token(token);
|
|
|
|
match token.kind() {
|
|
TokenKind::Newline | TokenKind::NonLogicalNewline => {
|
|
line_start = token.end();
|
|
}
|
|
TokenKind::String => {
|
|
// If the previous token was a string, find the start of the line that contains
|
|
// the closing delimiter, since the token itself can span multiple lines.
|
|
line_start = locator.line_start(token.end());
|
|
}
|
|
TokenKind::Comment => {
|
|
comment_ranges.push(token.range());
|
|
}
|
|
_ => {}
|
|
}
|
|
|
|
prev_end = token.end();
|
|
}
|
|
|
|
Self {
|
|
continuation_lines,
|
|
fstring_ranges: fstring_ranges_builder.finish(),
|
|
multiline_ranges: multiline_ranges_builder.finish(),
|
|
comment_ranges: CommentRanges::new(comment_ranges),
|
|
}
|
|
}
|
|
|
|
/// Returns the byte offset ranges of comments.
|
|
pub const fn comment_ranges(&self) -> &CommentRanges {
|
|
&self.comment_ranges
|
|
}
|
|
|
|
/// Returns the byte offset ranges of f-strings.
|
|
pub const fn fstring_ranges(&self) -> &FStringRanges {
|
|
&self.fstring_ranges
|
|
}
|
|
|
|
/// Returns the byte offset ranges of multiline strings.
|
|
pub const fn multiline_ranges(&self) -> &MultilineRanges {
|
|
&self.multiline_ranges
|
|
}
|
|
|
|
/// Returns the line start positions of continuations (backslash).
|
|
pub fn continuation_line_starts(&self) -> &[TextSize] {
|
|
&self.continuation_lines
|
|
}
|
|
|
|
/// Returns `true` if the given offset is part of a continuation line.
|
|
pub fn is_continuation(&self, offset: TextSize, locator: &Locator) -> bool {
|
|
let line_start = locator.line_start(offset);
|
|
self.continuation_lines.binary_search(&line_start).is_ok()
|
|
}
|
|
|
|
/// Given an offset at the end of a line (including newlines), return the offset of the
|
|
/// continuation at the end of that line.
|
|
fn find_continuation(&self, offset: TextSize, locator: &Locator) -> Option<TextSize> {
|
|
let newline_pos = usize::from(offset).saturating_sub(1);
|
|
|
|
// Skip the newline.
|
|
let newline_len = match locator.contents().as_bytes()[newline_pos] {
|
|
b'\n' => {
|
|
if locator
|
|
.contents()
|
|
.as_bytes()
|
|
.get(newline_pos.saturating_sub(1))
|
|
== Some(&b'\r')
|
|
{
|
|
2
|
|
} else {
|
|
1
|
|
}
|
|
}
|
|
b'\r' => 1,
|
|
// No preceding line.
|
|
_ => return None,
|
|
};
|
|
|
|
self.is_continuation(offset - TextSize::from(newline_len), locator)
|
|
.then(|| offset - TextSize::from(newline_len) - TextSize::from(1))
|
|
}
|
|
|
|
/// If the node starting at the given [`TextSize`] is preceded by at least one continuation line
|
|
/// (i.e., a line ending in a backslash), return the starting offset of the first such continuation
|
|
/// character.
|
|
///
|
|
/// For example, given:
|
|
/// ```python
|
|
/// x = 1; \
|
|
/// y = 2
|
|
/// ```
|
|
///
|
|
/// When passed the offset of `y`, this function will return the offset of the backslash at the end
|
|
/// of the first line.
|
|
///
|
|
/// Similarly, given:
|
|
/// ```python
|
|
/// x = 1; \
|
|
/// \
|
|
/// y = 2;
|
|
/// ```
|
|
///
|
|
/// When passed the offset of `y`, this function will again return the offset of the backslash at
|
|
/// the end of the first line.
|
|
pub fn preceded_by_continuations(
|
|
&self,
|
|
offset: TextSize,
|
|
locator: &Locator,
|
|
) -> Option<TextSize> {
|
|
// Find the first preceding continuation. If the offset isn't the first non-whitespace
|
|
// character on the line, then we can't have a continuation.
|
|
let previous_line_end = locator.line_start(offset);
|
|
if !locator
|
|
.slice(TextRange::new(previous_line_end, offset))
|
|
.chars()
|
|
.all(is_python_whitespace)
|
|
{
|
|
return None;
|
|
}
|
|
|
|
let mut continuation = self.find_continuation(previous_line_end, locator)?;
|
|
|
|
// Continue searching for continuations, in the unlikely event that we have multiple
|
|
// continuations in a row.
|
|
loop {
|
|
let previous_line_end = locator.line_start(continuation);
|
|
if locator
|
|
.slice(TextRange::new(previous_line_end, continuation))
|
|
.chars()
|
|
.all(is_python_whitespace)
|
|
{
|
|
if let Some(next_continuation) = self.find_continuation(previous_line_end, locator)
|
|
{
|
|
continuation = next_continuation;
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
Some(continuation)
|
|
}
|
|
|
|
/// Return `true` if a [`Stmt`] appears to be preceded by other statements in a multi-statement
|
|
/// line.
|
|
pub fn preceded_by_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
|
|
has_leading_content(stmt.start(), locator)
|
|
|| self
|
|
.preceded_by_continuations(stmt.start(), locator)
|
|
.is_some()
|
|
}
|
|
|
|
/// Return `true` if a [`Stmt`] appears to be followed by other statements in a multi-statement
|
|
/// line.
|
|
pub fn followed_by_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
|
|
has_trailing_content(stmt.end(), locator)
|
|
}
|
|
|
|
/// Return `true` if a [`Stmt`] appears to be part of a multi-statement line.
|
|
pub fn in_multi_statement_line(&self, stmt: &Stmt, locator: &Locator) -> bool {
|
|
self.followed_by_multi_statement_line(stmt, locator)
|
|
|| self.preceded_by_multi_statement_line(stmt, locator)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use ruff_python_parser::parse_module;
|
|
use ruff_source_file::Locator;
|
|
use ruff_text_size::{TextRange, TextSize};
|
|
|
|
use crate::Indexer;
|
|
|
|
fn new_indexer(contents: &str) -> Indexer {
|
|
let parsed = parse_module(contents).unwrap();
|
|
let locator = Locator::new(contents);
|
|
Indexer::from_tokens(parsed.tokens(), &locator)
|
|
}
|
|
|
|
#[test]
|
|
fn continuation() {
|
|
let contents = r"x = 1";
|
|
assert_eq!(new_indexer(contents).continuation_line_starts(), &[]);
|
|
|
|
let contents = r"
|
|
# Hello, world!
|
|
|
|
x = 1
|
|
|
|
y = 2
|
|
"
|
|
.trim();
|
|
|
|
assert_eq!(new_indexer(contents).continuation_line_starts(), &[]);
|
|
|
|
let contents = r#"
|
|
x = \
|
|
1
|
|
|
|
if True:
|
|
z = \
|
|
\
|
|
2
|
|
|
|
(
|
|
"abc" # Foo
|
|
"def" \
|
|
"ghi"
|
|
)
|
|
"#
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents).continuation_line_starts(),
|
|
[
|
|
// row 1
|
|
TextSize::from(0),
|
|
// row 5
|
|
TextSize::from(22),
|
|
// row 6
|
|
TextSize::from(32),
|
|
// row 11
|
|
TextSize::from(71),
|
|
]
|
|
);
|
|
|
|
let contents = r"
|
|
x = 1; import sys
|
|
import os
|
|
|
|
if True:
|
|
x = 1; import sys
|
|
import os
|
|
|
|
if True:
|
|
x = 1; \
|
|
import os
|
|
|
|
x = 1; \
|
|
import os
|
|
"
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents).continuation_line_starts(),
|
|
[
|
|
// row 9
|
|
TextSize::from(84),
|
|
// row 12
|
|
TextSize::from(116)
|
|
]
|
|
);
|
|
|
|
let contents = r"
|
|
f'foo { 'str1' \
|
|
'str2' \
|
|
'str3'
|
|
f'nested { 'str4'
|
|
'str5' \
|
|
'str6'
|
|
}'
|
|
}'
|
|
"
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents).continuation_line_starts(),
|
|
[
|
|
// row 1
|
|
TextSize::new(0),
|
|
// row 2
|
|
TextSize::new(17),
|
|
// row 5
|
|
TextSize::new(63),
|
|
]
|
|
);
|
|
|
|
let contents = r"
|
|
x = (
|
|
1
|
|
\
|
|
\
|
|
\
|
|
|
|
\
|
|
+ 2)
|
|
"
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents).continuation_line_starts(),
|
|
[
|
|
// row 3
|
|
TextSize::new(12),
|
|
// row 4
|
|
TextSize::new(18),
|
|
// row 5
|
|
TextSize::new(24),
|
|
// row 7
|
|
TextSize::new(31),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_f_string_ranges() {
|
|
let contents = r#"
|
|
f"normal f-string"
|
|
f"start {f"inner {f"another"}"} end"
|
|
f"implicit " f"concatenation"
|
|
"#
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents)
|
|
.fstring_ranges()
|
|
.values()
|
|
.copied()
|
|
.collect::<Vec<_>>(),
|
|
&[
|
|
TextRange::new(TextSize::from(0), TextSize::from(18)),
|
|
TextRange::new(TextSize::from(19), TextSize::from(55)),
|
|
TextRange::new(TextSize::from(28), TextSize::from(49)),
|
|
TextRange::new(TextSize::from(37), TextSize::from(47)),
|
|
TextRange::new(TextSize::from(56), TextSize::from(68)),
|
|
TextRange::new(TextSize::from(69), TextSize::from(85)),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_triple_quoted_f_string_ranges() {
|
|
let contents = r#"
|
|
f"""
|
|
this is one
|
|
multiline f-string
|
|
"""
|
|
f'''
|
|
and this is
|
|
another
|
|
'''
|
|
f"""
|
|
this is a {f"""nested multiline
|
|
f-string"""}
|
|
"""
|
|
"#
|
|
.trim();
|
|
assert_eq!(
|
|
new_indexer(contents)
|
|
.fstring_ranges()
|
|
.values()
|
|
.copied()
|
|
.collect::<Vec<_>>(),
|
|
&[
|
|
TextRange::new(TextSize::from(0), TextSize::from(39)),
|
|
TextRange::new(TextSize::from(40), TextSize::from(68)),
|
|
TextRange::new(TextSize::from(69), TextSize::from(122)),
|
|
TextRange::new(TextSize::from(85), TextSize::from(117)),
|
|
]
|
|
);
|
|
}
|
|
|
|
#[test]
|
|
fn test_fstring_innermost_outermost() {
|
|
let contents = r#"
|
|
f"no nested f-string"
|
|
|
|
if True:
|
|
f"first {f"second {f"third"} second"} first"
|
|
foo = "normal string"
|
|
|
|
f"implicit " f"concatenation"
|
|
|
|
f"first line {
|
|
foo + f"second line {bar}"
|
|
} third line"
|
|
|
|
f"""this is a
|
|
multi-line {f"""nested
|
|
f-string"""}
|
|
the end"""
|
|
"#
|
|
.trim();
|
|
let indexer = new_indexer(contents);
|
|
|
|
// For reference, the ranges of the f-strings in the above code are as
|
|
// follows where the ones inside parentheses are nested f-strings:
|
|
//
|
|
// [0..21, (36..80, 45..72, 55..63), 108..120, 121..137, (139..198, 164..184), (200..260, 226..248)]
|
|
|
|
for (offset, innermost_range, outermost_range) in [
|
|
// Inside a normal f-string
|
|
(
|
|
TextSize::new(130),
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
),
|
|
// Left boundary
|
|
(
|
|
TextSize::new(121),
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
),
|
|
// Right boundary
|
|
(
|
|
TextSize::new(136), // End offsets are exclusive
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
TextRange::new(TextSize::new(121), TextSize::new(137)),
|
|
),
|
|
// "first" left
|
|
(
|
|
TextSize::new(40),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
),
|
|
// "second" left
|
|
(
|
|
TextSize::new(50),
|
|
TextRange::new(TextSize::new(45), TextSize::new(72)),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
),
|
|
// "third"
|
|
(
|
|
TextSize::new(60),
|
|
TextRange::new(TextSize::new(55), TextSize::new(63)),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
),
|
|
// "second" right
|
|
(
|
|
TextSize::new(70),
|
|
TextRange::new(TextSize::new(45), TextSize::new(72)),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
),
|
|
// "first" right
|
|
(
|
|
TextSize::new(75),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
TextRange::new(TextSize::new(36), TextSize::new(80)),
|
|
),
|
|
// Single-quoted f-strings spanning across multiple lines
|
|
(
|
|
TextSize::new(160),
|
|
TextRange::new(TextSize::new(139), TextSize::new(198)),
|
|
TextRange::new(TextSize::new(139), TextSize::new(198)),
|
|
),
|
|
(
|
|
TextSize::new(170),
|
|
TextRange::new(TextSize::new(164), TextSize::new(184)),
|
|
TextRange::new(TextSize::new(139), TextSize::new(198)),
|
|
),
|
|
// Multi-line f-strings
|
|
(
|
|
TextSize::new(220),
|
|
TextRange::new(TextSize::new(200), TextSize::new(260)),
|
|
TextRange::new(TextSize::new(200), TextSize::new(260)),
|
|
),
|
|
(
|
|
TextSize::new(240),
|
|
TextRange::new(TextSize::new(226), TextSize::new(248)),
|
|
TextRange::new(TextSize::new(200), TextSize::new(260)),
|
|
),
|
|
] {
|
|
assert_eq!(
|
|
indexer.fstring_ranges().innermost(offset).unwrap(),
|
|
innermost_range
|
|
);
|
|
assert_eq!(
|
|
indexer.fstring_ranges().outermost(offset).unwrap(),
|
|
outermost_range
|
|
);
|
|
}
|
|
}
|
|
}
|