mirror of https://github.com/astral-sh/ruff.git
Modify comment_ranges slice in BackwardsTokenizer (#7432)
## Summary

I was curious to understand this issue (https://github.com/astral-sh/ruff/issues/7426) and ended up attempting to address it.

## Test Plan

`cargo test`
parent aae02cf275
commit 8d0a5e01bd

1 changed file with 21 additions and 42 deletions
```diff
@@ -743,12 +743,8 @@ impl Iterator for SimpleTokenizer<'_> {
 pub struct BackwardsTokenizer<'a> {
     offset: TextSize,
     back_offset: TextSize,
-    /// Remember if we have check for comments
-    after_newline: bool,
-    /// Not `&CommentRanges` to avoid a circular dependency
+    /// Not `&CommentRanges` to avoid a circular dependency.
     comment_ranges: &'a [TextRange],
-    /// The index the previously line ending comment
-    previous_comment_idx: Option<usize>,
     bogus: bool,
     source: &'a str,
     cursor: Cursor<'a>,
```
```diff
@@ -759,10 +755,9 @@ impl<'a> BackwardsTokenizer<'a> {
         Self {
             offset: range.start(),
             back_offset: range.end(),
-            // We could start tokenizing at a comment
-            after_newline: true,
-            comment_ranges: comment_range,
-            previous_comment_idx: None,
+            // Throw out any comments that follow the range.
+            comment_ranges: &comment_range
+                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
             bogus: false,
             source,
             cursor: Cursor::new(&source[range]),
```
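The constructor change above relies on `comment_range` being sorted by start offset: every comment that begins after `range.end()` forms a suffix of the slice, so `partition_point` can find the cut with a binary search. A minimal standalone sketch of the idea, using plain `(u32, u32)` pairs as a stand-in for ruff's `TextRange` (the helper name is hypothetical):

```rust
/// Keep only the comments that start at or before `range_end`.
///
/// Assumes `comments` is sorted by start offset; `partition_point`
/// then returns the index of the first comment whose start lies past
/// `range_end`, i.e. the length of the prefix we want to keep.
fn trim_trailing_comments(comments: &[(u32, u32)], range_end: u32) -> &[(u32, u32)] {
    &comments[..comments.partition_point(|&(start, _)| start <= range_end)]
}

fn main() {
    let comments = [(0, 10), (20, 30), (40, 50)];
    // Tokenizing only offsets 0..=25: the comment at 40..50 is unreachable.
    assert_eq!(trim_trailing_comments(&comments, 25), &[(0, 10), (20, 30)][..]);
}
```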
```diff
@@ -781,33 +776,6 @@ impl<'a> BackwardsTokenizer<'a> {
         self.cursor.start_token();
         self.back_offset = self.cursor.text_len() + self.offset;

-        if self.after_newline {
-            // This comment ended a line with a higher line number, not the current one
-            let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
-                self.comment_ranges
-                    .partition_point(|comment| comment.end() <= self.back_offset)
-            });
-            // If `previous_comment_idx == 0`, we're in a comment free region
-            if previous_comment_idx > 0 {
-                let comment = self.comment_ranges[previous_comment_idx - 1];
-                if comment.end() == self.back_offset {
-                    // Skip the comment without iterating over the chars manually
-                    self.cursor =
-                        Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
-                    debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
-                    self.after_newline = false;
-                    self.previous_comment_idx = Some(previous_comment_idx - 1);
-                    return SimpleToken {
-                        kind: SimpleTokenKind::Comment,
-                        range: comment.range(),
-                    };
-                }
-                // At least memoize the binary search
-                self.previous_comment_idx = Some(previous_comment_idx);
-            }
-            self.after_newline = false;
-        }
-
         let Some(last) = self.cursor.bump_back() else {
             return SimpleToken {
                 kind: SimpleTokenKind::EndOfFile,
```
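With the range trimmed in the constructor and, as the next hunk shows, comments popped off the tail of the slice as they are reached, the slice boundary itself records how far the backwards scan has progressed through the comments. That is why the memoized binary search above, along with the `after_newline` and `previous_comment_idx` fields that drove it, can be deleted outright.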
```diff
@@ -825,6 +793,22 @@ impl<'a> BackwardsTokenizer<'a> {
             return token;
         }

+        if let Some(comment) = self
+            .comment_ranges
+            .last()
+            .filter(|comment| comment.contains_inclusive(self.back_offset))
+        {
+            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];
+
+            // Skip the comment without iterating over the chars manually.
+            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
+            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
+            return SimpleToken {
+                kind: SimpleTokenKind::Comment,
+                range: comment.range(),
+            };
+        }
+
         let kind = match last {
             // This may not be 100% correct because it will lex-out trailing whitespace from a comment
             // as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
```
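Walking backwards, only the last remaining comment can contain the current `back_offset`, and once it is emitted the slice is shortened so each comment is inspected at most once. A hedged sketch of that pop-from-the-back pattern under the same `(u32, u32)` stand-in (hypothetical helper, not ruff's API; the end-inclusive check mirrors `contains_inclusive`):

```rust
/// If `back_offset` falls inside the last remaining comment, return the
/// comment and shrink the slice so the next call considers its predecessor.
fn pop_enclosing_comment<'a>(
    comments: &mut &'a [(u32, u32)],
    back_offset: u32,
) -> Option<(u32, u32)> {
    // Copy the shared reference out so the shortened slice keeps the
    // original lifetime instead of borrowing through the `&mut`.
    let current: &'a [(u32, u32)] = *comments;
    let &(start, end) = current.last()?;
    if start <= back_offset && back_offset <= end {
        *comments = &current[..current.len() - 1];
        Some((start, end))
    } else {
        None
    }
}

fn main() {
    let mut comments: &[(u32, u32)] = &[(0, 10), (20, 30)];
    // A reverse scan positioned at offset 30 sits at the end of 20..=30.
    assert_eq!(pop_enclosing_comment(&mut comments, 30), Some((20, 30)));
    assert_eq!(comments, &[(0, 10)][..]);
    // Offsets outside any comment leave the slice untouched.
    assert_eq!(pop_enclosing_comment(&mut comments, 15), None);
}
```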
```diff
@@ -833,14 +817,9 @@ impl<'a> BackwardsTokenizer<'a> {
                 SimpleTokenKind::Whitespace
             }

-            '\r' => {
-                self.after_newline = true;
-                SimpleTokenKind::Newline
-            }
-
+            '\r' => SimpleTokenKind::Newline,
             '\n' => {
                 self.cursor.eat_char_back('\r');
-                self.after_newline = true;
                 SimpleTokenKind::Newline
             }
             _ => self.next_token_inner(last),
```