Mirror of https://github.com/astral-sh/ruff.git, synced 2025-08-03 10:23:11 +00:00
Use CommentRanges in backwards lexing (#7360)
## Summary

The tokenizer was split into a forward and a backwards tokenizer. The backwards tokenizer uses the same method names as the forward one (e.g. `next_token`). The backwards tokenizer receives the comment ranges that we already built, and uses them to skip comments.

---------

Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
parent 1f6e1485f9
commit 2cbe1733c8
41 changed files with 744 additions and 628 deletions
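For orientation, a minimal sketch of how the pieces in this diff fit together, assuming the `ruff_python_trivia` crate layout shown below; the source string and comment offsets are illustrative:

use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
use ruff_text_size::{TextLen, TextRange, TextSize};

fn main() {
    let source = "x = (  # trailing comment\n";
    // The comment ranges come from an earlier forward pass (e.g. the lexer);
    // here the single comment `# trailing comment` spans offsets 7..25.
    let comment_ranges = [TextRange::new(TextSize::new(7), TextSize::new(25))];

    // Lex backwards from the end of the source; the tokenizer skips the
    // comment via the precomputed ranges instead of rescanning the line.
    let token = BackwardsTokenizer::up_to(source.text_len(), source, &comment_ranges)
        .skip_trivia()
        .next()
        .unwrap();
    assert_eq!(token.kind, SimpleTokenKind::LParen);
}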
@@ -16,8 +16,7 @@ license = { workspace = true }
ruff_text_size = { path = "../ruff_text_size" }
ruff_source_file = { path = "../ruff_source_file" }

memchr = { workspace = true }
smallvec = { workspace = true }
itertools = { workspace = true }
unicode-ident = { workspace = true }

[dev-dependencies]
crates/ruff_python_trivia/src/comment_ranges.rs (new file, +71)
@@ -0,0 +1,71 @@
use std::fmt::{Debug, Formatter};
use std::ops::Deref;

use itertools::Itertools;

use ruff_text_size::{Ranged, TextRange};

/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges are overlapping.
#[derive(Clone, Default)]
pub struct CommentRanges {
    raw: Vec<TextRange>,
}

impl CommentRanges {
    pub fn new(ranges: Vec<TextRange>) -> Self {
        Self { raw: ranges }
    }

    /// Returns `true` if the given range includes a comment.
    pub fn intersects(&self, target: TextRange) -> bool {
        self.raw
            .binary_search_by(|range| {
                if target.contains_range(*range) {
                    std::cmp::Ordering::Equal
                } else if range.end() < target.start() {
                    std::cmp::Ordering::Less
                } else {
                    std::cmp::Ordering::Greater
                }
            })
            .is_ok()
    }

    /// Returns the comments that are within the range
    pub fn comments_in_range(&self, range: TextRange) -> &[TextRange] {
        let start = self
            .raw
            .partition_point(|comment| comment.start() < range.start());
        // We expect there to be few comments, so switching to a linear `find` should be faster
        match self.raw[start..]
            .iter()
            .find_position(|comment| comment.end() > range.end())
        {
            Some((in_range, _element)) => &self.raw[start..start + in_range],
            None => &self.raw[start..],
        }
    }
}

impl Deref for CommentRanges {
    type Target = [TextRange];

    fn deref(&self) -> &Self::Target {
        self.raw.as_slice()
    }
}

impl Debug for CommentRanges {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        f.debug_tuple("CommentRanges").field(&self.raw).finish()
    }
}

impl<'a> IntoIterator for &'a CommentRanges {
    type Item = &'a TextRange;
    type IntoIter = std::slice::Iter<'a, TextRange>;

    fn into_iter(self) -> Self::IntoIter {
        self.raw.iter()
    }
}
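A small usage sketch of the queries above; the ranges are arbitrary, and `CommentRanges` is re-exported from `ruff_python_trivia` per the `lib.rs` change below:

use ruff_python_trivia::CommentRanges;
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Two comments, sorted by start and non-overlapping, as the type requires.
    let comments = CommentRanges::new(vec![
        TextRange::new(TextSize::new(10), TextSize::new(20)),
        TextRange::new(TextSize::new(30), TextSize::new(40)),
    ]);

    // `intersects` reports whether the target range fully contains a comment.
    assert!(comments.intersects(TextRange::new(TextSize::new(5), TextSize::new(25))));
    assert!(!comments.intersects(TextRange::new(TextSize::new(21), TextSize::new(29))));

    // `comments_in_range` returns the comments contained in the range.
    let within = comments.comments_in_range(TextRange::new(TextSize::new(0), TextSize::new(25)));
    assert_eq!(within.len(), 1);
}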
@@ -44,7 +44,7 @@ impl<'a> Cursor<'a> {
        self.chars.clone().next_back().unwrap_or(EOF_CHAR)
    }

    // SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    // SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
    #[allow(clippy::cast_possible_truncation)]
    pub fn text_len(&self) -> TextSize {
        TextSize::new(self.chars.as_str().len() as u32)
@@ -1,8 +1,10 @@
mod comment_ranges;
mod cursor;
pub mod textwrap;
mod tokenizer;
mod whitespace;

pub use comment_ranges::CommentRanges;
pub use cursor::*;
pub use tokenizer::*;
pub use whitespace::*;
@@ -1,4 +1,3 @@
use memchr::{memchr2, memchr3, memrchr3_iter};
use unicode_ident::{is_xid_continue, is_xid_start};

use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
@@ -121,6 +120,47 @@ fn is_identifier_continuation(c: char) -> bool {
    }
}

fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
        "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
@@ -421,17 +461,15 @@ impl SimpleTokenKind {
    }
}

/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
/// Simple zero allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every character after until it reaches the end of the file.
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// `true` when it is known that the current `back` line has no comment for sure.
    back_line_has_no_comment: bool,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
@@ -441,8 +479,6 @@ impl<'a> SimpleTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            back_line_has_no_comment: false,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
@@ -454,64 +490,6 @@ impl<'a> SimpleTokenizer<'a> {
        Self::new(source, range)
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
    ///
    /// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if you intend to lex backwards.
    pub fn up_to(offset: TextSize, source: &'a str) -> Self {
        Self::new(source, TextRange::up_to(offset))
    }

    /// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
    /// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
    /// because the lexer doesn't need to scan for comments.
    pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
        let mut tokenizer = Self::up_to(offset, source);
        tokenizer.back_line_has_no_comment = true;
        tokenizer
    }

    fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
        let source = &self.source[range];
        match source {
            "and" => SimpleTokenKind::And,
            "as" => SimpleTokenKind::As,
            "assert" => SimpleTokenKind::Assert,
            "async" => SimpleTokenKind::Async,
            "await" => SimpleTokenKind::Await,
            "break" => SimpleTokenKind::Break,
            "class" => SimpleTokenKind::Class,
            "continue" => SimpleTokenKind::Continue,
            "def" => SimpleTokenKind::Def,
            "del" => SimpleTokenKind::Del,
            "elif" => SimpleTokenKind::Elif,
            "else" => SimpleTokenKind::Else,
            "except" => SimpleTokenKind::Except,
            "finally" => SimpleTokenKind::Finally,
            "for" => SimpleTokenKind::For,
            "from" => SimpleTokenKind::From,
            "global" => SimpleTokenKind::Global,
            "if" => SimpleTokenKind::If,
            "import" => SimpleTokenKind::Import,
            "in" => SimpleTokenKind::In,
            "is" => SimpleTokenKind::Is,
            "lambda" => SimpleTokenKind::Lambda,
            "nonlocal" => SimpleTokenKind::Nonlocal,
            "not" => SimpleTokenKind::Not,
            "or" => SimpleTokenKind::Or,
            "pass" => SimpleTokenKind::Pass,
            "raise" => SimpleTokenKind::Raise,
            "return" => SimpleTokenKind::Return,
            "try" => SimpleTokenKind::Try,
            "while" => SimpleTokenKind::While,
            "match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
            "type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
            "case" => SimpleTokenKind::Case,
            "with" => SimpleTokenKind::With,
            "yield" => SimpleTokenKind::Yield,
            _ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
        }
    }

    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

@@ -523,6 +501,7 @@ impl<'a> SimpleTokenizer<'a> {
        };

        if self.bogus {
            // Emit a single final bogus token
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::at(self.offset, first.text_len()),
@@ -532,14 +511,29 @@
            return token;
        }

        let kind = match first {
        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = self.to_keyword_or_other(range);
                let kind = to_keyword_or_other(&self.source[range]);

                if kind == SimpleTokenKind::Other {
                    self.bogus = true;
@@ -717,24 +711,102 @@ impl<'a> SimpleTokenizer<'a> {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        };

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }
}

    /// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
    /// because it needs to check if the line has any comments when encountering any non-trivia token.
    pub fn next_token_back(&mut self) -> SimpleToken {
    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}

impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

/// Simple zero allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// Remember if we have checked for comments
    after_newline: bool,
    /// Not `&CommentRanges` to avoid a circular dependency
    comment_ranges: &'a [TextRange],
    /// The index of the previous line-ending comment
    previous_comment_idx: Option<usize>,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> BackwardsTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // We could start tokenizing at a comment
            after_newline: true,
            comment_ranges: comment_range,
            previous_comment_idx: None,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        if self.after_newline {
            // This comment ended a line with a higher line number, not the current one
            let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
                self.comment_ranges
                    .partition_point(|comment| comment.end() <= self.back_offset)
            });
            // If `previous_comment_idx == 0`, we're in a comment-free region
            if previous_comment_idx > 0 {
                let comment = self.comment_ranges[previous_comment_idx - 1];
                if comment.end() == self.back_offset {
                    // Skip the comment without iterating over the chars manually
                    self.cursor =
                        Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
                    debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
                    self.after_newline = false;
                    self.previous_comment_idx = Some(previous_comment_idx - 1);
                    return SimpleToken {
                        kind: SimpleTokenKind::Comment,
                        range: comment.range(),
                    };
                }
                // At least memoize the binary search
                self.previous_comment_idx = Some(previous_comment_idx);
            }
            self.after_newline = false;
        }

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
@@ -762,322 +834,132 @@ impl<'a> SimpleTokenizer<'a> {
            }

            '\r' => {
                self.back_line_has_no_comment = false;
                self.after_newline = true;
                SimpleTokenKind::Newline
            }

            '\n' => {
                self.back_line_has_no_comment = false;
                self.cursor.eat_char_back('\r');
                self.after_newline = true;
                SimpleTokenKind::Newline
            }

            // Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
            '#' => SimpleTokenKind::Comment,

            // For all other tokens, test if the character isn't part of a comment.
            c => {
                // Skip the test whether there's a preceding comment if it has been performed before.
                let comment_length = if self.back_line_has_no_comment {
                    None
                } else {
                    let bytes = self.cursor.chars().as_str().as_bytes();
                    let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> =
                        smallvec::SmallVec::new();

                    // Find the start of the line, or any potential comments.
                    for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
                        if bytes[index] == b'#' {
                            // Potentially a comment, but not guaranteed
                            // SAFETY: Safe, because ruff only supports files up to 4GB
                            potential_comment_starts.push(TextSize::try_from(index).unwrap());
                        } else {
                            break;
                        }
                    }

                    // No comments
                    if potential_comment_starts.is_empty() {
                        None
                    } else {
                        // The line contains at least one `#` token. The `#` can indicate the start of a
                        // comment, meaning the current token is commented out, or it is a regular `#` inside of a string.
                        self.comment_from_hash_positions(&potential_comment_starts)
                    }
                };

                // From here on it is guaranteed that this line has no other comment.
                self.back_line_has_no_comment = true;

                if let Some(comment_length) = comment_length {
                    // It is a comment, bump all tokens
                    for _ in 0..usize::from(comment_length) {
                        self.cursor.bump_back().unwrap();
                    }

                    SimpleTokenKind::Comment
                } else {
                    match c {
                        // Keywords and identifiers
                        c if is_identifier_continuation(c) => {
                            // if we only have identifier continuations but no start (e.g. 555) we
                            // don't want to consume the chars, so in that case, we want to rewind the
                            // cursor to here
                            let savepoint = self.cursor.clone();
                            self.cursor.eat_back_while(is_identifier_continuation);

                            let token_len = self.cursor.token_len();
                            let range = TextRange::at(self.back_offset - token_len, token_len);

                            if self.source[range]
                                .chars()
                                .next()
                                .is_some_and(is_identifier_start)
                            {
                                self.to_keyword_or_other(range)
                            } else {
                                self.cursor = savepoint;
                                self.bogus = true;
                                SimpleTokenKind::Other
                            }
                        }

                        // Non-trivia tokens that are unambiguous when lexing backwards.
                        // In other words: these are characters that _don't_ appear at the
                        // end of a multi-character token (like `!=`).
                        '\\' => SimpleTokenKind::Continuation,
                        ':' => SimpleTokenKind::Colon,
                        '~' => SimpleTokenKind::Tilde,
                        '%' => SimpleTokenKind::Percent,
                        '|' => SimpleTokenKind::Vbar,
                        ',' => SimpleTokenKind::Comma,
                        ';' => SimpleTokenKind::Semi,
                        '(' => SimpleTokenKind::LParen,
                        ')' => SimpleTokenKind::RParen,
                        '[' => SimpleTokenKind::LBracket,
                        ']' => SimpleTokenKind::RBracket,
                        '{' => SimpleTokenKind::LBrace,
                        '}' => SimpleTokenKind::RBrace,
                        '&' => SimpleTokenKind::Ampersand,
                        '^' => SimpleTokenKind::Circumflex,
                        '+' => SimpleTokenKind::Plus,
                        '-' => SimpleTokenKind::Minus,

                        // Non-trivia tokens that _are_ ambiguous when lexing backwards.
                        // In other words: these are characters that _might_ mark the end
                        // of a multi-character token (like `!=` or `->` or `//` or `**`).
                        '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                            // This could be a single-token token, like `+` in `x + y`, or a
                            // multi-character token, like `+=` in `x += y`. It could also be a sequence
                            // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                            // important that we produce the same token stream when lexing backwards as
                            // we do when lexing forwards. So, identify the range of the sequence, lex
                            // forwards, and return the last token.
                            let mut cursor = self.cursor.clone();
                            cursor.eat_back_while(|c| {
                                matches!(
                                    c,
                                    ':' | '~'
                                        | '%'
                                        | '|'
                                        | '&'
                                        | '^'
                                        | '+'
                                        | '-'
                                        | '='
                                        | '*'
                                        | '/'
                                        | '@'
                                        | '!'
                                        | '<'
                                        | '>'
                                        | '.'
                                )
                            });

                            let token_len = cursor.token_len();
                            let range = TextRange::at(self.back_offset - token_len, token_len);

                            let forward_lexer = Self::new(self.source, range);
                            if let Some(token) = forward_lexer.last() {
                                // If the token spans multiple characters, bump the cursor. Note,
                                // though, that we already bumped the cursor to past the last character
                                // in the token at the very start of `next_token_back`.
                                for _ in self.source[token.range].chars().rev().skip(1) {
                                    self.cursor.bump_back().unwrap();
                                }
                                token.kind()
                            } else {
                                self.bogus = true;
                                SimpleTokenKind::Other
                            }
                        }

                        _ => {
                            self.bogus = true;
                            SimpleTokenKind::Other
                        }
                    }
                }
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();

        let start = self.back_offset - token_len;

        let token = SimpleToken {
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        };

        self.back_offset = start;

        token
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    /// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
    /// length of the comment.
    ///
    /// The challenge is that `#` tokens can also appear inside of strings:
    ///
    /// ```python
    /// ' #not a comment'
    /// ```
    ///
    /// This looks innocent, but is the `'` really the start of a new string, or could it be the closing delimiter
    /// of a previously started string:
    ///
    /// ```python
    /// ' a string\
    /// ` # a comment '
    /// ```
    ///
    /// The only way to reliably tell whether the `#` is a comment when the comment contains a quote char is
    /// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then
    /// the hash cannot be a comment.
    fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
        // Iterate over the `#` positions from the start to the end of the line.
        // This is necessary to correctly support `a # comment # comment`.
        for possible_start in hash_positions.iter().rev() {
            let comment_bytes =
                self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();

            // Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
            // the start of a comment, but instead part of a string:
            // ```python
            // a + 'a string # not a comment'
            // a + '''a string
            // # not a comment'''
            // ```
            match memchr2(b'\'', b'"', comment_bytes) {
                // Most comments don't contain quotes, and most strings don't contain comments.
                // For these it's safe to assume that they are comments.
                None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
                // Now it gets complicated... There's no good way to know whether this is a string or not.
                // It is necessary to lex all strings and comments from the start to know if it is one or the other.
                Some(_) => {
                    if find_unterminated_string_kind(
                        &self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
                    )
                    .is_none()
                    {
                        // There's no unterminated string at the comment's start position. This *must*
                        // be a comment.
                        return Some(self.cursor.chars().as_str().text_len() - possible_start);
                    }

                    // This is a hash inside of a string: `'test # not a comment'`; continue with the next potential comment on the line.
                }
            }
        }

        None
    }
}

fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
    let mut rest = input;

    while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
        let c = rest.as_bytes()[comment_or_string_start] as char;
        let after = &rest[comment_or_string_start + 1..];

        if c == '#' {
            let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
            rest = &after[comment_end..];
        } else {
            let mut cursor = Cursor::new(after);
            let quote_kind = if c == '\'' {
                QuoteKind::Single
            } else {
                QuoteKind::Double
            };

            let string_kind = if cursor.eat_char(quote_kind.as_char()) {
                // `''` or `""`
                if cursor.eat_char(quote_kind.as_char()) {
                    // `'''` or `"""`
                    StringKind::Triple(quote_kind)
                } else {
                    // empty string literal, nothing more to lex
                    rest = cursor.chars().as_str();
                    continue;
                }
            } else {
                StringKind::Single(quote_kind)
            };

            if !is_string_terminated(string_kind, &mut cursor) {
                return Some(string_kind);
            }

            rest = cursor.chars().as_str();
        }
    }

    None
}
    /// Helper to parse the previous token once we have skipped all whitespace
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // if we only have identifier continuations but no start (e.g. 555) we
                // don't want to consume the chars, so in that case, we want to rewind the
                // cursor to here
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
    let quote_char = kind.quote_kind().as_char();
                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

    while let Some(c) = cursor.bump() {
        match c {
            '\n' | '\r' if kind.is_single() => {
                // Reached the end of the line without a closing quote, this is an unterminated string literal.
                return false;
            }
            '\\' => {
                // Skip over escaped quotes that match this string's quotes or double-escaped backslashes
                if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
                    continue;
                }
                // Eat a line continuation
                cursor.eat_char('\r');
                cursor.eat_char('\n');
            }
            c if c == quote_char => {
                if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    return true;
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-token token, like `+` in `x + y`, or a
                // multi-character token, like `+=` in `x += y`. It could also be a sequence
                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                // important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~'
                            | '%'
                            | '|'
                            | '&'
                            | '^'
                            | '+'
                            | '-'
                            | '='
                            | '*'
                            | '/'
                            | '@'
                            | '!'
                            | '<'
                            | '>'
                            | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note,
                    // though, that we already bumped the cursor to past the last character
                    // in the token at the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            _ => {
                // continue
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    // Reached end without a closing quote
    false
}

impl Iterator for SimpleTokenizer<'_> {
impl Iterator for BackwardsTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
@@ -1091,64 +973,16 @@ impl Iterator for SimpleTokenizer<'_> {
    }
}

impl DoubleEndedIterator for SimpleTokenizer<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        let token = self.next_token_back();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum StringKind {
    /// `'...'` or `"..."`
    Single(QuoteKind),
    /// `'''...'''` or `"""..."""`
    Triple(QuoteKind),
}

impl StringKind {
    const fn quote_kind(self) -> QuoteKind {
        match self {
            StringKind::Single(kind) => kind,
            StringKind::Triple(kind) => kind,
        }
    }

    const fn is_single(self) -> bool {
        matches!(self, StringKind::Single(_))
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum QuoteKind {
    /// `'`
    Single,

    /// `"`
    Double,
}

impl QuoteKind {
    const fn as_char(self) -> char {
        match self {
            QuoteKind::Single => '\'',
            QuoteKind::Double => '"',
        }
    }
}

#[cfg(test)]
mod tests {
    use insta::assert_debug_snapshot;

    use ruff_python_parser::lexer::lex;
    use ruff_python_parser::{Mode, Tok};
    use ruff_text_size::{TextLen, TextRange, TextSize};

    use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
    use crate::{BackwardsTokenizer, SimpleTokenKind};

    struct TokenizationTestCase {
        source: &'static str,
@@ -1167,9 +1001,17 @@ mod tests {
    }

    fn tokenize_reverse(&self) -> Vec<SimpleToken> {
        SimpleTokenizer::new(self.source, self.range)
            .rev()
            .collect()
        let comment_ranges: Vec<_> = lex(self.source, Mode::Module)
            .filter_map(|result| {
                let (token, range) = result.expect("Input to be a valid python program.");
                if matches!(token, Tok::Comment(_)) {
                    Some(range)
                } else {
                    None
                }
            })
            .collect();
        BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
    }

    fn tokens(&self) -> &[SimpleToken] {
@@ -1495,4 +1337,22 @@ mod tests {
            1
        );
    }

    #[test]
    fn test_previous_token_simple() {
        let cases = &["x = (", "x = ( ", "x = (\n"];
        for source in cases {
            let token = BackwardsTokenizer::up_to(source.text_len(), source, &[])
                .skip_trivia()
                .next()
                .unwrap();
            assert_eq!(
                token,
                SimpleToken {
                    kind: SimpleTokenKind::LParen,
                    range: TextRange::new(TextSize::new(4), TextSize::new(5)),
                }
            );
        }
    }
}
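The `tokenize_reverse` test helper above also doubles as the recipe callers are expected to follow: collect the comment ranges with one forward lex, then hand them to the backwards tokenizer. As a standalone sketch (the function name is illustrative, the lexer calls mirror the test code):

use ruff_python_parser::lexer::lex;
use ruff_python_parser::{Mode, Tok};
use ruff_text_size::TextRange;

// Collect the ranges of all `#` comments with a single forward lex so that
// `BackwardsTokenizer` can skip them without rescanning each line.
fn collect_comment_ranges(source: &str) -> Vec<TextRange> {
    lex(source, Mode::Module)
        .filter_map(|result| {
            let (token, range) = result.ok()?;
            matches!(token, Tok::Comment(_)).then_some(range)
        })
        .collect()
}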