Use CommentRanges in backwards lexing (#7360)

## Summary

The tokenizer was split into a forward and a backwards tokenizer. The
backwards tokenizer uses the same method names as the forward one (e.g.
`next_token`). The backwards tokenizer is given the comment ranges that we
already built, so it can skip comments instead of re-detecting them.
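
For illustration only, here is a minimal sketch (not part of this change) of how a caller now drives the backwards tokenizer; the `ruff_python_trivia` crate path and the hand-counted offsets are assumptions:

```rust
use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
use ruff_text_size::{TextLen, TextRange, TextSize};

fn main() {
    let source = "a = (  # comment\n)";
    // The caller already knows where the comments are (offsets 7..16 here),
    // e.g. from the parser's token stream, and simply passes them along.
    let comments = [TextRange::new(TextSize::new(7), TextSize::new(16))];

    let mut tokenizer = BackwardsTokenizer::up_to(source.text_len(), source, &comments);
    assert_eq!(tokenizer.next_token().kind, SimpleTokenKind::RParen);
    assert_eq!(tokenizer.next_token().kind, SimpleTokenKind::Newline);
    // The trailing comment is emitted straight from the supplied range; the
    // backwards lexer no longer scans the line to decide whether a `#` starts
    // a comment or is part of a string.
    assert_eq!(tokenizer.next_token().kind, SimpleTokenKind::Comment);
    assert_eq!(tokenizer.next_token().kind, SimpleTokenKind::Whitespace);
    assert_eq!(tokenizer.next_token().kind, SimpleTokenKind::LParen);
}
```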

---------

Co-authored-by: Micha Reiser <micha@reiser.io>
konsti 2023-09-16 05:21:45 +02:00 committed by GitHub
parent 1f6e1485f9
commit 2cbe1733c8
41 changed files with 744 additions and 628 deletions


@ -16,8 +16,7 @@ license = { workspace = true }
ruff_text_size = { path = "../ruff_text_size" }
ruff_source_file = { path = "../ruff_source_file" }
memchr = { workspace = true }
smallvec = { workspace = true }
itertools = { workspace = true }
unicode-ident = { workspace = true }
[dev-dependencies]


@ -0,0 +1,71 @@
use std::fmt::{Debug, Formatter};
use std::ops::Deref;
use itertools::Itertools;
use ruff_text_size::{Ranged, TextRange};
/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges overlap.
#[derive(Clone, Default)]
pub struct CommentRanges {
raw: Vec<TextRange>,
}
impl CommentRanges {
pub fn new(ranges: Vec<TextRange>) -> Self {
Self { raw: ranges }
}
/// Returns `true` if the given range includes a comment.
pub fn intersects(&self, target: TextRange) -> bool {
self.raw
.binary_search_by(|range| {
if target.contains_range(*range) {
std::cmp::Ordering::Equal
} else if range.end() < target.start() {
std::cmp::Ordering::Less
} else {
std::cmp::Ordering::Greater
}
})
.is_ok()
}
/// Returns the comments that are within the given range
pub fn comments_in_range(&self, range: TextRange) -> &[TextRange] {
let start = self
.raw
.partition_point(|comment| comment.start() < range.start());
// We expect there to be few comments, so a linear `find` should be faster than a second binary search
match self.raw[start..]
.iter()
.find_position(|comment| comment.end() > range.end())
{
Some((in_range, _element)) => &self.raw[start..start + in_range],
None => &self.raw[start..],
}
}
}
impl Deref for CommentRanges {
type Target = [TextRange];
fn deref(&self) -> &Self::Target {
self.raw.as_slice()
}
}
impl Debug for CommentRanges {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
f.debug_tuple("CommentRanges").field(&self.raw).finish()
}
}
impl<'a> IntoIterator for &'a CommentRanges {
type Item = &'a TextRange;
type IntoIter = std::slice::Iter<'a, TextRange>;
fn into_iter(self) -> Self::IntoIter {
self.raw.iter()
}
}
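
A brief usage sketch (not part of the diff) of the two queries above, with made-up offsets and an assumed crate path:

```rust
use ruff_python_trivia::CommentRanges;
use ruff_text_size::{TextRange, TextSize};

fn main() {
    // Ranges must already be sorted by start offset and non-overlapping.
    let comments = CommentRanges::new(vec![
        TextRange::new(TextSize::new(10), TextSize::new(20)),
        TextRange::new(TextSize::new(40), TextSize::new(55)),
    ]);

    // `intersects` binary-searches for a comment fully contained in the target range.
    assert!(comments.intersects(TextRange::new(TextSize::new(0), TextSize::new(25))));
    assert!(!comments.intersects(TextRange::new(TextSize::new(21), TextSize::new(39))));

    // `comments_in_range` returns the contiguous slice of comments inside the range.
    let inside = comments.comments_in_range(TextRange::new(TextSize::new(0), TextSize::new(30)));
    assert_eq!(inside.len(), 1);
}
```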


@ -44,7 +44,7 @@ impl<'a> Cursor<'a> {
self.chars.clone().next_back().unwrap_or(EOF_CHAR)
}
// SAFETY: THe `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
// SAFETY: The `source.text_len` call in `new` would panic if the string length is larger than a `u32`.
#[allow(clippy::cast_possible_truncation)]
pub fn text_len(&self) -> TextSize {
TextSize::new(self.chars.as_str().len() as u32)


@ -1,8 +1,10 @@
mod comment_ranges;
mod cursor;
pub mod textwrap;
mod tokenizer;
mod whitespace;
pub use comment_ranges::CommentRanges;
pub use cursor::*;
pub use tokenizer::*;
pub use whitespace::*;


@ -1,4 +1,3 @@
use memchr::{memchr2, memchr3, memrchr3_iter};
use unicode_ident::{is_xid_continue, is_xid_start};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
@ -121,6 +120,47 @@ fn is_identifier_continuation(c: char) -> bool {
}
}
fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
match source {
"and" => SimpleTokenKind::And,
"as" => SimpleTokenKind::As,
"assert" => SimpleTokenKind::Assert,
"async" => SimpleTokenKind::Async,
"await" => SimpleTokenKind::Await,
"break" => SimpleTokenKind::Break,
"class" => SimpleTokenKind::Class,
"continue" => SimpleTokenKind::Continue,
"def" => SimpleTokenKind::Def,
"del" => SimpleTokenKind::Del,
"elif" => SimpleTokenKind::Elif,
"else" => SimpleTokenKind::Else,
"except" => SimpleTokenKind::Except,
"finally" => SimpleTokenKind::Finally,
"for" => SimpleTokenKind::For,
"from" => SimpleTokenKind::From,
"global" => SimpleTokenKind::Global,
"if" => SimpleTokenKind::If,
"import" => SimpleTokenKind::Import,
"in" => SimpleTokenKind::In,
"is" => SimpleTokenKind::Is,
"lambda" => SimpleTokenKind::Lambda,
"nonlocal" => SimpleTokenKind::Nonlocal,
"not" => SimpleTokenKind::Not,
"or" => SimpleTokenKind::Or,
"pass" => SimpleTokenKind::Pass,
"raise" => SimpleTokenKind::Raise,
"return" => SimpleTokenKind::Return,
"try" => SimpleTokenKind::Try,
"while" => SimpleTokenKind::While,
"match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"case" => SimpleTokenKind::Case,
"with" => SimpleTokenKind::With,
"yield" => SimpleTokenKind::Yield,
_ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
}
}
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
pub kind: SimpleTokenKind,
@ -421,17 +461,15 @@ impl SimpleTokenKind {
}
}
/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
/// Simple zero allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every character after until it reaches the end of the file.
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct SimpleTokenizer<'a> {
offset: TextSize,
back_offset: TextSize,
/// `true` when it is known that the current `back` line has no comment for sure.
back_line_has_no_comment: bool,
bogus: bool,
source: &'a str,
cursor: Cursor<'a>,
@ -441,8 +479,6 @@ impl<'a> SimpleTokenizer<'a> {
pub fn new(source: &'a str, range: TextRange) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
back_line_has_no_comment: false,
bogus: false,
source,
cursor: Cursor::new(&source[range]),
@ -454,64 +490,6 @@ impl<'a> SimpleTokenizer<'a> {
Self::new(source, range)
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
///
/// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if intend to lex backwards.
pub fn up_to(offset: TextSize, source: &'a str) -> Self {
Self::new(source, TextRange::up_to(offset))
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
/// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
/// because the lexer doesn't need to scan for comments.
pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
let mut tokenizer = Self::up_to(offset, source);
tokenizer.back_line_has_no_comment = true;
tokenizer
}
fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
let source = &self.source[range];
match source {
"and" => SimpleTokenKind::And,
"as" => SimpleTokenKind::As,
"assert" => SimpleTokenKind::Assert,
"async" => SimpleTokenKind::Async,
"await" => SimpleTokenKind::Await,
"break" => SimpleTokenKind::Break,
"class" => SimpleTokenKind::Class,
"continue" => SimpleTokenKind::Continue,
"def" => SimpleTokenKind::Def,
"del" => SimpleTokenKind::Del,
"elif" => SimpleTokenKind::Elif,
"else" => SimpleTokenKind::Else,
"except" => SimpleTokenKind::Except,
"finally" => SimpleTokenKind::Finally,
"for" => SimpleTokenKind::For,
"from" => SimpleTokenKind::From,
"global" => SimpleTokenKind::Global,
"if" => SimpleTokenKind::If,
"import" => SimpleTokenKind::Import,
"in" => SimpleTokenKind::In,
"is" => SimpleTokenKind::Is,
"lambda" => SimpleTokenKind::Lambda,
"nonlocal" => SimpleTokenKind::Nonlocal,
"not" => SimpleTokenKind::Not,
"or" => SimpleTokenKind::Or,
"pass" => SimpleTokenKind::Pass,
"raise" => SimpleTokenKind::Raise,
"return" => SimpleTokenKind::Return,
"try" => SimpleTokenKind::Try,
"while" => SimpleTokenKind::While,
"match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"case" => SimpleTokenKind::Case,
"with" => SimpleTokenKind::With,
"yield" => SimpleTokenKind::Yield,
_ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
}
}
fn next_token(&mut self) -> SimpleToken {
self.cursor.start_token();
@ -523,6 +501,7 @@ impl<'a> SimpleTokenizer<'a> {
};
if self.bogus {
// Emit a single final bogus token
let token = SimpleToken {
kind: SimpleTokenKind::Bogus,
range: TextRange::at(self.offset, first.text_len()),
@ -532,14 +511,29 @@ impl<'a> SimpleTokenizer<'a> {
return token;
}
let kind = match first {
let kind = self.next_token_inner(first);
let token_len = self.cursor.token_len();
let token = SimpleToken {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
match first {
// Keywords and identifiers
c if is_identifier_start(c) => {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
let kind = self.to_keyword_or_other(range);
let kind = to_keyword_or_other(&self.source[range]);
if kind == SimpleTokenKind::Other {
self.bogus = true;
@ -717,24 +711,102 @@ impl<'a> SimpleTokenizer<'a> {
self.bogus = true;
SimpleTokenKind::Other
}
};
let token_len = self.cursor.token_len();
let token = SimpleToken {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
}
/// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
/// because it needs to check if the line has any comments when encountering any non-trivia token.
pub fn next_token_back(&mut self) -> SimpleToken {
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
self.filter(|t| !t.kind().is_trivia())
}
}
impl Iterator for SimpleTokenizer<'_> {
type Item = SimpleToken;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.kind == SimpleTokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
/// Simple zero allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
pub struct BackwardsTokenizer<'a> {
offset: TextSize,
back_offset: TextSize,
/// Remember if we still have to check for comments
after_newline: bool,
/// Not `&CommentRanges` to avoid a circular dependency
comment_ranges: &'a [TextRange],
/// The index of the previous line-ending comment, used to avoid repeating the binary search
previous_comment_idx: Option<usize>,
bogus: bool,
source: &'a str,
cursor: Cursor<'a>,
}
impl<'a> BackwardsTokenizer<'a> {
pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
// We could start tokenizing at a comment
after_newline: true,
comment_ranges: comment_range,
previous_comment_idx: None,
bogus: false,
source,
cursor: Cursor::new(&source[range]),
}
}
pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
Self::new(source, TextRange::up_to(offset), comment_range)
}
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
self.filter(|t| !t.kind().is_trivia())
}
pub fn next_token(&mut self) -> SimpleToken {
self.cursor.start_token();
self.back_offset = self.cursor.text_len() + self.offset;
if self.after_newline {
// Any memoized comment ended a later line (one we already passed), not the current one
let previous_comment_idx = self.previous_comment_idx.unwrap_or_else(|| {
self.comment_ranges
.partition_point(|comment| comment.end() <= self.back_offset)
});
// If `previous_comment_idx == 0`, we're in a comment-free region
if previous_comment_idx > 0 {
let comment = self.comment_ranges[previous_comment_idx - 1];
if comment.end() == self.back_offset {
// Skip the comment without iterating over the chars manually
self.cursor =
Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
self.after_newline = false;
self.previous_comment_idx = Some(previous_comment_idx - 1);
return SimpleToken {
kind: SimpleTokenKind::Comment,
range: comment.range(),
};
}
// At least memoize the binary search
self.previous_comment_idx = Some(previous_comment_idx);
}
self.after_newline = false;
}
let Some(last) = self.cursor.bump_back() else {
return SimpleToken {
@ -762,322 +834,132 @@ impl<'a> SimpleTokenizer<'a> {
}
'\r' => {
self.back_line_has_no_comment = false;
self.after_newline = true;
SimpleTokenKind::Newline
}
'\n' => {
self.back_line_has_no_comment = false;
self.cursor.eat_char_back('\r');
self.after_newline = true;
SimpleTokenKind::Newline
}
// Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
'#' => SimpleTokenKind::Comment,
// For all other tokens, test if the character isn't part of a comment.
c => {
// Skip the test whether there's a preceding comment if it has been performed before.
let comment_length = if self.back_line_has_no_comment {
None
} else {
let bytes = self.cursor.chars().as_str().as_bytes();
let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> =
smallvec::SmallVec::new();
// Find the start of the line, or any potential comments.
for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
if bytes[index] == b'#' {
// Potentially a comment, but not guaranteed
// SAFETY: Safe, because ruff only supports files up to 4GB
potential_comment_starts.push(TextSize::try_from(index).unwrap());
} else {
break;
}
}
// No comments
if potential_comment_starts.is_empty() {
None
} else {
// The line contains at least one `#` token. The `#` can indicate the start of a
// comment, meaning the current token is commented out, or it is a regular `#` inside of a string.
self.comment_from_hash_positions(&potential_comment_starts)
}
};
// From here on it is guaranteed that this line has no other comment.
self.back_line_has_no_comment = true;
if let Some(comment_length) = comment_length {
// It is a comment, bump all tokens
for _ in 0..usize::from(comment_length) {
self.cursor.bump_back().unwrap();
}
SimpleTokenKind::Comment
} else {
match c {
// Keywords and identifiers
c if is_identifier_continuation(c) => {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
let savepoint = self.cursor.clone();
self.cursor.eat_back_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
if self.source[range]
.chars()
.next()
.is_some_and(is_identifier_start)
{
self.to_keyword_or_other(range)
} else {
self.cursor = savepoint;
self.bogus = true;
SimpleTokenKind::Other
}
}
// Non-trivia tokens that are unambiguous when lexing backwards.
// In other words: these are characters that _don't_ appear at the
// end of a multi-character token (like `!=`).
'\\' => SimpleTokenKind::Continuation,
':' => SimpleTokenKind::Colon,
'~' => SimpleTokenKind::Tilde,
'%' => SimpleTokenKind::Percent,
'|' => SimpleTokenKind::Vbar,
',' => SimpleTokenKind::Comma,
';' => SimpleTokenKind::Semi,
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
// Non-trivia tokens that _are_ ambiguous when lexing backwards.
// In other words: these are characters that _might_ mark the end
// of a multi-character token (like `!=` or `->` or `//` or `**`).
'=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
// This could be a single-token token, like `+` in `x + y`, or a
// multi-character token, like `+=` in `x += y`. It could also be a sequence
// of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
// important that we produce the same token stream when lexing backwards as
// we do when lexing forwards. So, identify the range of the sequence, lex
// forwards, and return the last token.
let mut cursor = self.cursor.clone();
cursor.eat_back_while(|c| {
matches!(
c,
':' | '~'
| '%'
| '|'
| '&'
| '^'
| '+'
| '-'
| '='
| '*'
| '/'
| '@'
| '!'
| '<'
| '>'
| '.'
)
});
let token_len = cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
let forward_lexer = Self::new(self.source, range);
if let Some(token) = forward_lexer.last() {
// If the token spans multiple characters, bump the cursor. Note,
// though, that we already bumped the cursor to past the last character
// in the token at the very start of `next_token_back`.
for _ in self.source[token.range].chars().rev().skip(1) {
self.cursor.bump_back().unwrap();
}
token.kind()
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
_ => {
self.bogus = true;
SimpleTokenKind::Other
}
}
}
}
_ => self.next_token_inner(last),
};
let token_len = self.cursor.token_len();
let start = self.back_offset - token_len;
let token = SimpleToken {
SimpleToken {
kind,
range: TextRange::at(start, token_len),
};
self.back_offset = start;
token
}
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
self.filter(|t| !t.kind().is_trivia())
}
/// Given the position of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
/// length of the comment.
///
/// The challenge is that `#` tokens can also appear inside of strings:
///
/// ```python
/// ' #not a comment'
/// ```
///
/// This looks innocent but is the `'` really the start of the new string or could it be a closing delimiter
/// of a previously started string:
///
/// ```python
/// ' a string\
/// ` # a comment '
/// ```
///
/// The only way to reliably tell whether the `#` is a comment when the comment contains a quote char is
/// to forward lex all strings and comments and test if there's any unclosed string literal. If so, then
/// the hash cannot be a comment.
fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
// Iterate over the `#` positions from the start to the end of the line.
// This is necessary to correctly support `a # comment # comment`.
for possible_start in hash_positions.iter().rev() {
let comment_bytes =
self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();
// Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
// the start of a comment, but instead part of a string:
// ```python
// a + 'a string # not a comment'
// a + '''a string
// # not a comment'''
// ```
match memchr2(b'\'', b'"', comment_bytes) {
// Most comments don't contain quotes, and most strings don't contain comments.
// For these it's safe to assume that they are comments.
None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
// Now it gets complicated... There's no good way to know whether this is a string or not.
// It is necessary to lex all strings and comments from the start to know if it is one or the other.
Some(_) => {
if find_unterminated_string_kind(
&self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
)
.is_none()
{
// There's no unterminated string at the comment's start position. This *must*
// be a comment.
return Some(self.cursor.chars().as_str().text_len() - possible_start);
}
// This is a hash inside of a string: `'test # not a comment'` continue with the next potential comment on the line.
}
}
}
None
}
}
fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
let mut rest = input;
while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
let c = rest.as_bytes()[comment_or_string_start] as char;
let after = &rest[comment_or_string_start + 1..];
if c == '#' {
let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
rest = &after[comment_end..];
} else {
let mut cursor = Cursor::new(after);
let quote_kind = if c == '\'' {
QuoteKind::Single
} else {
QuoteKind::Double
};
let string_kind = if cursor.eat_char(quote_kind.as_char()) {
// `''` or `""`
if cursor.eat_char(quote_kind.as_char()) {
// `'''` or `"""`
StringKind::Triple(quote_kind)
} else {
// empty string literal, nothing more to lex
rest = cursor.chars().as_str();
continue;
}
} else {
StringKind::Single(quote_kind)
};
if !is_string_terminated(string_kind, &mut cursor) {
return Some(string_kind);
}
rest = cursor.chars().as_str();
}
}
None
}
/// Helper to parse the previous token once we have skipped all whitespace
fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
match last {
// Keywords and identifiers
c if is_identifier_continuation(c) => {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
let savepoint = self.cursor.clone();
self.cursor.eat_back_while(is_identifier_continuation);
fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
let quote_char = kind.quote_kind().as_char();
let token_len = self.cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
while let Some(c) = cursor.bump() {
match c {
'\n' | '\r' if kind.is_single() => {
// Reached the end of the line without a closing quote, this is an unterminated string literal.
return false;
}
'\\' => {
// Skip over escaped quotes that match this strings quotes or double escaped backslashes
if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
continue;
}
// Eat over line continuation
cursor.eat_char('\r');
cursor.eat_char('\n');
}
c if c == quote_char => {
if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
if self.source[range]
.chars()
.next()
.is_some_and(is_identifier_start)
{
return true;
to_keyword_or_other(&self.source[range])
} else {
self.cursor = savepoint;
self.bogus = true;
SimpleTokenKind::Other
}
}
// Non-trivia tokens that are unambiguous when lexing backwards.
// In other words: these are characters that _don't_ appear at the
// end of a multi-character token (like `!=`).
'\\' => SimpleTokenKind::Continuation,
':' => SimpleTokenKind::Colon,
'~' => SimpleTokenKind::Tilde,
'%' => SimpleTokenKind::Percent,
'|' => SimpleTokenKind::Vbar,
',' => SimpleTokenKind::Comma,
';' => SimpleTokenKind::Semi,
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
// Non-trivia tokens that _are_ ambiguous when lexing backwards.
// In other words: these are characters that _might_ mark the end
// of a multi-character token (like `!=` or `->` or `//` or `**`).
'=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
// This could be a single-token token, like `+` in `x + y`, or a
// multi-character token, like `+=` in `x += y`. It could also be a sequence
// of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
// important that we produce the same token stream when lexing backwards as
// we do when lexing forwards. So, identify the range of the sequence, lex
// forwards, and return the last token.
let mut cursor = self.cursor.clone();
cursor.eat_back_while(|c| {
matches!(
c,
':' | '~'
| '%'
| '|'
| '&'
| '^'
| '+'
| '-'
| '='
| '*'
| '/'
| '@'
| '!'
| '<'
| '>'
| '.'
)
});
let token_len = cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
let forward_lexer = SimpleTokenizer::new(self.source, range);
if let Some(token) = forward_lexer.last() {
// If the token spans multiple characters, bump the cursor. Note,
// though, that we already bumped the cursor to past the last character
// in the token at the very start of `next_token`.
for _ in self.source[token.range].chars().rev().skip(1) {
self.cursor.bump_back().unwrap();
}
token.kind()
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
_ => {
// continue
self.bogus = true;
SimpleTokenKind::Other
}
}
}
// Reached end without a closing quote
false
}
impl Iterator for SimpleTokenizer<'_> {
impl Iterator for BackwardsTokenizer<'_> {
type Item = SimpleToken;
fn next(&mut self) -> Option<Self::Item> {
@ -1091,64 +973,16 @@ impl Iterator for SimpleTokenizer<'_> {
}
}
impl DoubleEndedIterator for SimpleTokenizer<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let token = self.next_token_back();
if token.kind == SimpleTokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum StringKind {
/// `'...'` or `"..."`
Single(QuoteKind),
/// `'''...'''` or `"""..."""`
Triple(QuoteKind),
}
impl StringKind {
const fn quote_kind(self) -> QuoteKind {
match self {
StringKind::Single(kind) => kind,
StringKind::Triple(kind) => kind,
}
}
const fn is_single(self) -> bool {
matches!(self, StringKind::Single(_))
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum QuoteKind {
/// `'`
Single,
/// `"`
Double,
}
impl QuoteKind {
const fn as_char(self) -> char {
match self {
QuoteKind::Single => '\'',
QuoteKind::Double => '"',
}
}
}
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use ruff_python_parser::lexer::lex;
use ruff_python_parser::{Mode, Tok};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
use crate::{BackwardsTokenizer, SimpleTokenKind};
struct TokenizationTestCase {
source: &'static str,
@ -1167,9 +1001,17 @@ mod tests {
}
fn tokenize_reverse(&self) -> Vec<SimpleToken> {
SimpleTokenizer::new(self.source, self.range)
.rev()
.collect()
let comment_ranges: Vec<_> = lex(self.source, Mode::Module)
.filter_map(|result| {
let (token, range) = result.expect("Input to be a valid python program.");
if matches!(token, Tok::Comment(_)) {
Some(range)
} else {
None
}
})
.collect();
BackwardsTokenizer::new(self.source, self.range, &comment_ranges).collect()
}
fn tokens(&self) -> &[SimpleToken] {
@ -1495,4 +1337,22 @@ mod tests {
1
);
}
#[test]
fn test_previous_token_simple() {
let cases = &["x = (", "x = ( ", "x = (\n"];
for source in cases {
let token = BackwardsTokenizer::up_to(source.text_len(), source, &[])
.skip_trivia()
.next()
.unwrap();
assert_eq!(
token,
SimpleToken {
kind: SimpleTokenKind::LParen,
range: TextRange::new(TextSize::new(4), TextSize::new(5)),
}
);
}
}
}
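
For completeness, a hedged sketch of how a caller outside this crate might assemble the comment ranges once from the parser's tokens (mirroring the `tokenize_reverse` helper above) and then reuse them for backwards lexing; the crate paths are assumptions:

```rust
use ruff_python_parser::lexer::lex;
use ruff_python_parser::{Mode, Tok};
use ruff_python_trivia::{BackwardsTokenizer, SimpleToken};
use ruff_text_size::{TextLen, TextRange};

fn backwards_tokens(source: &str) -> Vec<SimpleToken> {
    // Collect the comment ranges once from the real lexer (lex errors are
    // simply skipped in this sketch) ...
    let comment_ranges: Vec<TextRange> = lex(source, Mode::Module)
        .filter_map(|result| {
            let (token, range) = result.ok()?;
            matches!(token, Tok::Comment(_)).then_some(range)
        })
        .collect();

    // ... and hand them to every backwards-lexing query over `source`.
    BackwardsTokenizer::up_to(source.text_len(), source, &comment_ranges)
        .skip_trivia()
        .collect()
}
```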