mirror of https://github.com/astral-sh/ruff.git
synced 2025-07-13 16:15:07 +00:00

## Summary

Resolves #11672.

## Test Plan

`cargo nextest run` and `cargo insta test`.

Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>

1027 lines · 28 KiB · Rust
use unicode_ident::{is_xid_continue, is_xid_start};

use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};

use crate::{is_python_whitespace, Cursor};

/// Searches for the first non-trivia character after `offset`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` if the source code after `offset` contains any non-trivia character.
///
/// Returns `None` if the text after `offset` is empty or only contains trivia (whitespace or comments).
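///
/// ## Example
///
/// A minimal usage sketch, not part of the original source; it assumes this
/// function is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::{first_non_trivia_token, SimpleTokenKind};
/// use ruff_text_size::TextSize;
///
/// // Skips the comment, the newline, and the indentation before `pass`.
/// let token = first_non_trivia_token(TextSize::new(0), "# comment\n    pass").unwrap();
/// assert_eq!(token.kind(), SimpleTokenKind::Pass);
/// ```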
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
    SimpleTokenizer::starts_at(offset, code)
        .skip_trivia()
        .next()
}

/// Returns the only non-trivia, non-closing parenthesis token in `range`.
///
/// Includes debug assertions that the range only contains that single token.
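///
/// ## Example
///
/// A minimal sketch, not part of the original source; it assumes this function
/// is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::{find_only_token_in_range, SimpleTokenKind};
/// use ruff_text_size::{TextRange, TextSize};
///
/// // The range holds a single `:` token, preceded by a closing parenthesis,
/// // which `find_only_token_in_range` skips over.
/// let token = find_only_token_in_range(
///     TextRange::new(TextSize::new(0), TextSize::new(2)),
///     SimpleTokenKind::Colon,
///     "):",
/// );
/// assert_eq!(token.kind(), SimpleTokenKind::Colon);
/// ```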
pub fn find_only_token_in_range(
    range: TextRange,
    token_kind: SimpleTokenKind,
    code: &str,
) -> SimpleToken {
    let mut tokens = SimpleTokenizer::new(code, range)
        .skip_trivia()
        .skip_while(|token| token.kind == SimpleTokenKind::RParen);
    let token = tokens.next().expect("Expected a token");
    debug_assert_eq!(token.kind(), token_kind);
    let mut tokens = tokens.skip_while(|token| token.kind == SimpleTokenKind::LParen);
    #[allow(clippy::debug_assert_with_mut_call)]
    {
        debug_assert_eq!(tokens.next(), None);
    }
    token
}

/// Returns the number of newlines between `offset` and the first non-whitespace character that precedes it in the source code.
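///
/// ## Example
///
/// A minimal sketch, not part of the original source; it assumes this function
/// is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::lines_before;
/// use ruff_text_size::TextSize;
///
/// // Looking backwards from `y` (offset 8), there are two empty lines plus
/// // the line ending of `x = 1`, i.e. three newlines in total.
/// assert_eq!(lines_before(TextSize::new(8), "x = 1\n\n\ny = 2"), 3);
/// ```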
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
    let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);

    let mut newlines = 0u32;
    while let Some(c) = cursor.bump_back() {
        match c {
            '\n' => {
                cursor.eat_char_back('\r');
                newlines += 1;
            }
            '\r' => {
                newlines += 1;
            }
            c if is_python_whitespace(c) => {
                continue;
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Counts the newlines between `offset` and the first non-whitespace character that follows it.
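///
/// ## Example
///
/// A minimal sketch, not part of the original source; it assumes this function
/// is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::lines_after;
/// use ruff_text_size::TextSize;
///
/// // Starting right after the `1` (offset 5), there is the line ending of
/// // `x = 1` plus one empty line before `y`.
/// assert_eq!(lines_after(TextSize::new(5), "x = 1\n\ny = 2"), 2);
/// ```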
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
    let mut cursor = Cursor::new(&code[offset.to_usize()..]);

    let mut newlines = 0u32;
    while let Some(c) = cursor.bump() {
        match c {
            '\n' => {
                newlines += 1;
            }
            '\r' => {
                cursor.eat_char('\n');
                newlines += 1;
            }
            c if is_python_whitespace(c) => {
                continue;
            }
            _ => {
                break;
            }
        }
    }

    newlines
}

/// Counts the newlines after `offset`, ignoring any trailing trivia: end-of-line comments,
/// own-line comments, and any intermediary newlines.
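///
/// ## Example
///
/// A minimal sketch, not part of the original source; it assumes this function
/// is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::lines_after_ignoring_trivia;
/// use ruff_text_size::TextSize;
///
/// // Starting right after `x`, the trailing comment resets the counter;
/// // only the three newlines after the comment count.
/// let code = "x  # comment\n\n\ny = 2";
/// assert_eq!(lines_after_ignoring_trivia(TextSize::new(1), code), 3);
/// ```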
pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
    let mut newlines = 0u32;
    for token in SimpleTokenizer::starts_at(offset, code) {
        match token.kind() {
            SimpleTokenKind::Newline => {
                newlines += 1;
            }
            SimpleTokenKind::Whitespace => {}
            // If we see a comment, reset the newlines counter.
            SimpleTokenKind::Comment => {
                newlines = 0;
            }
            // As soon as we see a non-trivia token, we're done.
            _ => {
                break;
            }
        }
    }
    newlines
}

/// Counts the newlines after `offset`, ignoring any trailing trivia on the same line as
/// `offset`.
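///
/// ## Example
///
/// A minimal sketch, not part of the original source; it assumes this function
/// is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::lines_after_ignoring_end_of_line_trivia;
/// use ruff_text_size::TextSize;
///
/// // The end-of-line comment after `x` is skipped; the two newlines that
/// // follow it are counted.
/// let code = "x  # comment\n\ny = 2";
/// assert_eq!(lines_after_ignoring_end_of_line_trivia(TextSize::new(1), code), 2);
/// ```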
#[allow(clippy::cast_possible_truncation)]
pub fn lines_after_ignoring_end_of_line_trivia(offset: TextSize, code: &str) -> u32 {
    // SAFETY: We don't support files greater than 4GB, so casting to u32 is safe.
    SimpleTokenizer::starts_at(offset, code)
        .skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
        .take_while(|token| {
            token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
        })
        .filter(|token| token.kind == SimpleTokenKind::Newline)
        .count() as u32
}

fn is_identifier_start(c: char) -> bool {
    if c.is_ascii() {
        c.is_ascii_alphabetic() || c == '_'
    } else {
        is_xid_start(c)
    }
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    // Arrange things such that ASCII codepoints never
    // result in the slower `is_xid_continue` getting called.
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        is_xid_continue(c)
    }
}

fn to_keyword_or_other(source: &str) -> SimpleTokenKind {
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        // `match` and `type` are soft keywords whose meaning depends on the context, but we
        // can always lex them as keywords and leave it to the caller (parser) to decide
        // whether they should be handled as identifiers or keywords.
        "match" => SimpleTokenKind::Match,
        "type" => SimpleTokenKind::Type,
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        // Potentially an identifier, but only if it isn't a string prefix. The caller
        // (`SimpleTokenizer`) is responsible for enforcing that constraint.
        _ => SimpleTokenKind::Name,
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
    pub kind: SimpleTokenKind,
    pub range: TextRange,
}

impl SimpleToken {
    pub const fn kind(&self) -> SimpleTokenKind {
        self.kind
    }
}

impl Ranged for SimpleToken {
    fn range(&self) -> TextRange {
        self.range
    }
}

#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
    /// A comment, not including the trailing newline.
    Comment,

    /// Sequence of ' ' or '\t'
    Whitespace,

    /// Start or end of the file
    EndOfFile,

    /// `\\`
    Continuation,

    /// `\n` or `\r` or `\r\n`
    Newline,

    /// `(`
    LParen,

    /// `)`
    RParen,

    /// `{`
    LBrace,

    /// `}`
    RBrace,

    /// `[`
    LBracket,

    /// `]`
    RBracket,

    /// `,`
    Comma,

    /// `:`
    Colon,

    /// `;`
    Semi,

    /// `/`
    Slash,

    /// `*`
    Star,

    /// `.`
    Dot,

    /// `+`
    Plus,

    /// `-`
    Minus,

    /// `=`
    Equals,

    /// `>`
    Greater,

    /// `<`
    Less,

    /// `%`
    Percent,

    /// `&`
    Ampersand,

    /// `^`
    Circumflex,

    /// `|`
    Vbar,

    /// `@`
    At,

    /// `~`
    Tilde,

    /// `==`
    EqEqual,

    /// `!=`
    NotEqual,

    /// `<=`
    LessEqual,

    /// `>=`
    GreaterEqual,

    /// `<<`
    LeftShift,

    /// `>>`
    RightShift,

    /// `**`
    DoubleStar,

    /// `**=`
    DoubleStarEqual,

    /// `+=`
    PlusEqual,

    /// `-=`
    MinusEqual,

    /// `*=`
    StarEqual,

    /// `/=`
    SlashEqual,

    /// `%=`
    PercentEqual,

    /// `&=`
    AmperEqual,

    /// `|=`
    VbarEqual,

    /// `^=`
    CircumflexEqual,

    /// `<<=`
    LeftShiftEqual,

    /// `>>=`
    RightShiftEqual,

    /// `//`
    DoubleSlash,

    /// `//=`
    DoubleSlashEqual,

    /// `:=`
    ColonEqual,

    /// `...`
    Ellipsis,

    /// `@=`
    AtEqual,

    /// `->`
    RArrow,

    /// `and`
    And,

    /// `as`
    As,

    /// `assert`
    Assert,

    /// `async`
    Async,

    /// `await`
    Await,

    /// `break`
    Break,

    /// `class`
    Class,

    /// `continue`
    Continue,

    /// `def`
    Def,

    /// `del`
    Del,

    /// `elif`
    Elif,

    /// `else`
    Else,

    /// `except`
    Except,

    /// `finally`
    Finally,

    /// `for`
    For,

    /// `from`
    From,

    /// `global`
    Global,

    /// `if`
    If,

    /// `import`
    Import,

    /// `in`
    In,

    /// `is`
    Is,

    /// `lambda`
    Lambda,

    /// `nonlocal`
    Nonlocal,

    /// `not`
    Not,

    /// `or`
    Or,

    /// `pass`
    Pass,

    /// `raise`
    Raise,

    /// `return`
    Return,

    /// `try`
    Try,

    /// `while`
    While,

    /// `match`
    Match,

    /// `type`
    Type,

    /// `case`
    Case,

    /// `with`
    With,

    /// `yield`
    Yield,

    /// An identifier or keyword.
    Name,

    /// Any other non-trivia token.
    Other,

    /// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
    Bogus,
}

impl SimpleTokenKind {
    pub const fn is_trivia(self) -> bool {
        matches!(
            self,
            SimpleTokenKind::Whitespace
                | SimpleTokenKind::Newline
                | SimpleTokenKind::Comment
                | SimpleTokenKind::Continuation
        )
    }

    pub const fn is_comment(self) -> bool {
        matches!(self, SimpleTokenKind::Comment)
    }
}

/// Simple zero-allocation tokenizer handling most tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside a multiline string).
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
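///
/// ## Example
///
/// A minimal usage sketch, not part of the original source; it assumes this
/// type is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::{SimpleTokenKind, SimpleTokenizer};
/// use ruff_text_size::TextSize;
///
/// // Tokenize `a + b` from the start of the source, dropping whitespace.
/// let kinds: Vec<_> = SimpleTokenizer::starts_at(TextSize::new(0), "a + b")
///     .skip_trivia()
///     .map(|token| token.kind())
///     .collect();
/// assert_eq!(
///     kinds,
///     [SimpleTokenKind::Name, SimpleTokenKind::Plus, SimpleTokenKind::Name]
/// );
/// ```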
pub struct SimpleTokenizer<'a> {
    offset: TextSize,
    /// `true` once the tokenizer has encountered a token it can't parse; from then on it
    /// emits a single final [`SimpleTokenKind::Bogus`] token.
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> SimpleTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange) -> Self {
        Self {
            offset: range.start(),
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
        let range = TextRange::new(offset, source.text_len());
        Self::new(source, range)
    }

    fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();

        let Some(first) = self.cursor.bump() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.offset),
            };
        };

        if self.bogus {
            // Emit a single final bogus token
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::new(self.offset, self.source.text_len()),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.offset = self.source.text_len();
            return token;
        }

        let kind = self.next_token_inner(first);

        let token_len = self.cursor.token_len();

        let token = SimpleToken {
            kind,
            range: TextRange::at(self.offset, token_len),
        };

        self.offset += token_len;

        token
    }

    fn next_token_inner(&mut self, first: char) -> SimpleTokenKind {
        match first {
            // Keywords and identifiers
            c if is_identifier_start(c) => {
                self.cursor.eat_while(is_identifier_continuation);
                let token_len = self.cursor.token_len();

                let range = TextRange::at(self.offset, token_len);
                let kind = to_keyword_or_other(&self.source[range]);

                // If the next character is a quote, we may be in a string prefix. For example:
                // `f"foo`.
                if kind == SimpleTokenKind::Name
                    && matches!(self.cursor.first(), '"' | '\'')
                    && matches!(
                        &self.source[range],
                        "B" | "BR"
                            | "Br"
                            | "F"
                            | "FR"
                            | "Fr"
                            | "R"
                            | "RB"
                            | "RF"
                            | "Rb"
                            | "Rf"
                            | "U"
                            | "b"
                            | "bR"
                            | "br"
                            | "f"
                            | "fR"
                            | "fr"
                            | "r"
                            | "rB"
                            | "rF"
                            | "rb"
                            | "rf"
                            | "u"
                    )
                {
                    self.bogus = true;
                    SimpleTokenKind::Other
                } else {
                    kind
                }
            }

            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace.
            ' ' | '\t' | '\x0C' => {
                self.cursor.eat_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\n' => SimpleTokenKind::Newline,

            '\r' => {
                self.cursor.eat_char('\n');
                SimpleTokenKind::Newline
            }

            '#' => {
                self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
                SimpleTokenKind::Comment
            }

            '\\' => SimpleTokenKind::Continuation,

            // Non-trivia, non-keyword tokens
            '=' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::EqEqual
                } else {
                    SimpleTokenKind::Equals
                }
            }
            '+' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PlusEqual
                } else {
                    SimpleTokenKind::Plus
                }
            }
            '*' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::StarEqual
                } else if self.cursor.eat_char('*') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleStarEqual
                    } else {
                        SimpleTokenKind::DoubleStar
                    }
                } else {
                    SimpleTokenKind::Star
                }
            }
            '/' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::SlashEqual
                } else if self.cursor.eat_char('/') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::DoubleSlashEqual
                    } else {
                        SimpleTokenKind::DoubleSlash
                    }
                } else {
                    SimpleTokenKind::Slash
                }
            }
            '%' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::PercentEqual
                } else {
                    SimpleTokenKind::Percent
                }
            }
            '|' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::VbarEqual
                } else {
                    SimpleTokenKind::Vbar
                }
            }
            '^' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::CircumflexEqual
                } else {
                    SimpleTokenKind::Circumflex
                }
            }
            '&' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AmperEqual
                } else {
                    SimpleTokenKind::Ampersand
                }
            }
            '-' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::MinusEqual
                } else if self.cursor.eat_char('>') {
                    SimpleTokenKind::RArrow
                } else {
                    SimpleTokenKind::Minus
                }
            }
            '@' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::AtEqual
                } else {
                    SimpleTokenKind::At
                }
            }
            '!' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::NotEqual
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            '~' => SimpleTokenKind::Tilde,
            ':' => {
                if self.cursor.eat_char('=') {
                    SimpleTokenKind::ColonEqual
                } else {
                    SimpleTokenKind::Colon
                }
            }
            ';' => SimpleTokenKind::Semi,
            '<' => {
                if self.cursor.eat_char('<') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::LeftShiftEqual
                    } else {
                        SimpleTokenKind::LeftShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::LessEqual
                } else {
                    SimpleTokenKind::Less
                }
            }
            '>' => {
                if self.cursor.eat_char('>') {
                    if self.cursor.eat_char('=') {
                        SimpleTokenKind::RightShiftEqual
                    } else {
                        SimpleTokenKind::RightShift
                    }
                } else if self.cursor.eat_char('=') {
                    SimpleTokenKind::GreaterEqual
                } else {
                    SimpleTokenKind::Greater
                }
            }
            ',' => SimpleTokenKind::Comma,
            '.' => {
                if self.cursor.first() == '.' && self.cursor.second() == '.' {
                    self.cursor.bump();
                    self.cursor.bump();
                    SimpleTokenKind::Ellipsis
                } else {
                    SimpleTokenKind::Dot
                }
            }

            // Bracket tokens
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,

            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }
}

impl Iterator for SimpleTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}

/// Simple zero-allocation backwards tokenizer for finding preceding tokens.
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside a multiline string).
/// It will fail when reaching a string.
///
/// In case it finds something it can't parse, the tokenizer will return a
/// [`SimpleTokenKind::Other`] and then only a final [`SimpleTokenKind::Bogus`] afterwards.
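///
/// ## Example
///
/// A minimal usage sketch, not part of the original source; it assumes this
/// type is re-exported from the `ruff_python_trivia` crate:
///
/// ```ignore
/// use ruff_python_trivia::{BackwardsTokenizer, SimpleTokenKind};
/// use ruff_text_size::{TextLen, TextSize};
///
/// // Tokenize `x = y` from the end towards the start. The source contains no
/// // comments, so the comment-range slice is empty.
/// let code = "x = y";
/// let kinds: Vec<_> = BackwardsTokenizer::up_to(code.text_len(), code, &[])
///     .skip_trivia()
///     .map(|token| token.kind())
///     .collect();
/// assert_eq!(
///     kinds,
///     [SimpleTokenKind::Name, SimpleTokenKind::Equals, SimpleTokenKind::Name]
/// );
/// ```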
pub struct BackwardsTokenizer<'a> {
    offset: TextSize,
    back_offset: TextSize,
    /// Not `&CommentRanges` to avoid a circular dependency.
    comment_ranges: &'a [TextRange],
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}

impl<'a> BackwardsTokenizer<'a> {
    pub fn new(source: &'a str, range: TextRange, comment_range: &'a [TextRange]) -> Self {
        Self {
            offset: range.start(),
            back_offset: range.end(),
            // Throw out any comments that follow the range.
            comment_ranges: &comment_range
                [..comment_range.partition_point(|comment| comment.start() <= range.end())],
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }

    pub fn up_to(offset: TextSize, source: &'a str, comment_range: &'a [TextRange]) -> Self {
        Self::new(source, TextRange::up_to(offset), comment_range)
    }

    pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + 'a {
        self.filter(|t| !t.kind().is_trivia())
    }

    pub fn next_token(&mut self) -> SimpleToken {
        self.cursor.start_token();
        self.back_offset = self.cursor.text_len() + self.offset;

        let Some(last) = self.cursor.bump_back() else {
            return SimpleToken {
                kind: SimpleTokenKind::EndOfFile,
                range: TextRange::empty(self.back_offset),
            };
        };

        if self.bogus {
            let token = SimpleToken {
                kind: SimpleTokenKind::Bogus,
                range: TextRange::up_to(self.back_offset),
            };

            // Set the cursor to EOF
            self.cursor = Cursor::new("");
            self.back_offset = TextSize::new(0);
            return token;
        }

        if let Some(comment) = self
            .comment_ranges
            .last()
            .filter(|comment| comment.contains_inclusive(self.back_offset))
        {
            self.comment_ranges = &self.comment_ranges[..self.comment_ranges.len() - 1];

            // Skip the comment without iterating over the chars manually.
            self.cursor = Cursor::new(&self.source[TextRange::new(self.offset, comment.start())]);
            debug_assert_eq!(self.cursor.text_len() + self.offset, comment.start());
            return SimpleToken {
                kind: SimpleTokenKind::Comment,
                range: comment.range(),
            };
        }

        let kind = match last {
            // Space, tab, or form feed. We ignore the true semantics of form feed, and treat it as
            // whitespace. Note that this will lex out trailing whitespace from a comment as
            // whitespace rather than as part of the comment token, but this shouldn't matter for
            // our use case.
            ' ' | '\t' | '\x0C' => {
                self.cursor
                    .eat_back_while(|c| matches!(c, ' ' | '\t' | '\x0C'));
                SimpleTokenKind::Whitespace
            }

            '\r' => SimpleTokenKind::Newline,
            '\n' => {
                self.cursor.eat_char_back('\r');
                SimpleTokenKind::Newline
            }
            _ => self.next_token_inner(last),
        };

        let token_len = self.cursor.token_len();
        let start = self.back_offset - token_len;
        SimpleToken {
            kind,
            range: TextRange::at(start, token_len),
        }
    }

    /// Helper to parse the previous token once all whitespace has been skipped.
    fn next_token_inner(&mut self, last: char) -> SimpleTokenKind {
        match last {
            // Keywords and identifiers
            c if is_identifier_continuation(c) => {
                // If we only have identifier continuations but no start (e.g. `555`), we
                // don't want to consume the chars, so in that case we rewind the cursor
                // to this savepoint.
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    to_keyword_or_other(&self.source[range])
                } else {
                    self.cursor = savepoint;
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }

            // Non-trivia tokens that are unambiguous when lexing backwards.
            // In other words: these are characters that _don't_ appear at the
            // end of a multi-character token (like `!=`).
            '\\' => SimpleTokenKind::Continuation,
            ':' => SimpleTokenKind::Colon,
            '~' => SimpleTokenKind::Tilde,
            '%' => SimpleTokenKind::Percent,
            '|' => SimpleTokenKind::Vbar,
            ',' => SimpleTokenKind::Comma,
            ';' => SimpleTokenKind::Semi,
            '(' => SimpleTokenKind::LParen,
            ')' => SimpleTokenKind::RParen,
            '[' => SimpleTokenKind::LBracket,
            ']' => SimpleTokenKind::RBracket,
            '{' => SimpleTokenKind::LBrace,
            '}' => SimpleTokenKind::RBrace,
            '&' => SimpleTokenKind::Ampersand,
            '^' => SimpleTokenKind::Circumflex,
            '+' => SimpleTokenKind::Plus,
            '-' => SimpleTokenKind::Minus,

            // Non-trivia tokens that _are_ ambiguous when lexing backwards.
            // In other words: these are characters that _might_ mark the end
            // of a multi-character token (like `!=` or `->` or `//` or `**`).
            '=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
                // This could be a single-character token, like `+` in `x + y`, or a
                // multi-character token, like `+=` in `x += y`. It could also be a sequence
                // of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
                // important that we produce the same token stream when lexing backwards as
                // we do when lexing forwards. So, identify the range of the sequence, lex
                // forwards, and return the last token.
                let mut cursor = self.cursor.clone();
                cursor.eat_back_while(|c| {
                    matches!(
                        c,
                        ':' | '~' | '%' | '|' | '&' | '^' | '+' | '-' | '=' | '*' | '/' | '@'
                            | '!' | '<' | '>' | '.'
                    )
                });

                let token_len = cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                let forward_lexer = SimpleTokenizer::new(self.source, range);
                if let Some(token) = forward_lexer.last() {
                    // If the token spans multiple characters, bump the cursor. Note,
                    // though, that we already bumped the cursor past the last character
                    // in the token at the very start of `next_token`.
                    for _ in self.source[token.range].chars().rev().skip(1) {
                        self.cursor.bump_back().unwrap();
                    }
                    token.kind()
                } else {
                    self.bogus = true;
                    SimpleTokenKind::Other
                }
            }
            _ => {
                self.bogus = true;
                SimpleTokenKind::Other
            }
        }
    }
}

impl Iterator for BackwardsTokenizer<'_> {
    type Item = SimpleToken;

    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();

        if token.kind == SimpleTokenKind::EndOfFile {
            None
        } else {
            Some(token)
        }
    }
}