Add support for multi-character operator tokens to SimpleTokenizer (#6563)

## Summary

Allows for proper lexing of tokens like `->`.

The main challenge is to ensure that our forward and backwards
representations are the same for cases like `===`. Specifically, we want
that to lex as `==` followed by `=` regardless of whether it's a
forwards or backwards lex. To do so, we identify the range of the
sequential characters (the full span of `===`), lex it forwards, then
return the last token.

## Test Plan

`cargo test`
This commit is contained in:
Charlie Marsh 2023-08-16 09:09:19 -04:00 committed by GitHub
parent e28858bb29
commit 86ccdcc9d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 538 additions and 149 deletions

View file

@ -951,39 +951,11 @@ fn handle_dict_unpacking_comment<'a>(
// if the remaining tokens from the previous node are exactly `**`,
// re-assign the comment to the one that follows the stars
let mut count = 0u32;
// we start from the preceding node but we skip its token
if let Some(token) = tokens.next() {
// The Keyword case
if token.kind == SimpleTokenKind::Star {
count += 1;
if tokens.any(|token| token.kind == SimpleTokenKind::DoubleStar) {
CommentPlacement::trailing(following, comment)
} else {
// The dict case
debug_assert!(
matches!(
token,
SimpleToken {
kind: SimpleTokenKind::LBrace
| SimpleTokenKind::Comma
| SimpleTokenKind::Colon,
..
}
),
"{token:?}",
);
}
}
for token in tokens {
debug_assert!(token.kind == SimpleTokenKind::Star, "Expected star token");
count += 1;
}
if count == 2 {
return CommentPlacement::trailing(following, comment);
}
CommentPlacement::Default(comment)
}
}
/// Own line comments coming after the node are always dangling comments

View file

@ -30,6 +30,14 @@ impl<'a> Cursor<'a> {
self.chars.clone().next().unwrap_or(EOF_CHAR)
}
/// Returns the second character ahead in the input stream without
/// consuming anything.
/// Returns [`EOF_CHAR`] if fewer than two characters remain.
pub fn second(&self) -> char {
    // `Chars` is just a pair of pointers, so cloning it gives us a cheap
    // lookahead iterator that leaves the real cursor position untouched.
    self.chars.clone().nth(1).unwrap_or(EOF_CHAR)
}
/// Peeks the last character from the input stream without consuming it.
/// Returns [`EOF_CHAR`] if there are no characters left.
pub fn last(&self) -> char {

View file

@ -1,46 +0,0 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: Minus,
range: 0..1,
},
SimpleToken {
kind: Greater,
range: 1..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: Star,
range: 3..4,
},
SimpleToken {
kind: Equals,
range: 4..5,
},
SimpleToken {
kind: Whitespace,
range: 5..6,
},
SimpleToken {
kind: LParen,
range: 6..7,
},
SimpleToken {
kind: Tilde,
range: 7..8,
},
SimpleToken {
kind: Equals,
range: 8..9,
},
SimpleToken {
kind: RParen,
range: 9..10,
},
]

View file

@ -0,0 +1,14 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: EqEqual,
range: 0..2,
},
SimpleToken {
kind: Equals,
range: 2..3,
},
]

View file

@ -0,0 +1,22 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: RArrow,
range: 0..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: Other,
range: 3..4,
},
SimpleToken {
kind: Bogus,
range: 4..5,
},
]

View file

@ -0,0 +1,14 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: NotEqual,
range: 0..2,
},
SimpleToken {
kind: Equals,
range: 2..3,
},
]

View file

@ -0,0 +1,106 @@
---
source: crates/ruff_python_trivia/src/tokenizer.rs
expression: test_case.tokens()
---
[
SimpleToken {
kind: RArrow,
range: 0..2,
},
SimpleToken {
kind: Whitespace,
range: 2..3,
},
SimpleToken {
kind: StarEqual,
range: 3..5,
},
SimpleToken {
kind: Whitespace,
range: 5..6,
},
SimpleToken {
kind: LParen,
range: 6..7,
},
SimpleToken {
kind: Whitespace,
range: 7..8,
},
SimpleToken {
kind: MinusEqual,
range: 8..10,
},
SimpleToken {
kind: Whitespace,
range: 10..11,
},
SimpleToken {
kind: RParen,
range: 11..12,
},
SimpleToken {
kind: Whitespace,
range: 12..13,
},
SimpleToken {
kind: Tilde,
range: 13..14,
},
SimpleToken {
kind: Whitespace,
range: 14..15,
},
SimpleToken {
kind: DoubleSlash,
range: 15..17,
},
SimpleToken {
kind: Whitespace,
range: 17..18,
},
SimpleToken {
kind: DoubleStar,
range: 18..20,
},
SimpleToken {
kind: Whitespace,
range: 20..21,
},
SimpleToken {
kind: DoubleStarEqual,
range: 21..24,
},
SimpleToken {
kind: Whitespace,
range: 24..25,
},
SimpleToken {
kind: Circumflex,
range: 25..26,
},
SimpleToken {
kind: Whitespace,
range: 26..27,
},
SimpleToken {
kind: CircumflexEqual,
range: 27..29,
},
SimpleToken {
kind: Whitespace,
range: 29..30,
},
SimpleToken {
kind: Vbar,
range: 30..31,
},
SimpleToken {
kind: Whitespace,
range: 31..32,
},
SimpleToken {
kind: VbarEqual,
range: 32..34,
},
]

View file

@ -1,7 +1,8 @@
use memchr::{memchr2, memchr3, memrchr3_iter};
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::{is_python_whitespace, Cursor};
/// Searches for the first non-trivia character in `range`.
@ -213,6 +214,78 @@ pub enum SimpleTokenKind {
/// `~`
Tilde,
/// `==`
EqEqual,
/// `!=`
NotEqual,
/// `<=`
LessEqual,
/// `>=`
GreaterEqual,
/// `<<`
LeftShift,
/// `>>`
RightShift,
/// `**`
DoubleStar,
/// `**=`
DoubleStarEqual,
/// `+=`
PlusEqual,
/// `-=`
MinusEqual,
/// `*=`
StarEqual,
/// `/=`
SlashEqual,
/// `%=`
PercentEqual,
/// `&=`
AmperEqual,
/// `|=`
VbarEqual,
/// `^=`
CircumflexEqual,
/// `<<=`
LeftShiftEqual,
/// `>>=`
RightShiftEqual,
/// `//`
DoubleSlash,
/// `//=`
DoubleSlashEqual,
/// `:=`
ColonEqual,
/// `...`
Ellipsis,
/// `@=`
AtEqual,
/// `->`
RArrow,
/// `and`
And,
@ -326,35 +399,6 @@ pub enum SimpleTokenKind {
}
impl SimpleTokenKind {
/// Maps a single character to the token kind it denotes on its own.
///
/// Characters without a dedicated single-character kind map to
/// [`SimpleTokenKind::Other`]; callers use that as the signal to mark the
/// tokenizer as bogus.
const fn from_non_trivia_char(c: char) -> SimpleTokenKind {
match c {
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
',' => SimpleTokenKind::Comma,
':' => SimpleTokenKind::Colon,
';' => SimpleTokenKind::Semi,
'/' => SimpleTokenKind::Slash,
'*' => SimpleTokenKind::Star,
'.' => SimpleTokenKind::Dot,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
'=' => SimpleTokenKind::Equals,
'>' => SimpleTokenKind::Greater,
'<' => SimpleTokenKind::Less,
'%' => SimpleTokenKind::Percent,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'|' => SimpleTokenKind::Vbar,
'@' => SimpleTokenKind::At,
'~' => SimpleTokenKind::Tilde,
_ => SimpleTokenKind::Other,
}
}
const fn is_trivia(self) -> bool {
matches!(
self,
@ -478,6 +522,20 @@ impl<'a> SimpleTokenizer<'a> {
}
let kind = match first {
// Keywords and identifiers
c if is_identifier_start(c) => {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
let kind = self.to_keyword_or_other(range);
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
}
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
SimpleTokenKind::Whitespace
@ -497,21 +555,156 @@ impl<'a> SimpleTokenizer<'a> {
'\\' => SimpleTokenKind::Continuation,
c => {
let kind = if is_identifier_start(c) {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
self.to_keyword_or_other(range)
// Non-trivia, non-keyword tokens
'=' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::EqEqual
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
SimpleTokenKind::Equals
}
kind
}
'+' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::PlusEqual
} else {
SimpleTokenKind::Plus
}
}
'*' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::StarEqual
} else if self.cursor.eat_char('*') {
if self.cursor.eat_char('=') {
SimpleTokenKind::DoubleStarEqual
} else {
SimpleTokenKind::DoubleStar
}
} else {
SimpleTokenKind::Star
}
}
'/' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::SlashEqual
} else if self.cursor.eat_char('/') {
if self.cursor.eat_char('=') {
SimpleTokenKind::DoubleSlashEqual
} else {
SimpleTokenKind::DoubleSlash
}
} else {
SimpleTokenKind::Slash
}
}
'%' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::PercentEqual
} else {
SimpleTokenKind::Percent
}
}
'|' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::VbarEqual
} else {
SimpleTokenKind::Vbar
}
}
'^' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::CircumflexEqual
} else {
SimpleTokenKind::Circumflex
}
}
'&' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::AmperEqual
} else {
SimpleTokenKind::Ampersand
}
}
'-' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::MinusEqual
} else if self.cursor.eat_char('>') {
SimpleTokenKind::RArrow
} else {
SimpleTokenKind::Minus
}
}
'@' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::AtEqual
} else {
SimpleTokenKind::At
}
}
'!' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::NotEqual
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
'~' => SimpleTokenKind::Tilde,
':' => {
if self.cursor.eat_char('=') {
SimpleTokenKind::ColonEqual
} else {
SimpleTokenKind::Colon
}
}
';' => SimpleTokenKind::Semi,
'<' => {
if self.cursor.eat_char('<') {
if self.cursor.eat_char('=') {
SimpleTokenKind::LeftShiftEqual
} else {
SimpleTokenKind::LeftShift
}
} else if self.cursor.eat_char('=') {
SimpleTokenKind::LessEqual
} else {
SimpleTokenKind::Less
}
}
'>' => {
if self.cursor.eat_char('>') {
if self.cursor.eat_char('=') {
SimpleTokenKind::RightShiftEqual
} else {
SimpleTokenKind::RightShift
}
} else if self.cursor.eat_char('=') {
SimpleTokenKind::GreaterEqual
} else {
SimpleTokenKind::Greater
}
}
',' => SimpleTokenKind::Comma,
'.' => {
if self.cursor.first() == '.' && self.cursor.second() == '.' {
self.cursor.bump();
self.cursor.bump();
SimpleTokenKind::Ellipsis
} else {
SimpleTokenKind::Dot
}
}
// Bracket tokens
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
_ => {
self.bogus = true;
SimpleTokenKind::Other
}
};
@ -612,10 +805,10 @@ impl<'a> SimpleTokenizer<'a> {
}
SimpleTokenKind::Comment
} else if c == '\\' {
SimpleTokenKind::Continuation
} else {
let kind = if is_identifier_continuation(c) {
match c {
// Keywords and identifiers
c if is_identifier_continuation(c) => {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
@ -633,17 +826,87 @@ impl<'a> SimpleTokenizer<'a> {
self.to_keyword_or_other(range)
} else {
self.cursor = savepoint;
self.bogus = true;
SimpleTokenKind::Other
}
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
// Non-trivia tokens that are unambiguous when lexing backwards.
// In other words: these are characters that _don't_ appear at the
// end of a multi-character token (like `!=`).
'\\' => SimpleTokenKind::Continuation,
':' => SimpleTokenKind::Colon,
'~' => SimpleTokenKind::Tilde,
'%' => SimpleTokenKind::Percent,
'|' => SimpleTokenKind::Vbar,
',' => SimpleTokenKind::Comma,
';' => SimpleTokenKind::Semi,
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
// Non-trivia tokens that _are_ ambiguous when lexing backwards.
// In other words: these are characters that _might_ mark the end
// of a multi-character token (like `!=` or `->` or `//` or `**`).
'=' | '*' | '/' | '@' | '!' | '<' | '>' | '.' => {
// This could be a single-character token, like `=` in `x = y`, or a
// multi-character token, like `==` in `x == y`. It could also be a sequence
// of multi-character tokens, like `x ==== y`, which is invalid, _but_ it's
// important that we produce the same token stream when lexing backwards as
// we do when lexing forwards. So, identify the range of the sequence, lex
// forwards, and return the last token.
let mut cursor = self.cursor.clone();
cursor.eat_back_while(|c| {
matches!(
c,
':' | '~'
| '%'
| '|'
| '&'
| '^'
| '+'
| '-'
| '='
| '*'
| '/'
| '@'
| '!'
| '<'
| '>'
| '.'
)
});
let token_len = cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
let forward_lexer = Self::new(self.source, range);
if let Some(token) = forward_lexer.last() {
// If the token spans multiple characters, bump the cursor. Note,
// though, that we already bumped the cursor to past the last character
// in the token at the very start of `next_token_back`.
for _ in self.source[token.range].chars().rev().skip(1) {
self.cursor.bump_back().unwrap();
}
token.kind()
} else {
self.bogus = true;
SimpleTokenKind::Other
}
}
_ => {
self.bogus = true;
SimpleTokenKind::Other
}
}
}
}
};
@ -871,6 +1134,7 @@ impl QuoteKind {
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
@ -946,6 +1210,30 @@ mod tests {
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_eq() {
// Should tokenize as `==`, then `=`, regardless of whether we're lexing forwards or
// backwards.
// `===` is not valid Python, but the split must still be deterministic: the
// backwards lexer finds the full operator run, lexes it forwards, and emits
// the last token, so both directions agree.
let source = "===";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_not_eq() {
// Should tokenize as `!=`, then `=`, regardless of whether we're lexing forwards or
// backwards.
// Exercises the same forward/backward agreement guarantee as `tokenize_eq`,
// but with a run that starts with a distinct multi-character token (`!=`).
let source = "!==";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_continuation() {
let source = "( \\\n )";
@ -957,8 +1245,8 @@ mod tests {
}
#[test]
fn tokenize_characters() {
let source = "-> *= (~=)";
fn tokenize_operators() {
let source = "-> *= ( -= ) ~ // ** **= ^ ^= | |=";
let test_case = tokenize(source);
@ -966,6 +1254,17 @@ mod tests {
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_invalid_operators() {
// `$` never starts a valid token, so the tokenizer flags it and everything
// after it as bogus.
let source = "-> $=";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
// (whichever direction hits the invalid `$` first poisons the rest), so no
// `assert_reverse_tokenization` here.
}
#[test]
fn tricky_unicode() {
let source = "មុ";