Expand SimpleTokenizer to all keywords and single-character tokens (#6518)

## Summary

For #6485, I need to be able to use the `SimpleTokenizer` to lex the
space between any two adjacent expressions (i.e., the space between a
preceding and following node). This requires that we support a wider
range of keywords (like `and`, to connect the pieces of `x and y`), and
some additional single-character tokens (like `-` and `>`, to support
`->`). Note that the `SimpleTokenizer` does not support multi-character
operator tokens, so the `->` in a function signature is lexed as a `-`
followed by a `>` -- but this is fine for our purposes.
This commit is contained in:
Charlie Marsh 2023-08-14 10:35:31 -04:00 committed by GitHub
parent a7cf8f0b77
commit 3711f8ad59
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 217 additions and 12 deletions

View file

@ -177,29 +177,141 @@ pub enum SimpleTokenKind {
/// `.`.
Dot,
/// `else`
Else,
/// `+`
Plus,
/// `if`
If,
/// `-`
Minus,
/// `elif`
Elif,
/// `=`
Equals,
/// `in`
In,
/// `>`
Greater,
/// `<`
Less,
/// `%`
Percent,
/// `&`
Ampersand,
/// `^`
Circumflex,
/// `|`
Vbar,
/// `@`
At,
/// `~`
Tilde,
/// `and`
And,
/// `as`
As,
/// `assert`
Assert,
/// `async`
Async,
/// `await`
Await,
/// `break`
Break,
/// `class`
Class,
/// `continue`
Continue,
/// `def`
Def,
/// `del`
Del,
/// `elif`
Elif,
/// `else`
Else,
/// `except`
Except,
/// `finally`
Finally,
/// `for`
For,
/// `from`
From,
/// `global`
Global,
/// `if`
If,
/// `import`
Import,
/// `in`
In,
/// `is`
Is,
/// `lambda`
Lambda,
/// `nonlocal`
Nonlocal,
/// `not`
Not,
/// `or`
Or,
/// `pass`
Pass,
/// `raise`
Raise,
/// `return`
Return,
/// `try`
Try,
/// `while`
While,
/// `match`
Match,
/// `type`
Type,
/// `case`
Case,
/// `with`
With,
/// `async`
Async,
/// `yield`
Yield,
/// Any other non trivia token.
Other,
@ -222,6 +334,17 @@ impl SimpleTokenKind {
'/' => SimpleTokenKind::Slash,
'*' => SimpleTokenKind::Star,
'.' => SimpleTokenKind::Dot,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
'=' => SimpleTokenKind::Equals,
'>' => SimpleTokenKind::Greater,
'<' => SimpleTokenKind::Less,
'%' => SimpleTokenKind::Percent,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'|' => SimpleTokenKind::Vbar,
'@' => SimpleTokenKind::At,
'~' => SimpleTokenKind::Tilde,
_ => SimpleTokenKind::Other,
}
}
@ -289,15 +412,41 @@ impl<'a> SimpleTokenizer<'a> {
/// Classifies the source text covered by `range` as a keyword token.
///
/// Returns the matching keyword variant when the text is exactly one of the
/// keywords listed in the `match` below, and [`SimpleTokenKind::Other`] for
/// any other text.
fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
    let source = &self.source[range];
    match source {
        "and" => SimpleTokenKind::And,
        "as" => SimpleTokenKind::As,
        "assert" => SimpleTokenKind::Assert,
        "async" => SimpleTokenKind::Async,
        "await" => SimpleTokenKind::Await,
        "break" => SimpleTokenKind::Break,
        "class" => SimpleTokenKind::Class,
        "continue" => SimpleTokenKind::Continue,
        "def" => SimpleTokenKind::Def,
        "del" => SimpleTokenKind::Del,
        "elif" => SimpleTokenKind::Elif,
        "else" => SimpleTokenKind::Else,
        "except" => SimpleTokenKind::Except,
        "finally" => SimpleTokenKind::Finally,
        "for" => SimpleTokenKind::For,
        "from" => SimpleTokenKind::From,
        "global" => SimpleTokenKind::Global,
        "if" => SimpleTokenKind::If,
        "import" => SimpleTokenKind::Import,
        "in" => SimpleTokenKind::In,
        "is" => SimpleTokenKind::Is,
        "lambda" => SimpleTokenKind::Lambda,
        "nonlocal" => SimpleTokenKind::Nonlocal,
        "not" => SimpleTokenKind::Not,
        "or" => SimpleTokenKind::Or,
        "pass" => SimpleTokenKind::Pass,
        "raise" => SimpleTokenKind::Raise,
        "return" => SimpleTokenKind::Return,
        "try" => SimpleTokenKind::Try,
        "while" => SimpleTokenKind::While,
        // `match` and `type` are soft keywords whose meaning depends on the
        // context, but we can always lex them as keywords and leave it to the
        // caller (parser) to decide if they should be handled as an identifier
        // or keyword.
        "match" => SimpleTokenKind::Match,
        "type" => SimpleTokenKind::Type,
        "case" => SimpleTokenKind::Case,
        "with" => SimpleTokenKind::With,
        "yield" => SimpleTokenKind::Yield,
        // Potentially an identifier, but only if it isn't a string prefix. We
        // can ignore this for now.
        // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
        _ => SimpleTokenKind::Other,
    }
}
@ -801,6 +950,16 @@ mod tests {
test_case.assert_reverse_tokenization();
}
/// Snapshot test for single-character tokens that sit next to each other.
///
/// The input contains `->`, `*=` and `~=`, none of which is lexed as a single
/// token: each character maps to its own kind (e.g. `-` to `Minus`, `>` to
/// `Greater`, `*` to `Star`, `=` to `Equals`, `~` to `Tilde`).
#[test]
fn tokenize_characters() {
let source = "-> *= (~=)";
let test_case = tokenize(source);
// Compare the forward token stream against the stored snapshot.
assert_debug_snapshot!(test_case.tokens());
// NOTE(review): presumably verifies that tokenizing the same source backwards
// yields the same tokens; the helper is defined outside this view, so confirm.
test_case.assert_reverse_tokenization();
}
#[test]
fn tricky_unicode() {
let source = "មុ";