Remove exception-handler lexing from unused-bound-exception fix (#5851)

## Summary

The motivation here is that this change will make the rule easier to
rewrite as a deferred check. Right now, we can't run this rule in the
deferred phase, because it depends on the `except_handler` node to power
its autofix. Instead of lexing the `except_handler`, we can use the
`SimpleTokenizer` from the formatter, and just lex forwards and backwards.

For context, this rule detects the unused `e` in:

```python
try:
  pass
except ValueError as e:
  pass
```
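
For a sense of what this enables, here's a minimal sketch, assuming the moved tokenizer keeps the API shown in the deleted `trivia.rs` below (`as_deletion_range` is a hypothetical helper, not code from this commit): given the range of the bound name, lex backwards with the `SimpleTokenizer` to find the `as` keyword and the exception type, and compute the deletion range directly.

```rust
use ruff_python_whitespace::{SimpleTokenizer, TokenKind};
use ruff_text_size::TextRange;

/// Hypothetical helper: given the range of the bound name (the `e` in
/// `except ValueError as e:`), compute the range of ` as e` to delete.
fn as_deletion_range(name: TextRange, source: &str) -> Option<TextRange> {
    // Lex backwards from the start of the bound name, skipping trivia
    // (whitespace, comments, and continuations).
    let mut tokens = SimpleTokenizer::up_to(name.start(), source).skip_trivia();

    // The first non-trivia token before the name should be the `as` keyword.
    let as_token = tokens.next_back()?;
    if as_token.kind() != TokenKind::As {
        return None;
    }

    // The token before `as` is the exception type (`ValueError`); delete
    // from its end through the end of the bound name.
    let preceding = tokens.next_back()?;
    Some(TextRange::new(preceding.end(), name.end()))
}
```

Because the tokenizer is zero-allocation and can start at an arbitrary (trivia) offset, this works without access to the enclosing `except_handler` node, which is what frees the rule up to run as a deferred check.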
Charlie Marsh committed 2023-07-18 14:27:46 -04:00 (via GitHub)
commit 4204fc002d · parent 41da52a61b
32 changed files with 125 additions and 112 deletions

@@ -2,10 +2,12 @@ use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::Ranged;
use ruff_formatter::{format_args, write, Argument, Arguments};
use ruff_python_whitespace::{
lines_after, skip_trailing_trivia, SimpleTokenizer, Token, TokenKind,
};
use crate::context::NodeLevel;
use crate::prelude::*;
use crate::trivia::{lines_after, skip_trailing_trivia, SimpleTokenizer, Token, TokenKind};
use crate::MagicTrailingComma;
/// Adds parentheses and indents `content` if it doesn't fit on a line.

@@ -3,11 +3,11 @@ use rustpython_parser::ast::Ranged;
use ruff_formatter::{format_args, write, FormatError, SourceCode};
use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_python_whitespace::{lines_after, lines_before, skip_trailing_trivia};
use crate::comments::SourceComment;
use crate::context::NodeLevel;
use crate::prelude::*;
use crate::trivia::{lines_after, lines_before, skip_trailing_trivia};
/// Formats the leading comments of a node.
pub(crate) fn leading_node_comments<T>(node: &T) -> FormatLeadingComments

@@ -7,14 +7,16 @@ use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::whitespace;
use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
use ruff_python_whitespace::{
first_non_trivia_token_rev, PythonWhitespace, SimpleTokenizer, Token, TokenKind,
UniversalNewlines,
};
use crate::comments::visitor::{CommentPlacement, DecoratedComment};
use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSection};
use crate::other::arguments::{
assign_argument_separator_comment_placement, find_argument_separators,
};
use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};
/// Implements the custom comment placement logic.
pub(super) fn place_comment<'a>(

@@ -3,14 +3,13 @@ use rustpython_parser::ast::{Expr, ExprCall, Ranged};
use ruff_formatter::write;
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_whitespace::{SimpleTokenizer, TokenKind};
use crate::comments::dangling_comments;
use crate::expression::parentheses::{
parenthesized, NeedsParentheses, OptionalParentheses, Parentheses,
};
use crate::prelude::*;
use crate::trivia::{SimpleTokenizer, TokenKind};
use crate::FormatNodeRule;
#[derive(Default)]

@@ -1,16 +1,17 @@
use crate::comments::{dangling_comments, SourceComment};
use crate::context::PyFormatContext;
use crate::expression::parentheses::{NeedsParentheses, OptionalParentheses};
use crate::trivia::Token;
use crate::trivia::{first_non_trivia_token, TokenKind};
use crate::{AsFormat, FormatNodeRule, PyFormatter};
use ruff_formatter::prelude::{hard_line_break, line_suffix_boundary, space, text};
use ruff_formatter::{write, Buffer, Format, FormatError, FormatResult};
use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_text_size::TextRange;
use rustpython_parser::ast::ExprSlice;
use rustpython_parser::ast::{Expr, Ranged};
use ruff_formatter::prelude::{hard_line_break, line_suffix_boundary, space, text};
use ruff_formatter::{write, Buffer, Format, FormatError, FormatResult};
use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_python_whitespace::{first_non_trivia_token, Token, TokenKind};
use crate::comments::{dangling_comments, SourceComment};
use crate::context::PyFormatContext;
use crate::expression::parentheses::{NeedsParentheses, OptionalParentheses};
use crate::{AsFormat, FormatNodeRule, PyFormatter};
#[derive(Default)]
pub struct FormatExprSlice;

@@ -1,15 +1,17 @@
use crate::comments::trailing_comments;
use crate::context::PyFormatContext;
use crate::expression::parentheses::{NeedsParentheses, OptionalParentheses};
use crate::trivia::{SimpleTokenizer, TokenKind};
use crate::{AsFormat, FormatNodeRule, PyFormatter};
use ruff_formatter::prelude::{hard_line_break, space, text};
use ruff_formatter::{Format, FormatContext, FormatResult};
use ruff_python_ast::node::AnyNodeRef;
use ruff_text_size::{TextLen, TextRange};
use rustpython_parser::ast::UnaryOp;
use rustpython_parser::ast::{ExprUnaryOp, Ranged};
use ruff_formatter::prelude::{hard_line_break, space, text};
use ruff_formatter::{Format, FormatContext, FormatResult};
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_whitespace::{SimpleTokenizer, TokenKind};
use crate::comments::trailing_comments;
use crate::context::PyFormatContext;
use crate::expression::parentheses::{NeedsParentheses, OptionalParentheses};
use crate::{AsFormat, FormatNodeRule, PyFormatter};
#[derive(Default)]
pub struct FormatExprUnaryOp;

@@ -1,10 +1,12 @@
use crate::context::NodeLevel;
use crate::prelude::*;
use crate::trivia::{first_non_trivia_token, SimpleTokenizer, Token, TokenKind};
use rustpython_parser::ast::Ranged;
use ruff_formatter::prelude::tag::Condition;
use ruff_formatter::{format_args, write, Argument, Arguments};
use ruff_python_ast::node::AnyNodeRef;
use rustpython_parser::ast::Ranged;
use ruff_python_whitespace::{first_non_trivia_token, SimpleTokenizer, Token, TokenKind};
use crate::context::NodeLevel;
use crate::prelude::*;
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum OptionalParentheses {

@@ -33,7 +33,6 @@ pub(crate) mod other;
pub(crate) mod pattern;
mod prelude;
pub(crate) mod statement;
mod trivia;
include!("../../ruff_formatter/shared_traits.rs");

@@ -1,9 +1,11 @@
use std::usize;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::{Arguments, Ranged};
use ruff_formatter::{format_args, write};
use ruff_python_ast::node::{AnyNodeRef, AstNode};
use ruff_python_whitespace::{first_non_trivia_token, SimpleTokenizer, Token, TokenKind};
use crate::comments::{
dangling_comments, leading_comments, leading_node_comments, trailing_comments,
@@ -12,9 +14,7 @@ use crate::comments::{
use crate::context::NodeLevel;
use crate::expression::parentheses::parenthesized;
use crate::prelude::*;
use crate::trivia::{first_non_trivia_token, SimpleTokenizer, Token, TokenKind};
use crate::FormatNodeRule;
use ruff_text_size::{TextRange, TextSize};
#[derive(Default)]
pub struct FormatArguments;

@@ -1,218 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokenize_reverse()
---
[
Token {
kind: RParen,
range: 52..53,
},
Token {
kind: Other,
range: 51..52,
},
Token {
kind: Bogus,
range: 50..51,
},
Token {
kind: Bogus,
range: 49..50,
},
Token {
kind: Bogus,
range: 48..49,
},
Token {
kind: Bogus,
range: 47..48,
},
Token {
kind: Bogus,
range: 46..47,
},
Token {
kind: Bogus,
range: 45..46,
},
Token {
kind: Bogus,
range: 44..45,
},
Token {
kind: Bogus,
range: 43..44,
},
Token {
kind: Bogus,
range: 42..43,
},
Token {
kind: Bogus,
range: 41..42,
},
Token {
kind: Bogus,
range: 40..41,
},
Token {
kind: Bogus,
range: 39..40,
},
Token {
kind: Bogus,
range: 38..39,
},
Token {
kind: Bogus,
range: 37..38,
},
Token {
kind: Bogus,
range: 36..37,
},
Token {
kind: Bogus,
range: 35..36,
},
Token {
kind: Bogus,
range: 34..35,
},
Token {
kind: Bogus,
range: 33..34,
},
Token {
kind: Bogus,
range: 32..33,
},
Token {
kind: Bogus,
range: 31..32,
},
Token {
kind: Bogus,
range: 30..31,
},
Token {
kind: Bogus,
range: 29..30,
},
Token {
kind: Bogus,
range: 28..29,
},
Token {
kind: Bogus,
range: 27..28,
},
Token {
kind: Bogus,
range: 26..27,
},
Token {
kind: Bogus,
range: 25..26,
},
Token {
kind: Bogus,
range: 24..25,
},
Token {
kind: Bogus,
range: 23..24,
},
Token {
kind: Bogus,
range: 22..23,
},
Token {
kind: Bogus,
range: 21..22,
},
Token {
kind: Bogus,
range: 20..21,
},
Token {
kind: Bogus,
range: 19..20,
},
Token {
kind: Bogus,
range: 18..19,
},
Token {
kind: Bogus,
range: 17..18,
},
Token {
kind: Bogus,
range: 16..17,
},
Token {
kind: Bogus,
range: 15..16,
},
Token {
kind: Bogus,
range: 14..15,
},
Token {
kind: Bogus,
range: 13..14,
},
Token {
kind: Bogus,
range: 12..13,
},
Token {
kind: Bogus,
range: 11..12,
},
Token {
kind: Bogus,
range: 10..11,
},
Token {
kind: Bogus,
range: 9..10,
},
Token {
kind: Bogus,
range: 8..9,
},
Token {
kind: Bogus,
range: 7..8,
},
Token {
kind: Bogus,
range: 6..7,
},
Token {
kind: Bogus,
range: 5..6,
},
Token {
kind: Bogus,
range: 4..5,
},
Token {
kind: Bogus,
range: 3..4,
},
Token {
kind: Bogus,
range: 2..3,
},
Token {
kind: Bogus,
range: 1..2,
},
Token {
kind: Bogus,
range: 0..1,
},
]

@@ -1,10 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Other,
range: 0..2,
},
]

@@ -1,18 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Other,
range: 0..1,
},
Token {
kind: Bogus,
range: 1..2,
},
Token {
kind: Bogus,
range: 2..3,
},
]

@@ -1,126 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Comment,
range: 0..17,
},
Token {
kind: Newline,
range: 17..18,
},
Token {
kind: Whitespace,
range: 18..26,
},
Token {
kind: Other,
range: 26..27,
},
Token {
kind: Bogus,
range: 27..28,
},
Token {
kind: Bogus,
range: 28..29,
},
Token {
kind: Bogus,
range: 29..30,
},
Token {
kind: Bogus,
range: 30..31,
},
Token {
kind: Bogus,
range: 31..32,
},
Token {
kind: Bogus,
range: 32..33,
},
Token {
kind: Bogus,
range: 33..34,
},
Token {
kind: Bogus,
range: 34..35,
},
Token {
kind: Bogus,
range: 35..36,
},
Token {
kind: Bogus,
range: 36..37,
},
Token {
kind: Bogus,
range: 37..38,
},
Token {
kind: Bogus,
range: 38..39,
},
Token {
kind: Bogus,
range: 39..40,
},
Token {
kind: Bogus,
range: 40..41,
},
Token {
kind: Bogus,
range: 41..42,
},
Token {
kind: Bogus,
range: 42..43,
},
Token {
kind: Bogus,
range: 43..44,
},
Token {
kind: Bogus,
range: 44..45,
},
Token {
kind: Bogus,
range: 45..46,
},
Token {
kind: Bogus,
range: 46..47,
},
Token {
kind: Bogus,
range: 47..48,
},
Token {
kind: Bogus,
range: 48..49,
},
Token {
kind: Bogus,
range: 49..50,
},
Token {
kind: Bogus,
range: 50..51,
},
Token {
kind: Bogus,
range: 51..52,
},
Token {
kind: Bogus,
range: 52..53,
},
]

@@ -1,22 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Comma,
range: 0..1,
},
Token {
kind: Comma,
range: 1..2,
},
Token {
kind: Comma,
range: 2..3,
},
Token {
kind: Comma,
range: 3..4,
},
]

@@ -1,30 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: LParen,
range: 0..1,
},
Token {
kind: Whitespace,
range: 1..2,
},
Token {
kind: Continuation,
range: 2..3,
},
Token {
kind: Newline,
range: 3..4,
},
Token {
kind: Whitespace,
range: 4..5,
},
Token {
kind: RParen,
range: 5..6,
},
]

@@ -1,34 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: If,
range: 0..2,
},
Token {
kind: Whitespace,
range: 2..3,
},
Token {
kind: In,
range: 3..5,
},
Token {
kind: Whitespace,
range: 5..6,
},
Token {
kind: Else,
range: 6..10,
},
Token {
kind: Whitespace,
range: 10..11,
},
Token {
kind: Match,
range: 11..16,
},
]

@@ -1,30 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: LParen,
range: 0..1,
},
Token {
kind: LBracket,
range: 1..2,
},
Token {
kind: LBrace,
range: 2..3,
},
Token {
kind: RBrace,
range: 3..4,
},
Token {
kind: RBracket,
range: 4..5,
},
Token {
kind: RParen,
range: 5..6,
},
]

@@ -1,42 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Whitespace,
range: 0..1,
},
Token {
kind: Comment,
range: 1..30,
},
Token {
kind: Newline,
range: 30..31,
},
Token {
kind: Whitespace,
range: 31..39,
},
Token {
kind: Comment,
range: 39..77,
},
Token {
kind: Newline,
range: 77..78,
},
Token {
kind: Whitespace,
range: 78..86,
},
Token {
kind: Comma,
range: 86..87,
},
Token {
kind: Slash,
range: 87..88,
},
]

@@ -1,18 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: RParen,
range: 14..15,
},
Token {
kind: Whitespace,
range: 15..16,
},
Token {
kind: Comment,
range: 16..25,
},
]

@@ -1,22 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Comment,
range: 0..9,
},
Token {
kind: Newline,
range: 9..10,
},
Token {
kind: Whitespace,
range: 10..14,
},
Token {
kind: Comment,
range: 14..23,
},
]

@@ -1,10 +0,0 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
Token {
kind: Other,
range: 0..6,
},
]

@@ -1,12 +1,13 @@
use crate::comments::trailing_comments;
use crate::expression::parentheses::{parenthesized, Parentheses};
use crate::prelude::*;
use crate::trivia::{SimpleTokenizer, TokenKind};
use ruff_formatter::write;
use ruff_text_size::TextRange;
use rustpython_parser::ast::{Ranged, StmtClassDef};
use ruff_formatter::write;
use ruff_python_whitespace::{SimpleTokenizer, TokenKind};
use crate::comments::trailing_comments;
use crate::expression::parentheses::{parenthesized, Parentheses};
use crate::prelude::*;
#[derive(Default)]
pub struct FormatStmtClassDef;

@@ -2,12 +2,12 @@ use rustpython_parser::ast::{Ranged, StmtFunctionDef};
use ruff_formatter::{write, FormatOwnedWithRule, FormatRefWithRule};
use ruff_python_ast::function::AnyFunctionDefinition;
use ruff_python_whitespace::{lines_after, skip_trailing_trivia};
use crate::comments::{leading_comments, trailing_comments};
use crate::context::NodeLevel;
use crate::expression::parentheses::{optional_parentheses, Parentheses};
use crate::prelude::*;
use crate::trivia::{lines_after, skip_trailing_trivia};
use crate::FormatNodeRule;
#[derive(Default)]

@@ -3,13 +3,13 @@ use rustpython_parser::ast::{Ranged, StmtAsyncWith, StmtWith, Suite, WithItem};
use ruff_formatter::{format_args, write, FormatError};
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_whitespace::{SimpleTokenizer, TokenKind};
use crate::comments::trailing_comments;
use crate::expression::parentheses::{
in_parentheses_only_soft_line_break_or_space, optional_parentheses,
};
use crate::prelude::*;
use crate::trivia::{SimpleTokenizer, TokenKind};
use crate::FormatNodeRule;
pub(super) enum AnyStatementWith<'a> {

@@ -1,10 +1,12 @@
use crate::context::NodeLevel;
use crate::prelude::*;
use crate::trivia::lines_before;
use rustpython_parser::ast::{Ranged, Stmt, Suite};
use ruff_formatter::{
format_args, write, FormatOwnedWithRule, FormatRefWithRule, FormatRuleWithOptions,
};
use rustpython_parser::ast::{Ranged, Stmt, Suite};
use ruff_python_whitespace::lines_before;
use crate::context::NodeLevel;
use crate::prelude::*;
/// Level at which the [`Suite`] appears in the source code.
#[derive(Copy, Clone, Debug)]
@@ -185,13 +187,15 @@ impl<'ast> IntoFormat<PyFormatContext<'ast>> for Suite {
#[cfg(test)]
mod tests {
use rustpython_parser::ast::Suite;
use rustpython_parser::Parse;
use ruff_formatter::format;
use crate::comments::Comments;
use crate::prelude::*;
use crate::statement::suite::SuiteLevel;
use crate::PyFormatOptions;
use ruff_formatter::format;
use rustpython_parser::ast::Suite;
use rustpython_parser::Parse;
fn format_suite(level: SuiteLevel) -> String {
let source = r#"

@@ -1,771 +0,0 @@
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use ruff_python_whitespace::{is_python_whitespace, Cursor};
/// Searches for the first non-trivia character in `range`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` with the first non-trivia token in the range, if any.
///
/// Returns `None` if the range is empty or only contains trivia (whitespace or comments).
pub(crate) fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<Token> {
SimpleTokenizer::starts_at(offset, code)
.skip_trivia()
.next()
}
/// Returns the first non-trivia token right before `offset` or `None` if at the start of the file
/// or all preceding tokens are trivia tokens.
///
/// ## Notes
///
/// Prefer [`first_non_trivia_token`] whenever possible because reverse lookup is expensive because of comments.
pub(crate) fn first_non_trivia_token_rev(offset: TextSize, code: &str) -> Option<Token> {
SimpleTokenizer::up_to(offset, code)
.skip_trivia()
.next_back()
}
/// Returns the number of newlines between `offset` and the first non whitespace character in the source code.
pub(crate) fn lines_before(offset: TextSize, code: &str) -> u32 {
let tokens = SimpleTokenizer::up_to(offset, code);
let mut newlines = 0u32;
for token in tokens.rev() {
match token.kind() {
TokenKind::Newline => {
newlines += 1;
}
TokenKind::Whitespace => {
// ignore
}
_ => {
break;
}
}
}
newlines
}
/// Counts the empty lines between `offset` and the first non-whitespace character.
pub(crate) fn lines_after(offset: TextSize, code: &str) -> u32 {
let tokens = SimpleTokenizer::starts_at(offset, code);
let mut newlines = 0u32;
for token in tokens {
match token.kind() {
TokenKind::Newline => {
newlines += 1;
}
TokenKind::Whitespace => {
// ignore
}
_ => {
break;
}
}
}
newlines
}
/// Returns the position after skipping any trailing trivia up to, but not including the newline character.
pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
let tokenizer = SimpleTokenizer::starts_at(offset, code);
for token in tokenizer {
match token.kind() {
TokenKind::Whitespace | TokenKind::Comment | TokenKind::Continuation => {
// No op
}
_ => {
return token.start();
}
}
}
offset
}
fn is_identifier_start(c: char) -> bool {
c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
}
// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
is_xid_continue(c)
}
}
fn is_non_ascii_identifier_start(c: char) -> bool {
is_xid_start(c)
}
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct Token {
pub(crate) kind: TokenKind,
pub(crate) range: TextRange,
}
impl Token {
pub(crate) const fn kind(&self) -> TokenKind {
self.kind
}
#[allow(unused)]
pub(crate) const fn range(&self) -> TextRange {
self.range
}
pub(crate) const fn start(&self) -> TextSize {
self.range.start()
}
pub(crate) const fn end(&self) -> TextSize {
self.range.end()
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) enum TokenKind {
/// A comment, not including the trailing new line.
Comment,
/// Sequence of ' ' or '\t'
Whitespace,
/// Start or end of the file
EndOfFile,
/// `\\`
Continuation,
/// `\n` or `\r` or `\r\n`
Newline,
/// `(`
LParen,
/// `)`
RParen,
/// `{`
LBrace,
/// `}`
RBrace,
/// `[`
LBracket,
/// `]`
RBracket,
/// `,`
Comma,
/// `:`
Colon,
/// '/'
Slash,
/// '*'
Star,
/// `.`.
Dot,
/// `else`
Else,
/// `if`
If,
/// `in`
In,
/// `as`
As,
/// `match`
Match,
/// `with`
With,
/// `async`
Async,
/// Any other non trivia token.
Other,
/// Returned for each character after [`TokenKind::Other`] has been returned once.
Bogus,
}
impl TokenKind {
const fn from_non_trivia_char(c: char) -> TokenKind {
match c {
'(' => TokenKind::LParen,
')' => TokenKind::RParen,
'[' => TokenKind::LBracket,
']' => TokenKind::RBracket,
'{' => TokenKind::LBrace,
'}' => TokenKind::RBrace,
',' => TokenKind::Comma,
':' => TokenKind::Colon,
'/' => TokenKind::Slash,
'*' => TokenKind::Star,
'.' => TokenKind::Dot,
_ => TokenKind::Other,
}
}
const fn is_trivia(self) -> bool {
matches!(
self,
TokenKind::Whitespace
| TokenKind::Newline
| TokenKind::Comment
| TokenKind::Continuation
)
}
}
/// Simple zero allocation tokenizer for tokenizing trivia (and some tokens).
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it returned a [`TokenKind::Other`]. That's why it
/// will return [`TokenKind::Bogus`] for every character after until it reaches the end of the file.
pub(crate) struct SimpleTokenizer<'a> {
offset: TextSize,
back_offset: TextSize,
/// `true` when it is known that the current `back` line has no comment for sure.
back_line_has_no_comment: bool,
bogus: bool,
source: &'a str,
cursor: Cursor<'a>,
}
impl<'a> SimpleTokenizer<'a> {
pub(crate) fn new(source: &'a str, range: TextRange) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
back_line_has_no_comment: false,
bogus: false,
source,
cursor: Cursor::new(&source[range]),
}
}
pub(crate) fn starts_at(offset: TextSize, source: &'a str) -> Self {
let range = TextRange::new(offset, source.text_len());
Self::new(source, range)
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
pub(crate) fn up_to(offset: TextSize, source: &'a str) -> Self {
Self::new(source, TextRange::up_to(offset))
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
/// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
/// because the lexer doesn't need to scan for comments.
pub(crate) fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
let mut tokenizer = Self::up_to(offset, source);
tokenizer.back_line_has_no_comment = true;
tokenizer
}
fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
let source = &self.source[range];
match source {
"as" => TokenKind::As,
"async" => TokenKind::Async,
"else" => TokenKind::Else,
"if" => TokenKind::If,
"in" => TokenKind::In,
"match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"with" => TokenKind::With,
// ...,
_ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
}
}
fn next_token(&mut self) -> Token {
self.cursor.start_token();
let Some(first) = self.cursor.bump() else {
return Token {
kind: TokenKind::EndOfFile,
range: TextRange::empty(self.offset),
};
};
if self.bogus {
let token = Token {
kind: TokenKind::Bogus,
range: TextRange::at(self.offset, first.text_len()),
};
self.offset += first.text_len();
return token;
}
let kind = match first {
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
TokenKind::Whitespace
}
'\n' => TokenKind::Newline,
'\r' => {
self.cursor.eat_char('\n');
TokenKind::Newline
}
'#' => {
self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
TokenKind::Comment
}
'\\' => TokenKind::Continuation,
c => {
let kind = if is_identifier_start(c) {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
self.to_keyword_or_other(range)
} else {
TokenKind::from_non_trivia_char(c)
};
if kind == TokenKind::Other {
self.bogus = true;
}
kind
}
};
let token_len = self.cursor.token_len();
let token = Token {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
/// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
/// because it needs to check if the line has any comments when encountering any non-trivia token.
pub(crate) fn next_token_back(&mut self) -> Token {
self.cursor.start_token();
let Some(last) = self.cursor.bump_back() else {
return Token {
kind: TokenKind::EndOfFile,
range: TextRange::empty(self.back_offset),
};
};
if self.bogus {
let token = Token {
kind: TokenKind::Bogus,
range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
};
self.back_offset -= last.text_len();
return token;
}
let kind = match last {
// This may not be 100% correct because it will lex-out trailing whitespace from a comment
// as whitespace rather than being part of the token. This shouldn't matter for what we use the lexer for.
' ' | '\t' => {
self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
TokenKind::Whitespace
}
'\r' => {
self.back_line_has_no_comment = false;
TokenKind::Newline
}
'\n' => {
self.back_line_has_no_comment = false;
self.cursor.eat_char_back('\r');
TokenKind::Newline
}
// Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
'#' => TokenKind::Comment,
// For all other tokens, test if the character isn't part of a comment.
c => {
let mut comment_offset = None;
// Skip the test whether there's a preceding comment if it has been performed before.
if !self.back_line_has_no_comment {
for (back_index, c) in self.cursor.chars().rev().enumerate() {
match c {
'#' => {
// Potentially a comment
comment_offset = Some(back_index + 1);
}
'\r' | '\n' | '\\' => {
break;
}
c => {
if !is_python_whitespace(c)
&& TokenKind::from_non_trivia_char(c) == TokenKind::Other
{
comment_offset = None;
}
}
}
}
}
// From here on it is guaranteed that this line has no other comment.
self.back_line_has_no_comment = true;
if let Some(comment_offset) = comment_offset {
// It is a comment, bump all tokens
for _ in 0..comment_offset {
self.cursor.bump_back().unwrap();
}
TokenKind::Comment
} else if c == '\\' {
TokenKind::Continuation
} else {
let kind = if is_identifier_continuation(c) {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
let savepoint = self.cursor.clone();
self.cursor.eat_back_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
if self.source[range]
.chars()
.next()
.is_some_and(is_identifier_start)
{
self.to_keyword_or_other(range)
} else {
self.cursor = savepoint;
TokenKind::Other
}
} else {
TokenKind::from_non_trivia_char(c)
};
if kind == TokenKind::Other {
self.bogus = true;
}
kind
}
}
};
let token_len = self.cursor.token_len();
let start = self.back_offset - token_len;
let token = Token {
kind,
range: TextRange::at(start, token_len),
};
self.back_offset = start;
token
}
pub(crate) fn skip_trivia(self) -> impl Iterator<Item = Token> + DoubleEndedIterator + 'a {
self.filter(|t| !t.kind().is_trivia())
}
}
impl Iterator for SimpleTokenizer<'_> {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.kind == TokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
impl DoubleEndedIterator for SimpleTokenizer<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let token = self.next_token_back();
if token.kind == TokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::trivia::{lines_after, lines_before, SimpleTokenizer, Token};
struct TokenizationTestCase {
source: &'static str,
range: TextRange,
tokens: Vec<Token>,
}
impl TokenizationTestCase {
fn assert_reverse_tokenization(&self) {
let mut backwards = self.tokenize_reverse();
// Re-reverse to get the tokens in forward order.
backwards.reverse();
assert_eq!(&backwards, &self.tokens);
}
fn tokenize_reverse(&self) -> Vec<Token> {
SimpleTokenizer::new(self.source, self.range)
.rev()
.collect()
}
fn tokens(&self) -> &[Token] {
&self.tokens
}
}
fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();
TokenizationTestCase {
source,
range,
tokens,
}
}
fn tokenize(source: &'static str) -> TokenizationTestCase {
tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
}
#[test]
fn tokenize_trivia() {
let source = "# comment\n # comment";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_parentheses() {
let source = "([{}])";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_comma() {
let source = ",,,,";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_continuation() {
let source = "( \\\n )";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tricky_unicode() {
let source = "មុ";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn identifier_ending_in_non_start_char() {
let source = "i5";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn ignore_word_with_only_id_continuing_chars() {
let source = "555";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
}
#[test]
fn tokenize_multichar() {
let source = "if in else match";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_substring() {
let source = "('some string') # comment";
let test_case =
tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_slash() {
let source = r#" # trailing positional comment
# Positional arguments only after here
,/"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_bogus() {
let source = r#"# leading comment
"a string"
a = (10)"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
}
#[test]
fn lines_before_empty_string() {
assert_eq!(lines_before(TextSize::new(0), ""), 0);
}
#[test]
fn lines_before_in_the_middle_of_a_line() {
assert_eq!(lines_before(TextSize::new(4), "a = 20"), 0);
}
#[test]
fn lines_before_on_a_new_line() {
assert_eq!(lines_before(TextSize::new(7), "a = 20\nb = 10"), 1);
}
#[test]
fn lines_before_multiple_leading_newlines() {
assert_eq!(lines_before(TextSize::new(9), "a = 20\n\r\nb = 10"), 2);
}
#[test]
fn lines_before_with_comment_offset() {
assert_eq!(lines_before(TextSize::new(8), "a = 20\n# a comment"), 0);
}
#[test]
fn lines_before_with_trailing_comment() {
assert_eq!(
lines_before(TextSize::new(22), "a = 20 # some comment\nb = 10"),
1
);
}
#[test]
fn lines_before_with_comment_only_line() {
assert_eq!(
lines_before(TextSize::new(22), "a = 20\n# some comment\nb = 10"),
1
);
}
#[test]
fn lines_after_empty_string() {
assert_eq!(lines_after(TextSize::new(0), ""), 0);
}
#[test]
fn lines_after_in_the_middle_of_a_line() {
assert_eq!(lines_after(TextSize::new(4), "a = 20"), 0);
}
#[test]
fn lines_after_before_a_new_line() {
assert_eq!(lines_after(TextSize::new(6), "a = 20\nb = 10"), 1);
}
#[test]
fn lines_after_multiple_newlines() {
assert_eq!(lines_after(TextSize::new(6), "a = 20\n\r\nb = 10"), 2);
}
#[test]
fn lines_after_before_comment_offset() {
assert_eq!(lines_after(TextSize::new(7), "a = 20 # a comment\n"), 0);
}
#[test]
fn lines_after_with_comment_only_line() {
assert_eq!(
lines_after(TextSize::new(6), "a = 20\n# some comment\nb = 10"),
1
);
}
}