Mirror of https://github.com/astral-sh/ruff.git (synced 2025-09-13 13:57:22 +00:00)
formatter: multi char tokens in SimpleTokenizer (#5610)
commit 1e894f328c
parent 52b22ceb6e
8 changed files with 209 additions and 33 deletions
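The change teaches the formatter's SimpleTokenizer to lex multi-character tokens: identifier-like runs are consumed as a whole, and the keywords `if`, `else`, `in`, and the soft keyword `match` get dedicated token kinds. A minimal sketch of the resulting behavior, assuming crate-internal access (SimpleTokenizer and TokenKind are pub(crate)); the expected kinds follow the tokenize_multichar snapshot below, and the function name here is hypothetical:

    use ruff_text_size::{TextLen, TextRange};

    fn keywords_lex_as_multichar_tokens() {
        let source = "if in else match";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        // Before this change, `if` lexed one char at a time as Other + Bogus.
        assert_eq!(
            kinds,
            vec![TokenKind::If, TokenKind::In, TokenKind::Else, TokenKind::Match]
        );
    }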
Cargo.lock (generated)
@@ -2101,6 +2101,7 @@ dependencies = [
  "similar",
  "smallvec",
  "thiserror",
+ "unic-ucd-ident",
 ]

 [[package]]
crates/ruff_python_formatter/Cargo.toml
@@ -28,6 +28,7 @@ rustpython-parser = { workspace = true }
 serde = { workspace = true, optional = true }
 smallvec = { workspace = true }
 thiserror = { workspace = true }
+unic-ucd-ident = "0.9.0"

 [dev-dependencies]
 ruff_formatter = { path = "../ruff_formatter", features = ["serde"]}
crates/ruff_python_formatter/src/comments/placement.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;

-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSec
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};

 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }

     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );

     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@ fn handle_expr_if_comment<'a>(
     CommentPlacement::Default(comment)
 }

-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
-    token
-}
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
+    token
+}
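Because the tokenizer now yields keyword kinds directly, the helper can assert on TokenKind instead of re-reading the source text. A hedged usage sketch, reusing names from the hunk above: in `body if test else orelse`, the gap between `body.end()` and `test.start()` holds exactly one non-trivia token, the `if` keyword, so the helper finds and validates it in one pass:

    // Sketch only; mirrors the call sites in handle_expr_if_comment above.
    let if_token = find_only_token_in_range(
        TextRange::new(body.end(), test.start()),
        locator,
        TokenKind::If,
    );
    debug_assert_eq!(if_token.kind(), TokenKind::If);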
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
crates/ruff_python_formatter/src/trivia.rs
@@ -1,8 +1,8 @@
 use std::str::Chars;

-use ruff_text_size::{TextLen, TextRange, TextSize};
-
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use unic_ucd_ident::{is_xid_continue, is_xid_start};

 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }

+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character c is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
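These predicates encode the identifier rules from the Python lexical grammar: ASCII letters and `_` (or XID_Start beyond ASCII) may begin an identifier, while digits may only continue one. A test-style sketch of the distinction the new tests below depend on (`i5` vs `555`); the function name is hypothetical:

    fn start_vs_continuation() {
        assert!(is_identifier_start('i')); // letters and `_` may start an identifier
        assert!(!is_identifier_start('5')); // a digit may not start one...
        assert!(is_identifier_continuation('5')); // ...but may continue one, as in "i5"
        assert!(!is_identifier_continuation('#')); // punctuation does neither
    }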
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,

-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,

     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }
@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
             cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }

+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            "match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
+            // ...,
+            _ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();
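to_keyword_or_other runs only after a whole identifier has been consumed, so keyword detection is an exact match on the token's source text; every other identifier falls through to TokenKind::Other. A sketch under the same crate-internal assumptions (function name hypothetical):

    fn exact_spelling_only() {
        use ruff_text_size::{TextLen, TextRange};
        // "matches" merely starts with a keyword and stays a plain identifier:
        let token = SimpleTokenizer::new("matches", TextRange::up_to("matches".text_len()))
            .skip_trivia()
            .next()
            .expect("Expected a token");
        assert_eq!(token.kind(), TokenKind::Other); // not TokenKind::Match
    }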
@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,

             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };

                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }

                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // if we only have identifier continuations but no start (e.g. 555) we
+                // don't want to consume the chars, so in that case, we want to rewind the
+                // cursor to here
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };

             if kind == TokenKind::Other {
                 self.bogus = true;
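Backward lexing has a subtlety the forward pass avoids: characters are eaten right to left, so a run of continuation characters with no valid start (e.g. `555`) must be rewound via the savepoint and re-emitted one char at a time, which is why the two directions can disagree. A sketch of the forward result, grounded in the ignore_word_with_only_id_continuing_chars test and snapshot (function name hypothetical):

    fn only_continuations_lex_char_by_char() {
        use ruff_text_size::{TextLen, TextRange};
        let source = "555";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        // '5' can continue an identifier but not start one, so no multi-char
        // token forms; backward lexing yields [Bogus, Bogus, Other] instead.
        assert_eq!(kinds, vec![TokenKind::Other, TokenKind::Bogus, TokenKind::Bogus]);
    }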
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }

+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";