Mirror of https://github.com/astral-sh/ruff.git, synced 2025-07-24 13:33:50 +00:00
formatter: multi char tokens in SimpleTokenizer (#5610)
Parent: 52b22ceb6e
Commit: 1e894f328c
8 changed files with 209 additions and 33 deletions
@@ -1,6 +1,6 @@
use std::cmp::Ordering;

use ruff_text_size::{TextLen, TextRange};
use ruff_text_size::TextRange;
use rustpython_parser::ast;
use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSection};
use crate::other::arguments::{
    assign_argument_separator_comment_placement, find_argument_separators,
};
use crate::trivia::{
    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
};
use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};

/// Implements the custom comment placement logic.
pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
    }

    // Find the if and the else
    let if_token =
        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
    let else_token =
        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
    let if_token = find_only_token_in_range(
        TextRange::new(body.end(), test.start()),
        locator,
        TokenKind::If,
    );
    let else_token = find_only_token_in_range(
        TextRange::new(test.end(), orelse.start()),
        locator,
        TokenKind::Else,
    );

    // Between `if` and `test`
    if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@ fn handle_expr_if_comment<'a>(
    CommentPlacement::Default(comment)
}

/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
/// works with single char tokens so we check that we have the right token by string comparison.
fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
    let token =
        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
    debug_assert!(
        locator.after(token.start()).starts_with(token_str),
        "expected a `{token_str}` token",
    );
    debug_assert!(
        SimpleTokenizer::new(
            locator.contents(),
            TextRange::new(token.start() + token_str.text_len(), range.end())
        )
        .skip_trivia()
        .next()
        .is_none(),
        "Didn't expect any other token"
    );
/// Looks for a token in the range that contains no other tokens.
fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
    let token = tokens.next().expect("Expected a token");
    debug_assert_eq!(token.kind(), token_kind);
    debug_assert_eq!(tokens.next(), None);
    token
}
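To make the new helper's contract concrete, here is a small sketch (not part of the commit; the source string and offsets are picked by hand): for a conditional expression such as `a if b else c`, the slice between the end of the body and the start of the test contains exactly one non-trivia token, the `if` keyword.

    // Tokenize the range covering " if " in `a if b else c`.
    let source = "a if b else c";
    let range = TextRange::new(TextSize::new(1), TextSize::new(5));
    let mut tokens = SimpleTokenizer::new(source, range).skip_trivia();
    let token = tokens.next().expect("Expected a token");
    assert_eq!(token.kind(), TokenKind::If);
    // The keyword is the only non-trivia token in the range.
    assert_eq!(tokens.next(), None);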
@@ -0,0 +1,10 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
    Token {
        kind: Other,
        range: 0..2,
    },
]
@@ -0,0 +1,18 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
    Token {
        kind: Other,
        range: 0..1,
    },
    Token {
        kind: Bogus,
        range: 1..2,
    },
    Token {
        kind: Bogus,
        range: 2..3,
    },
]
@@ -0,0 +1,34 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
    Token {
        kind: If,
        range: 0..2,
    },
    Token {
        kind: Whitespace,
        range: 2..3,
    },
    Token {
        kind: In,
        range: 3..5,
    },
    Token {
        kind: Whitespace,
        range: 5..6,
    },
    Token {
        kind: Else,
        range: 6..10,
    },
    Token {
        kind: Whitespace,
        range: 10..11,
    },
    Token {
        kind: Match,
        range: 11..16,
    },
]
@@ -0,0 +1,10 @@
---
source: crates/ruff_python_formatter/src/trivia.rs
expression: test_case.tokens()
---
[
    Token {
        kind: Other,
        range: 0..6,
    },
]
@@ -1,8 +1,8 @@
use std::str::Chars;

use ruff_text_size::{TextLen, TextRange, TextSize};

use ruff_python_whitespace::is_python_whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};

/// Searches for the first non-trivia character in `range`.
///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
    offset
}

fn is_identifier_start(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
}

// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        is_xid_continue(c)
    }
}

fn is_non_ascii_identifier_start(c: char) -> bool {
    is_xid_start(c)
}

#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub(crate) struct Token {
    pub(crate) kind: TokenKind,
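As a quick illustration of the asymmetry these helpers encode (a sketch, not part of the commit): digits and combining marks may continue an identifier but cannot start one, which is exactly what the `555` and `មុ` test cases below exercise.

    // ASCII: digits continue identifiers but never start them.
    assert!(is_identifier_start('i') && !is_identifier_start('5'));
    assert!(is_identifier_continuation('5'));
    // Non-ASCII goes through XID_Start/XID_Continue: the Khmer letter U+1798
    // can start an identifier, the vowel sign U+17BB can only continue one.
    assert!(is_identifier_start('ម'));
    assert!(!is_identifier_start('ុ') && is_identifier_continuation('ុ'));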
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
    /// `.`.
    Dot,

    /// Any other non trivia token. Always has a length of 1
    /// `else`
    Else,

    /// `if`
    If,

    /// `in`
    In,

    /// `match`
    Match,

    /// Any other non trivia token.
    Other,

    /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
    /// `true` when it is known that the current `back` line has no comment for sure.
    back_line_has_no_comment: bool,
    bogus: bool,
    source: &'a str,
    cursor: Cursor<'a>,
}
@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
            back_offset: range.end(),
            back_line_has_no_comment: false,
            bogus: false,
            source,
            cursor: Cursor::new(&source[range]),
        }
    }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
        Self::new(source, TextRange::up_to(offset))
    }

    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
        let source = &self.source[range];
        match source {
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "in" => TokenKind::In,
            "match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
            // ...,
            _ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
        }
    }

    fn next_token(&mut self) -> Token {
        self.cursor.start_token();
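Because `to_keyword_or_other` compares the raw source slice, `match` always comes back as a keyword here, regardless of context; the caller has to downgrade it where the parser would treat it as an identifier. A minimal sketch, assuming the crate-internal API shown in this diff:

    // `match = 1` would make `match` an identifier in the real parser, but
    // this tokenizer still reports TokenKind::Match and leaves the decision
    // to the caller.
    let source = "match";
    let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
        .skip_trivia()
        .map(|token| token.kind())
        .collect();
    assert_eq!(kinds, vec![TokenKind::Match]);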
@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
            '\\' => TokenKind::Continuation,

            c => {
                let kind = TokenKind::from_non_trivia_char(c);
                let kind = if is_identifier_start(c) {
                    self.cursor.eat_while(is_identifier_continuation);
                    let token_len = self.cursor.token_len();

                    let range = TextRange::at(self.offset, token_len);
                    self.to_keyword_or_other(range)
                } else {
                    TokenKind::from_non_trivia_char(c)
                };

                if kind == TokenKind::Other {
                    self.bogus = true;
                }

                kind
            }
        };
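The forward path now consumes a whole identifier before classifying it, which is why `i5` lexes as a single two-character `Other` token (see the first snapshot above) rather than two one-character tokens. A sketch under the same crate-internal API:

    // `i` starts an identifier and `5` continues it: one token spanning 0..2.
    let source = "i5";
    let token = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
        .skip_trivia()
        .next()
        .expect("Expected a token");
    assert_eq!(token.kind(), TokenKind::Other);
    assert_eq!(token.range, TextRange::new(TextSize::new(0), TextSize::new(2)));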
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
        } else if c == '\\' {
            TokenKind::Continuation
        } else {
            let kind = TokenKind::from_non_trivia_char(c);
            let kind = if is_identifier_continuation(c) {
                // if we only have identifier continuations but no start (e.g. 555) we
                // don't want to consume the chars, so in that case, we want to rewind the
                // cursor to here
                let savepoint = self.cursor.clone();
                self.cursor.eat_back_while(is_identifier_continuation);

                let token_len = self.cursor.token_len();
                let range = TextRange::at(self.back_offset - token_len, token_len);

                if self.source[range]
                    .chars()
                    .next()
                    .is_some_and(is_identifier_start)
                {
                    self.to_keyword_or_other(range)
                } else {
                    self.cursor = savepoint;
                    TokenKind::Other
                }
            } else {
                TokenKind::from_non_trivia_char(c)
            };

            if kind == TokenKind::Other {
                self.bogus = true;
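The rewind matters for runs like `555`: scanning backwards, every digit is a valid continuation, but the first character of the run is not a valid start, so the cursor snaps back and only the last character is emitted as a one-char `Other` token (setting `bogus`). This is also why the `555` test below skips the reverse-tokenization assertion. A sketch, assuming `first_non_trivia_token_rev` keeps the `(offset, code)` signature of its forward counterpart:

    // Backwards from offset 3 in "555": the digits are eaten as continuations,
    // then the cursor rewinds because '5' cannot start an identifier.
    let token = first_non_trivia_token_rev(TextSize::new(3), "555").expect("Expected a token");
    assert_eq!(token.kind(), TokenKind::Other);
    assert_eq!(token.range, TextRange::new(TextSize::new(2), TextSize::new(3)));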
@@ -624,6 +697,44 @@ mod tests {
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tricky_unicode() {
        let source = "មុ";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn identifier_ending_in_non_start_char() {
        let source = "i5";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn ignore_word_with_only_id_continuing_chars() {
        let source = "555";

        let test_case = tokenize(source);
        assert_debug_snapshot!(test_case.tokens());

        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
    }

    #[test]
    fn tokenize_multichar() {
        let source = "if in else match";

        let test_case = tokenize(source);

        assert_debug_snapshot!(test_case.tokens());
        test_case.assert_reverse_tokenization();
    }

    #[test]
    fn tokenize_substring() {
        let source = "('some string') # comment";