formatter: multi char tokens in SimpleTokenizer (#5610)

David Szotten 2023-07-10 09:00:59 +01:00 committed by GitHub
parent 52b22ceb6e
commit 1e894f328c
8 changed files with 209 additions and 33 deletions
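
Before this change, `SimpleTokenizer` only produced single-character tokens, so the comment-placement code had to locate `if` and `else` by comparing source text. This commit instead lexes whole identifier-like words and classifies the known keywords. A minimal standalone sketch of that classification step (hypothetical names, not the crate's actual API):

#[derive(Debug, PartialEq)]
enum WordKind {
    If,
    Else,
    In,
    Match,
    Other,
}

// Map an already-lexed word to a kind; anything unknown stays `Other`.
fn classify(word: &str) -> WordKind {
    match word {
        "if" => WordKind::If,
        "else" => WordKind::Else,
        "in" => WordKind::In,
        "match" => WordKind::Match,
        _ => WordKind::Other,
    }
}

fn main() {
    assert_eq!(classify("else"), WordKind::Else);
    assert_eq!(classify("i5"), WordKind::Other);
}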


@@ -1,6 +1,6 @@
 use std::cmp::Ordering;
 
-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSection
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};
 
 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }
 
     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );
 
     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@ fn handle_expr_if_comment<'a>(
     CommentPlacement::Default(comment)
 }
 
-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
 
     token
 }
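
Note that the checks in the new `find_only_token_in_range` are `debug_assert`s: release builds simply return the first non-trivia token in the range. Reduced to a standalone sketch (hypothetical helper, not part of the crate), the contract is "exactly one item, of the expected kind":

fn only_item<T: PartialEq + std::fmt::Debug>(
    mut items: impl Iterator<Item = T>,
    expected: T,
) -> T {
    let item = items.next().expect("Expected an item");
    debug_assert_eq!(item, expected);     // kind check, debug builds only
    debug_assert_eq!(items.next(), None); // uniqueness check, debug builds only
    item
}

fn main() {
    assert_eq!(only_item([7].into_iter(), 7), 7);
}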


@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]


@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
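
This snapshot is for the source `555` (the `ignore_word_with_only_id_continuing_chars` test below): a digit may continue an identifier but never start one, so the tokenizer emits `Other` for the first character, marks itself bogus, and returns `Bogus` for everything after. A simplified, ASCII-only sketch of that asymmetry (the real predicates in trivia.rs also accept XID characters):

fn starts_identifier(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}

fn continues_identifier(c: char) -> bool {
    starts_identifier(c) || c.is_ascii_digit()
}

fn main() {
    assert!(!starts_identifier('5'));   // `5` cannot start an identifier...
    assert!(continues_identifier('5')); // ...but it can continue one.
}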


@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
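
This snapshot belongs to the `tokenize_multichar` test (source `"if in else match"`); the ranges are byte offsets into that string. A quick standalone sanity check of one of them:

fn main() {
    let source = "if in else match";
    // `find` returns a byte offset, which is what the `range` fields store:
    // `else` starts after `"if in "` and spans 6..10.
    let start = source.find("else").unwrap();
    assert_eq!((start, start + "else".len()), (6, 10));
}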


@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
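
This last snapshot is the `tricky_unicode` test (source `"មុ"`): token ranges are UTF-8 byte offsets, so the two Khmer code points (three bytes each) form a single `Other` token spanning 0..6. A quick check of the arithmetic:

fn main() {
    let source = "មុ";
    assert_eq!(source.chars().count(), 2); // two code points...
    assert_eq!(source.len(), 6);           // ...but six UTF-8 bytes, hence `range: 0..6`
}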


@@ -1,8 +1,8 @@
 use std::str::Chars;
 
-use ruff_text_size::{TextLen, TextRange, TextSize};
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
 use unic_ucd_ident::{is_xid_continue, is_xid_start};
 
 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }
 
+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character `c` is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,
 
-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,
 
     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
 
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }
@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
            cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }
 
+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            // `match` is a soft keyword whose meaning depends on context, but we can
+            // always lex it as a keyword and leave it to the caller (the parser) to
+            // decide whether to treat it as an identifier or a keyword.
+            "match" => TokenKind::Match,
+            // ...
+            // Potentially an identifier, but only if it isn't a string prefix.
+            // We can ignore this for now:
+            // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+            _ => TokenKind::Other,
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();
@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,
 
             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };
+
                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }
+
                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // If we only have identifier continuations but no start (e.g. `555`),
+                // we don't want to consume the chars, so in that case we rewind the
+                // cursor to this savepoint.
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };
 
             if kind == TokenKind::Other {
                 self.bogus = true;
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }
 
+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // Note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";