From 1e894f328c13faedec9b028ab87543dbb379e246 Mon Sep 17 00:00:00 2001
From: David Szotten
Date: Mon, 10 Jul 2023 09:00:59 +0100
Subject: [PATCH] formatter: multi char tokens in SimpleTokenizer (#5610)

---
 Cargo.lock                                    |   1 +
 crates/ruff_python_formatter/Cargo.toml       |   1 +
 .../src/comments/placement.rs                 |  45 +++----
 ...__identifier_ending_in_non_start_char.snap |  10 ++
 ...re_word_with_only_id_continuing_chars.snap |  18 +++
 ...er__trivia__tests__tokenize_multichar.snap |  34 +++++
 ...matter__trivia__tests__tricky_unicode.snap |  10 ++
 crates/ruff_python_formatter/src/trivia.rs    | 123 +++++++++++++++++-
 8 files changed, 209 insertions(+), 33 deletions(-)
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap

diff --git a/Cargo.lock b/Cargo.lock
index 541991d0fb..d64986ffc2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2101,6 +2101,7 @@ dependencies = [
  "similar",
  "smallvec",
  "thiserror",
+ "unic-ucd-ident",
 ]

 [[package]]
diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml
index c27d92a907..dac4a95d80 100644
--- a/crates/ruff_python_formatter/Cargo.toml
+++ b/crates/ruff_python_formatter/Cargo.toml
@@ -28,6 +28,7 @@ rustpython-parser = { workspace = true }
 serde = { workspace = true, optional = true }
 smallvec = { workspace = true }
 thiserror = { workspace = true }
+unic-ucd-ident = "0.9.0"

 [dev-dependencies]
 ruff_formatter = { path = "../ruff_formatter", features = ["serde"]}
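A note on the new dependency: `unic-ucd-ident` provides the Unicode `XID_Start` and `XID_Continue` predicates that Python's identifier grammar (https://docs.python.org/3/reference/lexical_analysis.html#identifiers) is defined in terms of. The sketch below is illustrative only, not part of the patch; the non-ASCII characters are the same ones the `tricky_unicode` test below uses:

    // Illustrative sketch (not part of the patch).
    use unic_ucd_ident::{is_xid_continue, is_xid_start};

    fn main() {
        // Khmer letter MO (category Lo) can start an identifier...
        assert!(is_xid_start('ម'));
        // ...while the combining vowel sign U (category Mn) can only continue one.
        assert!(!is_xid_start('ុ'));
        assert!(is_xid_continue('ុ'));
        // `_` is not XID_Start, which is why the patch special-cases it
        // in `is_identifier_start` below.
        assert!(!is_xid_start('_'));
    }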
diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs
index 207b9a68a8..89dc52896b 100644
--- a/crates/ruff_python_formatter/src/comments/placement.rs
+++ b/crates/ruff_python_formatter/src/comments/placement.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;

-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};

@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSec
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};

 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }

     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );

     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@
     CommentPlacement::Default(comment)
 }

-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
     token
 }
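The placement.rs change above is mechanical: now that the tokenizer produces real multi-character tokens, `find_only_token_in_range` can assert on a `TokenKind` directly instead of string-comparing source text. The sketch below shows the tokenizer contract it relies on; it is illustrative only, not part of the patch, and assumes the post-patch `crate::trivia` API:

    // Illustrative sketch (not part of the patch).
    use ruff_text_size::{TextLen, TextRange};

    use crate::trivia::{SimpleTokenizer, TokenKind};

    fn demo() {
        // The gap between an if-expression's body and its test, e.g. the
        // ` if ` of `x if cond else y`, now lexes as a single `If` token.
        let gap = " if ";
        let mut tokens = SimpleTokenizer::new(gap, TextRange::up_to(gap.text_len())).skip_trivia();

        let token = tokens.next().expect("Expected a token");
        assert_eq!(token.kind(), TokenKind::If);
        // ...and it is the only non-trivia token in the range.
        assert_eq!(tokens.next(), None);
    }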
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
new file mode 100644
index 0000000000..15e9d84407
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
new file mode 100644
index 0000000000..26e9fd18bc
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
new file mode 100644
index 0000000000..16a1293b44
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
new file mode 100644
index 0000000000..91b9cb397a
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
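One detail the `ignore_word_with_only_id_continuing_chars` snapshot pins down: a word made only of continuation characters (such as `555`) has no identifier start, so its first character lexes as `Other`, the tokenizer sets its `bogus` flag, and the remaining characters degrade to per-character `Bogus` tokens. That is also why this is the only new test without `assert_reverse_tokenization`. The sketch below covers the forward direction; it is illustrative only, not part of the patch, and assumes `SimpleTokenizer` implements `Iterator<Item = Token>` as it does in `trivia.rs`:

    // Illustrative sketch (not part of the patch).
    use ruff_text_size::{TextLen, TextRange};

    use crate::trivia::{SimpleTokenizer, TokenKind};

    fn demo() {
        let source = "555";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .map(|token| token.kind())
            .collect();
        // Forward: [Other, Bogus, Bogus]. Tokenizing from the back yields the
        // mirror image [Bogus, Bogus, Other], so the two directions disagree.
        assert_eq!(kinds, [TokenKind::Other, TokenKind::Bogus, TokenKind::Bogus]);
    }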
diff --git a/crates/ruff_python_formatter/src/trivia.rs b/crates/ruff_python_formatter/src/trivia.rs
index 86516170a1..4d8a1fa8cd 100644
--- a/crates/ruff_python_formatter/src/trivia.rs
+++ b/crates/ruff_python_formatter/src/trivia.rs
@@ -1,8 +1,8 @@
 use std::str::Chars;

-use ruff_text_size::{TextLen, TextRange, TextSize};
-
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use unic_ucd_ident::{is_xid_continue, is_xid_start};

 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }

+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character c is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,

-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,

     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }

@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
             cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }

+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            "match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
+            // ...,
+            _ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();

@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,

             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };

                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }
-
                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // if we only have identifier continuations but no start (e.g. 555) we
+                // don't want to consume the chars, so in that case, we want to rewind the
+                // cursor to here
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };

             if kind == TokenKind::Other {
                 self.bogus = true;
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }

+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";