formatter: multi char tokens in SimpleTokenizer (#5610)

David Szotten 2023-07-10 09:00:59 +01:00 committed by GitHub
parent 52b22ceb6e
commit 1e894f328c
8 changed files with 209 additions and 33 deletions
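
Before this change, `SimpleTokenizer` only produced single-character tokens, so the comment-placement code had to locate `if` and `else` by comparing source text. This commit instead lexes whole identifier-like words and classifies the known keywords. A minimal standalone sketch of that classification step (hypothetical names, not the crate's actual API):

#[derive(Debug, PartialEq)]
enum WordKind {
    If,
    Else,
    In,
    Match,
    Other,
}

// Map an already-lexed word to a kind; anything unknown stays `Other`.
fn classify(word: &str) -> WordKind {
    match word {
        "if" => WordKind::If,
        "else" => WordKind::Else,
        "in" => WordKind::In,
        "match" => WordKind::Match,
        _ => WordKind::Other,
    }
}

fn main() {
    assert_eq!(classify("else"), WordKind::Else);
    assert_eq!(classify("i5"), WordKind::Other);
}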


@@ -1,6 +1,6 @@
 use std::cmp::Ordering;
 
-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSection
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};
 
 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }
 
     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );
 
     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@ fn handle_expr_if_comment<'a>(
     CommentPlacement::Default(comment)
 }
 
-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
 
     token
 }
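
Note that the checks in the new `find_only_token_in_range` are `debug_assert`s: release builds simply return the first non-trivia token in the range. Reduced to a standalone sketch (hypothetical helper, not part of the crate), the contract is "exactly one item, of the expected kind":

fn only_item<T: PartialEq + std::fmt::Debug>(
    mut items: impl Iterator<Item = T>,
    expected: T,
) -> T {
    let item = items.next().expect("Expected an item");
    debug_assert_eq!(item, expected);     // kind check, debug builds only
    debug_assert_eq!(items.next(), None); // uniqueness check, debug builds only
    item
}

fn main() {
    assert_eq!(only_item([7].into_iter(), 7), 7);
}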


@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]


@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
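
This snapshot is for the source `555` (the `ignore_word_with_only_id_continuing_chars` test below): a digit may continue an identifier but never start one, so the tokenizer emits `Other` for the first character, marks itself bogus, and returns `Bogus` for everything after. A simplified, ASCII-only sketch of that asymmetry (the real predicates in trivia.rs also accept XID characters):

fn starts_identifier(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}

fn continues_identifier(c: char) -> bool {
    starts_identifier(c) || c.is_ascii_digit()
}

fn main() {
    assert!(!starts_identifier('5'));   // `5` cannot start an identifier...
    assert!(continues_identifier('5')); // ...but it can continue one.
}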


@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
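
This snapshot belongs to the `tokenize_multichar` test (source `"if in else match"`); the ranges are byte offsets into that string. A quick standalone sanity check of one of them:

fn main() {
    let source = "if in else match";
    // `find` returns a byte offset, which is what the `range` fields store:
    // `else` starts after `"if in "` and spans 6..10.
    let start = source.find("else").unwrap();
    assert_eq!((start, start + "else".len()), (6, 10));
}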


@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
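
This last snapshot is the `tricky_unicode` test (source `"មុ"`): token ranges are UTF-8 byte offsets, so the two Khmer code points (three bytes each) form a single `Other` token spanning 0..6. A quick check of the arithmetic:

fn main() {
    let source = "មុ";
    assert_eq!(source.chars().count(), 2); // two code points...
    assert_eq!(source.len(), 6);           // ...but six UTF-8 bytes, hence `range: 0..6`
}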


@@ -1,8 +1,8 @@
 use std::str::Chars;
 
-use ruff_text_size::{TextLen, TextRange, TextSize};
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
 use unic_ucd_ident::{is_xid_continue, is_xid_start};
 
 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }
 
+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character `c` is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,
 
-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,
 
     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
 
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }
@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
            cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }
 
+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            // `match` is a soft keyword whose meaning depends on context, but we can
+            // always lex it as a keyword and leave it to the caller (the parser) to
+            // decide whether to treat it as an identifier or a keyword.
+            "match" => TokenKind::Match,
+            // ...
+            // Potentially an identifier, but only if it isn't a string prefix.
+            // We can ignore this for now:
+            // https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+            _ => TokenKind::Other,
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();
@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,
 
             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };
+
                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }
+
                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // If we only have identifier continuations but no start (e.g. `555`),
+                // we don't want to consume the chars, so in that case we rewind the
+                // cursor to this savepoint.
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };
 
             if kind == TokenKind::Other {
                 self.bogus = true;
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }
 
+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // Note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";