Mirror of https://github.com/astral-sh/ruff.git (synced 2025-09-13 13:57:22 +00:00)
formatter: multi char tokens in SimpleTokenizer (#5610)
commit 1e894f328c
parent 52b22ceb6e
8 changed files with 209 additions and 33 deletions
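The change teaches the formatter's SimpleTokenizer to lex multi-character tokens: identifier-like runs are consumed as a whole, and the keywords `if`, `else`, `in`, and the soft keyword `match` get dedicated token kinds. A minimal sketch of the resulting behavior, assuming crate-internal access (SimpleTokenizer and TokenKind are pub(crate)); the expected kinds follow the tokenize_multichar snapshot below, and the function name here is hypothetical:

    use ruff_text_size::{TextLen, TextRange};

    fn keywords_lex_as_multichar_tokens() {
        let source = "if in else match";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        // Before this change, `if` lexed one char at a time as Other + Bogus.
        assert_eq!(
            kinds,
            vec![TokenKind::If, TokenKind::In, TokenKind::Else, TokenKind::Match]
        );
    }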
Cargo.lock (generated)
@@ -2101,6 +2101,7 @@ dependencies = [
  "similar",
  "smallvec",
  "thiserror",
+ "unic-ucd-ident",
 ]

 [[package]]
crates/ruff_python_formatter/Cargo.toml
@@ -28,6 +28,7 @@ rustpython-parser = { workspace = true }
 serde = { workspace = true, optional = true }
 smallvec = { workspace = true }
 thiserror = { workspace = true }
+unic-ucd-ident = "0.9.0"

 [dev-dependencies]
 ruff_formatter = { path = "../ruff_formatter", features = ["serde"]}
crates/ruff_python_formatter/src/comments/placement.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;

-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};
@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSec
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};

 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }

     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );

     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@ fn handle_expr_if_comment<'a>(
     CommentPlacement::Default(comment)
 }

-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
-    token
-}
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
+    token
+}
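Because the tokenizer now yields keyword kinds directly, the helper can assert on TokenKind instead of re-reading the source text. A hedged usage sketch, reusing names from the hunk above: in `body if test else orelse`, the gap between `body.end()` and `test.start()` holds exactly one non-trivia token, the `if` keyword, so the helper finds and validates it in one pass:

    // Sketch only; mirrors the call sites in handle_expr_if_comment above.
    let if_token = find_only_token_in_range(
        TextRange::new(body.end(), test.start()),
        locator,
        TokenKind::If,
    );
    debug_assert_eq!(if_token.kind(), TokenKind::If);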
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
crates/ruff_python_formatter/src/trivia.rs
@@ -1,8 +1,8 @@
 use std::str::Chars;

-use ruff_text_size::{TextLen, TextRange, TextSize};
-
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use unic_ucd_ident::{is_xid_continue, is_xid_start};

 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }

+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character c is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
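These predicates encode the identifier rules from the Python lexical grammar: ASCII letters and `_` (or XID_Start beyond ASCII) may begin an identifier, while digits may only continue one. A test-style sketch of the distinction the new tests below depend on (`i5` vs `555`); the function name is hypothetical:

    fn start_vs_continuation() {
        assert!(is_identifier_start('i')); // letters and `_` may start an identifier
        assert!(!is_identifier_start('5')); // a digit may not start one...
        assert!(is_identifier_continuation('5')); // ...but may continue one, as in "i5"
        assert!(!is_identifier_continuation('#')); // punctuation does neither
    }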
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,

-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,

     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }
@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
             cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }

+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            "match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
+            // ...,
+            _ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();
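to_keyword_or_other runs only after a whole identifier has been consumed, so keyword detection is an exact match on the token's source text; every other identifier falls through to TokenKind::Other. A sketch under the same crate-internal assumptions (function name hypothetical):

    fn exact_spelling_only() {
        use ruff_text_size::{TextLen, TextRange};
        // "matches" merely starts with a keyword and stays a plain identifier:
        let token = SimpleTokenizer::new("matches", TextRange::up_to("matches".text_len()))
            .skip_trivia()
            .next()
            .expect("Expected a token");
        assert_eq!(token.kind(), TokenKind::Other); // not TokenKind::Match
    }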
@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,

             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };

                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }

                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // if we only have identifier continuations but no start (e.g. 555) we
+                // don't want to consume the chars, so in that case, we want to rewind the
+                // cursor to here
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };

             if kind == TokenKind::Other {
                 self.bogus = true;
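Backward lexing has a subtlety the forward pass avoids: characters are eaten right to left, so a run of continuation characters with no valid start (e.g. `555`) must be rewound via the savepoint and re-emitted one char at a time, which is why the two directions can disagree. A sketch of the forward result, grounded in the ignore_word_with_only_id_continuing_chars test and snapshot (function name hypothetical):

    fn only_continuations_lex_char_by_char() {
        use ruff_text_size::{TextLen, TextRange};
        let source = "555";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .skip_trivia()
            .map(|token| token.kind())
            .collect();
        // '5' can continue an identifier but not start one, so no multi-char
        // token forms; backward lexing yields [Bogus, Bogus, Other] instead.
        assert_eq!(kinds, vec![TokenKind::Other, TokenKind::Bogus, TokenKind::Bogus]);
    }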
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }

+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";