From 1e894f328c13faedec9b028ab87543dbb379e246 Mon Sep 17 00:00:00 2001
From: David Szotten
Date: Mon, 10 Jul 2023 09:00:59 +0100
Subject: [PATCH] formatter: multi char tokens in SimpleTokenizer (#5610)

---
 Cargo.lock                                    |   1 +
 crates/ruff_python_formatter/Cargo.toml       |   1 +
 .../src/comments/placement.rs                 |  45 +++----
 ...__identifier_ending_in_non_start_char.snap |  10 ++
 ...re_word_with_only_id_continuing_chars.snap |  18 +++
 ...er__trivia__tests__tokenize_multichar.snap |  34 +++++
 ...matter__trivia__tests__tricky_unicode.snap |  10 ++
 crates/ruff_python_formatter/src/trivia.rs    | 123 +++++++++++++++++-
 8 files changed, 209 insertions(+), 33 deletions(-)
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
 create mode 100644 crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap

diff --git a/Cargo.lock b/Cargo.lock
index 541991d0fb..d64986ffc2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2101,6 +2101,7 @@ dependencies = [
  "similar",
  "smallvec",
  "thiserror",
+ "unic-ucd-ident",
 ]

 [[package]]
diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml
index c27d92a907..dac4a95d80 100644
--- a/crates/ruff_python_formatter/Cargo.toml
+++ b/crates/ruff_python_formatter/Cargo.toml
@@ -28,6 +28,7 @@ rustpython-parser = { workspace = true }
 serde = { workspace = true, optional = true }
 smallvec = { workspace = true }
 thiserror = { workspace = true }
+unic-ucd-ident = "0.9.0"

 [dev-dependencies]
 ruff_formatter = { path = "../ruff_formatter", features = ["serde"]}
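A note on the new dependency: `unic-ucd-ident` provides the Unicode `XID_Start` and `XID_Continue` predicates that Python's identifier grammar (https://docs.python.org/3/reference/lexical_analysis.html#identifiers) is defined in terms of. The sketch below is illustrative only, not part of the patch; the non-ASCII characters are the same ones the `tricky_unicode` test below uses:

    // Illustrative sketch (not part of the patch).
    use unic_ucd_ident::{is_xid_continue, is_xid_start};

    fn main() {
        // Khmer letter MO (category Lo) can start an identifier...
        assert!(is_xid_start('ម'));
        // ...while the combining vowel sign U (category Mn) can only continue one.
        assert!(!is_xid_start('ុ'));
        assert!(is_xid_continue('ុ'));
        // `_` is not XID_Start, which is why the patch special-cases it
        // in `is_identifier_start` below.
        assert!(!is_xid_start('_'));
    }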
diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs
index 207b9a68a8..89dc52896b 100644
--- a/crates/ruff_python_formatter/src/comments/placement.rs
+++ b/crates/ruff_python_formatter/src/comments/placement.rs
@@ -1,6 +1,6 @@
 use std::cmp::Ordering;

-use ruff_text_size::{TextLen, TextRange};
+use ruff_text_size::TextRange;
 use rustpython_parser::ast;
 use rustpython_parser::ast::{Expr, ExprIfExp, ExprSlice, Ranged};

@@ -14,9 +14,7 @@ use crate::expression::expr_slice::{assign_comment_in_slice, ExprSliceCommentSec
 use crate::other::arguments::{
     assign_argument_separator_comment_placement, find_argument_separators,
 };
-use crate::trivia::{
-    first_non_trivia_token, first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind,
-};
+use crate::trivia::{first_non_trivia_token_rev, SimpleTokenizer, Token, TokenKind};

 /// Implements the custom comment placement logic.
 pub(super) fn place_comment<'a>(
@@ -1191,10 +1189,16 @@ fn handle_expr_if_comment<'a>(
     }

     // Find the if and the else
-    let if_token =
-        find_only_token_str_in_range(TextRange::new(body.end(), test.start()), locator, "if");
-    let else_token =
-        find_only_token_str_in_range(TextRange::new(test.end(), orelse.start()), locator, "else");
+    let if_token = find_only_token_in_range(
+        TextRange::new(body.end(), test.start()),
+        locator,
+        TokenKind::If,
+    );
+    let else_token = find_only_token_in_range(
+        TextRange::new(test.end(), orelse.start()),
+        locator,
+        TokenKind::Else,
+    );

     // Between `if` and `test`
     if if_token.range.start() < comment.slice().start() && comment.slice().start() < test.start() {
@@ -1211,25 +1215,12 @@
     CommentPlacement::Default(comment)
 }

-/// Looks for a multi char token in the range that contains no other tokens. `SimpleTokenizer` only
-/// works with single char tokens so we check that we have the right token by string comparison.
-fn find_only_token_str_in_range(range: TextRange, locator: &Locator, token_str: &str) -> Token {
-    let token =
-        first_non_trivia_token(range.start(), locator.contents()).expect("Expected a token");
-    debug_assert!(
-        locator.after(token.start()).starts_with(token_str),
-        "expected a `{token_str}` token",
-    );
-    debug_assert!(
-        SimpleTokenizer::new(
-            locator.contents(),
-            TextRange::new(token.start() + token_str.text_len(), range.end())
-        )
-        .skip_trivia()
-        .next()
-        .is_none(),
-        "Didn't expect any other token"
-    );
+/// Looks for a token in the range that contains no other tokens.
+fn find_only_token_in_range(range: TextRange, locator: &Locator, token_kind: TokenKind) -> Token {
+    let mut tokens = SimpleTokenizer::new(locator.contents(), range).skip_trivia();
+    let token = tokens.next().expect("Expected a token");
+    debug_assert_eq!(token.kind(), token_kind);
+    debug_assert_eq!(tokens.next(), None);
     token
 }
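The placement.rs change above is mechanical: now that the tokenizer produces real multi-character tokens, `find_only_token_in_range` can assert on a `TokenKind` directly instead of string-comparing source text. The sketch below shows the tokenizer contract it relies on; it is illustrative only, not part of the patch, and assumes the post-patch `crate::trivia` API:

    // Illustrative sketch (not part of the patch).
    use ruff_text_size::{TextLen, TextRange};

    use crate::trivia::{SimpleTokenizer, TokenKind};

    fn demo() {
        // The gap between an if-expression's body and its test, e.g. the
        // ` if ` of `x if cond else y`, now lexes as a single `If` token.
        let gap = " if ";
        let mut tokens = SimpleTokenizer::new(gap, TextRange::up_to(gap.text_len())).skip_trivia();

        let token = tokens.next().expect("Expected a token");
        assert_eq!(token.kind(), TokenKind::If);
        // ...and it is the only non-trivia token in the range.
        assert_eq!(tokens.next(), None);
    }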
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
new file mode 100644
index 0000000000..15e9d84407
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__identifier_ending_in_non_start_char.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..2,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
new file mode 100644
index 0000000000..26e9fd18bc
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__ignore_word_with_only_id_continuing_chars.snap
@@ -0,0 +1,18 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..1,
+    },
+    Token {
+        kind: Bogus,
+        range: 1..2,
+    },
+    Token {
+        kind: Bogus,
+        range: 2..3,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
new file mode 100644
index 0000000000..16a1293b44
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tokenize_multichar.snap
@@ -0,0 +1,34 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: If,
+        range: 0..2,
+    },
+    Token {
+        kind: Whitespace,
+        range: 2..3,
+    },
+    Token {
+        kind: In,
+        range: 3..5,
+    },
+    Token {
+        kind: Whitespace,
+        range: 5..6,
+    },
+    Token {
+        kind: Else,
+        range: 6..10,
+    },
+    Token {
+        kind: Whitespace,
+        range: 10..11,
+    },
+    Token {
+        kind: Match,
+        range: 11..16,
+    },
+]
diff --git a/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
new file mode 100644
index 0000000000..91b9cb397a
--- /dev/null
+++ b/crates/ruff_python_formatter/src/snapshots/ruff_python_formatter__trivia__tests__tricky_unicode.snap
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_python_formatter/src/trivia.rs
+expression: test_case.tokens()
+---
+[
+    Token {
+        kind: Other,
+        range: 0..6,
+    },
+]
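One detail the `ignore_word_with_only_id_continuing_chars` snapshot pins down: a word made only of continuation characters (such as `555`) has no identifier start, so its first character lexes as `Other`, the tokenizer sets its `bogus` flag, and the remaining characters degrade to per-character `Bogus` tokens. That is also why this is the only new test without `assert_reverse_tokenization`. The sketch below covers the forward direction; it is illustrative only, not part of the patch, and assumes `SimpleTokenizer` implements `Iterator<Item = Token>` as it does in `trivia.rs`:

    // Illustrative sketch (not part of the patch).
    use ruff_text_size::{TextLen, TextRange};

    use crate::trivia::{SimpleTokenizer, TokenKind};

    fn demo() {
        let source = "555";
        let kinds: Vec<TokenKind> = SimpleTokenizer::new(source, TextRange::up_to(source.text_len()))
            .map(|token| token.kind())
            .collect();
        // Forward: [Other, Bogus, Bogus]. Tokenizing from the back yields the
        // mirror image [Bogus, Bogus, Other], so the two directions disagree.
        assert_eq!(kinds, [TokenKind::Other, TokenKind::Bogus, TokenKind::Bogus]);
    }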
diff --git a/crates/ruff_python_formatter/src/trivia.rs b/crates/ruff_python_formatter/src/trivia.rs
index 86516170a1..4d8a1fa8cd 100644
--- a/crates/ruff_python_formatter/src/trivia.rs
+++ b/crates/ruff_python_formatter/src/trivia.rs
@@ -1,8 +1,8 @@
 use std::str::Chars;

-use ruff_text_size::{TextLen, TextRange, TextSize};
-
 use ruff_python_whitespace::is_python_whitespace;
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use unic_ucd_ident::{is_xid_continue, is_xid_start};

 /// Searches for the first non-trivia character in `range`.
 ///
@@ -92,6 +92,24 @@ pub(crate) fn skip_trailing_trivia(offset: TextSize, code: &str) -> TextSize {
     offset
 }

+fn is_identifier_start(c: char) -> bool {
+    c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
+}
+
+// Checks if the character c is a valid continuation character as described
+// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+fn is_identifier_continuation(c: char) -> bool {
+    if c.is_ascii() {
+        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
+    } else {
+        is_xid_continue(c)
+    }
+}
+
+fn is_non_ascii_identifier_start(c: char) -> bool {
+    is_xid_start(c)
+}
+
 #[derive(Clone, Debug, Eq, PartialEq, Hash)]
 pub(crate) struct Token {
     pub(crate) kind: TokenKind,
@@ -167,7 +185,19 @@ pub(crate) enum TokenKind {
     /// `.`.
     Dot,

-    /// Any other non trivia token. Always has a length of 1
+    /// `else`
+    Else,
+
+    /// `if`
+    If,
+
+    /// `in`
+    In,
+
+    /// `match`
+    Match,
+
+    /// Any other non trivia token.
     Other,

     /// Returned for each character after [`TokenKind::Other`] has been returned once.
@@ -215,6 +245,7 @@ pub(crate) struct SimpleTokenizer<'a> {
     /// `true` when it is known that the current `back` line has no comment for sure.
     back_line_has_no_comment: bool,
     bogus: bool,
+    source: &'a str,
     cursor: Cursor<'a>,
 }

@@ -225,6 +256,7 @@ impl<'a> SimpleTokenizer<'a> {
             back_offset: range.end(),
             back_line_has_no_comment: false,
             bogus: false,
+            source,
             cursor: Cursor::new(&source[range]),
         }
     }
@@ -238,6 +270,18 @@ impl<'a> SimpleTokenizer<'a> {
         Self::new(source, TextRange::up_to(offset))
     }

+    fn to_keyword_or_other(&self, range: TextRange) -> TokenKind {
+        let source = &self.source[range];
+        match source {
+            "if" => TokenKind::If,
+            "else" => TokenKind::Else,
+            "in" => TokenKind::In,
+            "match" => TokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
+            // ...,
+            _ => TokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
+        }
+    }
+
     fn next_token(&mut self) -> Token {
         self.cursor.start_token();

@@ -279,12 +323,19 @@ impl<'a> SimpleTokenizer<'a> {
             '\\' => TokenKind::Continuation,

             c => {
-                let kind = TokenKind::from_non_trivia_char(c);
+                let kind = if is_identifier_start(c) {
+                    self.cursor.eat_while(is_identifier_continuation);
+                    let token_len = self.cursor.token_len();
+
+                    let range = TextRange::at(self.offset, token_len);
+                    self.to_keyword_or_other(range)
+                } else {
+                    TokenKind::from_non_trivia_char(c)
+                };

                 if kind == TokenKind::Other {
                     self.bogus = true;
                 }
-
                 kind
             }
         };
@@ -386,7 +437,29 @@ impl<'a> SimpleTokenizer<'a> {
         } else if c == '\\' {
             TokenKind::Continuation
         } else {
-            let kind = TokenKind::from_non_trivia_char(c);
+            let kind = if is_identifier_continuation(c) {
+                // if we only have identifier continuations but no start (e.g. 555) we
+                // don't want to consume the chars, so in that case, we want to rewind the
+                // cursor to here
+                let savepoint = self.cursor.clone();
+                self.cursor.eat_back_while(is_identifier_continuation);
+
+                let token_len = self.cursor.token_len();
+                let range = TextRange::at(self.back_offset - token_len, token_len);
+
+                if self.source[range]
+                    .chars()
+                    .next()
+                    .is_some_and(is_identifier_start)
+                {
+                    self.to_keyword_or_other(range)
+                } else {
+                    self.cursor = savepoint;
+                    TokenKind::Other
+                }
+            } else {
+                TokenKind::from_non_trivia_char(c)
+            };

             if kind == TokenKind::Other {
                 self.bogus = true;
@@ -624,6 +697,44 @@ mod tests {
         test_case.assert_reverse_tokenization();
     }

+    #[test]
+    fn tricky_unicode() {
+        let source = "មុ";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn identifier_ending_in_non_start_char() {
+        let source = "i5";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
+    #[test]
+    fn ignore_word_with_only_id_continuing_chars() {
+        let source = "555";
+
+        let test_case = tokenize(source);
+        assert_debug_snapshot!(test_case.tokens());
+
+        // note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
+    }
+
+    #[test]
+    fn tokenize_multichar() {
+        let source = "if in else match";
+
+        let test_case = tokenize(source);
+
+        assert_debug_snapshot!(test_case.tokens());
+        test_case.assert_reverse_tokenization();
+    }
+
     #[test]
     fn tokenize_substring() {
         let source = "('some string') # comment";