Apply NFKC normalization to unicode identifiers in the lexer (#10412)

2025-07-24 13:33:50 +00:00 · 2024-03-18 11:56:56 +00:00 · 2024-03-18 11:56:56 +00:00 · 92e6026446
commit 92e6026446
parent bb540718c2
9 changed files with 68 additions and 15 deletions
--- a/crates/ruff_python_parser/src/lexer.rs
+++ b/crates/ruff_python_parser/src/lexer.rs
@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};

 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;

 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
            _ => {}
        }

-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We therefore need to do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));

        let text = self.token_text();

+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
        let keyword = match text {
            "False" => Tok::False,
            "None" => Tok::None,
@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
    is_xid_start(c)
 }

-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character c is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function tracks whether the identifier as a whole is
+/// ASCII-only: it sets the boolean behind the mutable reference passed in to
+/// `false` whenever `c` is a non-ASCII character.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    // Arrange things such that ASCII codepoints never
    // result in the slower `is_xid_continue` getting called.
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
+        *identifier_is_ascii_only = false;
        is_xid_continue(c)
    }
 }
@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
        assert_debug_snapshot!(lex_source(source));
    }

+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
    fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
        let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
        lex_source(&source)
--- a/crates/ruff_python_parser/src/token.rs
+++ b/crates/ruff_python_parser/src/token.rs
@ -16,6 +16,9 @@ pub enum Tok {
    /// Token value for a name, commonly known as an identifier.
    Name {
        /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
        name: Box<str>,
    },
    /// Token value for an integer.