Mirror of https://github.com/astral-sh/ruff.git, synced 2025-07-24 13:33:50 +00:00
Apply NFKC normalization to unicode identifiers in the lexer (#10412)
parent bb540718c2 · commit 92e6026446
9 changed files with 68 additions and 15 deletions
Imports (in the lexer):

```diff
@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};
 
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;
 
 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
```
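For reference, a minimal standalone sketch (not part of the commit) of what the newly imported `unicode-normalization` crate provides: its `UnicodeNormalization` trait adds an `nfkc()` iterator adapter to `&str`, yielding the NFKC-normalized characters one at a time.

```rust
use unicode_normalization::UnicodeNormalization;

fn main() {
    // U+1D49E MATHEMATICAL SCRIPT CAPITAL C normalizes to plain ASCII "C"
    // under NFKC, which is exactly the equivalence Python's lexer applies.
    let normalized: String = "𝒞".nfkc().collect();
    assert_eq!(normalized, "C");
}
```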
Identifier lexing (inside the `Lexer` impl):

```diff
@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
             _ => {}
         }
 
-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We therefore need to do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
 
         let text = self.token_text();
 
+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
         let keyword = match text {
             "False" => Tok::False,
             "None" => Tok::None,
```
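The shape of the change, as a self-contained sketch with a hypothetical `normalize_identifier` helper (illustrative only, not ruff's code): normalization is only paid for when a non-ASCII character was actually seen, because ASCII text is already in NFKC form.

```rust
use unicode_normalization::UnicodeNormalization;

// Hypothetical helper illustrating the fast path taken above: ASCII-only
// identifiers (by far the most common case) skip normalization entirely.
fn normalize_identifier(text: &str) -> Box<str> {
    if text.is_ascii() {
        // ASCII strings are invariant under NFKC, so no work is needed.
        Box::from(text)
    } else {
        text.nfkc().collect::<String>().into_boxed_str()
    }
}

fn main() {
    assert_eq!(&*normalize_identifier("spam"), "spam");
    assert_eq!(&*normalize_identifier("𝒞"), "C");
}
```

In the lexer itself the ASCII check is effectively free: the flag is tracked during the character scan that `eat_while` already performs, avoiding a second pass over the identifier.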
The identifier-continuation predicate:

```diff
@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character c is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function keeps track of whether the identifier as a
+/// whole is ASCII-only, by mutably updating a boolean reference that the
+/// caller passes in.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
+    // Arrange things such that ASCII codepoints never
+    // result in the slower `is_xid_continue` getting called.
     if c.is_ascii() {
         matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
     } else {
+        *identifier_is_ascii_only = false;
         is_xid_continue(c)
     }
 }
```
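A usage sketch of the new signature; the function body is copied from the hunk above, while the `main` driver is illustrative only:

```rust
use unicode_ident::is_xid_continue;

fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        *identifier_is_ascii_only = false;
        is_xid_continue(c)
    }
}

fn main() {
    let mut is_ascii = true;
    // Feed the continuation characters of `café` (everything after the
    // start character) through the predicate, as the lexer's cursor does.
    for c in "café".chars().skip(1) {
        assert!(is_identifier_continuation(c, &mut is_ascii));
    }
    // 'é' is XID_Continue but not ASCII, so the flag has flipped and
    // the lexer would NFKC-normalize this identifier.
    assert!(!is_ascii);
}
```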
The lexer tests, adding a regression test that `𝒞 = 500` and `C = 500` lex to identical token streams:

```diff
@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
     assert_debug_snapshot!(lex_source(source));
 }
 
+fn get_tokens_only(source: &str) -> Vec<Tok> {
+    lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+}
+
+#[test]
+fn test_nfkc_normalization() {
+    let source1 = "𝒞 = 500";
+    let source2 = "C = 500";
+    assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+}
+
 fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
     let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
     lex_source(&source)
```
The `Tok::Name` documentation in the token definitions:

```diff
@@ -16,6 +16,9 @@ pub enum Tok {
     /// Token value for a name, commonly known as an identifier.
     Name {
         /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers).
         name: Box<str>,
     },
     /// Token value for an integer.
```