diff --git a/Cargo.lock b/Cargo.lock index 2265b48d95..d59ed4c834 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2467,6 +2467,9 @@ dependencies = [ [[package]] name = "ruff_python_stdlib" version = "0.0.0" +dependencies = [ + "unic-ucd-ident", +] [[package]] name = "ruff_python_trivia" diff --git a/Cargo.toml b/Cargo.toml index 5a5d76595f..ecceb3f27b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ toml = { version = "0.7.2" } tracing = "0.1.37" tracing-indicatif = "0.3.4" tracing-subscriber = { version = "0.3.17", features = ["env-filter"] } +unic-ucd-ident = "0.9.0" unicode-width = "0.1.10" uuid = { version = "1.4.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] } wsl = { version = "0.1.0" } diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 02b27577b2..a4832dc592 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -24,7 +24,7 @@ lalrpop-util = { version = "0.20.0", default-features = false } num-bigint = { workspace = true } num-traits = { workspace = true } unic-emoji-char = "0.9.0" -unic-ucd-ident = "0.9.0" +unic-ucd-ident = { workspace = true } unicode_names2 = { version = "0.6.0", git = "https://github.com/youknowone/unicode_names2.git", rev = "4ce16aa85cbcdd9cc830410f1a72ef9a235f2fde" } rustc-hash = { workspace = true } static_assertions = "1.1.0" diff --git a/crates/ruff_python_stdlib/Cargo.toml b/crates/ruff_python_stdlib/Cargo.toml index 167b9fad04..daaa540a29 100644 --- a/crates/ruff_python_stdlib/Cargo.toml +++ b/crates/ruff_python_stdlib/Cargo.toml @@ -13,3 +13,4 @@ license = { workspace = true } [lib] [dependencies] +unic-ucd-ident = { workspace = true } diff --git a/crates/ruff_python_stdlib/src/identifiers.rs b/crates/ruff_python_stdlib/src/identifiers.rs index 18c0f9a4e6..a649c19c19 100644 --- a/crates/ruff_python_stdlib/src/identifiers.rs +++ b/crates/ruff_python_stdlib/src/identifiers.rs @@ -1,3 +1,5 @@ +use 
unic_ucd_ident::{is_xid_continue, is_xid_start}; + use crate::keyword::is_keyword; /// Returns `true` if a string is a valid Python identifier (e.g., variable @@ -5,12 +7,12 @@ use crate::keyword::is_keyword; pub fn is_identifier(name: &str) -> bool { // Is the first character a letter or underscore? let mut chars = name.chars(); - if !chars.next().is_some_and(|c| c.is_alphabetic() || c == '_') { + if !chars.next().is_some_and(is_identifier_start) { return false; } // Are the rest of the characters letters, digits, or underscores? - if !chars.all(|c| c.is_alphanumeric() || c == '_') { + if !chars.all(is_identifier_continuation) { return false; } @@ -22,6 +24,21 @@ pub fn is_identifier(name: &str) -> bool { true } +/// Returns `true` if `c` may begin an identifier: an ASCII letter, `_`, or any character accepted by `is_xid_start`. +/// See <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>. +fn is_identifier_start(c: char) -> bool { + matches!(c, 'a'..='z' | 'A'..='Z' | '_') || is_xid_start(c) +} + +/// Returns `true` if `c` may follow the first character of an identifier: an ASCII letter, ASCII digit, `_`, or any character accepted by `is_xid_continue`. +/// See <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>. +fn is_identifier_continuation(c: char) -> bool { + match c { + 'a'..='z' | 'A'..='Z' | '_' | '0'..='9' => true, + c => is_xid_continue(c), + } +} + /// Returns `true` if a string is a private identifier, such that, when the /// identifier is defined in a class definition, it will be mangled prior to /// code generation. 
@@ -76,7 +93,25 @@ pub fn is_migration_name(name: &str) -> bool { #[cfg(test)] mod tests { - use crate::identifiers::{is_migration_name, is_module_name}; + use crate::identifiers::{is_identifier, is_migration_name, is_module_name}; + + #[test] + fn valid_identifiers() { + assert!(is_identifier("_abc")); + assert!(is_identifier("abc")); + assert!(is_identifier("_")); + assert!(is_identifier("a_b_c")); + assert!(is_identifier("abc123")); + assert!(is_identifier("abc_123")); + assert!(is_identifier("漢字")); + assert!(is_identifier("ひらがな")); + assert!(is_identifier("العربية")); + assert!(is_identifier("кириллица")); + assert!(is_identifier("πr")); + assert!(!is_identifier("")); + assert!(!is_identifier("percentile_co³t")); + assert!(!is_identifier("HelloWorld❤️")); + } #[test] fn module_name() { diff --git a/crates/ruff_python_trivia/Cargo.toml b/crates/ruff_python_trivia/Cargo.toml index f0f9861e5a..75273acd1d 100644 --- a/crates/ruff_python_trivia/Cargo.toml +++ b/crates/ruff_python_trivia/Cargo.toml @@ -18,7 +18,7 @@ ruff_source_file = { path = "../ruff_source_file" } memchr = { workspace = true } smallvec = { workspace = true } -unic-ucd-ident = "0.9.0" +unic-ucd-ident = { workspace = true } [dev-dependencies] insta = { workspace = true }