Update identifier Unicode character validation to match Python spec (#7209)

Co-authored-by: Micha Reiser <micha@reiser.io>
This commit is contained in:
Victor Hugo Gomes 2023-09-07 04:08:42 -03:00 committed by GitHub
parent fda48afc23
commit 041cdb95e0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 5 deletions

View file

@ -1,3 +1,5 @@
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use crate::keyword::is_keyword;
/// Returns `true` if a string is a valid Python identifier (e.g., variable
@ -5,12 +7,12 @@ use crate::keyword::is_keyword;
pub fn is_identifier(name: &str) -> bool {
// Is the first character a letter or underscore?
let mut chars = name.chars();
if !chars.next().is_some_and(|c| c.is_alphabetic() || c == '_') {
if !chars.next().is_some_and(is_identifier_start) {
return false;
}
// Are the rest of the characters letters, digits, or underscores?
if !chars.all(|c| c.is_alphanumeric() || c == '_') {
if !chars.all(is_identifier_continuation) {
return false;
}
@ -22,6 +24,21 @@ pub fn is_identifier(name: &str) -> bool {
true
}
/// Returns `true` if `c` may appear as the first character of a Python
/// identifier.
///
/// ASCII letters and the underscore are accepted directly; any other
/// character is deferred to `is_xid_start`. See
/// <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
fn is_identifier_start(c: char) -> bool {
    match c {
        // Common ASCII case, answered without consulting `is_xid_start`.
        'a'..='z' | 'A'..='Z' | '_' => true,
        other => is_xid_start(other),
    }
}
/// Returns `true` if `c` may appear after the first character of a Python
/// identifier.
///
/// ASCII letters, ASCII digits and the underscore are accepted directly; any
/// other character is deferred to `is_xid_continue`. See
/// <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
fn is_identifier_continuation(c: char) -> bool {
    // The ASCII checks short-circuit, so `is_xid_continue` is only consulted
    // for characters outside `[A-Za-z0-9_]`.
    c.is_ascii_alphanumeric() || c == '_' || is_xid_continue(c)
}
/// Returns `true` if a string is a private identifier, such that, when the
/// identifier is defined in a class definition, it will be mangled prior to
/// code generation.
@ -76,7 +93,25 @@ pub fn is_migration_name(name: &str) -> bool {
#[cfg(test)]
mod tests {
use crate::identifiers::{is_migration_name, is_module_name};
use crate::identifiers::{is_identifier, is_migration_name, is_module_name};
#[test]
fn valid_identifiers() {
    // Names that must be accepted: plain ASCII forms followed by several
    // non-ASCII scripts.
    let accepted = [
        "_abc",
        "abc",
        "_",
        "a_b_c",
        "abc123",
        "abc_123",
        "漢字",
        "ひらがな",
        "العربية",
        "кириллица",
        "πr",
    ];
    for name in accepted {
        assert!(is_identifier(name));
    }

    // Names that must be rejected: the empty string, a superscript digit,
    // and a trailing heart symbol.
    let rejected = ["", "percentile_co³t", "HelloWorld❤"];
    for name in rejected {
        assert!(!is_identifier(name));
    }
}
#[test]
fn module_name() {