diff --git a/Cargo.lock b/Cargo.lock
index e2121a52..f7d00172 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -33,6 +33,7 @@ name = "erg_parser"
 version = "0.5.13"
 dependencies = [
  "erg_common",
+ "unicode-xid",
 ]
 
 [[package]]
@@ -50,6 +51,12 @@ version = "0.2.132"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8371e4e5341c3a96db127eb2465ac681ced4c433e01dd0e938adbef26ba93ba5"
 
+[[package]]
+name = "unicode-xid"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
+
 [[package]]
 name = "winapi"
 version = "0.3.9"
diff --git a/compiler/erg_parser/Cargo.toml b/compiler/erg_parser/Cargo.toml
index c8f622c8..db93589e 100644
--- a/compiler/erg_parser/Cargo.toml
+++ b/compiler/erg_parser/Cargo.toml
@@ -19,6 +19,7 @@ pretty = ["erg_common/pretty"]
 
 [dependencies]
 erg_common = { version = "0.5.13", path = "../erg_common" }
+unicode-xid = "0.2.4"
 
 [lib]
 path = "lib.rs"
diff --git a/compiler/erg_parser/lex.rs b/compiler/erg_parser/lex.rs
index 33b5e3a1..3f29d6e1 100644
--- a/compiler/erg_parser/lex.rs
+++ b/compiler/erg_parser/lex.rs
@@ -1,6 +1,8 @@
 //! defines and implements `Lexer` (Tokenizer).
 use std::cmp::Ordering;
 
+use unicode_xid::UnicodeXID;
+
 use erg_common::cache::CacheSet;
 use erg_common::config::ErgConfig;
 use erg_common::config::Input;
@@ -156,28 +158,12 @@ impl Lexer /*<'a>*/ {
         Some(Err(LexError::feature_error(0, token.loc(), feat_name)))
     }
 
-    const fn is_valid_symbol_ch(c: char) -> bool {
-        match c {
-            '0'..='9' => true,
-            // control characters
-            '\0' | '\u{0009}'..='\u{001F}' => false,
-            // white spaces
-            ' ' | '\u{00A0}' => false,
-            '\u{007F}' | '\u{0085}' | '\u{05C1}' | '\u{05C2}' => false,
-            '\u{0701}'..='\u{070d}' => false,
-            '\u{07B2}'..='\u{07BF}' => false,
-            '\u{1680}' | '\u{180E}' => false,
-            '\u{2000}'..='\u{200F}' => false,
-            '\u{2028}'..='\u{202F}' => false,
-            '\u{205F}'..='\u{206F}' => false,
-            '\u{3000}' | '\u{3164}' | '\u{FEFF}' => false,
-            // operator characters + special markers
-            '<' | '>' | '$' | '%' | '.' | ',' | ':' | ';' | '+' | '-' | '*' | '/' | '=' | '#'
-            | '&' | '|' | '^' | '~' | '@' | '!' | '?' | '\\' => false,
-            // enclosures
-            '[' | ']' | '(' | ')' | '{' | '}' | '\"' | '\'' | '`' => false,
-            _ => true,
-        }
+    fn is_valid_start_symbol_ch(c: char) -> bool {
+        c.is_xid_start() || c == '_'
+    }
+
+    fn is_valid_continue_symbol_ch(c: char) -> bool {
+        c.is_xid_continue() && !('0'..='9').contains(&c) || c == '_'
     }
 
     /// Detect `c` is a bidirectional overriding character.
@@ -504,7 +490,7 @@ impl Lexer /*<'a>*/ {
                 n if n.is_ascii_digit() || n == '_' => {
                     num.push(self.consume().unwrap());
                 }
-                c if Self::is_valid_symbol_ch(c) => {
+                c if Self::is_valid_continue_symbol_ch(c) => {
                     // exponent (e.g. 10e+3)
                     if c == 'e'
                         && (self.peek_next_ch() == Some('+') || self.peek_next_ch() == Some('-'))
@@ -544,7 +530,7 @@ impl Lexer /*<'a>*/ {
             }
             // method call of IntLit
             // or range operator (e.g. 1..)
-            Some(c) if Self::is_valid_symbol_ch(c) || c == '.' => {
+            Some(c) if Self::is_valid_continue_symbol_ch(c) || c == '.' => {
                 let kind = if num.starts_with('-') && !Self::is_zero(&num) {
                     IntLit
                 } else {
@@ -583,7 +569,7 @@ impl Lexer /*<'a>*/ {
     fn lex_symbol(&mut self, first_ch: char) -> LexResult<Token> {
         let mut cont = first_ch.to_string();
         while let Some(c) = self.peek_cur_ch() {
-            if Self::is_valid_symbol_ch(c) {
+            if Self::is_valid_continue_symbol_ch(c) {
                 cont.push(self.consume().unwrap());
             } else {
                 break;
@@ -1245,7 +1231,7 @@ impl Iterator for Lexer /*<'a>*/ {
             // IntLit or RatioLit
             Some(n) if n.is_ascii_digit() => Some(self.lex_num(n)),
             // Symbol (includes '_')
-            Some(c) if Self::is_valid_symbol_ch(c) => Some(self.lex_symbol(c)),
+            Some(c) if Self::is_valid_start_symbol_ch(c) => Some(self.lex_symbol(c)),
             // Invalid character (e.g. space-like character)
             Some(invalid) => {
                 let token = self.emit_token(Illegal, &invalid.to_string());
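
For reference, the patch replaces the hand-maintained blacklist with the Unicode XID_Start / XID_Continue properties from the unicode-xid crate. Below is a minimal standalone sketch, not part of the patch itself, that mirrors the shape of the two new helpers and prints how a few sample characters are classified; it only assumes unicode-xid = "0.2.4" as declared in Cargo.toml above, and the sample characters are illustrative.

// Standalone sketch mirroring the helpers added in lex.rs; build with
// `unicode-xid = "0.2.4"` in [dependencies]. Illustrative only.
use unicode_xid::UnicodeXID;

// Same shape as Lexer::is_valid_start_symbol_ch: a symbol may start with
// any XID_Start character or an underscore.
fn is_valid_start_symbol_ch(c: char) -> bool {
    c.is_xid_start() || c == '_'
}

// Same shape as Lexer::is_valid_continue_symbol_ch: an XID_Continue
// character that is not an ASCII digit, or an underscore.
fn is_valid_continue_symbol_ch(c: char) -> bool {
    c.is_xid_continue() && !('0'..='9').contains(&c) || c == '_'
}

fn main() {
    // Sample characters: ASCII letter, non-ASCII letter, underscore,
    // digit, operator, space.
    for c in ['a', '変', '_', '7', '+', ' '] {
        println!(
            "{:?}: start = {}, continue = {}",
            c,
            is_valid_start_symbol_ch(c),
            is_valid_continue_symbol_ch(c),
        );
    }
}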