Add identifier start unicode support for Postgres, MySQL and Redshift (#1944)

2025-10-17 09:17:14 +00:00 · 2025-07-15 10:26:11 +03:00 · 2025-07-15 10:26:11 +03:00 · ecd5d88638
commit ecd5d88638
parent c5e6ba5e7d
4 changed files with 18 additions and 8 deletions
--- a/src/dialect/mysql.rs
+++ b/src/dialect/mysql.rs
@ -43,11 +43,13 @@ impl Dialect for MySqlDialect {
        // See https://dev.mysql.com/doc/refman/8.0/en/identifiers.html.
        // Identifiers which begin with a digit are recognized while tokenizing numbers,
        // so they can be distinguished from exponent numeric literals.
        // MySQL also supports non-ASCII UTF-8 characters in identifiers.
        ch.is_alphabetic()
            || ch == '_'
            || ch == '$'
            || ch == '@'
            || ('\u{0080}'..='\u{ffff}').contains(&ch)
            || !ch.is_ascii()
    }
    fn is_identifier_part(&self, ch: char) -> bool {
--- a/src/dialect/postgresql.rs
+++ b/src/dialect/postgresql.rs
@ -65,10 +65,9 @@ impl Dialect for PostgreSqlDialect {
    }
    fn is_identifier_start(&self, ch: char) -> bool {
-        // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
+        ch.is_alphabetic() || ch == '_' ||
-        // We don't yet support identifiers beginning with "letters with
+        // PostgreSQL implements Unicode characters in identifiers.
-        // diacritical marks"
+        !ch.is_ascii()
        ch.is_alphabetic() || ch == '_'
    }
    fn is_identifier_part(&self, ch: char) -> bool {
--- a/src/dialect/redshift.rs
+++ b/src/dialect/redshift.rs
@ -80,9 +80,9 @@ impl Dialect for RedshiftSqlDialect {
    }
    fn is_identifier_start(&self, ch: char) -> bool {
-        // Extends Postgres dialect with sharp and UTF-8 multibyte chars
+        // UTF-8 multibyte characters are supported in identifiers via the PostgreSqlDialect.
        // https://docs.aws.amazon.com/redshift/latest/dg/r_names.html
-        PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#' || !ch.is_ascii()
+        PostgreSqlDialect {}.is_identifier_start(ch) || ch == '#'
    }
    fn is_identifier_part(&self, ch: char) -> bool {
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@ -11151,9 +11151,7 @@ fn parse_non_latin_identifiers() {
    let supported_dialects = TestedDialects::new(vec![
        Box::new(GenericDialect {}),
        Box::new(DuckDbDialect {}),
        Box::new(PostgreSqlDialect {}),
        Box::new(MsSqlDialect {}),
        Box::new(MySqlDialect {}),
    ]);
    assert!(supported_dialects
        .parse_sql_statements("SELECT 💝 FROM table1")
@ -16147,3 +16145,14 @@ fn test_identifier_unicode_support() {
    ]);
    let _ = dialects.verified_stmt(sql);
 }
 #[test]
 fn test_identifier_unicode_start() {
    // The statement uses an identifier that begins with a non-ASCII character
    // (an emoji), once as a column-name prefix and once as a bare alias.
    // It is handed to `verified_stmt` for the MySQL, Redshift and PostgreSQL dialects.
    let dialects = TestedDialects::new(vec![
        Box::new(MySqlDialect {}),
        Box::new(RedshiftSqlDialect {}),
        Box::new(PostgreSqlDialect {}),
    ]);
    let _ = dialects.verified_stmt(r#"SELECT 💝phone AS 💝 FROM customers"#);
 }