Add support for IS [NOT] [form] NORMALIZED (#1655)

Co-authored-by: Alexander Beedie <alexander.beedie@adia.ae>
2025-07-07 17:04:59 +00:00 · 2025-01-17 13:59:47 +04:00 · 2025-01-17 13:59:47 +04:00 · e9498d538a
commit e9498d538a
parent 3eeb9160ea
8 changed files with 185 additions and 17 deletions
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -83,7 +83,7 @@ pub use self::trigger::{

 pub use self::value::{
    escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
-    TrimWhereField, Value,
+    NormalizationForm, TrimWhereField, Value,
 };

 use crate::ast::helpers::stmt_data_loading::{
@ -653,6 +653,12 @@ pub enum Expr {
    IsDistinctFrom(Box<Expr>, Box<Expr>),
    /// `IS NOT DISTINCT FROM` operator
    IsNotDistinctFrom(Box<Expr>, Box<Expr>),
+    /// `<expr> IS [ NOT ] [ form ] NORMALIZED`
+    IsNormalized {
+        expr: Box<Expr>,
+        form: Option<NormalizationForm>,
+        negated: bool,
+    },
    /// `[ NOT ] IN (val1, val2, ...)`
    InList {
        expr: Box<Expr>,
@ -1118,7 +1124,7 @@ impl fmt::Display for LambdaFunction {
 /// `OneOrManyWithParens` implements `Deref<Target = [T]>` and `IntoIterator`,
 /// so you can call slice methods on it and iterate over items
 /// # Examples
-/// Acessing as a slice:
+/// Accessing as a slice:
 /// ```
 /// # use sqlparser::ast::OneOrManyWithParens;
 /// let one = OneOrManyWithParens::One("a");
@ -1419,6 +1425,24 @@ impl fmt::Display for Expr {
                if *regexp { "REGEXP" } else { "RLIKE" },
                pattern
            ),
+            // Renders `<expr> IS [NOT] [<form>] NORMALIZED`.
+            Expr::IsNormalized {
+                expr,
+                form,
+                negated,
+            } => {
+                // The trailing space belongs to the keyword, so the non-negated
+                // output contains no double space.
+                let not_ = if *negated { "NOT " } else { "" };
+                // Match on `form` directly instead of `is_none()` + `unwrap()`:
+                // same output, but no panic path to reason about.
+                match form {
+                    // `<expr> IS [NOT] <form> NORMALIZED`
+                    Some(form) => write!(f, "{} IS {}{} NORMALIZED", expr, not_, form),
+                    // `<expr> IS [NOT] NORMALIZED`
+                    None => write!(f, "{} IS {}NORMALIZED", expr, not_),
+                }
+            }
            Expr::SimilarTo {
                negated,
                expr,
@ -7799,7 +7823,7 @@ where
 /// ```sql
 /// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table;
 ///
-/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
+/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
 /// ```
 #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
--- a/src/ast/query.rs
+++ b/src/ast/query.rs
@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode {
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
 pub enum UpdateTableFromKind {
-    /// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
+    /// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
    /// For Example: `UPDATE FROM t1 SET t1.name='aaa'`
    BeforeSet(TableWithJoins),
-    /// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
+    /// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
    /// For Example: `UPDATE SET t1.name='aaa' FROM t1`
    AfterSet(TableWithJoins),
 }
--- a/src/ast/spans.rs
+++ b/src/ast/spans.rs
@ -1325,6 +1325,12 @@ impl Spanned for Expr {
                escape_char: _,
                any: _,
            } => expr.span().union(&pattern.span()),
+            Expr::RLike { .. } => Span::empty(),
+            Expr::IsNormalized {
+                expr,
+                form: _,
+                negated: _,
+            } => expr.span(),
            Expr::SimilarTo {
                negated: _,
                expr,
@ -1360,7 +1366,6 @@ impl Spanned for Expr {
            Expr::Array(array) => array.span(),
            Expr::MatchAgainst { .. } => Span::empty(),
            Expr::JsonAccess { value, path } => value.span().union(&path.span()),
-            Expr::RLike { .. } => Span::empty(),
            Expr::AnyOp {
                left,
                compare_op: _,
--- a/src/ast/value.rs
+++ b/src/ast/value.rs
@ -270,6 +270,35 @@ impl fmt::Display for DateTimeField {
    }
 }

+/// A Unicode normalization form, as used by `<expr> IS [ NOT ] [ form ] NORMALIZED`.
+///
+/// Each form eliminates certain distinctions between visually or functionally identical
+/// characters; see [Unicode Normalization Forms](https://unicode.org/reports/tr15/).
+#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+pub enum NormalizationForm {
+    /// Normalization Form C: canonical decomposition, followed by canonical composition.
+    NFC,
+    /// Normalization Form D: canonical decomposition.
+    NFD,
+    /// Normalization Form KC: compatibility decomposition, followed by canonical composition.
+    NFKC,
+    /// Normalization Form KD: compatibility decomposition.
+    NFKD,
+}
+
+impl fmt::Display for NormalizationForm {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.write_str(match self { // each form renders as its bare SQL keyword
+            NormalizationForm::NFC => "NFC",
+            NormalizationForm::NFD => "NFD",
+            NormalizationForm::NFKC => "NFKC",
+            NormalizationForm::NFKD => "NFKD",
+        })
+    }
+}
+
 pub struct EscapeQuotedString<'a> {
    string: &'a str,
    quote: char,
--- a/src/keywords.rs
+++ b/src/keywords.rs
@ -530,6 +530,10 @@ define_keywords!(
    NESTED,
    NEW,
    NEXT,
+    NFC,
+    NFD,
+    NFKC,
+    NFKD,
    NO,
    NOBYPASSRLS,
    NOCREATEDB,
@ -540,6 +544,7 @@ define_keywords!(
    NOORDER,
    NOREPLICATION,
    NORMALIZE,
+    NORMALIZED,
    NOSCAN,
    NOSUPERUSER,
    NOT,
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@ -3184,9 +3184,11 @@ impl<'a> Parser<'a> {
                    {
                        let expr2 = self.parse_expr()?;
                        Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2)))
+                    } else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) {
+                        Ok(is_normalized)
                    } else {
                        self.expected(
-                            "[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS",
+                            "[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS",
                            self.peek_token(),
                        )
                    }
@ -3851,7 +3853,7 @@ impl<'a> Parser<'a> {
    /// If the current token is the `expected` keyword, consume the token.
    /// Otherwise, return an error.
    ///
-    // todo deprecate infavor of expected_keyword_is
+    // todo deprecate in favor of expected_keyword_is
    pub fn expect_keyword(&mut self, expected: Keyword) -> Result<TokenWithSpan, ParserError> {
        if self.parse_keyword(expected) {
            Ok(self.get_current_token().clone())
@ -8453,6 +8455,33 @@ impl<'a> Parser<'a> {
        }
    }

+    /// Parse `[ NOT ] [ form ] NORMALIZED` (the part following `<expr> IS`) into
+    /// [`Expr::IsNormalized`]. Tokens consumed before an error are not restored.
+    pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> {
+        let negated = self.parse_keyword(Keyword::NOT);
+        // Only consumes a token on a match, so an absent form needs no backtracking.
+        let form = match self.parse_one_of_keywords(&[
+            Keyword::NFC,
+            Keyword::NFD,
+            Keyword::NFKC,
+            Keyword::NFKD,
+        ]) {
+            Some(Keyword::NFC) => Some(NormalizationForm::NFC),
+            Some(Keyword::NFD) => Some(NormalizationForm::NFD),
+            Some(Keyword::NFKC) => Some(NormalizationForm::NFKC),
+            Some(Keyword::NFKD) => Some(NormalizationForm::NFKD),
+            _ => None,
+        };
+        if !self.parse_keyword(Keyword::NORMALIZED) {
+            return self.expected("[form] NORMALIZED", self.peek_token());
+        }
+        Ok(Expr::IsNormalized {
+            expr: Box::new(expr),
+            form,
+            negated,
+        })
+    }
+
    pub fn parse_enum_values(&mut self) -> Result<Vec<EnumMember>, ParserError> {
        self.expect_token(&Token::LParen)?;
        let values = self.parse_comma_separated(|parser| {
@ -8979,7 +9008,7 @@ impl<'a> Parser<'a> {
        }
    }

-    /// Parse a table object for insetion
+    /// Parse a table object for insertion
    /// e.g. `some_database.some_table` or `FUNCTION some_table_func(...)`
    pub fn parse_table_object(&mut self) -> Result<TableObject, ParserError> {
        if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) {
@ -11887,7 +11916,7 @@ impl<'a> Parser<'a> {
            } else {
                let mut name = self.parse_grantee_name()?;
                if self.consume_token(&Token::Colon) {
-                    // Redshift supports namespace prefix for extenrnal users and groups:
+                    // Redshift supports namespace prefix for external users and groups:
                    // <Namespace>:<GroupName> or <Namespace>:<UserName>
                    // https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html
                    let ident = self.parse_identifier()?;
@ -12883,7 +12912,7 @@ impl<'a> Parser<'a> {
        Ok(WithFill { from, to, step })
    }

-    // Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect)
+    // Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect)
    // that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier
    pub fn parse_interpolations(&mut self) -> Result<Option<Interpolate>, ParserError> {
        if !self.parse_keyword(Keyword::INTERPOLATE) {
@ -14432,7 +14461,7 @@ mod tests {
        assert_eq!(
            ast,
            Err(ParserError::ParserError(
-                "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16"
+                "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16"
                    .to_string()
            ))
        );
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@ -4600,7 +4600,7 @@ fn run_explain_analyze(
    expected_verbose: bool,
    expected_analyze: bool,
    expected_format: Option<AnalyzeFormat>,
-    exepcted_options: Option<Vec<UtilityOption>>,
+    expected_options: Option<Vec<UtilityOption>>,
 ) {
    match dialect.verified_stmt(query) {
        Statement::Explain {
@ -4616,7 +4616,7 @@ fn run_explain_analyze(
            assert_eq!(verbose, expected_verbose);
            assert_eq!(analyze, expected_analyze);
            assert_eq!(format, expected_format);
-            assert_eq!(options, exepcted_options);
+            assert_eq!(options, expected_options);
            assert!(!query_plan);
            assert!(!estimate);
            assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string());
@ -9317,6 +9317,46 @@ fn parse_is_boolean() {
        verified_expr(sql)
    );

+    let sql = "a IS NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: None,
+            negated: false,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NOT NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: None,
+            negated: true,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NFKC NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: Some(NormalizationForm::NFKC),
+            negated: false,
+        },
+        verified_expr(sql)
+    );
+
+    let sql = "a IS NOT NFKD NORMALIZED";
+    assert_eq!(
+        IsNormalized {
+            expr: Box::new(Identifier(Ident::new("a"))),
+            form: Some(NormalizationForm::NFKD),
+            negated: true,
+        },
+        verified_expr(sql)
+    );
+
    let sql = "a IS UNKNOWN";
    assert_eq!(
        IsUnknown(Box::new(Identifier(Ident::new("a")))),
@ -9335,6 +9375,12 @@ fn parse_is_boolean() {
    verified_stmt("SELECT f FROM foo WHERE field IS FALSE");
    verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE");

+    verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED");
+    verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED");
+
    verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN");
    verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN");

@ -9342,7 +9388,37 @@ fn parse_is_boolean() {
    let res = parse_sql_statements(sql);
    assert_eq!(
        ParserError::ParserError(
-            "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0"
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS NFKC FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM"
+                .to_string()
+        ),
+        res.unwrap_err()
+    );
+
+    let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo";
+    let res = parse_sql_statements(sql);
+    assert_eq!(
+        ParserError::ParserError(
+            "Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM"
                .to_string()
        ),
        res.unwrap_err()
@ -13003,7 +13079,7 @@ fn test_trailing_commas_in_from() {
    let sql = "SELECT a FROM b, WHERE c = 1";
    let _ = dialects.parse_sql_statements(sql).unwrap();

-    // nasted
+    // nested
    let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),";
    let _ = dialects.parse_sql_statements(sql).unwrap();

--- a/tests/sqlparser_mysql.rs
+++ b/tests/sqlparser_mysql.rs
@ -2572,7 +2572,7 @@ fn parse_kill() {
 }

 #[test]
-fn parse_table_colum_option_on_update() {
+fn parse_table_column_option_on_update() {
    let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
    match mysql().verified_stmt(sql1) {
        Statement::CreateTable(CreateTable { name, columns, .. }) => {