Add support for IS [NOT] [form] NORMALIZED (#1655)

Co-authored-by: Alexander Beedie <alexander.beedie@adia.ae>
This commit is contained in:
Alexander Beedie 2025-01-17 13:59:47 +04:00 committed by GitHub
parent 3eeb9160ea
commit e9498d538a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 185 additions and 17 deletions

View file

@ -83,7 +83,7 @@ pub use self::trigger::{
pub use self::value::{
escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
TrimWhereField, Value,
NormalizationForm, TrimWhereField, Value,
};
use crate::ast::helpers::stmt_data_loading::{
@ -653,6 +653,12 @@ pub enum Expr {
IsDistinctFrom(Box<Expr>, Box<Expr>),
/// `IS NOT DISTINCT FROM` operator
IsNotDistinctFrom(Box<Expr>, Box<Expr>),
/// `<expr> IS [ NOT ] [ form ] NORMALIZED`
IsNormalized {
expr: Box<Expr>,
form: Option<NormalizationForm>,
negated: bool,
},
/// `[ NOT ] IN (val1, val2, ...)`
InList {
expr: Box<Expr>,
@ -1118,7 +1124,7 @@ impl fmt::Display for LambdaFunction {
/// `OneOrManyWithParens` implements `Deref<Target = [T]>` and `IntoIterator`,
/// so you can call slice methods on it and iterate over items
/// # Examples
/// Acessing as a slice:
/// Accessing as a slice:
/// ```
/// # use sqlparser::ast::OneOrManyWithParens;
/// let one = OneOrManyWithParens::One("a");
@ -1419,6 +1425,24 @@ impl fmt::Display for Expr {
if *regexp { "REGEXP" } else { "RLIKE" },
pattern
),
Expr::IsNormalized {
expr,
form,
negated,
} => {
let not_ = if *negated { "NOT " } else { "" };
if form.is_none() {
write!(f, "{} IS {}NORMALIZED", expr, not_)
} else {
write!(
f,
"{} IS {}{} NORMALIZED",
expr,
not_,
form.as_ref().unwrap()
)
}
}
Expr::SimilarTo {
negated,
expr,
@ -7799,7 +7823,7 @@ where
/// ```sql
/// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table;
///
/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
/// ```
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]

View file

@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode {
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum UpdateTableFromKind {
/// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
/// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
/// For Example: `UPDATE FROM t1 SET t1.name='aaa'`
BeforeSet(TableWithJoins),
/// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
/// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
/// For Example: `UPDATE SET t1.name='aaa' FROM t1`
AfterSet(TableWithJoins),
}

View file

@ -1325,6 +1325,12 @@ impl Spanned for Expr {
escape_char: _,
any: _,
} => expr.span().union(&pattern.span()),
Expr::RLike { .. } => Span::empty(),
Expr::IsNormalized {
expr,
form: _,
negated: _,
} => expr.span(),
Expr::SimilarTo {
negated: _,
expr,
@ -1360,7 +1366,6 @@ impl Spanned for Expr {
Expr::Array(array) => array.span(),
Expr::MatchAgainst { .. } => Span::empty(),
Expr::JsonAccess { value, path } => value.span().union(&path.span()),
Expr::RLike { .. } => Span::empty(),
Expr::AnyOp {
left,
compare_op: _,

View file

@ -270,6 +270,35 @@ impl fmt::Display for DateTimeField {
}
}
/// A Unicode normalization form, as it may appear in
/// `<expr> IS [ NOT ] [ form ] NORMALIZED`.
///
/// The Unicode Standard specifies four normalization forms. Each of them
/// removes certain distinctions between characters that are visually or
/// functionally identical.
///
/// See [Unicode Normalization Forms](https://unicode.org/reports/tr15/) for details.
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum NormalizationForm {
    /// Form C: canonical decomposition, then canonical composition.
    NFC,
    /// Form D: canonical decomposition only.
    NFD,
    /// Form KC: compatibility decomposition, then canonical composition.
    NFKC,
    /// Form KD: compatibility decomposition only.
    NFKD,
}
impl fmt::Display for NormalizationForm {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
NormalizationForm::NFC => write!(f, "NFC"),
NormalizationForm::NFD => write!(f, "NFD"),
NormalizationForm::NFKC => write!(f, "NFKC"),
NormalizationForm::NFKD => write!(f, "NFKD"),
}
}
}
pub struct EscapeQuotedString<'a> {
string: &'a str,
quote: char,

View file

@ -530,6 +530,10 @@ define_keywords!(
NESTED,
NEW,
NEXT,
NFC,
NFD,
NFKC,
NFKD,
NO,
NOBYPASSRLS,
NOCREATEDB,
@ -540,6 +544,7 @@ define_keywords!(
NOORDER,
NOREPLICATION,
NORMALIZE,
NORMALIZED,
NOSCAN,
NOSUPERUSER,
NOT,

View file

@ -3184,9 +3184,11 @@ impl<'a> Parser<'a> {
{
let expr2 = self.parse_expr()?;
Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2)))
} else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) {
Ok(is_normalized)
} else {
self.expected(
"[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS",
"[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS",
self.peek_token(),
)
}
@ -3851,7 +3853,7 @@ impl<'a> Parser<'a> {
/// If the current token is the `expected` keyword, consume the token.
/// Otherwise, return an error.
///
// todo deprecate infavor of expected_keyword_is
// todo deprecate in favor of expected_keyword_is
pub fn expect_keyword(&mut self, expected: Keyword) -> Result<TokenWithSpan, ParserError> {
if self.parse_keyword(expected) {
Ok(self.get_current_token().clone())
@ -8453,6 +8455,33 @@ impl<'a> Parser<'a> {
}
}
/// Parses the tail of `<expr> IS [ NOT ] [ form ] NORMALIZED`: an optional
/// `NOT`, an optional normalization form keyword (`NFC`, `NFD`, `NFKC` or
/// `NFKD`) and then the `NORMALIZED` keyword.
///
/// `expr` becomes the operand of the returned [`Expr::IsNormalized`].
///
/// # Errors
///
/// Returns an error when `NORMALIZED` is not the next keyword, and forwards
/// any error reported by `maybe_parse`.
pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> {
    let negated = self.parse_keyword(Keyword::NOT);

    // The form is optional, so it is parsed speculatively.
    let form = self.maybe_parse(|parser| {
        let keyword = parser.parse_one_of_keywords(&[
            Keyword::NFC,
            Keyword::NFD,
            Keyword::NFKC,
            Keyword::NFKD,
        ]);
        let form = match keyword {
            Some(Keyword::NFC) => NormalizationForm::NFC,
            Some(Keyword::NFD) => NormalizationForm::NFD,
            Some(Keyword::NFKC) => NormalizationForm::NFKC,
            Some(Keyword::NFKD) => NormalizationForm::NFKD,
            _ => return parser.expected("unicode normalization form", parser.peek_token()),
        };
        Ok(form)
    })?;

    // Guard clause: without the closing keyword this is not an
    // `IS ... NORMALIZED` predicate.
    if !self.parse_keyword(Keyword::NORMALIZED) {
        return self.expected("unicode normalization form", self.peek_token());
    }

    Ok(Expr::IsNormalized {
        expr: Box::new(expr),
        form,
        negated,
    })
}
pub fn parse_enum_values(&mut self) -> Result<Vec<EnumMember>, ParserError> {
self.expect_token(&Token::LParen)?;
let values = self.parse_comma_separated(|parser| {
@ -8979,7 +9008,7 @@ impl<'a> Parser<'a> {
}
}
/// Parse a table object for insetion
/// Parse a table object for insertion
/// e.g. `some_database.some_table` or `FUNCTION some_table_func(...)`
pub fn parse_table_object(&mut self) -> Result<TableObject, ParserError> {
if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) {
@ -11887,7 +11916,7 @@ impl<'a> Parser<'a> {
} else {
let mut name = self.parse_grantee_name()?;
if self.consume_token(&Token::Colon) {
// Redshift supports namespace prefix for extenrnal users and groups:
// Redshift supports namespace prefix for external users and groups:
// <Namespace>:<GroupName> or <Namespace>:<UserName>
// https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html
let ident = self.parse_identifier()?;
@ -12883,7 +12912,7 @@ impl<'a> Parser<'a> {
Ok(WithFill { from, to, step })
}
// Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect)
// Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect)
// that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier
pub fn parse_interpolations(&mut self) -> Result<Option<Interpolate>, ParserError> {
if !self.parse_keyword(Keyword::INTERPOLATE) {
@ -14432,7 +14461,7 @@ mod tests {
assert_eq!(
ast,
Err(ParserError::ParserError(
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16"
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16"
.to_string()
))
);

View file

@ -4600,7 +4600,7 @@ fn run_explain_analyze(
expected_verbose: bool,
expected_analyze: bool,
expected_format: Option<AnalyzeFormat>,
exepcted_options: Option<Vec<UtilityOption>>,
expected_options: Option<Vec<UtilityOption>>,
) {
match dialect.verified_stmt(query) {
Statement::Explain {
@ -4616,7 +4616,7 @@ fn run_explain_analyze(
assert_eq!(verbose, expected_verbose);
assert_eq!(analyze, expected_analyze);
assert_eq!(format, expected_format);
assert_eq!(options, exepcted_options);
assert_eq!(options, expected_options);
assert!(!query_plan);
assert!(!estimate);
assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string());
@ -9317,6 +9317,46 @@ fn parse_is_boolean() {
verified_expr(sql)
);
let sql = "a IS NORMALIZED";
assert_eq!(
IsNormalized {
expr: Box::new(Identifier(Ident::new("a"))),
form: None,
negated: false,
},
verified_expr(sql)
);
let sql = "a IS NOT NORMALIZED";
assert_eq!(
IsNormalized {
expr: Box::new(Identifier(Ident::new("a"))),
form: None,
negated: true,
},
verified_expr(sql)
);
let sql = "a IS NFKC NORMALIZED";
assert_eq!(
IsNormalized {
expr: Box::new(Identifier(Ident::new("a"))),
form: Some(NormalizationForm::NFKC),
negated: false,
},
verified_expr(sql)
);
let sql = "a IS NOT NFKD NORMALIZED";
assert_eq!(
IsNormalized {
expr: Box::new(Identifier(Ident::new("a"))),
form: Some(NormalizationForm::NFKD),
negated: true,
},
verified_expr(sql)
);
let sql = "a IS UNKNOWN";
assert_eq!(
IsUnknown(Box::new(Identifier(Ident::new("a")))),
@ -9335,6 +9375,12 @@ fn parse_is_boolean() {
verified_stmt("SELECT f FROM foo WHERE field IS FALSE");
verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE");
verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED");
verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED");
verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED");
verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED");
verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED");
verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN");
verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN");
@ -9342,7 +9388,37 @@ fn parse_is_boolean() {
let res = parse_sql_statements(sql);
assert_eq!(
ParserError::ParserError(
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0"
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0"
.to_string()
),
res.unwrap_err()
);
let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo";
let res = parse_sql_statements(sql);
assert_eq!(
ParserError::ParserError(
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ"
.to_string()
),
res.unwrap_err()
);
let sql = "SELECT s, s IS NFKC FROM foo";
let res = parse_sql_statements(sql);
assert_eq!(
ParserError::ParserError(
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM"
.to_string()
),
res.unwrap_err()
);
let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo";
let res = parse_sql_statements(sql);
assert_eq!(
ParserError::ParserError(
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM"
.to_string()
),
res.unwrap_err()
@ -13003,7 +13079,7 @@ fn test_trailing_commas_in_from() {
let sql = "SELECT a FROM b, WHERE c = 1";
let _ = dialects.parse_sql_statements(sql).unwrap();
// nasted
// nested
let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),";
let _ = dialects.parse_sql_statements(sql).unwrap();

View file

@ -2572,7 +2572,7 @@ fn parse_kill() {
}
#[test]
fn parse_table_colum_option_on_update() {
fn parse_table_column_option_on_update() {
let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
match mysql().verified_stmt(sql1) {
Statement::CreateTable(CreateTable { name, columns, .. }) => {