Consolidate MapAccess, and Subscript into CompoundExpr to handle the complex field access chain (#1551)

This commit is contained in:
Jax Liu 2024-12-22 22:28:44 +08:00 committed by GitHub
parent cd898cb6a4
commit 0647a4aa82
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 455 additions and 287 deletions

View file

@ -459,40 +459,6 @@ pub enum CastFormat {
ValueAtTimeZone(Value, Value),
}
/// Represents the syntax/style used in a map access.
///
/// The variant recorded on a [`MapAccessKey`] decides how that key is written
/// back out by its `Display` implementation.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum MapAccessSyntax {
/// Access using bracket notation, e.g. `mymap[mykey]`.
Bracket,
/// Access using period notation, e.g. `mymap.mykey`.
Period,
}
/// Expression used to access a value in a nested structure.
///
/// Example: `SAFE_OFFSET(0)` in
/// ```sql
/// SELECT mymap[SAFE_OFFSET(0)];
/// ```
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct MapAccessKey {
/// The expression used as the key.
pub key: Expr,
/// The notation (bracket or period) in which the key was written.
pub syntax: MapAccessSyntax,
}
impl fmt::Display for MapAccessKey {
    /// Writes the key in the notation it was recorded with:
    /// `[key]` for bracket syntax, `.key` for period syntax.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let Self { key, syntax } = self;
        match syntax {
            MapAccessSyntax::Period => write!(f, ".{}", key),
            MapAccessSyntax::Bracket => write!(f, "[{}]", key),
        }
    }
}
/// An element of a JSON path.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
@ -629,6 +595,28 @@ pub enum Expr {
Identifier(Ident),
/// Multi-part identifier, e.g. `table_alias.column` or `schema.table.col`
CompoundIdentifier(Vec<Ident>),
/// Multi-part expression access.
///
/// This structure represents an access chain in structured / nested types
/// such as maps, arrays, and lists:
/// - Array
/// - A 1-dim array `a[1]` will be represented like:
/// `CompoundFieldAccess(Ident('a'), vec![Subscript(1)])`
/// - A 2-dim array `a[1][2]` will be represented like:
/// `CompoundFieldAccess(Ident('a'), vec![Subscript(1), Subscript(2)])`
/// - Map or Struct (Bracket-style)
/// - A map `a['field1']` will be represented like:
/// `CompoundFieldAccess(Ident('a'), vec![Subscript('field1')])`
/// - A 2-dim map `a['field1']['field2']` will be represented like:
/// `CompoundFieldAccess(Ident('a'), vec![Subscript('field1'), Subscript('field2')])`
/// - Struct (Dot-style) (only takes effect when the chain contains both subscript and expr)
/// - A struct access `a[field1].field2` will be represented like:
/// `CompoundFieldAccess(Ident('a'), vec![Subscript('field1'), Ident('field2')])`
/// - If a struct access looks like `a.field1.field2`, it will be represented by `CompoundIdentifier([a, field1, field2])`
CompoundFieldAccess {
root: Box<Expr>,
access_chain: Vec<AccessExpr>,
},
/// Access data nested in a value containing semi-structured data, such as
/// the `VARIANT` type on Snowflake. for example `src:customer[0].name`.
///
@ -882,14 +870,6 @@ pub enum Expr {
data_type: DataType,
value: String,
},
/// Access a map-like object by field (e.g. `column['field']` or `column[4]`).
/// Note that depending on the dialect, struct like accesses may be
/// parsed as [`Subscript`](Self::Subscript) or [`MapAccess`](Self::MapAccess)
/// <https://clickhouse.com/docs/en/sql-reference/data-types/map/>
MapAccess {
column: Box<Expr>,
keys: Vec<MapAccessKey>,
},
/// Scalar function call e.g. `LEFT(foo, 5)`
Function(Function),
/// Arbitrary expr method call
@ -978,11 +958,6 @@ pub enum Expr {
/// ```
/// [1]: https://duckdb.org/docs/sql/data_types/map#creating-maps
Map(Map),
/// An access of nested data using subscript syntax, for example `array[2]`.
Subscript {
expr: Box<Expr>,
subscript: Box<Subscript>,
},
/// An array expression e.g. `ARRAY[1, 2]`
Array(Array),
/// An interval expression e.g. `INTERVAL '1' YEAR`
@ -1099,6 +1074,27 @@ impl fmt::Display for Subscript {
}
}
/// An element of a [`Expr::CompoundFieldAccess`] access chain.
///
/// Each element is either a dot access, which carries an expression, or a
/// bracket access, which carries a [`Subscript`]. When displayed, a dot access
/// is written as `.` followed by the expression and a bracket access is
/// written as the subscript wrapped in `[` and `]`.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum AccessExpr {
/// Accesses a field using dot notation, e.g. `foo.bar.baz`.
Dot(Expr),
/// Accesses a field or array element using bracket notation, e.g. `foo['bar']`.
Subscript(Subscript),
}
impl fmt::Display for AccessExpr {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
AccessExpr::Dot(expr) => write!(f, ".{}", expr),
AccessExpr::Subscript(subscript) => write!(f, "[{}]", subscript),
}
}
}
/// A lambda function.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
@ -1295,12 +1291,16 @@ impl fmt::Display for Expr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Expr::Identifier(s) => write!(f, "{s}"),
Expr::MapAccess { column, keys } => {
write!(f, "{column}{}", display_separated(keys, ""))
}
Expr::Wildcard(_) => f.write_str("*"),
Expr::QualifiedWildcard(prefix, _) => write!(f, "{}.*", prefix),
Expr::CompoundIdentifier(s) => write!(f, "{}", display_separated(s, ".")),
Expr::CompoundFieldAccess { root, access_chain } => {
write!(f, "{}", root)?;
for field in access_chain {
write!(f, "{}", field)?;
}
Ok(())
}
Expr::IsTrue(ast) => write!(f, "{ast} IS TRUE"),
Expr::IsNotTrue(ast) => write!(f, "{ast} IS NOT TRUE"),
Expr::IsFalse(ast) => write!(f, "{ast} IS FALSE"),
@ -1720,12 +1720,6 @@ impl fmt::Display for Expr {
Expr::Map(map) => {
write!(f, "{map}")
}
Expr::Subscript {
expr,
subscript: key,
} => {
write!(f, "{expr}[{key}]")
}
Expr::Array(set) => {
write!(f, "{set}")
}

View file

@ -20,20 +20,20 @@ use core::iter;
use crate::tokenizer::Span;
use super::{
dcl::SecondaryRoles, AlterColumnOperation, AlterIndexOperation, AlterTableOperation, Array,
Assignment, AssignmentTarget, CloseCursor, ClusteredIndex, ColumnDef, ColumnOption,
ColumnOptionDef, ConflictTarget, ConnectBy, ConstraintCharacteristics, CopySource, CreateIndex,
CreateTable, CreateTableOptions, Cte, Delete, DoUpdate, ExceptSelectItem, ExcludeSelectItem,
Expr, ExprWithAlias, Fetch, FromTable, Function, FunctionArg, FunctionArgExpr,
FunctionArgumentClause, FunctionArgumentList, FunctionArguments, GroupByExpr, HavingBound,
IlikeSelectItem, Insert, Interpolate, InterpolateExpr, Join, JoinConstraint, JoinOperator,
JsonPath, JsonPathElem, LateralView, MatchRecognizePattern, Measure, NamedWindowDefinition,
ObjectName, Offset, OnConflict, OnConflictAction, OnInsert, OrderBy, OrderByExpr, Partition,
PivotValueSource, ProjectionSelect, Query, ReferentialAction, RenameSelectItem,
ReplaceSelectElement, ReplaceSelectItem, Select, SelectInto, SelectItem, SetExpr, SqlOption,
Statement, Subscript, SymbolDefinition, TableAlias, TableAliasColumnDef, TableConstraint,
TableFactor, TableOptionsClustered, TableWithJoins, Use, Value, Values, ViewColumnDef,
WildcardAdditionalOptions, With, WithFill,
dcl::SecondaryRoles, AccessExpr, AlterColumnOperation, AlterIndexOperation,
AlterTableOperation, Array, Assignment, AssignmentTarget, CloseCursor, ClusteredIndex,
ColumnDef, ColumnOption, ColumnOptionDef, ConflictTarget, ConnectBy, ConstraintCharacteristics,
CopySource, CreateIndex, CreateTable, CreateTableOptions, Cte, Delete, DoUpdate,
ExceptSelectItem, ExcludeSelectItem, Expr, ExprWithAlias, Fetch, FromTable, Function,
FunctionArg, FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments,
GroupByExpr, HavingBound, IlikeSelectItem, Insert, Interpolate, InterpolateExpr, Join,
JoinConstraint, JoinOperator, JsonPath, JsonPathElem, LateralView, MatchRecognizePattern,
Measure, NamedWindowDefinition, ObjectName, Offset, OnConflict, OnConflictAction, OnInsert,
OrderBy, OrderByExpr, Partition, PivotValueSource, ProjectionSelect, Query, ReferentialAction,
RenameSelectItem, ReplaceSelectElement, ReplaceSelectItem, Select, SelectInto, SelectItem,
SetExpr, SqlOption, Statement, Subscript, SymbolDefinition, TableAlias, TableAliasColumnDef,
TableConstraint, TableFactor, TableOptionsClustered, TableWithJoins, Use, Value, Values,
ViewColumnDef, WildcardAdditionalOptions, With, WithFill,
};
/// Given an iterator of spans, return the [Span::union] of all spans.
@ -1262,6 +1262,9 @@ impl Spanned for Expr {
Expr::Identifier(ident) => ident.span,
Expr::CompoundIdentifier(vec) => union_spans(vec.iter().map(|i| i.span)),
Expr::CompositeAccess { expr, key } => expr.span().union(&key.span),
Expr::CompoundFieldAccess { root, access_chain } => {
union_spans(iter::once(root.span()).chain(access_chain.iter().map(|i| i.span())))
}
Expr::IsFalse(expr) => expr.span(),
Expr::IsNotFalse(expr) => expr.span(),
Expr::IsTrue(expr) => expr.span(),
@ -1336,9 +1339,6 @@ impl Spanned for Expr {
Expr::Nested(expr) => expr.span(),
Expr::Value(value) => value.span(),
Expr::TypedString { .. } => Span::empty(),
Expr::MapAccess { column, keys } => column
.span()
.union(&union_spans(keys.iter().map(|i| i.key.span()))),
Expr::Function(function) => function.span(),
Expr::GroupingSets(vec) => {
union_spans(vec.iter().flat_map(|i| i.iter().map(|k| k.span())))
@ -1434,7 +1434,6 @@ impl Spanned for Expr {
Expr::Named { .. } => Span::empty(),
Expr::Dictionary(_) => Span::empty(),
Expr::Map(_) => Span::empty(),
Expr::Subscript { expr, subscript } => expr.span().union(&subscript.span()),
Expr::Interval(interval) => interval.value.span(),
Expr::Wildcard(token) => token.0.span,
Expr::QualifiedWildcard(object_name, token) => union_spans(
@ -1473,6 +1472,15 @@ impl Spanned for Subscript {
}
}
impl Spanned for AccessExpr {
    /// The span of a chain element is the span of the expression or
    /// subscript it wraps.
    fn span(&self) -> Span {
        match self {
            Self::Subscript(index) => index.span(),
            Self::Dot(field) => field.span(),
        }
    }
}
impl Spanned for ObjectName {
fn span(&self) -> Span {
let ObjectName(segments) = self;

View file

@ -234,6 +234,10 @@ impl Dialect for SnowflakeDialect {
RESERVED_FOR_IDENTIFIER.contains(&kw)
}
}
/// Enables PartiQL-style parsing for Snowflake.
///
/// The parser consults this flag when it meets `[` after an expression and,
/// when it is set, parses the bracket as a JSON access path.
fn supports_partiql(&self) -> bool {
true
}
}
/// Parse snowflake create table statement.

View file

@ -1161,53 +1161,39 @@ impl<'a> Parser<'a> {
w_span: Span,
) -> Result<Expr, ParserError> {
match self.peek_token().token {
Token::LParen | Token::Period => {
let mut id_parts: Vec<Ident> = vec![w.to_ident(w_span)];
let mut ending_wildcard: Option<TokenWithSpan> = None;
while self.consume_token(&Token::Period) {
let next_token = self.next_token();
match next_token.token {
Token::Word(w) => id_parts.push(w.to_ident(next_token.span)),
Token::Mul => {
// Postgres explicitly allows funcnm(tablenm.*) and the
// function array_agg traverses this control flow
if dialect_of!(self is PostgreSqlDialect) {
ending_wildcard = Some(next_token);
break;
} else {
return self.expected("an identifier after '.'", next_token);
}
}
Token::SingleQuotedString(s) => id_parts.push(Ident::with_quote('\'', s)),
_ => {
return self.expected("an identifier or a '*' after '.'", next_token);
}
}
}
if let Some(wildcard_token) = ending_wildcard {
Ok(Expr::QualifiedWildcard(
ObjectName(id_parts),
AttachedToken(wildcard_token),
))
} else if self.consume_token(&Token::LParen) {
if dialect_of!(self is SnowflakeDialect | MsSqlDialect)
&& self.consume_tokens(&[Token::Plus, Token::RParen])
{
Ok(Expr::OuterJoin(Box::new(
match <[Ident; 1]>::try_from(id_parts) {
Ok([ident]) => Expr::Identifier(ident),
Err(parts) => Expr::CompoundIdentifier(parts),
},
)))
} else {
self.prev_token();
self.parse_function(ObjectName(id_parts))
}
Token::Period => {
self.parse_compound_field_access(Expr::Identifier(w.to_ident(w_span)), vec![])
}
Token::LParen => {
let id_parts = vec![w.to_ident(w_span)];
if let Some(expr) = self.parse_outer_join_expr(&id_parts) {
Ok(expr)
} else {
Ok(Expr::CompoundIdentifier(id_parts))
let mut expr = self.parse_function(ObjectName(id_parts))?;
// consume all period if it's a method chain
expr = self.try_parse_method(expr)?;
let fields = vec![];
self.parse_compound_field_access(expr, fields)
}
}
Token::LBracket if dialect_of!(self is PostgreSqlDialect | DuckDbDialect | GenericDialect | ClickHouseDialect | BigQueryDialect) =>
{
let ident = Expr::Identifier(w.to_ident(w_span));
let mut fields = vec![];
self.parse_multi_dim_subscript(&mut fields)?;
self.parse_compound_field_access(ident, fields)
}
// string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
Token::SingleQuotedString(_)
| Token::DoubleQuotedString(_)
| Token::HexStringLiteral(_)
if w.value.starts_with('_') =>
{
Ok(Expr::IntroducedString {
introducer: w.value.clone(),
value: self.parse_introduced_string_value()?,
})
}
// string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
Token::SingleQuotedString(_)
| Token::DoubleQuotedString(_)
@ -1426,6 +1412,144 @@ impl<'a> Parser<'a> {
}
}
/// Parses the remainder of an access chain that starts at `root`, such as the
/// `.b.c` of `a.b.c` or the `.b[1].c` of `a.b[1].c`.
///
/// `chain` holds the access elements the caller has already parsed (it may be
/// empty); every `.`-separated element consumed here is appended to it.
///
/// The result is decided in this order:
/// 1. If the chain ended with `.*` (accepted for `PostgreSqlDialect` only), an
///    [Expr::QualifiedWildcard] is returned.
/// 2. If the next token is `(`, the collected identifiers name either an outer
///    join marker `(+)` or a function call.
/// 3. If `root` and every chain element are plain identifiers, an
///    [Expr::CompoundIdentifier] is returned.
/// 4. If the chain is empty, `root` is returned unchanged.
/// 5. Otherwise an [Expr::CompoundFieldAccess] is returned.
///
/// For dialects where [Dialect::supports_partiql] is true, a `[` that directly
/// follows a dotted word is left unconsumed so that it can be parsed as a JSON
/// access afterwards.
///
/// # Errors
///
/// Returns a [ParserError] when the token after a `.` is not a word, a
/// single-quoted string or `*`; when `*` is used outside `PostgreSqlDialect`;
/// or when a wildcard or `(` follows a chain that is not made up of
/// identifiers only.
pub fn parse_compound_field_access(
    &mut self,
    root: Expr,
    mut chain: Vec<AccessExpr>,
) -> Result<Expr, ParserError> {
    let mut ending_wildcard: Option<TokenWithSpan> = None;
    let mut ending_lbracket = false;
    while self.consume_token(&Token::Period) {
        let next_token = self.next_token();
        match next_token.token {
            Token::Word(w) => {
                let expr = Expr::Identifier(w.to_ident(next_token.span));
                chain.push(AccessExpr::Dot(expr));
                if self.peek_token().token == Token::LBracket {
                    if self.dialect.supports_partiql() {
                        self.next_token();
                        ending_lbracket = true;
                        break;
                    } else {
                        self.parse_multi_dim_subscript(&mut chain)?
                    }
                }
            }
            Token::Mul => {
                // Postgres explicitly allows funcnm(tablenm.*) and the
                // function array_agg traverses this control flow
                if dialect_of!(self is PostgreSqlDialect) {
                    ending_wildcard = Some(next_token);
                    break;
                } else {
                    return self.expected("an identifier after '.'", next_token);
                }
            }
            Token::SingleQuotedString(s) => {
                let expr = Expr::Identifier(Ident::with_quote('\'', s));
                chain.push(AccessExpr::Dot(expr));
            }
            _ => {
                return self.expected("an identifier or a '*' after '.'", next_token);
            }
        }
    }

    // `ending_lbracket` is only ever set for dialects that support PartiQL, so
    // the flag alone tells us to step back over the `[` consumed above and
    // leave it for the JSON access parsing.
    if ending_lbracket {
        self.prev_token();
    }

    if let Some(wildcard_token) = ending_wildcard {
        if !Self::is_all_ident(&root, &chain) {
            return self.expected("an identifier or a '*' after '.'", self.peek_token());
        };
        Ok(Expr::QualifiedWildcard(
            ObjectName(Self::exprs_to_idents(root, chain)?),
            AttachedToken(wildcard_token),
        ))
    } else if self.peek_token().token == Token::LParen {
        if !Self::is_all_ident(&root, &chain) {
            // consume LParen
            self.next_token();
            return self.expected("an identifier or a '*' after '.'", self.peek_token());
        };
        let id_parts = Self::exprs_to_idents(root, chain)?;
        if let Some(expr) = self.parse_outer_join_expr(&id_parts) {
            Ok(expr)
        } else {
            self.parse_function(ObjectName(id_parts))
        }
    } else if Self::is_all_ident(&root, &chain) {
        Ok(Expr::CompoundIdentifier(Self::exprs_to_idents(
            root, chain,
        )?))
    } else if chain.is_empty() {
        Ok(root)
    } else {
        // `chain` is owned and not used afterwards, so it is moved rather
        // than cloned.
        Ok(Expr::CompoundFieldAccess {
            root: Box::new(root),
            access_chain: chain,
        })
    }
}
/// Returns `true` only when `root` is an [Expr::Identifier] and every element
/// of `fields` is a dot access wrapping an [Expr::Identifier].
fn is_all_ident(root: &Expr, fields: &[AccessExpr]) -> bool {
    match root {
        Expr::Identifier(_) => fields
            .iter()
            .all(|field| matches!(field, AccessExpr::Dot(Expr::Identifier(_)))),
        _ => false,
    }
}
/// Flattens `root` followed by `fields` into a list of identifiers.
///
/// Fails with a [ParserError] at the first element (starting with `root`)
/// that is not a plain identifier.
fn exprs_to_idents(root: Expr, fields: Vec<AccessExpr>) -> Result<Vec<Ident>, ParserError> {
    let head = match root {
        Expr::Identifier(ident) => ident,
        other => {
            return parser_err!(
                format!("Expected identifier, found: {}", other),
                other.span().start
            );
        }
    };

    let mut idents = vec![head];
    for field in fields {
        match field {
            AccessExpr::Dot(Expr::Identifier(ident)) => idents.push(ident),
            other => {
                return parser_err!(
                    format!("Expected identifier, found: {}", other),
                    other.span().start
                );
            }
        }
    }
    Ok(idents)
}
/// Attempts to consume an outer join marker `(+)` following `id_parts`.
///
/// Only tried for `SnowflakeDialect` and `MsSqlDialect`. On success the
/// identifiers are wrapped in an [Expr::OuterJoin]; otherwise `None` is
/// returned.
fn parse_outer_join_expr(&mut self, id_parts: &[Ident]) -> Option<Expr> {
    let is_outer_join = dialect_of!(self is SnowflakeDialect | MsSqlDialect)
        && self.consume_tokens(&[Token::LParen, Token::Plus, Token::RParen]);
    if !is_outer_join {
        return None;
    }

    // A single part is a plain identifier; anything else is a compound one.
    let target = match id_parts {
        [ident] => Expr::Identifier(ident.clone()),
        parts => Expr::CompoundIdentifier(parts.to_vec()),
    };
    Some(Expr::OuterJoin(Box::new(target)))
}
pub fn parse_utility_options(&mut self) -> Result<Vec<UtilityOption>, ParserError> {
self.expect_token(&Token::LParen)?;
let options = self.parse_comma_separated(Self::parse_utility_option)?;
@ -3042,13 +3166,18 @@ impl<'a> Parser<'a> {
expr: Box::new(expr),
})
} else if Token::LBracket == tok {
if dialect_of!(self is PostgreSqlDialect | DuckDbDialect | GenericDialect) {
self.parse_subscript(expr)
} else if dialect_of!(self is SnowflakeDialect) || self.dialect.supports_partiql() {
if dialect_of!(self is PostgreSqlDialect | DuckDbDialect | GenericDialect | ClickHouseDialect | BigQueryDialect)
{
let mut chain = vec![];
// back to LBracket
self.prev_token();
self.parse_multi_dim_subscript(&mut chain)?;
self.parse_compound_field_access(expr, chain)
} else if self.dialect.supports_partiql() {
self.prev_token();
self.parse_json_access(expr)
} else {
self.parse_map_access(expr)
parser_err!("Array subscripting is not supported", tok.span.start)
}
} else if dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == tok {
self.prev_token();
@ -3144,15 +3273,24 @@ impl<'a> Parser<'a> {
})
}
/// Parses consecutive bracketed subscripts such as `[1:3][1][1]`, appending
/// one element to `chain` per bracket pair. Stops at the first token that is
/// not `[`.
pub fn parse_multi_dim_subscript(
    &mut self,
    chain: &mut Vec<AccessExpr>,
) -> Result<(), ParserError> {
    loop {
        if !self.consume_token(&Token::LBracket) {
            return Ok(());
        }
        self.parse_subscript(chain)?;
    }
}
/// Parses an array subscript like `[1:3]`
///
/// Parser is right after `[`
pub fn parse_subscript(&mut self, expr: Expr) -> Result<Expr, ParserError> {
fn parse_subscript(&mut self, chain: &mut Vec<AccessExpr>) -> Result<(), ParserError> {
let subscript = self.parse_subscript_inner()?;
Ok(Expr::Subscript {
expr: Box::new(expr),
subscript: Box::new(subscript),
})
chain.push(AccessExpr::Subscript(subscript));
Ok(())
}
fn parse_json_path_object_key(&mut self) -> Result<JsonPathElem, ParserError> {
@ -3214,46 +3352,6 @@ impl<'a> Parser<'a> {
Ok(JsonPath { path })
}
/// Parses a chain of map keys applied to `expr` into an [Expr::MapAccess].
///
/// The first key is read as an expression followed by `]`, so the parser is
/// expected to be positioned right after the opening `[`. Any number of
/// further `[key]` accesses may follow; for `BigQueryDialect`, `.key`
/// accesses are accepted as well.
pub fn parse_map_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
    let first_key = self.parse_expr()?;
    self.expect_token(&Token::RBracket)?;

    let mut keys = vec![MapAccessKey {
        key: first_key,
        syntax: MapAccessSyntax::Bracket,
    }];

    loop {
        match self.peek_token().token {
            Token::LBracket => {
                self.next_token(); // consume `[`
                let key = self.parse_expr()?;
                self.expect_token(&Token::RBracket)?;
                keys.push(MapAccessKey {
                    key,
                    syntax: MapAccessSyntax::Bracket,
                });
            }
            // Access on BigQuery nested and repeated expressions can
            // mix notations in the same expression.
            // https://cloud.google.com/bigquery/docs/nested-repeated#query_nested_and_repeated_columns
            Token::Period if dialect_of!(self is BigQueryDialect) => {
                self.next_token(); // consume `.`
                let key = self.parse_expr()?;
                keys.push(MapAccessKey {
                    key,
                    syntax: MapAccessSyntax::Period,
                });
            }
            _ => break,
        }
    }

    Ok(Expr::MapAccess {
        column: Box::new(expr),
        keys,
    })
}
/// Parses the parens following the `[ NOT ] IN` operator.
pub fn parse_in(&mut self, expr: Expr, negated: bool) -> Result<Expr, ParserError> {
// BigQuery allows `IN UNNEST(array_expression)`