mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-11-11 19:43:47 +00:00
Improve parsing of JSON accesses on Postgres and Snowflake (#1215)
Co-authored-by: Ifeanyi Ubah <ify1992@yahoo.com>
This commit is contained in:
parent
0606024353
commit
4bfa399919
7 changed files with 432 additions and 199 deletions
141
src/ast/mod.rs
141
src/ast/mod.rs
|
|
@ -51,7 +51,8 @@ pub use self::query::{
|
|||
Top, TopQuantity, ValueTableMode, Values, WildcardAdditionalOptions, With,
|
||||
};
|
||||
pub use self::value::{
|
||||
escape_quoted_string, DateTimeField, DollarQuotedString, TrimWhereField, Value,
|
||||
escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
|
||||
TrimWhereField, Value,
|
||||
};
|
||||
|
||||
use crate::ast::helpers::stmt_data_loading::{
|
||||
|
|
@ -270,66 +271,6 @@ impl fmt::Display for Interval {
|
|||
}
|
||||
}
|
||||
|
||||
/// JsonOperator
|
||||
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
|
||||
pub enum JsonOperator {
|
||||
/// -> keeps the value as json
|
||||
Arrow,
|
||||
/// ->> keeps the value as text or int.
|
||||
LongArrow,
|
||||
/// #> Extracts JSON sub-object at the specified path
|
||||
HashArrow,
|
||||
/// #>> Extracts JSON sub-object at the specified path as text
|
||||
HashLongArrow,
|
||||
/// : Colon is used by Snowflake (Which is similar to LongArrow)
|
||||
Colon,
|
||||
/// jsonb @> jsonb -> boolean: Test whether left json contains the right json
|
||||
AtArrow,
|
||||
/// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
|
||||
ArrowAt,
|
||||
/// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
|
||||
/// path, where path elements can be either field keys or array indexes.
|
||||
HashMinus,
|
||||
/// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
|
||||
/// JSON value?
|
||||
AtQuestion,
|
||||
/// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
|
||||
/// for the specified JSON value. Only the first item of the result is taken into
|
||||
/// account. If the result is not Boolean, then NULL is returned.
|
||||
AtAt,
|
||||
}
|
||||
|
||||
impl fmt::Display for JsonOperator {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
JsonOperator::Arrow => {
|
||||
write!(f, "->")
|
||||
}
|
||||
JsonOperator::LongArrow => {
|
||||
write!(f, "->>")
|
||||
}
|
||||
JsonOperator::HashArrow => {
|
||||
write!(f, "#>")
|
||||
}
|
||||
JsonOperator::HashLongArrow => {
|
||||
write!(f, "#>>")
|
||||
}
|
||||
JsonOperator::Colon => {
|
||||
write!(f, ":")
|
||||
}
|
||||
JsonOperator::AtArrow => {
|
||||
write!(f, "@>")
|
||||
}
|
||||
JsonOperator::ArrowAt => write!(f, "<@"),
|
||||
JsonOperator::HashMinus => write!(f, "#-"),
|
||||
JsonOperator::AtQuestion => write!(f, "@?"),
|
||||
JsonOperator::AtAt => write!(f, "@@"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A field definition within a struct.
|
||||
///
|
||||
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
|
||||
|
|
@ -412,6 +353,59 @@ impl fmt::Display for MapAccessKey {
|
|||
}
|
||||
}
|
||||
|
||||
/// An element of a JSON path.
///
/// A path element is either a dot-notation key ([`JsonPathElem::Dot`]) or a
/// bracket-notation key ([`JsonPathElem::Bracket`]).
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
|
||||
pub enum JsonPathElem {
|
||||
/// Accesses an object field using dot notation, e.g. `obj:foo.bar.baz`.
|
||||
///
|
||||
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation>.
|
||||
///
|
||||
/// `key` is the field name. `quoted` records whether the key was written
|
||||
/// in double quotes; when it is `true` the key is displayed wrapped in
|
||||
/// double quotes, with the key text escaped.
|
||||
Dot { key: String, quoted: bool },
|
||||
/// Accesses an object field or array element using bracket notation,
|
||||
/// e.g. `obj['foo']`.
|
||||
|
|
||||
///
|
||||
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured#bracket-notation>.
|
||||
///
|
||||
/// `key` is the expression written between the brackets.
|
||||
Bracket { key: Expr },
|
||||
}
|
||||
|
||||
/// A JSON path.
|
||||
///
|
||||
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured>.
|
||||
/// See <https://docs.databricks.com/en/sql/language-manual/sql-ref-json-path-expression.html>.
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
|
||||
pub struct JsonPath {
|
||||
/// The elements of the path, in the order they were written.
|
||||
pub path: Vec<JsonPathElem>,
|
||||
}
|
||||
|
||||
/// Renders the path as SQL text, element by element and with no separating
/// whitespace.
///
/// * A `Dot` element is prefixed with `:` when it is the first element of
///   the path and with `.` otherwise.
/// * A `Bracket` element is written as `[key]`, with no prefix regardless of
///   its position.
impl fmt::Display for JsonPath {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
for (i, elem) in self.path.iter().enumerate() {
|
||||
match elem {
|
||||
JsonPathElem::Dot { key, quoted } => {
|
||||
// Only the very first path element uses the `:` introducer;
// later dot keys are chained with `.`.
if i == 0 {
|
||||
write!(f, ":")?;
|
||||
} else {
|
||||
write!(f, ".")?;
|
||||
}
|
||||
|
||||
// Quoted keys are written back inside double quotes, with the key
// text passed through `escape_double_quote_string`.
if *quoted {
|
||||
write!(f, "\"{}\"", escape_double_quote_string(key))?;
|
||||
} else {
|
||||
write!(f, "{key}")?;
|
||||
}
|
||||
}
|
||||
JsonPathElem::Bracket { key } => {
|
||||
// The bracket key is an expression and is rendered with its own
// `Display` implementation.
write!(f, "[{key}]")?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// The syntax used in a cast expression.
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
|
|
@ -449,11 +443,16 @@ pub enum Expr {
|
|||
Identifier(Ident),
|
||||
/// Multi-part identifier, e.g. `table_alias.column` or `schema.table.col`
|
||||
CompoundIdentifier(Vec<Ident>),
|
||||
/// JSON access (postgres) eg: data->'tags'
|
||||
/// Access data nested in a value containing semi-structured data, such as
|
||||
/// the `VARIANT` type on Snowflake. For example `src:customer[0].name`.
|
||||
///
|
||||
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured>.
|
||||
/// See <https://docs.databricks.com/en/sql/language-manual/functions/colonsign.html>.
|
||||
JsonAccess {
|
||||
left: Box<Expr>,
|
||||
operator: JsonOperator,
|
||||
right: Box<Expr>,
|
||||
/// The value being queried.
|
||||
value: Box<Expr>,
|
||||
/// The path to the data to extract.
|
||||
path: JsonPath,
|
||||
},
|
||||
/// CompositeAccess (postgres) eg: SELECT (information_schema._pg_expandarray(array['i','i'])).n
|
||||
CompositeAccess {
|
||||
|
|
@ -1224,16 +1223,8 @@ impl fmt::Display for Expr {
|
|||
Expr::Array(set) => {
|
||||
write!(f, "{set}")
|
||||
}
|
||||
Expr::JsonAccess {
|
||||
left,
|
||||
operator,
|
||||
right,
|
||||
} => {
|
||||
if operator == &JsonOperator::Colon {
|
||||
write!(f, "{left}{operator}{right}")
|
||||
} else {
|
||||
write!(f, "{left} {operator} {right}")
|
||||
}
|
||||
Expr::JsonAccess { value, path } => {
|
||||
write!(f, "{value}{path}")
|
||||
}
|
||||
Expr::CompositeAccess { expr, key } => {
|
||||
write!(f, "{expr}.{key}")
|
||||
|
|
|
|||
|
|
@ -141,6 +141,79 @@ pub enum BinaryOperator {
|
|||
PGNotILikeMatch,
|
||||
/// String "starts with", eg: `a ^@ b` (PostgreSQL-specific)
|
||||
PGStartsWith,
|
||||
/// The `->` operator.
|
||||
///
|
||||
/// On PostgreSQL, this operator extracts a JSON object field or array
|
||||
/// element, for example `'{"a":"b"}'::json -> 'a'` or `'[1, 2, 3]'::json
|
||||
/// -> 2`.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
Arrow,
|
||||
/// The `->>` operator.
|
||||
///
|
||||
/// On PostgreSQL, this operator extracts a JSON object field or JSON
|
||||
/// array element and converts it to text, for example `'{"a":"b"}'::json
|
||||
/// ->> 'a'` or `'[1, 2, 3]'::json ->> 2`.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
LongArrow,
|
||||
/// The `#>` operator.
|
||||
///
|
||||
/// On PostgreSQL, this operator extracts a JSON sub-object at the specified
|
||||
/// path, for example:
|
||||
///
|
||||
/// ```notrust
|
||||
/// '{"a": {"b": ["foo","bar"]}}'::json #> '{a,b,1}'
|
||||
/// ```
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
HashArrow,
|
||||
/// The `#>>` operator.
|
||||
///
|
||||
/// A PostgreSQL-specific operator that extracts JSON sub-object at the
|
||||
/// specified path, for example
|
||||
///
|
||||
/// ```notrust
|
||||
/// '{"a": {"b": ["foo","bar"]}}'::json #>> '{a,b,1}'
|
||||
/// ```
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
HashLongArrow,
|
||||
/// The `@@` operator.
|
||||
///
|
||||
/// On PostgreSQL, this is used for JSON and text searches.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
|
||||
AtAt,
|
||||
/// The `@>` operator.
|
||||
///
|
||||
/// On PostgreSQL, this is used for JSON and text searches.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
|
||||
AtArrow,
|
||||
/// The `<@` operator.
|
||||
///
|
||||
/// On PostgreSQL, this is used for JSON and text searches.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
|
||||
ArrowAt,
|
||||
/// The `#-` operator.
|
||||
///
|
||||
/// On PostgreSQL, this operator is used to delete a field or array element
|
||||
/// at a specified path.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
HashMinus,
|
||||
/// The `@?` operator.
|
||||
///
|
||||
/// On PostgreSQL, this operator is used to check the given JSON path
|
||||
/// returns an item for the JSON value.
|
||||
///
|
||||
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
|
||||
AtQuestion,
|
||||
/// PostgreSQL-specific custom operator.
|
||||
///
|
||||
/// See [CREATE OPERATOR](https://www.postgresql.org/docs/current/sql-createoperator.html)
|
||||
|
|
@ -187,6 +260,15 @@ impl fmt::Display for BinaryOperator {
|
|||
BinaryOperator::PGNotLikeMatch => f.write_str("!~~"),
|
||||
BinaryOperator::PGNotILikeMatch => f.write_str("!~~*"),
|
||||
BinaryOperator::PGStartsWith => f.write_str("^@"),
|
||||
BinaryOperator::Arrow => f.write_str("->"),
|
||||
BinaryOperator::LongArrow => f.write_str("->>"),
|
||||
BinaryOperator::HashArrow => f.write_str("#>"),
|
||||
BinaryOperator::HashLongArrow => f.write_str("#>>"),
|
||||
BinaryOperator::AtAt => f.write_str("@@"),
|
||||
BinaryOperator::AtArrow => f.write_str("@>"),
|
||||
BinaryOperator::ArrowAt => f.write_str("<@"),
|
||||
BinaryOperator::HashMinus => f.write_str("#-"),
|
||||
BinaryOperator::AtQuestion => f.write_str("@?"),
|
||||
BinaryOperator::PGCustomBinaryOperator(idents) => {
|
||||
write!(f, "OPERATOR({})", display_separated(idents, "."))
|
||||
}
|
||||
|
|
|
|||
|
|
@ -65,8 +65,6 @@ pub enum Value {
|
|||
Null,
|
||||
/// `?` or `$` Prepared statement arg placeholder
|
||||
Placeholder(String),
|
||||
/// Add support of snowflake field:key - key should be a value
|
||||
UnQuotedString(String),
|
||||
}
|
||||
|
||||
impl fmt::Display for Value {
|
||||
|
|
@ -85,7 +83,6 @@ impl fmt::Display for Value {
|
|||
Value::RawStringLiteral(v) => write!(f, "R'{v}'"),
|
||||
Value::Null => write!(f, "NULL"),
|
||||
Value::Placeholder(v) => write!(f, "{v}"),
|
||||
Value::UnQuotedString(v) => write!(f, "{v}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2346,6 +2346,16 @@ impl<'a> Parser<'a> {
|
|||
Token::DoubleTildeAsterisk => Some(BinaryOperator::PGILikeMatch),
|
||||
Token::ExclamationMarkDoubleTilde => Some(BinaryOperator::PGNotLikeMatch),
|
||||
Token::ExclamationMarkDoubleTildeAsterisk => Some(BinaryOperator::PGNotILikeMatch),
|
||||
Token::Arrow => Some(BinaryOperator::Arrow),
|
||||
Token::LongArrow => Some(BinaryOperator::LongArrow),
|
||||
Token::HashArrow => Some(BinaryOperator::HashArrow),
|
||||
Token::HashLongArrow => Some(BinaryOperator::HashLongArrow),
|
||||
Token::AtArrow => Some(BinaryOperator::AtArrow),
|
||||
Token::ArrowAt => Some(BinaryOperator::ArrowAt),
|
||||
Token::HashMinus => Some(BinaryOperator::HashMinus),
|
||||
Token::AtQuestion => Some(BinaryOperator::AtQuestion),
|
||||
Token::AtAt => Some(BinaryOperator::AtAt),
|
||||
|
||||
Token::Word(w) => match w.keyword {
|
||||
Keyword::AND => Some(BinaryOperator::And),
|
||||
Keyword::OR => Some(BinaryOperator::Or),
|
||||
|
|
@ -2539,42 +2549,16 @@ impl<'a> Parser<'a> {
|
|||
} else if Token::LBracket == tok {
|
||||
if dialect_of!(self is PostgreSqlDialect | GenericDialect) {
|
||||
// parse index
|
||||
return self.parse_array_index(expr);
|
||||
self.parse_array_index(expr)
|
||||
} else if dialect_of!(self is SnowflakeDialect) {
|
||||
self.prev_token();
|
||||
self.parse_json_access(expr)
|
||||
} else {
|
||||
self.parse_map_access(expr)
|
||||
}
|
||||
self.parse_map_access(expr)
|
||||
} else if Token::Colon == tok {
|
||||
Ok(Expr::JsonAccess {
|
||||
left: Box::new(expr),
|
||||
operator: JsonOperator::Colon,
|
||||
right: Box::new(Expr::Value(self.parse_value()?)),
|
||||
})
|
||||
} else if Token::Arrow == tok
|
||||
|| Token::LongArrow == tok
|
||||
|| Token::HashArrow == tok
|
||||
|| Token::HashLongArrow == tok
|
||||
|| Token::AtArrow == tok
|
||||
|| Token::ArrowAt == tok
|
||||
|| Token::HashMinus == tok
|
||||
|| Token::AtQuestion == tok
|
||||
|| Token::AtAt == tok
|
||||
{
|
||||
let operator = match tok.token {
|
||||
Token::Arrow => JsonOperator::Arrow,
|
||||
Token::LongArrow => JsonOperator::LongArrow,
|
||||
Token::HashArrow => JsonOperator::HashArrow,
|
||||
Token::HashLongArrow => JsonOperator::HashLongArrow,
|
||||
Token::AtArrow => JsonOperator::AtArrow,
|
||||
Token::ArrowAt => JsonOperator::ArrowAt,
|
||||
Token::HashMinus => JsonOperator::HashMinus,
|
||||
Token::AtQuestion => JsonOperator::AtQuestion,
|
||||
Token::AtAt => JsonOperator::AtAt,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
Ok(Expr::JsonAccess {
|
||||
left: Box::new(expr),
|
||||
operator,
|
||||
right: Box::new(self.parse_expr()?),
|
||||
})
|
||||
} else if dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == tok {
|
||||
self.prev_token();
|
||||
self.parse_json_access(expr)
|
||||
} else {
|
||||
// Can only happen if `get_next_precedence` got out of sync with this function
|
||||
parser_err!(
|
||||
|
|
@ -2608,6 +2592,60 @@ impl<'a> Parser<'a> {
|
|||
})
|
||||
}
|
||||
|
||||
/// Parses a single object key of a JSON path in dot notation and returns it
/// as a [`JsonPathElem::Dot`].
///
/// Consumes exactly one token. Accepted tokens are:
///
/// * a word that is unquoted or double quoted, whatever its keyword
///   classification; `quoted` is set when a quote style is present;
/// * a double-quoted string token, which always yields `quoted: true`.
///
/// # Errors
///
/// Any other token, including a word quoted with a different quote
/// character, is reported through `self.expected` with the message
/// "variant object key name".
fn parse_json_path_object_key(&mut self) -> Result<JsonPathElem, ParserError> {
|
||||
let token = self.next_token();
|
||||
match token.token {
|
||||
Token::Word(Word {
|
||||
value,
|
||||
// path segments in SF dot notation can be unquoted or double quoted
|
||||
quote_style: quote_style @ (Some('"') | None),
|
||||
// some experimentation suggests that snowflake permits
|
||||
// any keyword here unquoted.
|
||||
keyword: _,
|
||||
}) => Ok(JsonPathElem::Dot {
|
||||
key: value,
|
||||
quoted: quote_style.is_some(),
|
||||
}),
|
||||
|
||||
// This token should never be generated on snowflake or generic
|
||||
// dialects, but we handle it just in case this is used on future
|
||||
// dialects.
|
||||
Token::DoubleQuotedString(key) => Ok(JsonPathElem::Dot { key, quoted: true }),
|
||||
|
||||
_ => self.expected("variant object key name", token),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a JSON path that follows `expr` and returns both wrapped in an
/// [`Expr::JsonAccess`].
///
/// The parser must be positioned on the first token of the path. The callers
/// visible in this file rewind with `prev_token()` so that the `:` or `[`
/// they matched is read again here.
///
/// Path elements are consumed in a loop:
///
/// * `:` followed by an object key, accepted only as the first element;
/// * `.` followed by an object key, accepted only after at least one element
///   has been parsed;
/// * `[` expression `]`, accepted at any position.
///
/// Any other token, including a `:` or `.` that fails its guard, is pushed
/// back and ends the path.
///
/// # Errors
///
/// Propagates errors from parsing an object key or a bracket expression, and
/// from a missing closing `]`.
fn parse_json_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
|
||||
let mut path = Vec::new();
|
||||
loop {
|
||||
match self.next_token().token {
|
||||
// `:` may only introduce the path, never continue it.
Token::Colon if path.is_empty() => {
|
||||
path.push(self.parse_json_path_object_key()?);
|
||||
}
|
||||
// `.` may only continue a path that has already been started.
Token::Period if !path.is_empty() => {
|
||||
path.push(self.parse_json_path_object_key()?);
|
||||
}
|
||||
Token::LBracket => {
|
||||
let key = self.parse_expr()?;
|
||||
self.expect_token(&Token::RBracket)?;
|
||||
|
||||
path.push(JsonPathElem::Bracket { key });
|
||||
}
|
||||
_ => {
|
||||
// Not part of the path: un-read the token for the caller and stop.
self.prev_token();
|
||||
break;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// Checked in debug builds only. NOTE(review): an empty path here would mean
// the caller invoked this without a leading `:` or `[` — confirm that all
// call sites rewind to one of those tokens.
debug_assert!(!path.is_empty());
|
||||
Ok(Expr::JsonAccess {
|
||||
value: Box::new(expr),
|
||||
path: JsonPath { path },
|
||||
})
|
||||
}
|
||||
|
||||
pub fn parse_map_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
|
||||
let key = self.parse_expr()?;
|
||||
self.expect_token(&Token::RBracket)?;
|
||||
|
|
@ -2711,6 +2749,7 @@ impl<'a> Parser<'a> {
|
|||
}
|
||||
|
||||
// use https://www.postgresql.org/docs/7.0/operators.htm#AEN2026 as a reference
|
||||
// higher number = higher precedence
|
||||
const MUL_DIV_MOD_OP_PREC: u8 = 40;
|
||||
const PLUS_MINUS_PREC: u8 = 30;
|
||||
const XOR_PREC: u8 = 24;
|
||||
|
|
@ -2718,6 +2757,7 @@ impl<'a> Parser<'a> {
|
|||
const BETWEEN_PREC: u8 = 20;
|
||||
const LIKE_PREC: u8 = 19;
|
||||
const IS_PREC: u8 = 17;
|
||||
const PG_OTHER_PREC: u8 = 16;
|
||||
const UNARY_NOT_PREC: u8 = 15;
|
||||
const AND_PREC: u8 = 10;
|
||||
const OR_PREC: u8 = 5;
|
||||
|
|
@ -2802,18 +2842,16 @@ impl<'a> Parser<'a> {
|
|||
Token::DoubleColon => Ok(50),
|
||||
Token::Colon => Ok(50),
|
||||
Token::ExclamationMark => Ok(50),
|
||||
Token::LBracket
|
||||
Token::LBracket | Token::Overlap | Token::CaretAt => Ok(50),
|
||||
Token::Arrow
|
||||
| Token::LongArrow
|
||||
| Token::Arrow
|
||||
| Token::Overlap
|
||||
| Token::CaretAt
|
||||
| Token::HashArrow
|
||||
| Token::HashLongArrow
|
||||
| Token::AtArrow
|
||||
| Token::ArrowAt
|
||||
| Token::HashMinus
|
||||
| Token::AtQuestion
|
||||
| Token::AtAt => Ok(50),
|
||||
| Token::AtAt => Ok(Self::PG_OTHER_PREC),
|
||||
_ => Ok(0),
|
||||
}
|
||||
}
|
||||
|
|
@ -6236,17 +6274,6 @@ impl<'a> Parser<'a> {
|
|||
},
|
||||
)?,
|
||||
},
|
||||
// Case when Snowflake Semi-structured data like key:value
|
||||
Keyword::NoKeyword
|
||||
| Keyword::LOCATION
|
||||
| Keyword::TYPE
|
||||
| Keyword::DATE
|
||||
| Keyword::START
|
||||
| Keyword::END
|
||||
if dialect_of!(self is SnowflakeDialect | GenericDialect) =>
|
||||
{
|
||||
Ok(Value::UnQuotedString(w.value))
|
||||
}
|
||||
_ => self.expected(
|
||||
"a concrete value",
|
||||
TokenWithLocation {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue