Improve parsing of JSON accesses on Postgres and Snowflake (#1215)

Co-authored-by: Ifeanyi Ubah <ify1992@yahoo.com>
This commit is contained in:
Joey Hain 2024-04-30 07:49:05 -07:00 committed by GitHub
parent 0606024353
commit 4bfa399919
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 432 additions and 199 deletions

View file

@ -51,7 +51,8 @@ pub use self::query::{
Top, TopQuantity, ValueTableMode, Values, WildcardAdditionalOptions, With,
};
pub use self::value::{
escape_quoted_string, DateTimeField, DollarQuotedString, TrimWhereField, Value,
escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
TrimWhereField, Value,
};
use crate::ast::helpers::stmt_data_loading::{
@ -270,66 +271,6 @@ impl fmt::Display for Interval {
}
}
/// Operators that access, transform, or test JSON values.
///
/// The [`fmt::Display`] implementation renders each variant as its SQL symbol.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum JsonOperator {
    /// `->`: keeps the value as json.
    Arrow,
    /// `->>`: keeps the value as text or int.
    LongArrow,
    /// `#>`: extracts the JSON sub-object at the specified path.
    HashArrow,
    /// `#>>`: extracts the JSON sub-object at the specified path as text.
    HashLongArrow,
    /// `:`: used by Snowflake (similar to [`JsonOperator::LongArrow`]).
    Colon,
    /// `jsonb @> jsonb -> boolean`: tests whether the left json contains the
    /// right json.
    AtArrow,
    /// `jsonb <@ jsonb -> boolean`: tests whether the right json contains the
    /// left json.
    ArrowAt,
    /// `jsonb #- text[] -> jsonb`: deletes the field or array element at the
    /// specified path, where path elements can be either field keys or array
    /// indexes.
    HashMinus,
    /// `jsonb @? jsonpath -> boolean`: does the JSON path return any item for
    /// the specified JSON value?
    AtQuestion,
    /// `jsonb @@ jsonpath -> boolean`: returns the result of a JSON path
    /// predicate check for the specified JSON value. Only the first item of
    /// the result is taken into account. If the result is not Boolean, then
    /// NULL is returned.
    AtAt,
}

impl fmt::Display for JsonOperator {
    /// Writes the operator's SQL symbol, with no surrounding whitespace.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = match self {
            Self::Arrow => "->",
            Self::LongArrow => "->>",
            Self::HashArrow => "#>",
            Self::HashLongArrow => "#>>",
            Self::Colon => ":",
            Self::AtArrow => "@>",
            Self::ArrowAt => "<@",
            Self::HashMinus => "#-",
            Self::AtQuestion => "@?",
            Self::AtAt => "@@",
        };
        f.write_str(symbol)
    }
}
/// A field definition within a struct.
///
/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
@ -412,6 +353,59 @@ impl fmt::Display for MapAccessKey {
}
}
/// An element of a JSON path.
///
/// A [`JsonPath`] is an ordered list of these elements.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum JsonPathElem {
/// Accesses an object field using dot notation, e.g. `obj:foo.bar.baz`.
///
/// `key` is the field name without surrounding quotes. `quoted` records
/// whether the key was written in double quotes; when it is `true` the key
/// is displayed wrapped in double quotes (with escaping), otherwise it is
/// displayed verbatim.
///
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured#dot-notation>.
Dot { key: String, quoted: bool },
/// Accesses an object field or array element using bracket notation,
/// e.g. `obj['foo']`.
///
/// `key` is the expression written between the brackets.
///
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured#bracket-notation>.
Bracket { key: Expr },
}
/// A JSON path.
///
/// When displayed, a dot-notation element in first position is introduced by
/// `:` and every later dot-notation element by `.`; bracket elements are
/// written as `[key]`.
///
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured>.
/// See <https://docs.databricks.com/en/sql/language-manual/sql-ref-json-path-expression.html>.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct JsonPath {
/// The elements of the path, in the order they are written.
pub path: Vec<JsonPathElem>,
}
impl fmt::Display for JsonPath {
    /// Writes the path elements back to back, with no separating whitespace.
    ///
    /// A dot-notation element is prefixed with `:` when it is the very first
    /// element of the path and with `.` otherwise; bracket elements are
    /// written as `[key]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Only the first element of the whole path may use the `:` prefix.
        let mut is_first = true;
        for elem in &self.path {
            match elem {
                JsonPathElem::Bracket { key } => write!(f, "[{key}]")?,
                JsonPathElem::Dot { key, quoted } => {
                    let prefix = if is_first { ":" } else { "." };
                    write!(f, "{prefix}")?;
                    if *quoted {
                        // Quoted keys are re-quoted, escaping embedded quotes.
                        write!(f, "\"{}\"", escape_double_quote_string(key))?;
                    } else {
                        write!(f, "{key}")?;
                    }
                }
            }
            is_first = false;
        }
        Ok(())
    }
}
/// The syntax used in a cast expression.
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
@ -449,11 +443,16 @@ pub enum Expr {
Identifier(Ident),
/// Multi-part identifier, e.g. `table_alias.column` or `schema.table.col`
CompoundIdentifier(Vec<Ident>),
/// JSON access (postgres) eg: data->'tags'
/// Access data nested in a value containing semi-structured data, such as
/// the `VARIANT` type on Snowflake. For example `src:customer[0].name`.
///
/// See <https://docs.snowflake.com/en/user-guide/querying-semistructured>.
/// See <https://docs.databricks.com/en/sql/language-manual/functions/colonsign.html>.
JsonAccess {
left: Box<Expr>,
operator: JsonOperator,
right: Box<Expr>,
/// The value being queried.
value: Box<Expr>,
/// The path to the data to extract.
path: JsonPath,
},
/// CompositeAccess (postgres) eg: SELECT (information_schema._pg_expandarray(array['i','i'])).n
CompositeAccess {
@ -1224,16 +1223,8 @@ impl fmt::Display for Expr {
Expr::Array(set) => {
write!(f, "{set}")
}
Expr::JsonAccess {
left,
operator,
right,
} => {
if operator == &JsonOperator::Colon {
write!(f, "{left}{operator}{right}")
} else {
write!(f, "{left} {operator} {right}")
}
Expr::JsonAccess { value, path } => {
write!(f, "{value}{path}")
}
Expr::CompositeAccess { expr, key } => {
write!(f, "{expr}.{key}")

View file

@ -141,6 +141,79 @@ pub enum BinaryOperator {
PGNotILikeMatch,
/// String "starts with", eg: `a ^@ b` (PostgreSQL-specific)
PGStartsWith,
/// The `->` operator.
///
/// On PostgreSQL, this operator extracts a JSON object field or array
/// element, for example `'{"a":"b"}'::json -> 'a'` or `'[1, 2, 3]'::json
/// -> 2`.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
Arrow,
/// The `->>` operator.
///
/// On PostgreSQL, this operator extracts a JSON object field or JSON
/// array element and converts it to text, for example `'{"a":"b"}'::json
/// ->> 'a'` or `'[1, 2, 3]'::json ->> 2`.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
LongArrow,
/// The `#>` operator.
///
/// On PostgreSQL, this operator extracts a JSON sub-object at the specified
/// path, for example:
///
/// ```notrust
///'{"a": {"b": ["foo","bar"]}}'::json #> '{a,b,1}'
/// ```
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
HashArrow,
/// The `#>>` operator.
///
/// On PostgreSQL, this operator extracts a JSON sub-object at the specified
/// path as text, for example:
///
/// ```notrust
///'{"a": {"b": ["foo","bar"]}}'::json #>> '{a,b,1}'
/// ```
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
HashLongArrow,
/// The `@@` operator.
///
/// On PostgreSQL, this is used for JSON and text searches.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
AtAt,
/// The `@>` operator.
///
/// On PostgreSQL, this tests whether the left operand contains the right
/// one; it is used for JSON and text searches.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
AtArrow,
/// The `<@` operator.
///
/// On PostgreSQL, this tests whether the left operand is contained in the
/// right one; it is used for JSON and text searches.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
/// See <https://www.postgresql.org/docs/current/functions-textsearch.html>.
ArrowAt,
/// The `#-` operator.
///
/// On PostgreSQL, this operator is used to delete a field or array element
/// at a specified path.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
HashMinus,
/// The `@?` operator.
///
/// On PostgreSQL, this operator is used to check whether the given JSON
/// path returns any item for the JSON value.
///
/// See <https://www.postgresql.org/docs/current/functions-json.html>.
AtQuestion,
/// PostgreSQL-specific custom operator.
///
/// See [CREATE OPERATOR](https://www.postgresql.org/docs/current/sql-createoperator.html)
@ -187,6 +260,15 @@ impl fmt::Display for BinaryOperator {
BinaryOperator::PGNotLikeMatch => f.write_str("!~~"),
BinaryOperator::PGNotILikeMatch => f.write_str("!~~*"),
BinaryOperator::PGStartsWith => f.write_str("^@"),
BinaryOperator::Arrow => f.write_str("->"),
BinaryOperator::LongArrow => f.write_str("->>"),
BinaryOperator::HashArrow => f.write_str("#>"),
BinaryOperator::HashLongArrow => f.write_str("#>>"),
BinaryOperator::AtAt => f.write_str("@@"),
BinaryOperator::AtArrow => f.write_str("@>"),
BinaryOperator::ArrowAt => f.write_str("<@"),
BinaryOperator::HashMinus => f.write_str("#-"),
BinaryOperator::AtQuestion => f.write_str("@?"),
BinaryOperator::PGCustomBinaryOperator(idents) => {
write!(f, "OPERATOR({})", display_separated(idents, "."))
}

View file

@ -65,8 +65,6 @@ pub enum Value {
Null,
/// `?` or `$` Prepared statement arg placeholder
Placeholder(String),
/// Add support of snowflake field:key - key should be a value
UnQuotedString(String),
}
impl fmt::Display for Value {
@ -85,7 +83,6 @@ impl fmt::Display for Value {
Value::RawStringLiteral(v) => write!(f, "R'{v}'"),
Value::Null => write!(f, "NULL"),
Value::Placeholder(v) => write!(f, "{v}"),
Value::UnQuotedString(v) => write!(f, "{v}"),
}
}
}

View file

@ -2346,6 +2346,16 @@ impl<'a> Parser<'a> {
Token::DoubleTildeAsterisk => Some(BinaryOperator::PGILikeMatch),
Token::ExclamationMarkDoubleTilde => Some(BinaryOperator::PGNotLikeMatch),
Token::ExclamationMarkDoubleTildeAsterisk => Some(BinaryOperator::PGNotILikeMatch),
Token::Arrow => Some(BinaryOperator::Arrow),
Token::LongArrow => Some(BinaryOperator::LongArrow),
Token::HashArrow => Some(BinaryOperator::HashArrow),
Token::HashLongArrow => Some(BinaryOperator::HashLongArrow),
Token::AtArrow => Some(BinaryOperator::AtArrow),
Token::ArrowAt => Some(BinaryOperator::ArrowAt),
Token::HashMinus => Some(BinaryOperator::HashMinus),
Token::AtQuestion => Some(BinaryOperator::AtQuestion),
Token::AtAt => Some(BinaryOperator::AtAt),
Token::Word(w) => match w.keyword {
Keyword::AND => Some(BinaryOperator::And),
Keyword::OR => Some(BinaryOperator::Or),
@ -2539,42 +2549,16 @@ impl<'a> Parser<'a> {
} else if Token::LBracket == tok {
if dialect_of!(self is PostgreSqlDialect | GenericDialect) {
// parse index
return self.parse_array_index(expr);
self.parse_array_index(expr)
} else if dialect_of!(self is SnowflakeDialect) {
self.prev_token();
self.parse_json_access(expr)
} else {
self.parse_map_access(expr)
}
self.parse_map_access(expr)
} else if Token::Colon == tok {
Ok(Expr::JsonAccess {
left: Box::new(expr),
operator: JsonOperator::Colon,
right: Box::new(Expr::Value(self.parse_value()?)),
})
} else if Token::Arrow == tok
|| Token::LongArrow == tok
|| Token::HashArrow == tok
|| Token::HashLongArrow == tok
|| Token::AtArrow == tok
|| Token::ArrowAt == tok
|| Token::HashMinus == tok
|| Token::AtQuestion == tok
|| Token::AtAt == tok
{
let operator = match tok.token {
Token::Arrow => JsonOperator::Arrow,
Token::LongArrow => JsonOperator::LongArrow,
Token::HashArrow => JsonOperator::HashArrow,
Token::HashLongArrow => JsonOperator::HashLongArrow,
Token::AtArrow => JsonOperator::AtArrow,
Token::ArrowAt => JsonOperator::ArrowAt,
Token::HashMinus => JsonOperator::HashMinus,
Token::AtQuestion => JsonOperator::AtQuestion,
Token::AtAt => JsonOperator::AtAt,
_ => unreachable!(),
};
Ok(Expr::JsonAccess {
left: Box::new(expr),
operator,
right: Box::new(self.parse_expr()?),
})
} else if dialect_of!(self is SnowflakeDialect | GenericDialect) && Token::Colon == tok {
self.prev_token();
self.parse_json_access(expr)
} else {
// Can only happen if `get_next_precedence` got out of sync with this function
parser_err!(
@ -2608,6 +2592,60 @@ impl<'a> Parser<'a> {
})
}
/// Consumes the next token and interprets it as the key of a dot-notation
/// JSON path element.
///
/// Returns a [`JsonPathElem::Dot`] whose `quoted` flag records whether the
/// key was double quoted. Any other token yields an "expected variant object
/// key name" parser error.
fn parse_json_path_object_key(&mut self) -> Result<JsonPathElem, ParserError> {
    let next = self.next_token();
    let (key, quoted) = match next.token {
        // Path segments in SF dot notation can be unquoted or double quoted.
        // Some experimentation suggests that snowflake permits any keyword
        // here unquoted, so the keyword is deliberately ignored.
        Token::Word(Word {
            value,
            quote_style: None,
            keyword: _,
        }) => (value, false),
        Token::Word(Word {
            value,
            quote_style: Some('"'),
            keyword: _,
        }) => (value, true),
        // This token should never be generated on snowflake or generic
        // dialects, but we handle it just in case this is used on future
        // dialects.
        Token::DoubleQuotedString(value) => (value, true),
        _ => return self.expected("variant object key name", next),
    };
    Ok(JsonPathElem::Dot { key, quoted })
}
/// Parses a JSON path following `expr` and wraps both in an
/// [`Expr::JsonAccess`].
///
/// Elements are consumed greedily: a `:` key is accepted only as the first
/// element, a `.` key only after at least one element, and a bracketed
/// expression at any position. The first token that fits none of these is
/// pushed back and ends the path.
fn parse_json_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
    let mut elems = Vec::new();
    loop {
        let elem = match self.next_token().token {
            Token::Colon if elems.is_empty() => self.parse_json_path_object_key()?,
            Token::Period if !elems.is_empty() => self.parse_json_path_object_key()?,
            Token::LBracket => {
                let key = self.parse_expr()?;
                self.expect_token(&Token::RBracket)?;
                JsonPathElem::Bracket { key }
            }
            _ => {
                // Not part of the path: hand the token back to the caller.
                self.prev_token();
                break;
            }
        };
        elems.push(elem);
    }

    // Callers only dispatch here when a path-starting token is next.
    debug_assert!(!elems.is_empty());
    Ok(Expr::JsonAccess {
        value: Box::new(expr),
        path: JsonPath { path: elems },
    })
}
pub fn parse_map_access(&mut self, expr: Expr) -> Result<Expr, ParserError> {
let key = self.parse_expr()?;
self.expect_token(&Token::RBracket)?;
@ -2711,6 +2749,7 @@ impl<'a> Parser<'a> {
}
// use https://www.postgresql.org/docs/7.0/operators.htm#AEN2026 as a reference
// higher number = higher precedence
const MUL_DIV_MOD_OP_PREC: u8 = 40;
const PLUS_MINUS_PREC: u8 = 30;
const XOR_PREC: u8 = 24;
@ -2718,6 +2757,7 @@ impl<'a> Parser<'a> {
const BETWEEN_PREC: u8 = 20;
const LIKE_PREC: u8 = 19;
const IS_PREC: u8 = 17;
const PG_OTHER_PREC: u8 = 16;
const UNARY_NOT_PREC: u8 = 15;
const AND_PREC: u8 = 10;
const OR_PREC: u8 = 5;
@ -2802,18 +2842,16 @@ impl<'a> Parser<'a> {
Token::DoubleColon => Ok(50),
Token::Colon => Ok(50),
Token::ExclamationMark => Ok(50),
Token::LBracket
Token::LBracket | Token::Overlap | Token::CaretAt => Ok(50),
Token::Arrow
| Token::LongArrow
| Token::Arrow
| Token::Overlap
| Token::CaretAt
| Token::HashArrow
| Token::HashLongArrow
| Token::AtArrow
| Token::ArrowAt
| Token::HashMinus
| Token::AtQuestion
| Token::AtAt => Ok(50),
| Token::AtAt => Ok(Self::PG_OTHER_PREC),
_ => Ok(0),
}
}
@ -6236,17 +6274,6 @@ impl<'a> Parser<'a> {
},
)?,
},
// Case when Snowflake Semi-structured data like key:value
Keyword::NoKeyword
| Keyword::LOCATION
| Keyword::TYPE
| Keyword::DATE
| Keyword::START
| Keyword::END
if dialect_of!(self is SnowflakeDialect | GenericDialect) =>
{
Ok(Value::UnQuotedString(w.value))
}
_ => self.expected(
"a concrete value",
TokenWithLocation {