add support for custom operators in postgres (#1302)

Co-authored-by: Joey Hain <joey@sigmacomputing.com>
This commit is contained in:
Ophir LOJKINE 2024-06-07 13:12:18 +02:00 committed by GitHub
parent 2fb919d8b2
commit 4b60866bc7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 203 additions and 46 deletions

View file

@ -111,7 +111,7 @@ pub enum BinaryOperator {
DuckIntegerDivide, DuckIntegerDivide,
/// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division /// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division
MyIntegerDivide, MyIntegerDivide,
/// Support for custom operators (built by parsers outside this crate) /// Support for custom operators (such as Postgres custom operators)
Custom(String), Custom(String),
/// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific) /// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific)
PGBitwiseXor, PGBitwiseXor,

View file

@ -122,6 +122,12 @@ pub trait Dialect: Debug + Any {
fn is_identifier_start(&self, ch: char) -> bool; fn is_identifier_start(&self, ch: char) -> bool;
/// Determine if a character is a valid unquoted identifier character /// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(&self, ch: char) -> bool; fn is_identifier_part(&self, ch: char) -> bool;
/// Determine if a character may appear in a custom (user-defined) operator.
///
/// Most dialects have no custom operators, so the default implementation
/// accepts no characters. Dialects that allow user-defined operators
/// (e.g. PostgreSQL via `CREATE OPERATOR`) override this to list the
/// characters an operator name may contain.
fn is_custom_operator_part(&self, _ch: char) -> bool {
    false
}
/// Determine if the dialect supports escaping characters via '\' in string literals. /// Determine if the dialect supports escaping characters via '\' in string literals.
/// ///
/// Some dialects like BigQuery and Snowflake support this while others like /// Some dialects like BigQuery and Snowflake support this while others like

View file

@ -25,6 +25,10 @@ impl Dialect for PostgreSqlDialect {
Some('"') Some('"')
} }
/// Only a double quote opens a delimited identifier in Postgres;
/// backticks are not valid identifier quoting.
fn is_delimited_identifier_start(&self, ch: char) -> bool {
    matches!(ch, '"')
}
fn is_identifier_start(&self, ch: char) -> bool { fn is_identifier_start(&self, ch: char) -> bool {
// See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
// We don't yet support identifiers beginning with "letters with // We don't yet support identifiers beginning with "letters with
@ -36,6 +40,29 @@ impl Dialect for PostgreSqlDialect {
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
} }
/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
fn is_custom_operator_part(&self, ch: char) -> bool {
    // The full set of characters Postgres permits in a user-defined
    // operator name, per the CREATE OPERATOR documentation.
    "+-*/<>=~!@#%^&|`?".contains(ch)
}
fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> { fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
if parser.parse_keyword(Keyword::COMMENT) { if parser.parse_keyword(Keyword::COMMENT) {
Some(parse_comment(parser)) Some(parse_comment(parser))

View file

@ -2344,9 +2344,8 @@ impl<'a> Parser<'a> {
return infix; return infix;
} }
let tok = self.next_token(); let mut tok = self.next_token();
let regular_binary_operator = match &mut tok.token {
let regular_binary_operator = match &tok.token {
Token::Spaceship => Some(BinaryOperator::Spaceship), Token::Spaceship => Some(BinaryOperator::Spaceship),
Token::DoubleEq => Some(BinaryOperator::Eq), Token::DoubleEq => Some(BinaryOperator::Eq),
Token::Eq => Some(BinaryOperator::Eq), Token::Eq => Some(BinaryOperator::Eq),
@ -2410,6 +2409,7 @@ impl<'a> Parser<'a> {
Token::Question => Some(BinaryOperator::Question), Token::Question => Some(BinaryOperator::Question),
Token::QuestionAnd => Some(BinaryOperator::QuestionAnd), Token::QuestionAnd => Some(BinaryOperator::QuestionAnd),
Token::QuestionPipe => Some(BinaryOperator::QuestionPipe), Token::QuestionPipe => Some(BinaryOperator::QuestionPipe),
Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(core::mem::take(s))),
Token::Word(w) => match w.keyword { Token::Word(w) => match w.keyword {
Keyword::AND => Some(BinaryOperator::And), Keyword::AND => Some(BinaryOperator::And),
@ -2964,7 +2964,8 @@ impl<'a> Parser<'a> {
| Token::AtAt | Token::AtAt
| Token::Question | Token::Question
| Token::QuestionAnd | Token::QuestionAnd
| Token::QuestionPipe => Ok(Self::PG_OTHER_PREC), | Token::QuestionPipe
| Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC),
_ => Ok(0), _ => Ok(0),
} }
} }

View file

@ -231,6 +231,10 @@ pub enum Token {
/// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
/// keys within the jsonb object /// keys within the jsonb object
QuestionPipe, QuestionPipe,
/// Custom binary operator
/// This is used to represent any custom binary operator that is not part of the SQL standard.
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
CustomBinaryOperator(String),
} }
impl fmt::Display for Token { impl fmt::Display for Token {
@ -320,6 +324,7 @@ impl fmt::Display for Token {
Token::Question => write!(f, "?"), Token::Question => write!(f, "?"),
Token::QuestionAnd => write!(f, "?&"), Token::QuestionAnd => write!(f, "?&"),
Token::QuestionPipe => write!(f, "?|"), Token::QuestionPipe => write!(f, "?|"),
Token::CustomBinaryOperator(s) => f.write_str(s),
} }
} }
} }
@ -961,15 +966,12 @@ impl<'a> Tokenizer<'a> {
Some('>') => { Some('>') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => { Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
chars.next(); _ => self.start_binop(chars, "->", Token::Arrow),
Ok(Some(Token::LongArrow))
}
_ => Ok(Some(Token::Arrow)),
} }
} }
// a regular '-' operator // a regular '-' operator
_ => Ok(Some(Token::Minus)), _ => self.start_binop(chars, "-", Token::Minus),
} }
} }
'/' => { '/' => {
@ -999,26 +1001,28 @@ impl<'a> Tokenizer<'a> {
'%' => { '%' => {
chars.next(); // advance past '%' chars.next(); // advance past '%'
match chars.peek() { match chars.peek() {
Some(' ') => Ok(Some(Token::Mod)), Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => { Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars) self.tokenize_identifier_or_keyword([ch, *sch], chars)
} }
_ => Ok(Some(Token::Mod)), _ => self.start_binop(chars, "%", Token::Mod),
} }
} }
'|' => { '|' => {
chars.next(); // consume the '|' chars.next(); // consume the '|'
match chars.peek() { match chars.peek() {
Some('/') => self.consume_and_return(chars, Token::PGSquareRoot), Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
Some('|') => { Some('|') => {
chars.next(); // consume the second '|' chars.next(); // consume the second '|'
match chars.peek() { match chars.peek() {
Some('/') => self.consume_and_return(chars, Token::PGCubeRoot), Some('/') => {
_ => Ok(Some(Token::StringConcat)), self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
}
_ => self.start_binop(chars, "||", Token::StringConcat),
} }
} }
// Bitshift '|' operator // Bitshift '|' operator
_ => Ok(Some(Token::Pipe)), _ => self.start_binop(chars, "|", Token::Pipe),
} }
} }
'=' => { '=' => {
@ -1061,22 +1065,22 @@ impl<'a> Tokenizer<'a> {
Some('=') => { Some('=') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::Spaceship), Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
_ => Ok(Some(Token::LtEq)), _ => self.start_binop(chars, "<=", Token::LtEq),
} }
} }
Some('>') => self.consume_and_return(chars, Token::Neq), Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
Some('<') => self.consume_and_return(chars, Token::ShiftLeft), Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
Some('@') => self.consume_and_return(chars, Token::ArrowAt), Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
_ => Ok(Some(Token::Lt)), _ => self.start_binop(chars, "<", Token::Lt),
} }
} }
'>' => { '>' => {
chars.next(); // consume chars.next(); // consume
match chars.peek() { match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::GtEq), Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
Some('>') => self.consume_and_return(chars, Token::ShiftRight), Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
_ => Ok(Some(Token::Gt)), _ => self.start_binop(chars, ">", Token::Gt),
} }
} }
':' => { ':' => {
@ -1094,9 +1098,12 @@ impl<'a> Tokenizer<'a> {
'&' => { '&' => {
chars.next(); // consume the '&' chars.next(); // consume the '&'
match chars.peek() { match chars.peek() {
Some('&') => self.consume_and_return(chars, Token::Overlap), Some('&') => {
chars.next(); // consume the second '&'
self.start_binop(chars, "&&", Token::Overlap)
}
// Bitshift '&' operator // Bitshift '&' operator
_ => Ok(Some(Token::Ampersand)), _ => self.start_binop(chars, "&", Token::Ampersand),
} }
} }
'^' => { '^' => {
@ -1119,38 +1126,37 @@ impl<'a> Tokenizer<'a> {
'~' => { '~' => {
chars.next(); // consume chars.next(); // consume
match chars.peek() { match chars.peek() {
Some('*') => self.consume_and_return(chars, Token::TildeAsterisk), Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
Some('~') => { Some('~') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('*') => { Some('*') => {
self.consume_and_return(chars, Token::DoubleTildeAsterisk) self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
} }
_ => Ok(Some(Token::DoubleTilde)), _ => self.start_binop(chars, "~~", Token::DoubleTilde),
} }
} }
_ => Ok(Some(Token::Tilde)), _ => self.start_binop(chars, "~", Token::Tilde),
} }
} }
'#' => { '#' => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('-') => self.consume_and_return(chars, Token::HashMinus), Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
Some('>') => { Some('>') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => { Some('>') => {
chars.next(); self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
Ok(Some(Token::HashLongArrow))
} }
_ => Ok(Some(Token::HashArrow)), _ => self.start_binop(chars, "#>", Token::HashArrow),
} }
} }
Some(' ') => Ok(Some(Token::Sharp)), Some(' ') => Ok(Some(Token::Sharp)),
Some(sch) if self.dialect.is_identifier_start('#') => { Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars) self.tokenize_identifier_or_keyword([ch, *sch], chars)
} }
_ => Ok(Some(Token::Sharp)), _ => self.start_binop(chars, "#", Token::Sharp),
} }
} }
'@' => { '@' => {
@ -1206,6 +1212,39 @@ impl<'a> Tokenizer<'a> {
} }
} }
/// Consume the character the caller has just peeked (it is already part of
/// `prefix`), then hand off to [`Self::start_binop`] to decide whether a
/// longer custom operator follows or `default` should be emitted.
fn consume_for_binop(
    &self,
    chars: &mut State,
    prefix: &str,
    default: Token,
) -> Result<Option<Token>, TokenizerError> {
    // The caller only peeked at this character; advance past it now.
    chars.next();
    self.start_binop(chars, prefix, default)
}
/// Finish tokenizing an operator that begins with `prefix`.
///
/// Every following character the dialect accepts as a custom-operator part
/// is appended, producing a `Token::CustomBinaryOperator`; if no such
/// character follows, the well-known `default` token is returned unchanged.
fn start_binop(
    &self,
    chars: &mut State,
    prefix: &str,
    default: Token,
) -> Result<Option<Token>, TokenizerError> {
    // Allocated lazily: we only build a String when the operator actually
    // extends beyond `prefix`.
    let mut extended: Option<String> = None;
    loop {
        match chars.peek() {
            Some(&c) if self.dialect.is_custom_operator_part(c) => {
                extended.get_or_insert_with(|| prefix.to_string()).push(c);
                chars.next();
            }
            _ => break,
        }
    }
    Ok(Some(match extended {
        Some(op) => Token::CustomBinaryOperator(op),
        None => default,
    }))
}
/// Tokenize dollar preceded value (i.e: a string/placeholder) /// Tokenize dollar preceded value (i.e: a string/placeholder)
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> { fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
let mut s = String::new(); let mut s = String::new();

View file

@ -437,6 +437,12 @@ fn parse_for_json_expect_ast() {
); );
} }
#[test]
fn parse_ampersand_arobase() {
    // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b).
    // The canonical form inserts spaces around the bitwise-AND operator.
    ms().expr_parses_to("a&@b", "a & @b");
}
#[test] #[test]
fn parse_cast_varchar_max() { fn parse_cast_varchar_max() {
ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))"); ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))");

View file

@ -1757,6 +1757,29 @@ fn parse_pg_returning() {
}; };
} }
/// Assert that `operator` lexes as a single binary-operator token and that
/// both `a<op>b` and `a <op> b` parse to a `BinaryOp` expression carrying
/// `expected` under `dialect`.
fn test_operator(operator: &str, dialect: &TestedDialects, expected: BinaryOperator) {
    let compact = format!("a{operator}b");
    let operator_tokens = sqlparser::tokenizer::Tokenizer::new(&PostgreSqlDialect {}, &compact)
        .tokenize()
        .unwrap();
    assert_eq!(
        3,
        operator_tokens.len(),
        "binary op should be 3 tokens, not {operator_tokens:?}"
    );

    let expected_expr = Expr::BinaryOp {
        left: Box::new(Expr::Identifier(Ident::new("a"))),
        op: expected,
        right: Box::new(Expr::Identifier(Ident::new("b"))),
    };
    let canonical = format!("a {operator} b");
    // The spaced form must round-trip exactly ...
    assert_eq!(expected_expr, dialect.verified_expr(&canonical));
    // ... and the unspaced form must normalize to it.
    assert_eq!(expected_expr, dialect.expr_parses_to(&compact, &canonical));
}
#[test] #[test]
fn parse_pg_binary_ops() { fn parse_pg_binary_ops() {
let binary_ops = &[ let binary_ops = &[
@ -1770,18 +1793,73 @@ fn parse_pg_binary_ops() {
]; ];
for (str_op, op, dialects) in binary_ops { for (str_op, op, dialects) in binary_ops {
let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op)); test_operator(str_op, dialects, op.clone());
assert_eq!(
SelectItem::UnnamedExpr(Expr::BinaryOp {
left: Box::new(Expr::Identifier(Ident::new("a"))),
op: op.clone(),
right: Box::new(Expr::Identifier(Ident::new("b"))),
}),
select.projection[0]
);
} }
} }
// Exercises the tokenizer/parser path for user-defined operators: each entry
// below is a real operator shipped by a popular Postgres extension, and each
// must come back as BinaryOperator::Custom with its exact spelling.
#[test]
fn parse_pg_custom_binary_ops() {
    // Postgres supports declaring custom binary operators, using any character in the following set:
    // + - * / < > = ~ ! @ # % ^ & | ` ?
    // Here, we test the ones used by common extensions
    let operators = [
        // PostGIS
        "&&&", // n-D bounding boxes intersect
        "&<", // (is strictly to the left of)
        "&>", // (is strictly to the right of)
        "|=|", // distance between A and B trajectories at their closest point of approach
        "<<#>>", // n-D distance between A and B bounding boxes
        "|>>", // A's bounding box is strictly above B's.
        "~=", // bounding box is the same
        // PGroonga
        "&@", // Full text search by a keyword
        "&@~", // Full text search by easy to use query language
        "&@*", // Similar search
        "&`", // Advanced search by ECMAScript like query language
        "&@|", // Full text search by an array of keywords
        "&@~|", // Full text search by an array of queries in easy to use query language
        // pgtrgm
        "<<%", // second argument has a continuous extent of an ordered trigram set that matches word boundaries
        "%>>", // commutator of <<%
        "<<<->", // distance between arguments
        // hstore
        "#=", // Replace fields with matching values from hstore
        // ranges
        "-|-", // Is adjacent to
        // pg_similarity
        "~++", // L1 distance
        "~##", // Cosine Distance
        "~-~", // Dice Coefficient
        "~!!", // Euclidean Distance
        "~@~", // Hamming Distance
        "~??", // Jaccard Coefficient
        "~%%", // Jaro Distance
        "~@@", // Jaro-Winkler Distance
        "~==", // Levenshtein Distance
        "~^^", // Matching Coefficient
        "~||", // Monge-Elkan Coefficient
        "~#~", // Needleman-Wunsch Coefficient
        "~**", // Overlap Coefficient
        "~~~", // Q-Gram Distance
        "~=~", // Smith-Waterman Coefficient
        "~!~", // Smith-Waterman-Gotoh Coefficient
        "~*~", // Soundex Distance
        // soundex_operator
        ">@@<", // Soundex matches
        "<@@>", // Soundex doesn't match
    ];
    for op in &operators {
        test_operator(op, &pg(), BinaryOperator::Custom(op.to_string()));
    }
}
#[test]
fn parse_ampersand_arobase() {
    // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b).
    // Here the unspaced form must normalize to the custom operator `&@`.
    pg().expr_parses_to("a&@b", "a &@ b");
}
#[test] #[test]
fn parse_pg_unary_ops() { fn parse_pg_unary_ops() {
let pg_unary_ops = &[ let pg_unary_ops = &[