mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-08-27 09:24:04 +00:00
add support for custom operators in postgres (#1302)
Co-authored-by: Joey Hain <joey@sigmacomputing.com>
This commit is contained in:
parent
2fb919d8b2
commit
4b60866bc7
7 changed files with 203 additions and 46 deletions
|
@ -111,7 +111,7 @@ pub enum BinaryOperator {
|
|||
DuckIntegerDivide,
|
||||
/// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division
|
||||
MyIntegerDivide,
|
||||
/// Support for custom operators (built by parsers outside this crate)
|
||||
/// Support for custom operators (such as Postgres custom operators)
|
||||
Custom(String),
|
||||
/// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific)
|
||||
PGBitwiseXor,
|
||||
|
|
|
@ -122,6 +122,12 @@ pub trait Dialect: Debug + Any {
|
|||
fn is_identifier_start(&self, ch: char) -> bool;
|
||||
/// Determine if a character is a valid unquoted identifier character
|
||||
fn is_identifier_part(&self, ch: char) -> bool;
|
||||
|
||||
/// Determine if a character may be part of a custom operator. Most dialects do not have custom operators, so the default returns `false`; override this method to provide them.
|
||||
fn is_custom_operator_part(&self, _ch: char) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Determine if the dialect supports escaping characters via '\' in string literals.
|
||||
///
|
||||
/// Some dialects like BigQuery and Snowflake support this while others like
|
||||
|
|
|
@ -25,6 +25,10 @@ impl Dialect for PostgreSqlDialect {
|
|||
Some('"')
|
||||
}
|
||||
|
||||
fn is_delimited_identifier_start(&self, ch: char) -> bool {
|
||||
ch == '"' // Postgres does not support backticks to quote identifiers
|
||||
}
|
||||
|
||||
fn is_identifier_start(&self, ch: char) -> bool {
|
||||
// See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
|
||||
// We don't yet support identifiers beginning with "letters with
|
||||
|
@ -36,6 +40,29 @@ impl Dialect for PostgreSqlDialect {
|
|||
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
|
||||
}
|
||||
|
||||
/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
|
||||
fn is_custom_operator_part(&self, ch: char) -> bool {
|
||||
matches!(
|
||||
ch,
|
||||
'+' | '-'
|
||||
| '*'
|
||||
| '/'
|
||||
| '<'
|
||||
| '>'
|
||||
| '='
|
||||
| '~'
|
||||
| '!'
|
||||
| '@'
|
||||
| '#'
|
||||
| '%'
|
||||
| '^'
|
||||
| '&'
|
||||
| '|'
|
||||
| '`'
|
||||
| '?'
|
||||
)
|
||||
}
|
||||
|
||||
fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
|
||||
if parser.parse_keyword(Keyword::COMMENT) {
|
||||
Some(parse_comment(parser))
|
||||
|
|
|
@ -2344,9 +2344,8 @@ impl<'a> Parser<'a> {
|
|||
return infix;
|
||||
}
|
||||
|
||||
let tok = self.next_token();
|
||||
|
||||
let regular_binary_operator = match &tok.token {
|
||||
let mut tok = self.next_token();
|
||||
let regular_binary_operator = match &mut tok.token {
|
||||
Token::Spaceship => Some(BinaryOperator::Spaceship),
|
||||
Token::DoubleEq => Some(BinaryOperator::Eq),
|
||||
Token::Eq => Some(BinaryOperator::Eq),
|
||||
|
@ -2410,6 +2409,7 @@ impl<'a> Parser<'a> {
|
|||
Token::Question => Some(BinaryOperator::Question),
|
||||
Token::QuestionAnd => Some(BinaryOperator::QuestionAnd),
|
||||
Token::QuestionPipe => Some(BinaryOperator::QuestionPipe),
|
||||
Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(core::mem::take(s))),
|
||||
|
||||
Token::Word(w) => match w.keyword {
|
||||
Keyword::AND => Some(BinaryOperator::And),
|
||||
|
@ -2964,7 +2964,8 @@ impl<'a> Parser<'a> {
|
|||
| Token::AtAt
|
||||
| Token::Question
|
||||
| Token::QuestionAnd
|
||||
| Token::QuestionPipe => Ok(Self::PG_OTHER_PREC),
|
||||
| Token::QuestionPipe
|
||||
| Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC),
|
||||
_ => Ok(0),
|
||||
}
|
||||
}
|
||||
|
|
103
src/tokenizer.rs
103
src/tokenizer.rs
|
@ -231,6 +231,10 @@ pub enum Token {
|
|||
/// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
|
||||
/// keys within the jsonb object
|
||||
QuestionPipe,
|
||||
/// Custom binary operator
|
||||
/// This is used to represent any custom binary operator that is not part of the SQL standard.
|
||||
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
|
||||
CustomBinaryOperator(String),
|
||||
}
|
||||
|
||||
impl fmt::Display for Token {
|
||||
|
@ -320,6 +324,7 @@ impl fmt::Display for Token {
|
|||
Token::Question => write!(f, "?"),
|
||||
Token::QuestionAnd => write!(f, "?&"),
|
||||
Token::QuestionPipe => write!(f, "?|"),
|
||||
Token::CustomBinaryOperator(s) => f.write_str(s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -961,15 +966,12 @@ impl<'a> Tokenizer<'a> {
|
|||
Some('>') => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('>') => {
|
||||
chars.next();
|
||||
Ok(Some(Token::LongArrow))
|
||||
}
|
||||
_ => Ok(Some(Token::Arrow)),
|
||||
Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
|
||||
_ => self.start_binop(chars, "->", Token::Arrow),
|
||||
}
|
||||
}
|
||||
// a regular '-' operator
|
||||
_ => Ok(Some(Token::Minus)),
|
||||
_ => self.start_binop(chars, "-", Token::Minus),
|
||||
}
|
||||
}
|
||||
'/' => {
|
||||
|
@ -999,26 +1001,28 @@ impl<'a> Tokenizer<'a> {
|
|||
'%' => {
|
||||
chars.next(); // advance past '%'
|
||||
match chars.peek() {
|
||||
Some(' ') => Ok(Some(Token::Mod)),
|
||||
Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
|
||||
Some(sch) if self.dialect.is_identifier_start('%') => {
|
||||
self.tokenize_identifier_or_keyword([ch, *sch], chars)
|
||||
}
|
||||
_ => Ok(Some(Token::Mod)),
|
||||
_ => self.start_binop(chars, "%", Token::Mod),
|
||||
}
|
||||
}
|
||||
'|' => {
|
||||
chars.next(); // consume the '|'
|
||||
match chars.peek() {
|
||||
Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
|
||||
Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
|
||||
Some('|') => {
|
||||
chars.next(); // consume the second '|'
|
||||
match chars.peek() {
|
||||
Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
|
||||
_ => Ok(Some(Token::StringConcat)),
|
||||
Some('/') => {
|
||||
self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
|
||||
}
|
||||
_ => self.start_binop(chars, "||", Token::StringConcat),
|
||||
}
|
||||
}
|
||||
// Bitwise OR '|' operator
|
||||
_ => Ok(Some(Token::Pipe)),
|
||||
_ => self.start_binop(chars, "|", Token::Pipe),
|
||||
}
|
||||
}
|
||||
'=' => {
|
||||
|
@ -1061,22 +1065,22 @@ impl<'a> Tokenizer<'a> {
|
|||
Some('=') => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('>') => self.consume_and_return(chars, Token::Spaceship),
|
||||
_ => Ok(Some(Token::LtEq)),
|
||||
Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
|
||||
_ => self.start_binop(chars, "<=", Token::LtEq),
|
||||
}
|
||||
}
|
||||
Some('>') => self.consume_and_return(chars, Token::Neq),
|
||||
Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
|
||||
Some('@') => self.consume_and_return(chars, Token::ArrowAt),
|
||||
_ => Ok(Some(Token::Lt)),
|
||||
Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
|
||||
Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
|
||||
Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
|
||||
_ => self.start_binop(chars, "<", Token::Lt),
|
||||
}
|
||||
}
|
||||
'>' => {
|
||||
chars.next(); // consume
|
||||
match chars.peek() {
|
||||
Some('=') => self.consume_and_return(chars, Token::GtEq),
|
||||
Some('>') => self.consume_and_return(chars, Token::ShiftRight),
|
||||
_ => Ok(Some(Token::Gt)),
|
||||
Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
|
||||
Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
|
||||
_ => self.start_binop(chars, ">", Token::Gt),
|
||||
}
|
||||
}
|
||||
':' => {
|
||||
|
@ -1094,9 +1098,12 @@ impl<'a> Tokenizer<'a> {
|
|||
'&' => {
|
||||
chars.next(); // consume the '&'
|
||||
match chars.peek() {
|
||||
Some('&') => self.consume_and_return(chars, Token::Overlap),
|
||||
Some('&') => {
|
||||
chars.next(); // consume the second '&'
|
||||
self.start_binop(chars, "&&", Token::Overlap)
|
||||
}
|
||||
// Bitwise AND '&' operator
|
||||
_ => Ok(Some(Token::Ampersand)),
|
||||
_ => self.start_binop(chars, "&", Token::Ampersand),
|
||||
}
|
||||
}
|
||||
'^' => {
|
||||
|
@ -1119,38 +1126,37 @@ impl<'a> Tokenizer<'a> {
|
|||
'~' => {
|
||||
chars.next(); // consume
|
||||
match chars.peek() {
|
||||
Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
|
||||
Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
|
||||
Some('~') => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('*') => {
|
||||
self.consume_and_return(chars, Token::DoubleTildeAsterisk)
|
||||
self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
|
||||
}
|
||||
_ => Ok(Some(Token::DoubleTilde)),
|
||||
_ => self.start_binop(chars, "~~", Token::DoubleTilde),
|
||||
}
|
||||
}
|
||||
_ => Ok(Some(Token::Tilde)),
|
||||
_ => self.start_binop(chars, "~", Token::Tilde),
|
||||
}
|
||||
}
|
||||
'#' => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('-') => self.consume_and_return(chars, Token::HashMinus),
|
||||
Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
|
||||
Some('>') => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('>') => {
|
||||
chars.next();
|
||||
Ok(Some(Token::HashLongArrow))
|
||||
self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
|
||||
}
|
||||
_ => Ok(Some(Token::HashArrow)),
|
||||
_ => self.start_binop(chars, "#>", Token::HashArrow),
|
||||
}
|
||||
}
|
||||
Some(' ') => Ok(Some(Token::Sharp)),
|
||||
Some(sch) if self.dialect.is_identifier_start('#') => {
|
||||
self.tokenize_identifier_or_keyword([ch, *sch], chars)
|
||||
}
|
||||
_ => Ok(Some(Token::Sharp)),
|
||||
_ => self.start_binop(chars, "#", Token::Sharp),
|
||||
}
|
||||
}
|
||||
'@' => {
|
||||
|
@ -1206,6 +1212,39 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Consume the next character, then continue tokenizing as a (possibly custom) binary operator via `start_binop`. `prefix` must already include the character being consumed.
|
||||
fn consume_for_binop(
|
||||
&self,
|
||||
chars: &mut State,
|
||||
prefix: &str,
|
||||
default: Token,
|
||||
) -> Result<Option<Token>, TokenizerError> {
|
||||
chars.next(); // consume the first char
|
||||
self.start_binop(chars, prefix, default)
|
||||
}
|
||||
|
||||
/// parse a custom binary operator
|
||||
fn start_binop(
|
||||
&self,
|
||||
chars: &mut State,
|
||||
prefix: &str,
|
||||
default: Token,
|
||||
) -> Result<Option<Token>, TokenizerError> {
|
||||
let mut custom = None;
|
||||
while let Some(&ch) = chars.peek() {
|
||||
if !self.dialect.is_custom_operator_part(ch) {
|
||||
break;
|
||||
}
|
||||
|
||||
custom.get_or_insert_with(|| prefix.to_string()).push(ch);
|
||||
chars.next();
|
||||
}
|
||||
|
||||
Ok(Some(
|
||||
custom.map(Token::CustomBinaryOperator).unwrap_or(default),
|
||||
))
|
||||
}
|
||||
|
||||
/// Tokenize dollar preceded value (i.e: a string/placeholder)
|
||||
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
|
||||
let mut s = String::new();
|
||||
|
|
|
@ -437,6 +437,12 @@ fn parse_for_json_expect_ast() {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_ampersand_arobase() {
|
||||
// In SQL Server, `a&@b` means `(a) & (@b)`; in PostgreSQL it means `(a) &@ (b)`.
|
||||
ms().expr_parses_to("a&@b", "a & @b");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_cast_varchar_max() {
|
||||
ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))");
|
||||
|
|
|
@ -1757,6 +1757,29 @@ fn parse_pg_returning() {
|
|||
};
|
||||
}
|
||||
|
||||
fn test_operator(operator: &str, dialect: &TestedDialects, expected: BinaryOperator) {
|
||||
let operator_tokens =
|
||||
sqlparser::tokenizer::Tokenizer::new(&PostgreSqlDialect {}, &format!("a{operator}b"))
|
||||
.tokenize()
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
operator_tokens.len(),
|
||||
3,
|
||||
"binary op should be 3 tokens, not {operator_tokens:?}"
|
||||
);
|
||||
let expected_expr = Expr::BinaryOp {
|
||||
left: Box::new(Expr::Identifier(Ident::new("a"))),
|
||||
op: expected,
|
||||
right: Box::new(Expr::Identifier(Ident::new("b"))),
|
||||
};
|
||||
let str_expr_canonical = format!("a {operator} b");
|
||||
assert_eq!(expected_expr, dialect.verified_expr(&str_expr_canonical));
|
||||
assert_eq!(
|
||||
expected_expr,
|
||||
dialect.expr_parses_to(&format!("a{operator}b"), &str_expr_canonical)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pg_binary_ops() {
|
||||
let binary_ops = &[
|
||||
|
@ -1770,18 +1793,73 @@ fn parse_pg_binary_ops() {
|
|||
];
|
||||
|
||||
for (str_op, op, dialects) in binary_ops {
|
||||
let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op));
|
||||
assert_eq!(
|
||||
SelectItem::UnnamedExpr(Expr::BinaryOp {
|
||||
left: Box::new(Expr::Identifier(Ident::new("a"))),
|
||||
op: op.clone(),
|
||||
right: Box::new(Expr::Identifier(Ident::new("b"))),
|
||||
}),
|
||||
select.projection[0]
|
||||
);
|
||||
test_operator(str_op, dialects, op.clone());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn parse_pg_custom_binary_ops() {
    // PostgreSQL lets users declare binary operators built from any of these characters:
    // + - * / < > = ~ ! @ # % ^ & | ` ?
    //
    // The lists below cover operators defined by widely used extensions.

    const POSTGIS: &[&str] = &[
        "&&&",   // n-D bounding boxes intersect
        "&<",    // A's bounding box overlaps or is to the left of B's
        "&>",    // A's bounding box overlaps or is to the right of B's
        "|=|",   // distance between A and B trajectories at their closest point of approach
        "<<#>>", // n-D distance between A and B bounding boxes
        "|>>",   // A's bounding box is strictly above B's
        "~=",    // A's bounding box is the same as B's
    ];
    const PGROONGA: &[&str] = &[
        "&@",   // full text search by a keyword
        "&@~",  // full text search by easy to use query language
        "&@*",  // similar search
        "&`",   // advanced search by ECMAScript like query language
        "&@|",  // full text search by an array of keywords
        "&@~|", // full text search by an array of queries in easy to use query language
    ];
    const PG_TRGM: &[&str] = &[
        "<<%",   // second argument has a continuous extent of an ordered trigram set that matches word boundaries
        "%>>",   // commutator of <<%
        "<<<->", // distance between arguments
    ];
    const HSTORE: &[&str] = &[
        "#=", // replace fields with matching values from hstore
    ];
    const RANGES: &[&str] = &[
        "-|-", // is adjacent to
    ];
    const PG_SIMILARITY: &[&str] = &[
        "~++", // L1 distance
        "~##", // cosine distance
        "~-~", // Dice coefficient
        "~!!", // Euclidean distance
        "~@~", // Hamming distance
        "~??", // Jaccard coefficient
        "~%%", // Jaro distance
        "~@@", // Jaro-Winkler distance
        "~==", // Levenshtein distance
        "~^^", // matching coefficient
        "~||", // Monge-Elkan coefficient
        "~#~", // Needleman-Wunsch coefficient
        "~**", // overlap coefficient
        "~~~", // q-gram distance
        "~=~", // Smith-Waterman coefficient
        "~!~", // Smith-Waterman-Gotoh coefficient
        "~*~", // Soundex distance
    ];
    const SOUNDEX_OPERATOR: &[&str] = &[
        ">@@<", // Soundex matches
        "<@@>", // Soundex doesn't match
    ];

    let groups = [
        POSTGIS,
        PGROONGA,
        PG_TRGM,
        HSTORE,
        RANGES,
        PG_SIMILARITY,
        SOUNDEX_OPERATOR,
    ];
    for op in groups.iter().flat_map(|group| group.iter().copied()) {
        test_operator(op, &pg(), BinaryOperator::Custom(op.to_string()));
    }
}
|
||||
|
||||
#[test]
|
||||
fn parse_ampersand_arobase() {
|
||||
// In SQL Server, `a&@b` means `(a) & (@b)`; in PostgreSQL it means `(a) &@ (b)`.
|
||||
pg().expr_parses_to("a&@b", "a &@ b");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_pg_unary_ops() {
|
||||
let pg_unary_ops = &[
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue