add support for custom operators in postgres (#1302)

Co-authored-by: Joey Hain <joey@sigmacomputing.com>
This commit is contained in:
Ophir LOJKINE 2024-06-07 13:12:18 +02:00 committed by GitHub
parent 2fb919d8b2
commit 4b60866bc7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 203 additions and 46 deletions

View file

@ -111,7 +111,7 @@ pub enum BinaryOperator {
DuckIntegerDivide, DuckIntegerDivide,
/// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division /// MySQL [`DIV`](https://dev.mysql.com/doc/refman/8.0/en/arithmetic-functions.html) integer division
MyIntegerDivide, MyIntegerDivide,
/// Support for custom operators (built by parsers outside this crate) /// Support for custom operators (such as Postgres custom operators)
Custom(String), Custom(String),
/// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific) /// Bitwise XOR, e.g. `a # b` (PostgreSQL-specific)
PGBitwiseXor, PGBitwiseXor,

View file

@ -122,6 +122,12 @@ pub trait Dialect: Debug + Any {
fn is_identifier_start(&self, ch: char) -> bool; fn is_identifier_start(&self, ch: char) -> bool;
/// Determine if a character is a valid unquoted identifier character /// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(&self, ch: char) -> bool; fn is_identifier_part(&self, ch: char) -> bool;
/// Determine if a character may appear in a custom (user-defined) operator.
///
/// Most dialects have no custom operators, so the default implementation
/// accepts no characters. Dialects that allow user-defined operators
/// (e.g. PostgreSQL via `CREATE OPERATOR`) override this to list the
/// characters an operator name may contain.
fn is_custom_operator_part(&self, _ch: char) -> bool {
    false
}
/// Determine if the dialect supports escaping characters via '\' in string literals. /// Determine if the dialect supports escaping characters via '\' in string literals.
/// ///
/// Some dialects like BigQuery and Snowflake support this while others like /// Some dialects like BigQuery and Snowflake support this while others like

View file

@ -25,6 +25,10 @@ impl Dialect for PostgreSqlDialect {
Some('"') Some('"')
} }
/// Only a double quote opens a delimited identifier in Postgres;
/// backticks are not valid identifier quoting.
fn is_delimited_identifier_start(&self, ch: char) -> bool {
    matches!(ch, '"')
}
fn is_identifier_start(&self, ch: char) -> bool { fn is_identifier_start(&self, ch: char) -> bool {
// See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS // See https://www.postgresql.org/docs/11/sql-syntax-lexical.html#SQL-SYNTAX-IDENTIFIERS
// We don't yet support identifiers beginning with "letters with // We don't yet support identifiers beginning with "letters with
@ -36,6 +40,29 @@ impl Dialect for PostgreSqlDialect {
ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_' ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
} }
/// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
fn is_custom_operator_part(&self, ch: char) -> bool {
    // The full set of characters Postgres permits in a user-defined
    // operator name, per the CREATE OPERATOR documentation.
    "+-*/<>=~!@#%^&|`?".contains(ch)
}
fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> { fn parse_statement(&self, parser: &mut Parser) -> Option<Result<Statement, ParserError>> {
if parser.parse_keyword(Keyword::COMMENT) { if parser.parse_keyword(Keyword::COMMENT) {
Some(parse_comment(parser)) Some(parse_comment(parser))

View file

@ -2344,9 +2344,8 @@ impl<'a> Parser<'a> {
return infix; return infix;
} }
let tok = self.next_token(); let mut tok = self.next_token();
let regular_binary_operator = match &mut tok.token {
let regular_binary_operator = match &tok.token {
Token::Spaceship => Some(BinaryOperator::Spaceship), Token::Spaceship => Some(BinaryOperator::Spaceship),
Token::DoubleEq => Some(BinaryOperator::Eq), Token::DoubleEq => Some(BinaryOperator::Eq),
Token::Eq => Some(BinaryOperator::Eq), Token::Eq => Some(BinaryOperator::Eq),
@ -2410,6 +2409,7 @@ impl<'a> Parser<'a> {
Token::Question => Some(BinaryOperator::Question), Token::Question => Some(BinaryOperator::Question),
Token::QuestionAnd => Some(BinaryOperator::QuestionAnd), Token::QuestionAnd => Some(BinaryOperator::QuestionAnd),
Token::QuestionPipe => Some(BinaryOperator::QuestionPipe), Token::QuestionPipe => Some(BinaryOperator::QuestionPipe),
Token::CustomBinaryOperator(s) => Some(BinaryOperator::Custom(core::mem::take(s))),
Token::Word(w) => match w.keyword { Token::Word(w) => match w.keyword {
Keyword::AND => Some(BinaryOperator::And), Keyword::AND => Some(BinaryOperator::And),
@ -2964,7 +2964,8 @@ impl<'a> Parser<'a> {
| Token::AtAt | Token::AtAt
| Token::Question | Token::Question
| Token::QuestionAnd | Token::QuestionAnd
| Token::QuestionPipe => Ok(Self::PG_OTHER_PREC), | Token::QuestionPipe
| Token::CustomBinaryOperator(_) => Ok(Self::PG_OTHER_PREC),
_ => Ok(0), _ => Ok(0),
} }
} }

View file

@ -231,6 +231,10 @@ pub enum Token {
/// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
/// keys within the jsonb object /// keys within the jsonb object
QuestionPipe, QuestionPipe,
/// Custom binary operator
/// This is used to represent any custom binary operator that is not part of the SQL standard.
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
CustomBinaryOperator(String),
} }
impl fmt::Display for Token { impl fmt::Display for Token {
@ -320,6 +324,7 @@ impl fmt::Display for Token {
Token::Question => write!(f, "?"), Token::Question => write!(f, "?"),
Token::QuestionAnd => write!(f, "?&"), Token::QuestionAnd => write!(f, "?&"),
Token::QuestionPipe => write!(f, "?|"), Token::QuestionPipe => write!(f, "?|"),
Token::CustomBinaryOperator(s) => f.write_str(s),
} }
} }
} }
@ -961,15 +966,12 @@ impl<'a> Tokenizer<'a> {
Some('>') => { Some('>') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => { Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
chars.next(); _ => self.start_binop(chars, "->", Token::Arrow),
Ok(Some(Token::LongArrow))
}
_ => Ok(Some(Token::Arrow)),
} }
} }
// a regular '-' operator // a regular '-' operator
_ => Ok(Some(Token::Minus)), _ => self.start_binop(chars, "-", Token::Minus),
} }
} }
'/' => { '/' => {
@ -999,26 +1001,28 @@ impl<'a> Tokenizer<'a> {
'%' => { '%' => {
chars.next(); // advance past '%' chars.next(); // advance past '%'
match chars.peek() { match chars.peek() {
Some(' ') => Ok(Some(Token::Mod)), Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => { Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars) self.tokenize_identifier_or_keyword([ch, *sch], chars)
} }
_ => Ok(Some(Token::Mod)), _ => self.start_binop(chars, "%", Token::Mod),
} }
} }
'|' => { '|' => {
chars.next(); // consume the '|' chars.next(); // consume the '|'
match chars.peek() { match chars.peek() {
Some('/') => self.consume_and_return(chars, Token::PGSquareRoot), Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
Some('|') => { Some('|') => {
chars.next(); // consume the second '|' chars.next(); // consume the second '|'
match chars.peek() { match chars.peek() {
Some('/') => self.consume_and_return(chars, Token::PGCubeRoot), Some('/') => {
_ => Ok(Some(Token::StringConcat)), self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
}
_ => self.start_binop(chars, "||", Token::StringConcat),
} }
} }
// Bitshift '|' operator // Bitshift '|' operator
_ => Ok(Some(Token::Pipe)), _ => self.start_binop(chars, "|", Token::Pipe),
} }
} }
'=' => { '=' => {
@ -1061,22 +1065,22 @@ impl<'a> Tokenizer<'a> {
Some('=') => { Some('=') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::Spaceship), Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
_ => Ok(Some(Token::LtEq)), _ => self.start_binop(chars, "<=", Token::LtEq),
} }
} }
Some('>') => self.consume_and_return(chars, Token::Neq), Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
Some('<') => self.consume_and_return(chars, Token::ShiftLeft), Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
Some('@') => self.consume_and_return(chars, Token::ArrowAt), Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
_ => Ok(Some(Token::Lt)), _ => self.start_binop(chars, "<", Token::Lt),
} }
} }
'>' => { '>' => {
chars.next(); // consume chars.next(); // consume
match chars.peek() { match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::GtEq), Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
Some('>') => self.consume_and_return(chars, Token::ShiftRight), Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
_ => Ok(Some(Token::Gt)), _ => self.start_binop(chars, ">", Token::Gt),
} }
} }
':' => { ':' => {
@ -1094,9 +1098,12 @@ impl<'a> Tokenizer<'a> {
'&' => { '&' => {
chars.next(); // consume the '&' chars.next(); // consume the '&'
match chars.peek() { match chars.peek() {
Some('&') => self.consume_and_return(chars, Token::Overlap), Some('&') => {
chars.next(); // consume the second '&'
self.start_binop(chars, "&&", Token::Overlap)
}
// Bitshift '&' operator // Bitshift '&' operator
_ => Ok(Some(Token::Ampersand)), _ => self.start_binop(chars, "&", Token::Ampersand),
} }
} }
'^' => { '^' => {
@ -1119,38 +1126,37 @@ impl<'a> Tokenizer<'a> {
'~' => { '~' => {
chars.next(); // consume chars.next(); // consume
match chars.peek() { match chars.peek() {
Some('*') => self.consume_and_return(chars, Token::TildeAsterisk), Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
Some('~') => { Some('~') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('*') => { Some('*') => {
self.consume_and_return(chars, Token::DoubleTildeAsterisk) self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
} }
_ => Ok(Some(Token::DoubleTilde)), _ => self.start_binop(chars, "~~", Token::DoubleTilde),
} }
} }
_ => Ok(Some(Token::Tilde)), _ => self.start_binop(chars, "~", Token::Tilde),
} }
} }
'#' => { '#' => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('-') => self.consume_and_return(chars, Token::HashMinus), Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
Some('>') => { Some('>') => {
chars.next(); chars.next();
match chars.peek() { match chars.peek() {
Some('>') => { Some('>') => {
chars.next(); self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
Ok(Some(Token::HashLongArrow))
} }
_ => Ok(Some(Token::HashArrow)), _ => self.start_binop(chars, "#>", Token::HashArrow),
} }
} }
Some(' ') => Ok(Some(Token::Sharp)), Some(' ') => Ok(Some(Token::Sharp)),
Some(sch) if self.dialect.is_identifier_start('#') => { Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars) self.tokenize_identifier_or_keyword([ch, *sch], chars)
} }
_ => Ok(Some(Token::Sharp)), _ => self.start_binop(chars, "#", Token::Sharp),
} }
} }
'@' => { '@' => {
@ -1206,6 +1212,39 @@ impl<'a> Tokenizer<'a> {
} }
} }
/// Consume the character the caller has just peeked (it is already part of
/// `prefix`), then hand off to [`Self::start_binop`] to decide whether a
/// longer custom operator follows or `default` should be emitted.
fn consume_for_binop(
    &self,
    chars: &mut State,
    prefix: &str,
    default: Token,
) -> Result<Option<Token>, TokenizerError> {
    // The caller only peeked at this character; advance past it now.
    chars.next();
    self.start_binop(chars, prefix, default)
}
/// Finish tokenizing an operator that begins with `prefix`.
///
/// Every following character the dialect accepts as a custom-operator part
/// is appended, producing a `Token::CustomBinaryOperator`; if no such
/// character follows, the well-known `default` token is returned unchanged.
fn start_binop(
    &self,
    chars: &mut State,
    prefix: &str,
    default: Token,
) -> Result<Option<Token>, TokenizerError> {
    // Allocated lazily: we only build a String when the operator actually
    // extends beyond `prefix`.
    let mut extended: Option<String> = None;
    loop {
        match chars.peek() {
            Some(&c) if self.dialect.is_custom_operator_part(c) => {
                extended.get_or_insert_with(|| prefix.to_string()).push(c);
                chars.next();
            }
            _ => break,
        }
    }
    Ok(Some(match extended {
        Some(op) => Token::CustomBinaryOperator(op),
        None => default,
    }))
}
/// Tokenize dollar preceded value (i.e: a string/placeholder) /// Tokenize dollar preceded value (i.e: a string/placeholder)
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> { fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
let mut s = String::new(); let mut s = String::new();

View file

@ -437,6 +437,12 @@ fn parse_for_json_expect_ast() {
); );
} }
#[test]
fn parse_ampersand_arobase() {
    // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b).
    // The canonical form inserts spaces around the bitwise-AND operator.
    ms().expr_parses_to("a&@b", "a & @b");
}
#[test] #[test]
fn parse_cast_varchar_max() { fn parse_cast_varchar_max() {
ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))"); ms_and_generic().verified_expr("CAST('foo' AS VARCHAR(MAX))");

View file

@ -1757,6 +1757,29 @@ fn parse_pg_returning() {
}; };
} }
/// Assert that `operator` lexes as a single binary-operator token and that
/// both `a<op>b` and `a <op> b` parse to a `BinaryOp` expression carrying
/// `expected` under `dialect`.
fn test_operator(operator: &str, dialect: &TestedDialects, expected: BinaryOperator) {
    let compact = format!("a{operator}b");
    let operator_tokens = sqlparser::tokenizer::Tokenizer::new(&PostgreSqlDialect {}, &compact)
        .tokenize()
        .unwrap();
    assert_eq!(
        3,
        operator_tokens.len(),
        "binary op should be 3 tokens, not {operator_tokens:?}"
    );

    let expected_expr = Expr::BinaryOp {
        left: Box::new(Expr::Identifier(Ident::new("a"))),
        op: expected,
        right: Box::new(Expr::Identifier(Ident::new("b"))),
    };
    let canonical = format!("a {operator} b");
    // The spaced form must round-trip exactly ...
    assert_eq!(expected_expr, dialect.verified_expr(&canonical));
    // ... and the unspaced form must normalize to it.
    assert_eq!(expected_expr, dialect.expr_parses_to(&compact, &canonical));
}
#[test] #[test]
fn parse_pg_binary_ops() { fn parse_pg_binary_ops() {
let binary_ops = &[ let binary_ops = &[
@ -1770,18 +1793,73 @@ fn parse_pg_binary_ops() {
]; ];
for (str_op, op, dialects) in binary_ops { for (str_op, op, dialects) in binary_ops {
let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op)); test_operator(str_op, dialects, op.clone());
assert_eq!(
SelectItem::UnnamedExpr(Expr::BinaryOp {
left: Box::new(Expr::Identifier(Ident::new("a"))),
op: op.clone(),
right: Box::new(Expr::Identifier(Ident::new("b"))),
}),
select.projection[0]
);
} }
} }
// Exercises the tokenizer/parser path for user-defined operators: each entry
// below is a real operator shipped by a popular Postgres extension, and each
// must come back as BinaryOperator::Custom with its exact spelling.
#[test]
fn parse_pg_custom_binary_ops() {
    // Postgres supports declaring custom binary operators, using any character in the following set:
    // + - * / < > = ~ ! @ # % ^ & | ` ?
    // Here, we test the ones used by common extensions
    let operators = [
        // PostGIS
        "&&&", // n-D bounding boxes intersect
        "&<", // (is strictly to the left of)
        "&>", // (is strictly to the right of)
        "|=|", // distance between A and B trajectories at their closest point of approach
        "<<#>>", // n-D distance between A and B bounding boxes
        "|>>", // A's bounding box is strictly above B's.
        "~=", // bounding box is the same
        // PGroonga
        "&@", // Full text search by a keyword
        "&@~", // Full text search by easy to use query language
        "&@*", // Similar search
        "&`", // Advanced search by ECMAScript like query language
        "&@|", // Full text search by an array of keywords
        "&@~|", // Full text search by an array of queries in easy to use query language
        // pgtrgm
        "<<%", // second argument has a continuous extent of an ordered trigram set that matches word boundaries
        "%>>", // commutator of <<%
        "<<<->", // distance between arguments
        // hstore
        "#=", // Replace fields with matching values from hstore
        // ranges
        "-|-", // Is adjacent to
        // pg_similarity
        "~++", // L1 distance
        "~##", // Cosine Distance
        "~-~", // Dice Coefficient
        "~!!", // Euclidean Distance
        "~@~", // Hamming Distance
        "~??", // Jaccard Coefficient
        "~%%", // Jaro Distance
        "~@@", // Jaro-Winkler Distance
        "~==", // Levenshtein Distance
        "~^^", // Matching Coefficient
        "~||", // Monge-Elkan Coefficient
        "~#~", // Needleman-Wunsch Coefficient
        "~**", // Overlap Coefficient
        "~~~", // Q-Gram Distance
        "~=~", // Smith-Waterman Coefficient
        "~!~", // Smith-Waterman-Gotoh Coefficient
        "~*~", // Soundex Distance
        // soundex_operator
        ">@@<", // Soundex matches
        "<@@>", // Soundex doesn't match
    ];
    for op in &operators {
        test_operator(op, &pg(), BinaryOperator::Custom(op.to_string()));
    }
}
#[test]
fn parse_ampersand_arobase() {
    // In SQL Server, a&@b means (a) & (@b), in PostgreSQL it means (a) &@ (b).
    // Here the unspaced form must normalize to the custom operator `&@`.
    pg().expr_parses_to("a&@b", "a &@ b");
}
#[test] #[test]
fn parse_pg_unary_ops() { fn parse_pg_unary_ops() {
let pg_unary_ops = &[ let pg_unary_ops = &[