mirror of https://github.com/apache/datafusion-sqlparser-rs.git (synced 2025-08-16 20:20:15 +00:00)
fix: parsing JsonOperator (#913)
This commit is contained in:
parent c8b6e7f2c7
commit df45db1375

2 changed files with 112 additions and 34 deletions
src/tokenizer.rs (111 changes)
@@ -35,7 +35,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::DollarQuotedString;
-use crate::dialect::{BigQueryDialect, DuckDbDialect, GenericDialect, SnowflakeDialect};
+use crate::dialect::{
+    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
+};
 use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
 
@@ -495,9 +497,32 @@ impl<'a> Tokenizer<'a> {
         Ok(tokens)
     }
 
+    fn tokenize_identifier_or_keyword(
+        &self,
+        ch: String,
+        chars: &mut State,
+    ) -> Result<Option<Token>, TokenizerError> {
+        chars.next(); // consume the first char
+        let word = self.tokenize_word(ch, chars);
+
+        // TODO: implement parsing of exponent here
+        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
+            let mut inner_state = State {
+                peekable: word.chars().peekable(),
+                line: 0,
+                col: 0,
+            };
+            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
+            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+            s += s2.as_str();
+            return Ok(Some(Token::Number(s, false)));
+        }
+
+        Ok(Some(Token::make_word(&word, None)))
+    }
+
     /// Get the next token or return None
     fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
         //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -525,7 +550,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "b" or "B"
-                        let s = self.tokenize_word(b, chars);
+                        let s = self.tokenize_word(b.to_string(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -544,7 +569,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "r" or "R"
-                        let s = self.tokenize_word(b, chars);
+                        let s = self.tokenize_word(b.to_string(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -560,7 +585,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "N"
-                        let s = self.tokenize_word(n, chars);
+                        let s = self.tokenize_word(n.to_string(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -577,7 +602,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "E" or "e"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.to_string(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -594,33 +619,11 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "X"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.to_string(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
             }
-            // identifier or keyword
-            ch if self.dialect.is_identifier_start(ch) => {
-                chars.next(); // consume the first char
-                let word = self.tokenize_word(ch, chars);
-
-                // TODO: implement parsing of exponent here
-                if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
-                    let mut inner_state = State {
-                        peekable: word.chars().peekable(),
-                        line: 0,
-                        col: 0,
-                    };
-                    let mut s = peeking_take_while(&mut inner_state, |ch| {
-                        matches!(ch, '0'..='9' | '.')
-                    });
-                    let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
-                    s += s2.as_str();
-                    return Ok(Some(Token::Number(s, false)));
-                }
-
-                Ok(Some(Token::make_word(&word, None)))
-            }
             // single quoted string
             '\'' => {
                 let s = self.tokenize_quoted_string(chars, '\'')?;
@@ -714,7 +717,7 @@ impl<'a> Tokenizer<'a> {
 
                     // mysql dialect supports identifiers that start with a numeric prefix,
                     // as long as they aren't an exponent number.
-                    if dialect_of!(self is MySqlDialect) && exponent_part.is_empty() {
+                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                         let word =
                             peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
 
@@ -786,7 +789,18 @@ impl<'a> Tokenizer<'a> {
             }
             '+' => self.consume_and_return(chars, Token::Plus),
             '*' => self.consume_and_return(chars, Token::Mul),
-            '%' => self.consume_and_return(chars, Token::Mod),
+            '%' => {
+                chars.next();
+                match chars.peek() {
+                    Some(' ') => self.consume_and_return(chars, Token::Mod),
+                    Some(sch) if self.dialect.is_identifier_start('%') => {
+                        let mut s = ch.to_string();
+                        s.push_str(&sch.to_string());
+                        self.tokenize_identifier_or_keyword(s, chars)
+                    }
+                    _ => self.consume_and_return(chars, Token::Mod),
+                }
+            }
             '|' => {
                 chars.next(); // consume the '|'
                 match chars.peek() {
@@ -901,6 +915,12 @@ impl<'a> Tokenizer<'a> {
                             _ => Ok(Some(Token::HashArrow)),
                         }
                     }
+                    Some(' ') => Ok(Some(Token::Sharp)),
+                    Some(sch) if self.dialect.is_identifier_start('#') => {
+                        let mut s = ch.to_string();
+                        s.push_str(&sch.to_string());
+                        self.tokenize_identifier_or_keyword(s, chars)
+                    }
                     _ => Ok(Some(Token::Sharp)),
                 }
             }
@@ -909,7 +929,25 @@ impl<'a> Tokenizer<'a> {
                 match chars.peek() {
                     Some('>') => self.consume_and_return(chars, Token::AtArrow),
                     Some('?') => self.consume_and_return(chars, Token::AtQuestion),
-                    Some('@') => self.consume_and_return(chars, Token::AtAt),
+                    Some('@') => {
+                        chars.next();
+                        match chars.peek() {
+                            Some(' ') => Ok(Some(Token::AtAt)),
+                            Some(tch) if self.dialect.is_identifier_start('@') => {
+                                let mut s = ch.to_string();
+                                s.push('@');
+                                s.push_str(&tch.to_string());
+                                self.tokenize_identifier_or_keyword(s, chars)
+                            }
+                            _ => Ok(Some(Token::AtAt)),
+                        }
+                    }
+                    Some(' ') => Ok(Some(Token::AtSign)),
+                    Some(sch) if self.dialect.is_identifier_start('@') => {
+                        let mut s = ch.to_string();
+                        s.push_str(&sch.to_string());
+                        self.tokenize_identifier_or_keyword(s, chars)
+                    }
                     _ => Ok(Some(Token::AtSign)),
                 }
             }
@@ -918,6 +956,11 @@ impl<'a> Tokenizer<'a> {
                 let s = peeking_take_while(chars, |ch| ch.is_numeric());
                 Ok(Some(Token::Placeholder(String::from("?") + &s)))
             }
+
+            // identifier or keyword
+            ch if self.dialect.is_identifier_start(ch) => {
+                self.tokenize_identifier_or_keyword(ch.to_string(), chars)
+            }
             '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
 
             //whitespace check (including unicode chars) should be last as it covers some of the chars above
@@ -1043,8 +1086,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
-        let mut s = first_char.to_string();
+    fn tokenize_word(&self, first_chars: String, chars: &mut State) -> String {
+        let mut s = first_chars;
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
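For illustration (not part of this commit): a minimal sketch of how the fix surfaces through sqlparser's public API, assuming a crate version that includes this change. With the new lookahead, `@@` followed by a space is emitted as Token::AtAt and parsed as a JSON operator instead of derailing tokenization.

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;

    fn main() {
        // `@@` here is followed by a space, so the tokenizer emits
        // Token::AtAt and the parser builds an Expr::JsonAccess node
        // (see the parse_json_ops_without_colon test below).
        let sql = "SELECT a @@ b";
        let statements = Parser::parse_sql(&GenericDialect {}, sql).unwrap();
        println!("{:?}", statements);
    }

The second changed file adds a regression test covering each JSON operator: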
@@ -1113,6 +1113,41 @@ fn parse_unary_math_with_multiply()
     );
 }
 
+fn pg_and_generic() -> TestedDialects {
+    TestedDialects {
+        dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
+        options: None,
+    }
+}
+
+#[test]
+fn parse_json_ops_without_colon() {
+    use self::JsonOperator;
+    let binary_ops = &[
+        ("->", JsonOperator::Arrow, all_dialects()),
+        ("->>", JsonOperator::LongArrow, all_dialects()),
+        ("#>", JsonOperator::HashArrow, pg_and_generic()),
+        ("#>>", JsonOperator::HashLongArrow, pg_and_generic()),
+        ("@>", JsonOperator::AtArrow, all_dialects()),
+        ("<@", JsonOperator::ArrowAt, all_dialects()),
+        ("#-", JsonOperator::HashMinus, pg_and_generic()),
+        ("@?", JsonOperator::AtQuestion, all_dialects()),
+        ("@@", JsonOperator::AtAt, all_dialects()),
+    ];
+
+    for (str_op, op, dialects) in binary_ops {
+        let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op));
+        assert_eq!(
+            SelectItem::UnnamedExpr(Expr::JsonAccess {
+                left: Box::new(Expr::Identifier(Ident::new("a"))),
+                operator: *op,
+                right: Box::new(Expr::Identifier(Ident::new("b"))),
+            }),
+            select.projection[0]
+        );
+    }
+}
+
 #[test]
 fn parse_is_null() {
     use self::Expr::*;
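Conversely, a sketch at the tokenizer level (again assuming the crate's public Tokenizer API): when the dialect does not treat the character as an identifier start, the new lookahead falls through and `%` still tokenizes as the plain modulo operator.

    use sqlparser::dialect::GenericDialect;
    use sqlparser::tokenizer::{Token, Tokenizer};

    fn main() {
        // GenericDialect does not report '%' as an identifier start,
        // so `a % b` still yields Token::Mod for the middle token.
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, "a % b");
        let tokens = tokenizer.tokenize().unwrap();
        assert!(tokens.contains(&Token::Mod));
    }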