mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-08-17 12:40:17 +00:00
fix: parsing JsonOperator (#913)
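In outline, read from the diff below: the identifier-or-keyword arm of next_token is factored out into a tokenize_identifier_or_keyword helper and moved after the number and placeholder arms; the '%', '#', and '@' arms now peek one character ahead and hand off to that helper when the current dialect says the following character can start an identifier, and otherwise still emit the plain Mod, Sharp, AtAt, and AtSign tokens; the numeric-prefix identifier rule formerly limited to MySQL is extended to Hive; tokenize_word now takes a String prefix instead of a single char; and a new parse_json_ops_without_colon test pins that ->, ->>, #>, #>>, @>, <@, #-, @?, and @@ keep parsing as JSON operators.

A minimal sketch of the pinned behavior, assuming the public sqlparser API of this era (Parser::parse_sql); the snippet is illustrative and not part of the commit:

use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::parser::Parser;

fn main() {
    // "a @@ b" must keep parsing as a JSON operator expression even though
    // '@' may begin an identifier in some dialects.
    let ast = Parser::parse_sql(&PostgreSqlDialect {}, "SELECT a @@ b").unwrap();
    println!("{:?}", ast[0]);
}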
This commit is contained in:
parent c8b6e7f2c7
commit df45db1375
2 changed files with 112 additions and 34 deletions

src/tokenizer.rs (111 changes)
@@ -35,7 +35,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::DollarQuotedString;
-use crate::dialect::{BigQueryDialect, DuckDbDialect, GenericDialect, SnowflakeDialect};
+use crate::dialect::{
+    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
+};
 use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
 
@@ -495,9 +497,32 @@ impl<'a> Tokenizer<'a> {
         Ok(tokens)
     }
 
+    fn tokenize_identifier_or_keyword(
+        &self,
+        ch: String,
+        chars: &mut State,
+    ) -> Result<Option<Token>, TokenizerError> {
+        chars.next(); // consume the first char
+        let word = self.tokenize_word(ch, chars);
+
+        // TODO: implement parsing of exponent here
+        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
+            let mut inner_state = State {
+                peekable: word.chars().peekable(),
+                line: 0,
+                col: 0,
+            };
+            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
+            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+            s += s2.as_str();
+            return Ok(Some(Token::Number(s, false)));
+        }
+
+        Ok(Some(Token::make_word(&word, None)))
+    }
+
     /// Get the next token or return None
     fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
-        //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
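The helper above is what makes punctuation-led identifiers possible at all; the dialects bundled with the crate mostly do not claim '%', '#', or '@' as identifier starts, so a custom dialect is the clearest way to see it fire. A sketch under that assumption (HashIdentDialect is hypothetical, not in the crate):

use sqlparser::dialect::Dialect;
use sqlparser::tokenizer::Tokenizer;

// Hypothetical dialect in which '#' may start an identifier.
#[derive(Debug)]
struct HashIdentDialect;

impl Dialect for HashIdentDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_' || ch == '#'
    }
    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }
}

fn main() {
    // "#tag" should lex as a single word here, while "a #> b" would still
    // yield the HashArrow operator token.
    let tokens = Tokenizer::new(&HashIdentDialect, "SELECT #tag").tokenize().unwrap();
    println!("{:?}", tokens);
}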
@@ -525,7 +550,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -544,7 +569,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -560,7 +585,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -577,7 +602,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -594,33 +619,11 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
                 }
-                // identifier or keyword
-                ch if self.dialect.is_identifier_start(ch) => {
-                    chars.next(); // consume the first char
-                    let word = self.tokenize_word(ch, chars);
-
-                    // TODO: implement parsing of exponent here
-                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
-                        let mut inner_state = State {
-                            peekable: word.chars().peekable(),
-                            line: 0,
-                            col: 0,
-                        };
-                        let mut s = peeking_take_while(&mut inner_state, |ch| {
-                            matches!(ch, '0'..='9' | '.')
-                        });
-                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
-                        s += s2.as_str();
-                        return Ok(Some(Token::Number(s, false)));
-                    }
-
-                    Ok(Some(Token::make_word(&word, None)))
-                }
                 // single quoted string
                 '\'' => {
                     let s = self.tokenize_quoted_string(chars, '\'')?;
@@ -714,7 +717,7 @@ impl<'a> Tokenizer<'a> {
 
                 // mysql dialect supports identifiers that start with a numeric prefix,
                 // as long as they aren't an exponent number.
-                if dialect_of!(self is MySqlDialect) && exponent_part.is_empty() {
+                if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                     let word =
                         peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
 
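A sketch of what the widened dialect_of! check permits, assuming Hive now follows the MySQL behavior described in the comment above (illustrative, not part of the commit):

use sqlparser::dialect::HiveDialect;
use sqlparser::tokenizer::Tokenizer;

fn main() {
    // Under Hive, as under MySQL, "1abc" is an identifier with a numeric
    // prefix rather than the number 1 followed by the word abc.
    let tokens = Tokenizer::new(&HiveDialect {}, "SELECT 1abc").tokenize().unwrap();
    println!("{:?}", tokens);
}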
@@ -786,7 +789,18 @@ impl<'a> Tokenizer<'a> {
                 }
                 '+' => self.consume_and_return(chars, Token::Plus),
                 '*' => self.consume_and_return(chars, Token::Mul),
-                '%' => self.consume_and_return(chars, Token::Mod),
+                '%' => {
+                    chars.next();
+                    match chars.peek() {
+                        Some(' ') => self.consume_and_return(chars, Token::Mod),
+                        Some(sch) if self.dialect.is_identifier_start('%') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
+                        _ => self.consume_and_return(chars, Token::Mod),
+                    }
+                }
                 '|' => {
                     chars.next(); // consume the '|'
                     match chars.peek() {
@@ -901,6 +915,12 @@ impl<'a> Tokenizer<'a> {
                                 _ => Ok(Some(Token::HashArrow)),
                             }
                         }
+                        Some(' ') => Ok(Some(Token::Sharp)),
+                        Some(sch) if self.dialect.is_identifier_start('#') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::Sharp)),
                     }
                 }
@@ -909,7 +929,25 @@ impl<'a> Tokenizer<'a> {
                     match chars.peek() {
                         Some('>') => self.consume_and_return(chars, Token::AtArrow),
                         Some('?') => self.consume_and_return(chars, Token::AtQuestion),
-                        Some('@') => self.consume_and_return(chars, Token::AtAt),
+                        Some('@') => {
+                            chars.next();
+                            match chars.peek() {
+                                Some(' ') => Ok(Some(Token::AtAt)),
+                                Some(tch) if self.dialect.is_identifier_start('@') => {
+                                    let mut s = ch.to_string();
+                                    s.push('@');
+                                    s.push_str(&tch.to_string());
+                                    self.tokenize_identifier_or_keyword(s, chars)
+                                }
+                                _ => Ok(Some(Token::AtAt)),
+                            }
+                        }
+                        Some(' ') => Ok(Some(Token::AtSign)),
+                        Some(sch) if self.dialect.is_identifier_start('@') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::AtSign)),
                     }
                 }
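The Some(' ') arms above are what keep the operator spellings stable: '@@' followed by a space still lexes as the AtAt token. A minimal check, assuming the public tokenizer API (illustrative, not part of the commit):

use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let tokens = Tokenizer::new(&PostgreSqlDialect {}, "a @@ b").tokenize().unwrap();
    assert!(tokens.contains(&Token::AtAt));
}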
@@ -918,6 +956,11 @@ impl<'a> Tokenizer<'a> {
                     let s = peeking_take_while(chars, |ch| ch.is_numeric());
                     Ok(Some(Token::Placeholder(String::from("?") + &s)))
                 }
+
+                // identifier or keyword
+                ch if self.dialect.is_identifier_start(ch) => {
+                    self.tokenize_identifier_or_keyword(ch.to_string(), chars)
+                }
                 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
 
                 //whitespace check (including unicode chars) should be last as it covers some of the chars above
@@ -1043,8 +1086,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
-        let mut s = first_char.to_string();
+    fn tokenize_word(&self, first_chars: String, chars: &mut State) -> String {
+        let mut s = first_chars;
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
@@ -1113,6 +1113,41 @@ fn parse_unary_math_with_multiply() {
     );
 }
 
+fn pg_and_generic() -> TestedDialects {
+    TestedDialects {
+        dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
+        options: None,
+    }
+}
+
+#[test]
+fn parse_json_ops_without_colon() {
+    use self::JsonOperator;
+    let binary_ops = &[
+        ("->", JsonOperator::Arrow, all_dialects()),
+        ("->>", JsonOperator::LongArrow, all_dialects()),
+        ("#>", JsonOperator::HashArrow, pg_and_generic()),
+        ("#>>", JsonOperator::HashLongArrow, pg_and_generic()),
+        ("@>", JsonOperator::AtArrow, all_dialects()),
+        ("<@", JsonOperator::ArrowAt, all_dialects()),
+        ("#-", JsonOperator::HashMinus, pg_and_generic()),
+        ("@?", JsonOperator::AtQuestion, all_dialects()),
+        ("@@", JsonOperator::AtAt, all_dialects()),
+    ];
+
+    for (str_op, op, dialects) in binary_ops {
+        let select = dialects.verified_only_select(&format!("SELECT a {} b", &str_op));
+        assert_eq!(
+            SelectItem::UnnamedExpr(Expr::JsonAccess {
+                left: Box::new(Expr::Identifier(Ident::new("a"))),
+                operator: *op,
+                right: Box::new(Expr::Identifier(Ident::new("b"))),
+            }),
+            select.projection[0]
+        );
+    }
+}
+
 #[test]
 fn parse_is_null() {
     use self::Expr::*;
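verified_only_select in the loop above also asserts that each statement round-trips through to_string; a single-case sketch of that property, assuming the public parser API (illustrative, not part of the commit):

use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::parser::Parser;

fn main() {
    let sql = "SELECT a #- b";
    let ast = Parser::parse_sql(&PostgreSqlDialect {}, sql).unwrap();
    assert_eq!(ast[0].to_string(), sql);
}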