mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-08-17 20:50:16 +00:00
Implement Hive QL Parsing (#235)
This commit is contained in:
parent
17f8eb9c5a
commit
8a214f9919
16 changed files with 1388 additions and 176 deletions
|
@ -35,7 +35,7 @@ pub enum Token {
|
|||
/// A keyword (like SELECT) or an optionally quoted SQL identifier
|
||||
Word(Word),
|
||||
/// An unsigned numeric literal
|
||||
Number(String),
|
||||
Number(String, bool),
|
||||
/// A character that could not be tokenized
|
||||
Char(char),
|
||||
/// Single quoted string: i.e: 'string'
|
||||
|
@ -48,6 +48,8 @@ pub enum Token {
|
|||
Comma,
|
||||
/// Whitespace (space, tab, etc)
|
||||
Whitespace(Whitespace),
|
||||
/// Double equals sign `==`
|
||||
DoubleEq,
|
||||
/// Equality operator `=`
|
||||
Eq,
|
||||
/// Not Equals operator `<>` (or `!=` in some dialects)
|
||||
|
@ -60,6 +62,8 @@ pub enum Token {
|
|||
LtEq,
|
||||
/// Greater Than Or Equals operator `>=`
|
||||
GtEq,
|
||||
/// Spaceship operator <=>
|
||||
Spaceship,
|
||||
/// Plus operator `+`
|
||||
Plus,
|
||||
/// Minus operator `-`
|
||||
|
@ -127,13 +131,15 @@ impl fmt::Display for Token {
|
|||
match self {
|
||||
Token::EOF => f.write_str("EOF"),
|
||||
Token::Word(ref w) => write!(f, "{}", w),
|
||||
Token::Number(ref n) => f.write_str(n),
|
||||
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
|
||||
Token::Char(ref c) => write!(f, "{}", c),
|
||||
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
|
||||
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
|
||||
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
|
||||
Token::Comma => f.write_str(","),
|
||||
Token::Whitespace(ws) => write!(f, "{}", ws),
|
||||
Token::DoubleEq => f.write_str("=="),
|
||||
Token::Spaceship => f.write_str("<=>"),
|
||||
Token::Eq => f.write_str("="),
|
||||
Token::Neq => f.write_str("<>"),
|
||||
Token::Lt => f.write_str("<"),
|
||||
|
@ -296,7 +302,7 @@ impl<'a> Tokenizer<'a> {
|
|||
Token::Whitespace(Whitespace::Tab) => self.col += 4,
|
||||
Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
|
||||
Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
|
||||
Token::Number(s) => self.col += s.len() as u64,
|
||||
Token::Number(s, _) => self.col += s.len() as u64,
|
||||
Token::SingleQuotedString(s) => self.col += s.len() as u64,
|
||||
_ => self.col += 1,
|
||||
}
|
||||
|
@ -358,6 +364,15 @@ impl<'a> Tokenizer<'a> {
|
|||
ch if self.dialect.is_identifier_start(ch) => {
|
||||
chars.next(); // consume the first char
|
||||
let s = self.tokenize_word(ch, chars);
|
||||
|
||||
if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
|
||||
let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
|
||||
matches!(ch, '0'..='9' | '.')
|
||||
});
|
||||
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
|
||||
s += s2.as_str();
|
||||
return Ok(Some(Token::Number(s, false)));
|
||||
}
|
||||
Ok(Some(Token::make_word(&s, None)))
|
||||
}
|
||||
// string
|
||||
|
@ -383,7 +398,13 @@ impl<'a> Tokenizer<'a> {
|
|||
'0'..='9' => {
|
||||
// TODO: https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#unsigned-numeric-literal
|
||||
let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
|
||||
Ok(Some(Token::Number(s)))
|
||||
let long = if chars.peek() == Some(&'L') {
|
||||
chars.next();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
Ok(Some(Token::Number(s, long)))
|
||||
}
|
||||
// punctuation
|
||||
'(' => self.consume_and_return(chars, Token::LParen),
|
||||
|
@ -461,7 +482,13 @@ impl<'a> Tokenizer<'a> {
|
|||
'<' => {
|
||||
chars.next(); // consume
|
||||
match chars.peek() {
|
||||
Some('=') => self.consume_and_return(chars, Token::LtEq),
|
||||
Some('=') => {
|
||||
chars.next();
|
||||
match chars.peek() {
|
||||
Some('>') => self.consume_and_return(chars, Token::Spaceship),
|
||||
_ => Ok(Some(Token::LtEq)),
|
||||
}
|
||||
}
|
||||
Some('>') => self.consume_and_return(chars, Token::Neq),
|
||||
Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
|
||||
_ => Ok(Some(Token::Lt)),
|
||||
|
@ -634,7 +661,7 @@ mod tests {
|
|||
let expected = vec![
|
||||
Token::make_keyword("SELECT"),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("1")),
|
||||
Token::Number(String::from("1"), false),
|
||||
];
|
||||
|
||||
compare(expected, tokens);
|
||||
|
@ -652,7 +679,7 @@ mod tests {
|
|||
Token::Whitespace(Whitespace::Space),
|
||||
Token::make_word("sqrt", None),
|
||||
Token::LParen,
|
||||
Token::Number(String::from("1")),
|
||||
Token::Number(String::from("1"), false),
|
||||
Token::RParen,
|
||||
];
|
||||
|
||||
|
@ -724,11 +751,11 @@ mod tests {
|
|||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Eq,
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("1")),
|
||||
Token::Number(String::from("1"), false),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::make_keyword("LIMIT"),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("5")),
|
||||
Token::Number(String::from("5"), false),
|
||||
];
|
||||
|
||||
compare(expected, tokens);
|
||||
|
@ -758,7 +785,7 @@ mod tests {
|
|||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Eq,
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("1")),
|
||||
Token::Number(String::from("1"), false),
|
||||
];
|
||||
|
||||
compare(expected, tokens);
|
||||
|
@ -790,7 +817,7 @@ mod tests {
|
|||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Eq,
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("1")),
|
||||
Token::Number(String::from("1"), false),
|
||||
];
|
||||
|
||||
compare(expected, tokens);
|
||||
|
@ -943,12 +970,12 @@ mod tests {
|
|||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
let expected = vec![
|
||||
Token::Number("0".to_string()),
|
||||
Token::Number("0".to_string(), false),
|
||||
Token::Whitespace(Whitespace::SingleLineComment {
|
||||
prefix: "--".to_string(),
|
||||
comment: "this is a comment\n".to_string(),
|
||||
}),
|
||||
Token::Number("1".to_string()),
|
||||
Token::Number("1".to_string(), false),
|
||||
];
|
||||
compare(expected, tokens);
|
||||
}
|
||||
|
@ -975,11 +1002,11 @@ mod tests {
|
|||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
let expected = vec![
|
||||
Token::Number("0".to_string()),
|
||||
Token::Number("0".to_string(), false),
|
||||
Token::Whitespace(Whitespace::MultiLineComment(
|
||||
"multi-line\n* /comment".to_string(),
|
||||
)),
|
||||
Token::Number("1".to_string()),
|
||||
Token::Number("1".to_string(), false),
|
||||
];
|
||||
compare(expected, tokens);
|
||||
}
|
||||
|
@ -1046,7 +1073,7 @@ mod tests {
|
|||
Token::Whitespace(Whitespace::Space),
|
||||
Token::make_keyword("TOP"),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::Number(String::from("5")),
|
||||
Token::Number(String::from("5"), false),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
Token::make_word("bar", Some('[')),
|
||||
Token::Whitespace(Whitespace::Space),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue