Implement Hive QL Parsing (#235)

This commit is contained in:
Stephen Carman 2021-02-04 14:53:20 -05:00 committed by GitHub
parent 17f8eb9c5a
commit 8a214f9919
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 1388 additions and 176 deletions

View file

@ -35,7 +35,7 @@ pub enum Token {
/// A keyword (like SELECT) or an optionally quoted SQL identifier
Word(Word),
/// An unsigned numeric literal
Number(String),
Number(String, bool),
/// A character that could not be tokenized
Char(char),
/// Single quoted string: i.e: 'string'
@ -48,6 +48,8 @@ pub enum Token {
Comma,
/// Whitespace (space, tab, etc)
Whitespace(Whitespace),
/// Double equals sign `==`
DoubleEq,
/// Equality operator `=`
Eq,
/// Not Equals operator `<>` (or `!=` in some dialects)
@ -60,6 +62,8 @@ pub enum Token {
LtEq,
/// Greater Than Or Equals operator `>=`
GtEq,
/// Spaceship operator <=>
Spaceship,
/// Plus operator `+`
Plus,
/// Minus operator `-`
@ -127,13 +131,15 @@ impl fmt::Display for Token {
match self {
Token::EOF => f.write_str("EOF"),
Token::Word(ref w) => write!(f, "{}", w),
Token::Number(ref n) => f.write_str(n),
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
Token::Char(ref c) => write!(f, "{}", c),
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{}", ws),
Token::DoubleEq => f.write_str("=="),
Token::Spaceship => f.write_str("<=>"),
Token::Eq => f.write_str("="),
Token::Neq => f.write_str("<>"),
Token::Lt => f.write_str("<"),
@ -296,7 +302,7 @@ impl<'a> Tokenizer<'a> {
Token::Whitespace(Whitespace::Tab) => self.col += 4,
Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
Token::Number(s) => self.col += s.len() as u64,
Token::Number(s, _) => self.col += s.len() as u64,
Token::SingleQuotedString(s) => self.col += s.len() as u64,
_ => self.col += 1,
}
@ -358,6 +364,15 @@ impl<'a> Tokenizer<'a> {
ch if self.dialect.is_identifier_start(ch) => {
chars.next(); // consume the first char
let s = self.tokenize_word(ch, chars);
if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
matches!(ch, '0'..='9' | '.')
});
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
s += s2.as_str();
return Ok(Some(Token::Number(s, false)));
}
Ok(Some(Token::make_word(&s, None)))
}
// string
@ -383,7 +398,13 @@ impl<'a> Tokenizer<'a> {
'0'..='9' => {
// TODO: https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#unsigned-numeric-literal
let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
Ok(Some(Token::Number(s)))
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
Ok(Some(Token::Number(s, long)))
}
// punctuation
'(' => self.consume_and_return(chars, Token::LParen),
@ -461,7 +482,13 @@ impl<'a> Tokenizer<'a> {
'<' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::LtEq),
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::Spaceship),
_ => Ok(Some(Token::LtEq)),
}
}
Some('>') => self.consume_and_return(chars, Token::Neq),
Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
_ => Ok(Some(Token::Lt)),
@ -634,7 +661,7 @@ mod tests {
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
@ -652,7 +679,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1")),
Token::Number(String::from("1"), false),
Token::RParen,
];
@ -724,11 +751,11 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Number(String::from("1"), false),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5")),
Token::Number(String::from("5"), false),
];
compare(expected, tokens);
@ -758,7 +785,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
@ -790,7 +817,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
@ -943,12 +970,12 @@ mod tests {
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\n".to_string(),
}),
Token::Number("1".to_string()),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
@ -975,11 +1002,11 @@ mod tests {
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string()),
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
)),
Token::Number("1".to_string()),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
@ -1046,7 +1073,7 @@ mod tests {
Token::Whitespace(Whitespace::Space),
Token::make_keyword("TOP"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5")),
Token::Number(String::from("5"), false),
Token::Whitespace(Whitespace::Space),
Token::make_word("bar", Some('[')),
Token::Whitespace(Whitespace::Space),