Introduce location tracking in the tokenizer and parser (#710)

* Add locations

* Add PartialEq

* Add PartialEq

* Add some tests

* Fix rebase conflicts

* Fix clippy

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
Ankur Goyal authored on 2022-12-05 11:47:42 -08:00, committed by GitHub
parent 512a159f08
commit 813f4a2eff
4 changed files with 404 additions and 174 deletions
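Taken together, the change makes the tokenizer record the line and column at which every token starts, and lets the parser carry those locations alongside the tokens. An illustrative sketch of the new surface area (usage example, not part of the diff; it assumes the crate's usual `sqlparser::` module paths):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::tokenizer::{Tokenizer, TokenizerError};

    fn main() -> Result<(), TokenizerError> {
        let dialect = GenericDialect {};

        // New API: every token carries the line/column where it starts.
        let tokens = Tokenizer::new(&dialect, "SELECT a\nFROM t").tokenize_with_location()?;
        for t in &tokens {
            println!("{:?} at line {}, column {}", t.token, t.location.line, t.location.column);
        }

        // The old entry point keeps its signature; it now simply drops the locations.
        let _bare = Tokenizer::new(&dialect, "SELECT a\nFROM t").tokenize()?;
        Ok(())
    }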

.gitignore (vendored)

@@ -13,3 +13,5 @@ Cargo.lock
 # IDEs
 .idea
 .vscode
+*.swp

src/dialect/postgresql.rs

@@ -52,7 +52,7 @@ pub fn parse_comment(parser: &mut Parser) -> Result<Statement, ParserError> {
     parser.expect_keyword(Keyword::ON)?;
     let token = parser.next_token();

-    let (object_type, object_name) = match token {
+    let (object_type, object_name) = match token.token {
         Token::Word(w) if w.keyword == Keyword::COLUMN => {
             let object_name = parser.parse_object_name()?;
             (CommentObject::Column, object_name)

src/parser.rs

@@ -105,7 +105,7 @@ impl fmt::Display for ParserError {
 impl std::error::Error for ParserError {}

 pub struct Parser<'a> {
-    tokens: Vec<Token>,
+    tokens: Vec<TokenWithLocation>,
     /// The index of the first unprocessed token in `self.tokens`
     index: usize,
     dialect: &'a dyn Dialect,

@@ -113,7 +113,26 @@ pub struct Parser<'a> {
 impl<'a> Parser<'a> {
     /// Parse the specified tokens
+    /// To avoid breaking backwards compatibility, this function accepts
+    /// bare tokens.
     pub fn new(tokens: Vec<Token>, dialect: &'a dyn Dialect) -> Self {
+        Parser::new_without_locations(tokens, dialect)
+    }
+
+    pub fn new_without_locations(tokens: Vec<Token>, dialect: &'a dyn Dialect) -> Self {
+        // Put in dummy locations
+        let tokens_with_locations: Vec<TokenWithLocation> = tokens
+            .into_iter()
+            .map(|token| TokenWithLocation {
+                token,
+                location: Location { line: 0, column: 0 },
+            })
+            .collect();
+        Parser::new_with_locations(tokens_with_locations, dialect)
+    }
+
+    /// Parse the specified tokens
+    pub fn new_with_locations(tokens: Vec<TokenWithLocation>, dialect: &'a dyn Dialect) -> Self {
         Parser {
             tokens,
             index: 0,
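All three constructors funnel into `new_with_locations`; a sketch of how callers line up (illustrative, not part of the diff):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::parser::Parser;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        let dialect = GenericDialect {};
        let sql = "SELECT 1";

        // Existing callers keep compiling: bare tokens get dummy 0:0 locations.
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let mut old_style = Parser::new(tokens, &dialect);

        // New callers can thread real locations through instead.
        let located = Tokenizer::new(&dialect, sql).tokenize_with_location().unwrap();
        let mut new_style = Parser::new_with_locations(located, &dialect);

        assert_eq!(
            old_style.parse_statement().unwrap(),
            new_style.parse_statement().unwrap()
        );
    }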
@@ -157,7 +176,8 @@ impl<'a> Parser<'a> {
             return statement;
         }

-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::KILL => Ok(self.parse_kill()?),
                 Keyword::DESCRIBE => Ok(self.parse_explain(true)?),

@@ -202,13 +222,13 @@ impl<'a> Parser<'a> {
                 Keyword::EXECUTE => Ok(self.parse_execute()?),
                 Keyword::PREPARE => Ok(self.parse_prepare()?),
                 Keyword::MERGE => Ok(self.parse_merge()?),
-                _ => self.expected("an SQL statement", Token::Word(w)),
+                _ => self.expected("an SQL statement", next_token),
             },
             Token::LParen => {
                 self.prev_token();
                 Ok(Statement::Query(Box::new(self.parse_query()?)))
             }
-            unexpected => self.expected("an SQL statement", unexpected),
+            _ => self.expected("an SQL statement", next_token),
         }
     }

@@ -314,18 +334,20 @@ impl<'a> Parser<'a> {
     pub fn parse_wildcard_expr(&mut self) -> Result<WildcardExpr, ParserError> {
         let index = self.index;

-        match self.next_token() {
-            Token::Word(w) if self.peek_token() == Token::Period => {
+        let next_token = self.next_token();
+        match next_token.token {
+            Token::Word(w) if self.peek_token().token == Token::Period => {
                 let mut id_parts: Vec<Ident> = vec![w.to_ident()];
                 while self.consume_token(&Token::Period) {
-                    match self.next_token() {
+                    let next_token = self.next_token();
+                    match next_token.token {
                         Token::Word(w) => id_parts.push(w.to_ident()),
                         Token::Mul => {
                             return Ok(WildcardExpr::QualifiedWildcard(ObjectName(id_parts)));
                         }
-                        unexpected => {
-                            return self.expected("an identifier or a '*' after '.'", unexpected);
+                        _ => {
+                            return self.expected("an identifier or a '*' after '.'", next_token);
                         }
                     }
                 }
@@ -385,7 +407,7 @@ impl<'a> Parser<'a> {
     pub fn get_next_interval_precedence(&self) -> Result<u8, ParserError> {
         let token = self.peek_token();

-        match token {
+        match token.token {
             Token::Word(w) if w.keyword == Keyword::AND => Ok(0),
             Token::Word(w) if w.keyword == Keyword::OR => Ok(0),
             Token::Word(w) if w.keyword == Keyword::XOR => Ok(0),

@@ -450,7 +472,8 @@ impl<'a> Parser<'a> {
             }
         }));

-        let expr = match self.next_token() {
+        let next_token = self.next_token();
+        let expr = match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TRUE | Keyword::FALSE | Keyword::NULL => {
                     self.prev_token();

@@ -510,15 +533,16 @@ impl<'a> Parser<'a> {
                 }
                 // Here `w` is a word, check if it's a part of a multi-part
                 // identifier, a function call, or a simple identifier:
-                _ => match self.peek_token() {
+                _ => match self.peek_token().token {
                     Token::LParen | Token::Period => {
                         let mut id_parts: Vec<Ident> = vec![w.to_ident()];
                         while self.consume_token(&Token::Period) {
-                            match self.next_token() {
+                            let next_token = self.next_token();
+                            match next_token.token {
                                 Token::Word(w) => id_parts.push(w.to_ident()),
-                                unexpected => {
+                                _ => {
                                     return self
-                                        .expected("an identifier or a '*' after '.'", unexpected);
+                                        .expected("an identifier or a '*' after '.'", next_token);
                                 }
                             }
                         }

@@ -598,7 +622,7 @@ impl<'a> Parser<'a> {
             Ok(expr)
         } else {
             let tok = self.next_token();
-            let key = match tok {
+            let key = match tok.token {
                 Token::Word(word) => word.to_ident(),
                 _ => return parser_err!(format!("Expected identifier, found: {}", tok)),
             };

@@ -612,7 +636,7 @@ impl<'a> Parser<'a> {
                 self.prev_token();
                 Ok(Expr::Value(self.parse_value()?))
             }
-            unexpected => self.expected("an expression:", unexpected),
+            _ => self.expected("an expression:", next_token),
         }?;

         if self.parse_keyword(Keyword::COLLATE) {
@@ -684,14 +708,15 @@ impl<'a> Parser<'a> {
     }

     pub fn parse_window_frame_units(&mut self) -> Result<WindowFrameUnits, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::ROWS => Ok(WindowFrameUnits::Rows),
                 Keyword::RANGE => Ok(WindowFrameUnits::Range),
                 Keyword::GROUPS => Ok(WindowFrameUnits::Groups),
-                _ => self.expected("ROWS, RANGE, GROUPS", Token::Word(w))?,
+                _ => self.expected("ROWS, RANGE, GROUPS", next_token)?,
             },
-            unexpected => self.expected("ROWS, RANGE, GROUPS", unexpected),
+            _ => self.expected("ROWS, RANGE, GROUPS", next_token),
         }
     }

@@ -720,7 +745,7 @@ impl<'a> Parser<'a> {
         let rows = if self.parse_keyword(Keyword::UNBOUNDED) {
             None
         } else {
-            Some(Box::new(match self.peek_token() {
+            Some(Box::new(match self.peek_token().token {
                 Token::SingleQuotedString(_) => self.parse_interval()?,
                 _ => self.parse_expr()?,
             }))

@@ -980,7 +1005,7 @@ impl<'a> Parser<'a> {
     pub fn parse_trim_expr(&mut self) -> Result<Expr, ParserError> {
         self.expect_token(&Token::LParen)?;
         let mut trim_where = None;
-        if let Token::Word(word) = self.peek_token() {
+        if let Token::Word(word) = self.peek_token().token {
             if [Keyword::BOTH, Keyword::LEADING, Keyword::TRAILING]
                 .iter()
                 .any(|d| word.keyword == *d)

@@ -1009,21 +1034,22 @@ impl<'a> Parser<'a> {
     }

     pub fn parse_trim_where(&mut self) -> Result<TrimWhereField, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::BOTH => Ok(TrimWhereField::Both),
                 Keyword::LEADING => Ok(TrimWhereField::Leading),
                 Keyword::TRAILING => Ok(TrimWhereField::Trailing),
-                _ => self.expected("trim_where field", Token::Word(w))?,
+                _ => self.expected("trim_where field", next_token)?,
             },
-            unexpected => self.expected("trim_where field", unexpected),
+            _ => self.expected("trim_where field", next_token),
         }
     }

     /// Parses an array expression `[ex1, ex2, ..]`
     /// if `named` is `true`, came from an expression like `ARRAY[ex1, ex2]`
     pub fn parse_array_expr(&mut self, named: bool) -> Result<Expr, ParserError> {
-        if self.peek_token() == Token::RBracket {
+        if self.peek_token().token == Token::RBracket {
             let _ = self.next_token();
             Ok(Expr::Array(Array {
                 elem: vec![],

@@ -1060,7 +1086,7 @@ impl<'a> Parser<'a> {
                 Some(ListAggOnOverflow::Error)
             } else {
                 self.expect_keyword(Keyword::TRUNCATE)?;
-                let filler = match self.peek_token() {
+                let filler = match self.peek_token().token {
                     Token::Word(w)
                         if w.keyword == Keyword::WITH || w.keyword == Keyword::WITHOUT =>
                     {
@@ -1070,9 +1096,10 @@ impl<'a> Parser<'a> {
                     | Token::EscapedStringLiteral(_)
                     | Token::NationalStringLiteral(_)
                    | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
-                    unexpected => {
-                        self.expected("either filler, WITH, or WITHOUT in LISTAGG", unexpected)?
-                    }
+                    _ => self.expected(
+                        "either filler, WITH, or WITHOUT in LISTAGG",
+                        self.peek_token(),
+                    )?,
                 };
                 let with_count = self.parse_keyword(Keyword::WITH);
                 if !with_count && !self.parse_keyword(Keyword::WITHOUT) {
@@ -1158,7 +1185,8 @@ impl<'a> Parser<'a> {
     // EXTRACT supports a wider set of date/time fields than interval qualifiers,
     // so this function may need to be split in two.
     pub fn parse_date_time_field(&mut self) -> Result<DateTimeField, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::YEAR => Ok(DateTimeField::Year),
                 Keyword::MONTH => Ok(DateTimeField::Month),

@@ -1186,14 +1214,14 @@ impl<'a> Parser<'a> {
                 Keyword::TIMEZONE => Ok(DateTimeField::Timezone),
                 Keyword::TIMEZONE_HOUR => Ok(DateTimeField::TimezoneHour),
                 Keyword::TIMEZONE_MINUTE => Ok(DateTimeField::TimezoneMinute),
-                _ => self.expected("date/time field", Token::Word(w))?,
+                _ => self.expected("date/time field", next_token),
             },
-            unexpected => self.expected("date/time field", unexpected),
+            _ => self.expected("date/time field", next_token),
         }
     }

     pub fn parse_not(&mut self) -> Result<Expr, ParserError> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) => match w.keyword {
                 Keyword::EXISTS => {
                     let negated = true;

@@ -1291,7 +1319,7 @@ impl<'a> Parser<'a> {
         //
         // Note that PostgreSQL allows omitting the qualifier, so we provide
         // this more general implementation.
-        let leading_field = match self.peek_token() {
+        let leading_field = match self.peek_token().token {
             Token::Word(kw)
                 if [
                     Keyword::YEAR,
@@ -1371,7 +1399,7 @@ impl<'a> Parser<'a> {
         let tok = self.next_token();

-        let regular_binary_operator = match &tok {
+        let regular_binary_operator = match &tok.token {
             Token::Spaceship => Some(BinaryOperator::Spaceship),
             Token::DoubleEq => Some(BinaryOperator::Eq),
             Token::Eq => Some(BinaryOperator::Eq),

@@ -1451,7 +1479,7 @@ impl<'a> Parser<'a> {
                     right: Box::new(self.parse_subexpr(precedence)?),
                 })
             }
-        } else if let Token::Word(w) = &tok {
+        } else if let Token::Word(w) = &tok.token {
             match w.keyword {
                 Keyword::IS => {
                     if self.parse_keyword(Keyword::NULL) {

@@ -1489,7 +1517,7 @@ impl<'a> Parser<'a> {
                     // self.expect_keyword(Keyword::ZONE)?;
                     if self.parse_keywords(&[Keyword::TIME, Keyword::ZONE]) {
                         let time_zone = self.next_token();
-                        match time_zone {
+                        match time_zone.token {
                             Token::SingleQuotedString(time_zone) => {
                                 log::trace!("Peek token: {:?}", self.peek_token());
                                 Ok(Expr::AtTimeZone {
@@ -1497,9 +1525,9 @@ impl<'a> Parser<'a> {
                                     time_zone,
                                 })
                             }
-                            tok => self.expected(
+                            _ => self.expected(
                                 "Expected Token::SingleQuotedString after AT TIME ZONE",
-                                tok,
+                                time_zone,
                             ),
                         }
                     } else {

@@ -1544,7 +1572,7 @@ impl<'a> Parser<'a> {
                 }
             }
             // Can only happen if `get_next_precedence` got out of sync with this function
-            _ => parser_err!(format!("No infix parser for token {:?}", tok)),
+            _ => parser_err!(format!("No infix parser for token {:?}", tok.token)),
         }
     } else if Token::DoubleColon == tok {
         self.parse_pg_cast(expr)

@@ -1571,7 +1599,7 @@ impl<'a> Parser<'a> {
             || Token::HashArrow == tok
             || Token::HashLongArrow == tok
         {
-            let operator = match tok {
+            let operator = match tok.token {
                 Token::Arrow => JsonOperator::Arrow,
                 Token::LongArrow => JsonOperator::LongArrow,
                 Token::HashArrow => JsonOperator::HashArrow,
@@ -1585,7 +1613,7 @@ impl<'a> Parser<'a> {
             })
         } else {
             // Can only happen if `get_next_precedence` got out of sync with this function
-            parser_err!(format!("No infix parser for token {:?}", tok))
+            parser_err!(format!("No infix parser for token {:?}", tok.token))
         }
     }
@@ -1713,13 +1741,13 @@ impl<'a> Parser<'a> {
         let token_1 = self.peek_nth_token(1);
         let token_2 = self.peek_nth_token(2);
         debug!("0: {token_0} 1: {token_1} 2: {token_2}");
-        match token {
+        match token.token {
             Token::Word(w) if w.keyword == Keyword::OR => Ok(Self::OR_PREC),
             Token::Word(w) if w.keyword == Keyword::AND => Ok(Self::AND_PREC),
             Token::Word(w) if w.keyword == Keyword::XOR => Ok(Self::XOR_PREC),

             Token::Word(w) if w.keyword == Keyword::AT => {
-                match (self.peek_nth_token(1), self.peek_nth_token(2)) {
+                match (self.peek_nth_token(1).token, self.peek_nth_token(2).token) {
                     (Token::Word(w), Token::Word(w2))
                         if w.keyword == Keyword::TIME && w2.keyword == Keyword::ZONE =>
                     {
@@ -1729,7 +1757,7 @@ impl<'a> Parser<'a> {
                 }
             }

-            Token::Word(w) if w.keyword == Keyword::NOT => match self.peek_nth_token(1) {
+            Token::Word(w) if w.keyword == Keyword::NOT => match self.peek_nth_token(1).token {
                 // The precedence of NOT varies depending on keyword that
                 // follows it. If it is followed by IN, BETWEEN, or LIKE,
                 // it takes on the precedence of those tokens. Otherwise it

@@ -1780,20 +1808,26 @@ impl<'a> Parser<'a> {
     /// Return the first non-whitespace token that has not yet been processed
     /// (or None if reached end-of-file)
-    pub fn peek_token(&self) -> Token {
+    pub fn peek_token(&self) -> TokenWithLocation {
         self.peek_nth_token(0)
     }

     /// Return nth non-whitespace token that has not yet been processed
-    pub fn peek_nth_token(&self, mut n: usize) -> Token {
+    pub fn peek_nth_token(&self, mut n: usize) -> TokenWithLocation {
         let mut index = self.index;
         loop {
             index += 1;
             match self.tokens.get(index - 1) {
-                Some(Token::Whitespace(_)) => continue,
+                Some(TokenWithLocation {
+                    token: Token::Whitespace(_),
+                    location: _,
+                }) => continue,
                 non_whitespace => {
                     if n == 0 {
-                        return non_whitespace.cloned().unwrap_or(Token::EOF);
+                        return non_whitespace.cloned().unwrap_or(TokenWithLocation {
+                            token: Token::EOF,
+                            location: Location { line: 0, column: 0 },
+                        });
                     }
                     n -= 1;
                 }
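With `peek_token` and `next_token` now returning `TokenWithLocation`, the recurring call-site pattern throughout these hunks is: bind the located token, match on its `.token` field, and hand the whole located token back to `expected`. A hypothetical helper showing the shape (`expect_comma` is illustrative, not from the diff):

    use sqlparser::parser::{Parser, ParserError};
    use sqlparser::tokenizer::Token;

    fn expect_comma(parser: &mut Parser) -> Result<(), ParserError> {
        let next_token = parser.next_token(); // TokenWithLocation
        match next_token.token {
            Token::Comma => Ok(()),
            // Pass the located token back so the error can carry line/column later.
            _ => parser.expected("a comma", next_token),
        }
    }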
@@ -1804,18 +1838,25 @@ impl<'a> Parser<'a> {
     /// Return the first non-whitespace token that has not yet been processed
     /// (or None if reached end-of-file) and mark it as processed. OK to call
     /// repeatedly after reaching EOF.
-    pub fn next_token(&mut self) -> Token {
+    pub fn next_token(&mut self) -> TokenWithLocation {
         loop {
             self.index += 1;
             match self.tokens.get(self.index - 1) {
-                Some(Token::Whitespace(_)) => continue,
-                token => return token.cloned().unwrap_or(Token::EOF),
+                Some(TokenWithLocation {
+                    token: Token::Whitespace(_),
+                    location: _,
+                }) => continue,
+                token => {
+                    return token
+                        .cloned()
+                        .unwrap_or_else(|| TokenWithLocation::wrap(Token::EOF))
+                }
             }
         }
     }

     /// Return the first unprocessed token, possibly whitespace.
-    pub fn next_token_no_skip(&mut self) -> Option<&Token> {
+    pub fn next_token_no_skip(&mut self) -> Option<&TokenWithLocation> {
         self.index += 1;
         self.tokens.get(self.index - 1)
     }

@@ -1827,7 +1868,11 @@ impl<'a> Parser<'a> {
         loop {
             assert!(self.index > 0);
             self.index -= 1;
-            if let Some(Token::Whitespace(_)) = self.tokens.get(self.index) {
+            if let Some(TokenWithLocation {
+                token: Token::Whitespace(_),
+                location: _,
+            }) = self.tokens.get(self.index)
+            {
                 continue;
             }
             return;

@@ -1835,14 +1880,14 @@ impl<'a> Parser<'a> {
     }

     /// Report unexpected token
-    pub fn expected<T>(&self, expected: &str, found: Token) -> Result<T, ParserError> {
+    pub fn expected<T>(&self, expected: &str, found: TokenWithLocation) -> Result<T, ParserError> {
         parser_err!(format!("Expected {}, found: {}", expected, found))
     }

     /// Look for an expected keyword and consume it if it exists
     #[must_use]
     pub fn parse_keyword(&mut self, expected: Keyword) -> bool {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) if expected == w.keyword => {
                 self.next_token();
                 true
@@ -1869,7 +1914,7 @@ impl<'a> Parser<'a> {
     /// Look for one of the given keywords and return the one that matches.
     #[must_use]
     pub fn parse_one_of_keywords(&mut self, keywords: &[Keyword]) -> Option<Keyword> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(w) => {
                 keywords
                     .iter()

@@ -1945,7 +1990,7 @@ impl<'a> Parser<'a> {
             // BigQuery allows trailing commas.
             // e.g. `SELECT 1, 2, FROM t`
             // https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#trailing_commas
-            match self.peek_token() {
+            match self.peek_token().token {
                 Token::Word(kw)
                     if keywords::RESERVED_FOR_COLUMN_ALIAS
                         .iter()

@@ -2057,14 +2102,14 @@ impl<'a> Parser<'a> {
         let (mut table_flag, mut options, mut has_as, mut query) = (None, vec![], false, None);
         if self.parse_keyword(Keyword::TABLE) {
             let table_name = self.parse_object_name()?;
-            if self.peek_token() != Token::EOF {
-                if let Token::Word(word) = self.peek_token() {
+            if self.peek_token().token != Token::EOF {
+                if let Token::Word(word) = self.peek_token().token {
                     if word.keyword == Keyword::OPTIONS {
                         options = self.parse_options(Keyword::OPTIONS)?
                     }
                 };

-                if self.peek_token() != Token::EOF {
+                if self.peek_token().token != Token::EOF {
                     let (a, q) = self.parse_as_query()?;
                     has_as = a;
                     query = Some(q);

@@ -2091,7 +2136,7 @@ impl<'a> Parser<'a> {
             if self.parse_keyword(Keyword::TABLE) {
                 let table_name = self.parse_object_name()?;
                 if self.peek_token() != Token::EOF {
-                    if let Token::Word(word) = self.peek_token() {
+                    if let Token::Word(word) = self.peek_token().token {
                         if word.keyword == Keyword::OPTIONS {
                             options = self.parse_options(Keyword::OPTIONS)?
                         }

@@ -2130,7 +2175,7 @@ impl<'a> Parser<'a> {
     /// Parse 'AS' before as query,such as `WITH XXX AS SELECT XXX` oer `CACHE TABLE AS SELECT XXX`
     pub fn parse_as_query(&mut self) -> Result<(bool, Query), ParserError> {
-        match self.peek_token() {
+        match self.peek_token().token {
             Token::Word(word) => match word.keyword {
                 Keyword::AS => {
                     self.next_token();

@@ -2148,7 +2193,7 @@ impl<'a> Parser<'a> {
         if has_table {
             let if_exists = self.parse_keywords(&[Keyword::IF, Keyword::EXISTS]);
             let table_name = self.parse_object_name()?;
-            if self.peek_token() == Token::EOF {
+            if self.peek_token().token == Token::EOF {
                 Ok(Statement::UNCache {
                     table_name,
                     if_exists,

@@ -2248,7 +2293,7 @@ impl<'a> Parser<'a> {
             Keyword::ARCHIVE => Ok(Some(CreateFunctionUsing::Archive(uri))),
             _ => self.expected(
                 "JAR, FILE or ARCHIVE, got {:?}",
-                Token::make_keyword(format!("{:?}", keyword).as_str()),
+                TokenWithLocation::wrap(Token::make_keyword(format!("{:?}", keyword).as_str())),
             ),
         }
     }
@@ -2410,7 +2455,8 @@ impl<'a> Parser<'a> {
     }

     pub fn parse_file_format(&mut self) -> Result<FileFormat, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::AVRO => Ok(FileFormat::AVRO),
                 Keyword::JSONFILE => Ok(FileFormat::JSONFILE),

@@ -2419,21 +2465,22 @@ impl<'a> Parser<'a> {
                 Keyword::RCFILE => Ok(FileFormat::RCFILE),
                 Keyword::SEQUENCEFILE => Ok(FileFormat::SEQUENCEFILE),
                 Keyword::TEXTFILE => Ok(FileFormat::TEXTFILE),
-                _ => self.expected("fileformat", Token::Word(w)),
+                _ => self.expected("fileformat", next_token),
             },
-            unexpected => self.expected("fileformat", unexpected),
+            _ => self.expected("fileformat", next_token),
         }
     }

     pub fn parse_analyze_format(&mut self) -> Result<AnalyzeFormat, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match &next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TEXT => Ok(AnalyzeFormat::TEXT),
                 Keyword::GRAPHVIZ => Ok(AnalyzeFormat::GRAPHVIZ),
                 Keyword::JSON => Ok(AnalyzeFormat::JSON),
-                _ => self.expected("fileformat", Token::Word(w)),
+                _ => self.expected("fileformat", next_token),
             },
-            unexpected => self.expected("fileformat", unexpected),
+            _ => self.expected("fileformat", next_token),
         }
     }

@@ -2949,10 +2996,11 @@ impl<'a> Parser<'a> {
         // Clickhouse has `ON CLUSTER 'cluster'` syntax for DDLs
         let on_cluster = if self.parse_keywords(&[Keyword::ON, Keyword::CLUSTER]) {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(s) => Some(s),
                 Token::Word(s) => Some(s.to_string()),
-                unexpected => self.expected("identifier or cluster literal", unexpected)?,
+                _ => self.expected("identifier or cluster literal", next_token)?,
             }
         } else {
             None

@@ -2990,9 +3038,10 @@ impl<'a> Parser<'a> {
         let engine = if self.parse_keyword(Keyword::ENGINE) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None

@@ -3000,9 +3049,10 @@ impl<'a> Parser<'a> {
         let default_charset = if self.parse_keywords(&[Keyword::DEFAULT, Keyword::CHARSET]) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None

@@ -3010,9 +3060,10 @@ impl<'a> Parser<'a> {
         let collation = if self.parse_keywords(&[Keyword::COLLATE]) {
             self.expect_token(&Token::Eq)?;
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Word(w) => Some(w.value),
-                unexpected => self.expected("identifier", unexpected)?,
+                _ => self.expected("identifier", next_token)?,
             }
         } else {
             None
@@ -3068,7 +3119,7 @@ impl<'a> Parser<'a> {
         loop {
             if let Some(constraint) = self.parse_optional_table_constraint()? {
                 constraints.push(constraint);
-            } else if let Token::Word(_) = self.peek_token() {
+            } else if let Token::Word(_) = self.peek_token().token {
                 columns.push(self.parse_column_def()?);
             } else {
                 return self.expected("column name or constraint definition", self.peek_token());

@@ -3125,9 +3176,10 @@ impl<'a> Parser<'a> {
         } else if self.parse_keywords(&[Keyword::NOT, Keyword::NULL]) {
             Ok(Some(ColumnOption::NotNull))
         } else if self.parse_keywords(&[Keyword::COMMENT]) {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(value, ..) => Ok(Some(ColumnOption::Comment(value))),
-                unexpected => self.expected("string", unexpected),
+                _ => self.expected("string", next_token),
             }
         } else if self.parse_keyword(Keyword::NULL) {
             Ok(Some(ColumnOption::Null))

@@ -3218,7 +3270,9 @@ impl<'a> Parser<'a> {
         } else {
             None
         };
-        match self.next_token() {
+
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(w) if w.keyword == Keyword::PRIMARY || w.keyword == Keyword::UNIQUE => {
                 let is_primary = w.keyword == Keyword::PRIMARY;
                 if is_primary {

@@ -3271,7 +3325,7 @@ impl<'a> Parser<'a> {
             {
                 let display_as_key = w.keyword == Keyword::KEY;

-                let name = match self.peek_token() {
+                let name = match self.peek_token().token {
                     Token::Word(word) if word.keyword == Keyword::USING => None,
                     _ => self.maybe_parse(|parser| parser.parse_identifier()),
                 };

@@ -3297,7 +3351,10 @@ impl<'a> Parser<'a> {
                 if let Some(name) = name {
                     return self.expected(
                         "FULLTEXT or SPATIAL option without constraint name",
-                        Token::make_keyword(&name.to_string()),
+                        TokenWithLocation {
+                            token: Token::make_keyword(&name.to_string()),
+                            location: next_token.location,
+                        },
                     );
                 }

@@ -3322,9 +3379,9 @@ impl<'a> Parser<'a> {
                     columns,
                 }))
             }
-            unexpected => {
+            _ => {
                 if name.is_some() {
-                    self.expected("PRIMARY, UNIQUE, FOREIGN, or CHECK", unexpected)
+                    self.expected("PRIMARY, UNIQUE, FOREIGN, or CHECK", next_token)
                 } else {
                     self.prev_token();
                     Ok(None)
@@ -3712,7 +3769,7 @@ impl<'a> Parser<'a> {
     pub fn parse_tab_value(&mut self) -> Vec<Option<String>> {
         let mut values = vec![];
         let mut content = String::from("");
-        while let Some(t) = self.next_token_no_skip() {
+        while let Some(t) = self.next_token_no_skip().map(|t| &t.token) {
             match t {
                 Token::Whitespace(Whitespace::Tab) => {
                     values.push(Some(content.to_string()));

@@ -3726,7 +3783,7 @@ impl<'a> Parser<'a> {
                     if self.consume_token(&Token::Period) {
                         return values;
                     }
-                    if let Token::Word(w) = self.next_token() {
+                    if let Token::Word(w) = self.next_token().token {
                         if w.value == "N" {
                             values.push(None);
                         }

@@ -3742,7 +3799,9 @@ impl<'a> Parser<'a> {
     /// Parse a literal value (numbers, strings, date/time, booleans)
     pub fn parse_value(&mut self) -> Result<Value, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        let location = next_token.location;
+        match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::TRUE => Ok(Value::Boolean(true)),
                 Keyword::FALSE => Ok(Value::Boolean(false)),

@@ -3750,13 +3809,25 @@ impl<'a> Parser<'a> {
                 Keyword::NoKeyword if w.quote_style.is_some() => match w.quote_style {
                     Some('"') => Ok(Value::DoubleQuotedString(w.value)),
                     Some('\'') => Ok(Value::SingleQuotedString(w.value)),
-                    _ => self.expected("A value?", Token::Word(w))?,
+                    _ => self.expected(
+                        "A value?",
+                        TokenWithLocation {
+                            token: Token::Word(w),
+                            location,
+                        },
+                    )?,
                 },
                 // Case when Snowflake Semi-structured data like key:value
                 Keyword::NoKeyword | Keyword::LOCATION if dialect_of!(self is SnowflakeDialect | GenericDialect) => {
                     Ok(Value::UnQuotedString(w.value))
                 }
-                _ => self.expected("a concrete value", Token::Word(w)),
+                _ => self.expected(
+                    "a concrete value",
+                    TokenWithLocation {
+                        token: Token::Word(w),
+                        location,
+                    },
+                ),
             },
             // The call to n.parse() returns a bigdecimal when the
             // bigdecimal feature is enabled, and is otherwise a no-op

@@ -3776,7 +3847,13 @@ impl<'a> Parser<'a> {
                 let placeholder = tok.to_string() + &ident.value;
                 Ok(Value::Placeholder(placeholder))
             }
-            unexpected => self.expected("a value", unexpected),
+            unexpected => self.expected(
+                "a value",
+                TokenWithLocation {
+                    token: unexpected,
+                    location,
+                },
+            ),
         }
     }
@@ -3793,30 +3870,33 @@ impl<'a> Parser<'a> {
     /// Parse an unsigned literal integer/long
     pub fn parse_literal_uint(&mut self) -> Result<u64, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Number(s, _) => s.parse::<u64>().map_err(|e| {
                 ParserError::ParserError(format!("Could not parse '{}' as u64: {}", s, e))
             }),
-            unexpected => self.expected("literal int", unexpected),
+            _ => self.expected("literal int", next_token),
         }
     }

     /// Parse a literal string
     pub fn parse_literal_string(&mut self) -> Result<String, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value),
             Token::SingleQuotedString(s) => Ok(s),
             Token::DoubleQuotedString(s) => Ok(s),
             Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
                 Ok(s)
             }
-            unexpected => self.expected("literal string", unexpected),
+            _ => self.expected("literal string", next_token),
         }
     }

     /// Parse a map key string
     pub fn parse_map_key(&mut self) -> Result<Expr, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => {
                 if self.peek_token() == Token::LParen {
                     return self.parse_function(ObjectName(vec![Ident::new(value)]));

@@ -3828,13 +3908,14 @@ impl<'a> Parser<'a> {
             Token::Number(s, _) => Ok(Expr::Value(Value::Number(s, false))),
             #[cfg(feature = "bigdecimal")]
             Token::Number(s, _) => Ok(Expr::Value(Value::Number(s.parse().unwrap(), false))),
-            unexpected => self.expected("literal string, number or function", unexpected),
+            _ => self.expected("literal string, number or function", next_token),
         }
     }

     /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example)
     pub fn parse_data_type(&mut self) -> Result<DataType, ParserError> {
-        let mut data = match self.next_token() {
+        let next_token = self.next_token();
+        let mut data = match next_token.token {
             Token::Word(w) => match w.keyword {
                 Keyword::BOOLEAN => Ok(DataType::Boolean),
                 Keyword::FLOAT => Ok(DataType::Float(self.parse_optional_precision()?)),

@@ -4003,7 +4084,7 @@ impl<'a> Parser<'a> {
                    }
                }
            },
-            unexpected => self.expected("a data type name", unexpected),
+            _ => self.expected("a data type name", next_token),
         }?;

         // Parse array data types. Note: this is postgresql-specific and different from

@@ -4019,14 +4100,16 @@ impl<'a> Parser<'a> {
         self.expect_token(&Token::LParen)?;
         let mut values = Vec::new();
         loop {
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::SingleQuotedString(value) => values.push(value),
-                unexpected => self.expected("a string", unexpected)?,
+                _ => self.expected("a string", next_token)?,
             }
-            match self.next_token() {
+            let next_token = self.next_token();
+            match next_token.token {
                 Token::Comma => (),
                 Token::RParen => break,
-                unexpected => self.expected(", or }", unexpected)?,
+                _ => self.expected(", or }", next_token)?,
             }
         }
         Ok(values)
@@ -4040,7 +4123,8 @@ impl<'a> Parser<'a> {
         reserved_kwds: &[Keyword],
     ) -> Result<Option<Ident>, ParserError> {
         let after_as = self.parse_keyword(Keyword::AS);
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             // Accept any identifier after `AS` (though many dialects have restrictions on
             // keywords that may appear here). If there's no `AS`: don't parse keywords,
             // which may start a construct allowed in this position, to be parsed as aliases.

@@ -4064,9 +4148,9 @@ impl<'a> Parser<'a> {
             Token::SingleQuotedString(s) => Ok(Some(Ident::with_quote('\'', s))),
             // Support for MySql dialect double qouted string, `AS "HOUR"` for example
             Token::DoubleQuotedString(s) => Ok(Some(Ident::with_quote('\"', s))),
-            not_an_ident => {
+            _ => {
                 if after_as {
-                    return self.expected("an identifier after AS", not_an_ident);
+                    return self.expected("an identifier after AS", next_token);
                 }
                 self.prev_token();
                 Ok(None) // no alias found

@@ -4108,7 +4192,7 @@ impl<'a> Parser<'a> {
     pub fn parse_identifiers(&mut self) -> Result<Vec<Ident>, ParserError> {
         let mut idents = vec![];
         loop {
-            match self.peek_token() {
+            match self.peek_token().token {
                 Token::Word(w) => {
                     idents.push(w.to_ident());
                 }

@@ -4122,11 +4206,12 @@ impl<'a> Parser<'a> {
     /// Parse a simple one-word identifier (possibly quoted, possibly a keyword)
     pub fn parse_identifier(&mut self) -> Result<Ident, ParserError> {
-        match self.next_token() {
+        let next_token = self.next_token();
+        match next_token.token {
             Token::Word(w) => Ok(w.to_ident()),
             Token::SingleQuotedString(s) => Ok(Ident::with_quote('\'', s)),
             Token::DoubleQuotedString(s) => Ok(Ident::with_quote('\"', s)),
-            unexpected => self.expected("identifier", unexpected),
+            _ => self.expected("identifier", next_token),
         }
     }

@@ -4231,7 +4316,8 @@ impl<'a> Parser<'a> {
         if self.consume_token(&Token::LParen) {
             let mut modifiers = Vec::new();
             loop {
-                match self.next_token() {
+                let next_token = self.next_token();
+                match next_token.token {
                     Token::Word(w) => modifiers.push(w.to_string()),
                     Token::Number(n, _) => modifiers.push(n),
                     Token::SingleQuotedString(s) => modifiers.push(s),

@@ -4242,7 +4328,7 @@ impl<'a> Parser<'a> {
                     Token::RParen => {
                         break;
                     }
-                    unexpected => self.expected("type modifiers", unexpected)?,
+                    _ => self.expected("type modifiers", next_token)?,
                 }
             }

@@ -4485,7 +4571,7 @@ impl<'a> Parser<'a> {
         loop {
             // The query can be optionally followed by a set operator:
-            let op = self.parse_set_operator(&self.peek_token());
+            let op = self.parse_set_operator(&self.peek_token().token);
             let next_precedence = match op {
                 // UNION and EXCEPT have the same binding power and evaluate left-to-right
                 Some(SetOperator::Union) | Some(SetOperator::Except) => 10,
@@ -4683,7 +4769,7 @@ impl<'a> Parser<'a> {
         let table_name;
         let schema_name;
         if token2 == Token::Period {
-            match token1 {
+            match token1.token {
                 Token::Word(w) => {
                     schema_name = w.value;
                 }

@@ -4691,7 +4777,7 @@ impl<'a> Parser<'a> {
                     return self.expected("Schema name", token1);
                 }
            }
-            match token3 {
+            match token3.token {
                Token::Word(w) => {
                    table_name = w.value;
                }

@@ -4704,7 +4790,7 @@ impl<'a> Parser<'a> {
                 schema_name: Some(schema_name),
             })
         } else {
-            match token1 {
+            match token1.token {
                 Token::Word(w) => {
                     table_name = w.value;
                 }

@@ -4986,7 +5072,7 @@ impl<'a> Parser<'a> {
             }
         } else {
             let natural = self.parse_keyword(Keyword::NATURAL);
-            let peek_keyword = if let Token::Word(w) = self.peek_token() {
+            let peek_keyword = if let Token::Word(w) = self.peek_token().token {
                 w.keyword
             } else {
                 Keyword::NoKeyword

@@ -6704,4 +6790,31 @@ mod tests {
             r#"UPDATE test SET name = $1, value = $2, where = $3, create = $4, is_default = $5, classification = $6, sort = $7 WHERE id = $8"#
         );
     }
+
+    #[test]
+    fn test_tokenizer_error_loc() {
+        let sql = "foo '";
+        let ast = Parser::parse_sql(&GenericDialect, sql);
+        assert_eq!(
+            ast,
+            Err(ParserError::TokenizerError(
+                "Unterminated string literal at Line: 1, Column 5".to_string()
+            ))
+        );
+    }
+
+    #[test]
+    fn test_parser_error_loc() {
+        // TODO: Once we thread token locations through the parser, we should update this
+        // test to assert the locations of the referenced token
+        let sql = "SELECT this is a syntax error";
+        let ast = Parser::parse_sql(&GenericDialect, sql);
+        assert_eq!(
+            ast,
+            Err(ParserError::ParserError(
+                "Expected [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a"
+                    .to_string()
+            ))
+        );
+    }
 }

src/tokenizer.rs

@@ -300,6 +300,53 @@ impl fmt::Display for Whitespace {
     }
 }

+/// Location in input string
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct Location {
+    /// Line number, starting from 1
+    pub line: u64,
+    /// Line column, starting from 1
+    pub column: u64,
+}
+
+/// A [Token] with [Location] attached to it
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct TokenWithLocation {
+    pub token: Token,
+    pub location: Location,
+}
+
+impl TokenWithLocation {
+    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
+        TokenWithLocation {
+            token,
+            location: Location { line, column },
+        }
+    }
+
+    pub fn wrap(token: Token) -> TokenWithLocation {
+        TokenWithLocation::new(token, 0, 0)
+    }
+}
+
+impl PartialEq<Token> for TokenWithLocation {
+    fn eq(&self, other: &Token) -> bool {
+        &self.token == other
+    }
+}
+
+impl PartialEq<TokenWithLocation> for Token {
+    fn eq(&self, other: &TokenWithLocation) -> bool {
+        self == &other.token
+    }
+}
+
+impl fmt::Display for TokenWithLocation {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.token.fmt(f)
+    }
+}
+
 /// Tokenizer error
 #[derive(Debug, PartialEq, Eq)]
 pub struct TokenizerError {
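The symmetric `PartialEq` impls are what let unmodified comparisons such as `self.peek_token() == Token::EOF` in the parser keep compiling, and the `Display` impl keeps error messages byte-for-byte identical. A small sketch:

    use sqlparser::tokenizer::{Token, TokenWithLocation};

    fn main() {
        let t = TokenWithLocation::new(Token::Comma, 1, 9);

        // Token and TokenWithLocation compare in either direction.
        assert!(t == Token::Comma);
        assert!(Token::Comma == t);

        // Display delegates to the wrapped token.
        assert_eq!(t.to_string(), ",");
    }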
@@ -321,58 +368,88 @@ impl fmt::Display for TokenizerError {
 #[cfg(feature = "std")]
 impl std::error::Error for TokenizerError {}

+struct State<'a> {
+    peekable: Peekable<Chars<'a>>,
+    pub line: u64,
+    pub col: u64,
+}
+
+impl<'a> State<'a> {
+    pub fn next(&mut self) -> Option<char> {
+        match self.peekable.next() {
+            None => None,
+            Some(s) => {
+                if s == '\n' {
+                    self.line += 1;
+                    self.col = 1;
+                } else {
+                    self.col += 1;
+                }
+                Some(s)
+            }
+        }
+    }
+
+    pub fn peek(&mut self) -> Option<&char> {
+        self.peekable.peek()
+    }
+
+    pub fn location(&self) -> Location {
+        Location {
+            line: self.line,
+            column: self.col,
+        }
+    }
+}
+
 /// SQL Tokenizer
 pub struct Tokenizer<'a> {
     dialect: &'a dyn Dialect,
     query: &'a str,
-    line: u64,
-    col: u64,
 }

 impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
     pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
-        Self {
-            dialect,
-            query,
-            line: 1,
-            col: 1,
-        }
+        Self { dialect, query }
     }

     /// Tokenize the statement and produce a vector of tokens
     pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
-        let mut peekable = self.query.chars().peekable();
-        let mut tokens: Vec<Token> = vec![];
-
-        while let Some(token) = self.next_token(&mut peekable)? {
-            match &token {
-                Token::Whitespace(Whitespace::Newline) => {
-                    self.line += 1;
-                    self.col = 1;
-                }
-
-                Token::Whitespace(Whitespace::Tab) => self.col += 4,
-                Token::Word(w) => {
-                    self.col += w.value.chars().count() as u64;
-                    if w.quote_style.is_some() {
-                        self.col += 2
-                    }
-                }
-                Token::Number(s, _) => self.col += s.chars().count() as u64,
-                Token::SingleQuotedString(s) => self.col += s.chars().count() as u64,
-                Token::Placeholder(s) => self.col += s.chars().count() as u64,
-                _ => self.col += 1,
-            }
-
-            tokens.push(token);
+        let twl = self.tokenize_with_location()?;
+
+        let mut tokens: Vec<Token> = vec![];
+        tokens.reserve(twl.len());
+        for token_with_location in twl {
+            tokens.push(token_with_location.token);
+        }
+        Ok(tokens)
+    }
+
+    /// Tokenize the statement and produce a vector of tokens with location information
+    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
+        let mut state = State {
+            peekable: self.query.chars().peekable(),
+            line: 1,
+            col: 1,
+        };
+
+        let mut tokens: Vec<TokenWithLocation> = vec![];
+
+        let mut location = state.location();
+        while let Some(token) = self.next_token(&mut state)? {
+            tokens.push(TokenWithLocation {
+                token,
+                location: location.clone(),
+            });
+
+            location = state.location();
         }
         Ok(tokens)
     }

     /// Get the next token or return None
-    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
         //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
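`State::next` replaces the old per-token column arithmetic in `tokenize` (which guessed widths for tabs, quoted words, and string literals) with one per-character rule. A standalone sketch of that rule, mirroring the private `State` for illustration only, and consistent with the `tokenize_with_location` test at the bottom of this file:

    // '\n' moves to the next line and resets the column; any other char
    // advances the column by one.
    fn advance(line: &mut u64, col: &mut u64, ch: char) {
        if ch == '\n' {
            *line += 1;
            *col = 1;
        } else {
            *col += 1;
        }
    }

    fn main() {
        let (mut line, mut col) = (1u64, 1u64);
        for ch in "SELECT a,\n b".chars() {
            advance(&mut line, &mut col, ch);
        }
        // The cursor ends just past "b", which therefore starts at line 2, column 2.
        assert_eq!((line, col), (2, 3));
    }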
@@ -405,10 +482,12 @@ impl<'a> Tokenizer<'a> {
                 }
                 // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                 x @ 'e' | x @ 'E' => {
+                    let starting_loc = chars.location();
                     chars.next(); // consume, to check the next char
                     match chars.peek() {
                         Some('\'') => {
-                            let s = self.tokenize_escaped_single_quoted_string(chars)?;
+                            let s =
+                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                             Ok(Some(Token::EscapedStringLiteral(s)))
                         }
                         _ => {

@@ -441,7 +520,12 @@ impl<'a> Tokenizer<'a> {
                     let s = self.tokenize_word(ch, chars);

                     if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
-                        let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
+                        let mut inner_state = State {
+                            peekable: s.chars().peekable(),
+                            line: 0,
+                            col: 0,
+                        };
+                        let mut s = peeking_take_while(&mut inner_state, |ch| {
                             matches!(ch, '0'..='9' | '.')
                         });
                         let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));

@@ -469,8 +553,9 @@ impl<'a> Tokenizer<'a> {
                     if self.dialect.is_delimited_identifier_start(ch)
                         && self
                             .dialect
-                            .is_proper_identifier_inside_quotes(chars.clone()) =>
+                            .is_proper_identifier_inside_quotes(chars.peekable.clone()) =>
                 {
+                    let error_loc = chars.location();
                     chars.next(); // consume the opening quote
                     let quote_end = Word::matching_end_quote(quote_start);
                     let (s, last_char) = parse_quoted_ident(chars, quote_end);

@@ -478,10 +563,10 @@ impl<'a> Tokenizer<'a> {
                     if last_char == Some(quote_end) {
                         Ok(Some(Token::make_word(&s, Some(quote_start))))
                     } else {
-                        self.tokenizer_error(format!(
-                            "Expected close delimiter '{}' before EOF.",
-                            quote_end
-                        ))
+                        self.tokenizer_error(
+                            error_loc,
+                            format!("Expected close delimiter '{}' before EOF.", quote_end),
+                        )
                     }
                 }
                 // numbers and period
@@ -698,16 +783,20 @@ impl<'a> Tokenizer<'a> {
         }
     }

-    fn tokenizer_error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
+    fn tokenizer_error<R>(
+        &self,
+        loc: Location,
+        message: impl Into<String>,
+    ) -> Result<R, TokenizerError> {
         Err(TokenizerError {
             message: message.into(),
-            col: self.col,
-            line: self.line,
+            col: loc.column,
+            line: loc.line,
         })
     }

     // Consume characters until newline
-    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
+    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
         let mut comment = peeking_take_while(chars, |ch| ch != '\n');
         if let Some(ch) = chars.next() {
             assert_eq!(ch, '\n');

@@ -717,7 +806,7 @@ impl<'a> Tokenizer<'a> {
     }

     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
+    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
         let mut s = first_char.to_string();
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)

@@ -728,9 +817,13 @@ impl<'a> Tokenizer<'a> {
     /// Read a single quoted string, starting with the opening quote.
     fn tokenize_escaped_single_quoted_string(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        starting_loc: Location,
+        chars: &mut State,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
+        // This case is a bit tricky
         chars.next(); // consume the opening quote

         // slash escaping
@@ -782,16 +875,18 @@ impl<'a> Tokenizer<'a> {
                 }
             }
         }
-        self.tokenizer_error("Unterminated encoded string literal")
+        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
     }

     /// Read a single quoted string, starting with the opening quote.
     fn tokenize_quoted_string(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
         quote_style: char,
     ) -> Result<String, TokenizerError> {
         let mut s = String::new();
+        let error_loc = chars.location();
+
         chars.next(); // consume the opening quote

         // slash escaping is specific to MySQL dialect

@@ -824,12 +919,12 @@ impl<'a> Tokenizer<'a> {
                 }
             }
         }
-        self.tokenizer_error("Unterminated string literal")
+        self.tokenizer_error(error_loc, "Unterminated string literal")
     }

     fn tokenize_multiline_comment(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut nested = 1;

@@ -850,7 +945,12 @@ impl<'a> Tokenizer<'a> {
                     s.push(ch);
                     last_ch = ch;
                 }
-                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
+                None => {
+                    break self.tokenizer_error(
+                        chars.location(),
+                        "Unexpected EOF while in a multi-line comment",
+                    )
+                }
             }
         }
     }
@@ -858,7 +958,7 @@ impl<'a> Tokenizer<'a> {
     #[allow(clippy::unnecessary_wraps)]
     fn consume_and_return(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        chars: &mut State,
         t: Token,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next();

@@ -869,10 +969,7 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(
-    chars: &mut Peekable<Chars<'_>>,
-    mut predicate: impl FnMut(char) -> bool,
-) -> String {
+fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
     let mut s = String::new();
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {

@@ -885,7 +982,7 @@ fn peeking_take_while(
     s
 }

-fn parse_quoted_ident(chars: &mut Peekable<Chars<'_>>, quote_end: char) -> (String, Option<char>) {
+fn parse_quoted_ident(chars: &mut State, quote_end: char) -> (String, Option<char>) {
     let mut last_char = None;
     let mut s = String::new();
     while let Some(ch) = chars.next() {
@@ -1518,7 +1615,25 @@ mod tests {
         compare(expected, tokens);
     }

-    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
+    #[test]
+    fn tokenize_with_location() {
+        let sql = "SELECT a,\n b";
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, sql);
+        let tokens = tokenizer.tokenize_with_location().unwrap();
+        let expected = vec![
+            TokenWithLocation::new(Token::make_keyword("SELECT"), 1, 1),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 1, 7),
+            TokenWithLocation::new(Token::make_word("a", None), 1, 8),
+            TokenWithLocation::new(Token::Comma, 1, 9),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Newline), 1, 10),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 1),
+            TokenWithLocation::new(Token::make_word("b", None), 2, 2),
+        ];
+        compare(expected, tokens);
+    }
+
+    fn compare<T: PartialEq + std::fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
         //println!("------------------------------");
         //println!("tokens = {:?}", actual);
         //println!("expected = {:?}", expected);