// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
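//!
//! Example (mirroring this module's own tests):
//!
//! ```
//! use sqlparser::dialect::GenericDialect;
//! use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};
//!
//! let dialect = GenericDialect {};
//! let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
//! assert_eq!(
//!     tokens,
//!     vec![
//!         Token::make_keyword("SELECT"),
//!         Token::Whitespace(Whitespace::Space),
//!         Token::Number(String::from("1"), false),
//!     ]
//! );
//! ```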

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::fmt;
use core::iter::Peekable;
use core::str::Chars;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

use crate::dialect::SnowflakeDialect;
use crate::dialect::{Dialect, MySqlDialect};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator `<=>`
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Modulo operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator
    Sharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*`, a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~`, a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*`, a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator
    AtSign,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$`, a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as an operator to extract a JSON field in PostgreSQL
    Arrow,
    /// `->>`, used as an operator to extract a JSON field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts the JSON sub-object at the specified path
    HashArrow,
    /// `#>>`, extracts the JSON sub-object at the specified path as text
    HashLongArrow,
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{}", w),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{}", c),
            Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
            Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{}", ws),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::AtSign => f.write_str("@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::Placeholder(ref s) => write!(f, "{}", s),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

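    /// Build a `Token::Word` from `word`, resolving unquoted words against the
    /// keyword list; quoted words never match a keyword. A minimal sketch:
    ///
    /// ```
    /// # use sqlparser::keywords::Keyword;
    /// # use sqlparser::tokenizer::Token;
    /// if let Token::Word(w) = Token::make_word("SELECT", None) {
    ///     assert_eq!(w.keyword, Keyword::SELECT);
    /// }
    /// if let Token::Word(w) = Token::make_word("SELECT", Some('"')) {
    ///     assert_eq!(w.keyword, Keyword::NoKeyword);
    /// }
    /// ```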
    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style == None {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise empty
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment),
            Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s),
        }
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq)]
pub struct TokenizerError {
    pub message: String,
    pub line: u64,
    pub col: u64,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "{} at Line: {}, Column {}",
            self.message, self.line, self.col
        )
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    line: u64,
    col: u64,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            line: 1,
            col: 1,
        }
    }

    /// Tokenize the statement and produce a vector of tokens
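    ///
    /// Line and column counters are updated as each token is consumed, so a
    /// `TokenizerError` can report where tokenization failed.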
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
                Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
                Token::Number(s, _) => self.col += s.len() as u64,
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                Token::Placeholder(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token or return None
    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
        //println!("next_token: {:?}", chars.peek());
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                'N' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word('N', chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string(chars)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
                    let s = self.tokenize_word(ch, chars);

                    if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
                        let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| {
                            matches!(ch, '0'..='9' | '.')
                        });
                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
                        s += s2.as_str();
                        return Ok(Some(Token::Number(s, false)));
                    }
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    let s = self.tokenize_single_quoted_string(chars)?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start
                    if self.dialect.is_delimited_identifier_start(ch)
                        && self
                            .dialect
                            .is_proper_identifier_inside_quotes(chars.clone()) =>
                {
                    chars.next(); // consume the opening quote
                    let quote_end = Word::matching_end_quote(quote_start);
                    let (s, last_char) = parse_quoted_ident(chars, quote_end);

                    if last_char == Some(quote_end) {
                        Ok(Some(Token::make_word(&s, Some(quote_start))))
                    } else {
                        self.tokenizer_error(format!(
                            "Expected close delimiter '{}' before EOF.",
                            quote_end
                        ))
                    }
                }
                // numbers and period
                '0'..='9' | '.' => {
                    let mut s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // match binary literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_take_while(
                            chars,
                            |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'),
                        );
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }
                    s += &peeking_take_while(chars, |ch| matches!(ch, '0'..='9'));

                    // No number -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'
                    match chars.peek() {
                        Some('-') => {
                            chars.next(); // consume the second '-', starting a single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "--".to_owned(),
                                comment,
                            })))
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::LongArrow))
                                }
                                _ => Ok(Some(Token::Arrow)),
                            }
                        }
                        // a regular '-' operator
                        _ => Ok(Some(Token::Minus)),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => self.consume_and_return(chars, Token::Mod),
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
                                _ => Ok(Some(Token::StringConcat)),
                            }
                        }
                        // Bitwise '|' operator
                        _ => Ok(Some(Token::Pipe)),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_and_return(chars, Token::Spaceship),
                                _ => Ok(Some(Token::LtEq)),
                            }
                        }
                        Some('>') => self.consume_and_return(chars, Token::Neq),
                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
                        _ => Ok(Some(Token::Lt)),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::GtEq),
                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
                        _ => Ok(Some(Token::Gt)),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '^' => self.consume_and_return(chars, Token::Caret),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect) => {
                    chars.next(); // consume the '#', starting a snowflake single-line comment
                    let comment = self.tokenize_single_line_comment(chars);
                    Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "#".to_owned(),
                        comment,
                    })))
                }
                '~' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('*') => self.consume_and_return(chars, Token::TildeAsterisk),
                        _ => Ok(Some(Token::Tilde)),
                    }
                }
                '#' => {
                    chars.next();
                    match chars.peek() {
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => {
                                    chars.next();
                                    Ok(Some(Token::HashLongArrow))
                                }
                                _ => Ok(Some(Token::HashArrow)),
                            }
                        }
                        _ => Ok(Some(Token::Sharp)),
                    }
                }
                '@' => self.consume_and_return(chars, Token::AtSign),
                '?' => self.consume_and_return(chars, Token::Placeholder(String::from("?"))),
                '$' => {
                    chars.next();
                    let s = peeking_take_while(
                        chars,
                        |ch| matches!(ch, '0'..='9' | 'A'..='Z' | 'a'..='z'),
                    );
                    Ok(Some(Token::Placeholder(String::from("$") + &s)))
                }
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    fn tokenizer_error<R>(&self, message: impl Into<String>) -> Result<R, TokenizerError> {
        Err(TokenizerError {
            message: message.into(),
            col: self.col,
            line: self.line,
        })
    }

    // Consume characters until newline
    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
        if let Some(ch) = chars.next() {
            assert_eq!(ch, '\n');
            comment.push(ch);
        }
        comment
    }

    /// Tokenize an identifier or keyword, after the first char is already consumed.
    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
        let mut s = first_char.to_string();
        s.push_str(&peeking_take_while(chars, |ch| {
            self.dialect.is_identifier_part(ch)
        }));
        s
    }

    /// Read a single quoted string, starting with the opening quote.
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        chars.next(); // consume the opening quote

        // slash escaping is specific to MySQL dialect
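        // e.g. 'It''s' yields the string `It's` in every dialect, while the
        // backslash form 'It\'s' is only recognized under MySqlDialect.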
        let mut is_escaped = false;
        while let Some(&ch) = chars.peek() {
            match ch {
                '\'' => {
                    chars.next(); // consume
                    if is_escaped {
                        s.push(ch);
                        is_escaped = false;
                    } else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
                        s.push(ch);
                        chars.next();
                    } else {
                        return Ok(s);
                    }
                }
                '\\' => {
                    if dialect_of!(self is MySqlDialect) {
                        is_escaped = !is_escaped;
                    } else {
                        s.push(ch);
                    }
                    chars.next();
                }
                _ => {
                    chars.next(); // consume
                    s.push(ch);
                }
            }
        }
        self.tokenizer_error("Unterminated string literal")
    }

    fn tokenize_multiline_comment(
        &self,
        chars: &mut Peekable<Chars<'_>>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut maybe_closing_comment = false;
        // TODO: deal with nested comments
        loop {
            match chars.next() {
                Some(ch) => {
                    if maybe_closing_comment {
                        if ch == '/' {
                            break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                        } else {
                            s.push('*');
                        }
                    }
                    maybe_closing_comment = ch == '*';
                    if !maybe_closing_comment {
                        s.push(ch);
                    }
                }
                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
            }
        }
    }

    #[allow(clippy::unnecessary_wraps)]
    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars<'_>>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}

/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
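///
/// For example, with input `"123abc"` and a digit predicate this returns
/// `"123"` and leaves `'a'` as the next peekable char.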
fn peeking_take_while(
    chars: &mut Peekable<Chars<'_>>,
    mut predicate: impl FnMut(char) -> bool,
) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}

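/// Read a delimited identifier until the closing `quote_end`, treating a
/// doubled `quote_end` as an escaped literal quote character. Returns the
/// accumulated string and the last delimiter consumed (`None` if EOF was
/// reached before the identifier was closed).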
fn parse_quoted_ident(chars: &mut Peekable<Chars<'_>>, quote_end: char) -> (String, Option<char>) {
    let mut last_char = None;
    let mut s = String::new();
    while let Some(ch) = chars.next() {
        if ch == quote_end {
            if chars.peek() == Some(&quote_end) {
                chars.next();
                s.push(ch);
            } else {
                last_char = Some(quote_end);
                break;
            }
        } else {
            s.push(ch);
        }
    }
    (s, last_char)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::dialect::{GenericDialect, MsSqlDialect};

    #[test]
    fn tokenizer_error_impl() {
        let err = TokenizerError {
            message: "test".into(),
            line: 1,
            col: 1,
        };
        #[cfg(feature = "std")]
        {
            use std::error::Error;
            assert!(err.source().is_none());
        }
        assert_eq!(err.to_string(), "test at Line: 1, Column 1");
    }

    #[test]
    fn tokenize_select_1() {
        let sql = String::from("SELECT 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_select_float() {
        let sql = String::from("SELECT .1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from(".1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let sql = String::from("SELECT sqrt(1)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number(String::from("1"), false),
            Token::RParen,
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_string_concat() {
        let sql = String::from("SELECT 'a' || 'b'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("a")),
            Token::Whitespace(Whitespace::Space),
            Token::StringConcat,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("b")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_bitwise_op() {
        let sql = String::from("SELECT one | two ^ three");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("one", None),
            Token::Whitespace(Whitespace::Space),
            Token::Pipe,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("two", None),
            Token::Whitespace(Whitespace::Space),
            Token::Caret,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("three", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_logical_xor() {
        let sql =
            String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("false"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("XOR"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("true"),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("LIMIT"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_select() {
        let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_explain_analyze_select() {
        let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("EXPLAIN"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("ANALYZE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("id", None),
            Token::Whitespace(Whitespace::Space),
            Token::Eq,
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("customer", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("WHERE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("salary", None),
            Token::Whitespace(Whitespace::Space),
            Token::Neq,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString(String::from("Not Provided")),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string() {
        let sql = String::from("\nمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_newline_in_string_literal() {
        let sql = String::from("'foo\r\nbar\nbaz'");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_unterminated_string_literal() {
        let sql = String::from("select 'foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Unterminated string literal".to_string(),
                line: 1,
                col: 8
            })
        );
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        // println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_right_arrow() {
        let sql = String::from("FUNCTION(key=>value)");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("FUNCTION", None),
            Token::LParen,
            Token::make_word("key", None),
            Token::RArrow,
            Token::make_word("value", None),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let sql = String::from("a IS NULL");
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();

        let expected = vec![
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("IS"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("NULL"),
        ];

        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment() {
        let sql = String::from("0--this is a comment\n1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "--".to_string(),
                comment: "this is a comment\n".to_string(),
            }),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_comment_at_eof() {
        let sql = String::from("--this is a comment");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
            prefix: "--".to_string(),
            comment: "this is a comment".to_string(),
        })];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment() {
        let sql = String::from("0/*multi-line\n* /comment*/1");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Number("0".to_string(), false),
            Token::Whitespace(Whitespace::MultiLineComment(
                "multi-line\n* /comment".to_string(),
            )),
            Token::Number("1".to_string(), false),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_multiline_comment_with_even_asterisks() {
        let sql = String::from("\n/** Comment **/\n");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mismatched_quotes() {
        let sql = String::from("\"foo");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        assert_eq!(
            tokenizer.tokenize(),
            Err(TokenizerError {
                message: "Expected close delimiter '\"' before EOF.".to_string(),
                line: 1,
                col: 1
            })
        );
    }

    #[test]
    fn tokenize_newlines() {
        let sql = String::from("line1\nline2\rline3\r\nline4\r");

        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, &sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_word("line1", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line2", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line3", None),
            Token::Whitespace(Whitespace::Newline),
            Token::make_word("line4", None),
            Token::Whitespace(Whitespace::Newline),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_mssql_top() {
        let sql = "SELECT TOP 5 [bar] FROM foo";
        let dialect = MsSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("TOP"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("5"), false),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("bar", Some('[')),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_pg_regex_match() {
        let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::Tilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::TildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTilde,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
            Token::Comma,
            Token::Whitespace(Whitespace::Space),
            Token::make_word("col", None),
            Token::Whitespace(Whitespace::Space),
            Token::ExclamationMarkTildeAsterisk,
            Token::Whitespace(Whitespace::Space),
            Token::SingleQuotedString("^a".into()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_quoted_identifier() {
        let sql = r#" "a "" b" "a """ "c """"" "#;
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize().unwrap();
        let expected = vec![
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a " b"#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"a ""#, Some('"')),
            Token::Whitespace(Whitespace::Space),
            Token::make_word(r#"c """#, Some('"')),
            Token::Whitespace(Whitespace::Space),
        ];
        compare(expected, tokens);
    }

    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        //println!("------------------------------");
        //println!("tokens = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }
}