// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //! SQL Tokenizer //! //! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens. //! //! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST). #[cfg(not(feature = "std"))] use alloc::{ borrow::ToOwned, format, string::{String, ToString}, vec, vec::Vec, }; use core::fmt; use core::iter::Peekable; use core::str::Chars; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use crate::dialect::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX}; use crate::dialect::Dialect; use crate::dialect::SnowflakeDialect; /// SQL Token enumeration #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Token { /// An end-of-file marker, not a real token EOF, /// A keyword (like SELECT) or an optionally quoted SQL identifier Word(Word), /// An unsigned numeric literal Number(String, bool), /// A character that could not be tokenized Char(char), /// Single quoted string: i.e: 'string' SingleQuotedString(String), /// "National" string literal: i.e: N'string' NationalStringLiteral(String), /// Hexadecimal string literal: i.e.: X'deadbeef' HexStringLiteral(String), /// Comma Comma, /// Whitespace (space, tab, etc) Whitespace(Whitespace), /// Double equals sign `==` DoubleEq, /// Equality operator `=` Eq, /// Not Equals operator `<>` (or `!=` in some dialects) Neq, /// Less Than operator `<` Lt, /// Greater Than operator `>` Gt, /// Less Than Or Equals operator `<=` LtEq, /// Greater Than Or Equals operator `>=` GtEq, /// Spaceship operator <=> Spaceship, /// Plus operator `+` Plus, /// Minus operator `-` Minus, /// Multiplication operator `*` Mul, /// Division operator `/` Div, /// Modulo Operator `%` Mod, /// String concatenation `||` StringConcat, /// Left parenthesis `(` LParen, /// Right parenthesis `)` RParen, /// Period (used for compound identifiers or projections into nested types) Period, /// Colon `:` Colon, /// DoubleColon `::` (used for casting in postgresql) DoubleColon, /// SemiColon `;` used as separator for COPY and payload SemiColon, /// Backslash `\` used in terminating the COPY payload with `\.` Backslash, /// Left bracket `[` LBracket, /// Right bracket `]` RBracket, /// Ampersand `&` Ampersand, /// Pipe `|` Pipe, /// Caret `^` Caret, /// Left brace `{` LBrace, /// Right brace `}` RBrace, /// Right Arrow `=>` RArrow, /// Sharp `#` used for PostgreSQL Bitwise XOR operator Sharp, /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator Tilde, /// `~*` , a case insensitive match regular expression operator in PostgreSQL TildeAsterisk, /// `!~` , a case sensitive not match regular expression operator in PostgreSQL ExclamationMarkTilde, /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL ExclamationMarkTildeAsterisk, /// `<<`, a bitwise shift left operator in PostgreSQL ShiftLeft, /// `>>`, a bitwise shift right operator in PostgreSQL ShiftRight, /// Exclamation Mark `!` used for PostgreSQL factorial operator ExclamationMark, /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator DoubleExclamationMark, /// AtSign `@` used for PostgreSQL abs operator AtSign, /// `|/`, a square root math operator in PostgreSQL PGSquareRoot, /// `||/` , a cube root math operator in PostgreSQL PGCubeRoot, } impl fmt::Display for Token { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Token::EOF => f.write_str("EOF"), Token::Word(ref w) => write!(f, "{}", w), Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }), Token::Char(ref c) => write!(f, "{}", c), Token::SingleQuotedString(ref s) => write!(f, "'{}'", s), Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s), Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s), Token::Comma => f.write_str(","), Token::Whitespace(ws) => write!(f, "{}", ws), Token::DoubleEq => f.write_str("=="), Token::Spaceship => f.write_str("<=>"), Token::Eq => f.write_str("="), Token::Neq => f.write_str("<>"), Token::Lt => f.write_str("<"), Token::Gt => f.write_str(">"), Token::LtEq => f.write_str("<="), Token::GtEq => f.write_str(">="), Token::Plus => f.write_str("+"), Token::Minus => f.write_str("-"), Token::Mul => f.write_str("*"), Token::Div => f.write_str("/"), Token::StringConcat => f.write_str("||"), Token::Mod => f.write_str("%"), Token::LParen => f.write_str("("), Token::RParen => f.write_str(")"), Token::Period => f.write_str("."), Token::Colon => f.write_str(":"), Token::DoubleColon => f.write_str("::"), Token::SemiColon => f.write_str(";"), Token::Backslash => f.write_str("\\"), Token::LBracket => f.write_str("["), Token::RBracket => f.write_str("]"), Token::Ampersand => f.write_str("&"), Token::Caret => f.write_str("^"), Token::Pipe => f.write_str("|"), Token::LBrace => f.write_str("{"), Token::RBrace => f.write_str("}"), Token::RArrow => f.write_str("=>"), Token::Sharp => f.write_str("#"), Token::ExclamationMark => f.write_str("!"), Token::DoubleExclamationMark => f.write_str("!!"), Token::Tilde => f.write_str("~"), Token::TildeAsterisk => f.write_str("~*"), Token::ExclamationMarkTilde => f.write_str("!~"), Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"), Token::AtSign => f.write_str("@"), Token::ShiftLeft => f.write_str("<<"), Token::ShiftRight => f.write_str(">>"), Token::PGSquareRoot => f.write_str("|/"), Token::PGCubeRoot => f.write_str("||/"), } } } impl Token { pub fn make_keyword(keyword: &str) -> Self { Token::make_word(keyword, None) } pub fn make_word(word: &str, quote_style: Option) -> Self { let word_uppercase = word.to_uppercase(); Token::Word(Word { value: word.to_string(), quote_style, keyword: if quote_style == None { let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str()); keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x]) } else { Keyword::NoKeyword }, }) } } /// A keyword (like SELECT) or an optionally quoted SQL identifier #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Word { /// The value of the token, without the enclosing quotes, and with the /// escape sequences (if any) processed (TODO: escapes are not handled) pub value: String, /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance). /// The standard and most implementations allow using double quotes for this, /// but some implementations support other quoting styles as well (e.g. \[MS SQL]) pub quote_style: Option, /// If the word was not quoted and it matched one of the known keywords, /// this will have one of the values from dialect::keywords, otherwise empty pub keyword: Keyword, } impl fmt::Display for Word { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.quote_style { Some(s) if s == '"' || s == '[' || s == '`' => { write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s)) } None => f.write_str(&self.value), _ => panic!("Unexpected quote_style!"), } } } impl Word { fn matching_end_quote(ch: char) -> char { match ch { '"' => '"', // ANSI and most dialects '[' => ']', // MS SQL '`' => '`', // MySQL _ => panic!("unexpected quoting style!"), } } } #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub enum Whitespace { Space, Newline, Tab, SingleLineComment { comment: String, prefix: String }, MultiLineComment(String), } impl fmt::Display for Whitespace { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Whitespace::Space => f.write_str(" "), Whitespace::Newline => f.write_str("\n"), Whitespace::Tab => f.write_str("\t"), Whitespace::SingleLineComment { prefix, comment } => write!(f, "{}{}", prefix, comment), Whitespace::MultiLineComment(s) => write!(f, "/*{}*/", s), } } } /// Tokenizer error #[derive(Debug, PartialEq)] pub struct TokenizerError { pub message: String, pub line: u64, pub col: u64, } impl fmt::Display for TokenizerError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{} at Line: {}, Column {}", self.message, self.line, self.col ) } } #[cfg(feature = "std")] impl std::error::Error for TokenizerError {} /// SQL Tokenizer pub struct Tokenizer<'a> { dialect: &'a dyn Dialect, query: &'a str, line: u64, col: u64, } impl<'a> Tokenizer<'a> { /// Create a new SQL tokenizer for the specified SQL statement pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self { Self { dialect, query, line: 1, col: 1, } } /// Tokenize the statement and produce a vector of tokens pub fn tokenize(&mut self) -> Result, TokenizerError> { let mut peekable = self.query.chars().peekable(); let mut tokens: Vec = vec![]; while let Some(token) = self.next_token(&mut peekable)? { match &token { Token::Whitespace(Whitespace::Newline) => { self.line += 1; self.col = 1; } Token::Whitespace(Whitespace::Tab) => self.col += 4, Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64, Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2, Token::Number(s, _) => self.col += s.len() as u64, Token::SingleQuotedString(s) => self.col += s.len() as u64, _ => self.col += 1, } tokens.push(token); } Ok(tokens) } /// Get the next token or return None fn next_token(&self, chars: &mut Peekable>) -> Result, TokenizerError> { //println!("next_token: {:?}", chars.peek()); match chars.peek() { Some(&ch) => match ch { ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)), '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)), '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)), '\r' => { // Emit a single Whitespace::Newline token for \r and \r\n chars.next(); if let Some('\n') = chars.peek() { chars.next(); } Ok(Some(Token::Whitespace(Whitespace::Newline))) } 'N' => { chars.next(); // consume, to check the next char match chars.peek() { Some('\'') => { // N'...' - a let s = self.tokenize_single_quoted_string(chars)?; Ok(Some(Token::NationalStringLiteral(s))) } _ => { // regular identifier starting with an "N" let s = self.tokenize_word('N', chars); Ok(Some(Token::make_word(&s, None))) } } } // The spec only allows an uppercase 'X' to introduce a hex // string, but PostgreSQL, at least, allows a lowercase 'x' too. x @ 'x' | x @ 'X' => { chars.next(); // consume, to check the next char match chars.peek() { Some('\'') => { // X'...' - a let s = self.tokenize_single_quoted_string(chars)?; Ok(Some(Token::HexStringLiteral(s))) } _ => { // regular identifier starting with an "X" let s = self.tokenize_word(x, chars); Ok(Some(Token::make_word(&s, None))) } } } // identifier or keyword ch if self.dialect.is_identifier_start(ch) => { chars.next(); // consume the first char let s = self.tokenize_word(ch, chars); if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') { let mut s = peeking_take_while(&mut s.chars().peekable(), |ch| { matches!(ch, '0'..='9' | '.') }); let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.')); s += s2.as_str(); return Ok(Some(Token::Number(s, false))); } Ok(Some(Token::make_word(&s, None))) } // string '\'' => { let s = self.tokenize_single_quoted_string(chars)?; Ok(Some(Token::SingleQuotedString(s))) } // delimited (quoted) identifier quote_start if self.dialect.is_delimited_identifier_start(quote_start) => { chars.next(); // consume the opening quote let quote_end = Word::matching_end_quote(quote_start); let s = peeking_take_while(chars, |ch| ch != quote_end); if chars.next() == Some(quote_end) { Ok(Some(Token::make_word(&s, Some(quote_start)))) } else { self.tokenizer_error(format!( "Expected close delimiter '{}' before EOF.", quote_end )) } } // numbers and period '0'..='9' | '.' => { let mut s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9')); // match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); let s2 = peeking_take_while( chars, |ch| matches!(ch, '0'..='9' | 'A'..='F' | 'a'..='f'), ); return Ok(Some(Token::HexStringLiteral(s2))); } // match one period if let Some('.') = chars.peek() { s.push('.'); chars.next(); } s += &peeking_take_while(chars, |ch| matches!(ch, '0'..='9')); // No number -> Token::Period if s == "." { return Ok(Some(Token::Period)); } let long = if chars.peek() == Some(&'L') { chars.next(); true } else { false }; Ok(Some(Token::Number(s, long))) } // punctuation '(' => self.consume_and_return(chars, Token::LParen), ')' => self.consume_and_return(chars, Token::RParen), ',' => self.consume_and_return(chars, Token::Comma), // operators '-' => { chars.next(); // consume the '-' match chars.peek() { Some('-') => { chars.next(); // consume the second '-', starting a single-line comment let comment = self.tokenize_single_line_comment(chars); Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { prefix: "--".to_owned(), comment, }))) } // a regular '-' operator _ => Ok(Some(Token::Minus)), } } '/' => { chars.next(); // consume the '/' match chars.peek() { Some('*') => { chars.next(); // consume the '*', starting a multi-line comment self.tokenize_multiline_comment(chars) } Some('/') if dialect_of!(self is SnowflakeDialect) => { chars.next(); // consume the second '/', starting a snowflake single-line comment let comment = self.tokenize_single_line_comment(chars); Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { prefix: "//".to_owned(), comment, }))) } // a regular '/' operator _ => Ok(Some(Token::Div)), } } '+' => self.consume_and_return(chars, Token::Plus), '*' => self.consume_and_return(chars, Token::Mul), '%' => self.consume_and_return(chars, Token::Mod), '|' => { chars.next(); // consume the '|' match chars.peek() { Some('/') => self.consume_and_return(chars, Token::PGSquareRoot), Some('|') => { chars.next(); // consume the second '|' match chars.peek() { Some('/') => self.consume_and_return(chars, Token::PGCubeRoot), _ => Ok(Some(Token::StringConcat)), } } // Bitshift '|' operator _ => Ok(Some(Token::Pipe)), } } '=' => { chars.next(); // consume match chars.peek() { Some('>') => self.consume_and_return(chars, Token::RArrow), _ => Ok(Some(Token::Eq)), } } '!' => { chars.next(); // consume match chars.peek() { Some('=') => self.consume_and_return(chars, Token::Neq), Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark), Some('~') => { chars.next(); match chars.peek() { Some('*') => self .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk), _ => Ok(Some(Token::ExclamationMarkTilde)), } } _ => Ok(Some(Token::ExclamationMark)), } } '<' => { chars.next(); // consume match chars.peek() { Some('=') => { chars.next(); match chars.peek() { Some('>') => self.consume_and_return(chars, Token::Spaceship), _ => Ok(Some(Token::LtEq)), } } Some('>') => self.consume_and_return(chars, Token::Neq), Some('<') => self.consume_and_return(chars, Token::ShiftLeft), _ => Ok(Some(Token::Lt)), } } '>' => { chars.next(); // consume match chars.peek() { Some('=') => self.consume_and_return(chars, Token::GtEq), Some('>') => self.consume_and_return(chars, Token::ShiftRight), _ => Ok(Some(Token::Gt)), } } ':' => { chars.next(); match chars.peek() { Some(':') => self.consume_and_return(chars, Token::DoubleColon), _ => Ok(Some(Token::Colon)), } } ';' => self.consume_and_return(chars, Token::SemiColon), '\\' => self.consume_and_return(chars, Token::Backslash), '[' => self.consume_and_return(chars, Token::LBracket), ']' => self.consume_and_return(chars, Token::RBracket), '&' => self.consume_and_return(chars, Token::Ampersand), '^' => self.consume_and_return(chars, Token::Caret), '{' => self.consume_and_return(chars, Token::LBrace), '}' => self.consume_and_return(chars, Token::RBrace), '#' if dialect_of!(self is SnowflakeDialect) => { chars.next(); // consume the '#', starting a snowflake single-line comment let comment = self.tokenize_single_line_comment(chars); Ok(Some(Token::Whitespace(Whitespace::SingleLineComment { prefix: "#".to_owned(), comment, }))) } '~' => { chars.next(); // consume match chars.peek() { Some('*') => self.consume_and_return(chars, Token::TildeAsterisk), _ => Ok(Some(Token::Tilde)), } } '#' => self.consume_and_return(chars, Token::Sharp), '@' => self.consume_and_return(chars, Token::AtSign), other => self.consume_and_return(chars, Token::Char(other)), }, None => Ok(None), } } fn tokenizer_error(&self, message: impl Into) -> Result { Err(TokenizerError { message: message.into(), col: self.col, line: self.line, }) } // Consume characters until newline fn tokenize_single_line_comment(&self, chars: &mut Peekable>) -> String { let mut comment = peeking_take_while(chars, |ch| ch != '\n'); if let Some(ch) = chars.next() { assert_eq!(ch, '\n'); comment.push(ch); } comment } /// Tokenize an identifier or keyword, after the first char is already consumed. fn tokenize_word(&self, first_char: char, chars: &mut Peekable>) -> String { let mut s = first_char.to_string(); s.push_str(&peeking_take_while(chars, |ch| { self.dialect.is_identifier_part(ch) })); s } /// Read a single quoted string, starting with the opening quote. fn tokenize_single_quoted_string( &self, chars: &mut Peekable>, ) -> Result { let mut s = String::new(); chars.next(); // consume the opening quote while let Some(&ch) = chars.peek() { match ch { '\'' => { chars.next(); // consume let escaped_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false); if escaped_quote { s.push('\''); chars.next(); } else { return Ok(s); } } _ => { chars.next(); // consume s.push(ch); } } } self.tokenizer_error("Unterminated string literal") } fn tokenize_multiline_comment( &self, chars: &mut Peekable>, ) -> Result, TokenizerError> { let mut s = String::new(); let mut maybe_closing_comment = false; // TODO: deal with nested comments loop { match chars.next() { Some(ch) => { if maybe_closing_comment { if ch == '/' { break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s)))); } else { s.push('*'); } } maybe_closing_comment = ch == '*'; if !maybe_closing_comment { s.push(ch); } } None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"), } } } #[allow(clippy::unnecessary_wraps)] fn consume_and_return( &self, chars: &mut Peekable>, t: Token, ) -> Result, TokenizerError> { chars.next(); Ok(Some(t)) } } /// Read from `chars` until `predicate` returns `false` or EOF is hit. /// Return the characters read as String, and keep the first non-matching /// char available as `chars.next()`. fn peeking_take_while( chars: &mut Peekable>, mut predicate: impl FnMut(char) -> bool, ) -> String { let mut s = String::new(); while let Some(&ch) = chars.peek() { if predicate(ch) { chars.next(); // consume s.push(ch); } else { break; } } s } #[cfg(test)] mod tests { use super::*; use crate::dialect::{GenericDialect, MsSqlDialect}; #[test] fn tokenizer_error_impl() { let err = TokenizerError { message: "test".into(), line: 1, col: 1, }; #[cfg(feature = "std")] { use std::error::Error; assert!(err.source().is_none()); } assert_eq!(err.to_string(), "test at Line: 1, Column 1"); } #[test] fn tokenize_select_1() { let sql = String::from("SELECT 1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; compare(expected, tokens); } #[test] fn tokenize_select_float() { let sql = String::from("SELECT .1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Number(String::from(".1"), false), ]; compare(expected, tokens); } #[test] fn tokenize_scalar_function() { let sql = String::from("SELECT sqrt(1)"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::make_word("sqrt", None), Token::LParen, Token::Number(String::from("1"), false), Token::RParen, ]; compare(expected, tokens); } #[test] fn tokenize_string_string_concat() { let sql = String::from("SELECT 'a' || 'b'"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::SingleQuotedString(String::from("a")), Token::Whitespace(Whitespace::Space), Token::StringConcat, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString(String::from("b")), ]; compare(expected, tokens); } #[test] fn tokenize_bitwise_op() { let sql = String::from("SELECT one | two ^ three"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::make_word("one", None), Token::Whitespace(Whitespace::Space), Token::Pipe, Token::Whitespace(Whitespace::Space), Token::make_word("two", None), Token::Whitespace(Whitespace::Space), Token::Caret, Token::Whitespace(Whitespace::Space), Token::make_word("three", None), ]; compare(expected, tokens); } #[test] fn tokenize_simple_select() { let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Mul, Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), Token::Whitespace(Whitespace::Space), Token::make_word("id", None), Token::Whitespace(Whitespace::Space), Token::Eq, Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), Token::Whitespace(Whitespace::Space), Token::make_keyword("LIMIT"), Token::Whitespace(Whitespace::Space), Token::Number(String::from("5"), false), ]; compare(expected, tokens); } #[test] fn tokenize_explain_select() { let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("EXPLAIN"), Token::Whitespace(Whitespace::Space), Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Mul, Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), Token::Whitespace(Whitespace::Space), Token::make_word("id", None), Token::Whitespace(Whitespace::Space), Token::Eq, Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; compare(expected, tokens); } #[test] fn tokenize_explain_analyze_select() { let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("EXPLAIN"), Token::Whitespace(Whitespace::Space), Token::make_keyword("ANALYZE"), Token::Whitespace(Whitespace::Space), Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Mul, Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), Token::Whitespace(Whitespace::Space), Token::make_word("id", None), Token::Whitespace(Whitespace::Space), Token::Eq, Token::Whitespace(Whitespace::Space), Token::Number(String::from("1"), false), ]; compare(expected, tokens); } #[test] fn tokenize_string_predicate() { let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Mul, Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_word("customer", None), Token::Whitespace(Whitespace::Space), Token::make_keyword("WHERE"), Token::Whitespace(Whitespace::Space), Token::make_word("salary", None), Token::Whitespace(Whitespace::Space), Token::Neq, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString(String::from("Not Provided")), ]; compare(expected, tokens); } #[test] fn tokenize_invalid_string() { let sql = String::from("\nمصطفىh"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); // println!("tokens: {:#?}", tokens); let expected = vec![ Token::Whitespace(Whitespace::Newline), Token::Char('م'), Token::Char('ص'), Token::Char('ط'), Token::Char('ف'), Token::Char('ى'), Token::make_word("h", None), ]; compare(expected, tokens); } #[test] fn tokenize_newline_in_string_literal() { let sql = String::from("'foo\r\nbar\nbaz'"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())]; compare(expected, tokens); } #[test] fn tokenize_unterminated_string_literal() { let sql = String::from("select 'foo"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); assert_eq!( tokenizer.tokenize(), Err(TokenizerError { message: "Unterminated string literal".to_string(), line: 1, col: 8 }) ); } #[test] fn tokenize_invalid_string_cols() { let sql = String::from("\n\nSELECT * FROM table\tمصطفىh"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); // println!("tokens: {:#?}", tokens); let expected = vec![ Token::Whitespace(Whitespace::Newline), Token::Whitespace(Whitespace::Newline), Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::Mul, Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_keyword("table"), Token::Whitespace(Whitespace::Tab), Token::Char('م'), Token::Char('ص'), Token::Char('ط'), Token::Char('ف'), Token::Char('ى'), Token::make_word("h", None), ]; compare(expected, tokens); } #[test] fn tokenize_right_arrow() { let sql = String::from("FUNCTION(key=>value)"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_word("FUNCTION", None), Token::LParen, Token::make_word("key", None), Token::RArrow, Token::make_word("value", None), Token::RParen, ]; compare(expected, tokens); } #[test] fn tokenize_is_null() { let sql = String::from("a IS NULL"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_word("a", None), Token::Whitespace(Whitespace::Space), Token::make_keyword("IS"), Token::Whitespace(Whitespace::Space), Token::make_keyword("NULL"), ]; compare(expected, tokens); } #[test] fn tokenize_comment() { let sql = String::from("0--this is a comment\n1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::SingleLineComment { prefix: "--".to_string(), comment: "this is a comment\n".to_string(), }), Token::Number("1".to_string(), false), ]; compare(expected, tokens); } #[test] fn tokenize_comment_at_eof() { let sql = String::from("--this is a comment"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![Token::Whitespace(Whitespace::SingleLineComment { prefix: "--".to_string(), comment: "this is a comment".to_string(), })]; compare(expected, tokens); } #[test] fn tokenize_multiline_comment() { let sql = String::from("0/*multi-line\n* /comment*/1"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::Number("0".to_string(), false), Token::Whitespace(Whitespace::MultiLineComment( "multi-line\n* /comment".to_string(), )), Token::Number("1".to_string(), false), ]; compare(expected, tokens); } #[test] fn tokenize_multiline_comment_with_even_asterisks() { let sql = String::from("\n/** Comment **/\n"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::Whitespace(Whitespace::Newline), Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())), Token::Whitespace(Whitespace::Newline), ]; compare(expected, tokens); } #[test] fn tokenize_mismatched_quotes() { let sql = String::from("\"foo"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); assert_eq!( tokenizer.tokenize(), Err(TokenizerError { message: "Expected close delimiter '\"' before EOF.".to_string(), line: 1, col: 1 }) ); } #[test] fn tokenize_newlines() { let sql = String::from("line1\nline2\rline3\r\nline4\r"); let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_word("line1", None), Token::Whitespace(Whitespace::Newline), Token::make_word("line2", None), Token::Whitespace(Whitespace::Newline), Token::make_word("line3", None), Token::Whitespace(Whitespace::Newline), Token::make_word("line4", None), Token::Whitespace(Whitespace::Newline), ]; compare(expected, tokens); } #[test] fn tokenize_mssql_top() { let sql = "SELECT TOP 5 [bar] FROM foo"; let dialect = MsSqlDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::make_keyword("TOP"), Token::Whitespace(Whitespace::Space), Token::Number(String::from("5"), false), Token::Whitespace(Whitespace::Space), Token::make_word("bar", Some('[')), Token::Whitespace(Whitespace::Space), Token::make_keyword("FROM"), Token::Whitespace(Whitespace::Space), Token::make_word("foo", None), ]; compare(expected, tokens); } #[test] fn tokenize_pg_regex_match() { let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'"; let dialect = GenericDialect {}; let mut tokenizer = Tokenizer::new(&dialect, sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ Token::make_keyword("SELECT"), Token::Whitespace(Whitespace::Space), Token::make_word("col", None), Token::Whitespace(Whitespace::Space), Token::Tilde, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, Token::Whitespace(Whitespace::Space), Token::make_word("col", None), Token::Whitespace(Whitespace::Space), Token::TildeAsterisk, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, Token::Whitespace(Whitespace::Space), Token::make_word("col", None), Token::Whitespace(Whitespace::Space), Token::ExclamationMarkTilde, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), Token::Comma, Token::Whitespace(Whitespace::Space), Token::make_word("col", None), Token::Whitespace(Whitespace::Space), Token::ExclamationMarkTildeAsterisk, Token::Whitespace(Whitespace::Space), Token::SingleQuotedString("^a".into()), ]; compare(expected, tokens); } fn compare(expected: Vec, actual: Vec) { //println!("------------------------------"); //println!("tokens = {:?}", actual); //println!("expected = {:?}", expected); //println!("------------------------------"); assert_eq!(expected, actual); } }