mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-08-30 10:47:22 +00:00

The dialect information is from https://en.wikibooks.org/wiki/SQL_Dialects_Reference/Data_structure_definition/Delimited_identifiers
591 lines · 21 KiB · Rust
// Copyright 2018 Grove Enterprises LLC
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
//! SQL Tokenizer
|
|
//!
|
|
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
|
|
//!
|
|
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
|
|
|
|
use std::iter::Peekable;
|
|
use std::str::Chars;
|
|
|
|
use super::dialect::keywords::ALL_KEYWORDS;
|
|
use super::dialect::Dialect;
|
|
|
|
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    SQLWord(SQLWord),
    /// Numeric literal
    Number(String),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `!=` or `<>`
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mult,
    /// Division operator `/`
    Div,
    /// Modulo Operator `%`
    Mod,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in postgresql)
    DoubleColon,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand &
    Ampersand,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
}
|
|
|
|
impl ToString for Token {
|
|
fn to_string(&self) -> String {
|
|
match self {
|
|
Token::SQLWord(ref w) => w.to_string(),
|
|
Token::Number(ref n) => n.to_string(),
|
|
Token::Char(ref c) => c.to_string(),
|
|
Token::SingleQuotedString(ref s) => format!("'{}'", s),
|
|
Token::Comma => ",".to_string(),
|
|
Token::Whitespace(ws) => ws.to_string(),
|
|
Token::Eq => "=".to_string(),
|
|
Token::Neq => "-".to_string(),
|
|
Token::Lt => "<".to_string(),
|
|
Token::Gt => ">".to_string(),
|
|
Token::LtEq => "<=".to_string(),
|
|
Token::GtEq => ">=".to_string(),
|
|
Token::Plus => "+".to_string(),
|
|
Token::Minus => "-".to_string(),
|
|
Token::Mult => "*".to_string(),
|
|
Token::Div => "/".to_string(),
|
|
Token::Mod => "%".to_string(),
|
|
Token::LParen => "(".to_string(),
|
|
Token::RParen => ")".to_string(),
|
|
Token::Period => ".".to_string(),
|
|
Token::Colon => ":".to_string(),
|
|
Token::DoubleColon => "::".to_string(),
|
|
Token::SemiColon => ";".to_string(),
|
|
Token::Backslash => "\\".to_string(),
|
|
Token::LBracket => "[".to_string(),
|
|
Token::RBracket => "]".to_string(),
|
|
Token::Ampersand => "&".to_string(),
|
|
Token::LBrace => "{".to_string(),
|
|
Token::RBrace => "}".to_string(),
|
|
}
|
|
}
|
|
}
|
|
|
|
impl Token {
|
|
pub fn make_keyword(keyword: &str) -> Self {
|
|
Token::make_word(keyword, None)
|
|
}
|
|
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
|
|
let word_uppercase = word.to_uppercase();
|
|
//TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
|
|
// not fast but I want the simplicity for now while I experiment with pluggable
|
|
// dialects
|
|
let is_keyword = quote_style == None && ALL_KEYWORDS.contains(&word_uppercase.as_str());
|
|
Token::SQLWord(SQLWord {
|
|
value: word.to_string(),
|
|
quote_style: quote_style,
|
|
keyword: if is_keyword {
|
|
word_uppercase.to_string()
|
|
} else {
|
|
"".to_string()
|
|
},
|
|
})
|
|
}
|
|
}
|
|
|
|
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq)]
pub struct SQLWord {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    /// Stores the OPENING quote character; see `matching_end_quote` for the
    /// corresponding closing delimiter.
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise empty
    pub keyword: String,
}
|
|
|
|
impl ToString for SQLWord {
|
|
fn to_string(&self) -> String {
|
|
match self.quote_style {
|
|
Some(s) if s == '"' || s == '[' || s == '`' => {
|
|
format!("{}{}{}", s, self.value, SQLWord::matching_end_quote(s))
|
|
}
|
|
None => self.value.clone(),
|
|
_ => panic!("Unexpected quote_style!"),
|
|
}
|
|
}
|
|
}
|
|
impl SQLWord {
|
|
fn matching_end_quote(ch: char) -> char {
|
|
match ch {
|
|
'"' => '"', // ANSI and most dialects
|
|
'[' => ']', // MS SQL
|
|
'`' => '`', // MySQL
|
|
_ => panic!("unexpected quoting style!"),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// The kinds of whitespace recognized as individual tokens by the tokenizer.
#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
    /// A single space character
    Space,
    /// A newline (`\n`)
    Newline,
    /// A tab (`\t`)
    Tab,
}
|
|
|
|
impl ToString for Whitespace {
|
|
fn to_string(&self) -> String {
|
|
match self {
|
|
Whitespace::Space => " ".to_string(),
|
|
Whitespace::Newline => "\n".to_string(),
|
|
Whitespace::Tab => "\t".to_string(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Tokenizer error
///
/// Wraps a human-readable message describing where tokenization failed
/// (currently only produced for a bare `!` not followed by `=`).
#[derive(Debug, PartialEq)]
pub struct TokenizerError(String);
|
|
|
|
/// SQL Tokenizer
pub struct Tokenizer<'a> {
    // Dialect that decides identifier-start/part characters and
    // delimited-identifier quoting.
    dialect: &'a Dialect,
    /// The SQL text being tokenized.
    pub query: String,
    /// Current line (1-based), updated as tokens are produced.
    pub line: u64,
    /// Current column (1-based), updated as tokens are produced.
    pub col: u64,
}
|
|
|
|
impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    pub fn new(dialect: &'a Dialect, query: &str) -> Self {
        Self {
            dialect,
            query: query.to_string(),
            // Position tracking starts at line 1, column 1.
            line: 1,
            col: 1,
        }
    }

    /// Tokenize the statement and produce a vector of tokens
    ///
    /// Also maintains `self.line` / `self.col` as tokens are produced, so
    /// errors raised by `next_token` can report an approximate position.
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let mut peekable = self.query.chars().peekable();

        let mut tokens: Vec<Token> = vec![];

        while let Some(token) = self.next_token(&mut peekable)? {
            // Update the running source position based on the token just read.
            match &token {
                Token::Whitespace(Whitespace::Newline) => {
                    self.line += 1;
                    self.col = 1;
                }

                // A tab advances by 4 columns — an approximation, since the
                // rendered width depends on the viewer's tab stops.
                Token::Whitespace(Whitespace::Tab) => self.col += 4,
                Token::SQLWord(w) if w.quote_style == None => self.col += w.value.len() as u64,
                // Quoted identifiers consumed two extra input chars (the quotes).
                Token::SQLWord(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
                Token::Number(s) => self.col += s.len() as u64,
                // NOTE(review): does not count the two enclosing single-quote
                // characters that were consumed from the input.
                Token::SingleQuotedString(s) => self.col += s.len() as u64,
                _ => self.col += 1,
            }

            tokens.push(token);
        }
        Ok(tokens)
    }

    /// Get the next token or return None
    ///
    /// Peeks one character to pick a branch, then consumes exactly the
    /// characters belonging to that token. Returns `Ok(None)` at end of input
    /// and `Err` only for a bare `!` not followed by `=`.
    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<Token>, TokenizerError> {
        //println!("next_token: {:?}", chars.peek());
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Space)))
                }
                '\t' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Tab)))
                }
                '\n' => {
                    chars.next();
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    let mut s = String::new();
                    chars.next(); // consume
                    s.push(ch);
                    // Keep consuming while the dialect accepts the character
                    // as part of an identifier.
                    while let Some(&ch) = chars.peek() {
                        if self.dialect.is_identifier_part(ch) {
                            chars.next(); // consume
                            s.push(ch);
                        } else {
                            break;
                        }
                    }
                    Ok(Some(Token::make_word(&s, None)))
                }
                // string
                '\'' => {
                    //TODO: handle escaped quotes in string
                    //TODO: handle newlines in string
                    //TODO: handle EOF before terminating quote
                    let mut s = String::new();
                    chars.next(); // consume
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '\'' => {
                                chars.next(); // consume
                                break;
                            }
                            _ => {
                                chars.next(); // consume
                                s.push(ch);
                            }
                        }
                    }
                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
                    let mut s = String::new();
                    chars.next(); // consume the opening quote
                    let quote_end = SQLWord::matching_end_quote(quote_start);
                    // NOTE(review): EOF before the closing quote is silently
                    // accepted and yields a word with whatever was read so far.
                    while let Some(ch) = chars.next() {
                        match ch {
                            c if c == quote_end => break,
                            _ => s.push(ch),
                        }
                    }
                    Ok(Some(Token::make_word(&s, Some(quote_start))))
                }
                // numbers
                '0'...'9' => {
                    let mut s = String::new();
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            // Digits and periods are both accepted, so input
                            // like `1.2.3` tokenizes as a single Number; the
                            // parser is responsible for validating the literal.
                            '0'...'9' | '.' => {
                                chars.next(); // consume
                                s.push(ch);
                            }
                            _ => break,
                        }
                    }
                    Ok(Some(Token::Number(s)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '+' => self.consume_and_return(chars, Token::Plus),
                '-' => self.consume_and_return(chars, Token::Minus),
                '*' => self.consume_and_return(chars, Token::Mult),
                '/' => self.consume_and_return(chars, Token::Div),
                '%' => self.consume_and_return(chars, Token::Mod),
                '=' => self.consume_and_return(chars, Token::Eq),
                '.' => self.consume_and_return(chars, Token::Period),
                // `!` is only valid as the start of `!=`; anything else (or
                // EOF) is a tokenizer error.
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::Neq),
                            _ => Err(TokenizerError(format!(
                                "Tokenizer Error at Line: {}, Col: {}",
                                self.line, self.col
                            ))),
                        },
                        None => Err(TokenizerError(format!(
                            "Tokenizer Error at Line: {}, Col: {}",
                            self.line, self.col
                        ))),
                    }
                }
                // `<`, `<=` or `<>` (alternate not-equals spelling)
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::LtEq),
                            '>' => self.consume_and_return(chars, Token::Neq),
                            _ => Ok(Some(Token::Lt)),
                        },
                        None => Ok(Some(Token::Lt)),
                    }
                }
                // `>` or `>=`
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '=' => self.consume_and_return(chars, Token::GtEq),
                            _ => Ok(Some(Token::Gt)),
                        },
                        None => Ok(Some(Token::Gt)),
                    }
                }
                // colon
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(&ch) => match ch {
                            // double colon
                            ':' => self.consume_and_return(chars, Token::DoubleColon),
                            _ => Ok(Some(Token::Colon)),
                        },
                        None => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                // brakets
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => self.consume_and_return(chars, Token::Ampersand),
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                // Anything unrecognized becomes a Char token; the parser
                // decides whether that constitutes an error.
                other => self.consume_and_return(chars, Token::Char(other)),
            },
            None => Ok(None),
        }
    }

    /// Consume the single character at the head of `chars` and return `t`.
    fn consume_and_return(
        &self,
        chars: &mut Peekable<Chars>,
        t: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next();
        Ok(Some(t))
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::super::dialect::GenericSqlDialect;
    use super::*;

    /// Run the tokenizer over `sql` with the generic dialect, panicking on error.
    fn tokenize_all(sql: &str) -> Vec<Token> {
        let dialect = GenericSqlDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        tokenizer.tokenize().unwrap()
    }

    /// Shorthand for the single-space whitespace token.
    fn space() -> Token {
        Token::Whitespace(Whitespace::Space)
    }

    /// Assert that the produced token stream matches the expected one.
    fn compare(expected: Vec<Token>, actual: Vec<Token>) {
        assert_eq!(expected, actual);
    }

    #[test]
    fn tokenize_select_1() {
        let tokens = tokenize_all("SELECT 1");
        let expected = vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::Number("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_scalar_function() {
        let tokens = tokenize_all("SELECT sqrt(1)");
        let expected = vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::make_word("sqrt", None),
            Token::LParen,
            Token::Number("1".to_string()),
            Token::RParen,
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_simple_select() {
        let tokens = tokenize_all("SELECT * FROM customer WHERE id = 1 LIMIT 5");
        let expected = vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::Mult,
            space(),
            Token::make_keyword("FROM"),
            space(),
            Token::make_word("customer", None),
            space(),
            Token::make_keyword("WHERE"),
            space(),
            Token::make_word("id", None),
            space(),
            Token::Eq,
            space(),
            Token::Number("1".to_string()),
            space(),
            Token::make_keyword("LIMIT"),
            space(),
            Token::Number("5".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_string_predicate() {
        let tokens = tokenize_all("SELECT * FROM customer WHERE salary != 'Not Provided'");
        let expected = vec![
            Token::make_keyword("SELECT"),
            space(),
            Token::Mult,
            space(),
            Token::make_keyword("FROM"),
            space(),
            Token::make_word("customer", None),
            space(),
            Token::make_keyword("WHERE"),
            space(),
            Token::make_word("salary", None),
            space(),
            Token::Neq,
            space(),
            Token::SingleQuotedString("Not Provided".to_string()),
        ];
        compare(expected, tokens);
    }

    // Characters that are neither identifier-start nor any known punctuation
    // come out as individual Char tokens.
    #[test]
    fn tokenize_invalid_string() {
        let tokens = tokenize_all("\nمصطفىh");
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_invalid_string_cols() {
        let tokens = tokenize_all("\n\nSELECT * FROM table\tمصطفىh");
        println!("tokens: {:#?}", tokens);
        let expected = vec![
            Token::Whitespace(Whitespace::Newline),
            Token::Whitespace(Whitespace::Newline),
            Token::make_keyword("SELECT"),
            space(),
            Token::Mult,
            space(),
            Token::make_keyword("FROM"),
            space(),
            Token::make_keyword("table"),
            Token::Whitespace(Whitespace::Tab),
            Token::Char('م'),
            Token::Char('ص'),
            Token::Char('ط'),
            Token::Char('ف'),
            Token::Char('ى'),
            Token::make_word("h", None),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn tokenize_is_null() {
        let tokens = tokenize_all("a IS NULL");
        let expected = vec![
            Token::make_word("a", None),
            space(),
            Token::make_keyword("IS"),
            space(),
            Token::make_keyword("NULL"),
        ];
        compare(expected, tokens);
    }
}
|