Refactoring

This commit is contained in:
Andy Grove 2018-09-03 09:13:43 -06:00
parent 375671e208
commit a86bd30515
6 changed files with 101 additions and 338 deletions

View file

@ -1,115 +0,0 @@
use std::sync::{Arc, Mutex};
extern crate datafusion_sql;
use datafusion_sql::ansi::tokenizer::ANSISQLTokenizer;
use datafusion_sql::ansi::parser::ANSISQLParser;
use datafusion_sql::tokenizer::*;
use datafusion_sql::parser::*;
/// This example demonstrates building a custom ACME parser that extends the generic parser
/// by adding support for a factorial expression `!! expr`.
/// Tokens specific to the ACME SQL dialect.
#[derive(Debug, PartialEq)]
enum AcmeToken {
    /// The factorial operator token, spelled `!!`
    Factorial,
}
/// Expressions specific to the ACME SQL dialect.
#[derive(Debug)]
enum AcmeExpr {
    /// Factorial expression wrapping its operand, e.g. `!! expr`
    Factorial(Box<SQLExpr<AcmeExpr>>),
}
/// Tokenizer for the ACME dialect; recognizes `!!` itself and hands every
/// other character sequence to the wrapped ANSI tokenizer.
struct AcmeTokenizer {
    // shared, lockable handle to the ANSI tokenizer used for all non-ACME tokens
    ansi_tokenizer: Arc<Mutex<SQLTokenizer<AcmeToken>>>
}
/// The ACME tokenizer looks for the factorial operator `!!` but delegates everything else
/// to the inner ANSI tokenizer.
impl SQLTokenizer<AcmeToken> for AcmeTokenizer {

    fn precedence(&self, _token: &SQLToken<AcmeToken>) -> usize {
        unimplemented!()
    }

    fn next_token(&mut self, chars: &mut CharSeq) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError> {
        let mut ansi = self.ansi_tokenizer.lock().unwrap();
        // Guard-clause form: only the `!!` lookahead is handled here; every
        // other path falls through to the ANSI tokenizer at the bottom.
        if let Some(&'!') = chars.peek() {
            chars.mark();
            chars.next(); // consume the first `!`
            if let Some(&'!') = chars.peek() {
                chars.next(); // consume the second `!`
                return Ok(Some(SQLToken::Custom(AcmeToken::Factorial)));
            }
            // a lone `!` is not an ACME token: rewind to the mark so the
            // ANSI tokenizer sees the `!` again
            chars.reset();
        }
        ansi.next_token(chars)
    }
}
/// Custom parser for the ACME dialect.
struct AcmeParser {
    // tokenizer shared with other parsers via Arc<Mutex<..>> (see `main`)
    tokenizer: Arc<Mutex<SQLTokenizer<AcmeToken>>>
}
impl AcmeParser {
    /// Create a parser reading tokens from the given shared tokenizer.
    pub fn new(tokenizer: Arc<Mutex<SQLTokenizer<AcmeToken>>>) -> Self {
        // `tokenizer` is owned here, so move it into the struct directly;
        // the previous `tokenizer.clone()` was a redundant refcount bump
        AcmeParser { tokenizer }
    }
}
impl SQLParser<AcmeToken, AcmeExpr> for AcmeParser {
    /// Stub: recognizes no prefix expressions yet. Returning `Ok(None)` lets
    /// the Pratt parser try the next registered parser instead.
    fn parse_prefix(&mut self, chars: &mut CharSeq) -> Result<Option<Box<SQLExpr<AcmeExpr>>>, ParserError<AcmeToken>> {
        Ok(None)
    }
    /// Stub: recognizes no infix expressions yet.
    fn parse_infix(&mut self, chars: &mut CharSeq, left: &SQLExpr<AcmeExpr>, precedence: usize) -> Result<Option<Box<SQLExpr<AcmeExpr>>>, ParserError<AcmeToken>> {
        Ok(None)
    }
}
/// Demonstrates composing the custom ACME tokenizer/parser with the ANSI ones
/// and parsing an expression containing the `!!` factorial operator.
fn main() {
    let sql = "1 + !! 5 * 2";

    // ANSI SQL tokenizer
    let ansi_tokenizer = Arc::new(Mutex::new(ANSISQLTokenizer { }));

    // Custom ACME tokenizer wrapping the ANSI one.
    // NOTE: the binding was previously `let mut`, but it is never mutated —
    // only cloned — so the `mut` is dropped here.
    let acme_tokenizer = Arc::new(Mutex::new(AcmeTokenizer {
        ansi_tokenizer: ansi_tokenizer.clone()
    }));

    // Create parsers; the Pratt parser tries them in vec order (ACME first)
    let ansi_parser = Arc::new(Mutex::new(ANSISQLParser::new(acme_tokenizer.clone())));
    let acme_parser = Arc::new(Mutex::new(AcmeParser::new(acme_tokenizer.clone())));

    let mut pratt_parser = PrattParser {
        chars: CharSeq::new(sql),
        parsers: vec![acme_parser, ansi_parser]
    };

    let expr = pratt_parser.parse_expr().unwrap();
    println!("{:?}", expr);
}

20
examples/parse_sql.rs Normal file
View file

@ -0,0 +1,20 @@
use std::sync::{Arc, Mutex};
extern crate datafusion_sql;
use datafusion_sql::ansi::tokenizer::ANSISQLTokenizer;
use datafusion_sql::ansi::parser::ANSISQLParser;
use datafusion_sql::tokenizer::*;
use datafusion_sql::parser::*;
/// Parses a simple ANSI SQL expression and prints the resulting AST.
fn main() {
    let sql = "SELECT 1 + 1";

    // `if let` replaces the previous `match … { Some(ast) => …, _ => {} }`,
    // which is the non-idiomatic "single match" pattern
    if let Some(ast) = ANSISQLParser::parse(sql).unwrap() {
        println!("{:?}", ast);
    }
}

View file

@ -1,30 +1,56 @@
use std::cmp::PartialEq; use std::cmp::PartialEq;
use std::fmt::Debug; use std::fmt::Debug;
//use std::iter::Peekable; //use std::rc::Rc;
//use std::str::Chars; //use std::sync::{Arc, Mutex};
use std::sync::{Arc, Mutex};
use super::tokenizer::ANSISQLTokenizer;
use super::super::tokenizer::*; use super::super::tokenizer::*;
use super::super::parser::*; use super::super::parser::*;
pub struct ANSISQLParser<TokenType> { pub struct ANSISQLParser {
tokenizer: Arc<Mutex<SQLTokenizer<TokenType>>> tokenizer: Box<SQLTokenizer>
} }
impl<TokenType> ANSISQLParser<TokenType> where TokenType: Debug + PartialEq { impl ANSISQLParser where {
pub fn new(tokenizer: Arc<Mutex<SQLTokenizer<TokenType>>>) -> Self { pub fn parse(sql: &str) -> Result<Option<Box<SQLExpr>>, ParserError> {
ANSISQLParser { tokenizer: tokenizer.clone() } let mut parser = ANSISQLParser { tokenizer: Box::new(ANSISQLTokenizer::new(sql)) };
parser.parse_expr()
} }
} }
impl<TokenType, ExprType> SQLParser<TokenType, ExprType> for ANSISQLParser<TokenType> impl SQLParser for ANSISQLParser {
where TokenType: Debug + PartialEq, ExprType: Debug {
fn parse_prefix(&mut self, chars: &mut CharSeq) -> Result<Option<Box<SQLExpr<ExprType>>>, ParserError<TokenType>> { fn parse_expr(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError> {
match self.tokenizer.lock().unwrap().next_token(chars)? { let precedence: usize = 0;
let mut e = self.parse_prefix()?;
match e {
Some(mut expr) => {
while let Some(token) = self.tokenizer.peek_token()? {
let next_precedence = self.tokenizer.precedence(&token);
if precedence >= next_precedence {
break;
}
expr = self.parse_infix(&expr, next_precedence)?.unwrap(); //TODO: fix me
}
Ok(Some(expr))
}
_ => {
Ok(None)
}
}
}
fn parse_prefix(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError> {
match self.tokenizer.next_token()? {
Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() { Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() {
"INSERT" => unimplemented!(), "INSERT" => unimplemented!(),
"UPDATE" => unimplemented!(), "UPDATE" => unimplemented!(),
@ -37,7 +63,7 @@ impl<TokenType, ExprType> SQLParser<TokenType, ExprType> for ANSISQLParser<Token
} }
} }
fn parse_infix(&mut self, _chars: &mut CharSeq, _left: &SQLExpr<ExprType>, _precedence: usize) -> Result<Option<Box<SQLExpr<ExprType>>>, ParserError<TokenType>> { fn parse_infix(&mut self, _left: &SQLExpr, _precedence: usize) -> Result<Option<Box<SQLExpr>>, ParserError> {
unimplemented!() unimplemented!()
} }
} }

View file

@ -4,26 +4,37 @@ use std::fmt::Debug;
use super::super::tokenizer::*; use super::super::tokenizer::*;
pub struct ANSISQLTokenizer { pub struct ANSISQLTokenizer {
chars: CharSeq
} }
impl<TokenType> SQLTokenizer<TokenType> for ANSISQLTokenizer impl ANSISQLTokenizer {
where TokenType: Debug + PartialEq { pub fn new(sql: &str) -> Self {
ANSISQLTokenizer { chars: CharSeq::new(sql) }
}
}
fn precedence(&self, _token: &SQLToken<TokenType>) -> usize { impl SQLTokenizer for ANSISQLTokenizer {
fn precedence(&self, _token: &SQLToken) -> usize {
unimplemented!() unimplemented!()
} }
fn next_token(&mut self, chars: &mut CharSeq) -> Result<Option<SQLToken<TokenType>>, TokenizerError> { fn peek_token(&mut self) -> Result<Option<SQLToken>, TokenizerError> {
match chars.next() { unimplemented!()
}
fn next_token(&mut self) -> Result<Option<SQLToken>, TokenizerError> {
match self.chars.next() {
Some(ch) => match ch { Some(ch) => match ch {
' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))), ' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))),
'0' ... '9' => { '0' ... '9' => {
let mut s = String::new(); let mut s = String::new();
s.push(ch); s.push(ch);
while let Some(&ch) = chars.peek() { while let Some(&ch) = self.chars.peek() {
match ch { match ch {
'0' ... '9' => { '0' ... '9' => {
chars.next(); // consume self.chars.next(); // consume
s.push(ch); s.push(ch);
}, },
_ => break _ => break

View file

@ -1,6 +1,5 @@
use std::cmp::PartialEq; use std::cmp::PartialEq;
use std::fmt::Debug; use std::fmt::Debug;
use std::sync::{Arc, Mutex};
use super::tokenizer::*; use super::tokenizer::*;
@ -8,7 +7,7 @@ use super::tokenizer::*;
/// ANSI SQL:2011 Data Types /// ANSI SQL:2011 Data Types
#[derive(Debug)] #[derive(Debug)]
pub enum SQLDataType<T> { pub enum SQLDataType {
/// BOOLEAN /// BOOLEAN
Boolean, Boolean,
/// NUMERIC, DECIMAL, DEC /// NUMERIC, DECIMAL, DEC
@ -49,14 +48,12 @@ pub enum SQLDataType<T> {
Time { precision: usize, tz: bool }, Time { precision: usize, tz: bool },
/// Time: `TIMESTAMP [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]` /// Time: `TIMESTAMP [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]`
Timestamp { precision: usize, tz: bool }, Timestamp { precision: usize, tz: bool },
/// Custom data type
Custom(T)
} }
#[derive(Debug)] #[derive(Debug)]
pub enum SQLOperator<T> { pub enum SQLOperator {
Plus, Plus,
Minus, Minus,
Mult, Mult,
@ -66,217 +63,44 @@ pub enum SQLOperator<T> {
GtEq, GtEq,
Lt, Lt,
LtEq, LtEq,
Custom(T) // extension point for vendor-specific operators
} }
/// SQL Expressions /// SQL Expressions
#[derive(Debug)] #[derive(Debug)]
pub enum SQLExpr<ExprType> { pub enum SQLExpr{
/// Identifier e.g. table name or column name /// Identifier e.g. table name or column name
Identifier(String), Identifier(String),
/// Literal value /// Literal value
Literal(String), Literal(String),
/// Binary expression e.g. `1 + 2` or `fname LIKE "A%"` /// Binary expression e.g. `1 + 2` or `fname LIKE "A%"`
Binary(Box<SQLExpr<ExprType>>, SQLOperator<ExprType>, Box<SQLExpr<ExprType>>), Binary(Box<SQLExpr>, SQLOperator, Box<SQLExpr>),
/// Function invocation with function name and list of argument expressions /// Function invocation with function name and list of argument expressions
FunctionCall(String, Vec<SQLExpr<ExprType>>), FunctionCall(String, Vec<SQLExpr>),
Insert, Insert,
Update, Update,
Delete, Delete,
Select, Select,
CreateTable, CreateTable,
/// Custom expression (vendor-specific)
Custom(ExprType)
} }
#[derive(Debug)] #[derive(Debug)]
pub enum ParserError<TokenType> pub enum ParserError {
where TokenType: Debug + PartialEq { WrongToken { expected: Vec<SQLToken>, actual: SQLToken, line: usize, col: usize },
WrongToken { expected: Vec<SQLToken<TokenType>>, actual: SQLToken<TokenType>, line: usize, col: usize },
Custom(String) Custom(String)
} }
impl<TokenType> From<TokenizerError> for ParserError<TokenType> impl From<TokenizerError> for ParserError {
where TokenType: Debug + PartialEq { fn from(e: TokenizerError) -> Self {
ParserError::Custom(format!("{:?}", e))
fn from(_: TokenizerError) -> Self {
unimplemented!()
} }
} }
pub trait SQLParser<TokenType, ExprType> pub trait SQLParser {
where TokenType: Debug + PartialEq, ExprType: Debug { fn parse_expr(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError>;
/// parse the prefix and stop once an infix operator is reached /// parse the prefix and stop once an infix operator is reached
fn parse_prefix(&mut self, chars: &mut CharSeq) -> Result<Option<Box<SQLExpr<ExprType>>>, ParserError<TokenType>> ; fn parse_prefix(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError> ;
/// parse the next infix expression, returning None if the precedence has changed /// parse the next infix expression, returning None if the precedence has changed
fn parse_infix(&mut self, chars: &mut CharSeq, left: &SQLExpr<ExprType>, precedence: usize) -> Result<Option<Box<SQLExpr<ExprType>>>, ParserError<TokenType>>; fn parse_infix(&mut self, left: &SQLExpr, precedence: usize) -> Result<Option<Box<SQLExpr>>, ParserError>;
} }
pub struct PrattParser<TokenType, ExprType> {
pub chars: CharSeq,
pub parsers: Vec<Arc<Mutex<SQLParser<TokenType, ExprType>>>>
}
impl<TokenType, ExprType> PrattParser<TokenType, ExprType> where TokenType: Debug + PartialEq, ExprType: Debug {
pub fn parse_expr(&mut self) -> Result<Option<Box<SQLExpr<ExprType>>>, ParserError<TokenType>> {
for i in 0..self.parsers.len() {
let mut p = self.parsers[i].lock().unwrap();
let expr = p.parse_prefix(&mut self.chars)?;
// return as soon as we have a match
match expr {
Some(_) => return Ok(expr),
_ => {}
}
}
// found no valid token
Ok(None)
}
}
//
//pub fn parse_expr<'a, TokenType, ExprType>(parser: Arc<Mutex<SQLParser<TokenType, ExprType>>>)
// -> Result<Box<SQLExpr<ExprType>>, ParserError<TokenType>> where TokenType: Debug + PartialEq, ExprType: Debug {
// let mut guard = parser.lock().unwrap();
//
// //Result<Box<SQLExpr<ExprType>>, ParserError<TokenType>>
// let x = guard.parse_prefix();
// x
//}
//impl<'a, TokenType, ExprType> PrattParser<'a, TokenType, ExprType>
// where TokenType: Debug + PartialEq, ExprType: Debug {
//
// pub fn parse_expr(&mut self) -> Result<Box<SQLExpr<ExprType>>, ParserError<TokenType>> {
//
// let precedence: usize = 0;
// let parser_ref = self.parser.as_ref();
//
// let mut expr = parser_ref.parse_prefix()?;
//
// while let Some(token) = self.tokenizer.peek_token(&mut self.chars)? {
//
// let next_precedence = self.tokenizer.precedence(&token);
//
// if precedence >= next_precedence {
// break;
// }
//
// expr = parser_ref.parse_infix(&expr, next_precedence)?.unwrap(); //TODO: fix me
// }
//
// Ok(expr)
// }
//
//}
// fn parse_prefix(&mut self) -> Result<Box<SQLExpr>, ParserError> {
//
// match self.tokenizer.peek_token()? {
// Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() {
// "INSERT" => unimplemented!(),
// "UPDATE" => unimplemented!(),
// "DELETE" => unimplemented!(),
// "SELECT" => unimplemented!(),
// "CREATE" => unimplemented!(),
// _ => unimplemented!()
// },
// _ => unimplemented!()
// }
// unimplemented!()
// }
//
// fn parse_infix(&mut self, expr: Box<SQLExpr>, precedence: u8) -> Result<Box<SQLExpr>, ParserError> {
//
// match self.tokenizer.next_token()? {
// Some(tok) => {
// match tok {
// SQLToken::Eq | SQLToken::Gt | SQLToken::GtEq |
// SQLToken::Lt | SQLToken::LtEq => Ok(Box::new(SQLExpr::Binary(
// expr,
// self.to_sql_operator(&tok),
// self.parse_expr(precedence)?
// ))),
// _ => Err(ParserError::WrongToken {
// expected: vec![SQLToken::Eq, SQLToken::Gt], //TODO: complete
// actual: tok,
// line: 0,
// col: 0
// })
// }
// },
// None => Err(ParserError::TBD)
// }
// }
//
// fn to_sql_operator(&self, token: &SQLToken) -> SQLOperator {
// unimplemented!()
// }
//
// fn get_precedence(&self, token: &SQLToken) -> u8 {
// unimplemented!()
// }
//
// /// parse a list of SQL expressions separated by a comma
// fn parse_expr_list(&mut self, precedence: u8) -> Result<Vec<SQLExpr>, ParserError> {
// unimplemented!()
// }
//
//}
//
////impl GenericParser {
////
//// fn tokenizer(&mut self) -> &mut SQLTokenizer {
//// &mut self.tokenizer
//// }
////
//// fn parse_keywords(&mut self, keywords: Vec<&str>) -> Result<bool, ParserError> {
//// unimplemented!()
//// }
////
////// fn parse_identifier(&mut self) -> Result<String, ParserError>;
////
////}
//
//
//
//#[cfg(test)]
//mod tests {
//
// use super::SQLToken::*;
// use super::*;
// #[test]
// fn parse_Acme_create_table() {
//
// // CREATE TABLE test (col1 int8) HASH (col1)
// let tokens = vec![
// k("CREATE"), k("TABLE"), i("test"), LParen,
// i("col1"), k("int8"),
// RParen,
// k("HASH"), LParen, i("col1"), RParen
// ];
//
// //let parser = AcmeParser { generic_parser: }
// }
// }
//
// fn k(s: &str) -> SQLToken {
// Keyword(s.to_string())
// }
//
// fn i(s: &str) -> SQLToken {
// Identifier(s.to_string())
// }
//
//
//}

View file

@ -70,7 +70,7 @@ pub enum TokenizerError {
/// SQL Tokens /// SQL Tokens
#[derive(Debug,PartialEq)] #[derive(Debug,PartialEq)]
pub enum SQLToken<T: Debug + PartialEq> { pub enum SQLToken {
Whitespace(char), Whitespace(char),
Keyword(String), Keyword(String),
Identifier(String), Identifier(String),
@ -89,31 +89,28 @@ pub enum SQLToken<T: Debug + PartialEq> {
LParen, LParen,
RParen, RParen,
Comma, Comma,
/// Custom token (dialect-specific)
Custom(T)
} }
pub trait SQLTokenizer<TokenType> pub trait SQLTokenizer {
where TokenType: Debug + PartialEq {
/// get the precedence of a token /// get the precedence of a token
fn precedence(&self, token: &SQLToken<TokenType>) -> usize; fn precedence(&self, token: &SQLToken) -> usize;
fn peek_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
/// return a reference to the next token and advance the index /// return a reference to the next token and advance the index
fn next_token(&mut self, chars: &mut CharSeq) -> Result<Option<SQLToken<TokenType>>, TokenizerError>; fn next_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
} }
pub fn tokenize<TokenType>(sql: &str, tokenizer: &mut SQLTokenizer<TokenType>) -> Result<Vec<SQLToken<TokenType>>, TokenizerError> pub fn tokenize(sql: &str, tokenizer: &mut SQLTokenizer) -> Result<Vec<SQLToken>, TokenizerError> {
where TokenType: Debug + PartialEq
{
let mut chars = CharSeq::new(sql); let mut chars = CharSeq::new(sql);
let mut tokens : Vec<SQLToken<TokenType>> = vec![]; let mut tokens : Vec<SQLToken> = vec![];
loop { loop {
match tokenizer.next_token(&mut chars)? { match tokenizer.next_token()? {
Some(SQLToken::Whitespace(_)) => { /* ignore */ }, Some(SQLToken::Whitespace(_)) => { /* ignore */ },
Some(token) => { Some(token) => {
println!("Token: {:?}", token); println!("Token: {:?}", token);