diff --git a/examples/acme_parser.rs b/examples/acme_parser.rs
new file mode 100644
index 00000000..87af229b
--- /dev/null
+++ b/examples/acme_parser.rs
@@ -0,0 +1,52 @@
+use std::str::Chars;
+
+extern crate datafusion_sql;
+
+use datafusion_sql::tokenizer::*;
+use datafusion_sql::parser::*;
+
+#[derive(Debug)]
+enum AcmeToken {
+    /// Factorial operator `!!`
+    Factorial
+}
+
+
+#[derive(Debug)]
+enum AcmeOperator {
+    Factorial
+}
+
+#[derive(Debug)]
+enum AcmeTokenizerError {
+}
+
+struct AcmeTokenizer {
+    //chars: &'a Chars
+}
+
+impl SQLTokenizer<AcmeToken, AcmeTokenizerError> for AcmeTokenizer {
+
+    fn peek_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
+        Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
+    }
+
+    fn next_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
+        Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
+    }
+}
+
+
+fn main() {
+
+    let sql = "1 + !! 5 * 2";
+
+    let mut tokenizer = AcmeTokenizer { };
+
+    println!("token = {:?}", tokenizer.peek_token().unwrap());
+
+}
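
// Sketch (not part of this patch): what a non-stub `next_token` for the Acme
// dialect might look like once AcmeTokenizer owns a character stream, as the
// commented-out `chars` field above hints. The `Peekable<Chars>` field, the
// struct name, and the plain String error type are illustrative assumptions;
// a real impl would return TokenizerError<AcmeTokenizerError> as above.

use std::iter::Peekable;
use std::str::Chars;

struct AcmeCharTokenizer<'a> {
    chars: Peekable<Chars<'a>>,
}

impl<'a> AcmeCharTokenizer<'a> {
    fn next_acme_token(&mut self) -> Result<Option<AcmeToken>, String> {
        match self.chars.next() {
            // the custom factorial operator is two consecutive '!' characters
            Some('!') => match self.chars.next() {
                Some('!') => Ok(Some(AcmeToken::Factorial)),
                other => Err(format!("expected '!' but found {:?}", other)),
            },
            // end of input: no more tokens
            None => Ok(None),
            // anything else would be handed off to a generic ANSI tokenizer
            Some(other) => Err(format!("unhandled character {:?}", other)),
        }
    }
}
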
diff --git a/src/lib.rs b/src/lib.rs
index 4d20d9b2..81b9c468 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,278 +1,2 @@
-///
-/// This is a mock up of some key data structures and traits for a SQL parser
-/// that can be used with custom dialects
-///
-
-/* --- TOKENIZER API --- */
-
-enum TokenizerError {
-    TBD
-}
-
-/// SQL Tokens
-enum SQLToken {
-    Keyword(String),
-    Identifier(String),
-    Literal(String), //TODO: need to model different types of literal
-    Eq,
-    NotEq,
-    Gt,
-    GtEq,
-    Lt,
-    LtEq,
-    LParen,
-    RParen,
-    Comma,
-    Custom(Box<CustomToken>) // extension point for vendor-specific tokens
-}
-
-trait CustomToken {
-    //TODO: ???
-}
-
-trait SQLTokenizer {
-    // return a reference to the next token without consuming it (look ahead)
-    fn peek_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
-    // return a reference to the next token and advance the index
-    fn next_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
-}
-
-/* --- PARSER API --- */
-
-/// SQL Operators
-enum SQLOperator {
-    Plus,
-    Minus,
-    Mult,
-    Div,
-    Eq,
-    Gt,
-    GtEq,
-    Lt,
-    LtEq,
-    Custom(Box<CustomOperator>) // extension point for vendor-specific operators
-}
-
-trait CustomOperator {
-    //TODO: ???
-}
-
-/// SQL Expressions
-enum SQLExpr {
-    /// Identifier e.g. table name or column name
-    Identifier(String),
-    /// Literal value
-    Literal(String),
-    /// Binary expression e.g. `1 + 2` or `fname LIKE "A%"`
-    Binary(Box<SQLExpr>, SQLOperator, Box<SQLExpr>),
-    /// Function invocation with function name and list of argument expressions
-    FunctionCall(String, Vec<SQLExpr>),
-    Insert,
-    Update,
-    Delete,
-    Select,
-    CreateTable,
-    /// Custom expression (vendor-specific)
-    Custom(Box<CustomExpr>)
-}
-
-trait CustomExpr {
-    //TODO: ???
-}
-
-enum ParserError {
-    WrongToken { expected: Vec<SQLToken>, actual: SQLToken, line: usize, col: usize },
-    TBD
-}
-
-impl From<TokenizerError> for ParserError {
-    fn from(_: TokenizerError) -> Self {
-        unimplemented!()
-    }
-}
-
-trait Parser {
-    fn parse_prefix(&mut self) -> Result<Box<SQLExpr>, ParserError>;
-    fn parse_infix(&mut self, left: SQLExpr) -> Result<Box<SQLExpr>, ParserError>;
-}
-
-/* -- GENERIC (ANSI SQL) PARSER -- */
-
-struct GenericParser {
-    tokenizer: SQLTokenizer
-}
-
-impl GenericParser {
-
-    fn parse_expr(&mut self, precedence: u8) -> Result<Box<SQLExpr>, ParserError> {
-
-        let mut expr = self.parse_prefix()?;
-
-        // loop while there are more tokens and until the precedence changes
-        while let Some(token) = self.tokenizer.peek_token()? {
-
-            let next_precedence = self.get_precedence(&token);
-
-            if precedence >= next_precedence {
-                break;
-            }
-
-            expr = self.parse_infix(expr, next_precedence)?;
-        }
-
-        Ok(expr)
-    }
-
-    fn parse_prefix(&mut self) -> Result<Box<SQLExpr>, ParserError> {
-
-        match self.tokenizer.peek_token()? {
-            Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() {
-                "INSERT" => unimplemented!(),
-                "UPDATE" => unimplemented!(),
-                "DELETE" => unimplemented!(),
-                "SELECT" => unimplemented!(),
-                "CREATE" => unimplemented!(),
-                _ => unimplemented!()
-            },
-            _ => unimplemented!()
-        }
-        unimplemented!()
-    }
-
-    fn parse_infix(&mut self, expr: Box<SQLExpr>, precedence: u8) -> Result<Box<SQLExpr>, ParserError> {
-
-        match self.tokenizer.next_token()? {
-            Some(tok) => {
-                match tok {
-                    SQLToken::Eq | SQLToken::Gt | SQLToken::GtEq |
-                    SQLToken::Lt | SQLToken::LtEq => Ok(Box::new(SQLExpr::Binary(
-                        expr,
-                        self.to_sql_operator(&tok),
-                        self.parse_expr(precedence)?
-                    ))),
-                    _ => Err(ParserError::WrongToken {
-                        expected: vec![SQLToken::Eq, SQLToken::Gt], //TODO: complete
-                        actual: tok,
-                        line: 0,
-                        col: 0
-                    })
-                }
-            },
-            None => Err(ParserError::TBD)
-        }
-    }
-
-    fn to_sql_operator(&self, token: &SQLToken) -> SQLOperator {
-        unimplemented!()
-    }
-
-    fn get_precedence(&self, token: &SQLToken) -> u8 {
-        unimplemented!()
-    }
-
-    /// parse a list of SQL expressions separated by a comma
-    fn parse_expr_list(&mut self, precedence: u8) -> Result<Vec<SQLExpr>, ParserError> {
-        unimplemented!()
-    }
-
-}
-
-//impl GenericParser {
-//
-//    fn tokenizer(&mut self) -> &mut SQLTokenizer {
-//        &mut self.tokenizer
-//    }
-//
-//    fn parse_keywords(&mut self, keywords: Vec<&str>) -> Result<bool, ParserError> {
-//        unimplemented!()
-//    }
-//
-////    fn parse_identifier(&mut self) -> Result<String, ParserError>;
-//
-//}
-
-/* --- KUDU PARSER IMPL --- */
-
-
-///// KuduParser is a wrapper around GenericParser
-//struct KuduParser {
-//    generic: GenericParser
-//}
-//
-//impl Parser for KuduParser {
-//
-//    fn parse_prefix(&mut self) -> Result<Box<SQLExpr>, ParserError> {
-//
-//        // just take over the statements we need to and delegate everything else
-//        // to the generic parser
-//        if self.generic.parse_keywords(vec!["CREATE", "TABLE"])? {
-//
-//            //TODO: insert kudu CREATE TABLE parsing logic here
-//            // .. we can delegate to the generic parsers for parts of that even
-//
-//            // mock response
-//            let kudu_create_table = KuduCreateTable {
-//                partition: vec![KuduPartition::Hash]
-//            };
-//
-//            Ok(Box::new(SQLExpr::Custom(Box::new(kudu_create_table))))
-//        } else {
-//            _ => self.generic.parse_prefix()
-//        }
-//    }
-//
-//    fn parse_infix(&mut self) -> Result<Box<SQLExpr>, ParserError> {
-//        self.generic.parse_infix()
-//    }
-//}
-//
-//impl KuduParser {
-//
-//    fn tokenizer(&mut self) -> &mut SQLTokenizer {
-//        &mut self.generic.tokenizer
-//    }
-//
-//}
-//
-//enum KuduPartition {
-//    Hash,
-//    Range,
-//}
-//
-//struct KuduCreateTable {
-//    partition: Vec<KuduPartition>
-//}
-//
-//impl CustomExpr for KuduCreateTable {
-//
-//}
-
-#[cfg(test)]
-mod tests {
-
-    use super::SQLToken::*;
-    use super::*;
-    #[test]
-    fn parse_kudu_create_table() {
-
-        // CREATE TABLE test (col1 int8) HASH (col1)
-        let tokens = vec![
-            k("CREATE"), k("TABLE"), i("test"), LParen,
-            i("col1"), k("int8"),
-            RParen,
-            k("HASH"), LParen, i("col1"), RParen
-        ];
-
-        //let parser = KuduParser { generic_parser: }
-    }
-
-    fn k(s: &str) -> SQLToken {
-        Keyword(s.to_string())
-    }
-
-    fn i(s: &str) -> SQLToken {
-        Identifier(s.to_string())
-    }
-
-
-}
+pub mod tokenizer;
+pub mod parser;
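
// Sketch (not part of this patch): the lib.rs mockup deleted above used a
// Pratt-style precedence-climbing loop in GenericParser::parse_expr (the same
// loop survives, commented out, in src/parser.rs below). Here is that loop
// reduced to single-digit arithmetic so it runs standalone; the token
// representation and precedence table are illustrative only.

fn precedence(op: char) -> u8 {
    match op {
        '+' | '-' => 1,
        '*' | '/' => 2,
        _ => 0,
    }
}

fn parse_arith(tokens: &[char], pos: &mut usize, min_prec: u8) -> i64 {
    // "parse_prefix": a single-digit operand
    let mut lhs = tokens[*pos].to_digit(10).unwrap() as i64;
    *pos += 1;
    // loop while there are more tokens and until the precedence drops
    while *pos < tokens.len() {
        let op = tokens[*pos];
        let next_prec = precedence(op);
        if min_prec >= next_prec {
            break;
        }
        *pos += 1;
        // "parse_infix": recurse with the operator's precedence as the floor
        let rhs = parse_arith(tokens, pos, next_prec);
        lhs = match op {
            '+' => lhs + rhs,
            '-' => lhs - rhs,
            '*' => lhs * rhs,
            _ => lhs / rhs,
        };
    }
    lhs
}

// parse_arith(&['1', '+', '2', '*', '3'], &mut 0, 0) evaluates to 7: `*`
// binds tighter than `+`, mirroring what get_precedence expressed above.
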
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 00000000..d0f012ea
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,186 @@
+use super::tokenizer::*;
+
+#[derive(Debug)]
+pub enum SQLOperator<T> {
+    Plus,
+    Minus,
+    Mult,
+    Div,
+    Eq,
+    Gt,
+    GtEq,
+    Lt,
+    LtEq,
+    Custom(T) // extension point for vendor-specific operators
+}
+
+/// SQL Expressions
+#[derive(Debug)]
+pub enum SQLExpr<T> {
+    /// Identifier e.g. table name or column name
+    Identifier(String),
+    /// Literal value
+    Literal(String),
+    /// Binary expression e.g. `1 + 2` or `fname LIKE "A%"`
+    Binary(Box<SQLExpr<T>>, SQLOperator<T>, Box<SQLExpr<T>>),
+    /// Function invocation with function name and list of argument expressions
+    FunctionCall(String, Vec<SQLExpr<T>>),
+    Insert,
+    Update,
+    Delete,
+    Select,
+    CreateTable,
+    /// Custom expression (vendor-specific)
+    Custom(T)
+}
+
+#[derive(Debug)]
+pub enum ParserError<S, T> {
+    WrongToken { expected: Vec<SQLToken<S>>, actual: SQLToken<S>, line: usize, col: usize },
+    Custom(T)
+}
+
+impl<S, T> From<TokenizerError<T>> for ParserError<S, T> {
+    fn from(_: TokenizerError<T>) -> Self {
+        unimplemented!()
+    }
+}
+
+trait Parser<S, T> {
+    /// parse the prefix and stop once an infix operator is reached
+    fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<S, T>>;
+    /// parse the next infix expression, returning None if the precedence has changed
+    fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<S, T>>;
+}
+
+//
+//
+//struct GenericParser {
+//    tokenizer: SQLTokenizer
+//}
+//
+//impl GenericParser {
+//
+//    fn parse_expr(&mut self, precedence: u8) -> Result<Box<SQLExpr>, ParserError> {
+//
+//        let mut expr = self.parse_prefix()?;
+//
+//        // loop while there are more tokens and until the precedence changes
+//        while let Some(token) = self.tokenizer.peek_token()? {
+//
+//            let next_precedence = self.get_precedence(&token);
+//
+//            if precedence >= next_precedence {
+//                break;
+//            }
+//
+//            expr = self.parse_infix(expr, next_precedence)?;
+//        }
+//
+//        Ok(expr)
+//    }
+//
+//    fn parse_prefix(&mut self) -> Result<Box<SQLExpr>, ParserError> {
+//
+//        match self.tokenizer.peek_token()? {
+//            Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() {
+//                "INSERT" => unimplemented!(),
+//                "UPDATE" => unimplemented!(),
+//                "DELETE" => unimplemented!(),
+//                "SELECT" => unimplemented!(),
+//                "CREATE" => unimplemented!(),
+//                _ => unimplemented!()
+//            },
+//            _ => unimplemented!()
+//        }
+//        unimplemented!()
+//    }
+//
+//    fn parse_infix(&mut self, expr: Box<SQLExpr>, precedence: u8) -> Result<Box<SQLExpr>, ParserError> {
+//
+//        match self.tokenizer.next_token()? {
+//            Some(tok) => {
+//                match tok {
+//                    SQLToken::Eq | SQLToken::Gt | SQLToken::GtEq |
+//                    SQLToken::Lt | SQLToken::LtEq => Ok(Box::new(SQLExpr::Binary(
+//                        expr,
+//                        self.to_sql_operator(&tok),
+//                        self.parse_expr(precedence)?
+//                    ))),
+//                    _ => Err(ParserError::WrongToken {
+//                        expected: vec![SQLToken::Eq, SQLToken::Gt], //TODO: complete
+//                        actual: tok,
+//                        line: 0,
+//                        col: 0
+//                    })
+//                }
+//            },
+//            None => Err(ParserError::TBD)
+//        }
+//    }
+//
+//    fn to_sql_operator(&self, token: &SQLToken) -> SQLOperator {
+//        unimplemented!()
+//    }
+//
+//    fn get_precedence(&self, token: &SQLToken) -> u8 {
+//        unimplemented!()
+//    }
+//
+//    /// parse a list of SQL expressions separated by a comma
+//    fn parse_expr_list(&mut self, precedence: u8) -> Result<Vec<SQLExpr>, ParserError> {
+//        unimplemented!()
+//    }
+//
+//}
+//
+////impl GenericParser {
+////
+////    fn tokenizer(&mut self) -> &mut SQLTokenizer {
+////        &mut self.tokenizer
+////    }
+////
+////    fn parse_keywords(&mut self, keywords: Vec<&str>) -> Result<bool, ParserError> {
+////        unimplemented!()
+////    }
+////
+//////    fn parse_identifier(&mut self) -> Result<String, ParserError>;
+////
+////}
+//
+
+//
+//
+//#[cfg(test)]
+//mod tests {
+//
+//    use super::SQLToken::*;
+//    use super::*;
+//    #[test]
+//    fn parse_Acme_create_table() {
+//
+//        // CREATE TABLE test (col1 int8) HASH (col1)
+//        let tokens = vec![
+//            k("CREATE"), k("TABLE"), i("test"), LParen,
+//            i("col1"), k("int8"),
+//            RParen,
+//            k("HASH"), LParen, i("col1"), RParen
+//        ];
+//
+//        //let parser = AcmeParser { generic_parser: }
+//    }
+//
+//    fn k(s: &str) -> SQLToken {
+//        Keyword(s.to_string())
+//    }
+//
+//    fn i(s: &str) -> SQLToken {
+//        Identifier(s.to_string())
+//    }
+//
+//
+//}
+
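
// Sketch (not part of this patch): instantiating the generic AST types that
// src/parser.rs adds above. Unit `()` stands in for a dialect with no custom
// expressions or operators, so `1 + 2` becomes a plain Binary node.

fn one_plus_two() -> SQLExpr<()> {
    SQLExpr::Binary(
        Box::new(SQLExpr::Literal("1".to_string())),
        SQLOperator::Plus,
        Box::new(SQLExpr::Literal("2".to_string())),
    )
}
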
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 00000000..8a891006
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,39 @@
+
+#[derive(Debug)]
+pub struct Position {
+    line: usize,
+    col: usize
+}
+
+#[derive(Debug)]
+pub enum TokenizerError<T> {
+    UnexpectedEof(Position),
+    UnterminatedStringLiteral(Position),
+    Custom(T)
+}
+
+/// SQL Tokens
+#[derive(Debug)]
+pub enum SQLToken<T> {
+    Keyword(String), //TODO: &str ?
+    Identifier(String), //TODO: &str ?
+    Literal(String), //TODO: need to model different types of literal
+    Eq,
+    NotEq,
+    Gt,
+    GtEq,
+    Lt,
+    LtEq,
+    LParen,
+    RParen,
+    Comma,
+    /// Custom token
+    Custom(T)
+}
+
+pub trait SQLTokenizer<S, T> {
+    /// return a reference to the next token without consuming it (look ahead)
+    fn peek_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
+    /// return a reference to the next token and advance the index
+    fn next_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
+}
\ No newline at end of file
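
// Sketch (not part of this patch): the same SQLTokenizer trait instantiated
// for a dialect with no custom tokens and no custom errors, using `()` for
// both type parameters. Compare with the AcmeTokenizer impl in the example
// at the top of this patch, which plugs in AcmeToken/AcmeTokenizerError.

struct EmptyTokenizer;

impl SQLTokenizer<(), ()> for EmptyTokenizer {
    fn peek_token(&mut self) -> Result<Option<SQLToken<()>>, TokenizerError<()>> {
        Ok(None) // nothing to look ahead at
    }
    fn next_token(&mut self) -> Result<Option<SQLToken<()>>, TokenizerError<()>> {
        Ok(None) // the stream is always exhausted
    }
}
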