diff --git a/.gitignore b/.gitignore
index 50281a44..2552a537 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,6 @@ Cargo.lock
 
 # These are backup files generated by rustfmt
 **/*.rs.bk
+
+# IDEs
+.idea/
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 00000000..fec5e134
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,6 @@
+[package]
+name = "datafusion-sql"
+version = "0.1.0"
+authors = ["Andy Grove <andygrove73@gmail.com>"]
+
+[dependencies]
diff --git a/README.md b/README.md
index 496513ec..4f60e990 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,9 @@
 # datafusion-sql
-DataFusion SQL Parser
+
+DataFusion SQL Parser (v2)
+
+Goals:
+
+- Support for custom SQL dialects, so other projects can implement their own parsers easily
+- Zero-copy of tokens when parsing
+
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 00000000..ca81043d
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,151 @@
+
+/* --- TOKENIZER API --- */
+
+enum TokenizerError {
+    WrongToken { expected: SQLToken, actual: SQLToken, line: usize, col: usize },
+    TBD
+}
+
+/// SQL Tokens
+enum SQLToken {
+    Keyword(String),
+    Identifier(String),
+    Eq,
+    Gt,
+    GtEq,
+    Lt,
+    LtEq,
+    LParen,
+    RParen,
+    Comma,
+    Custom(Box<CustomToken>) // extension point for vendor-specific tokens
+}
+
+trait CustomToken {
+    //TODO: ???
+}
+
+trait SQLTokenizer<'a> {
+
+    // return a reference to the next token without consuming it (look ahead)
+    fn peek_token(&'a mut self) -> Result<Option<&'a SQLToken>, Box<TokenizerError>>;
+
+    // return a reference to the next token and advance the index
+    fn next_token(&'a mut self) -> Result<Option<&'a SQLToken>, Box<TokenizerError>>;
+}
+
+/* --- PARSER API --- */
+
+/// SQL Operators
+enum SQLOperator {
+    Plus,
+    Minus,
+    Mult,
+    Div,
+    Eq,
+    Gt,
+    GtEq,
+    Lt,
+    LtEq,
+    Custom(Box<CustomOperator>) // extension point for vendor-specific operators
+}
+
+trait CustomOperator {
+    //TODO: ???
+}
+
+/// SQL Expressions
+enum SQLExpr {
+    /// Identifier e.g. table name or column name
+    Identifier(String),
+    /// Literal value
+    Literal(String),
+    /// Binary expression e.g. `1 + 2` or `fname LIKE "A%"`
+    Binary(Box<SQLExpr>, SQLOperator, Box<SQLExpr>),
+    /// Function invocation with function name and list of argument expressions
+    FunctionCall(String, Vec<SQLExpr>),
+    /// Custom expression (vendor-specific)
+    Custom(Box<CustomExpr>)
+}
+
+trait CustomExpr {
+    //TODO: ???
+}
+
+enum ParserError {
+    TBD
+}
+
+trait Parser<'a> {
+    fn parse_expr(&mut self) -> Result<Box<SQLExpr>, Box<ParserError>>;
+    fn parse_expr_list(&mut self) -> Result<Vec<SQLExpr>, Box<ParserError>>;
+    fn parse_identifier(&mut self) -> Result<String, Box<ParserError>>;
+    fn parse_keywords(&mut self, keywords: Vec<&str>) -> Result<bool, Box<ParserError>>;
+}
+
+/* --- KUDU PARSER IMPL --- */
+
+struct KuduParser<'a> {
+    generic_parser: Box<Parser<'a>>
+}
+
+impl<'a> Parser<'a> for KuduParser<'a> {
+
+    fn parse_expr(&mut self) -> Result<Box<SQLExpr>, Box<ParserError>> {
+        self.generic_parser.parse_expr()
+    }
+
+    fn parse_expr_list(&mut self) -> Result<Vec<SQLExpr>, Box<ParserError>> {
+        self.generic_parser.parse_expr_list()
+    }
+
+    fn parse_identifier(&mut self) -> Result<String, Box<ParserError>> {
+        self.generic_parser.parse_identifier()
+    }
+
+    fn parse_keywords(&mut self, keywords: Vec<&str>) -> Result<bool, Box<ParserError>> {
+        self.parse_keywords(keywords)
+    }
+}
+
+/* --- PRATT PARSER IMPL --- */
+
+struct PrattParser<'a> {
+    parser: Box<Parser<'a>>
+}
+
+impl<'a> PrattParser<'a> {
+
+    fn parse_expr(&'a mut self, precedence: u8) -> SQLExpr {
+        unimplemented!()
+    }
+
+//
+//        // Not complete/accurate, but enough to demonstrate the concept that the pratt parser
+//        // does not need knowledge of the specific tokenizer or parser to operate
+//
+//        loop {
+//            match self.tokenizer.peek_token() {
+//                Ok(Some(token)) => {
+//                    let next_precedence = self.parser.get_precedence(&token);
+//                    unimplemented!()
+//                },
+//                _ => {
+//                }
+//            }
+//        }
+//
+//
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+
+    #[test]
+    fn it_works() {
+        let tokens = vec![
+            SQLToken::Keyword("CREATE".to_string()),
+            SQLToken::Keyword("TABLE".to_string()),
+            SQLToken::Keyword("test".to_string()),
+        ];
+    }
+}