diff --git a/Cargo.toml b/Cargo.toml index fec5e134..239bbb2c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,22 @@ [package] -name = "datafusion-sql" +name = "sqlparser" +description = "ANSI SQL parser" version = "0.1.0" authors = ["Andy Grove "] +homepage = "https://github.com/andygrove/sqlparser-rs" +documentation = "https://github.com/andygrove/sqlparser-rs" +keywords = [ "sql", "lexer", "parser" ] +repository = "https://github.com/andygrove/sqlparser-rs" +license = "Apache-2.0" +include = [ + "src/**/*.rs", + "Cargo.toml", +] + +[lib] +name = "sqlparser" +path = "src/lib.rs" [dependencies] +fnv = "1.0.3" +lazy_static = "1.0" \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. 
The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. 
- - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/LICENSE.TXT b/LICENSE.TXT new file mode 100644 index 00000000..16fe87b0 --- /dev/null +++ b/LICENSE.TXT @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/README.md b/README.md index b7abbf38..a273243e 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,3 @@ -# datafusion-sql - -This is a work-in-progress to develop a new version of the DataFusion SQL Parser. - -Goals for this version: - -- Support for custom SQL dialects, so other projects can implement their own parsers easily -- Good error reporting (e.g. 
show line / column numbers and descriptive messages) -- Zero-copy of tokens when parsing -- Concise code -- Full support for ANSI SQL:2011 standard +# SQL Parser +TBD \ No newline at end of file diff --git a/examples/parse_sql.rs b/examples/parse_sql.rs deleted file mode 100644 index b1b85e72..00000000 --- a/examples/parse_sql.rs +++ /dev/null @@ -1,20 +0,0 @@ -use std::sync::{Arc, Mutex}; - -extern crate datafusion_sql; - -use datafusion_sql::ansi::tokenizer::ANSISQLTokenizer; -use datafusion_sql::ansi::parser::ANSISQLParser; -use datafusion_sql::tokenizer::*; -use datafusion_sql::parser::*; - - -fn main() { - - let sql = "SELECT 1 + 1"; - - // Create parsers - match ANSISQLParser::parse(sql).unwrap() { - Some(ast) => println!("{:?}", ast), - _ => {} - } -} diff --git a/src/ansi/mod.rs b/src/ansi/mod.rs deleted file mode 100644 index f5546084..00000000 --- a/src/ansi/mod.rs +++ /dev/null @@ -1,3 +0,0 @@ - -pub mod tokenizer; -pub mod parser; \ No newline at end of file diff --git a/src/ansi/parser.rs b/src/ansi/parser.rs deleted file mode 100644 index f0259c31..00000000 --- a/src/ansi/parser.rs +++ /dev/null @@ -1,70 +0,0 @@ -use std::cmp::PartialEq; -use std::fmt::Debug; -//use std::rc::Rc; -//use std::sync::{Arc, Mutex}; - -use super::tokenizer::ANSISQLTokenizer; -use super::super::tokenizer::*; -use super::super::parser::*; - -pub struct ANSISQLParser { - tokenizer: Box -} - -impl ANSISQLParser where { - - pub fn parse(sql: &str) -> Result>, ParserError> { - let mut parser = ANSISQLParser { tokenizer: Box::new(ANSISQLTokenizer::new(sql)) }; - parser.parse_expr() - } -} - -impl SQLParser for ANSISQLParser { - - fn parse_expr(&mut self) -> Result>, ParserError> { - - let precedence: usize = 0; - - let mut e = self.parse_prefix()?; - - match e { - Some(mut expr) => { - while let Some(token) = self.tokenizer.peek_token()? { - let next_precedence = self.tokenizer.precedence(&token); - - if precedence >= next_precedence { - break; - } - - expr = self.parse_infix(&expr, next_precedence)?.unwrap(); //TODO: fix me - } - - Ok(Some(expr)) - } - _ => { - Ok(None) - } - } - - } - - fn parse_prefix(&mut self) -> Result>, ParserError> { - - match self.tokenizer.next_token()? { - Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() { - "INSERT" => unimplemented!(), - "UPDATE" => unimplemented!(), - "DELETE" => unimplemented!(), - "SELECT" => unimplemented!(), - "CREATE" => unimplemented!(), - _ => unimplemented!() - }, - _ => unimplemented!() - } - } - - fn parse_infix(&mut self, _left: &SQLExpr, _precedence: usize) -> Result>, ParserError> { - unimplemented!() - } -} - diff --git a/src/ansi/tokenizer.rs b/src/ansi/tokenizer.rs deleted file mode 100644 index 9714559f..00000000 --- a/src/ansi/tokenizer.rs +++ /dev/null @@ -1,56 +0,0 @@ -use std::cmp::PartialEq; -use std::fmt::Debug; - -use super::super::tokenizer::*; - -pub struct ANSISQLTokenizer { - chars: CharSeq -} - -impl ANSISQLTokenizer { - pub fn new(sql: &str) -> Self { - ANSISQLTokenizer { chars: CharSeq::new(sql) } - } -} - -impl SQLTokenizer for ANSISQLTokenizer { - - fn precedence(&self, _token: &SQLToken) -> usize { - unimplemented!() - } - - fn peek_token(&mut self) -> Result, TokenizerError> { - unimplemented!() - } - - - fn next_token(&mut self) -> Result, TokenizerError> { - match self.chars.next() { - Some(ch) => match ch { - ' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))), - '0' ... '9' => { - let mut s = String::new(); - s.push(ch); - while let Some(&ch) = self.chars.peek() { - match ch { - '0' ... 
'9' => { - self.chars.next(); // consume - s.push(ch); - }, - _ => break - } - } - Ok(Some(SQLToken::Literal(s))) - }, - '+' => Ok(Some(SQLToken::Plus)), - '-' => Ok(Some(SQLToken::Minus)), - '*' => Ok(Some(SQLToken::Mult)), - '/' => Ok(Some(SQLToken::Divide)), - _ => Err(TokenizerError::UnexpectedChar(ch,Position::new(0, 0))) - }, - None => Ok(None) - } - } - -} - diff --git a/src/lib.rs b/src/lib.rs index bbf236ec..7d09ca50 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,22 @@ -pub mod ansi; -pub mod tokenizer; -pub mod parser; +// Copyright 2018 Grove Enterprises LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +extern crate fnv; + +#[macro_use] +extern crate lazy_static; + +pub mod sqlast; +pub mod sqlparser; +pub mod sqltokenizer; diff --git a/src/parser.rs b/src/parser.rs deleted file mode 100644 index afbfda1d..00000000 --- a/src/parser.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::cmp::PartialEq; -use std::fmt::Debug; - -use super::tokenizer::*; - -// https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html - -/// ANSI SQL:2011 Data Types -#[derive(Debug)] -pub enum SQLDataType { - /// BOOLEAN - Boolean, - /// NUMERIC, DECIMAL, DEC - Numeric { precision: usize, scale: Option }, - /// SMALLINT - SmallInt, - /// INT, INTEGER - Int, - /// BIGINT - BigInt, - /// Floating point: `FLOAT(precision)` - Float(usize), - /// REAL - Real, - /// Double: `DOUBLE PRECISION` - Double, - /// Fixed-length character. `CHAR, CHARACTER` - Char(usize), - /// Variable-length character: `VARCHAR, CHARACTER VARYING, CHAR VARYING` - VarChar(usize), - /// Character Large Object: `CHARACTER LARGE OBJECT, CHAR LARGE OBJECT, CLOB` - Clob(usize), - /// Fixed-length character. `NCHAR, NATIONAL CHAR, NATIONAL CHARACTER` - NChar(usize), - /// Variable-length character: `NCHAR VARYING, NATIONAL CHARACTER VARYING, NATIONAL CHAR VARYING` - NVarChar(usize), - /// National Character Large Object: `NATIONAL CHARACTER LARGE OBJECT, NCHAR LARGE OBJECT, NCLOB` - NClob(usize), - /// Fixed-length binary - Binary(usize), - /// Variable-length binary - VarBinary(usize), - /// Binary large object - Blob(usize), - /// Date - Date, - /// Time: `TIME [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]` - Time { precision: usize, tz: bool }, - /// Time: `TIMESTAMP [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]` - Timestamp { precision: usize, tz: bool }, -} - - - -#[derive(Debug)] -pub enum SQLOperator { - Plus, - Minus, - Mult, - Div, - Eq, - Gt, - GtEq, - Lt, - LtEq, -} - -/// SQL Expressions -#[derive(Debug)] -pub enum SQLExpr{ - /// Identifier e.g. table name or column name - Identifier(String), - /// Literal value - Literal(String), - /// Binary expression e.g. 
`1 + 2` or `fname LIKE "A%"` - Binary(Box, SQLOperator, Box), - /// Function invocation with function name and list of argument expressions - FunctionCall(String, Vec), - Insert, - Update, - Delete, - Select, - CreateTable, -} - -#[derive(Debug)] -pub enum ParserError { - WrongToken { expected: Vec, actual: SQLToken, line: usize, col: usize }, - Custom(String) -} - -impl From for ParserError { - fn from(e: TokenizerError) -> Self { - ParserError::Custom(format!("{:?}", e)) - } -} - - -pub trait SQLParser { - fn parse_expr(&mut self) -> Result>, ParserError>; - /// parse the prefix and stop once an infix operator is reached - fn parse_prefix(&mut self) -> Result>, ParserError> ; - /// parse the next infix expression, returning None if the precedence has changed - fn parse_infix(&mut self, left: &SQLExpr, precedence: usize) -> Result>, ParserError>; -} - diff --git a/src/sqlast.rs b/src/sqlast.rs new file mode 100644 index 00000000..d147aecc --- /dev/null +++ b/src/sqlast.rs @@ -0,0 +1,122 @@ +// Copyright 2018 Grove Enterprises LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! SQL Abstract Syntax Tree (AST) types + +/// Supported file types for `CREATE EXTERNAL TABLE` +#[derive(Debug, Clone, PartialEq)] +pub enum FileType { + CSV, + NdJson, + Parquet, +} + +/// SQL Abstract Syntax Tree (AST) +#[derive(Debug, Clone, PartialEq)] +pub enum ASTNode { + SQLIdentifier(String), + SQLWildcard, + SQLCompoundIdentifier(Vec), + SQLIsNull(Box), + SQLIsNotNull(Box), + SQLBinaryExpr { + left: Box, + op: SQLOperator, + right: Box, + }, + SQLCast { + expr: Box, + data_type: SQLType, + }, + SQLNested(Box), + SQLUnary { + operator: SQLOperator, + rex: Box, + }, + SQLLiteralLong(i64), + SQLLiteralDouble(f64), + SQLLiteralString(String), + SQLFunction { + id: String, + args: Vec, + }, + SQLOrderBy { + expr: Box, + asc: bool, + }, + SQLSelect { + projection: Vec, + relation: Option>, + selection: Option>, + order_by: Option>, + group_by: Option>, + having: Option>, + limit: Option>, + }, + SQLCreateTable { + /// Table name + name: String, + /// Optional schema + columns: Vec, + /// File type (CSV or Parquet) + file_type: FileType, + /// For CSV files, indicate whether the file has a header row or not + header_row: bool, + /// Path to file or directory contianing files + location: String, + }, +} + +/// SQL column definition +#[derive(Debug, Clone, PartialEq)] +pub struct SQLColumnDef { + pub name: String, + pub data_type: SQLType, + pub allow_null: bool, +} + +/// SQL datatypes for literals in SQL statements +#[derive(Debug, Clone, PartialEq)] +pub enum SQLType { + Boolean, + UInt8, + UInt16, + UInt32, + UInt64, + Int8, + Int16, + Int32, + Int64, + Float32, + Double64, + Utf8(usize), +} + +/// SQL Operator +#[derive(Debug, PartialEq, Clone)] +pub enum SQLOperator { + Plus, + Minus, + Multiply, + Divide, + Modulus, + Gt, + Lt, + GtEq, + LtEq, + Eq, + NotEq, + And, + Or, +} diff --git a/src/sqlparser.rs b/src/sqlparser.rs new file mode 100644 index 00000000..c2b4c56d --- /dev/null +++ 
b/src/sqlparser.rs @@ -0,0 +1,971 @@ +// Copyright 2018 Grove Enterprises LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! SQL Parser + +use super::sqlast::*; +use super::sqltokenizer::*; + +#[derive(Debug, Clone)] +pub enum ParserError { + TokenizerError(String), + ParserError(String), +} + +macro_rules! parser_err { + ($MSG:expr) => { + Err(ParserError::ParserError($MSG.to_string())) + }; +} + +impl From for ParserError { + fn from(e: TokenizerError) -> Self { + ParserError::TokenizerError(format!("{:?}", e)) + } +} + +/// SQL Parser +pub struct Parser { + tokens: Vec, + index: usize, +} + +impl Parser { + /// Parse the specified tokens + pub fn new(tokens: Vec) -> Self { + Parser { + tokens: tokens, + index: 0, + } + } + + /// Parse a SQL statement and produce an Abstract Syntax Tree (AST) + pub fn parse_sql(sql: String) -> Result { + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize()?; + let mut parser = Parser::new(tokens); + parser.parse() + } + + /// Parse a new expression + pub fn parse(&mut self) -> Result { + self.parse_expr(0) + } + + /// Parse tokens until the precedence changes + fn parse_expr(&mut self, precedence: u8) -> Result { + // println!("parse_expr() precendence = {}", precedence); + + let mut expr = self.parse_prefix()?; + // println!("parsed prefix: {:?}", expr); + + loop { + let next_precedence = self.get_next_precedence()?; + if precedence >= next_precedence { + // println!("break on precedence change ({} >= {})", precedence, next_precedence); + break; + } + + if let Some(infix_expr) = self.parse_infix(expr.clone(), next_precedence)? 
{ + // println!("parsed infix: {:?}", infix_expr); + expr = infix_expr; + } + } + + // println!("parse_expr() returning {:?}", expr); + + Ok(expr) + } + + /// Parse an expression prefix + fn parse_prefix(&mut self) -> Result { + match self.next_token() { + Some(t) => { + match t { + Token::Keyword(k) => match k.to_uppercase().as_ref() { + "SELECT" => Ok(self.parse_select()?), + "CREATE" => Ok(self.parse_create()?), + _ => return parser_err!(format!("No prefix parser for keyword {}", k)), + }, + Token::Mult => Ok(ASTNode::SQLWildcard), + Token::Identifier(id) => { + match self.peek_token() { + Some(Token::LParen) => { + self.next_token(); // skip lparen + match id.to_uppercase().as_ref() { + "CAST" => self.parse_cast_expression(), + _ => { + let args = self.parse_expr_list()?; + self.next_token(); // skip rparen + Ok(ASTNode::SQLFunction { id, args }) + } + } + } + Some(Token::Period) => { + let mut id_parts: Vec = vec![id]; + while self.peek_token() == Some(Token::Period) { + self.consume_token(&Token::Period)?; + match self.next_token() { + Some(Token::Identifier(id)) => id_parts.push(id), + _ => { + return parser_err!(format!( + "Error parsing compound identifier" + )) + } + } + } + Ok(ASTNode::SQLCompoundIdentifier(id_parts)) + } + _ => Ok(ASTNode::SQLIdentifier(id)), + } + } + Token::Number(ref n) if n.contains(".") => match n.parse::() { + Ok(n) => Ok(ASTNode::SQLLiteralDouble(n)), + Err(e) => parser_err!(format!("Could not parse '{}' as i64: {}", n, e)), + }, + Token::Number(ref n) => match n.parse::() { + Ok(n) => Ok(ASTNode::SQLLiteralLong(n)), + Err(e) => parser_err!(format!("Could not parse '{}' as i64: {}", n, e)), + }, + Token::String(ref s) => Ok(ASTNode::SQLLiteralString(s.to_string())), + _ => parser_err!(format!( + "Prefix parser expected a keyword but found {:?}", + t + )), + } + } + None => parser_err!(format!("Prefix parser expected a keyword but hit EOF")), + } + } + + /// Parse a SQL CAST function e.g. 
`CAST(expr AS FLOAT)` + fn parse_cast_expression(&mut self) -> Result { + let expr = self.parse_expr(0)?; + self.consume_token(&Token::Keyword("AS".to_string()))?; + let data_type = self.parse_data_type()?; + self.consume_token(&Token::RParen)?; + Ok(ASTNode::SQLCast { + expr: Box::new(expr), + data_type, + }) + } + + /// Parse an expression infix (typically an operator) + fn parse_infix( + &mut self, + expr: ASTNode, + precedence: u8, + ) -> Result, ParserError> { + match self.next_token() { + Some(tok) => match tok { + Token::Keyword(ref k) => if k == "IS" { + if self.parse_keywords(vec!["NULL"]) { + Ok(Some(ASTNode::SQLIsNull(Box::new(expr)))) + } else if self.parse_keywords(vec!["NOT", "NULL"]) { + Ok(Some(ASTNode::SQLIsNotNull(Box::new(expr)))) + } else { + parser_err!("Invalid tokens after IS") + } + } else { + Ok(Some(ASTNode::SQLBinaryExpr { + left: Box::new(expr), + op: self.to_sql_operator(&tok)?, + right: Box::new(self.parse_expr(precedence)?), + })) + }, + Token::Eq + | Token::Neq + | Token::Gt + | Token::GtEq + | Token::Lt + | Token::LtEq + | Token::Plus + | Token::Minus + | Token::Mult + | Token::Mod + | Token::Div => Ok(Some(ASTNode::SQLBinaryExpr { + left: Box::new(expr), + op: self.to_sql_operator(&tok)?, + right: Box::new(self.parse_expr(precedence)?), + })), + _ => parser_err!(format!("No infix parser for token {:?}", tok)), + }, + None => Ok(None), + } + } + + /// Convert a token operator to an AST operator + fn to_sql_operator(&self, tok: &Token) -> Result { + match tok { + &Token::Eq => Ok(SQLOperator::Eq), + &Token::Neq => Ok(SQLOperator::NotEq), + &Token::Lt => Ok(SQLOperator::Lt), + &Token::LtEq => Ok(SQLOperator::LtEq), + &Token::Gt => Ok(SQLOperator::Gt), + &Token::GtEq => Ok(SQLOperator::GtEq), + &Token::Plus => Ok(SQLOperator::Plus), + &Token::Minus => Ok(SQLOperator::Minus), + &Token::Mult => Ok(SQLOperator::Multiply), + &Token::Div => Ok(SQLOperator::Divide), + &Token::Mod => Ok(SQLOperator::Modulus), + &Token::Keyword(ref k) if k == "AND" => Ok(SQLOperator::And), + &Token::Keyword(ref k) if k == "OR" => Ok(SQLOperator::Or), + _ => parser_err!(format!("Unsupported SQL operator {:?}", tok)), + } + } + + /// Get the precedence of the next token + fn get_next_precedence(&self) -> Result { + if self.index < self.tokens.len() { + self.get_precedence(&self.tokens[self.index]) + } else { + Ok(0) + } + } + + /// Get the precedence of a token + fn get_precedence(&self, tok: &Token) -> Result { + //println!("get_precedence() {:?}", tok); + + match tok { + &Token::Keyword(ref k) if k == "OR" => Ok(5), + &Token::Keyword(ref k) if k == "AND" => Ok(10), + &Token::Keyword(ref k) if k == "IS" => Ok(15), + &Token::Eq | &Token::Lt | &Token::LtEq | &Token::Neq | &Token::Gt | &Token::GtEq => { + Ok(20) + } + &Token::Plus | &Token::Minus => Ok(30), + &Token::Mult | &Token::Div | &Token::Mod => Ok(40), + _ => Ok(0), + } + } + + /// Peek at the next token + fn peek_token(&mut self) -> Option { + if self.index < self.tokens.len() { + Some(self.tokens[self.index].clone()) + } else { + None + } + } + + /// Get the next token and increment the token index + fn next_token(&mut self) -> Option { + if self.index < self.tokens.len() { + self.index = self.index + 1; + Some(self.tokens[self.index - 1].clone()) + } else { + None + } + } + + /// Get the previous token and decrement the token index + fn prev_token(&mut self) -> Option { + if self.index > 0 { + Some(self.tokens[self.index - 1].clone()) + } else { + None + } + } + + /// Look for an expected keyword and consume it if it exists + 
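    //
    // Illustrative sketch only (mirrors the call sites in `parse_select` further
    // down this file): callers probe for optional clauses with these helpers, e.g.
    //
    //     let group_by = if self.parse_keywords(vec!["GROUP", "BY"]) {
    //         Some(self.parse_expr_list()?)    // both keywords were consumed
    //     } else {
    //         None                             // no match; the token index is restored
    //     };
    //
    // `parse_keyword` consumes a single matching keyword (case-insensitively) and
    // returns `true`; `parse_keywords` tries a whole sequence and resets
    // `self.index` if any keyword in the sequence is missing.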
fn parse_keyword(&mut self, expected: &'static str) -> bool { + match self.peek_token() { + Some(Token::Keyword(k)) => { + if expected.eq_ignore_ascii_case(k.as_str()) { + self.next_token(); + true + } else { + false + } + } + _ => false, + } + } + + /// Look for an expected sequence of keywords and consume them if they exist + fn parse_keywords(&mut self, keywords: Vec<&'static str>) -> bool { + let index = self.index; + for keyword in keywords { + //println!("parse_keywords aborting .. expecting {}", keyword); + if !self.parse_keyword(&keyword) { + //println!("parse_keywords aborting .. did not find {}", keyword); + // reset index and return immediately + self.index = index; + return false; + } + } + true + } + + // fn parse_identifier(&mut self) -> Result { + // let expr = self.parse_expr()?; + // match expr { + // Some(ASTNode::SQLIdentifier { .. }) => Ok(expr), + // _ => parser_err!(format!("Expected identifier but found {:?}", expr))) + // } + // } + + /// Consume the next token if it matches the expected token, otherwise return an error + fn consume_token(&mut self, expected: &Token) -> Result { + match self.peek_token() { + Some(ref t) => if *t == *expected { + self.next_token(); + Ok(true) + } else { + Ok(false) + }, + _ => parser_err!(format!( + "expected token {:?} but was {:?}", + expected, + self.prev_token() + )), + } + } + + /// Parse a SQL CREATE statement + fn parse_create(&mut self) -> Result { + if self.parse_keywords(vec!["EXTERNAL", "TABLE"]) { + match self.next_token() { + Some(Token::Identifier(id)) => { + // parse optional column list (schema) + let mut columns = vec![]; + if self.consume_token(&Token::LParen)? { + loop { + if let Some(Token::Identifier(column_name)) = self.next_token() { + if let Ok(data_type) = self.parse_data_type() { + let allow_null = if self.parse_keywords(vec!["NOT", "NULL"]) { + false + } else if self.parse_keyword("NULL") { + true + } else { + true + }; + + match self.peek_token() { + Some(Token::Comma) => { + self.next_token(); + columns.push(SQLColumnDef { + name: column_name, + data_type: data_type, + allow_null, + }); + } + Some(Token::RParen) => { + self.next_token(); + columns.push(SQLColumnDef { + name: column_name, + data_type: data_type, + allow_null, + }); + break; + } + _ => { + return parser_err!( + "Expected ',' or ')' after column definition" + ); + } + } + } else { + return parser_err!( + "Error parsing data type in column definition" + ); + } + } else { + return parser_err!("Error parsing column name"); + } + } + } + + //println!("Parsed {} column defs", columns.len()); + + let mut headers = true; + let file_type: FileType = if self.parse_keywords(vec!["STORED", "AS", "CSV"]) { + if self.parse_keywords(vec!["WITH", "HEADER", "ROW"]) { + headers = true; + } else if self.parse_keywords(vec!["WITHOUT", "HEADER", "ROW"]) { + headers = false; + } + FileType::CSV + } else if self.parse_keywords(vec!["STORED", "AS", "NDJSON"]) { + FileType::NdJson + } else if self.parse_keywords(vec!["STORED", "AS", "PARQUET"]) { + FileType::Parquet + } else { + return parser_err!(format!( + "Expected 'STORED AS' clause, found {:?}", + self.peek_token() + )); + }; + + let location: String = if self.parse_keywords(vec!["LOCATION"]) { + self.parse_literal_string()? 
+ } else { + return parser_err!("Missing 'LOCATION' clause"); + }; + + Ok(ASTNode::SQLCreateTable { + name: id, + columns, + file_type, + header_row: headers, + location, + }) + } + _ => parser_err!(format!( + "Unexpected token after CREATE EXTERNAL TABLE: {:?}", + self.peek_token() + )), + } + } else { + parser_err!(format!( + "Unexpected token after CREATE: {:?}", + self.peek_token() + )) + } + } + + /// Parse a literal integer/long + fn parse_literal_int(&mut self) -> Result { + match self.next_token() { + Some(Token::Number(s)) => s.parse::().map_err(|e| { + ParserError::ParserError(format!("Could not parse '{}' as i64: {}", s, e)) + }), + other => parser_err!(format!("Expected literal int, found {:?}", other)), + } + } + + /// Parse a literal string + fn parse_literal_string(&mut self) -> Result { + match self.next_token() { + Some(Token::String(ref s)) => Ok(s.clone()), + other => parser_err!(format!("Expected literal string, found {:?}", other)), + } + } + + /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example) + fn parse_data_type(&mut self) -> Result { + match self.next_token() { + Some(Token::Keyword(k)) => match k.to_uppercase().as_ref() { + "BOOLEAN" => Ok(SQLType::Boolean), + "UINT8" => Ok(SQLType::UInt8), + "UINT16" => Ok(SQLType::UInt16), + "UINT32" => Ok(SQLType::UInt32), + "UINT64" => Ok(SQLType::UInt64), + "INT8" => Ok(SQLType::Int8), + "INT16" => Ok(SQLType::Int16), + "INT32" | "INT" | "INTEGER" => Ok(SQLType::Int32), + "INT64" | "LONG" => Ok(SQLType::Int64), + "FLOAT32" | "FLOAT" => Ok(SQLType::Float32), + "FLOAT64" | "DOUBLE" => Ok(SQLType::Double64), + "UTF8" | "VARCHAR" | "STRING" => { + // optional length + if self.consume_token(&Token::LParen)? { + let n = self.parse_literal_int()?; + self.consume_token(&Token::RParen)?; + Ok(SQLType::Utf8(n as usize)) + } else { + Ok(SQLType::Utf8(100 as usize)) + } + } + _ => parser_err!(format!("Invalid data type '{:?}'", k)), + }, + other => parser_err!(format!("Invalid data type: '{:?}'", other)), + } + } + + /// Parse a SELECT statement + fn parse_select(&mut self) -> Result { + let projection = self.parse_expr_list()?; + + let relation: Option> = if self.parse_keyword("FROM") { + //TODO: add support for JOIN + Some(Box::new(self.parse_expr(0)?)) + } else { + None + }; + + let selection = if self.parse_keyword("WHERE") { + Some(Box::new(self.parse_expr(0)?)) + } else { + None + }; + + let group_by = if self.parse_keywords(vec!["GROUP", "BY"]) { + Some(self.parse_expr_list()?) + } else { + None + }; + + let having = if self.parse_keyword("HAVING") { + Some(Box::new(self.parse_expr(0)?)) + } else { + None + }; + + let order_by = if self.parse_keywords(vec!["ORDER", "BY"]) { + Some(self.parse_order_by_expr_list()?) + } else { + None + }; + + let limit = if self.parse_keyword("LIMIT") { + self.parse_limit()? 
+ } else { + None + }; + + if let Some(next_token) = self.peek_token() { + parser_err!(format!( + "Unexpected token at end of SELECT: {:?}", + next_token + )) + } else { + Ok(ASTNode::SQLSelect { + projection, + selection, + relation, + limit, + order_by, + group_by, + having, + }) + } + } + + /// Parse a comma-delimited list of SQL expressions + fn parse_expr_list(&mut self) -> Result, ParserError> { + let mut expr_list: Vec = vec![]; + loop { + expr_list.push(self.parse_expr(0)?); + if let Some(t) = self.peek_token() { + if t == Token::Comma { + self.next_token(); + } else { + break; + } + } else { + //EOF + break; + } + } + Ok(expr_list) + } + + /// Parse a comma-delimited list of SQL ORDER BY expressions + fn parse_order_by_expr_list(&mut self) -> Result, ParserError> { + let mut expr_list: Vec = vec![]; + loop { + let expr = self.parse_expr(0)?; + + // look for optional ASC / DESC specifier + let asc = match self.peek_token() { + Some(Token::Keyword(k)) => { + self.next_token(); // consume it + match k.to_uppercase().as_ref() { + "ASC" => true, + "DESC" => false, + _ => { + return parser_err!(format!( + "Invalid modifier for ORDER BY expression: {:?}", + k + )) + } + } + } + Some(Token::Comma) => true, + Some(other) => { + return parser_err!(format!("Unexpected token after ORDER BY expr: {:?}", other)) + } + None => true, + }; + + expr_list.push(ASTNode::SQLOrderBy { + expr: Box::new(expr), + asc, + }); + + if let Some(t) = self.peek_token() { + if t == Token::Comma { + self.next_token(); + } else { + break; + } + } else { + // EOF + break; + } + } + Ok(expr_list) + } + + /// Parse a LIMIT clause + fn parse_limit(&mut self) -> Result>, ParserError> { + if self.parse_keyword("ALL") { + Ok(None) + } else { + self.parse_literal_int() + .map(|n| Some(Box::new(ASTNode::SQLLiteralLong(n)))) + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn parse_simple_select() { + let sql = String::from("SELECT id, fname, lname FROM customer WHERE id = 1 LIMIT 5"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { + projection, limit, .. + } => { + assert_eq!(3, projection.len()); + assert_eq!(Some(Box::new(ASTNode::SQLLiteralLong(5))), limit); + } + _ => assert!(false), + } + } + + #[test] + fn parse_select_wildcard() { + let sql = String::from("SELECT * FROM customer"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { projection, .. } => { + assert_eq!(1, projection.len()); + assert_eq!(ASTNode::SQLWildcard, projection[0]); + } + _ => assert!(false), + } + } + + #[test] + fn parse_select_count_wildcard() { + let sql = String::from("SELECT COUNT(*) FROM customer"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { projection, .. 
} => { + assert_eq!(1, projection.len()); + assert_eq!( + ASTNode::SQLFunction { + id: "COUNT".to_string(), + args: vec![ASTNode::SQLWildcard], + }, + projection[0] + ); + } + _ => assert!(false), + } + } + + #[test] + fn parse_select_string_predicate() { + let sql = String::from( + "SELECT id, fname, lname FROM customer \ + WHERE salary != 'Not Provided' AND salary != ''", + ); + let _ast = parse_sql(&sql); + //TODO: add assertions + } + + #[test] + fn parse_projection_nested_type() { + let sql = String::from("SELECT customer.address.state FROM foo"); + let _ast = parse_sql(&sql); + //TODO: add assertions + } + + #[test] + fn parse_compound_expr_1() { + use self::ASTNode::*; + use self::SQLOperator::*; + let sql = String::from("a + b * c"); + let ast = parse_sql(&sql); + assert_eq!( + SQLBinaryExpr { + left: Box::new(SQLIdentifier("a".to_string())), + op: Plus, + right: Box::new(SQLBinaryExpr { + left: Box::new(SQLIdentifier("b".to_string())), + op: Multiply, + right: Box::new(SQLIdentifier("c".to_string())) + }) + }, + ast + ); + } + + #[test] + fn parse_compound_expr_2() { + use self::ASTNode::*; + use self::SQLOperator::*; + let sql = String::from("a * b + c"); + let ast = parse_sql(&sql); + assert_eq!( + SQLBinaryExpr { + left: Box::new(SQLBinaryExpr { + left: Box::new(SQLIdentifier("a".to_string())), + op: Multiply, + right: Box::new(SQLIdentifier("b".to_string())) + }), + op: Plus, + right: Box::new(SQLIdentifier("c".to_string())) + }, + ast + ); + } + + #[test] + fn parse_is_null() { + use self::ASTNode::*; + let sql = String::from("a IS NULL"); + let ast = parse_sql(&sql); + assert_eq!(SQLIsNull(Box::new(SQLIdentifier("a".to_string()))), ast); + } + + #[test] + fn parse_is_not_null() { + use self::ASTNode::*; + let sql = String::from("a IS NOT NULL"); + let ast = parse_sql(&sql); + assert_eq!(SQLIsNotNull(Box::new(SQLIdentifier("a".to_string()))), ast); + } + + #[test] + fn parse_select_order_by() { + let sql = String::from( + "SELECT id, fname, lname FROM customer WHERE id < 5 ORDER BY lname ASC, fname DESC", + ); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { order_by, .. } => { + assert_eq!( + Some(vec![ + ASTNode::SQLOrderBy { + expr: Box::new(ASTNode::SQLIdentifier("lname".to_string())), + asc: true, + }, + ASTNode::SQLOrderBy { + expr: Box::new(ASTNode::SQLIdentifier("fname".to_string())), + asc: false, + }, + ]), + order_by + ); + } + _ => assert!(false), + } + } + + #[test] + fn parse_select_group_by() { + let sql = String::from("SELECT id, fname, lname FROM customer GROUP BY lname, fname"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { group_by, .. } => { + assert_eq!( + Some(vec![ + ASTNode::SQLIdentifier("lname".to_string()), + ASTNode::SQLIdentifier("fname".to_string()), + ]), + group_by + ); + } + _ => assert!(false), + } + } + + #[test] + fn parse_limit_accepts_all() { + let sql = String::from("SELECT id, fname, lname FROM customer WHERE id = 1 LIMIT ALL"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { + projection, limit, .. + } => { + assert_eq!(3, projection.len()); + assert_eq!(None, limit); + } + _ => assert!(false), + } + } + + #[test] + fn parse_cast() { + let sql = String::from("SELECT CAST(id AS DOUBLE) FROM customer"); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLSelect { projection, .. 
} => { + assert_eq!(1, projection.len()); + assert_eq!( + ASTNode::SQLCast { + expr: Box::new(ASTNode::SQLIdentifier("id".to_string())), + data_type: SQLType::Double64 + }, + projection[0] + ); + } + _ => assert!(false), + } + } + + #[test] + fn parse_create_external_table_csv_with_header_row() { + let sql = String::from( + "CREATE EXTERNAL TABLE uk_cities (\ + name VARCHAR(100) NOT NULL,\ + lat DOUBLE NULL,\ + lng DOUBLE NULL) \ + STORED AS CSV WITH HEADER ROW \ + LOCATION '/mnt/ssd/uk_cities.csv'", + ); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLCreateTable { + name, + columns, + file_type, + header_row, + location, + } => { + assert_eq!("uk_cities", name); + assert_eq!(3, columns.len()); + assert_eq!(FileType::CSV, file_type); + assert_eq!(true, header_row); + assert_eq!("/mnt/ssd/uk_cities.csv", location); + + let c_name = &columns[0]; + assert_eq!("name", c_name.name); + assert_eq!(SQLType::Utf8(100), c_name.data_type); + assert_eq!(false, c_name.allow_null); + + let c_lat = &columns[1]; + assert_eq!("lat", c_lat.name); + assert_eq!(SQLType::Double64, c_lat.data_type); + assert_eq!(true, c_lat.allow_null); + + let c_lng = &columns[2]; + assert_eq!("lng", c_lng.name); + assert_eq!(SQLType::Double64, c_lng.data_type); + assert_eq!(true, c_lng.allow_null); + } + _ => assert!(false), + } + } + + #[test] + fn parse_create_external_table_csv_without_header_row() { + let sql = String::from( + "CREATE EXTERNAL TABLE uk_cities (\ + name VARCHAR(100) NOT NULL,\ + lat DOUBLE NOT NULL,\ + lng DOUBLE NOT NULL) \ + STORED AS CSV WITHOUT HEADER ROW \ + LOCATION '/mnt/ssd/uk_cities.csv'", + ); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLCreateTable { + name, + columns, + file_type, + header_row, + location, + } => { + assert_eq!("uk_cities", name); + assert_eq!(3, columns.len()); + assert_eq!(FileType::CSV, file_type); + assert_eq!(false, header_row); + assert_eq!("/mnt/ssd/uk_cities.csv", location); + } + _ => assert!(false), + } + } + + #[test] + fn parse_create_external_table_parquet() { + let sql = String::from( + "CREATE EXTERNAL TABLE uk_cities \ + STORED AS PARQUET \ + LOCATION '/mnt/ssd/uk_cities.parquet'", + ); + let ast = parse_sql(&sql); + match ast { + ASTNode::SQLCreateTable { + name, + columns, + file_type, + location, + .. + } => { + assert_eq!("uk_cities", name); + assert_eq!(0, columns.len()); + assert_eq!(FileType::Parquet, file_type); + assert_eq!("/mnt/ssd/uk_cities.parquet", location); + } + _ => assert!(false), + } + } + + #[test] + fn parse_scalar_function_in_projection() { + let sql = String::from("SELECT sqrt(id) FROM foo"); + let ast = parse_sql(&sql); + if let ASTNode::SQLSelect { projection, .. } = ast { + assert_eq!( + vec![ASTNode::SQLFunction { + id: String::from("sqrt"), + args: vec![ASTNode::SQLIdentifier(String::from("id"))], + }], + projection + ); + } else { + assert!(false); + } + } + + #[test] + fn parse_aggregate_with_group_by() { + let sql = String::from("SELECT a, COUNT(1), MIN(b), MAX(b) FROM foo GROUP BY a"); + let _ast = parse_sql(&sql); + //TODO: assertions + } + + #[test] + fn parse_select_version() { + let sql = "SELECT @@version"; + match parse_sql(&sql) { + ASTNode::SQLSelect { ref projection, .. 
} => { + assert_eq!( + projection[0], + ASTNode::SQLIdentifier("@@version".to_string()) + ); + } + _ => panic!(), + } + } + + fn parse_sql(sql: &str) -> ASTNode { + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + ast + } + +} diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs new file mode 100644 index 00000000..e93413c6 --- /dev/null +++ b/src/sqltokenizer.rs @@ -0,0 +1,427 @@ +// Copyright 2018 Grove Enterprises LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! SQL Tokenizer + +use std::iter::Peekable; +use std::str::Chars; + +use fnv::FnvHashSet; + +/// SQL Token enumeration +#[derive(Debug, Clone, PartialEq)] +pub enum Token { + /// SQL identifier e.g. table or column name + Identifier(String), + /// SQL keyword e.g. Keyword("SELECT") + Keyword(String), + /// Numeric literal + Number(String), + /// String literal + String(String), + /// Comma + Comma, + /// Whitespace (space, tab, etc) + Whitespace, + /// Equality operator `=` + Eq, + /// Not Equals operator `!=` or `<>` + Neq, + /// Less Than operator `<` + Lt, + /// Greater han operator `>` + Gt, + /// Less Than Or Equals operator `<=` + LtEq, + /// Greater Than Or Equals operator `>=` + GtEq, + /// Plus operator `+` + Plus, + /// Minus operator `-` + Minus, + /// Multiplication operator `*` + Mult, + /// Division operator `/` + Div, + /// Modulo Operator `%` + Mod, + /// Left parenthesis `(` + LParen, + /// Right parenthesis `)` + RParen, + /// Period (used for compound identifiers or projections into nested types) + Period, +} + +/// Tokenizer error +#[derive(Debug)] +pub struct TokenizerError(String); + +lazy_static! 
{ + static ref KEYWORDS: FnvHashSet<&'static str> = { + let mut m = FnvHashSet::default(); + + m.insert("SELECT"); + m.insert("FROM"); + m.insert("WHERE"); + m.insert("LIMIT"); + m.insert("ORDER"); + m.insert("GROUP"); + m.insert("BY"); + m.insert("HAVING"); + m.insert("UNION"); + m.insert("ALL"); + m.insert("INSERT"); + m.insert("UPDATE"); + m.insert("DELETE"); + m.insert("IN"); + m.insert("IS"); + m.insert("NULL"); + m.insert("SET"); + m.insert("CREATE"); + m.insert("EXTERNAL"); + m.insert("TABLE"); + m.insert("ASC"); + m.insert("DESC"); + m.insert("AND"); + m.insert("OR"); + m.insert("NOT"); + m.insert("AS"); + m.insert("STORED"); + m.insert("CSV"); + m.insert("PARQUET"); + m.insert("LOCATION"); + m.insert("WITH"); + m.insert("WITHOUT"); + m.insert("HEADER"); + m.insert("ROW"); + + // SQL types + m.insert("STRING"); + m.insert("VARCHAR"); + m.insert("FLOAT"); + m.insert("DOUBLE"); + m.insert("INT"); + m.insert("INTEGER"); + m.insert("LONG"); + + // Arrow native types + m.insert("BOOLEAN"); + m.insert("UINT8"); + m.insert("UINT16"); + m.insert("UINT32"); + m.insert("UINT64"); + m.insert("INT8"); + m.insert("INT16"); + m.insert("INT32"); + m.insert("INT64"); + m.insert("FLOAT32"); + m.insert("FLOAT64"); + m.insert("UTF8"); + + m + }; +} + +/// SQL Tokenizer +pub struct Tokenizer { + pub query: String, +} + +impl Tokenizer { + /// Create a new SQL tokenizer for the specified SQL statement + pub fn new(query: &str) -> Self { + Self { + query: query.to_string(), + } + } + + /// Tokenize the statement and produce a vector of tokens + pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> { + let mut peekable = self.query.chars().peekable(); + + let mut tokens: Vec<Token> = vec![]; + + while let Some(token) = self.next_token(&mut peekable)? { + tokens.push(token); + } + + Ok(tokens + .into_iter() + .filter(|t| match t { + Token::Whitespace => false, + _ => true, + }) + .collect()) + } + + /// Get the next token or return None + fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<Token>, TokenizerError> { + //println!("next_token: {:?}", chars.peek()); + match chars.peek() { + Some(&ch) => match ch { + // whitespace + ' ' | '\t' | '\n' => { + chars.next(); // consume + Ok(Some(Token::Whitespace)) + } + // identifier or keyword + 'a'...'z' | 'A'...'Z' | '_' | '@' => { + let mut s = String::new(); + while let Some(&ch) = chars.peek() { + match ch { + 'a'...'z' | 'A'...'Z' | '_' | '0'...'9' | '@' => { + chars.next(); // consume + s.push(ch); + } + _ => break, + } + } + let upper_str = s.to_uppercase(); + if KEYWORDS.contains(upper_str.as_str()) { + Ok(Some(Token::Keyword(upper_str))) + } else { + Ok(Some(Token::Identifier(s))) + } + } + // string + '\'' => { + //TODO: handle escaped quotes in string + //TODO: handle EOF before terminating quote + let mut s = String::new(); + chars.next(); // consume + while let Some(&ch) = chars.peek() { + match ch { + '\'' => { + chars.next(); // consume + break; + } + _ => { + chars.next(); // consume + s.push(ch); + } + } + } + Ok(Some(Token::String(s))) + } + // numbers + '0'...'9' => { + let mut s = String::new(); + while let Some(&ch) = chars.peek() { + match ch { + '0'...'9' | '.'
=> { + chars.next(); // consume + s.push(ch); + } + _ => break, + } + } + Ok(Some(Token::Number(s))) + } + // punctuation + ',' => { + chars.next(); + Ok(Some(Token::Comma)) + } + '(' => { + chars.next(); + Ok(Some(Token::LParen)) + } + ')' => { + chars.next(); + Ok(Some(Token::RParen)) + } + // operators + '+' => { + chars.next(); + Ok(Some(Token::Plus)) + } + '-' => { + chars.next(); + Ok(Some(Token::Minus)) + } + '*' => { + chars.next(); + Ok(Some(Token::Mult)) + } + '/' => { + chars.next(); + Ok(Some(Token::Div)) + } + '%' => { + chars.next(); + Ok(Some(Token::Mod)) + } + '=' => { + chars.next(); + Ok(Some(Token::Eq)) + } + '.' => { + chars.next(); + Ok(Some(Token::Period)) + } + '!' => { + chars.next(); // consume + match chars.peek() { + Some(&ch) => match ch { + '=' => { + chars.next(); + Ok(Some(Token::Neq)) + } + _ => Err(TokenizerError(format!("TBD"))), + }, + None => Err(TokenizerError(format!("TBD"))), + } + } + '<' => { + chars.next(); // consume + match chars.peek() { + Some(&ch) => match ch { + '=' => { + chars.next(); + Ok(Some(Token::LtEq)) + } + '>' => { + chars.next(); + Ok(Some(Token::Neq)) + } + _ => Ok(Some(Token::Lt)), + }, + None => Ok(Some(Token::Lt)), + } + } + '>' => { + chars.next(); // consume + match chars.peek() { + Some(&ch) => match ch { + '=' => { + chars.next(); + Ok(Some(Token::GtEq)) + } + _ => Ok(Some(Token::Gt)), + }, + None => Ok(Some(Token::Gt)), + } + } + _ => Err(TokenizerError(format!( + "unhandled char '{}' in tokenizer", + ch + ))), + }, + None => Ok(None), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tokenize_select_1() { + let sql = String::from("SELECT 1"); + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + + let expected = vec![ + Token::Keyword(String::from("SELECT")), + Token::Number(String::from("1")), + ]; + + compare(expected, tokens); + } + + #[test] + fn tokenize_scalar_function() { + let sql = String::from("SELECT sqrt(1)"); + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + + let expected = vec![ + Token::Keyword(String::from("SELECT")), + Token::Identifier(String::from("sqrt")), + Token::LParen, + Token::Number(String::from("1")), + Token::RParen, + ]; + + compare(expected, tokens); + } + + #[test] + fn tokenize_simple_select() { + let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5"); + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + + let expected = vec![ + Token::Keyword(String::from("SELECT")), + Token::Mult, + Token::Keyword(String::from("FROM")), + Token::Identifier(String::from("customer")), + Token::Keyword(String::from("WHERE")), + Token::Identifier(String::from("id")), + Token::Eq, + Token::Number(String::from("1")), + Token::Keyword(String::from("LIMIT")), + Token::Number(String::from("5")), + ]; + + compare(expected, tokens); + } + + #[test] + fn tokenize_string_predicate() { + let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'"); + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + + let expected = vec![ + Token::Keyword(String::from("SELECT")), + Token::Mult, + Token::Keyword(String::from("FROM")), + Token::Identifier(String::from("customer")), + Token::Keyword(String::from("WHERE")), + Token::Identifier(String::from("salary")), + Token::Neq, + Token::String(String::from("Not Provided")), + ]; + + compare(expected, tokens); + } + + #[test] + fn tokenize_is_null() { + let sql = 
String::from("a IS NULL"); + let mut tokenizer = Tokenizer::new(&sql); + let tokens = tokenizer.tokenize().unwrap(); + + let expected = vec![ + Token::Identifier(String::from("a")), + Token::Keyword("IS".to_string()), + Token::Keyword("NULL".to_string()), + ]; + + compare(expected, tokens); + } + + fn compare(expected: Vec, actual: Vec) { + //println!("------------------------------"); + //println!("tokens = {:?}", actual); + //println!("expected = {:?}", expected); + //println!("------------------------------"); + assert_eq!(expected, actual); + } + +} diff --git a/src/tokenizer.rs b/src/tokenizer.rs deleted file mode 100644 index 73eb344f..00000000 --- a/src/tokenizer.rs +++ /dev/null @@ -1,124 +0,0 @@ -use std::cmp::PartialEq; -use std::fmt::Debug; - -/// Simple holder for a sequence of characters that supports iteration and mark/reset methods -pub struct CharSeq { - chars: Vec, - i: usize, - m: usize -} - -impl CharSeq { - - /// Create a CharSeq from a string - pub fn new(sql: &str) -> Self { - CharSeq { - chars: sql.chars().collect(), - i: 0, - m: 0 - } - } - - /// Mark the current index - pub fn mark(&mut self) { - self.m = self.i; - } - - /// Reset the index - pub fn reset(&mut self) { - self.i = self.m; - } - - /// Peek the next char - pub fn peek(&mut self) -> Option<&char> { - if self.i < self.chars.len() { - Some(&self.chars[self.i]) - } else { - None - } - } - - /// Get the next char - pub fn next(&mut self) -> Option { - if self.i < self.chars.len() { - self.i += 1; - Some(self.chars[self.i-1]) - } else { - None - } - } -} - -#[derive(Debug)] -pub struct Position { - line: usize, - col: usize -} -impl Position { - pub fn new(line: usize, col: usize) -> Self { - Position { line, col } - } -} - -#[derive(Debug)] -pub enum TokenizerError { - UnexpectedChar(char,Position), - UnexpectedEof(Position), - UnterminatedStringLiteral(Position), - Custom(String) -} - -/// SQL Tokens -#[derive(Debug,PartialEq)] -pub enum SQLToken { - Whitespace(char), - Keyword(String), - Identifier(String), - Literal(String), //TODO: need to model different types of literal - Plus, - Minus, - Mult, - Divide, - Eq, - Not, - NotEq, - Gt, - GtEq, - Lt, - LtEq, - LParen, - RParen, - Comma, -} - -pub trait SQLTokenizer { - - /// get the precendence of a token - fn precedence(&self, token: &SQLToken) -> usize; - - fn peek_token(&mut self) -> Result, TokenizerError>; - - /// return a reference to the next token and advance the index - fn next_token(&mut self) -> Result, TokenizerError>; -} - - -pub fn tokenize(sql: &str, tokenizer: &mut SQLTokenizer) -> Result, TokenizerError> { - - let mut chars = CharSeq::new(sql); - - let mut tokens : Vec = vec![]; - - loop { - match tokenizer.next_token()? { - Some(SQLToken::Whitespace(_)) => { /* ignore */ }, - Some(token) => { - println!("Token: {:?}", token); - tokens.push(token) - }, - None => break - } - } - - Ok(tokens) -} \ No newline at end of file