From 810cd8e6cf28a1586579bbd5d7dfcbae078c2922 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Sat, 8 Sep 2018 14:49:25 -0600 Subject: [PATCH] tokenizer delegates to dialect now --- examples/parse_select.rs | 4 +- src/dialect.rs | 516 +++++++++++++++++++++++++++++++-------- src/sqlparser.rs | 4 +- src/sqltokenizer.rs | 62 ++--- 4 files changed, 449 insertions(+), 137 deletions(-) diff --git a/examples/parse_select.rs b/examples/parse_select.rs index cb5b33aa..a87abb9b 100644 --- a/examples/parse_select.rs +++ b/examples/parse_select.rs @@ -9,9 +9,9 @@ fn main() { WHERE a > b AND b < 100 \ ORDER BY a DESC, b"; - let dialect = GenericSqlDialect{}; + let dialect = GenericSqlDialect {}; - let ast = Parser::parse_sql(&dialect,sql.to_string()).unwrap(); + let ast = Parser::parse_sql(&dialect, sql.to_string()).unwrap(); println!("AST: {:?}", ast); } diff --git a/src/dialect.rs b/src/dialect.rs index 8a6c6c00..1c6ce3e5 100644 --- a/src/dialect.rs +++ b/src/dialect.rs @@ -17,54 +17,353 @@ pub trait Dialect { /// Get a list of keywords for this dialect fn keywords(&self) -> Vec<&'static str>; + /// Determine if a character is a valid identifier start character + fn is_identifier_start(&self, ch: char) -> bool; + /// Determine if a character is a valid identifier character + fn is_identifier_part(&self, ch: char) -> bool; } -pub struct AnsiSqlDialect { -} +pub struct AnsiSqlDialect {} impl Dialect for AnsiSqlDialect { fn keywords(&self) -> Vec<&'static str> { - return vec!["ABS", "ALL", "ALLOCATE", "ALTER", "AND", "ANY", "ARE", "ARRAY", "ARRAY_AGG", - "ARRAY_MAX_CARDINALITY", "AS", "ASENSITIVE", "ASYMMETRIC", "AT", "ATOMIC", "AUTHORIZATION", - "AVG", "BEGIN", "BEGIN_FRAME", "BEGIN_PARTITION", "BETWEEN", "BIGINT", "BINARY", "BLOB", - "BOOLEAN", "BOTH", "BY", "CALL", "CALLED", "CARDINALITY", "CASCADED", "CASE", "CAST", "CEIL", - "CEILING", "CHAR", "CHAR_LENGTH", "CHARACTER", "CHARACTER_LENGTH", "CHECK", "CLOB", "CLOSE", - "COALESCE", "COLLATE", "COLLECT", "COLUMN", "COMMIT", 
"CONDITION", "CONNECT", "CONSTRAINT", - "CONTAINS", "CONVERT", "CORR", "CORRESPONDING", "COUNT", "COVAR_POP", "COVAR_SAMP", "CREATE", - "CROSS", "CUBE", "CUME_DIST", "CURRENT", "CURRENT_CATALOG", "CURRENT_DATE", - "CURRENT_DEFAULT_TRANSFORM_GROUP", "CURRENT_PATH", "CURRENT_ROLE", "CURRENT_ROW", - "CURRENT_SCHEMA", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURRENT_TRANSFORM_GROUP_FOR_TYPE", - "CURRENT_USER", "CURSOR", "CYCLE", "DATE", "DAY", "DEALLOCATE", "DEC", "DECIMAL", "DECLARE", - "DEFAULT", "DELETE", "DENSE_RANK", "DEREF", "DESCRIBE", "DETERMINISTIC", "DISCONNECT", - "DISTINCT", "DOUBLE", "DROP", "DYNAMIC", "EACH", "ELEMENT", "ELSE", "END", "END_FRAME", - "END_PARTITION", "END-EXEC", "EQUALS", "ESCAPE", "EVERY", "EXCEPT", "EXEC", "EXECUTE", - "EXISTS", "EXP", "EXTERNAL", "EXTRACT", "FALSE", "FETCH", "FILTER", "FIRST_VALUE", "FLOAT", - "FLOOR", "FOR", "FOREIGN", "FRAME_ROW", "FREE", "FROM", "FULL", "FUNCTION", "FUSION", - "GET", "GLOBAL", "GRANT", "GROUP", "GROUPING", "GROUPS", "HAVING", "HOLD", "HOUR", "IDENTITY", - "IN", "INDICATOR", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT", - "INTERSECTION", "INTERVAL", "INTO", "IS", "JOIN", "LAG", "LANGUAGE", "LARGE", "LAST_VALUE", - "LATERAL", "LEAD", "LEADING", "LEFT", "LIKE", "LIKE_REGEX", "LN", "LOCAL", "LOCALTIME", - "LOCALTIMESTAMP", "LOWER", "MATCH", "MAX", "MEMBER", "MERGE", "METHOD", "MIN", "MINUTE", - "MOD", "MODIFIES", "MODULE", "MONTH", "MULTISET", "NATIONAL", "NATURAL", "NCHAR", "NCLOB", - "NEW", "NO", "NONE", "NORMALIZE", "NOT", "NTH_VALUE", "NTILE", "NULL", "NULLIF", "NUMERIC", - "OCTET_LENGTH", "OCCURRENCES_REGEX", "OF", "OFFSET", "OLD", "ON", "ONLY", "OPEN", "OR", - "ORDER", "OUT", "OUTER", "OVER", "OVERLAPS", "OVERLAY", "PARAMETER", "PARTITION", "PERCENT", - "PERCENT_RANK", "PERCENTILE_CONT", "PERCENTILE_DISC", "PERIOD", "PORTION", "POSITION", - "POSITION_REGEX", "POWER", "PRECEDES", "PRECISION", "PREPARE", "PRIMARY", - "PROCEDURE", "RANGE", "RANK", "READS", "REAL", "RECURSIVE", 
"REF", "REFERENCES", - "REFERENCING", "REGR_AVGX", "REGR_AVGY", "REGR_COUNT", "REGR_INTERCEPT", "REGR_R2", - "REGR_SLOPE", "REGR_SXX", "REGR_SXY", "REGR_SYY", "RELEASE", "RESULT", "RETURN", "RETURNS", - "REVOKE", "RIGHT", "ROLLBACK", "ROLLUP", "ROW", "ROW_NUMBER", "ROWS", "SAVEPOINT", - "SCOPE", "SCROLL", "SEARCH", "SECOND", "SELECT", "SENSITIVE", "SESSION_USER", "SET", - "SIMILAR", "SMALLINT", "SOME", "SPECIFIC", "SPECIFICTYPE", "SQL", "SQLEXCEPTION", "SQLSTATE", - "SQLWARNING", "SQRT", "START", "STATIC", "STDDEV_POP", "STDDEV_SAMP", "SUBMULTISET", - "SUBSTRING", "SUBSTRING_REGEX", "SUCCEEDS", "SUM", "SYMMETRIC", "SYSTEM", "SYSTEM_TIME", - "SYSTEM_USER", "TABLE", "TABLESAMPLE", "THEN", "TIME", "TIMESTAMP", "TIMEZONE_HOUR", - "TIMEZONE_MINUTE", "TO", "TRAILING", "TRANSLATE", "TRANSLATE_REGEX", "TRANSLATION", - "TREAT", "TRIGGER", "TRUNCATE", "TRIM", "TRIM_ARRAY", "TRUE", "UESCAPE", "UNION", "UNIQUE", - "UNKNOWN", "UNNEST", "UPDATE", "UPPER", "USER", "USING", "VALUE", "VALUES", "VALUE_OF", - "VAR_POP", "VAR_SAMP", "VARBINARY", "VARCHAR", "VARYING", "VERSIONING", "WHEN", "WHENEVER", - "WHERE", "WIDTH_BUCKET", "WINDOW", "WITH", "WITHIN", "WITHOUT", "YEAR"]; + return vec![ + "ABS", + "ALL", + "ALLOCATE", + "ALTER", + "AND", + "ANY", + "ARE", + "ARRAY", + "ARRAY_AGG", + "ARRAY_MAX_CARDINALITY", + "AS", + "ASENSITIVE", + "ASYMMETRIC", + "AT", + "ATOMIC", + "AUTHORIZATION", + "AVG", + "BEGIN", + "BEGIN_FRAME", + "BEGIN_PARTITION", + "BETWEEN", + "BIGINT", + "BINARY", + "BLOB", + "BOOLEAN", + "BOTH", + "BY", + "CALL", + "CALLED", + "CARDINALITY", + "CASCADED", + "CASE", + "CAST", + "CEIL", + "CEILING", + "CHAR", + "CHAR_LENGTH", + "CHARACTER", + "CHARACTER_LENGTH", + "CHECK", + "CLOB", + "CLOSE", + "COALESCE", + "COLLATE", + "COLLECT", + "COLUMN", + "COMMIT", + "CONDITION", + "CONNECT", + "CONSTRAINT", + "CONTAINS", + "CONVERT", + "CORR", + "CORRESPONDING", + "COUNT", + "COVAR_POP", + "COVAR_SAMP", + "CREATE", + "CROSS", + "CUBE", + "CUME_DIST", + "CURRENT", + 
"CURRENT_CATALOG", + "CURRENT_DATE", + "CURRENT_DEFAULT_TRANSFORM_GROUP", + "CURRENT_PATH", + "CURRENT_ROLE", + "CURRENT_ROW", + "CURRENT_SCHEMA", + "CURRENT_TIME", + "CURRENT_TIMESTAMP", + "CURRENT_TRANSFORM_GROUP_FOR_TYPE", + "CURRENT_USER", + "CURSOR", + "CYCLE", + "DATE", + "DAY", + "DEALLOCATE", + "DEC", + "DECIMAL", + "DECLARE", + "DEFAULT", + "DELETE", + "DENSE_RANK", + "DEREF", + "DESCRIBE", + "DETERMINISTIC", + "DISCONNECT", + "DISTINCT", + "DOUBLE", + "DROP", + "DYNAMIC", + "EACH", + "ELEMENT", + "ELSE", + "END", + "END_FRAME", + "END_PARTITION", + "END-EXEC", + "EQUALS", + "ESCAPE", + "EVERY", + "EXCEPT", + "EXEC", + "EXECUTE", + "EXISTS", + "EXP", + "EXTERNAL", + "EXTRACT", + "FALSE", + "FETCH", + "FILTER", + "FIRST_VALUE", + "FLOAT", + "FLOOR", + "FOR", + "FOREIGN", + "FRAME_ROW", + "FREE", + "FROM", + "FULL", + "FUNCTION", + "FUSION", + "GET", + "GLOBAL", + "GRANT", + "GROUP", + "GROUPING", + "GROUPS", + "HAVING", + "HOLD", + "HOUR", + "IDENTITY", + "IN", + "INDICATOR", + "INNER", + "INOUT", + "INSENSITIVE", + "INSERT", + "INT", + "INTEGER", + "INTERSECT", + "INTERSECTION", + "INTERVAL", + "INTO", + "IS", + "JOIN", + "LAG", + "LANGUAGE", + "LARGE", + "LAST_VALUE", + "LATERAL", + "LEAD", + "LEADING", + "LEFT", + "LIKE", + "LIKE_REGEX", + "LN", + "LOCAL", + "LOCALTIME", + "LOCALTIMESTAMP", + "LOWER", + "MATCH", + "MAX", + "MEMBER", + "MERGE", + "METHOD", + "MIN", + "MINUTE", + "MOD", + "MODIFIES", + "MODULE", + "MONTH", + "MULTISET", + "NATIONAL", + "NATURAL", + "NCHAR", + "NCLOB", + "NEW", + "NO", + "NONE", + "NORMALIZE", + "NOT", + "NTH_VALUE", + "NTILE", + "NULL", + "NULLIF", + "NUMERIC", + "OCTET_LENGTH", + "OCCURRENCES_REGEX", + "OF", + "OFFSET", + "OLD", + "ON", + "ONLY", + "OPEN", + "OR", + "ORDER", + "OUT", + "OUTER", + "OVER", + "OVERLAPS", + "OVERLAY", + "PARAMETER", + "PARTITION", + "PERCENT", + "PERCENT_RANK", + "PERCENTILE_CONT", + "PERCENTILE_DISC", + "PERIOD", + "PORTION", + "POSITION", + "POSITION_REGEX", + "POWER", + "PRECEDES", + 
"PRECISION", + "PREPARE", + "PRIMARY", + "PROCEDURE", + "RANGE", + "RANK", + "READS", + "REAL", + "RECURSIVE", + "REF", + "REFERENCES", + "REFERENCING", + "REGR_AVGX", + "REGR_AVGY", + "REGR_COUNT", + "REGR_INTERCEPT", + "REGR_R2", + "REGR_SLOPE", + "REGR_SXX", + "REGR_SXY", + "REGR_SYY", + "RELEASE", + "RESULT", + "RETURN", + "RETURNS", + "REVOKE", + "RIGHT", + "ROLLBACK", + "ROLLUP", + "ROW", + "ROW_NUMBER", + "ROWS", + "SAVEPOINT", + "SCOPE", + "SCROLL", + "SEARCH", + "SECOND", + "SELECT", + "SENSITIVE", + "SESSION_USER", + "SET", + "SIMILAR", + "SMALLINT", + "SOME", + "SPECIFIC", + "SPECIFICTYPE", + "SQL", + "SQLEXCEPTION", + "SQLSTATE", + "SQLWARNING", + "SQRT", + "START", + "STATIC", + "STDDEV_POP", + "STDDEV_SAMP", + "SUBMULTISET", + "SUBSTRING", + "SUBSTRING_REGEX", + "SUCCEEDS", + "SUM", + "SYMMETRIC", + "SYSTEM", + "SYSTEM_TIME", + "SYSTEM_USER", + "TABLE", + "TABLESAMPLE", + "THEN", + "TIME", + "TIMESTAMP", + "TIMEZONE_HOUR", + "TIMEZONE_MINUTE", + "TO", + "TRAILING", + "TRANSLATE", + "TRANSLATE_REGEX", + "TRANSLATION", + "TREAT", + "TRIGGER", + "TRUNCATE", + "TRIM", + "TRIM_ARRAY", + "TRUE", + "UESCAPE", + "UNION", + "UNIQUE", + "UNKNOWN", + "UNNEST", + "UPDATE", + "UPPER", + "USER", + "USING", + "VALUE", + "VALUES", + "VALUE_OF", + "VAR_POP", + "VAR_SAMP", + "VARBINARY", + "VARCHAR", + "VARYING", + "VERSIONING", + "WHEN", + "WHENEVER", + "WHERE", + "WIDTH_BUCKET", + "WINDOW", + "WITH", + "WITHIN", + "WITHOUT", + "YEAR", + ]; + } + + fn is_identifier_start(&self, ch: char) -> bool { + (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') + } + + fn is_identifier_part(&self, ch: char) -> bool { + (ch >= 'a' && ch <= 'z') + || (ch >= 'A' && ch <= 'Z') + || (ch >= '0' && ch <= '9') + || ch == '_' } } @@ -75,67 +374,76 @@ impl Dialect for GenericSqlDialect { return vec![ "SELECT", "FROM", - "WHERE", - "LIMIT", - "ORDER", - "GROUP", - "BY", - "HAVING", - "UNION", - "ALL", - "INSERT", - "UPDATE", - "DELETE", - "IN", - "IS", - "NULL", - "SET", - "CREATE", - 
"EXTERNAL", - "TABLE", - "ASC", - "DESC", - "AND", - "OR", - "NOT", - "AS", - "STORED", - "CSV", - "PARQUET", - "LOCATION", - "WITH", - "WITHOUT", - "HEADER", - "ROW", - - // SQL types - "CHAR", - "CHARACTER", - "VARYING", - "LARGE", - "OBJECT", - "VARCHAR", - "CLOB", - "BINARY", - "VARBINARY", - "BLOB", - "FLOAT", - "REAL", - "DOUBLE", - "PRECISION", - "INT", - "INTEGER", - "SMALLINT", - "BIGINT", - "NUMERIC", - "DECIMAL", - "DEC", - "BOOLEAN", - "DATE", - "TIME", - "TIMESTAMP", - + "WHERE", + "LIMIT", + "ORDER", + "GROUP", + "BY", + "HAVING", + "UNION", + "ALL", + "INSERT", + "UPDATE", + "DELETE", + "IN", + "IS", + "NULL", + "SET", + "CREATE", + "EXTERNAL", + "TABLE", + "ASC", + "DESC", + "AND", + "OR", + "NOT", + "AS", + "STORED", + "CSV", + "PARQUET", + "LOCATION", + "WITH", + "WITHOUT", + "HEADER", + "ROW", + // SQL types + "CHAR", + "CHARACTER", + "VARYING", + "LARGE", + "OBJECT", + "VARCHAR", + "CLOB", + "BINARY", + "VARBINARY", + "BLOB", + "FLOAT", + "REAL", + "DOUBLE", + "PRECISION", + "INT", + "INTEGER", + "SMALLINT", + "BIGINT", + "NUMERIC", + "DECIMAL", + "DEC", + "BOOLEAN", + "DATE", + "TIME", + "TIMESTAMP", ]; } -} + fn is_identifier_start(&self, ch: char) -> bool { + (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '@' || ch == '_' + } + + fn is_identifier_part(&self, ch: char) -> bool { + (ch >= 'a' && ch <= 'z') + || (ch >= 'A' && ch <= 'Z') + || (ch >= '0' && ch <= '9') + || ch == '@' + || ch == '_' + } +} diff --git a/src/sqlparser.rs b/src/sqlparser.rs index 0718e45f..234fd607 100644 --- a/src/sqlparser.rs +++ b/src/sqlparser.rs @@ -619,8 +619,8 @@ impl Parser { #[cfg(test)] mod tests { - use super::*; use super::super::dialect::GenericSqlDialect; + use super::*; #[test] fn parse_delete_statement() { @@ -952,7 +952,7 @@ mod tests { fn parse_sql(sql: &str) -> ASTNode { let dialect = GenericSqlDialect {}; - let mut tokenizer = Tokenizer::new(&dialect,&sql, ); + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = 
tokenizer.tokenize().unwrap(); let mut parser = Parser::new(tokens); let ast = parser.parse().unwrap(); diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs index 345d130a..6a5e6876 100644 --- a/src/sqltokenizer.rs +++ b/src/sqltokenizer.rs @@ -13,6 +13,10 @@ // limitations under the License. //! SQL Tokenizer +//! +//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens. +//! +//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST). use std::iter::Peekable; use std::str::Chars; @@ -69,18 +73,18 @@ pub enum Token { pub struct TokenizerError(String); /// SQL Tokenizer -pub struct Tokenizer { - keywords: Vec<&'static str>, +pub struct Tokenizer<'a> { + dialect: &'a Dialect, pub query: String, pub line: u64, pub col: u64, } -impl Tokenizer { +impl<'a> Tokenizer<'a> { /// Create a new SQL tokenizer for the specified SQL statement - pub fn new(dialect: &Dialect, query: &str) -> Self { + pub fn new(dialect: &'a Dialect, query: &str) -> Self { Self { - keywords: dialect.keywords(), + dialect, query: query.to_string(), line: 1, col: 1, @@ -91,8 +95,7 @@ impl Tokenizer { //TODO: need to reintroduce FnvHashSet at some point .. 
iterating over keywords is // not fast but I want the simplicity for now while I experiment with pluggable // dialects - return self.keywords.contains(&s); - + return self.dialect.keywords().contains(&s); } /// Tokenize the statement and produce a vector of tokens @@ -138,15 +141,16 @@ impl Tokenizer { Ok(Some(Token::Whitespace(ch))) } // identifier or keyword - 'a'...'z' | 'A'...'Z' | '_' | '@' => { + ch if self.dialect.is_identifier_start(ch) => { let mut s = String::new(); + chars.next(); // consume + s.push(ch); while let Some(&ch) = chars.peek() { - match ch { - 'a'...'z' | 'A'...'Z' | '_' | '0'...'9' | '@' => { - chars.next(); // consume - s.push(ch); - } - _ => break, + if self.dialect.is_identifier_part(ch) { + chars.next(); // consume + s.push(ch); + } else { + break; } } let upper_str = s.to_uppercase(); @@ -293,14 +297,14 @@ impl Tokenizer { #[cfg(test)] mod tests { + use super::super::dialect::GenericSqlDialect; use super::*; - use super::super::dialect::{GenericSqlDialect}; #[test] fn tokenize_select_1() { let sql = String::from("SELECT 1"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ @@ -314,8 +318,8 @@ mod tests { #[test] fn tokenize_scalar_function() { let sql = String::from("SELECT sqrt(1)"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ @@ -332,8 +336,8 @@ mod tests { #[test] fn tokenize_simple_select() { let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, 
&sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ @@ -355,8 +359,8 @@ mod tests { #[test] fn tokenize_string_predicate() { let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![ @@ -377,8 +381,8 @@ mod tests { fn tokenize_invalid_string() { let sql = String::from("\nمصطفىh"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize(); match tokens { @@ -396,8 +400,8 @@ mod tests { fn tokenize_invalid_string_cols() { let sql = String::from("\n\nSELECT * FROM table\tمصطفىh"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize(); match tokens { Err(e) => assert_eq!( @@ -413,8 +417,8 @@ mod tests { #[test] fn tokenize_is_null() { let sql = String::from("a IS NULL"); - let dialect = GenericSqlDialect{}; - let mut tokenizer = Tokenizer::new(&dialect,&sql); + let dialect = GenericSqlDialect {}; + let mut tokenizer = Tokenizer::new(&dialect, &sql); let tokens = tokenizer.tokenize().unwrap(); let expected = vec![