tokenizer delegates to dialect now

Andy Grove 2018-09-08 14:49:25 -06:00
parent 96f1f9f35e
commit 810cd8e6cf
4 changed files with 449 additions and 137 deletions
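In effect, keyword lists and identifier rules now both come from the Dialect trait (see dialect.rs below), so an embedder can supply its own rules without touching the tokenizer. A minimal sketch of a custom dialect against the new trait; the type name and the exact character rules here are illustrative, not part of this commit:

// Hypothetical dialect implementing the trait this commit introduces.
struct MyDialect {}

impl Dialect for MyDialect {
    fn keywords(&self) -> Vec<&'static str> {
        vec!["SELECT", "FROM", "WHERE"]
    }
    // Identifiers may start with a letter or an underscore...
    fn is_identifier_start(&self, ch: char) -> bool {
        (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_'
    }
    // ...and may continue with digits as well.
    fn is_identifier_part(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || (ch >= '0' && ch <= '9')
    }
}

// It plugs into the tokenizer exactly like the built-in dialects:
// let mut tokenizer = Tokenizer::new(&MyDialect {}, &sql);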

View file

@@ -9,9 +9,9 @@ fn main() {
WHERE a > b AND b < 100 \
ORDER BY a DESC, b";
-    let dialect = GenericSqlDialect{};
+    let dialect = GenericSqlDialect {};
-    let ast = Parser::parse_sql(&dialect,sql.to_string()).unwrap();
+    let ast = Parser::parse_sql(&dialect, sql.to_string()).unwrap();
println!("AST: {:?}", ast);
}

View file

@@ -17,54 +17,353 @@
pub trait Dialect {
/// Get a list of keywords for this dialect
fn keywords(&self) -> Vec<&'static str>;
+    /// Determine if a character is a valid identifier start character
+    fn is_identifier_start(&self, ch: char) -> bool;
+    /// Determine if a character is a valid identifier character
+    fn is_identifier_part(&self, ch: char) -> bool;
}
-pub struct AnsiSqlDialect {
-}
+pub struct AnsiSqlDialect {}
impl Dialect for AnsiSqlDialect {
fn keywords(&self) -> Vec<&'static str> {
return vec!["ABS", "ALL", "ALLOCATE", "ALTER", "AND", "ANY", "ARE", "ARRAY", "ARRAY_AGG",
"ARRAY_MAX_CARDINALITY", "AS", "ASENSITIVE", "ASYMMETRIC", "AT", "ATOMIC", "AUTHORIZATION",
"AVG", "BEGIN", "BEGIN_FRAME", "BEGIN_PARTITION", "BETWEEN", "BIGINT", "BINARY", "BLOB",
"BOOLEAN", "BOTH", "BY", "CALL", "CALLED", "CARDINALITY", "CASCADED", "CASE", "CAST", "CEIL",
"CEILING", "CHAR", "CHAR_LENGTH", "CHARACTER", "CHARACTER_LENGTH", "CHECK", "CLOB", "CLOSE",
"COALESCE", "COLLATE", "COLLECT", "COLUMN", "COMMIT", "CONDITION", "CONNECT", "CONSTRAINT",
"CONTAINS", "CONVERT", "CORR", "CORRESPONDING", "COUNT", "COVAR_POP", "COVAR_SAMP", "CREATE",
"CROSS", "CUBE", "CUME_DIST", "CURRENT", "CURRENT_CATALOG", "CURRENT_DATE",
"CURRENT_DEFAULT_TRANSFORM_GROUP", "CURRENT_PATH", "CURRENT_ROLE", "CURRENT_ROW",
"CURRENT_SCHEMA", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURRENT_TRANSFORM_GROUP_FOR_TYPE",
"CURRENT_USER", "CURSOR", "CYCLE", "DATE", "DAY", "DEALLOCATE", "DEC", "DECIMAL", "DECLARE",
"DEFAULT", "DELETE", "DENSE_RANK", "DEREF", "DESCRIBE", "DETERMINISTIC", "DISCONNECT",
"DISTINCT", "DOUBLE", "DROP", "DYNAMIC", "EACH", "ELEMENT", "ELSE", "END", "END_FRAME",
"END_PARTITION", "END-EXEC", "EQUALS", "ESCAPE", "EVERY", "EXCEPT", "EXEC", "EXECUTE",
"EXISTS", "EXP", "EXTERNAL", "EXTRACT", "FALSE", "FETCH", "FILTER", "FIRST_VALUE", "FLOAT",
"FLOOR", "FOR", "FOREIGN", "FRAME_ROW", "FREE", "FROM", "FULL", "FUNCTION", "FUSION",
"GET", "GLOBAL", "GRANT", "GROUP", "GROUPING", "GROUPS", "HAVING", "HOLD", "HOUR", "IDENTITY",
"IN", "INDICATOR", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT",
"INTERSECTION", "INTERVAL", "INTO", "IS", "JOIN", "LAG", "LANGUAGE", "LARGE", "LAST_VALUE",
"LATERAL", "LEAD", "LEADING", "LEFT", "LIKE", "LIKE_REGEX", "LN", "LOCAL", "LOCALTIME",
"LOCALTIMESTAMP", "LOWER", "MATCH", "MAX", "MEMBER", "MERGE", "METHOD", "MIN", "MINUTE",
"MOD", "MODIFIES", "MODULE", "MONTH", "MULTISET", "NATIONAL", "NATURAL", "NCHAR", "NCLOB",
"NEW", "NO", "NONE", "NORMALIZE", "NOT", "NTH_VALUE", "NTILE", "NULL", "NULLIF", "NUMERIC",
"OCTET_LENGTH", "OCCURRENCES_REGEX", "OF", "OFFSET", "OLD", "ON", "ONLY", "OPEN", "OR",
"ORDER", "OUT", "OUTER", "OVER", "OVERLAPS", "OVERLAY", "PARAMETER", "PARTITION", "PERCENT",
"PERCENT_RANK", "PERCENTILE_CONT", "PERCENTILE_DISC", "PERIOD", "PORTION", "POSITION",
"POSITION_REGEX", "POWER", "PRECEDES", "PRECISION", "PREPARE", "PRIMARY",
"PROCEDURE", "RANGE", "RANK", "READS", "REAL", "RECURSIVE", "REF", "REFERENCES",
"REFERENCING", "REGR_AVGX", "REGR_AVGY", "REGR_COUNT", "REGR_INTERCEPT", "REGR_R2",
"REGR_SLOPE", "REGR_SXX", "REGR_SXY", "REGR_SYY", "RELEASE", "RESULT", "RETURN", "RETURNS",
"REVOKE", "RIGHT", "ROLLBACK", "ROLLUP", "ROW", "ROW_NUMBER", "ROWS", "SAVEPOINT",
"SCOPE", "SCROLL", "SEARCH", "SECOND", "SELECT", "SENSITIVE", "SESSION_USER", "SET",
"SIMILAR", "SMALLINT", "SOME", "SPECIFIC", "SPECIFICTYPE", "SQL", "SQLEXCEPTION", "SQLSTATE",
"SQLWARNING", "SQRT", "START", "STATIC", "STDDEV_POP", "STDDEV_SAMP", "SUBMULTISET",
"SUBSTRING", "SUBSTRING_REGEX", "SUCCEEDS", "SUM", "SYMMETRIC", "SYSTEM", "SYSTEM_TIME",
"SYSTEM_USER", "TABLE", "TABLESAMPLE", "THEN", "TIME", "TIMESTAMP", "TIMEZONE_HOUR",
"TIMEZONE_MINUTE", "TO", "TRAILING", "TRANSLATE", "TRANSLATE_REGEX", "TRANSLATION",
"TREAT", "TRIGGER", "TRUNCATE", "TRIM", "TRIM_ARRAY", "TRUE", "UESCAPE", "UNION", "UNIQUE",
"UNKNOWN", "UNNEST", "UPDATE", "UPPER", "USER", "USING", "VALUE", "VALUES", "VALUE_OF",
"VAR_POP", "VAR_SAMP", "VARBINARY", "VARCHAR", "VARYING", "VERSIONING", "WHEN", "WHENEVER",
"WHERE", "WIDTH_BUCKET", "WINDOW", "WITH", "WITHIN", "WITHOUT", "YEAR"];
+        return vec![
+            "ABS",
+            "ALL",
+            "ALLOCATE",
+            "ALTER",
+            "AND",
+            "ANY",
+            "ARE",
+            "ARRAY",
+            "ARRAY_AGG",
+            "ARRAY_MAX_CARDINALITY",
+            "AS",
+            "ASENSITIVE",
+            "ASYMMETRIC",
+            "AT",
+            "ATOMIC",
+            "AUTHORIZATION",
+            "AVG",
+            "BEGIN",
+            "BEGIN_FRAME",
+            "BEGIN_PARTITION",
+            "BETWEEN",
+            "BIGINT",
+            "BINARY",
+            "BLOB",
+            "BOOLEAN",
+            "BOTH",
+            "BY",
+            "CALL",
+            "CALLED",
+            "CARDINALITY",
+            "CASCADED",
+            "CASE",
+            "CAST",
+            "CEIL",
+            "CEILING",
+            "CHAR",
+            "CHAR_LENGTH",
+            "CHARACTER",
+            "CHARACTER_LENGTH",
+            "CHECK",
+            "CLOB",
+            "CLOSE",
+            "COALESCE",
+            "COLLATE",
+            "COLLECT",
+            "COLUMN",
+            "COMMIT",
+            "CONDITION",
+            "CONNECT",
+            "CONSTRAINT",
+            "CONTAINS",
+            "CONVERT",
+            "CORR",
+            "CORRESPONDING",
+            "COUNT",
+            "COVAR_POP",
+            "COVAR_SAMP",
+            "CREATE",
+            "CROSS",
+            "CUBE",
+            "CUME_DIST",
+            "CURRENT",
+            "CURRENT_CATALOG",
+            "CURRENT_DATE",
+            "CURRENT_DEFAULT_TRANSFORM_GROUP",
+            "CURRENT_PATH",
+            "CURRENT_ROLE",
+            "CURRENT_ROW",
+            "CURRENT_SCHEMA",
+            "CURRENT_TIME",
+            "CURRENT_TIMESTAMP",
+            "CURRENT_TRANSFORM_GROUP_FOR_TYPE",
+            "CURRENT_USER",
+            "CURSOR",
+            "CYCLE",
+            "DATE",
+            "DAY",
+            "DEALLOCATE",
+            "DEC",
+            "DECIMAL",
+            "DECLARE",
+            "DEFAULT",
+            "DELETE",
+            "DENSE_RANK",
+            "DEREF",
+            "DESCRIBE",
+            "DETERMINISTIC",
+            "DISCONNECT",
+            "DISTINCT",
+            "DOUBLE",
+            "DROP",
+            "DYNAMIC",
+            "EACH",
+            "ELEMENT",
+            "ELSE",
+            "END",
+            "END_FRAME",
+            "END_PARTITION",
+            "END-EXEC",
+            "EQUALS",
+            "ESCAPE",
+            "EVERY",
+            "EXCEPT",
+            "EXEC",
+            "EXECUTE",
+            "EXISTS",
+            "EXP",
+            "EXTERNAL",
+            "EXTRACT",
+            "FALSE",
+            "FETCH",
+            "FILTER",
+            "FIRST_VALUE",
+            "FLOAT",
+            "FLOOR",
+            "FOR",
+            "FOREIGN",
+            "FRAME_ROW",
+            "FREE",
+            "FROM",
+            "FULL",
+            "FUNCTION",
+            "FUSION",
+            "GET",
+            "GLOBAL",
+            "GRANT",
+            "GROUP",
+            "GROUPING",
+            "GROUPS",
+            "HAVING",
+            "HOLD",
+            "HOUR",
+            "IDENTITY",
+            "IN",
+            "INDICATOR",
+            "INNER",
+            "INOUT",
+            "INSENSITIVE",
+            "INSERT",
+            "INT",
+            "INTEGER",
+            "INTERSECT",
+            "INTERSECTION",
+            "INTERVAL",
+            "INTO",
+            "IS",
+            "JOIN",
+            "LAG",
+            "LANGUAGE",
+            "LARGE",
+            "LAST_VALUE",
+            "LATERAL",
+            "LEAD",
+            "LEADING",
+            "LEFT",
+            "LIKE",
+            "LIKE_REGEX",
+            "LN",
+            "LOCAL",
+            "LOCALTIME",
+            "LOCALTIMESTAMP",
+            "LOWER",
+            "MATCH",
+            "MAX",
+            "MEMBER",
+            "MERGE",
+            "METHOD",
+            "MIN",
+            "MINUTE",
+            "MOD",
+            "MODIFIES",
+            "MODULE",
+            "MONTH",
+            "MULTISET",
+            "NATIONAL",
+            "NATURAL",
+            "NCHAR",
+            "NCLOB",
+            "NEW",
+            "NO",
+            "NONE",
+            "NORMALIZE",
+            "NOT",
+            "NTH_VALUE",
+            "NTILE",
+            "NULL",
+            "NULLIF",
+            "NUMERIC",
+            "OCTET_LENGTH",
+            "OCCURRENCES_REGEX",
+            "OF",
+            "OFFSET",
+            "OLD",
+            "ON",
+            "ONLY",
+            "OPEN",
+            "OR",
+            "ORDER",
+            "OUT",
+            "OUTER",
+            "OVER",
+            "OVERLAPS",
+            "OVERLAY",
+            "PARAMETER",
+            "PARTITION",
+            "PERCENT",
+            "PERCENT_RANK",
+            "PERCENTILE_CONT",
+            "PERCENTILE_DISC",
+            "PERIOD",
+            "PORTION",
+            "POSITION",
+            "POSITION_REGEX",
+            "POWER",
+            "PRECEDES",
+            "PRECISION",
+            "PREPARE",
+            "PRIMARY",
+            "PROCEDURE",
+            "RANGE",
+            "RANK",
+            "READS",
+            "REAL",
+            "RECURSIVE",
+            "REF",
+            "REFERENCES",
+            "REFERENCING",
+            "REGR_AVGX",
+            "REGR_AVGY",
+            "REGR_COUNT",
+            "REGR_INTERCEPT",
+            "REGR_R2",
+            "REGR_SLOPE",
+            "REGR_SXX",
+            "REGR_SXY",
+            "REGR_SYY",
+            "RELEASE",
+            "RESULT",
+            "RETURN",
+            "RETURNS",
+            "REVOKE",
+            "RIGHT",
+            "ROLLBACK",
+            "ROLLUP",
+            "ROW",
+            "ROW_NUMBER",
+            "ROWS",
+            "SAVEPOINT",
+            "SCOPE",
+            "SCROLL",
+            "SEARCH",
+            "SECOND",
+            "SELECT",
+            "SENSITIVE",
+            "SESSION_USER",
+            "SET",
+            "SIMILAR",
+            "SMALLINT",
+            "SOME",
+            "SPECIFIC",
+            "SPECIFICTYPE",
+            "SQL",
+            "SQLEXCEPTION",
+            "SQLSTATE",
+            "SQLWARNING",
+            "SQRT",
+            "START",
+            "STATIC",
+            "STDDEV_POP",
+            "STDDEV_SAMP",
+            "SUBMULTISET",
+            "SUBSTRING",
+            "SUBSTRING_REGEX",
+            "SUCCEEDS",
+            "SUM",
+            "SYMMETRIC",
+            "SYSTEM",
+            "SYSTEM_TIME",
+            "SYSTEM_USER",
+            "TABLE",
+            "TABLESAMPLE",
+            "THEN",
+            "TIME",
+            "TIMESTAMP",
+            "TIMEZONE_HOUR",
+            "TIMEZONE_MINUTE",
+            "TO",
+            "TRAILING",
+            "TRANSLATE",
+            "TRANSLATE_REGEX",
+            "TRANSLATION",
+            "TREAT",
+            "TRIGGER",
+            "TRUNCATE",
+            "TRIM",
+            "TRIM_ARRAY",
+            "TRUE",
+            "UESCAPE",
+            "UNION",
+            "UNIQUE",
+            "UNKNOWN",
+            "UNNEST",
+            "UPDATE",
+            "UPPER",
+            "USER",
+            "USING",
+            "VALUE",
+            "VALUES",
+            "VALUE_OF",
+            "VAR_POP",
+            "VAR_SAMP",
+            "VARBINARY",
+            "VARCHAR",
+            "VARYING",
+            "VERSIONING",
+            "WHEN",
+            "WHENEVER",
+            "WHERE",
+            "WIDTH_BUCKET",
+            "WINDOW",
+            "WITH",
+            "WITHIN",
+            "WITHOUT",
+            "YEAR",
+        ];
}
+    fn is_identifier_start(&self, ch: char) -> bool {
+        (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
+    }
+    fn is_identifier_part(&self, ch: char) -> bool {
+        (ch >= 'a' && ch <= 'z')
+            || (ch >= 'A' && ch <= 'Z')
+            || (ch >= '0' && ch <= '9')
+            || ch == '_'
+    }
}
@@ -75,67 +374,76 @@ impl Dialect for GenericSqlDialect {
return vec![
"SELECT",
"FROM",
"WHERE",
"LIMIT",
"ORDER",
"GROUP",
"BY",
"HAVING",
"UNION",
"ALL",
"INSERT",
"UPDATE",
"DELETE",
"IN",
"IS",
"NULL",
"SET",
"CREATE",
"EXTERNAL",
"TABLE",
"ASC",
"DESC",
"AND",
"OR",
"NOT",
"AS",
"STORED",
"CSV",
"PARQUET",
"LOCATION",
"WITH",
"WITHOUT",
"HEADER",
"ROW",
// SQL types
"CHAR",
"CHARACTER",
"VARYING",
"LARGE",
"OBJECT",
"VARCHAR",
"CLOB",
"BINARY",
"VARBINARY",
"BLOB",
"FLOAT",
"REAL",
"DOUBLE",
"PRECISION",
"INT",
"INTEGER",
"SMALLINT",
"BIGINT",
"NUMERIC",
"DECIMAL",
"DEC",
"BOOLEAN",
"DATE",
"TIME",
"TIMESTAMP",
"WHERE",
"LIMIT",
"ORDER",
"GROUP",
"BY",
"HAVING",
"UNION",
"ALL",
"INSERT",
"UPDATE",
"DELETE",
"IN",
"IS",
"NULL",
"SET",
"CREATE",
"EXTERNAL",
"TABLE",
"ASC",
"DESC",
"AND",
"OR",
"NOT",
"AS",
"STORED",
"CSV",
"PARQUET",
"LOCATION",
"WITH",
"WITHOUT",
"HEADER",
"ROW",
// SQL types
"CHAR",
"CHARACTER",
"VARYING",
"LARGE",
"OBJECT",
"VARCHAR",
"CLOB",
"BINARY",
"VARBINARY",
"BLOB",
"FLOAT",
"REAL",
"DOUBLE",
"PRECISION",
"INT",
"INTEGER",
"SMALLINT",
"BIGINT",
"NUMERIC",
"DECIMAL",
"DEC",
"BOOLEAN",
"DATE",
"TIME",
"TIMESTAMP",
];
}
-}
+    fn is_identifier_start(&self, ch: char) -> bool {
+        (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '@'
+    }
+    fn is_identifier_part(&self, ch: char) -> bool {
+        (ch >= 'a' && ch <= 'z')
+            || (ch >= 'A' && ch <= 'Z')
+            || (ch >= '0' && ch <= '9')
+            || ch == '@'
+            || ch == '_'
+    }
+}
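The practical difference between the two dialects above is the '@' character: GenericSqlDialect accepts it both as an identifier start and as an identifier part, while AnsiSqlDialect does not. A minimal sketch of exercising that hook (the tokens are just debug-printed here, since the Token variants are not shown in this hunk):

let dialect = GenericSqlDialect {};
let sql = String::from("SELECT * FROM customer WHERE region = @region");
let mut tokenizer = Tokenizer::new(&dialect, &sql);
// "@region" should scan as a single identifier because the generic
// dialect's is_identifier_start/is_identifier_part both accept '@'.
let tokens = tokenizer.tokenize().unwrap();
println!("{:?}", tokens);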

View file

@@ -619,8 +619,8 @@ impl Parser {
#[cfg(test)]
mod tests {
-    use super::*;
+    use super::super::dialect::GenericSqlDialect;
+    use super::*;
#[test]
fn parse_delete_statement() {
@@ -952,7 +952,7 @@ mod tests {
fn parse_sql(sql: &str) -> ASTNode {
let dialect = GenericSqlDialect {};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql, );
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let mut parser = Parser::new(tokens);
let ast = parser.parse().unwrap();

View file

@@ -13,6 +13,10 @@
// limitations under the License.
//! SQL Tokenizer
//!
+//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
+//!
+//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
use std::iter::Peekable;
use std::str::Chars;
@@ -69,18 +73,18 @@ pub enum Token {
pub struct TokenizerError(String);
/// SQL Tokenizer
-pub struct Tokenizer {
-    keywords: Vec<&'static str>,
+pub struct Tokenizer<'a> {
+    dialect: &'a Dialect,
pub query: String,
pub line: u64,
pub col: u64,
}
-impl Tokenizer {
+impl<'a> Tokenizer<'a> {
/// Create a new SQL tokenizer for the specified SQL statement
-    pub fn new(dialect: &Dialect, query: &str) -> Self {
+    pub fn new(dialect: &'a Dialect, query: &str) -> Self {
Self {
-            keywords: dialect.keywords(),
+            dialect,
query: query.to_string(),
line: 1,
col: 1,
@@ -91,8 +95,7 @@ impl Tokenizer {
//TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
// not fast but I want the simplicity for now while I experiment with pluggable
// dialects
-        return self.keywords.contains(&s);
+        return self.dialect.keywords().contains(&s);
}
/// Tokenize the statement and produce a vector of tokens
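For what it's worth, the FnvHashSet idea from the TODO above could later look something like the sketch below: build the set once in new() and keep the dialect reference for the identifier hooks. This assumes the fnv crate and is not part of this commit:

// Hypothetical optimization sketched from the TODO, not part of this commit.
use fnv::FnvHashSet;

pub struct Tokenizer<'a> {
    dialect: &'a Dialect,
    // Cached once at construction instead of rebuilt on every is_keyword call.
    keywords: FnvHashSet<&'static str>,
    pub query: String,
    pub line: u64,
    pub col: u64,
}

impl<'a> Tokenizer<'a> {
    pub fn new(dialect: &'a Dialect, query: &str) -> Self {
        Self {
            dialect,
            keywords: dialect.keywords().into_iter().collect(),
            query: query.to_string(),
            line: 1,
            col: 1,
        }
    }

    fn is_keyword(&self, s: &str) -> bool {
        self.keywords.contains(s)
    }
}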
@@ -138,15 +141,16 @@ impl Tokenizer {
Ok(Some(Token::Whitespace(ch)))
}
// identifier or keyword
-            'a'...'z' | 'A'...'Z' | '_' | '@' => {
+            ch if self.dialect.is_identifier_start(ch) => {
let mut s = String::new();
chars.next(); // consume
s.push(ch);
while let Some(&ch) = chars.peek() {
-                    match ch {
-                        'a'...'z' | 'A'...'Z' | '_' | '0'...'9' | '@' => {
-                            chars.next(); // consume
-                            s.push(ch);
-                        }
-                        _ => break,
+                    if self.dialect.is_identifier_part(ch) {
+                        chars.next(); // consume
+                        s.push(ch);
+                    } else {
+                        break;
+                    }
}
let upper_str = s.to_uppercase();
@@ -293,14 +297,14 @@ impl Tokenizer {
#[cfg(test)]
mod tests {
+    use super::super::dialect::GenericSqlDialect;
use super::*;
-    use super::super::dialect::{GenericSqlDialect};
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -314,8 +318,8 @@ mod tests {
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -332,8 +336,8 @@ mod tests {
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -355,8 +359,8 @@ mod tests {
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -377,8 +381,8 @@ mod tests {
fn tokenize_invalid_string() {
let sql = String::from("\nمصطفىh");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize();
match tokens {
@@ -396,8 +400,8 @@ mod tests {
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize();
match tokens {
Err(e) => assert_eq!(
@@ -413,8 +417,8 @@ mod tests {
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
-        let dialect = GenericSqlDialect{};
-        let mut tokenizer = Tokenizer::new(&dialect,&sql);
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![