tokenizer delegates to dialect now

Andy Grove 2018-09-08 14:49:25 -06:00
parent 96f1f9f35e
commit 810cd8e6cf
4 changed files with 449 additions and 137 deletions

View file

@@ -9,9 +9,9 @@ fn main() {
WHERE a > b AND b < 100 \
ORDER BY a DESC, b";
let dialect = GenericSqlDialect{};
let dialect = GenericSqlDialect {};
let ast = Parser::parse_sql(&dialect,sql.to_string()).unwrap();
let ast = Parser::parse_sql(&dialect, sql.to_string()).unwrap();
println!("AST: {:?}", ast);
}

View file

@@ -17,54 +17,353 @@
pub trait Dialect {
/// Get a list of keywords for this dialect
fn keywords(&self) -> Vec<&'static str>;
/// Determine if a character is a valid identifier start character
fn is_identifier_start(&self, ch: char) -> bool;
/// Determine if a character is a valid identifier character
fn is_identifier_part(&self, ch: char) -> bool;
}
pub struct AnsiSqlDialect {
}
pub struct AnsiSqlDialect {}
impl Dialect for AnsiSqlDialect {
fn keywords(&self) -> Vec<&'static str> {
return vec!["ABS", "ALL", "ALLOCATE", "ALTER", "AND", "ANY", "ARE", "ARRAY", "ARRAY_AGG",
"ARRAY_MAX_CARDINALITY", "AS", "ASENSITIVE", "ASYMMETRIC", "AT", "ATOMIC", "AUTHORIZATION",
"AVG", "BEGIN", "BEGIN_FRAME", "BEGIN_PARTITION", "BETWEEN", "BIGINT", "BINARY", "BLOB",
"BOOLEAN", "BOTH", "BY", "CALL", "CALLED", "CARDINALITY", "CASCADED", "CASE", "CAST", "CEIL",
"CEILING", "CHAR", "CHAR_LENGTH", "CHARACTER", "CHARACTER_LENGTH", "CHECK", "CLOB", "CLOSE",
"COALESCE", "COLLATE", "COLLECT", "COLUMN", "COMMIT", "CONDITION", "CONNECT", "CONSTRAINT",
"CONTAINS", "CONVERT", "CORR", "CORRESPONDING", "COUNT", "COVAR_POP", "COVAR_SAMP", "CREATE",
"CROSS", "CUBE", "CUME_DIST", "CURRENT", "CURRENT_CATALOG", "CURRENT_DATE",
"CURRENT_DEFAULT_TRANSFORM_GROUP", "CURRENT_PATH", "CURRENT_ROLE", "CURRENT_ROW",
"CURRENT_SCHEMA", "CURRENT_TIME", "CURRENT_TIMESTAMP", "CURRENT_TRANSFORM_GROUP_FOR_TYPE",
"CURRENT_USER", "CURSOR", "CYCLE", "DATE", "DAY", "DEALLOCATE", "DEC", "DECIMAL", "DECLARE",
"DEFAULT", "DELETE", "DENSE_RANK", "DEREF", "DESCRIBE", "DETERMINISTIC", "DISCONNECT",
"DISTINCT", "DOUBLE", "DROP", "DYNAMIC", "EACH", "ELEMENT", "ELSE", "END", "END_FRAME",
"END_PARTITION", "END-EXEC", "EQUALS", "ESCAPE", "EVERY", "EXCEPT", "EXEC", "EXECUTE",
"EXISTS", "EXP", "EXTERNAL", "EXTRACT", "FALSE", "FETCH", "FILTER", "FIRST_VALUE", "FLOAT",
"FLOOR", "FOR", "FOREIGN", "FRAME_ROW", "FREE", "FROM", "FULL", "FUNCTION", "FUSION",
"GET", "GLOBAL", "GRANT", "GROUP", "GROUPING", "GROUPS", "HAVING", "HOLD", "HOUR", "IDENTITY",
"IN", "INDICATOR", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER", "INTERSECT",
"INTERSECTION", "INTERVAL", "INTO", "IS", "JOIN", "LAG", "LANGUAGE", "LARGE", "LAST_VALUE",
"LATERAL", "LEAD", "LEADING", "LEFT", "LIKE", "LIKE_REGEX", "LN", "LOCAL", "LOCALTIME",
"LOCALTIMESTAMP", "LOWER", "MATCH", "MAX", "MEMBER", "MERGE", "METHOD", "MIN", "MINUTE",
"MOD", "MODIFIES", "MODULE", "MONTH", "MULTISET", "NATIONAL", "NATURAL", "NCHAR", "NCLOB",
"NEW", "NO", "NONE", "NORMALIZE", "NOT", "NTH_VALUE", "NTILE", "NULL", "NULLIF", "NUMERIC",
"OCTET_LENGTH", "OCCURRENCES_REGEX", "OF", "OFFSET", "OLD", "ON", "ONLY", "OPEN", "OR",
"ORDER", "OUT", "OUTER", "OVER", "OVERLAPS", "OVERLAY", "PARAMETER", "PARTITION", "PERCENT",
"PERCENT_RANK", "PERCENTILE_CONT", "PERCENTILE_DISC", "PERIOD", "PORTION", "POSITION",
"POSITION_REGEX", "POWER", "PRECEDES", "PRECISION", "PREPARE", "PRIMARY",
"PROCEDURE", "RANGE", "RANK", "READS", "REAL", "RECURSIVE", "REF", "REFERENCES",
"REFERENCING", "REGR_AVGX", "REGR_AVGY", "REGR_COUNT", "REGR_INTERCEPT", "REGR_R2",
"REGR_SLOPE", "REGR_SXX", "REGR_SXY", "REGR_SYY", "RELEASE", "RESULT", "RETURN", "RETURNS",
"REVOKE", "RIGHT", "ROLLBACK", "ROLLUP", "ROW", "ROW_NUMBER", "ROWS", "SAVEPOINT",
"SCOPE", "SCROLL", "SEARCH", "SECOND", "SELECT", "SENSITIVE", "SESSION_USER", "SET",
"SIMILAR", "SMALLINT", "SOME", "SPECIFIC", "SPECIFICTYPE", "SQL", "SQLEXCEPTION", "SQLSTATE",
"SQLWARNING", "SQRT", "START", "STATIC", "STDDEV_POP", "STDDEV_SAMP", "SUBMULTISET",
"SUBSTRING", "SUBSTRING_REGEX", "SUCCEEDS", "SUM", "SYMMETRIC", "SYSTEM", "SYSTEM_TIME",
"SYSTEM_USER", "TABLE", "TABLESAMPLE", "THEN", "TIME", "TIMESTAMP", "TIMEZONE_HOUR",
"TIMEZONE_MINUTE", "TO", "TRAILING", "TRANSLATE", "TRANSLATE_REGEX", "TRANSLATION",
"TREAT", "TRIGGER", "TRUNCATE", "TRIM", "TRIM_ARRAY", "TRUE", "UESCAPE", "UNION", "UNIQUE",
"UNKNOWN", "UNNEST", "UPDATE", "UPPER", "USER", "USING", "VALUE", "VALUES", "VALUE_OF",
"VAR_POP", "VAR_SAMP", "VARBINARY", "VARCHAR", "VARYING", "VERSIONING", "WHEN", "WHENEVER",
"WHERE", "WIDTH_BUCKET", "WINDOW", "WITH", "WITHIN", "WITHOUT", "YEAR"];
return vec![
"ABS",
"ALL",
"ALLOCATE",
"ALTER",
"AND",
"ANY",
"ARE",
"ARRAY",
"ARRAY_AGG",
"ARRAY_MAX_CARDINALITY",
"AS",
"ASENSITIVE",
"ASYMMETRIC",
"AT",
"ATOMIC",
"AUTHORIZATION",
"AVG",
"BEGIN",
"BEGIN_FRAME",
"BEGIN_PARTITION",
"BETWEEN",
"BIGINT",
"BINARY",
"BLOB",
"BOOLEAN",
"BOTH",
"BY",
"CALL",
"CALLED",
"CARDINALITY",
"CASCADED",
"CASE",
"CAST",
"CEIL",
"CEILING",
"CHAR",
"CHAR_LENGTH",
"CHARACTER",
"CHARACTER_LENGTH",
"CHECK",
"CLOB",
"CLOSE",
"COALESCE",
"COLLATE",
"COLLECT",
"COLUMN",
"COMMIT",
"CONDITION",
"CONNECT",
"CONSTRAINT",
"CONTAINS",
"CONVERT",
"CORR",
"CORRESPONDING",
"COUNT",
"COVAR_POP",
"COVAR_SAMP",
"CREATE",
"CROSS",
"CUBE",
"CUME_DIST",
"CURRENT",
"CURRENT_CATALOG",
"CURRENT_DATE",
"CURRENT_DEFAULT_TRANSFORM_GROUP",
"CURRENT_PATH",
"CURRENT_ROLE",
"CURRENT_ROW",
"CURRENT_SCHEMA",
"CURRENT_TIME",
"CURRENT_TIMESTAMP",
"CURRENT_TRANSFORM_GROUP_FOR_TYPE",
"CURRENT_USER",
"CURSOR",
"CYCLE",
"DATE",
"DAY",
"DEALLOCATE",
"DEC",
"DECIMAL",
"DECLARE",
"DEFAULT",
"DELETE",
"DENSE_RANK",
"DEREF",
"DESCRIBE",
"DETERMINISTIC",
"DISCONNECT",
"DISTINCT",
"DOUBLE",
"DROP",
"DYNAMIC",
"EACH",
"ELEMENT",
"ELSE",
"END",
"END_FRAME",
"END_PARTITION",
"END-EXEC",
"EQUALS",
"ESCAPE",
"EVERY",
"EXCEPT",
"EXEC",
"EXECUTE",
"EXISTS",
"EXP",
"EXTERNAL",
"EXTRACT",
"FALSE",
"FETCH",
"FILTER",
"FIRST_VALUE",
"FLOAT",
"FLOOR",
"FOR",
"FOREIGN",
"FRAME_ROW",
"FREE",
"FROM",
"FULL",
"FUNCTION",
"FUSION",
"GET",
"GLOBAL",
"GRANT",
"GROUP",
"GROUPING",
"GROUPS",
"HAVING",
"HOLD",
"HOUR",
"IDENTITY",
"IN",
"INDICATOR",
"INNER",
"INOUT",
"INSENSITIVE",
"INSERT",
"INT",
"INTEGER",
"INTERSECT",
"INTERSECTION",
"INTERVAL",
"INTO",
"IS",
"JOIN",
"LAG",
"LANGUAGE",
"LARGE",
"LAST_VALUE",
"LATERAL",
"LEAD",
"LEADING",
"LEFT",
"LIKE",
"LIKE_REGEX",
"LN",
"LOCAL",
"LOCALTIME",
"LOCALTIMESTAMP",
"LOWER",
"MATCH",
"MAX",
"MEMBER",
"MERGE",
"METHOD",
"MIN",
"MINUTE",
"MOD",
"MODIFIES",
"MODULE",
"MONTH",
"MULTISET",
"NATIONAL",
"NATURAL",
"NCHAR",
"NCLOB",
"NEW",
"NO",
"NONE",
"NORMALIZE",
"NOT",
"NTH_VALUE",
"NTILE",
"NULL",
"NULLIF",
"NUMERIC",
"OCTET_LENGTH",
"OCCURRENCES_REGEX",
"OF",
"OFFSET",
"OLD",
"ON",
"ONLY",
"OPEN",
"OR",
"ORDER",
"OUT",
"OUTER",
"OVER",
"OVERLAPS",
"OVERLAY",
"PARAMETER",
"PARTITION",
"PERCENT",
"PERCENT_RANK",
"PERCENTILE_CONT",
"PERCENTILE_DISC",
"PERIOD",
"PORTION",
"POSITION",
"POSITION_REGEX",
"POWER",
"PRECEDES",
"PRECISION",
"PREPARE",
"PRIMARY",
"PROCEDURE",
"RANGE",
"RANK",
"READS",
"REAL",
"RECURSIVE",
"REF",
"REFERENCES",
"REFERENCING",
"REGR_AVGX",
"REGR_AVGY",
"REGR_COUNT",
"REGR_INTERCEPT",
"REGR_R2",
"REGR_SLOPE",
"REGR_SXX",
"REGR_SXY",
"REGR_SYY",
"RELEASE",
"RESULT",
"RETURN",
"RETURNS",
"REVOKE",
"RIGHT",
"ROLLBACK",
"ROLLUP",
"ROW",
"ROW_NUMBER",
"ROWS",
"SAVEPOINT",
"SCOPE",
"SCROLL",
"SEARCH",
"SECOND",
"SELECT",
"SENSITIVE",
"SESSION_USER",
"SET",
"SIMILAR",
"SMALLINT",
"SOME",
"SPECIFIC",
"SPECIFICTYPE",
"SQL",
"SQLEXCEPTION",
"SQLSTATE",
"SQLWARNING",
"SQRT",
"START",
"STATIC",
"STDDEV_POP",
"STDDEV_SAMP",
"SUBMULTISET",
"SUBSTRING",
"SUBSTRING_REGEX",
"SUCCEEDS",
"SUM",
"SYMMETRIC",
"SYSTEM",
"SYSTEM_TIME",
"SYSTEM_USER",
"TABLE",
"TABLESAMPLE",
"THEN",
"TIME",
"TIMESTAMP",
"TIMEZONE_HOUR",
"TIMEZONE_MINUTE",
"TO",
"TRAILING",
"TRANSLATE",
"TRANSLATE_REGEX",
"TRANSLATION",
"TREAT",
"TRIGGER",
"TRUNCATE",
"TRIM",
"TRIM_ARRAY",
"TRUE",
"UESCAPE",
"UNION",
"UNIQUE",
"UNKNOWN",
"UNNEST",
"UPDATE",
"UPPER",
"USER",
"USING",
"VALUE",
"VALUES",
"VALUE_OF",
"VAR_POP",
"VAR_SAMP",
"VARBINARY",
"VARCHAR",
"VARYING",
"VERSIONING",
"WHEN",
"WHENEVER",
"WHERE",
"WIDTH_BUCKET",
"WINDOW",
"WITH",
"WITHIN",
"WITHOUT",
"YEAR",
];
}
fn is_identifier_start(&self, ch: char) -> bool {
(ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
}
fn is_identifier_part(&self, ch: char) -> bool {
(ch >= 'a' && ch <= 'z')
|| (ch >= 'A' && ch <= 'Z')
|| (ch >= '0' && ch <= '9')
|| ch == '_'
}
}
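
The three methods above are the whole dialect surface: a keyword list plus the two identifier predicates. As a sketch of how a downstream user could plug in their own rules (a hypothetical dialect, not part of this commit; assumes the Dialect trait above is in scope):

pub struct HashPrefixDialect {}

impl Dialect for HashPrefixDialect {
    // A deliberately tiny keyword list, just for illustration.
    fn keywords(&self) -> Vec<&'static str> {
        vec!["SELECT", "FROM", "WHERE"]
    }

    // Additionally allow '#' to start an identifier
    // (e.g. temp-table style names).
    fn is_identifier_start(&self, ch: char) -> bool {
        (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '#'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        self.is_identifier_start(ch) || (ch >= '0' && ch <= '9') || ch == '_'
    }
}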
@@ -107,7 +406,6 @@ impl Dialect for GenericSqlDialect {
"WITHOUT",
"HEADER",
"ROW",
// SQL types
"CHAR",
"CHARACTER",
@@ -134,8 +432,18 @@ impl Dialect for GenericSqlDialect {
"DATE",
"TIME",
"TIMESTAMP",
];
}
}
fn is_identifier_start(&self, ch: char) -> bool {
(ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '@'
}
fn is_identifier_part(&self, ch: char) -> bool {
(ch >= 'a' && ch <= 'z')
|| (ch >= 'A' && ch <= 'Z')
|| (ch >= '0' && ch <= '9')
|| ch == '@'
|| ch == '_'
}
}

View file

@@ -619,8 +619,8 @@ impl Parser {
#[cfg(test)]
mod tests {
use super::*;
use super::super::dialect::GenericSqlDialect;
use super::*;
#[test]
fn parse_delete_statement() {
@@ -952,7 +952,7 @@ mod tests {
fn parse_sql(sql: &str) -> ASTNode {
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect,&sql, );
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let mut parser = Parser::new(tokens);
let ast = parser.parse().unwrap();

View file

@@ -13,6 +13,10 @@
// limitations under the License.
//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
use std::iter::Peekable;
use std::str::Chars;
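
The doc comment above describes a two-stage pipeline; a minimal sketch of it end to end (crate name and module paths assumed from the imports in this commit's tests):

use sqlparser::dialect::GenericSqlDialect;
use sqlparser::sqlparser::Parser;
use sqlparser::sqltokenizer::Tokenizer;

fn main() {
    let dialect = GenericSqlDialect {};
    let sql = "SELECT a FROM t WHERE a > 1";

    // Stage 1: string -> tokens
    let mut tokenizer = Tokenizer::new(&dialect, sql);
    let tokens = tokenizer.tokenize().unwrap();

    // Stage 2: tokens -> AST
    let mut parser = Parser::new(tokens);
    let ast = parser.parse().unwrap();
    println!("AST: {:?}", ast);
}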
@@ -69,18 +73,18 @@ pub enum Token {
pub struct TokenizerError(String);
/// SQL Tokenizer
pub struct Tokenizer {
keywords: Vec<&'static str>,
pub struct Tokenizer<'a> {
dialect: &'a Dialect,
pub query: String,
pub line: u64,
pub col: u64,
}
impl Tokenizer {
impl<'a> Tokenizer<'a> {
/// Create a new SQL tokenizer for the specified SQL statement
pub fn new(dialect: &Dialect, query: &str) -> Self {
pub fn new(dialect: &'a Dialect, query: &str) -> Self {
Self {
keywords: dialect.keywords(),
dialect,
query: query.to_string(),
line: 1,
col: 1,
@@ -91,8 +95,7 @@ impl Tokenizer {
//TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
// not fast but I want the simplicity for now while I experiment with pluggable
// dialects
return self.keywords.contains(&s);
return self.dialect.keywords().contains(&s);
}
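
One way to act on the TODO above without giving up pluggable dialects, sketched here with std's HashSet rather than FnvHashSet (not part of this commit): collect the keywords once in the constructor, so is_keyword stops rebuilding the Vec on every call.

use std::collections::HashSet;

pub struct Tokenizer<'a> {
    dialect: &'a Dialect,
    keywords: HashSet<&'static str>,
    pub query: String,
    pub line: u64,
    pub col: u64,
}

impl<'a> Tokenizer<'a> {
    pub fn new(dialect: &'a Dialect, query: &str) -> Self {
        Self {
            dialect,
            // Built once here; is_keyword then does an O(1) lookup.
            keywords: dialect.keywords().into_iter().collect(),
            query: query.to_string(),
            line: 1,
            col: 1,
        }
    }

    fn is_keyword(&self, s: &str) -> bool {
        self.keywords.contains(s)
    }
}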
/// Tokenize the statement and produce a vector of tokens
@@ -138,15 +141,16 @@ impl Tokenizer {
Ok(Some(Token::Whitespace(ch)))
}
// identifier or keyword
'a'...'z' | 'A'...'Z' | '_' | '@' => {
ch if self.dialect.is_identifier_start(ch) => {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
match ch {
'a'...'z' | 'A'...'Z' | '_' | '0'...'9' | '@' => {
chars.next(); // consume
s.push(ch);
}
_ => break,
while let Some(&ch) = chars.peek() {
if self.dialect.is_identifier_part(ch) {
chars.next(); // consume
s.push(ch);
} else {
break;
}
}
let upper_str = s.to_uppercase();
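
The loop above is where the delegation pays off: identifier boundaries now differ per dialect with no tokenizer changes. For example, GenericSqlDialect accepts '@' in identifiers while AnsiSqlDialect does not, so the same input lexes differently (a sketch; exact token variants as used in the tests below):

let sql = String::from("SELECT @version");

// Under GenericSqlDialect, @version lexes as a single identifier token.
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();

// Under AnsiSqlDialect, '@' is not a valid identifier start, so the
// tokenizer would instead take its unrecognized-character path.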
@@ -293,14 +297,14 @@ impl Tokenizer {
#[cfg(test)]
mod tests {
use super::super::dialect::GenericSqlDialect;
use super::*;
use super::super::dialect::{GenericSqlDialect};
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -314,8 +318,8 @@ mod tests {
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -332,8 +336,8 @@ mod tests {
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -355,8 +359,8 @@ mod tests {
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
@@ -377,8 +381,8 @@ mod tests {
fn tokenize_invalid_string() {
let sql = String::from("\nمصطفىh");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize();
match tokens {
@@ -396,8 +400,8 @@ mod tests {
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize();
match tokens {
Err(e) => assert_eq!(
@@ -413,8 +417,8 @@ mod tests {
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let dialect = GenericSqlDialect{};
let mut tokenizer = Tokenizer::new(&dialect,&sql);
let dialect = GenericSqlDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![