mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-07-07 17:04:59 +00:00
Introduce concept of dialects
This commit is contained in:
parent
cc725791de
commit
06a8870bd7
7 changed files with 184 additions and 92 deletions
|
@ -18,5 +18,3 @@ name = "sqlparser"
|
|||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
fnv = "1.0.3"
|
||||
lazy_static = "1.0"
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
extern crate sqlparser;
|
||||
|
||||
use sqlparser::dialect::AnsiSqlDialect;
|
||||
use sqlparser::sqlparser::*;
|
||||
|
||||
fn main() {
|
||||
|
@ -8,7 +9,9 @@ fn main() {
|
|||
WHERE a > b AND b < 100 \
|
||||
ORDER BY a DESC, b";
|
||||
|
||||
let ast = Parser::parse_sql(sql.to_string()).unwrap();
|
||||
let dialect = AnsiSqlDialect{};
|
||||
|
||||
let ast = Parser::parse_sql(&dialect,sql.to_string()).unwrap();
|
||||
|
||||
println!("AST: {:?}", ast);
|
||||
}
|
||||
|
|
Binary file not shown.
141
src/dialect.rs
Normal file
141
src/dialect.rs
Normal file
|
@ -0,0 +1,141 @@
|
|||
// Copyright 2018 Grove Enterprises LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//! Support for custom dialects
|
||||
|
||||
/// A SQL dialect, characterized by the set of words it reserves as keywords.
///
/// The tokenizer asks the dialect for its keyword list when deciding whether
/// an identifier-shaped token should be classified as a keyword or as a
/// plain identifier.
pub trait Dialect {
    /// Get a list of keywords for this dialect.
    ///
    /// Keywords are returned in uppercase; callers are expected to uppercase
    /// candidate words before matching against this list.
    fn keywords(&self) -> Vec<&'static str>;
}

/// Dialect reserving the full ANSI SQL keyword list.
pub struct AnsiSqlDialect {}

impl Dialect for AnsiSqlDialect {
    fn keywords(&self) -> Vec<&'static str> {
        // NOTE(review): list appears to follow the ANSI/ISO SQL reserved-word
        // list — confirm against the standard before relying on completeness.
        vec![
            "ABS", "ALL", "ALLOCATE", "ALTER", "AND", "ANY", "ARE", "ARRAY", "ARRAY_AGG",
            "ARRAY_MAX_CARDINALITY", "AS", "ASENSITIVE", "ASYMMETRIC", "AT", "ATOMIC",
            "AUTHORIZATION", "AVG", "BEGIN", "BEGIN_FRAME", "BEGIN_PARTITION", "BETWEEN",
            "BIGINT", "BINARY", "BLOB", "BOOLEAN", "BOTH", "BY", "CALL", "CALLED",
            "CARDINALITY", "CASCADED", "CASE", "CAST", "CEIL", "CEILING", "CHAR",
            "CHAR_LENGTH", "CHARACTER", "CHARACTER_LENGTH", "CHECK", "CLOB", "CLOSE",
            "COALESCE", "COLLATE", "COLLECT", "COLUMN", "COMMIT", "CONDITION", "CONNECT",
            "CONSTRAINT", "CONTAINS", "CONVERT", "CORR", "CORRESPONDING", "COUNT",
            "COVAR_POP", "COVAR_SAMP", "CREATE", "CROSS", "CUBE", "CUME_DIST", "CURRENT",
            "CURRENT_CATALOG", "CURRENT_DATE", "CURRENT_DEFAULT_TRANSFORM_GROUP",
            "CURRENT_PATH", "CURRENT_ROLE", "CURRENT_ROW", "CURRENT_SCHEMA", "CURRENT_TIME",
            "CURRENT_TIMESTAMP", "CURRENT_TRANSFORM_GROUP_FOR_TYPE", "CURRENT_USER",
            "CURSOR", "CYCLE", "DATE", "DAY", "DEALLOCATE", "DEC", "DECIMAL", "DECLARE",
            "DEFAULT", "DELETE", "DENSE_RANK", "DEREF", "DESCRIBE", "DETERMINISTIC",
            "DISCONNECT", "DISTINCT", "DOUBLE", "DROP", "DYNAMIC", "EACH", "ELEMENT",
            "ELSE", "END", "END_FRAME", "END_PARTITION", "END-EXEC", "EQUALS", "ESCAPE",
            "EVERY", "EXCEPT", "EXEC", "EXECUTE", "EXISTS", "EXP", "EXTERNAL", "EXTRACT",
            "FALSE", "FETCH", "FILTER", "FIRST_VALUE", "FLOAT", "FLOOR", "FOR", "FOREIGN",
            "FRAME_ROW", "FREE", "FROM", "FULL", "FUNCTION", "FUSION", "GET", "GLOBAL",
            "GRANT", "GROUP", "GROUPING", "GROUPS", "HAVING", "HOLD", "HOUR", "IDENTITY",
            "IN", "INDICATOR", "INNER", "INOUT", "INSENSITIVE", "INSERT", "INT", "INTEGER",
            "INTERSECT", "INTERSECTION", "INTERVAL", "INTO", "IS", "JOIN", "LAG",
            "LANGUAGE", "LARGE", "LAST_VALUE", "LATERAL", "LEAD", "LEADING", "LEFT",
            "LIKE", "LIKE_REGEX", "LN", "LOCAL", "LOCALTIME", "LOCALTIMESTAMP", "LOWER",
            "MATCH", "MAX", "MEMBER", "MERGE", "METHOD", "MIN", "MINUTE", "MOD",
            "MODIFIES", "MODULE", "MONTH", "MULTISET", "NATIONAL", "NATURAL", "NCHAR",
            "NCLOB", "NEW", "NO", "NONE", "NORMALIZE", "NOT", "NTH_VALUE", "NTILE",
            "NULL", "NULLIF", "NUMERIC", "OCTET_LENGTH", "OCCURRENCES_REGEX", "OF",
            "OFFSET", "OLD", "ON", "ONLY", "OPEN", "OR", "ORDER", "OUT", "OUTER", "OVER",
            "OVERLAPS", "OVERLAY", "PARAMETER", "PARTITION", "PERCENT", "PERCENT_RANK",
            "PERCENTILE_CONT", "PERCENTILE_DISC", "PERIOD", "PORTION", "POSITION",
            "POSITION_REGEX", "POWER", "PRECEDES", "PRECISION", "PREPARE", "PRIMARY",
            "PROCEDURE", "RANGE", "RANK", "READS", "REAL", "RECURSIVE", "REF",
            "REFERENCES", "REFERENCING", "REGR_AVGX", "REGR_AVGY", "REGR_COUNT",
            "REGR_INTERCEPT", "REGR_R2", "REGR_SLOPE", "REGR_SXX", "REGR_SXY", "REGR_SYY",
            "RELEASE", "RESULT", "RETURN", "RETURNS", "REVOKE", "RIGHT", "ROLLBACK",
            "ROLLUP", "ROW", "ROW_NUMBER", "ROWS", "SAVEPOINT", "SCOPE", "SCROLL",
            "SEARCH", "SECOND", "SELECT", "SENSITIVE", "SESSION_USER", "SET", "SIMILAR",
            "SMALLINT", "SOME", "SPECIFIC", "SPECIFICTYPE", "SQL", "SQLEXCEPTION",
            "SQLSTATE", "SQLWARNING", "SQRT", "START", "STATIC", "STDDEV_POP",
            "STDDEV_SAMP", "SUBMULTISET", "SUBSTRING", "SUBSTRING_REGEX", "SUCCEEDS",
            "SUM", "SYMMETRIC", "SYSTEM", "SYSTEM_TIME", "SYSTEM_USER", "TABLE",
            "TABLESAMPLE", "THEN", "TIME", "TIMESTAMP", "TIMEZONE_HOUR",
            "TIMEZONE_MINUTE", "TO", "TRAILING", "TRANSLATE", "TRANSLATE_REGEX",
            "TRANSLATION", "TREAT", "TRIGGER", "TRUNCATE", "TRIM", "TRIM_ARRAY", "TRUE",
            "UESCAPE", "UNION", "UNIQUE", "UNKNOWN", "UNNEST", "UPDATE", "UPPER", "USER",
            "USING", "VALUE", "VALUES", "VALUE_OF", "VAR_POP", "VAR_SAMP", "VARBINARY",
            "VARCHAR", "VARYING", "VERSIONING", "WHEN", "WHENEVER", "WHERE",
            "WIDTH_BUCKET", "WINDOW", "WITH", "WITHIN", "WITHOUT", "YEAR",
        ]
    }
}
|
||||
|
||||
pub struct GenericSqlDialect {}
|
||||
|
||||
impl Dialect for GenericSqlDialect {
|
||||
fn keywords(&self) -> Vec<&'static str> {
|
||||
return vec![
|
||||
"SELECT",
|
||||
"FROM",
|
||||
"WHERE",
|
||||
"LIMIT",
|
||||
"ORDER",
|
||||
"GROUP",
|
||||
"BY",
|
||||
"HAVING",
|
||||
"UNION",
|
||||
"ALL",
|
||||
"INSERT",
|
||||
"UPDATE",
|
||||
"DELETE",
|
||||
"IN",
|
||||
"IS",
|
||||
"NULL",
|
||||
"SET",
|
||||
"CREATE",
|
||||
"EXTERNAL",
|
||||
"TABLE",
|
||||
"ASC",
|
||||
"DESC",
|
||||
"AND",
|
||||
"OR",
|
||||
"NOT",
|
||||
"AS",
|
||||
"STORED",
|
||||
"CSV",
|
||||
"PARQUET",
|
||||
"LOCATION",
|
||||
"WITH",
|
||||
"WITHOUT",
|
||||
"HEADER",
|
||||
"ROW",
|
||||
|
||||
// SQL types
|
||||
"CHAR",
|
||||
"CHARACTER",
|
||||
"VARYING",
|
||||
"LARGE",
|
||||
"OBJECT",
|
||||
"VARCHAR",
|
||||
"CLOB",
|
||||
"BINARY",
|
||||
"VARBINARY",
|
||||
"BLOB",
|
||||
"FLOAT",
|
||||
"REAL",
|
||||
"DOUBLE",
|
||||
"PRECISION",
|
||||
"INT",
|
||||
"INTEGER",
|
||||
"SMALLINT",
|
||||
"BIGINT",
|
||||
"NUMERIC",
|
||||
"DECIMAL",
|
||||
"DEC",
|
||||
"BOOLEAN",
|
||||
"DATE",
|
||||
"TIME",
|
||||
"TIMESTAMP",
|
||||
|
||||
];
|
||||
}
|
||||
}
|
||||
|
11
src/lib.rs
11
src/lib.rs
|
@ -20,23 +20,22 @@
|
|||
//! Syntax Tree (AST).
|
||||
//!
|
||||
//! ```
|
||||
//! use sqlparser::dialect::GenericSqlDialect;
|
||||
//! use sqlparser::sqlparser::Parser;
|
||||
//!
|
||||
//! let dialect = GenericSqlDialect {}; // or AnsiSqlDialect
|
||||
//!
|
||||
//! let sql = "SELECT a, b, 123, myfunc(b) \
|
||||
//! FROM table_1 \
|
||||
//! WHERE a > b AND b < 100 \
|
||||
//! ORDER BY a DESC, b";
|
||||
//!
|
||||
//! let ast = Parser::parse_sql(sql.to_string()).unwrap();
|
||||
//! let ast = Parser::parse_sql(&dialect, sql.to_string()).unwrap();
|
||||
//!
|
||||
//! println!("AST: {:?}", ast);
|
||||
//! ```
|
||||
|
||||
extern crate fnv;
|
||||
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
pub mod dialect;
|
||||
pub mod sqlast;
|
||||
pub mod sqlparser;
|
||||
pub mod sqltokenizer;
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
//! SQL Parser
|
||||
|
||||
use super::dialect::Dialect;
|
||||
use super::sqlast::*;
|
||||
use super::sqltokenizer::*;
|
||||
|
||||
|
@ -51,8 +52,8 @@ impl Parser {
|
|||
}
|
||||
|
||||
/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
|
||||
pub fn parse_sql(sql: String) -> Result<ASTNode, ParserError> {
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
pub fn parse_sql(dialect: &Dialect, sql: String) -> Result<ASTNode, ParserError> {
|
||||
let mut tokenizer = Tokenizer::new(dialect, &sql);
|
||||
let tokens = tokenizer.tokenize()?;
|
||||
let mut parser = Parser::new(tokens);
|
||||
parser.parse()
|
||||
|
@ -619,6 +620,7 @@ impl Parser {
|
|||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use super::super::dialect::GenericSqlDialect;
|
||||
|
||||
#[test]
|
||||
fn parse_delete_statement() {
|
||||
|
@ -949,7 +951,8 @@ mod tests {
|
|||
}
|
||||
|
||||
fn parse_sql(sql: &str) -> ASTNode {
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect {};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql, );
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
let mut parser = Parser::new(tokens);
|
||||
let ast = parser.parse().unwrap();
|
||||
|
|
|
@ -17,7 +17,7 @@
|
|||
use std::iter::Peekable;
|
||||
use std::str::Chars;
|
||||
|
||||
use fnv::FnvHashSet;
|
||||
use super::dialect::Dialect;
|
||||
|
||||
/// SQL Token enumeration
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
|
@ -68,78 +68,9 @@ pub enum Token {
|
|||
#[derive(Debug, PartialEq)]
|
||||
pub struct TokenizerError(String);
|
||||
|
||||
lazy_static! {
|
||||
static ref KEYWORDS: FnvHashSet<&'static str> = {
|
||||
let mut m = FnvHashSet::default();
|
||||
|
||||
m.insert("SELECT");
|
||||
m.insert("FROM");
|
||||
m.insert("WHERE");
|
||||
m.insert("LIMIT");
|
||||
m.insert("ORDER");
|
||||
m.insert("GROUP");
|
||||
m.insert("BY");
|
||||
m.insert("HAVING");
|
||||
m.insert("UNION");
|
||||
m.insert("ALL");
|
||||
m.insert("INSERT");
|
||||
m.insert("UPDATE");
|
||||
m.insert("DELETE");
|
||||
m.insert("IN");
|
||||
m.insert("IS");
|
||||
m.insert("NULL");
|
||||
m.insert("SET");
|
||||
m.insert("CREATE");
|
||||
m.insert("EXTERNAL");
|
||||
m.insert("TABLE");
|
||||
m.insert("ASC");
|
||||
m.insert("DESC");
|
||||
m.insert("AND");
|
||||
m.insert("OR");
|
||||
m.insert("NOT");
|
||||
m.insert("AS");
|
||||
m.insert("STORED");
|
||||
m.insert("CSV");
|
||||
m.insert("PARQUET");
|
||||
m.insert("LOCATION");
|
||||
m.insert("WITH");
|
||||
m.insert("WITHOUT");
|
||||
m.insert("HEADER");
|
||||
m.insert("ROW");
|
||||
|
||||
// SQL types
|
||||
m.insert("CHAR");
|
||||
m.insert("CHARACTER");
|
||||
m.insert("VARYING");
|
||||
m.insert("LARGE");
|
||||
m.insert("OBJECT");
|
||||
m.insert("VARCHAR");
|
||||
m.insert("CLOB");
|
||||
m.insert("BINARY");
|
||||
m.insert("VARBINARY");
|
||||
m.insert("BLOB");
|
||||
m.insert("FLOAT");
|
||||
m.insert("REAL");
|
||||
m.insert("DOUBLE");
|
||||
m.insert("PRECISION");
|
||||
m.insert("INT");
|
||||
m.insert("INTEGER");
|
||||
m.insert("SMALLINT");
|
||||
m.insert("BIGINT");
|
||||
m.insert("NUMERIC");
|
||||
m.insert("DECIMAL");
|
||||
m.insert("DEC");
|
||||
m.insert("BOOLEAN");
|
||||
m.insert("DATE");
|
||||
m.insert("TIME");
|
||||
m.insert("TIMESTAMP");
|
||||
|
||||
m
|
||||
};
|
||||
}
|
||||
|
||||
/// SQL Tokenizer
|
||||
pub struct Tokenizer {
|
||||
keywords: Vec<&'static str>,
|
||||
pub query: String,
|
||||
pub line: u64,
|
||||
pub col: u64,
|
||||
|
@ -147,14 +78,23 @@ pub struct Tokenizer {
|
|||
|
||||
impl Tokenizer {
|
||||
/// Create a new SQL tokenizer for the specified SQL statement
|
||||
pub fn new(query: &str) -> Self {
|
||||
pub fn new(dialect: &Dialect, query: &str) -> Self {
|
||||
Self {
|
||||
keywords: dialect.keywords(),
|
||||
query: query.to_string(),
|
||||
line: 1,
|
||||
col: 1,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_keyword(&self, s: &str) -> bool {
|
||||
//TODO: need to reintroduce FnvHashSet at some point .. iterating over keywords is
|
||||
// not fast but I want the simplicity for now while I experiment with pluggable
|
||||
// dialects
|
||||
return self.keywords.contains(&s);
|
||||
|
||||
}
|
||||
|
||||
/// Tokenize the statement and produce a vector of tokens
|
||||
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
|
||||
let mut peekable = self.query.chars().peekable();
|
||||
|
@ -210,7 +150,7 @@ impl Tokenizer {
|
|||
}
|
||||
}
|
||||
let upper_str = s.to_uppercase();
|
||||
if KEYWORDS.contains(upper_str.as_str()) {
|
||||
if self.is_keyword(upper_str.as_str()) {
|
||||
Ok(Some(Token::Keyword(upper_str)))
|
||||
} else {
|
||||
Ok(Some(Token::Identifier(s)))
|
||||
|
@ -354,11 +294,13 @@ impl Tokenizer {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use super::super::dialect::{GenericSqlDialect};
|
||||
|
||||
#[test]
|
||||
fn tokenize_select_1() {
|
||||
let sql = String::from("SELECT 1");
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
|
@ -372,7 +314,8 @@ mod tests {
|
|||
#[test]
|
||||
fn tokenize_scalar_function() {
|
||||
let sql = String::from("SELECT sqrt(1)");
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
|
@ -389,7 +332,8 @@ mod tests {
|
|||
#[test]
|
||||
fn tokenize_simple_select() {
|
||||
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
|
@ -411,7 +355,8 @@ mod tests {
|
|||
#[test]
|
||||
fn tokenize_string_predicate() {
|
||||
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
|
@ -432,7 +377,8 @@ mod tests {
|
|||
fn tokenize_invalid_string() {
|
||||
let sql = String::from("\nمصطفىh");
|
||||
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize();
|
||||
|
||||
match tokens {
|
||||
|
@ -450,7 +396,8 @@ mod tests {
|
|||
fn tokenize_invalid_string_cols() {
|
||||
let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
|
||||
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize();
|
||||
match tokens {
|
||||
Err(e) => assert_eq!(
|
||||
|
@ -466,7 +413,8 @@ mod tests {
|
|||
#[test]
|
||||
fn tokenize_is_null() {
|
||||
let sql = String::from("a IS NULL");
|
||||
let mut tokenizer = Tokenizer::new(&sql);
|
||||
let dialect = GenericSqlDialect{};
|
||||
let mut tokenizer = Tokenizer::new(&dialect,&sql);
|
||||
let tokens = tokenizer.tokenize().unwrap();
|
||||
|
||||
let expected = vec![
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue