replace with code from datafusion

Andy Grove 2018-09-03 09:56:39 -06:00
parent a86bd30515
commit 0c23392adb
14 changed files with 1762 additions and 595 deletions


@@ -1,3 +0,0 @@
pub mod tokenizer;
pub mod parser;


@@ -1,70 +0,0 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
//use std::rc::Rc;
//use std::sync::{Arc, Mutex};
use super::tokenizer::ANSISQLTokenizer;
use super::super::tokenizer::*;
use super::super::parser::*;
pub struct ANSISQLParser {
tokenizer: Box<SQLTokenizer>
}
impl ANSISQLParser {
pub fn parse(sql: &str) -> Result<Option<Box<SQLExpr>>, ParserError> {
let mut parser = ANSISQLParser { tokenizer: Box::new(ANSISQLTokenizer::new(sql)) };
parser.parse_expr()
}
}
impl SQLParser for ANSISQLParser {
fn parse_expr(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError> {
let precedence: usize = 0;
let mut e = self.parse_prefix()?;
match e {
Some(mut expr) => {
while let Some(token) = self.tokenizer.peek_token()? {
let next_precedence = self.tokenizer.precedence(&token);
if precedence >= next_precedence {
break;
}
expr = self.parse_infix(&expr, next_precedence)?.unwrap(); //TODO: fix me
}
Ok(Some(expr))
}
_ => {
Ok(None)
}
}
}
fn parse_prefix(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError> {
match self.tokenizer.next_token()? {
Some(SQLToken::Keyword(ref k)) => match k.to_uppercase().as_ref() {
"INSERT" => unimplemented!(),
"UPDATE" => unimplemented!(),
"DELETE" => unimplemented!(),
"SELECT" => unimplemented!(),
"CREATE" => unimplemented!(),
_ => unimplemented!()
},
_ => unimplemented!()
}
}
fn parse_infix(&mut self, _left: &SQLExpr, _precedence: usize) -> Result<Option<Box<SQLExpr>>, ParserError> {
unimplemented!()
}
}


@@ -1,56 +0,0 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use super::super::tokenizer::*;
pub struct ANSISQLTokenizer {
chars: CharSeq
}
impl ANSISQLTokenizer {
pub fn new(sql: &str) -> Self {
ANSISQLTokenizer { chars: CharSeq::new(sql) }
}
}
impl SQLTokenizer for ANSISQLTokenizer {
fn precedence(&self, _token: &SQLToken) -> usize {
unimplemented!()
}
fn peek_token(&mut self) -> Result<Option<SQLToken>, TokenizerError> {
unimplemented!()
}
fn next_token(&mut self) -> Result<Option<SQLToken>, TokenizerError> {
match self.chars.next() {
Some(ch) => match ch {
' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))),
'0' ... '9' => {
let mut s = String::new();
s.push(ch);
while let Some(&ch) = self.chars.peek() {
match ch {
'0' ... '9' => {
self.chars.next(); // consume
s.push(ch);
},
_ => break
}
}
Ok(Some(SQLToken::Literal(s)))
},
'+' => Ok(Some(SQLToken::Plus)),
'-' => Ok(Some(SQLToken::Minus)),
'*' => Ok(Some(SQLToken::Mult)),
'/' => Ok(Some(SQLToken::Divide)),
_ => Err(TokenizerError::UnexpectedChar(ch, Position::new(0, 0)))
},
None => Ok(None)
}
}
}


@@ -1,3 +1,22 @@
pub mod ansi;
pub mod tokenizer;
pub mod parser;
// Copyright 2018 Grove Enterprises LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
extern crate fnv;
#[macro_use]
extern crate lazy_static;
pub mod sqlast;
pub mod sqlparser;
pub mod sqltokenizer;


@@ -1,106 +0,0 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use super::tokenizer::*;
// https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html
/// ANSI SQL:2011 Data Types
#[derive(Debug)]
pub enum SQLDataType {
/// BOOLEAN
Boolean,
/// NUMERIC, DECIMAL, DEC
Numeric { precision: usize, scale: Option<usize> },
/// SMALLINT
SmallInt,
/// INT, INTEGER
Int,
/// BIGINT
BigInt,
/// Floating point: `FLOAT(precision)`
Float(usize),
/// REAL
Real,
/// Double: `DOUBLE PRECISION`
Double,
/// Fixed-length character. `CHAR, CHARACTER`
Char(usize),
/// Variable-length character: `VARCHAR, CHARACTER VARYING, CHAR VARYING`
VarChar(usize),
/// Character Large Object: `CHARACTER LARGE OBJECT, CHAR LARGE OBJECT, CLOB`
Clob(usize),
/// Fixed-length character. `NCHAR, NATIONAL CHAR, NATIONAL CHARACTER`
NChar(usize),
/// Variable-length character: `NCHAR VARYING, NATIONAL CHARACTER VARYING, NATIONAL CHAR VARYING`
NVarChar(usize),
/// National Character Large Object: `NATIONAL CHARACTER LARGE OBJECT, NCHAR LARGE OBJECT, NCLOB`
NClob(usize),
/// Fixed-length binary
Binary(usize),
/// Variable-length binary
VarBinary(usize),
/// Binary large object
Blob(usize),
/// Date
Date,
/// Time: `TIME [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]`
Time { precision: usize, tz: bool },
/// Time: `TIMESTAMP [(precision)] [WITH TIME ZONE | WITHOUT TIME ZONE]`
Timestamp { precision: usize, tz: bool },
}
#[derive(Debug)]
pub enum SQLOperator {
Plus,
Minus,
Mult,
Div,
Eq,
Gt,
GtEq,
Lt,
LtEq,
}
/// SQL Expressions
#[derive(Debug)]
pub enum SQLExpr{
/// Identifier e.g. table name or column name
Identifier(String),
/// Literal value
Literal(String),
/// Binary expression e.g. `1 + 2` or `fname LIKE "A%"`
Binary(Box<SQLExpr>, SQLOperator, Box<SQLExpr>),
/// Function invocation with function name and list of argument expressions
FunctionCall(String, Vec<SQLExpr>),
Insert,
Update,
Delete,
Select,
CreateTable,
}
#[derive(Debug)]
pub enum ParserError {
WrongToken { expected: Vec<SQLToken>, actual: SQLToken, line: usize, col: usize },
Custom(String)
}
impl From<TokenizerError> for ParserError {
fn from(e: TokenizerError) -> Self {
ParserError::Custom(format!("{:?}", e))
}
}
pub trait SQLParser {
fn parse_expr(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError>;
/// parse the prefix and stop once an infix operator is reached
fn parse_prefix(&mut self) -> Result<Option<Box<SQLExpr>>, ParserError>;
/// parse the next infix expression, returning None if the precedence has changed
fn parse_infix(&mut self, left: &SQLExpr, precedence: usize) -> Result<Option<Box<SQLExpr>>, ParserError>;
}

src/sqlast.rs (new file, 122 lines)

@@ -0,0 +1,122 @@
// Copyright 2018 Grove Enterprises LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! SQL Abstract Syntax Tree (AST) types
/// Supported file types for `CREATE EXTERNAL TABLE`
#[derive(Debug, Clone, PartialEq)]
pub enum FileType {
CSV,
NdJson,
Parquet,
}
/// SQL Abstract Syntax Tree (AST)
#[derive(Debug, Clone, PartialEq)]
pub enum ASTNode {
SQLIdentifier(String),
SQLWildcard,
SQLCompoundIdentifier(Vec<String>),
SQLIsNull(Box<ASTNode>),
SQLIsNotNull(Box<ASTNode>),
SQLBinaryExpr {
left: Box<ASTNode>,
op: SQLOperator,
right: Box<ASTNode>,
},
SQLCast {
expr: Box<ASTNode>,
data_type: SQLType,
},
SQLNested(Box<ASTNode>),
SQLUnary {
operator: SQLOperator,
rex: Box<ASTNode>,
},
SQLLiteralLong(i64),
SQLLiteralDouble(f64),
SQLLiteralString(String),
SQLFunction {
id: String,
args: Vec<ASTNode>,
},
SQLOrderBy {
expr: Box<ASTNode>,
asc: bool,
},
SQLSelect {
projection: Vec<ASTNode>,
relation: Option<Box<ASTNode>>,
selection: Option<Box<ASTNode>>,
order_by: Option<Vec<ASTNode>>,
group_by: Option<Vec<ASTNode>>,
having: Option<Box<ASTNode>>,
limit: Option<Box<ASTNode>>,
},
SQLCreateTable {
/// Table name
name: String,
/// Optional schema
columns: Vec<SQLColumnDef>,
/// File type (CSV or Parquet)
file_type: FileType,
/// For CSV files, indicate whether the file has a header row or not
header_row: bool,
/// Path to file or directory containing files
location: String,
},
}
/// SQL column definition
#[derive(Debug, Clone, PartialEq)]
pub struct SQLColumnDef {
pub name: String,
pub data_type: SQLType,
pub allow_null: bool,
}
/// SQL datatypes for literals in SQL statements
#[derive(Debug, Clone, PartialEq)]
pub enum SQLType {
Boolean,
UInt8,
UInt16,
UInt32,
UInt64,
Int8,
Int16,
Int32,
Int64,
Float32,
Double64,
Utf8(usize),
}
/// SQL Operator
#[derive(Debug, PartialEq, Clone)]
pub enum SQLOperator {
Plus,
Minus,
Multiply,
Divide,
Modulus,
Gt,
Lt,
GtEq,
LtEq,
Eq,
NotEq,
And,
Or,
}

src/sqlparser.rs (new file, 971 lines)

@@ -0,0 +1,971 @@
// Copyright 2018 Grove Enterprises LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! SQL Parser
use super::sqlast::*;
use super::sqltokenizer::*;
#[derive(Debug, Clone)]
pub enum ParserError {
TokenizerError(String),
ParserError(String),
}
macro_rules! parser_err {
($MSG:expr) => {
Err(ParserError::ParserError($MSG.to_string()))
};
}
impl From<TokenizerError> for ParserError {
fn from(e: TokenizerError) -> Self {
ParserError::TokenizerError(format!("{:?}", e))
}
}
/// SQL Parser
pub struct Parser {
tokens: Vec<Token>,
index: usize,
}
impl Parser {
/// Parse the specified tokens
pub fn new(tokens: Vec<Token>) -> Self {
Parser {
tokens,
index: 0,
}
}
/// Parse a SQL statement and produce an Abstract Syntax Tree (AST)
pub fn parse_sql(sql: String) -> Result<ASTNode, ParserError> {
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize()?;
let mut parser = Parser::new(tokens);
parser.parse()
}
/// Parse a new expression
pub fn parse(&mut self) -> Result<ASTNode, ParserError> {
self.parse_expr(0)
}
/// Parse tokens until the precedence changes
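/// This is a standard precedence-climbing (Pratt) loop: parse a prefix
/// expression, then keep folding infix operators into it for as long as the
/// next token binds more tightly than the current precedence level.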
fn parse_expr(&mut self, precedence: u8) -> Result<ASTNode, ParserError> {
// println!("parse_expr() precendence = {}", precedence);
let mut expr = self.parse_prefix()?;
// println!("parsed prefix: {:?}", expr);
loop {
let next_precedence = self.get_next_precedence()?;
if precedence >= next_precedence {
// println!("break on precedence change ({} >= {})", precedence, next_precedence);
break;
}
if let Some(infix_expr) = self.parse_infix(expr.clone(), next_precedence)? {
// println!("parsed infix: {:?}", infix_expr);
expr = infix_expr;
}
}
// println!("parse_expr() returning {:?}", expr);
Ok(expr)
}
/// Parse an expression prefix
fn parse_prefix(&mut self) -> Result<ASTNode, ParserError> {
match self.next_token() {
Some(t) => {
match t {
Token::Keyword(k) => match k.to_uppercase().as_ref() {
"SELECT" => Ok(self.parse_select()?),
"CREATE" => Ok(self.parse_create()?),
_ => return parser_err!(format!("No prefix parser for keyword {}", k)),
},
Token::Mult => Ok(ASTNode::SQLWildcard),
Token::Identifier(id) => {
match self.peek_token() {
Some(Token::LParen) => {
self.next_token(); // skip lparen
match id.to_uppercase().as_ref() {
"CAST" => self.parse_cast_expression(),
_ => {
let args = self.parse_expr_list()?;
self.next_token(); // skip rparen
Ok(ASTNode::SQLFunction { id, args })
}
}
}
Some(Token::Period) => {
let mut id_parts: Vec<String> = vec![id];
while self.peek_token() == Some(Token::Period) {
self.consume_token(&Token::Period)?;
match self.next_token() {
Some(Token::Identifier(id)) => id_parts.push(id),
_ => {
return parser_err!(format!(
"Error parsing compound identifier"
))
}
}
}
Ok(ASTNode::SQLCompoundIdentifier(id_parts))
}
_ => Ok(ASTNode::SQLIdentifier(id)),
}
}
Token::Number(ref n) if n.contains(".") => match n.parse::<f64>() {
Ok(n) => Ok(ASTNode::SQLLiteralDouble(n)),
Err(e) => parser_err!(format!("Could not parse '{}' as f64: {}", n, e)),
},
Token::Number(ref n) => match n.parse::<i64>() {
Ok(n) => Ok(ASTNode::SQLLiteralLong(n)),
Err(e) => parser_err!(format!("Could not parse '{}' as i64: {}", n, e)),
},
Token::String(ref s) => Ok(ASTNode::SQLLiteralString(s.to_string())),
_ => parser_err!(format!(
"No prefix parser for token {:?}",
t
)),
}
}
None => parser_err!("Prefix parser expected a token but hit EOF"),
}
}
/// Parse a SQL CAST function e.g. `CAST(expr AS FLOAT)`
fn parse_cast_expression(&mut self) -> Result<ASTNode, ParserError> {
let expr = self.parse_expr(0)?;
self.consume_token(&Token::Keyword("AS".to_string()))?;
let data_type = self.parse_data_type()?;
self.consume_token(&Token::RParen)?;
Ok(ASTNode::SQLCast {
expr: Box::new(expr),
data_type,
})
}
/// Parse an expression infix (typically an operator)
fn parse_infix(
&mut self,
expr: ASTNode,
precedence: u8,
) -> Result<Option<ASTNode>, ParserError> {
match self.next_token() {
Some(tok) => match tok {
Token::Keyword(ref k) => if k == "IS" {
if self.parse_keywords(vec!["NULL"]) {
Ok(Some(ASTNode::SQLIsNull(Box::new(expr))))
} else if self.parse_keywords(vec!["NOT", "NULL"]) {
Ok(Some(ASTNode::SQLIsNotNull(Box::new(expr))))
} else {
parser_err!("Invalid tokens after IS")
}
} else {
Ok(Some(ASTNode::SQLBinaryExpr {
left: Box::new(expr),
op: self.to_sql_operator(&tok)?,
right: Box::new(self.parse_expr(precedence)?),
}))
},
Token::Eq
| Token::Neq
| Token::Gt
| Token::GtEq
| Token::Lt
| Token::LtEq
| Token::Plus
| Token::Minus
| Token::Mult
| Token::Mod
| Token::Div => Ok(Some(ASTNode::SQLBinaryExpr {
left: Box::new(expr),
op: self.to_sql_operator(&tok)?,
right: Box::new(self.parse_expr(precedence)?),
})),
_ => parser_err!(format!("No infix parser for token {:?}", tok)),
},
None => Ok(None),
}
}
/// Convert a token operator to an AST operator
fn to_sql_operator(&self, tok: &Token) -> Result<SQLOperator, ParserError> {
match tok {
&Token::Eq => Ok(SQLOperator::Eq),
&Token::Neq => Ok(SQLOperator::NotEq),
&Token::Lt => Ok(SQLOperator::Lt),
&Token::LtEq => Ok(SQLOperator::LtEq),
&Token::Gt => Ok(SQLOperator::Gt),
&Token::GtEq => Ok(SQLOperator::GtEq),
&Token::Plus => Ok(SQLOperator::Plus),
&Token::Minus => Ok(SQLOperator::Minus),
&Token::Mult => Ok(SQLOperator::Multiply),
&Token::Div => Ok(SQLOperator::Divide),
&Token::Mod => Ok(SQLOperator::Modulus),
&Token::Keyword(ref k) if k == "AND" => Ok(SQLOperator::And),
&Token::Keyword(ref k) if k == "OR" => Ok(SQLOperator::Or),
_ => parser_err!(format!("Unsupported SQL operator {:?}", tok)),
}
}
/// Get the precedence of the next token
fn get_next_precedence(&self) -> Result<u8, ParserError> {
if self.index < self.tokens.len() {
self.get_precedence(&self.tokens[self.index])
} else {
Ok(0)
}
}
/// Get the precedence of a token
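/// Higher values bind more tightly; returning 0 for any other token is what
/// terminates the parse_expr() loop.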
fn get_precedence(&self, tok: &Token) -> Result<u8, ParserError> {
//println!("get_precedence() {:?}", tok);
match tok {
&Token::Keyword(ref k) if k == "OR" => Ok(5),
&Token::Keyword(ref k) if k == "AND" => Ok(10),
&Token::Keyword(ref k) if k == "IS" => Ok(15),
&Token::Eq | &Token::Lt | &Token::LtEq | &Token::Neq | &Token::Gt | &Token::GtEq => {
Ok(20)
}
&Token::Plus | &Token::Minus => Ok(30),
&Token::Mult | &Token::Div | &Token::Mod => Ok(40),
_ => Ok(0),
}
}
/// Peek at the next token
fn peek_token(&mut self) -> Option<Token> {
if self.index < self.tokens.len() {
Some(self.tokens[self.index].clone())
} else {
None
}
}
/// Get the next token and increment the token index
fn next_token(&mut self) -> Option<Token> {
if self.index < self.tokens.len() {
self.index += 1;
Some(self.tokens[self.index - 1].clone())
} else {
None
}
}
/// Get the previous token and decrement the token index
fn prev_token(&mut self) -> Option<Token> {
if self.index > 0 {
Some(self.tokens[self.index - 1].clone())
} else {
None
}
}
/// Look for an expected keyword and consume it if it exists
fn parse_keyword(&mut self, expected: &'static str) -> bool {
match self.peek_token() {
Some(Token::Keyword(k)) => {
if expected.eq_ignore_ascii_case(k.as_str()) {
self.next_token();
true
} else {
false
}
}
_ => false,
}
}
/// Look for an expected sequence of keywords and consume them if they exist
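/// On failure the token index is restored, so this acts as a backtracking lookahead.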
fn parse_keywords(&mut self, keywords: Vec<&'static str>) -> bool {
let index = self.index;
for keyword in keywords {
//println!("parse_keywords aborting .. expecting {}", keyword);
if !self.parse_keyword(&keyword) {
//println!("parse_keywords aborting .. did not find {}", keyword);
// reset index and return immediately
self.index = index;
return false;
}
}
true
}
// fn parse_identifier(&mut self) -> Result<ASTNode::SQLIdentifier, Err> {
// let expr = self.parse_expr()?;
// match expr {
// Some(ASTNode::SQLIdentifier { .. }) => Ok(expr),
// _ => parser_err!(format!("Expected identifier but found {:?}", expr)))
// }
// }
/// Consume the next token if it matches the expected token, otherwise return an error
fn consume_token(&mut self, expected: &Token) -> Result<bool, ParserError> {
match self.peek_token() {
Some(ref t) => if *t == *expected {
self.next_token();
Ok(true)
} else {
Ok(false)
},
_ => parser_err!(format!(
"expected token {:?} but was {:?}",
expected,
self.prev_token()
)),
}
}
/// Parse a SQL CREATE statement
fn parse_create(&mut self) -> Result<ASTNode, ParserError> {
if self.parse_keywords(vec!["EXTERNAL", "TABLE"]) {
match self.next_token() {
Some(Token::Identifier(id)) => {
// parse optional column list (schema)
let mut columns = vec![];
if self.consume_token(&Token::LParen)? {
loop {
if let Some(Token::Identifier(column_name)) = self.next_token() {
if let Ok(data_type) = self.parse_data_type() {
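// columns are nullable by default; an explicit NULL keyword is consumed but does not change that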
let allow_null = if self.parse_keywords(vec!["NOT", "NULL"]) {
false
} else if self.parse_keyword("NULL") {
true
} else {
true
};
match self.peek_token() {
Some(Token::Comma) => {
self.next_token();
columns.push(SQLColumnDef {
name: column_name,
data_type: data_type,
allow_null,
});
}
Some(Token::RParen) => {
self.next_token();
columns.push(SQLColumnDef {
name: column_name,
data_type: data_type,
allow_null,
});
break;
}
_ => {
return parser_err!(
"Expected ',' or ')' after column definition"
);
}
}
} else {
return parser_err!(
"Error parsing data type in column definition"
);
}
} else {
return parser_err!("Error parsing column name");
}
}
}
//println!("Parsed {} column defs", columns.len());
let mut headers = true;
let file_type: FileType = if self.parse_keywords(vec!["STORED", "AS", "CSV"]) {
if self.parse_keywords(vec!["WITH", "HEADER", "ROW"]) {
headers = true;
} else if self.parse_keywords(vec!["WITHOUT", "HEADER", "ROW"]) {
headers = false;
}
FileType::CSV
} else if self.parse_keywords(vec!["STORED", "AS", "NDJSON"]) {
FileType::NdJson
} else if self.parse_keywords(vec!["STORED", "AS", "PARQUET"]) {
FileType::Parquet
} else {
return parser_err!(format!(
"Expected 'STORED AS' clause, found {:?}",
self.peek_token()
));
};
let location: String = if self.parse_keywords(vec!["LOCATION"]) {
self.parse_literal_string()?
} else {
return parser_err!("Missing 'LOCATION' clause");
};
Ok(ASTNode::SQLCreateTable {
name: id,
columns,
file_type,
header_row: headers,
location,
})
}
_ => parser_err!(format!(
"Unexpected token after CREATE EXTERNAL TABLE: {:?}",
self.peek_token()
)),
}
} else {
parser_err!(format!(
"Unexpected token after CREATE: {:?}",
self.peek_token()
))
}
}
/// Parse a literal integer/long
fn parse_literal_int(&mut self) -> Result<i64, ParserError> {
match self.next_token() {
Some(Token::Number(s)) => s.parse::<i64>().map_err(|e| {
ParserError::ParserError(format!("Could not parse '{}' as i64: {}", s, e))
}),
other => parser_err!(format!("Expected literal int, found {:?}", other)),
}
}
/// Parse a literal string
fn parse_literal_string(&mut self) -> Result<String, ParserError> {
match self.next_token() {
Some(Token::String(ref s)) => Ok(s.clone()),
other => parser_err!(format!("Expected literal string, found {:?}", other)),
}
}
/// Parse a SQL datatype (in the context of a CREATE TABLE statement for example)
fn parse_data_type(&mut self) -> Result<SQLType, ParserError> {
match self.next_token() {
Some(Token::Keyword(k)) => match k.to_uppercase().as_ref() {
"BOOLEAN" => Ok(SQLType::Boolean),
"UINT8" => Ok(SQLType::UInt8),
"UINT16" => Ok(SQLType::UInt16),
"UINT32" => Ok(SQLType::UInt32),
"UINT64" => Ok(SQLType::UInt64),
"INT8" => Ok(SQLType::Int8),
"INT16" => Ok(SQLType::Int16),
"INT32" | "INT" | "INTEGER" => Ok(SQLType::Int32),
"INT64" | "LONG" => Ok(SQLType::Int64),
"FLOAT32" | "FLOAT" => Ok(SQLType::Float32),
"FLOAT64" | "DOUBLE" => Ok(SQLType::Double64),
"UTF8" | "VARCHAR" | "STRING" => {
// optional length
if self.consume_token(&Token::LParen)? {
let n = self.parse_literal_int()?;
self.consume_token(&Token::RParen)?;
Ok(SQLType::Utf8(n as usize))
} else {
Ok(SQLType::Utf8(100))
}
}
_ => parser_err!(format!("Invalid data type '{:?}'", k)),
},
other => parser_err!(format!("Invalid data type: '{:?}'", other)),
}
}
/// Parse a SELECT statement
fn parse_select(&mut self) -> Result<ASTNode, ParserError> {
let projection = self.parse_expr_list()?;
let relation: Option<Box<ASTNode>> = if self.parse_keyword("FROM") {
//TODO: add support for JOIN
Some(Box::new(self.parse_expr(0)?))
} else {
None
};
let selection = if self.parse_keyword("WHERE") {
Some(Box::new(self.parse_expr(0)?))
} else {
None
};
let group_by = if self.parse_keywords(vec!["GROUP", "BY"]) {
Some(self.parse_expr_list()?)
} else {
None
};
let having = if self.parse_keyword("HAVING") {
Some(Box::new(self.parse_expr(0)?))
} else {
None
};
let order_by = if self.parse_keywords(vec!["ORDER", "BY"]) {
Some(self.parse_order_by_expr_list()?)
} else {
None
};
let limit = if self.parse_keyword("LIMIT") {
self.parse_limit()?
} else {
None
};
if let Some(next_token) = self.peek_token() {
parser_err!(format!(
"Unexpected token at end of SELECT: {:?}",
next_token
))
} else {
Ok(ASTNode::SQLSelect {
projection,
selection,
relation,
limit,
order_by,
group_by,
having,
})
}
}
/// Parse a comma-delimited list of SQL expressions
fn parse_expr_list(&mut self) -> Result<Vec<ASTNode>, ParserError> {
let mut expr_list: Vec<ASTNode> = vec![];
loop {
expr_list.push(self.parse_expr(0)?);
if let Some(t) = self.peek_token() {
if t == Token::Comma {
self.next_token();
} else {
break;
}
} else {
//EOF
break;
}
}
Ok(expr_list)
}
/// Parse a comma-delimited list of SQL ORDER BY expressions
fn parse_order_by_expr_list(&mut self) -> Result<Vec<ASTNode>, ParserError> {
let mut expr_list: Vec<ASTNode> = vec![];
loop {
let expr = self.parse_expr(0)?;
// look for optional ASC / DESC specifier
let asc = match self.peek_token() {
Some(Token::Keyword(k)) => {
self.next_token(); // consume it
match k.to_uppercase().as_ref() {
"ASC" => true,
"DESC" => false,
_ => {
return parser_err!(format!(
"Invalid modifier for ORDER BY expression: {:?}",
k
))
}
}
}
Some(Token::Comma) => true,
Some(other) => {
return parser_err!(format!("Unexpected token after ORDER BY expr: {:?}", other))
}
None => true,
};
expr_list.push(ASTNode::SQLOrderBy {
expr: Box::new(expr),
asc,
});
if let Some(t) = self.peek_token() {
if t == Token::Comma {
self.next_token();
} else {
break;
}
} else {
// EOF
break;
}
}
Ok(expr_list)
}
/// Parse a LIMIT clause
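/// `LIMIT ALL` is equivalent to no limit, so it yields `None`.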
fn parse_limit(&mut self) -> Result<Option<Box<ASTNode>>, ParserError> {
if self.parse_keyword("ALL") {
Ok(None)
} else {
self.parse_literal_int()
.map(|n| Some(Box::new(ASTNode::SQLLiteralLong(n))))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_simple_select() {
let sql = String::from("SELECT id, fname, lname FROM customer WHERE id = 1 LIMIT 5");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect {
projection, limit, ..
} => {
assert_eq!(3, projection.len());
assert_eq!(Some(Box::new(ASTNode::SQLLiteralLong(5))), limit);
}
_ => assert!(false),
}
}
#[test]
fn parse_select_wildcard() {
let sql = String::from("SELECT * FROM customer");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect { projection, .. } => {
assert_eq!(1, projection.len());
assert_eq!(ASTNode::SQLWildcard, projection[0]);
}
_ => assert!(false),
}
}
#[test]
fn parse_select_count_wildcard() {
let sql = String::from("SELECT COUNT(*) FROM customer");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect { projection, .. } => {
assert_eq!(1, projection.len());
assert_eq!(
ASTNode::SQLFunction {
id: "COUNT".to_string(),
args: vec![ASTNode::SQLWildcard],
},
projection[0]
);
}
_ => assert!(false),
}
}
#[test]
fn parse_select_string_predicate() {
let sql = String::from(
"SELECT id, fname, lname FROM customer \
WHERE salary != 'Not Provided' AND salary != ''",
);
let _ast = parse_sql(&sql);
//TODO: add assertions
}
#[test]
fn parse_projection_nested_type() {
let sql = String::from("SELECT customer.address.state FROM foo");
let _ast = parse_sql(&sql);
//TODO: add assertions
}
#[test]
fn parse_compound_expr_1() {
use self::ASTNode::*;
use self::SQLOperator::*;
let sql = String::from("a + b * c");
let ast = parse_sql(&sql);
assert_eq!(
SQLBinaryExpr {
left: Box::new(SQLIdentifier("a".to_string())),
op: Plus,
right: Box::new(SQLBinaryExpr {
left: Box::new(SQLIdentifier("b".to_string())),
op: Multiply,
right: Box::new(SQLIdentifier("c".to_string()))
})
},
ast
);
}
#[test]
fn parse_compound_expr_2() {
use self::ASTNode::*;
use self::SQLOperator::*;
let sql = String::from("a * b + c");
let ast = parse_sql(&sql);
assert_eq!(
SQLBinaryExpr {
left: Box::new(SQLBinaryExpr {
left: Box::new(SQLIdentifier("a".to_string())),
op: Multiply,
right: Box::new(SQLIdentifier("b".to_string()))
}),
op: Plus,
right: Box::new(SQLIdentifier("c".to_string()))
},
ast
);
}
#[test]
fn parse_is_null() {
use self::ASTNode::*;
let sql = String::from("a IS NULL");
let ast = parse_sql(&sql);
assert_eq!(SQLIsNull(Box::new(SQLIdentifier("a".to_string()))), ast);
}
#[test]
fn parse_is_not_null() {
use self::ASTNode::*;
let sql = String::from("a IS NOT NULL");
let ast = parse_sql(&sql);
assert_eq!(SQLIsNotNull(Box::new(SQLIdentifier("a".to_string()))), ast);
}
#[test]
fn parse_select_order_by() {
let sql = String::from(
"SELECT id, fname, lname FROM customer WHERE id < 5 ORDER BY lname ASC, fname DESC",
);
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect { order_by, .. } => {
assert_eq!(
Some(vec![
ASTNode::SQLOrderBy {
expr: Box::new(ASTNode::SQLIdentifier("lname".to_string())),
asc: true,
},
ASTNode::SQLOrderBy {
expr: Box::new(ASTNode::SQLIdentifier("fname".to_string())),
asc: false,
},
]),
order_by
);
}
_ => assert!(false),
}
}
#[test]
fn parse_select_group_by() {
let sql = String::from("SELECT id, fname, lname FROM customer GROUP BY lname, fname");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect { group_by, .. } => {
assert_eq!(
Some(vec![
ASTNode::SQLIdentifier("lname".to_string()),
ASTNode::SQLIdentifier("fname".to_string()),
]),
group_by
);
}
_ => assert!(false),
}
}
#[test]
fn parse_limit_accepts_all() {
let sql = String::from("SELECT id, fname, lname FROM customer WHERE id = 1 LIMIT ALL");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect {
projection, limit, ..
} => {
assert_eq!(3, projection.len());
assert_eq!(None, limit);
}
_ => assert!(false),
}
}
#[test]
fn parse_cast() {
let sql = String::from("SELECT CAST(id AS DOUBLE) FROM customer");
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLSelect { projection, .. } => {
assert_eq!(1, projection.len());
assert_eq!(
ASTNode::SQLCast {
expr: Box::new(ASTNode::SQLIdentifier("id".to_string())),
data_type: SQLType::Double64
},
projection[0]
);
}
_ => assert!(false),
}
}
#[test]
fn parse_create_external_table_csv_with_header_row() {
let sql = String::from(
"CREATE EXTERNAL TABLE uk_cities (\
name VARCHAR(100) NOT NULL,\
lat DOUBLE NULL,\
lng DOUBLE NULL) \
STORED AS CSV WITH HEADER ROW \
LOCATION '/mnt/ssd/uk_cities.csv'",
);
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLCreateTable {
name,
columns,
file_type,
header_row,
location,
} => {
assert_eq!("uk_cities", name);
assert_eq!(3, columns.len());
assert_eq!(FileType::CSV, file_type);
assert_eq!(true, header_row);
assert_eq!("/mnt/ssd/uk_cities.csv", location);
let c_name = &columns[0];
assert_eq!("name", c_name.name);
assert_eq!(SQLType::Utf8(100), c_name.data_type);
assert_eq!(false, c_name.allow_null);
let c_lat = &columns[1];
assert_eq!("lat", c_lat.name);
assert_eq!(SQLType::Double64, c_lat.data_type);
assert_eq!(true, c_lat.allow_null);
let c_lng = &columns[2];
assert_eq!("lng", c_lng.name);
assert_eq!(SQLType::Double64, c_lng.data_type);
assert_eq!(true, c_lng.allow_null);
}
_ => assert!(false),
}
}
#[test]
fn parse_create_external_table_csv_without_header_row() {
let sql = String::from(
"CREATE EXTERNAL TABLE uk_cities (\
name VARCHAR(100) NOT NULL,\
lat DOUBLE NOT NULL,\
lng DOUBLE NOT NULL) \
STORED AS CSV WITHOUT HEADER ROW \
LOCATION '/mnt/ssd/uk_cities.csv'",
);
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLCreateTable {
name,
columns,
file_type,
header_row,
location,
} => {
assert_eq!("uk_cities", name);
assert_eq!(3, columns.len());
assert_eq!(FileType::CSV, file_type);
assert_eq!(false, header_row);
assert_eq!("/mnt/ssd/uk_cities.csv", location);
}
_ => assert!(false),
}
}
#[test]
fn parse_create_external_table_parquet() {
let sql = String::from(
"CREATE EXTERNAL TABLE uk_cities \
STORED AS PARQUET \
LOCATION '/mnt/ssd/uk_cities.parquet'",
);
let ast = parse_sql(&sql);
match ast {
ASTNode::SQLCreateTable {
name,
columns,
file_type,
location,
..
} => {
assert_eq!("uk_cities", name);
assert_eq!(0, columns.len());
assert_eq!(FileType::Parquet, file_type);
assert_eq!("/mnt/ssd/uk_cities.parquet", location);
}
_ => assert!(false),
}
}
#[test]
fn parse_scalar_function_in_projection() {
let sql = String::from("SELECT sqrt(id) FROM foo");
let ast = parse_sql(&sql);
if let ASTNode::SQLSelect { projection, .. } = ast {
assert_eq!(
vec![ASTNode::SQLFunction {
id: String::from("sqrt"),
args: vec![ASTNode::SQLIdentifier(String::from("id"))],
}],
projection
);
} else {
assert!(false);
}
}
#[test]
fn parse_aggregate_with_group_by() {
let sql = String::from("SELECT a, COUNT(1), MIN(b), MAX(b) FROM foo GROUP BY a");
let _ast = parse_sql(&sql);
//TODO: assertions
}
#[test]
fn parse_select_version() {
let sql = "SELECT @@version";
match parse_sql(&sql) {
ASTNode::SQLSelect { ref projection, .. } => {
assert_eq!(
projection[0],
ASTNode::SQLIdentifier("@@version".to_string())
);
}
_ => panic!(),
}
}
fn parse_sql(sql: &str) -> ASTNode {
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let mut parser = Parser::new(tokens);
parser.parse().unwrap()
}
}
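
For orientation, a minimal usage sketch of how the new public API fits together (the crate name `sqlparser` is an assumption; the module paths follow the lib.rs shown above):

extern crate sqlparser; // crate name assumed

use sqlparser::sqlast::ASTNode;
use sqlparser::sqlparser::Parser;

fn main() {
    // tokenize and parse in one step via the static helper
    let sql = "SELECT fname, lname FROM customer WHERE id = 1".to_string();
    match Parser::parse_sql(sql) {
        Ok(ASTNode::SQLSelect { projection, .. }) => {
            println!("SELECT with {} projected expressions", projection.len())
        }
        Ok(other) => println!("parsed: {:?}", other),
        Err(e) => println!("parse error: {:?}", e),
    }
}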

src/sqltokenizer.rs (new file, 427 lines)

@@ -0,0 +1,427 @@
// Copyright 2018 Grove Enterprises LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! SQL Tokenizer
use std::iter::Peekable;
use std::str::Chars;
use fnv::FnvHashSet;
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
/// SQL identifier e.g. table or column name
Identifier(String),
/// SQL keyword e.g. Keyword("SELECT")
Keyword(String),
/// Numeric literal
Number(String),
/// String literal
String(String),
/// Comma
Comma,
/// Whitespace (space, tab, etc)
Whitespace,
/// Equality operator `=`
Eq,
/// Not Equals operator `!=` or `<>`
Neq,
/// Less Than operator `<`
Lt,
/// Greater Than operator `>`
Gt,
/// Less Than Or Equals operator `<=`
LtEq,
/// Greater Than Or Equals operator `>=`
GtEq,
/// Plus operator `+`
Plus,
/// Minus operator `-`
Minus,
/// Multiplication operator `*`
Mult,
/// Division operator `/`
Div,
/// Modulo Operator `%`
Mod,
/// Left parenthesis `(`
LParen,
/// Right parenthesis `)`
RParen,
/// Period (used for compound identifiers or projections into nested types)
Period,
}
/// Tokenizer error
#[derive(Debug)]
pub struct TokenizerError(String);
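// FNV is a fast, non-cryptographic hash, a good fit for a small set of short, static keyword strings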
lazy_static! {
static ref KEYWORDS: FnvHashSet<&'static str> = {
let mut m = FnvHashSet::default();
m.insert("SELECT");
m.insert("FROM");
m.insert("WHERE");
m.insert("LIMIT");
m.insert("ORDER");
m.insert("GROUP");
m.insert("BY");
m.insert("HAVING");
m.insert("UNION");
m.insert("ALL");
m.insert("INSERT");
m.insert("UPDATE");
m.insert("DELETE");
m.insert("IN");
m.insert("IS");
m.insert("NULL");
m.insert("SET");
m.insert("CREATE");
m.insert("EXTERNAL");
m.insert("TABLE");
m.insert("ASC");
m.insert("DESC");
m.insert("AND");
m.insert("OR");
m.insert("NOT");
m.insert("AS");
m.insert("STORED");
m.insert("CSV");
m.insert("PARQUET");
m.insert("LOCATION");
m.insert("WITH");
m.insert("WITHOUT");
m.insert("HEADER");
m.insert("ROW");
// SQL types
m.insert("STRING");
m.insert("VARCHAR");
m.insert("FLOAT");
m.insert("DOUBLE");
m.insert("INT");
m.insert("INTEGER");
m.insert("LONG");
// Arrow native types
m.insert("BOOLEAN");
m.insert("UINT8");
m.insert("UINT16");
m.insert("UINT32");
m.insert("UINT64");
m.insert("INT8");
m.insert("INT16");
m.insert("INT32");
m.insert("INT64");
m.insert("FLOAT32");
m.insert("FLOAT64");
m.insert("UTF8");
m
};
}
/// SQL Tokenizer
pub struct Tokenizer {
pub query: String,
}
impl Tokenizer {
/// Create a new SQL tokenizer for the specified SQL statement
pub fn new(query: &str) -> Self {
Self {
query: query.to_string(),
}
}
/// Tokenize the statement and produce a vector of tokens
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
let mut peekable = self.query.chars().peekable();
let mut tokens: Vec<Token> = vec![];
while let Some(token) = self.next_token(&mut peekable)? {
tokens.push(token);
}
Ok(tokens
.into_iter()
.filter(|t| match t {
Token::Whitespace => false,
_ => true,
})
.collect())
}
/// Get the next token or return None
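/// The tokenizer uses one character of lookahead: peek at the next char to
/// pick a token class, then consume characters until the token is complete.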
fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<Token>, TokenizerError> {
//println!("next_token: {:?}", chars.peek());
match chars.peek() {
Some(&ch) => match ch {
// whitespace
' ' | '\t' | '\n' => {
chars.next(); // consume
Ok(Some(Token::Whitespace))
}
// identifier or keyword
'a'...'z' | 'A'...'Z' | '_' | '@' => {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
match ch {
'a'...'z' | 'A'...'Z' | '_' | '0'...'9' | '@' => {
chars.next(); // consume
s.push(ch);
}
_ => break,
}
}
let upper_str = s.to_uppercase();
if KEYWORDS.contains(upper_str.as_str()) {
Ok(Some(Token::Keyword(upper_str)))
} else {
Ok(Some(Token::Identifier(s)))
}
}
// string
'\'' => {
//TODO: handle escaped quotes in string
//TODO: handle EOF before terminating quote
let mut s = String::new();
chars.next(); // consume
while let Some(&ch) = chars.peek() {
match ch {
'\'' => {
chars.next(); // consume
break;
}
_ => {
chars.next(); // consume
s.push(ch);
}
}
}
Ok(Some(Token::String(s)))
}
// numbers
'0'...'9' => {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
match ch {
'0'...'9' | '.' => {
chars.next(); // consume
s.push(ch);
}
_ => break,
}
}
Ok(Some(Token::Number(s)))
}
// punctuation
',' => {
chars.next();
Ok(Some(Token::Comma))
}
'(' => {
chars.next();
Ok(Some(Token::LParen))
}
')' => {
chars.next();
Ok(Some(Token::RParen))
}
// operators
'+' => {
chars.next();
Ok(Some(Token::Plus))
}
'-' => {
chars.next();
Ok(Some(Token::Minus))
}
'*' => {
chars.next();
Ok(Some(Token::Mult))
}
'/' => {
chars.next();
Ok(Some(Token::Div))
}
'%' => {
chars.next();
Ok(Some(Token::Mod))
}
'=' => {
chars.next();
Ok(Some(Token::Eq))
}
'.' => {
chars.next();
Ok(Some(Token::Period))
}
'!' => {
chars.next(); // consume
match chars.peek() {
Some(&ch) => match ch {
'=' => {
chars.next();
Ok(Some(Token::Neq))
}
_ => Err(TokenizerError(format!("TBD"))),
},
None => Err(TokenizerError(format!("TBD"))),
}
}
'<' => {
chars.next(); // consume
match chars.peek() {
Some(&ch) => match ch {
'=' => {
chars.next();
Ok(Some(Token::LtEq))
}
'>' => {
chars.next();
Ok(Some(Token::Neq))
}
_ => Ok(Some(Token::Lt)),
},
None => Ok(Some(Token::Lt)),
}
}
'>' => {
chars.next(); // consume
match chars.peek() {
Some(&ch) => match ch {
'=' => {
chars.next();
Ok(Some(Token::GtEq))
}
_ => Ok(Some(Token::Gt)),
},
None => Ok(Some(Token::Gt)),
}
}
_ => Err(TokenizerError(format!(
"unhandled char '{}' in tokenizer",
ch
))),
},
None => Ok(None),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::Number(String::from("1")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::Identifier(String::from("sqrt")),
Token::LParen,
Token::Number(String::from("1")),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::Mult,
Token::Keyword(String::from("FROM")),
Token::Identifier(String::from("customer")),
Token::Keyword(String::from("WHERE")),
Token::Identifier(String::from("id")),
Token::Eq,
Token::Number(String::from("1")),
Token::Keyword(String::from("LIMIT")),
Token::Number(String::from("5")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::Mult,
Token::Keyword(String::from("FROM")),
Token::Identifier(String::from("customer")),
Token::Keyword(String::from("WHERE")),
Token::Identifier(String::from("salary")),
Token::Neq,
Token::String(String::from("Not Provided")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let mut tokenizer = Tokenizer::new(&sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Identifier(String::from("a")),
Token::Keyword("IS".to_string()),
Token::Keyword("NULL".to_string()),
];
compare(expected, tokens);
}
fn compare(expected: Vec<Token>, actual: Vec<Token>) {
//println!("------------------------------");
//println!("tokens = {:?}", actual);
//println!("expected = {:?}", expected);
//println!("------------------------------");
assert_eq!(expected, actual);
}
}


@@ -1,124 +0,0 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
/// Simple holder for a sequence of characters that supports iteration and mark/reset methods
pub struct CharSeq {
chars: Vec<char>,
i: usize,
m: usize
}
impl CharSeq {
/// Create a CharSeq from a string
pub fn new(sql: &str) -> Self {
CharSeq {
chars: sql.chars().collect(),
i: 0,
m: 0
}
}
/// Mark the current index
pub fn mark(&mut self) {
self.m = self.i;
}
/// Reset the index
pub fn reset(&mut self) {
self.i = self.m;
}
/// Peek the next char
pub fn peek(&mut self) -> Option<&char> {
if self.i < self.chars.len() {
Some(&self.chars[self.i])
} else {
None
}
}
/// Get the next char
pub fn next(&mut self) -> Option<char> {
if self.i < self.chars.len() {
self.i += 1;
Some(self.chars[self.i-1])
} else {
None
}
}
}
#[derive(Debug)]
pub struct Position {
line: usize,
col: usize
}
impl Position {
pub fn new(line: usize, col: usize) -> Self {
Position { line, col }
}
}
#[derive(Debug)]
pub enum TokenizerError {
UnexpectedChar(char, Position),
UnexpectedEof(Position),
UnterminatedStringLiteral(Position),
Custom(String)
}
/// SQL Tokens
#[derive(Debug,PartialEq)]
pub enum SQLToken {
Whitespace(char),
Keyword(String),
Identifier(String),
Literal(String), //TODO: need to model different types of literal
Plus,
Minus,
Mult,
Divide,
Eq,
Not,
NotEq,
Gt,
GtEq,
Lt,
LtEq,
LParen,
RParen,
Comma,
}
pub trait SQLTokenizer {
/// Get the precedence of a token
fn precedence(&self, token: &SQLToken) -> usize;
fn peek_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
/// return a reference to the next token and advance the index
fn next_token(&mut self) -> Result<Option<SQLToken>, TokenizerError>;
}
pub fn tokenize(sql: &str, tokenizer: &mut SQLTokenizer) -> Result<Vec<SQLToken>, TokenizerError> {
let mut chars = CharSeq::new(sql);
let mut tokens : Vec<SQLToken> = vec![];
loop {
match tokenizer.next_token()? {
Some(SQLToken::Whitespace(_)) => { /* ignore */ },
Some(token) => {
println!("Token: {:?}", token);
tokens.push(token)
},
None => break
}
}
Ok(tokens)
}