simple example of custom tokenizer

Andy Grove 2018-02-09 06:53:49 -07:00
parent fcf6b1150e
commit f56846098e
6 changed files with 145 additions and 29 deletions

README.md

@@ -1,10 +1,12 @@
# datafusion-sql
DataFusion SQL Parser (v2)
This is a work-in-progress to develop a new version of the DataFusion SQL Parser.
Goals for this version:
- Support for custom SQL dialects, so other projects can implement their own parsers easily
- Good error reporting (e.g. show line / column numbers and descriptive messages)
- Zero-copy of tokens when parsing
- Concise code

examples/ (custom ACME tokenizer example)

@@ -1,17 +1,23 @@
use std::str::Chars;
use std::iter::Peekable;
extern crate datafusion_sql;
use datafusion_sql::tokenizer::*;
use datafusion_sql::generic_tokenizer::*;
use datafusion_sql::parser::*;
///
/// This example demonstrates building a custom ACME parser that extends the generic parser
/// by adding support for a factorial operator `!!`.
///
#[derive(Debug, PartialEq)]
enum AcmeToken {
    /// Factorial operator `!!`
    Factorial
}
#[derive(Debug)]
enum AcmeOperator {
    Factorial
@@ -19,21 +25,35 @@ enum AcmeOperator {
#[derive(Debug)]
enum AcmeTokenizerError {
}

struct AcmeTokenizer {
    //chars: &'a Chars
    generic: GenericTokenizer
}
/// The ACME tokenizer looks for the factorial operator `!!` but delegates everything else to the generic tokenizer
impl SQLTokenizer<AcmeToken, AcmeTokenizerError> for AcmeTokenizer {
    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
        match chars.peek() {
            Some(&ch) => match ch {
                '!' => {
                    chars.next(); // consume the first `!`
                    match chars.peek() {
                        Some(&ch) => match ch {
                            '!' => {
                                chars.next(); // consume the second `!`
                                Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
                            },
                            _ => Err(TokenizerError::UnexpectedChar(ch, Position::new(0, 0)))
                        },
                        None => Ok(Some(SQLToken::Not)) // a lone trailing `!` is a NOT
                    }
                },
                _ => self.generic.next_token(chars) // not a `!`: delegate
            },
            _ => self.generic.next_token(chars) // end of input: delegate
        }
    }
}
@@ -43,9 +63,13 @@ fn main() {
    let sql = "1 + !! 5 * 2";

    let mut acme_tokenizer = AcmeTokenizer {
        generic: GenericTokenizer { }
    };

    let tokens = tokenize(&sql, &mut acme_tokenizer).unwrap();
    println!("tokens = {:?}", tokens);

src/generic_tokenizer.rs (new file, 41 lines)

@@ -0,0 +1,41 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use std::iter::Peekable;
use std::str::Chars;

use super::tokenizer::*;

pub struct GenericTokenizer {}

impl<S, TE> SQLTokenizer<S, TE> for GenericTokenizer
    where S: Debug + PartialEq {
    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>> {
        match chars.next() {
            Some(ch) => match ch {
                ' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))),
                '0' ... '9' => {
                    // accumulate a run of digits into a numeric literal
                    let mut s = String::new();
                    s.push(ch);
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '0' ... '9' => {
                                chars.next(); // consume
                                s.push(ch);
                            },
                            _ => break
                        }
                    }
                    Ok(Some(SQLToken::Literal(s)))
                },
                '+' => Ok(Some(SQLToken::Plus)),
                '-' => Ok(Some(SQLToken::Minus)),
                '*' => Ok(Some(SQLToken::Mult)),
                '/' => Ok(Some(SQLToken::Divide)),
                _ => Err(TokenizerError::UnexpectedChar(ch, Position::new(0, 0)))
            },
            None => Ok(None)
        }
    }
}
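// Note: `'0' ... '9'` was the standard inclusive range pattern syntax when
// this was written; later Rust versions spell it `'0'..='9'`.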

src/lib.rs

@@ -1,2 +1,3 @@
pub mod tokenizer;
pub mod generic_tokenizer;
pub mod parser;

src/parser.rs

@@ -1,3 +1,6 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use super::tokenizer::*;
#[derive(Debug)]
@@ -35,23 +38,29 @@ pub enum SQLExpr<T> {
}
#[derive(Debug)]
pub enum ParserError<S, PE>
    where S: Debug + PartialEq {
    WrongToken { expected: Vec<SQLToken<S>>, actual: SQLToken<S>, line: usize, col: usize },
    Custom(PE)
}
impl<S, TE> From<TokenizerError<TE>> for ParserError<S, TE>
    where S: Debug + PartialEq {
    fn from(_: TokenizerError<TE>) -> Self {
        unimplemented!()
    }
}
trait Parser<S, PE>
    where S: Debug + PartialEq {

    /// parse the prefix and stop once an infix operator is reached
    fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<S, PE>>;

    /// parse the next infix expression, returning None if the precedence has changed
    fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<S, PE>>;
}
//
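// (Sketch of intent, not part of this commit: a driver for these two methods
// would follow the usual Pratt-parser shape, calling parse_prefix() once for
// the initial expression and then repeatedly passing it to parse_infix(),
// replacing it with each Some(expr) returned, until parse_infix() yields None
// because the precedence has changed.)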

src/tokenizer.rs

@@ -1,24 +1,40 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use std::iter::Peekable;
use std::str::Chars;
#[derive(Debug)]
pub struct Position {
    line: usize,
    col: usize
}

impl Position {
    pub fn new(line: usize, col: usize) -> Self {
        Position { line, col }
    }
}

#[derive(Debug)]
pub enum TokenizerError<T> {
    UnexpectedChar(char, Position),
    UnexpectedEof(Position),
    UnterminatedStringLiteral(Position),
    Custom(T)
}
/// SQL Tokens
#[derive(Debug, PartialEq)]
pub enum SQLToken<T: Debug + PartialEq> {
    Whitespace(char),
    Keyword(String),    //TODO: &str ?
    Identifier(String), //TODO: &str ?
    Literal(String),    //TODO: need to model different types of literal
    Plus,
    Minus,
    Mult,
    Divide,
    Eq,
    Not,
    NotEq,
    Gt,
    GtEq,
@@ -31,9 +47,32 @@ pub enum SQLToken<T> {
    Custom(T)
}

pub trait SQLTokenizer<S, TE>
    where S: Debug + PartialEq {
    /// return the next token, advancing the character iterator past it
    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>>;
}
pub fn tokenize<S, TE>(sql: &str, tokenizer: &mut SQLTokenizer<S, TE>) -> Result<Vec<SQLToken<S>>, TokenizerError<TE>>
    where S: Debug + PartialEq
{
    let mut peekable = sql.chars().peekable();
    let mut tokens: Vec<SQLToken<S>> = vec![];
    loop {
        match tokenizer.next_token(&mut peekable)? {
            Some(SQLToken::Whitespace(_)) => { /* ignore */ },
            Some(token) => {
                println!("Token: {:?}", token);
                tokens.push(token)
            },
            None => break
        }
    }
    Ok(tokens)
}
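// Note: `&mut SQLTokenizer<S, TE>` is a bare trait object, the usual style
// before the 2018 edition; later Rust writes `&mut dyn SQLTokenizer<S, TE>`.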