simple example of custom tokenizer

commit f56846098e
parent fcf6b1150e
Author: Andy Grove
Date:   2018-02-09 06:53:49 -07:00
6 changed files with 145 additions and 29 deletions

README.md

@@ -1,10 +1,12 @@
 # datafusion-sql
 
-DataFusion SQL Parser (v2)
+This is a work-in-progress to develop a new version of the DataFusion SQL Parser.
 
-Goals:
+Goals for this version:
+
 - Support for custom SQL dialects, so other projects can implement their own parsers easily
-- Zero-copy of tokens when parsing
 - Good error reporting (e.g. show line / column numbers and descriptive messages)
+- Zero-copy of tokens when parsing
+- Concise code

examples/… (custom ACME tokenizer example; exact path not shown)

@@ -1,17 +1,23 @@
 use std::str::Chars;
+use std::iter::Peekable;
 
 extern crate datafusion_sql;
 
 use datafusion_sql::tokenizer::*;
+use datafusion_sql::generic_tokenizer::*;
 use datafusion_sql::parser::*;
 
-#[derive(Debug)]
+///
+/// This example demonstrates building a custom ACME parser that extends the generic parser
+/// by adding support for a factorial operator `!!`
+///
+#[derive(Debug,PartialEq)]
 enum AcmeToken {
     /// Factorial operator `!!`
     Factorial
 }
 
 #[derive(Debug)]
 enum AcmeOperator {
     Factorial
@@ -19,21 +25,35 @@ enum AcmeOperator {
 #[derive(Debug)]
 enum AcmeTokenizerError {
 }
 
 struct AcmeTokenizer {
-    //chars: &'a Chars
+    generic: GenericTokenizer
 }
 
+/// The ACME tokenizer looks for the factorial operator `!!` but delegates everything else
 impl SQLTokenizer<AcmeToken, AcmeTokenizerError> for AcmeTokenizer {
 
-    fn peek_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
-        Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
-    }
-
-    fn next_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
-        Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
+    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
+        match chars.peek() {
+            Some(&ch) => match ch {
+                '!' => {
+                    chars.next(); // consume the first `!`
+                    match chars.peek() {
+                        Some(&ch) => match ch {
+                            '!' => {
+                                chars.next(); // consume the second `!`
+                                Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
+                            },
+                            _ => Err(TokenizerError::UnexpectedChar(ch,Position::new(0,0)))
+                        },
+                        None => Ok(Some(SQLToken::Not))
+                    }
+                },
+                _ => self.generic.next_token(chars)
+            }
+            _ => self.generic.next_token(chars)
+        }
     }
 }
@@ -43,9 +63,13 @@ fn main() {
     let sql = "1 + !! 5 * 2";
 
-    let mut tokenizer = AcmeTokenizer { };
+    let mut acme_tokenizer = AcmeTokenizer {
+        generic: GenericTokenizer { }
+    };
 
-    println!("token = {:?}", tokenizer.peek_token().unwrap());
+    let tokens = tokenize(&sql, &mut acme_tokenizer).unwrap();
+    println!("tokens = {:?}", tokens);
 }

src/generic_tokenizer.rs (new file, 41 lines)

@@ -0,0 +1,41 @@
use std::cmp::PartialEq;
use std::fmt::Debug;
use std::iter::Peekable;
use std::str::Chars;

use super::tokenizer::*;

pub struct GenericTokenizer {}

impl<S,TE> SQLTokenizer<S,TE> for GenericTokenizer
    where S: Debug + PartialEq {

    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>> {
        match chars.next() {
            Some(ch) => match ch {
                ' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))),
                '0' ... '9' => {
                    let mut s = String::new();
                    s.push(ch);
                    while let Some(&ch) = chars.peek() {
                        match ch {
                            '0' ... '9' => {
                                chars.next(); // consume
                                s.push(ch);
                            },
                            _ => break
                        }
                    }
                    Ok(Some(SQLToken::Literal(s)))
                },
                '+' => Ok(Some(SQLToken::Plus)),
                '-' => Ok(Some(SQLToken::Minus)),
                '*' => Ok(Some(SQLToken::Mult)),
                '/' => Ok(Some(SQLToken::Divide)),
                _ => Err(TokenizerError::UnexpectedChar(ch,Position::new(0, 0)))
            },
            None => Ok(None)
        }
    }
}
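
Because the impl above is generic over the custom token type, `GenericTokenizer` can also be driven on its own through `tokenize`. A minimal sketch (hypothetical, not part of this commit) that pins the type parameters with `()`, which satisfies `Debug + PartialEq`, as a stand-in for the unused custom token and custom error types:

    let mut tokenizer = GenericTokenizer {};
    let tokens = tokenize::<(), ()>("1 + 2 * 3", &mut tokenizer).unwrap();
    // tokens == [Literal("1"), Plus, Literal("2"), Mult, Literal("3")]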

src/lib.rs

@@ -1,2 +1,3 @@
 pub mod tokenizer;
+pub mod generic_tokenizer;
 pub mod parser;

src/parser.rs

@@ -1,3 +1,6 @@
+use std::cmp::PartialEq;
+use std::fmt::Debug;
+
 use super::tokenizer::*;
 
 #[derive(Debug)]
@@ -35,23 +38,29 @@ pub enum SQLExpr<T> {
 }
 
 #[derive(Debug)]
-pub enum ParserError<T> {
-    WrongToken { expected: Vec<SQLToken<T>>, actual: SQLToken<T>, line: usize, col: usize },
-    Custom(T)
+pub enum ParserError<S, PE>
+    where S: Debug + PartialEq {
+
+    WrongToken { expected: Vec<SQLToken<S>>, actual: SQLToken<S>, line: usize, col: usize },
+    Custom(PE)
 }
 
-impl<T> From<TokenizerError<T>> for ParserError<T> {
-    fn from(_: TokenizerError<T>) -> Self {
+impl<S, TE> From<TokenizerError<TE>> for ParserError<S, TE>
+    where S: Debug + PartialEq {
+
+    fn from(_: TokenizerError<TE>) -> Self {
         unimplemented!()
     }
 }
 
-trait Parser<S, PE> {
+trait Parser<S, PE>
+    where S: Debug + PartialEq {
+
     /// parse the prefix and stop once an infix operator is reached
-    fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<PE>>;
+    fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<S, PE>>;
 
     /// parse the next infix expression, returning None if the precedence has changed
-    fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<PE>>;
+    fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<S, PE>>;
 }
 
 //
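
The two trait methods are intended to compose Pratt-style: parse a prefix expression, then repeatedly let `parse_infix` extend it until the precedence rules say stop. Note that because `parse_infix` takes `left` by value and returns `None` when the precedence changes, a naive driver loop would lose the expression it just built. The sketch below is hypothetical (neither function appears in this commit) and assumes a helper that hands the unconsumed expression back:

    /// Hypothetical variant of `parse_infix` that returns `Err(left)` unchanged
    /// when the precedence has changed, rather than returning `None` and losing it.
    fn parse_infix_or_give_back<S, PE>(parser: &mut Parser<S, PE>, left: Box<SQLExpr<S>>)
        -> Result<Result<Box<SQLExpr<S>>, Box<SQLExpr<S>>>, ParserError<S, PE>>
        where S: Debug + PartialEq {
        unimplemented!()
    }

    fn parse_expr<S, PE>(parser: &mut Parser<S, PE>) -> Result<Box<SQLExpr<S>>, ParserError<S, PE>>
        where S: Debug + PartialEq {
        let mut expr = parser.parse_prefix()?;
        loop {
            match parse_infix_or_give_back(parser, expr)? {
                Ok(extended) => expr = extended,        // an infix operator consumed `expr`
                Err(unchanged) => return Ok(unchanged), // precedence changed: stop folding
            }
        }
    }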

src/tokenizer.rs

@@ -1,24 +1,40 @@
+use std::cmp::PartialEq;
+use std::fmt::Debug;
+use std::iter::Peekable;
+use std::str::Chars;
+
 #[derive(Debug)]
 pub struct Position {
     line: usize,
     col: usize
 }
 
+impl Position {
+    pub fn new(line: usize, col: usize) -> Self {
+        Position { line, col }
+    }
+}
+
 #[derive(Debug)]
 pub enum TokenizerError<T> {
+    UnexpectedChar(char,Position),
     UnexpectedEof(Position),
     UnterminatedStringLiteral(Position),
     Custom(T)
 }
 
 /// SQL Tokens
-#[derive(Debug)]
-pub enum SQLToken<T> {
+#[derive(Debug,PartialEq)]
+pub enum SQLToken<T: Debug + PartialEq> {
+    Whitespace(char),
     Keyword(String), //TODO: &str ?
     Identifier(String), //TODO: &str ?
     Literal(String), //TODO: need to model different types of literal
+    Plus,
+    Minus,
+    Mult,
+    Divide,
     Eq,
+    Not,
     NotEq,
     Gt,
     GtEq,
@@ -31,9 +47,32 @@ pub enum SQLToken<T> {
     Custom(T)
 }
 
-pub trait SQLTokenizer<S, T> {
-    /// return a reference to the next token without consuming it (look ahead)
-    fn peek_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
+pub trait SQLTokenizer<S, TE>
+    where S: Debug + PartialEq {
+
     /// return a reference to the next token and advance the index
-    fn next_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
+    fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>>;
+}
+
+pub fn tokenize<S,TE>(sql: &str, tokenizer: &mut SQLTokenizer<S,TE>) -> Result<Vec<SQLToken<S>>, TokenizerError<TE>>
+    where S: Debug + PartialEq
+{
+    let mut peekable = sql.chars().peekable();
+
+    let mut tokens : Vec<SQLToken<S>> = vec![];
+
+    loop {
+        match tokenizer.next_token(&mut peekable)? {
+            Some(SQLToken::Whitespace(_)) => { /* ignore */ },
+            Some(token) => {
+                println!("Token: {:?}", token);
+                tokens.push(token)
+            },
+            None => break
+        }
+    }
+
+    Ok(tokens)
 }