mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-07-08 01:15:00 +00:00
simple example of custom tokenizer
This commit is contained in:
parent
fcf6b1150e
commit
f56846098e
6 changed files with 145 additions and 29 deletions
|
@ -1,10 +1,12 @@
|
||||||
# datafusion-sql
|
# datafusion-sql
|
||||||
|
|
||||||
DataFusion SQL Parser (v2)
|
This is a work-in-progress to develop a new version of the DataFusion SQL Parser.
|
||||||
|
|
||||||
Goals:
|
Goals for this version:
|
||||||
|
|
||||||
- Support for custom SQL dialects, so other projects can implement their own parsers easily
|
- Support for custom SQL dialects, so other projects can implement their own parsers easily
|
||||||
- Zero-copy of tokens when parsing
|
|
||||||
- Good error reporting (e.g. show line / column numbers and descriptive messages)
|
- Good error reporting (e.g. show line / column numbers and descriptive messages)
|
||||||
|
- Zero-copy of tokens when parsing
|
||||||
|
- Concise code
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,17 +1,23 @@
|
||||||
use std::str::Chars;
|
use std::str::Chars;
|
||||||
|
use std::iter::Peekable;
|
||||||
|
|
||||||
extern crate datafusion_sql;
|
extern crate datafusion_sql;
|
||||||
|
|
||||||
use datafusion_sql::tokenizer::*;
|
use datafusion_sql::tokenizer::*;
|
||||||
|
use datafusion_sql::generic_tokenizer::*;
|
||||||
use datafusion_sql::parser::*;
|
use datafusion_sql::parser::*;
|
||||||
|
|
||||||
#[derive(Debug)]
|
///
|
||||||
|
/// This example demonstrates building a custom ACME tokenizer that extends the generic tokenizer
|
||||||
|
/// by adding support for a factorial operator !!
|
||||||
|
///
|
||||||
|
|
||||||
|
#[derive(Debug,PartialEq)]
|
||||||
enum AcmeToken {
|
enum AcmeToken {
|
||||||
/// Factorial operator `!!`
|
/// Factorial operator `!!`
|
||||||
Factorial
|
Factorial
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum AcmeOperator {
|
enum AcmeOperator {
|
||||||
Factorial
|
Factorial
|
||||||
|
@ -19,21 +25,35 @@ enum AcmeOperator {
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
enum AcmeTokenizerError {
|
enum AcmeTokenizerError {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct AcmeTokenizer {
|
struct AcmeTokenizer {
|
||||||
//chars: &'a Chars
|
generic: GenericTokenizer
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// The ACME tokenizer looks for the factorial operator `!!` but delegates everything else to the generic tokenizer
|
||||||
impl SQLTokenizer<AcmeToken, AcmeTokenizerError> for AcmeTokenizer {
|
impl SQLTokenizer<AcmeToken, AcmeTokenizerError> for AcmeTokenizer {
|
||||||
|
|
||||||
fn peek_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
|
fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
|
||||||
Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
|
match chars.peek() {
|
||||||
}
|
Some(&ch) => match ch {
|
||||||
|
'!' => {
|
||||||
fn next_token(&mut self) -> Result<Option<SQLToken<AcmeToken>>, TokenizerError<AcmeTokenizerError>> {
|
chars.next(); // consume the first `!`
|
||||||
Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
|
match chars.peek() {
|
||||||
|
Some(&ch) => match ch {
|
||||||
|
'!' => {
|
||||||
|
chars.next(); // consume the second `!`
|
||||||
|
Ok(Some(SQLToken::Custom(AcmeToken::Factorial)))
|
||||||
|
},
|
||||||
|
_ => Err(TokenizerError::UnexpectedChar(ch,Position::new(0,0)))
|
||||||
|
},
|
||||||
|
None => Ok(Some(SQLToken::Not))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
_ => self.generic.next_token(chars)
|
||||||
|
}
|
||||||
|
_ => self.generic.next_token(chars)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,9 +63,13 @@ fn main() {
|
||||||
|
|
||||||
let sql = "1 + !! 5 * 2";
|
let sql = "1 + !! 5 * 2";
|
||||||
|
|
||||||
let mut tokenizer = AcmeTokenizer { };
|
let mut acme_tokenizer = AcmeTokenizer {
|
||||||
|
generic: GenericTokenizer { }
|
||||||
|
};
|
||||||
|
|
||||||
println!("token = {:?}", tokenizer.peek_token().unwrap());
|
let tokens = tokenize(&sql, &mut acme_tokenizer).unwrap();
|
||||||
|
|
||||||
|
println!("tokens = {:?}", tokens);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
41
src/generic_tokenizer.rs
Normal file
41
src/generic_tokenizer.rs
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
use std::cmp::PartialEq;
|
||||||
|
use std::fmt::Debug;
|
||||||
|
use std::iter::Peekable;
|
||||||
|
use std::str::Chars;
|
||||||
|
|
||||||
|
use super::tokenizer::*;
|
||||||
|
|
||||||
|
/// Field-less tokenizer whose `SQLTokenizer` implementation recognizes whitespace,
/// runs of ASCII digits (as literals) and the `+`, `-`, `*` and `/` operators.
pub struct GenericTokenizer {}
|
||||||
|
|
||||||
|
impl<S,TE> SQLTokenizer<S,TE> for GenericTokenizer
|
||||||
|
where S: Debug + PartialEq {
|
||||||
|
|
||||||
|
fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>> {
|
||||||
|
match chars.next() {
|
||||||
|
Some(ch) => match ch {
|
||||||
|
' ' | '\t' | '\n' => Ok(Some(SQLToken::Whitespace(ch))),
|
||||||
|
'0' ... '9' => {
|
||||||
|
let mut s = String::new();
|
||||||
|
s.push(ch);
|
||||||
|
while let Some(&ch) = chars.peek() {
|
||||||
|
match ch {
|
||||||
|
'0' ... '9' => {
|
||||||
|
chars.next(); // consume
|
||||||
|
s.push(ch);
|
||||||
|
},
|
||||||
|
_ => break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Some(SQLToken::Literal(s)))
|
||||||
|
},
|
||||||
|
'+' => Ok(Some(SQLToken::Plus)),
|
||||||
|
'-' => Ok(Some(SQLToken::Minus)),
|
||||||
|
'*' => Ok(Some(SQLToken::Mult)),
|
||||||
|
'/' => Ok(Some(SQLToken::Divide)),
|
||||||
|
_ => Err(TokenizerError::UnexpectedChar(ch,Position::new(0, 0)))
|
||||||
|
},
|
||||||
|
None => Ok(None)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -1,2 +1,3 @@
|
||||||
pub mod tokenizer;
|
pub mod tokenizer;
|
||||||
|
pub mod generic_tokenizer;
|
||||||
pub mod parser;
|
pub mod parser;
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
use std::cmp::PartialEq;
|
||||||
|
use std::fmt::Debug;
|
||||||
|
|
||||||
use super::tokenizer::*;
|
use super::tokenizer::*;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
|
@ -35,23 +38,29 @@ pub enum SQLExpr<T> {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum ParserError<T> {
|
pub enum ParserError<S, PE>
|
||||||
WrongToken { expected: Vec<SQLToken<T>>, actual: SQLToken<T>, line: usize, col: usize },
|
where S: Debug + PartialEq {
|
||||||
Custom(T)
|
|
||||||
|
WrongToken { expected: Vec<SQLToken<S>>, actual: SQLToken<S>, line: usize, col: usize },
|
||||||
|
Custom(PE)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> From<TokenizerError<T>> for ParserError<T> {
|
impl<S, TE> From<TokenizerError<TE>> for ParserError<S, TE>
|
||||||
fn from(_: TokenizerError<T>) -> Self {
|
where S: Debug + PartialEq {
|
||||||
|
|
||||||
|
fn from(_: TokenizerError<TE>) -> Self {
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
trait Parser<S, PE> {
|
trait Parser<S, PE>
|
||||||
|
where S: Debug + PartialEq {
|
||||||
|
|
||||||
/// parse the prefix and stop once an infix operator is reached
|
/// parse the prefix and stop once an infix operator is reached
|
||||||
fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<PE>> ;
|
fn parse_prefix(&mut self) -> Result<Box<SQLExpr<S>>, ParserError<S, PE>> ;
|
||||||
/// parse the next infix expression, returning None if the precedence has changed
|
/// parse the next infix expression, returning None if the precedence has changed
|
||||||
fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<PE>>;
|
fn parse_infix(&mut self, left: SQLExpr<S>) -> Result<Option<Box<SQLExpr<S>>>, ParserError<S, PE>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
|
@ -1,24 +1,40 @@
|
||||||
|
use std::cmp::PartialEq;
|
||||||
|
use std::fmt::Debug;
|
||||||
|
use std::iter::Peekable;
|
||||||
|
use std::str::Chars;
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Position {
|
pub struct Position {
|
||||||
line: usize,
|
line: usize,
|
||||||
col: usize
|
col: usize
|
||||||
}
|
}
|
||||||
|
impl Position {
|
||||||
|
pub fn new(line: usize, col: usize) -> Self {
|
||||||
|
Position { line, col }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum TokenizerError<T> {
|
pub enum TokenizerError<T> {
|
||||||
|
UnexpectedChar(char,Position),
|
||||||
UnexpectedEof(Position),
|
UnexpectedEof(Position),
|
||||||
UnterminatedStringLiteral(Position),
|
UnterminatedStringLiteral(Position),
|
||||||
Custom(T)
|
Custom(T)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// SQL Tokens
|
/// SQL Tokens
|
||||||
#[derive(Debug)]
|
#[derive(Debug,PartialEq)]
|
||||||
pub enum SQLToken<T> {
|
pub enum SQLToken<T: Debug + PartialEq> {
|
||||||
|
Whitespace(char),
|
||||||
Keyword(String), //TODO: &str ?
|
Keyword(String), //TODO: &str ?
|
||||||
Identifier(String), //TODO: &str ?
|
Identifier(String), //TODO: &str ?
|
||||||
Literal(String), //TODO: need to model different types of literal
|
Literal(String), //TODO: need to model different types of literal
|
||||||
|
Plus,
|
||||||
|
Minus,
|
||||||
|
Mult,
|
||||||
|
Divide,
|
||||||
Eq,
|
Eq,
|
||||||
|
Not,
|
||||||
NotEq,
|
NotEq,
|
||||||
Gt,
|
Gt,
|
||||||
GtEq,
|
GtEq,
|
||||||
|
@ -31,9 +47,32 @@ pub enum SQLToken<T> {
|
||||||
Custom(T)
|
Custom(T)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub trait SQLTokenizer<S, T> {
|
pub trait SQLTokenizer<S, TE>
|
||||||
/// return a reference to the next token without consuming it (look ahead)
|
where S: Debug + PartialEq {
|
||||||
fn peek_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
|
|
||||||
/// return a reference to the next token and advance the index
|
/// return the next token, consuming its characters from `chars` (`Ok(None)` once the input is exhausted)
|
||||||
fn next_token(&mut self) -> Result<Option<SQLToken<S>>, TokenizerError<T>>;
|
fn next_token(&self, chars: &mut Peekable<Chars>) -> Result<Option<SQLToken<S>>, TokenizerError<TE>>;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
pub fn tokenize<S,TE>(sql: &str, tokenizer: &mut SQLTokenizer<S,TE>) -> Result<Vec<SQLToken<S>>, TokenizerError<TE>>
|
||||||
|
where S: Debug + PartialEq
|
||||||
|
{
|
||||||
|
|
||||||
|
let mut peekable = sql.chars().peekable();
|
||||||
|
|
||||||
|
let mut tokens : Vec<SQLToken<S>> = vec![];
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match tokenizer.next_token(&mut peekable)? {
|
||||||
|
Some(SQLToken::Whitespace(_)) => { /* ignore */ },
|
||||||
|
Some(token) => {
|
||||||
|
println!("Token: {:?}", token);
|
||||||
|
tokens.push(token)
|
||||||
|
},
|
||||||
|
None => break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(tokens)
|
||||||
}
|
}
|
Loading…
Add table
Add a link
Reference in a new issue