From 82d1f36366df53c11f4d361534152b7bbcd44e59 Mon Sep 17 00:00:00 2001
From: crw5996
Date: Fri, 7 Sep 2018 18:33:02 -0400
Subject: [PATCH 1/2] Added line numbers to tokenizer errors

---
 src/sqltokenizer.rs | 55 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 7 deletions(-)

diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs
index 912f8442..aaaa9748 100644
--- a/src/sqltokenizer.rs
+++ b/src/sqltokenizer.rs
@@ -33,7 +33,7 @@ pub enum Token {
     /// Comma
     Comma,
     /// Whitespace (space, tab, etc)
-    Whitespace,
+    Whitespace(char),
     /// Equality operator `=`
     Eq,
     /// Not Equals operator `!=` or `<>`
@@ -65,7 +65,7 @@ pub enum Token {
 }
 
 /// Tokenizer error
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub struct TokenizerError(String);
 
 lazy_static! {
@@ -141,6 +141,8 @@ lazy_static! {
 /// SQL Tokenizer
 pub struct Tokenizer {
     pub query: String,
+    pub line: u64,
+    pub col: u64,
 }
 
 impl Tokenizer {
@@ -148,6 +150,8 @@ impl Tokenizer {
     pub fn new(query: &str) -> Self {
         Self {
             query: query.to_string(),
+            line: 1,
+            col: 1,
         }
     }
 
@@ -158,13 +162,20 @@ impl Tokenizer {
         let mut tokens: Vec<Token> = vec![];
 
         while let Some(token) = self.next_token(&mut peekable)? {
+
+            if token == Token::Whitespace('\n') {
+                self.line += 1;
+                self.col = 0;
+            } else if token == Token::Whitespace('\t') {
+                self.col += 1;
+            }
             tokens.push(token);
         }
 
         Ok(tokens
             .into_iter()
             .filter(|t| match t {
-                Token::Whitespace => false,
+                Token::Whitespace(..) => false,
                 _ => true,
             }).collect())
     }
@@ -177,7 +188,7 @@ impl Tokenizer {
             // whitespace
             ' ' | '\t' | '\n' => {
                 chars.next(); // consume
-                Ok(Some(Token::Whitespace))
+                Ok(Some(Token::Whitespace(ch)))
             }
             // identifier or keyword
             'a'...'z' | 'A'...'Z' | '_' | '@' => {
@@ -282,9 +293,9 @@ impl Tokenizer {
                             chars.next();
                             Ok(Some(Token::Neq))
                         }
-                        _ => Err(TokenizerError(format!("TBD"))),
+                        _ => Err(TokenizerError(format!("Tokenizer Error at Line: {}, Col: {}", self.line, self.col))),
                     },
-                    None => Err(TokenizerError(format!("TBD"))),
+                    None => Err(TokenizerError(format!("Tokenizer Error at Line: {}, Col: {}", self.line, self.col))),
                 }
             }
             '<' => {
@@ -318,7 +329,9 @@ impl Tokenizer {
                 }
             }
             _ => Err(TokenizerError(format!(
-                "unhandled char '{}' in tokenizer",
+                "Tokenizer Error at Line: {}, Column: {}, unhandled char '{}'",
+                self.line,
+                self.col,
                 ch
             ))),
         },
@@ -404,6 +417,34 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_invalid_string() {
+        let sql = String::from("\nمصطفىh");
+
+        let mut tokenizer = Tokenizer::new(&sql);
+        let tokens = tokenizer.tokenize();
+
+        match tokens {
+            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 2, Column: 0, unhandled char \'م\'".to_string()), e),
+            _ => panic!("Test Failure in tokenize_invalid_string"),
+        }
+
+    }
+
+    #[test]
+    fn tokenize_invalid_string_cols() {
+
+        let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");
+
+        let mut tokenizer = Tokenizer::new(&sql);
+        let tokens = tokenizer.tokenize();
+        match tokens {
+            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 3, Column: 1, unhandled char \'م\'".to_string()), e),
+            _ => panic!("Test Failure in tokenize_invalid_string_cols"),
+        }
+
+    }
+
     #[test]
     fn tokenize_is_null() {
         let sql = String::from("a IS NULL");

From 900c56ff292fea64ff065a599e20e7d8624faee1 Mon Sep 17 00:00:00 2001
From: crw5996
Date: Fri, 7 Sep 2018 20:23:23 -0400
Subject: [PATCH 2/2] Fixed column values to reflect length of tokens

---
 src/sqltokenizer.rs | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs
index aaaa9748..2dbc6cf0 100644
--- a/src/sqltokenizer.rs
+++ b/src/sqltokenizer.rs
@@ -162,14 +162,24 @@ impl Tokenizer {
         let mut tokens: Vec<Token> = vec![];
 
         while let Some(token) = self.next_token(&mut peekable)? {
 
+
+            match &token {
+
+                Token::Whitespace('\n') => {
+                    self.line += 1;
+                    self.col = 1;
+                },
-            if token == Token::Whitespace('\n') {
-                self.line += 1;
-                self.col = 0;
-            } else if token == Token::Whitespace('\t') {
-                self.col += 1;
+                Token::Whitespace('\t') => self.col += 4,
+                Token::Identifier(s) => self.col += s.len() as u64,
+                Token::Keyword(s) => self.col += s.len() as u64,
+                Token::Number(s) => self.col += s.len() as u64,
+                Token::String(s) => self.col += s.len() as u64,
+                _ => self.col += 1,
             }
+
             tokens.push(token);
+
         }
 
         Ok(tokens
@@ -425,7 +435,7 @@ mod tests {
         let tokens = tokenizer.tokenize();
 
         match tokens {
-            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 2, Column: 0, unhandled char \'م\'".to_string()), e),
+            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 2, Column: 1, unhandled char \'م\'".to_string()), e),
             _ => panic!("Test Failure in tokenize_invalid_string"),
         }
 
@@ -439,7 +449,7 @@ mod tests {
         let mut tokenizer = Tokenizer::new(&sql);
         let tokens = tokenizer.tokenize();
         match tokens {
-            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 3, Column: 1, unhandled char \'م\'".to_string()), e),
+            Err(e) => assert_eq!(TokenizerError("Tokenizer Error at Line: 3, Column: 24, unhandled char \'م\'".to_string()), e),
            _ => panic!("Test Failure in tokenize_invalid_string_cols"),
        }
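
For readers who want the technique outside the diff context, below is a minimal, self-contained sketch of the line/column tracking these two patches implement. The `Scanner` type, its `check` method, and the sample input are illustrative inventions, not part of sqlparser-rs; only the one-based line/column convention and the error-message format mirror the patches. For brevity it advances the position per character rather than per token, so it sidesteps the token-length bookkeeping that the second patch adds.

// Illustrative sketch only; `Scanner` is a hypothetical name, not the
// sqlparser-rs API. It mirrors the patches' error format and 1-based counters.
#[derive(Debug, PartialEq)]
struct TokenizerError(String);

struct Scanner {
    line: u64,
    col: u64,
}

impl Scanner {
    fn new() -> Self {
        // Both counters start at 1, matching the patched Tokenizer::new.
        Scanner { line: 1, col: 1 }
    }

    /// Walk `input` one char at a time, failing on the first char this
    /// toy scanner does not understand.
    fn check(&mut self, input: &str) -> Result<(), TokenizerError> {
        for ch in input.chars() {
            match ch {
                '\n' => {
                    // A newline starts a new line and resets the column,
                    // exactly as the second patch does for Whitespace('\n').
                    self.line += 1;
                    self.col = 1;
                }
                c if c.is_ascii_alphanumeric() || " \t_*".contains(c) => {
                    self.col += 1;
                }
                other => {
                    return Err(TokenizerError(format!(
                        "Tokenizer Error at Line: {}, Column: {}, unhandled char '{}'",
                        self.line, self.col, other
                    )));
                }
            }
        }
        Ok(())
    }
}

fn main() {
    let mut scanner = Scanner::new();
    // '£' sits on line 2, column 1, right after the newline reset.
    let err = scanner.check("SELECT 1\n£").unwrap_err();
    assert_eq!(
        TokenizerError("Tokenizer Error at Line: 2, Column: 1, unhandled char '£'".to_string()),
        err
    );
    println!("{:?}", err);
}

Resetting `col` to 1 rather than 0 after a newline is the convention the second patch settles on, and it is what the updated `tokenize_invalid_string` assertion above expects.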