From 6f44494910fa676ea4f592520a425233782802d8 Mon Sep 17 00:00:00 2001
From: Nickolay Ponomarev
Date: Tue, 12 Feb 2019 01:15:47 +0300
Subject: [PATCH 1/2] Support \r and \r\n line breaks in tokenizer

---
 src/sqltokenizer.rs | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs
index 83105736..dee6844b 100644
--- a/src/sqltokenizer.rs
+++ b/src/sqltokenizer.rs
@@ -272,6 +272,14 @@ impl<'a> Tokenizer<'a> {
                 chars.next();
                 Ok(Some(Token::Whitespace(Whitespace::Newline)))
             }
+            '\r' => {
+                // Emit a single Whitespace::Newline token for \r and \r\n
+                chars.next();
+                if let Some('\n') = chars.peek() {
+                    chars.next();
+                }
+                Ok(Some(Token::Whitespace(Whitespace::Newline)))
+            }
             'N' => {
                 chars.next(); // consume, to check the next char
                 match chars.peek() {
@@ -743,6 +751,26 @@ mod tests {
         compare(expected, tokens);
     }
 
+    #[test]
+    fn tokenize_newlines() {
+        let sql = String::from("line1\nline2\rline3\r\nline4\r");
+
+        let dialect = GenericSqlDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let tokens = tokenizer.tokenize().unwrap();
+        let expected = vec![
+            Token::make_word("line1", None),
+            Token::Whitespace(Whitespace::Newline),
+            Token::make_word("line2", None),
+            Token::Whitespace(Whitespace::Newline),
+            Token::make_word("line3", None),
+            Token::Whitespace(Whitespace::Newline),
+            Token::make_word("line4", None),
+            Token::Whitespace(Whitespace::Newline),
+        ];
+        compare(expected, tokens);
+    }
+
     fn compare(expected: Vec<Token>, actual: Vec<Token>) {
         //println!("------------------------------");
         //println!("tokens = {:?}", actual);

From bf3110f6ce8d7b9a63f9611ed530d712c1425d5a Mon Sep 17 00:00:00 2001
From: Nickolay Ponomarev
Date: Sat, 27 Apr 2019 01:52:32 +0300
Subject: [PATCH 2/2] Use `consume_and_return` when possible

---
 src/sqltokenizer.rs | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/sqltokenizer.rs b/src/sqltokenizer.rs
index dee6844b..ead33483 100644
--- a/src/sqltokenizer.rs
+++ b/src/sqltokenizer.rs
@@ -260,18 +260,9 @@ impl<'a> Tokenizer<'a> {
         //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
-                ' ' => {
-                    chars.next();
-                    Ok(Some(Token::Whitespace(Whitespace::Space)))
-                }
-                '\t' => {
-                    chars.next();
-                    Ok(Some(Token::Whitespace(Whitespace::Tab)))
-                }
-                '\n' => {
-                    chars.next();
-                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
-                }
+                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
+                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
+                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                 '\r' => {
                     // Emit a single Whitespace::Newline token for \r and \r\n
                     chars.next();
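
The `consume_and_return` helper that the second patch switches to is defined
elsewhere in src/sqltokenizer.rs and does not appear in this diff. Judging
only from its call sites above, it consumes the character that was just
peeked and wraps the given token in `Ok(Some(..))`. A minimal sketch follows,
assuming the `Peekable<Chars>` parameter and `TokenizerError` result type
used by the surrounding tokenizer methods; the actual definition in the file
may differ:

    use std::iter::Peekable;
    use std::str::Chars;

    impl<'a> Tokenizer<'a> {
        /// Consume the single character already matched by the caller
        /// and emit `t` as the next token.
        fn consume_and_return(
            &self,
            chars: &mut Peekable<Chars<'_>>,
            t: Token,
        ) -> Result<Option<Token>, TokenizerError> {
            chars.next(); // advance past the peeked character
            Ok(Some(t))
        }
    }

With such a helper, every one-character token arm collapses to a single line,
e.g. `' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),`
which is exactly the shape the second patch produces. The multi-character
`\r\n` case cannot use it, since it must conditionally consume a second
character, so that arm keeps its explicit block.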