From c2ccc80c28c4589d5bae8c3e1fc8cc2851b991ba Mon Sep 17 00:00:00 2001
From: Mykhailo Bondarenko <70747718+michael-2956@users.noreply.github.com>
Date: Thu, 7 Jul 2022 20:45:59 +0300
Subject: [PATCH] Count characters instead of bytes (#529)

* Count characters instead of bytes
* cargo fmt
* add tests to PR #529
---
 src/tokenizer.rs | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index 5a9a0961..57ec57d4 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -354,11 +354,15 @@ impl<'a> Tokenizer<'a> {
                 }
 
                 Token::Whitespace(Whitespace::Tab) => self.col += 4,
-                Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
-                Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
-                Token::Number(s, _) => self.col += s.len() as u64,
-                Token::SingleQuotedString(s) => self.col += s.len() as u64,
-                Token::Placeholder(s) => self.col += s.len() as u64,
+                Token::Word(w) if w.quote_style == None => {
+                    self.col += w.value.chars().count() as u64
+                }
+                Token::Word(w) if w.quote_style != None => {
+                    self.col += w.value.chars().count() as u64 + 2
+                }
+                Token::Number(s, _) => self.col += s.chars().count() as u64,
+                Token::SingleQuotedString(s) => self.col += s.chars().count() as u64,
+                Token::Placeholder(s) => self.col += s.chars().count() as u64,
                 _ => self.col += 1,
             }
 
@@ -1220,6 +1224,22 @@ mod tests {
         );
     }
 
+    #[test]
+    fn tokenize_unterminated_string_literal_utf8() {
+        let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
+
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        assert_eq!(
+            tokenizer.tokenize(),
+            Err(TokenizerError {
+                message: "Unterminated string literal".to_string(),
+                line: 1,
+                col: 35
+            })
+        );
+    }
+
     #[test]
     fn tokenize_invalid_string_cols() {
         let sql = String::from("\n\nSELECT * FROM table\tمصطفىh");