Add support for quoted string backslash escaping (#1177)

Ifeanyi Ubah, 2024-04-21 15:07:56 +02:00 (committed by GitHub)
parent 7b49c69b3a
commit d2c2b15f9e
18 changed files with 352 additions and 996 deletions
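
In short, backslash escaping inside quoted strings is no longer hard-wired to the MySQL dialect: the tokenizer now asks the active dialect whether backslash escapes are supported. A minimal sketch of the user-visible effect, based on the tokenizer API exercised in the new tests below (BigQueryDialect is the escape-supporting dialect used there; the choice is illustrative):

```rust
use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    // For a dialect that supports backslash escaping, an escaped quote
    // no longer terminates the single-quoted literal. Assumes the
    // tokenizer's default behavior of unescaping recognized sequences.
    let dialect = BigQueryDialect {};
    let tokens = Tokenizer::new(&dialect, r#"'it\'s'"#).tokenize().unwrap();
    assert_eq!(tokens, vec![Token::SingleQuotedString("it's".to_string())]);
}
```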

@@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"')?;
let s = self.tokenize_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
@@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"')?;
let s = self.tokenize_quoted_string(chars, '\"', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
_ => {
@@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
@@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
@@ -712,7 +712,11 @@ impl<'a> Tokenizer<'a> {
}
// single quoted string
'\'' => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::SingleQuotedString(s)))
}
@@ -720,7 +724,11 @@ impl<'a> Tokenizer<'a> {
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
let s = self.tokenize_quoted_string(chars, '"')?;
let s = self.tokenize_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::DoubleQuotedString(s)))
}
@@ -1222,6 +1230,7 @@ impl<'a> Tokenizer<'a> {
&self,
chars: &mut State,
quote_style: char,
allow_escape: bool,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();
@@ -1243,35 +1252,31 @@ impl<'a> Tokenizer<'a> {
return Ok(s);
}
}
'\\' => {
// consume
'\\' if allow_escape => {
// consume backslash
chars.next();
// slash escaping is specific to MySQL dialect.
if dialect_of!(self is MySqlDialect) {
if let Some(next) = chars.peek() {
if !self.unescape {
// In no-escape mode, the given query has to be saved completely including backslashes.
s.push(ch);
s.push(*next);
chars.next(); // consume next
} else {
// See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
let n = match next {
'\'' | '\"' | '\\' | '%' | '_' => *next,
'0' => '\0',
'b' => '\u{8}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'Z' => '\u{1a}',
_ => *next,
};
s.push(n);
chars.next(); // consume next
}
if let Some(next) = chars.peek() {
if !self.unescape {
// In no-escape mode, the given query has to be saved completely including backslashes.
s.push(ch);
s.push(*next);
chars.next(); // consume next
} else {
let n = match next {
'0' => '\0',
'a' => '\u{7}',
'b' => '\u{8}',
'f' => '\u{c}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'Z' => '\u{1a}',
_ => *next,
};
s.push(n);
chars.next(); // consume next
}
} else {
s.push(ch);
}
}
_ => {
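
The rewritten escape arm still honors the tokenizer's `unescape` flag: when unescaping is on, the recognized sequences (`\0`, `\a`, `\b`, `\f`, `\n`, `\r`, `\t`, `\Z`) map to their control characters and anything else falls through unchanged; when it is off, the literal text is kept, backslashes and all. A small usage sketch mirroring the tests below (dialect choice again illustrative):

```rust
use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = BigQueryDialect {};
    let sql = r#"'\n\t'"#;

    // Unescaping enabled: escape sequences become control characters.
    let tokens = Tokenizer::new(&dialect, sql)
        .with_unescape(true)
        .tokenize()
        .unwrap();
    assert_eq!(tokens, vec![Token::SingleQuotedString("\n\t".to_string())]);

    // Unescaping disabled: the literal is preserved, backslashes included.
    let tokens = Tokenizer::new(&dialect, sql)
        .with_unescape(false)
        .tokenize()
        .unwrap();
    assert_eq!(tokens, vec![Token::SingleQuotedString(r"\n\t".to_string())]);
}
```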
@@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
#[cfg(test)]
mod tests {
use super::*;
use crate::dialect::{ClickHouseDialect, MsSqlDialect};
use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
#[test]
fn tokenizer_error_impl() {
@@ -2386,4 +2391,57 @@ mod tests {
check_unescape(r"Hello\0", None);
check_unescape(r"Hello\xCADRust", None);
}
#[test]
fn tokenize_quoted_string_escape() {
for (sql, expected, expected_unescaped) in [
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
(r#"'\\'"#, r#"\\"#, r#"\"#),
(
r#"'\0\a\b\f\n\r\t\Z'"#,
r#"\0\a\b\f\n\r\t\Z"#,
"\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
),
(r#"'\"'"#, r#"\""#, "\""),
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
] {
let dialect = BigQueryDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [r#"'\'"#, r#"'ab\'"#] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
// Non-escape dialect
for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
}
}
}
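
For completeness, a hedged end-to-end sketch of how the change surfaces at the parser level, assuming BigQueryDialect keeps backslash escaping enabled as the tests above suggest:

```rust
use sqlparser::dialect::BigQueryDialect;
use sqlparser::parser::Parser;

fn main() {
    // Previously the escaped quote would have terminated the literal early
    // for dialects without MySQL's special-casing, and parsing would fail.
    let sql = r#"SELECT 'it\'s'"#;
    let statements = Parser::parse_sql(&BigQueryDialect {}, sql).unwrap();
    println!("{statements:?}");
}
```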