mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-11-01 15:44:14 +00:00
Add support for quoted string backslash escaping (#1177)
This commit is contained in:
parent
7b49c69b3a
commit
d2c2b15f9e
18 changed files with 352 additions and 996 deletions
|
|
@ -512,21 +512,21 @@ pub enum Expr {
|
|||
negated: bool,
|
||||
expr: Box<Expr>,
|
||||
pattern: Box<Expr>,
|
||||
escape_char: Option<char>,
|
||||
escape_char: Option<String>,
|
||||
},
|
||||
/// `ILIKE` (case-insensitive `LIKE`)
|
||||
ILike {
|
||||
negated: bool,
|
||||
expr: Box<Expr>,
|
||||
pattern: Box<Expr>,
|
||||
escape_char: Option<char>,
|
||||
escape_char: Option<String>,
|
||||
},
|
||||
/// SIMILAR TO regex
|
||||
SimilarTo {
|
||||
negated: bool,
|
||||
expr: Box<Expr>,
|
||||
pattern: Box<Expr>,
|
||||
escape_char: Option<char>,
|
||||
escape_char: Option<String>,
|
||||
},
|
||||
/// MySQL: RLIKE regex or REGEXP regex
|
||||
RLike {
|
||||
|
|
|
|||
|
|
@ -29,4 +29,9 @@ impl Dialect for BigQueryDialect {
|
|||
fn is_identifier_part(&self, ch: char) -> bool {
|
||||
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
|
||||
}
|
||||
|
||||
// See https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#escape_sequences
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {
|
|||
fn is_identifier_part(&self, ch: char) -> bool {
|
||||
self.is_identifier_start(ch) || ch.is_ascii_digit()
|
||||
}
|
||||
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -120,6 +120,23 @@ pub trait Dialect: Debug + Any {
|
|||
fn is_identifier_start(&self, ch: char) -> bool;
|
||||
/// Determine if a character is a valid unquoted identifier character
|
||||
fn is_identifier_part(&self, ch: char) -> bool;
|
||||
/// Determine if the dialect supports escaping characters via '\' in string literals.
|
||||
///
|
||||
/// Some dialects like BigQuery and Snowflake support this while others like
|
||||
/// Postgres do not, so the following is accepted by the former but
|
||||
/// rejected by the latter.
|
||||
/// ```sql
|
||||
/// SELECT 'ab\'cd';
|
||||
/// ```
|
||||
///
|
||||
/// Conversely, dialects that support backslash escaping reject the following statement, which
|
||||
/// otherwise would be valid in the other dialects.
|
||||
/// ```sql
|
||||
/// SELECT '\';
|
||||
/// ```
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
false
|
||||
}
|
||||
/// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
|
||||
fn supports_filter_during_aggregation(&self) -> bool {
|
||||
false
|
||||
|
|
@ -306,6 +323,10 @@ mod tests {
|
|||
self.0.identifier_quote_style(identifier)
|
||||
}
|
||||
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
self.0.supports_string_literal_backslash_escape()
|
||||
}
|
||||
|
||||
fn is_proper_identifier_inside_quotes(
|
||||
&self,
|
||||
chars: std::iter::Peekable<std::str::Chars<'_>>,
|
||||
|
|
|
|||
|
|
@ -48,6 +48,11 @@ impl Dialect for MySqlDialect {
|
|||
Some('`')
|
||||
}
|
||||
|
||||
// See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn parse_infix(
|
||||
&self,
|
||||
parser: &mut crate::parser::Parser,
|
||||
|
|
|
|||
|
|
@ -46,6 +46,11 @@ impl Dialect for SnowflakeDialect {
|
|||
|| ch == '_'
|
||||
}
|
||||
|
||||
// See https://docs.snowflake.com/en/sql-reference/data-types-text#escape-sequences-in-single-quoted-string-constants
|
||||
fn supports_string_literal_backslash_escape(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn supports_within_after_array_aggregation(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2560,9 +2560,9 @@ impl<'a> Parser<'a> {
|
|||
}
|
||||
|
||||
/// Parse the `ESCAPE CHAR` portion of `LIKE`, `ILIKE`, and `SIMILAR TO`
|
||||
pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {
|
||||
pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {
|
||||
if self.parse_keyword(Keyword::ESCAPE) {
|
||||
Ok(Some(self.parse_literal_char()?))
|
||||
Ok(Some(self.parse_literal_string()?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
|
|
|
|||
130
src/tokenizer.rs
130
src/tokenizer.rs
|
|
@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
|
|||
chars.next(); // consume
|
||||
match chars.peek() {
|
||||
Some('\'') => {
|
||||
let s = self.tokenize_quoted_string(chars, '\'')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\'', false)?;
|
||||
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
|
||||
}
|
||||
Some('\"') => {
|
||||
let s = self.tokenize_quoted_string(chars, '\"')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\"', false)?;
|
||||
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
|
||||
}
|
||||
_ => {
|
||||
|
|
@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
|
|||
chars.next(); // consume
|
||||
match chars.peek() {
|
||||
Some('\'') => {
|
||||
let s = self.tokenize_quoted_string(chars, '\'')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\'', false)?;
|
||||
Ok(Some(Token::RawStringLiteral(s)))
|
||||
}
|
||||
Some('\"') => {
|
||||
let s = self.tokenize_quoted_string(chars, '\"')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\"', false)?;
|
||||
Ok(Some(Token::RawStringLiteral(s)))
|
||||
}
|
||||
_ => {
|
||||
|
|
@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
|
|||
match chars.peek() {
|
||||
Some('\'') => {
|
||||
// N'...' - a <national character string literal>
|
||||
let s = self.tokenize_quoted_string(chars, '\'')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\'', true)?;
|
||||
Ok(Some(Token::NationalStringLiteral(s)))
|
||||
}
|
||||
_ => {
|
||||
|
|
@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
|
|||
match chars.peek() {
|
||||
Some('\'') => {
|
||||
// X'...' - a <binary string literal>
|
||||
let s = self.tokenize_quoted_string(chars, '\'')?;
|
||||
let s = self.tokenize_quoted_string(chars, '\'', true)?;
|
||||
Ok(Some(Token::HexStringLiteral(s)))
|
||||
}
|
||||
_ => {
|
||||
|
|
@ -712,7 +712,11 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
// single quoted string
|
||||
'\'' => {
|
||||
let s = self.tokenize_quoted_string(chars, '\'')?;
|
||||
let s = self.tokenize_quoted_string(
|
||||
chars,
|
||||
'\'',
|
||||
self.dialect.supports_string_literal_backslash_escape(),
|
||||
)?;
|
||||
|
||||
Ok(Some(Token::SingleQuotedString(s)))
|
||||
}
|
||||
|
|
@ -720,7 +724,11 @@ impl<'a> Tokenizer<'a> {
|
|||
'\"' if !self.dialect.is_delimited_identifier_start(ch)
|
||||
&& !self.dialect.is_identifier_start(ch) =>
|
||||
{
|
||||
let s = self.tokenize_quoted_string(chars, '"')?;
|
||||
let s = self.tokenize_quoted_string(
|
||||
chars,
|
||||
'"',
|
||||
self.dialect.supports_string_literal_backslash_escape(),
|
||||
)?;
|
||||
|
||||
Ok(Some(Token::DoubleQuotedString(s)))
|
||||
}
|
||||
|
|
@ -1222,6 +1230,7 @@ impl<'a> Tokenizer<'a> {
|
|||
&self,
|
||||
chars: &mut State,
|
||||
quote_style: char,
|
||||
allow_escape: bool,
|
||||
) -> Result<String, TokenizerError> {
|
||||
let mut s = String::new();
|
||||
let error_loc = chars.location();
|
||||
|
|
@ -1243,35 +1252,31 @@ impl<'a> Tokenizer<'a> {
|
|||
return Ok(s);
|
||||
}
|
||||
}
|
||||
'\\' => {
|
||||
// consume
|
||||
'\\' if allow_escape => {
|
||||
// consume backslash
|
||||
chars.next();
|
||||
// slash escaping is specific to MySQL dialect.
|
||||
if dialect_of!(self is MySqlDialect) {
|
||||
if let Some(next) = chars.peek() {
|
||||
if !self.unescape {
|
||||
// In no-escape mode, the given query has to be saved completely including backslashes.
|
||||
s.push(ch);
|
||||
s.push(*next);
|
||||
chars.next(); // consume next
|
||||
} else {
|
||||
// See https://dev.mysql.com/doc/refman/8.0/en/string-literals.html#character-escape-sequences
|
||||
let n = match next {
|
||||
'\'' | '\"' | '\\' | '%' | '_' => *next,
|
||||
'0' => '\0',
|
||||
'b' => '\u{8}',
|
||||
'n' => '\n',
|
||||
'r' => '\r',
|
||||
't' => '\t',
|
||||
'Z' => '\u{1a}',
|
||||
_ => *next,
|
||||
};
|
||||
s.push(n);
|
||||
chars.next(); // consume next
|
||||
}
|
||||
|
||||
if let Some(next) = chars.peek() {
|
||||
if !self.unescape {
|
||||
// In no-escape mode, the given query has to be saved completely including backslashes.
|
||||
s.push(ch);
|
||||
s.push(*next);
|
||||
chars.next(); // consume next
|
||||
} else {
|
||||
let n = match next {
|
||||
'0' => '\0',
|
||||
'a' => '\u{7}',
|
||||
'b' => '\u{8}',
|
||||
'f' => '\u{c}',
|
||||
'n' => '\n',
|
||||
'r' => '\r',
|
||||
't' => '\t',
|
||||
'Z' => '\u{1a}',
|
||||
_ => *next,
|
||||
};
|
||||
s.push(n);
|
||||
chars.next(); // consume next
|
||||
}
|
||||
} else {
|
||||
s.push(ch);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
|
|
@ -1517,7 +1522,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::dialect::{ClickHouseDialect, MsSqlDialect};
|
||||
use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};
|
||||
|
||||
#[test]
|
||||
fn tokenizer_error_impl() {
|
||||
|
|
@ -2386,4 +2391,57 @@ mod tests {
|
|||
check_unescape(r"Hello\0", None);
|
||||
check_unescape(r"Hello\xCADRust", None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn tokenize_quoted_string_escape() {
|
||||
for (sql, expected, expected_unescaped) in [
|
||||
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
|
||||
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
|
||||
(r#"'\\'"#, r#"\\"#, r#"\"#),
|
||||
(
|
||||
r#"'\0\a\b\f\n\r\t\Z'"#,
|
||||
r#"\0\a\b\f\n\r\t\Z"#,
|
||||
"\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
|
||||
),
|
||||
(r#"'\"'"#, r#"\""#, "\""),
|
||||
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
|
||||
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
|
||||
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
|
||||
] {
|
||||
let dialect = BigQueryDialect {};
|
||||
|
||||
let tokens = Tokenizer::new(&dialect, sql)
|
||||
.with_unescape(false)
|
||||
.tokenize()
|
||||
.unwrap();
|
||||
let expected = vec![Token::SingleQuotedString(expected.to_string())];
|
||||
compare(expected, tokens);
|
||||
|
||||
let tokens = Tokenizer::new(&dialect, sql)
|
||||
.with_unescape(true)
|
||||
.tokenize()
|
||||
.unwrap();
|
||||
let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
|
||||
compare(expected, tokens);
|
||||
}
|
||||
|
||||
for sql in [r#"'\'"#, r#"'ab\'"#] {
|
||||
let dialect = BigQueryDialect {};
|
||||
let mut tokenizer = Tokenizer::new(&dialect, sql);
|
||||
assert_eq!(
|
||||
"Unterminated string literal",
|
||||
tokenizer.tokenize().unwrap_err().message.as_str(),
|
||||
);
|
||||
}
|
||||
|
||||
// Non-escape dialect
|
||||
for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
|
||||
let dialect = GenericDialect {};
|
||||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||||
|
||||
let expected = vec![Token::SingleQuotedString(expected.to_string())];
|
||||
|
||||
compare(expected, tokens);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue