Support triple quoted strings (#1262)

This commit is contained in:
Ifeanyi Ubah 2024-05-11 12:41:04 +02:00 committed by GitHub
parent e3692f4681
commit 036a4120b4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 592 additions and 62 deletions

View file

@ -42,6 +42,12 @@ pub enum Value {
SingleQuotedString(String),
// $<tag_name>$string value$<tag_name>$ (postgres syntax)
DollarQuotedString(DollarQuotedString),
/// Triple single quoted strings: Example '''abc'''
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedString(String),
/// Triple double quoted strings: Example """abc"""
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedString(String),
/// e'string value' (postgres extension)
/// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
/// for more details.
@ -50,9 +56,24 @@ pub enum Value {
SingleQuotedByteStringLiteral(String),
/// B"string value"
DoubleQuotedByteStringLiteral(String),
/// R'string value' or r'string value' or r"string value"
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals>
RawStringLiteral(String),
/// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedByteStringLiteral(String),
/// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedByteStringLiteral(String),
/// Single quoted literal with raw string prefix. Example `R'abc'`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
SingleQuotedRawStringLiteral(String),
/// Double quoted literal with raw string prefix. Example `R"abc"`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
DoubleQuotedRawStringLiteral(String),
/// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedRawStringLiteral(String),
/// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedRawStringLiteral(String),
/// N'string value'
NationalStringLiteral(String),
/// X'hex value'
@ -73,6 +94,12 @@ impl fmt::Display for Value {
Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", escape_double_quote_string(v)),
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
Value::TripleSingleQuotedString(v) => {
write!(f, "'''{v}'''")
}
Value::TripleDoubleQuotedString(v) => {
write!(f, r#""""{v}""""#)
}
Value::DollarQuotedString(v) => write!(f, "{v}"),
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
@ -80,7 +107,12 @@ impl fmt::Display for Value {
Value::Boolean(v) => write!(f, "{v}"),
Value::SingleQuotedByteStringLiteral(v) => write!(f, "B'{v}'"),
Value::DoubleQuotedByteStringLiteral(v) => write!(f, "B\"{v}\""),
Value::RawStringLiteral(v) => write!(f, "R'{v}'"),
Value::TripleSingleQuotedByteStringLiteral(v) => write!(f, "B'''{v}'''"),
Value::TripleDoubleQuotedByteStringLiteral(v) => write!(f, r#"B"""{v}""""#),
Value::SingleQuotedRawStringLiteral(v) => write!(f, "R'{v}'"),
Value::DoubleQuotedRawStringLiteral(v) => write!(f, "R\"{v}\""),
Value::TripleSingleQuotedRawStringLiteral(v) => write!(f, "R'''{v}'''"),
Value::TripleDoubleQuotedRawStringLiteral(v) => write!(f, r#"R"""{v}""""#),
Value::Null => write!(f, "NULL"),
Value::Placeholder(v) => write!(f, "{v}"),
}

View file

@ -30,6 +30,11 @@ impl Dialect for BigQueryDialect {
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
}
/// BigQuery accepts triple quoted string literals such as `'''abc'''`
/// and `"""abc"""`.
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
fn supports_triple_quoted_string(&self) -> bool {
true
}
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#first_value)
fn supports_window_function_null_treatment_arg(&self) -> bool {
true

View file

@ -231,6 +231,11 @@ pub trait Dialect: Debug + Any {
fn convert_type_before_value(&self) -> bool {
false
}
/// Returns true if the dialect supports triple quoted string literals,
/// e.g. `"""abc"""` or `'''abc'''`.
///
/// The default implementation returns `false`; a dialect that accepts
/// such literals overrides this method.
fn supports_triple_quoted_string(&self) -> bool {
false
}
/// Dialect-specific prefix parser override
fn parse_prefix(&self, _parser: &mut Parser) -> Option<Result<Expr, ParserError>> {
// return None to fall back to the default behavior

View file

@ -1187,10 +1187,17 @@ impl<'a> Parser<'a> {
Token::Number(_, _)
| Token::SingleQuotedString(_)
| Token::DoubleQuotedString(_)
| Token::TripleSingleQuotedString(_)
| Token::TripleDoubleQuotedString(_)
| Token::DollarQuotedString(_)
| Token::SingleQuotedByteStringLiteral(_)
| Token::DoubleQuotedByteStringLiteral(_)
| Token::RawStringLiteral(_)
| Token::TripleSingleQuotedByteStringLiteral(_)
| Token::TripleDoubleQuotedByteStringLiteral(_)
| Token::SingleQuotedRawStringLiteral(_)
| Token::DoubleQuotedRawStringLiteral(_)
| Token::TripleSingleQuotedRawStringLiteral(_)
| Token::TripleDoubleQuotedRawStringLiteral(_)
| Token::NationalStringLiteral(_)
| Token::HexStringLiteral(_) => {
self.prev_token();
@ -6425,6 +6432,12 @@ impl<'a> Parser<'a> {
},
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())),
Token::TripleSingleQuotedString(ref s) => {
Ok(Value::TripleSingleQuotedString(s.to_string()))
}
Token::TripleDoubleQuotedString(ref s) => {
Ok(Value::TripleDoubleQuotedString(s.to_string()))
}
Token::DollarQuotedString(ref s) => Ok(Value::DollarQuotedString(s.clone())),
Token::SingleQuotedByteStringLiteral(ref s) => {
Ok(Value::SingleQuotedByteStringLiteral(s.clone()))
@ -6432,7 +6445,24 @@ impl<'a> Parser<'a> {
Token::DoubleQuotedByteStringLiteral(ref s) => {
Ok(Value::DoubleQuotedByteStringLiteral(s.clone()))
}
Token::RawStringLiteral(ref s) => Ok(Value::RawStringLiteral(s.clone())),
Token::TripleSingleQuotedByteStringLiteral(ref s) => {
Ok(Value::TripleSingleQuotedByteStringLiteral(s.clone()))
}
Token::TripleDoubleQuotedByteStringLiteral(ref s) => {
Ok(Value::TripleDoubleQuotedByteStringLiteral(s.clone()))
}
Token::SingleQuotedRawStringLiteral(ref s) => {
Ok(Value::SingleQuotedRawStringLiteral(s.clone()))
}
Token::DoubleQuotedRawStringLiteral(ref s) => {
Ok(Value::DoubleQuotedRawStringLiteral(s.clone()))
}
Token::TripleSingleQuotedRawStringLiteral(ref s) => {
Ok(Value::TripleSingleQuotedRawStringLiteral(s.clone()))
}
Token::TripleDoubleQuotedRawStringLiteral(ref s) => {
Ok(Value::TripleDoubleQuotedRawStringLiteral(s.clone()))
}
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),

View file

@ -26,6 +26,7 @@ use alloc::{
};
use core::fmt;
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
#[cfg(feature = "serde")]
@ -58,6 +59,12 @@ pub enum Token {
SingleQuotedString(String),
/// Double quoted string: i.e: "string"
DoubleQuotedString(String),
/// Triple single quoted strings: Example '''abc'''
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedString(String),
/// Triple double quoted strings: Example """abc"""
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedString(String),
/// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
DollarQuotedString(DollarQuotedString),
/// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
@ -65,8 +72,24 @@ pub enum Token {
SingleQuotedByteStringLiteral(String),
/// Byte string literal: i.e: b"string" or B"string"
DoubleQuotedByteStringLiteral(String),
/// Raw string literal: i.e: r'string' or R'string' or r"string" or R"string"
RawStringLiteral(String),
/// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedByteStringLiteral(String),
/// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedByteStringLiteral(String),
/// Single quoted literal with raw string prefix. Example `R'abc'`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
SingleQuotedRawStringLiteral(String),
/// Double quoted literal with raw string prefix. Example `R"abc"`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
DoubleQuotedRawStringLiteral(String),
/// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedRawStringLiteral(String),
/// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedRawStringLiteral(String),
/// "National" string literal: i.e: N'string'
NationalStringLiteral(String),
/// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
@ -218,14 +241,21 @@ impl fmt::Display for Token {
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
Token::Char(ref c) => write!(f, "{c}"),
Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{ws}"),
Token::DoubleEq => f.write_str("=="),
@ -490,6 +520,32 @@ impl<'a> State<'a> {
}
}
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
/// Exactly one quote character on each side,
/// e.g. `"abc"`, `'abc'`, `r'abc'`
One,
/// Several quote characters on each side; the payload is that count,
/// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''` (a count of 3)
Many(NonZeroU8),
}
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
/// The character used to quote the string.
quote_style: char,
/// Represents how many quote characters enclose the string literal.
num_quote_chars: NumStringQuoteChars,
/// The number of opening quotes left to consume, before parsing
/// the remaining string literal.
/// For example: given initial string `"""abc"""`. If the caller has
/// already parsed the first quote for some reason, then this value
/// is set to 2, so that only the 2 remaining leading quotes are consumed.
num_opening_quotes_to_consume: u8,
/// True if the string uses backslash escaping of special characters,
/// e.g. `'abc\ndef\'ghi'`
backslash_escape: bool,
}
/// SQL Tokenizer
pub struct Tokenizer<'a> {
dialect: &'a dyn Dialect,
@ -639,11 +695,31 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'', false)?;
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"', false)?;
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
@ -657,14 +733,22 @@ impl<'a> Tokenizer<'a> {
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
@ -678,7 +762,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_quoted_string(chars, '\'', true)?;
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
@ -712,7 +796,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_quoted_string(chars, '\'', true)?;
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
@ -724,7 +808,17 @@ impl<'a> Tokenizer<'a> {
}
// single quoted string
'\'' => {
let s = self.tokenize_quoted_string(
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
@ -736,7 +830,17 @@ impl<'a> Tokenizer<'a> {
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
let s = self.tokenize_quoted_string(
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
@ -1246,23 +1350,128 @@ impl<'a> Tokenizer<'a> {
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
}
/// Read a single quoted string, starting with the opening quote.
fn tokenize_quoted_string(
/// Reads a string literal quoted by either one or three quote characters.
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
///
/// `chars` is expected to be positioned on the first opening quote. Up to
/// three consecutive `quote_style` characters are consumed, and their count
/// decides how the literal is read:
///
/// * 1 - a regular literal; its content is wrapped with `single_quote_token`.
/// * 2 - an empty literal such as `''`; `single_quote_token` receives `""`.
/// * 3 - a triple quoted literal; its content is wrapped with
///   `triple_quote_token`.
///
/// `backslash_escape` is passed through to `tokenize_quoted_string`.
///
/// Returns an error when `chars` does not start with `quote_style`, or when
/// reading the remainder of the literal fails.
fn tokenize_single_or_triple_quoted_string<F>(
    &self,
    chars: &mut State,
    quote_style: char,
    backslash_escape: bool,
    single_quote_token: F,
    triple_quote_token: F,
) -> Result<Option<Token>, TokenizerError>
where
    F: Fn(String) -> Token,
{
    let error_loc = chars.location();

    // Consume at most three opening quotes; the count classifies the literal.
    let mut num_opening_quotes = 0u8;
    while num_opening_quotes < 3 && chars.peek() == Some(&quote_style) {
        chars.next(); // Consume quote.
        num_opening_quotes += 1;
    }

    let (token_fn, num_quote_chars) = match num_opening_quotes {
        1 => (single_quote_token, NumStringQuoteChars::One),
        2 => {
            // An opening quote directly followed by a closing quote:
            // this is an empty string.
            return Ok(Some(single_quote_token(String::new())));
        }
        3 => {
            let Some(num_quote_chars) = NonZeroU8::new(3) else {
                return self.tokenizer_error(error_loc, "invalid number of opening quotes");
            };
            (
                triple_quote_token,
                NumStringQuoteChars::Many(num_quote_chars),
            )
        }
        _ => {
            return self.tokenizer_error(error_loc, "invalid string literal opening");
        }
    };

    // Every opening quote was consumed above, so none are left for
    // `tokenize_quoted_string` to skip.
    let settings = TokenizeQuotedStringSettings {
        quote_style,
        num_quote_chars,
        num_opening_quotes_to_consume: 0,
        backslash_escape,
    };

    self.tokenize_quoted_string(chars, settings)
        .map(token_fn)
        .map(Some)
}
/// Tokenizes a literal enclosed by exactly one `quote_style` character on
/// each side and returns its content.
///
/// `chars` must still be positioned on the opening quote: the settings ask
/// `tokenize_quoted_string` to consume that one quote itself.
fn tokenize_single_quoted_string(
    &self,
    chars: &mut State,
    quote_style: char,
    backslash_escape: bool,
) -> Result<String, TokenizerError> {
    let single_quote_settings = TokenizeQuotedStringSettings {
        quote_style,
        num_quote_chars: NumStringQuoteChars::One,
        num_opening_quotes_to_consume: 1,
        backslash_escape,
    };
    self.tokenize_quoted_string(chars, single_quote_settings)
}
/// Read a quoted string.
fn tokenize_quoted_string(
&self,
chars: &mut State,
settings: TokenizeQuotedStringSettings,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();
chars.next(); // consume the opening quote
// Consume any opening quotes.
for _ in 0..settings.num_opening_quotes_to_consume {
if Some(settings.quote_style) != chars.next() {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
}
let mut num_consecutive_quotes = 0;
while let Some(&ch) = chars.peek() {
let pending_final_quote = match settings.num_quote_chars {
NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
n @ NumStringQuoteChars::Many(count)
if num_consecutive_quotes + 1 == count.get() =>
{
Some(n)
}
NumStringQuoteChars::Many(_) => None,
};
match ch {
char if char == quote_style => {
char if char == settings.quote_style && pending_final_quote.is_some() => {
chars.next(); // consume
if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
// For an initial string like `"""abc"""`, at this point we have
// `abc""` in the buffer and have now matched the final `"`.
// However, the string to return is simply `abc`, so we strip off
// the trailing quotes before returning.
let mut buf = s.chars();
for _ in 1..count.get() {
buf.next_back();
}
return Ok(buf.as_str().to_string());
} else if chars
.peek()
.map(|c| *c == settings.quote_style)
.unwrap_or(false)
{
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
@ -1273,10 +1482,12 @@ impl<'a> Tokenizer<'a> {
return Ok(s);
}
}
'\\' if allow_escape => {
'\\' if settings.backslash_escape => {
// consume backslash
chars.next();
num_consecutive_quotes = 0;
if let Some(next) = chars.peek() {
if !self.unescape {
// In no-escape mode, the given query has to be saved completely including backslashes.
@ -1300,8 +1511,15 @@ impl<'a> Tokenizer<'a> {
}
}
}
_ => {
chars.next(); // consume
ch => {
chars.next(); // consume ch
if ch == settings.quote_style {
num_consecutive_quotes += 1;
} else {
num_consecutive_quotes = 0;
}
s.push(ch);
}
}
@ -2466,6 +2684,7 @@ mod tests {
#[test]
fn tokenize_quoted_string_escape() {
let dialect = SnowflakeDialect {};
for (sql, expected, expected_unescaped) in [
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
@ -2480,8 +2699,6 @@ mod tests {
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
] {
let dialect = BigQueryDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
@ -2498,7 +2715,6 @@ mod tests {
}
for sql in [r#"'\'"#, r#"'ab\'"#] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
assert_eq!(
"Unterminated string literal",
@ -2516,4 +2732,124 @@ mod tests {
compare(expected, tokens);
}
}
/// Checks tokenization of triple quoted string literals: both quote
/// characters under BigQuery, with unescaping disabled and enabled, plus
/// adjacent empty literals and a dialect without triple quote support.
#[test]
fn tokenize_triple_quoted_string() {
    /// Runs the shared cases with `q` as the enclosing quote character and
    /// `r` as the other quote character embedded in the content.
    /// `quote_token` builds the expected token from the expected content.
    fn check<F>(
        q: char, // The quote character to test
        r: char, // An alternate quote character.
        quote_token: F,
    ) where
        F: Fn(String) -> Token,
    {
        let dialect = BigQueryDialect {};

        // Each case is (sql, expected content with unescape disabled,
        // expected content with unescape enabled).
        for (sql, expected, expected_unescaped) in [
            // Empty string
            (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
            // Should not count escaped quote as end of string.
            (
                format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                format!(r#"ab{q}{q}\{q}{q}cd"#),
                format!(r#"ab{q}{q}{q}{q}cd"#),
            ),
            // Simple string
            (
                format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                "abc".into(),
                "abc".into(),
            ),
            // Mix single-double quotes unescaped.
            (
                format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
            ),
            // Escaped quote.
            (
                format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
            ),
            // backslash-escaped quote characters.
            (
                format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                r#"a\'\'b\'c\'d"#.into(),
                r#"a''b'c'd"#.into(),
            ),
            // backslash-escaped characters
            (
                format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                r#"abc\0\n\rdef"#.into(),
                "abc\0\n\rdef".into(),
            ),
        ] {
            let tokens = Tokenizer::new(&dialect, sql.as_str())
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![quote_token(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql.as_str())
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![quote_token(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        // Inputs that never reach an unescaped closing triple quote must be
        // reported as unterminated.
        for sql in [
            format!(r#"{q}{q}{q}{q}{q}\{q}"#),
            format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
            format!(r#"{q}{q}{q}{q}"#),
            format!(r#"{q}{q}{q}{r}{r}"#),
            format!(r#"{q}{q}{q}abc{q}"#),
            format!(r#"{q}{q}{q}abc{q}{q}"#),
            format!(r#"{q}{q}{q}abc"#),
        ] {
            let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }
    }

    check('"', '\'', Token::TripleDoubleQuotedString);

    check('\'', '"', Token::TripleSingleQuotedString);

    let dialect = BigQueryDialect {};

    // Two identical quotes in a row form an empty literal; they do not open
    // a triple quoted string, even when a different quote pair follows.
    let sql = r#"""''"#;
    let tokens = Tokenizer::new(&dialect, sql)
        .with_unescape(true)
        .tokenize()
        .unwrap();
    let expected = vec![
        Token::DoubleQuotedString("".to_string()),
        Token::SingleQuotedString("".to_string()),
    ];
    compare(expected, tokens);

    let sql = r#"''"""#;
    let tokens = Tokenizer::new(&dialect, sql)
        .with_unescape(true)
        .tokenize()
        .unwrap();
    let expected = vec![
        Token::SingleQuotedString("".to_string()),
        Token::DoubleQuotedString("".to_string()),
    ];
    compare(expected, tokens);

    // Non-triple quoted string dialect
    let dialect = SnowflakeDialect {};
    let sql = r#"''''''"#;
    let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
    let expected = vec![Token::SingleQuotedString("''".to_string())];
    compare(expected, tokens);
}
}

View file

@ -18,14 +18,30 @@ use std::ops::Deref;
use sqlparser::ast::*;
use sqlparser::dialect::{BigQueryDialect, GenericDialect};
use sqlparser::parser::ParserError;
use sqlparser::parser::{ParserError, ParserOptions};
use test_utils::*;
#[test]
fn parse_literal_string() {
let sql = r#"SELECT 'single', "double""#;
let select = bigquery().verified_only_select(sql);
assert_eq!(2, select.projection.len());
let sql = concat!(
"SELECT ",
"'single', ",
r#""double", "#,
"'''triple-single''', ",
r#""""triple-double""", "#,
r#"'single\'escaped', "#,
r#"'''triple-single\'escaped''', "#,
r#"'''triple-single'unescaped''', "#,
r#""double\"escaped", "#,
r#""""triple-double\"escaped""", "#,
r#""""triple-double"unescaped""""#,
);
let dialect = TestedDialects {
dialects: vec![Box::new(BigQueryDialect {})],
options: Some(ParserOptions::new().with_unescape(false)),
};
let select = dialect.verified_only_select(sql);
assert_eq!(10, select.projection.len());
assert_eq!(
&Expr::Value(Value::SingleQuotedString("single".to_string())),
expr_from_projection(&select.projection[0])
@ -34,56 +50,162 @@ fn parse_literal_string() {
&Expr::Value(Value::DoubleQuotedString("double".to_string())),
expr_from_projection(&select.projection[1])
);
assert_eq!(
&Expr::Value(Value::TripleSingleQuotedString("triple-single".to_string())),
expr_from_projection(&select.projection[2])
);
assert_eq!(
&Expr::Value(Value::TripleDoubleQuotedString("triple-double".to_string())),
expr_from_projection(&select.projection[3])
);
assert_eq!(
&Expr::Value(Value::SingleQuotedString(r#"single\'escaped"#.to_string())),
expr_from_projection(&select.projection[4])
);
assert_eq!(
&Expr::Value(Value::TripleSingleQuotedString(
r#"triple-single\'escaped"#.to_string()
)),
expr_from_projection(&select.projection[5])
);
assert_eq!(
&Expr::Value(Value::TripleSingleQuotedString(
r#"triple-single'unescaped"#.to_string()
)),
expr_from_projection(&select.projection[6])
);
assert_eq!(
&Expr::Value(Value::DoubleQuotedString(r#"double\"escaped"#.to_string())),
expr_from_projection(&select.projection[7])
);
assert_eq!(
&Expr::Value(Value::TripleDoubleQuotedString(
r#"triple-double\"escaped"#.to_string()
)),
expr_from_projection(&select.projection[8])
);
assert_eq!(
&Expr::Value(Value::TripleDoubleQuotedString(
r#"triple-double"unescaped"#.to_string()
)),
expr_from_projection(&select.projection[9])
);
}
#[test]
fn parse_byte_literal() {
let sql = r#"SELECT B'abc', B"abc""#;
let select = bigquery().verified_only_select(sql);
assert_eq!(2, select.projection.len());
assert_eq!(
&Expr::Value(Value::SingleQuotedByteStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[0])
);
assert_eq!(
&Expr::Value(Value::DoubleQuotedByteStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[1])
let sql = concat!(
"SELECT ",
"B'abc', ",
r#"B"abc", "#,
r#"B'f\(abc,(.*),def\)', "#,
r#"B"f\(abc,(.*),def\)", "#,
r#"B'''abc''', "#,
r#"B"""abc""""#,
);
let stmt = bigquery().verified_stmt(sql);
if let Statement::Query(query) = stmt {
if let SetExpr::Select(select) = *query.body {
assert_eq!(6, select.projection.len());
assert_eq!(
&Expr::Value(Value::SingleQuotedByteStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[0])
);
assert_eq!(
&Expr::Value(Value::DoubleQuotedByteStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[1])
);
assert_eq!(
&Expr::Value(Value::SingleQuotedByteStringLiteral(
r"f\(abc,(.*),def\)".to_string()
)),
expr_from_projection(&select.projection[2])
);
assert_eq!(
&Expr::Value(Value::DoubleQuotedByteStringLiteral(
r"f\(abc,(.*),def\)".to_string()
)),
expr_from_projection(&select.projection[3])
);
assert_eq!(
&Expr::Value(Value::TripleSingleQuotedByteStringLiteral(
r"abc".to_string()
)),
expr_from_projection(&select.projection[4])
);
assert_eq!(
&Expr::Value(Value::TripleDoubleQuotedByteStringLiteral(
r"abc".to_string()
)),
expr_from_projection(&select.projection[5])
);
}
} else {
panic!("invalid query");
}
let sql = r#"SELECT b'abc', b"abc""#;
bigquery().one_statement_parses_to(sql, r#"SELECT B'abc', B"abc""#);
bigquery().one_statement_parses_to(
r#"SELECT b'123', b"123", b'''123''', b"""123""""#,
r#"SELECT B'123', B"123", B'''123''', B"""123""""#,
);
}
#[test]
fn parse_raw_literal() {
let sql = r#"SELECT R'abc', R"abc", R'f\(abc,(.*),def\)', R"f\(abc,(.*),def\)""#;
let stmt = bigquery().one_statement_parses_to(
sql,
r"SELECT R'abc', R'abc', R'f\(abc,(.*),def\)', R'f\(abc,(.*),def\)'",
let sql = concat!(
"SELECT ",
"R'abc', ",
r#"R"abc", "#,
r#"R'f\(abc,(.*),def\)', "#,
r#"R"f\(abc,(.*),def\)", "#,
r#"R'''abc''', "#,
r#"R"""abc""""#,
);
let stmt = bigquery().verified_stmt(sql);
if let Statement::Query(query) = stmt {
if let SetExpr::Select(select) = *query.body {
assert_eq!(4, select.projection.len());
assert_eq!(6, select.projection.len());
assert_eq!(
&Expr::Value(Value::RawStringLiteral("abc".to_string())),
&Expr::Value(Value::SingleQuotedRawStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[0])
);
assert_eq!(
&Expr::Value(Value::RawStringLiteral("abc".to_string())),
&Expr::Value(Value::DoubleQuotedRawStringLiteral("abc".to_string())),
expr_from_projection(&select.projection[1])
);
assert_eq!(
&Expr::Value(Value::RawStringLiteral(r"f\(abc,(.*),def\)".to_string())),
&Expr::Value(Value::SingleQuotedRawStringLiteral(
r"f\(abc,(.*),def\)".to_string()
)),
expr_from_projection(&select.projection[2])
);
assert_eq!(
&Expr::Value(Value::RawStringLiteral(r"f\(abc,(.*),def\)".to_string())),
&Expr::Value(Value::DoubleQuotedRawStringLiteral(
r"f\(abc,(.*),def\)".to_string()
)),
expr_from_projection(&select.projection[3])
);
return;
assert_eq!(
&Expr::Value(Value::TripleSingleQuotedRawStringLiteral(
r"abc".to_string()
)),
expr_from_projection(&select.projection[4])
);
assert_eq!(
&Expr::Value(Value::TripleDoubleQuotedRawStringLiteral(
r"abc".to_string()
)),
expr_from_projection(&select.projection[5])
);
}
} else {
panic!("invalid query");
}
panic!("invalid query")
bigquery().one_statement_parses_to(
r#"SELECT r'123', r"123", r'''123''', r"""123""""#,
r#"SELECT R'123', R"123", R'''123''', R"""123""""#,
);
}
#[test]