Support triple quoted strings (#1262)

Ifeanyi Ubah 2024-05-11 12:41:04 +02:00 committed by GitHub
parent e3692f4681
commit 036a4120b4
6 changed files with 592 additions and 62 deletions

src/tokenizer.rs

@@ -26,6 +26,7 @@ use alloc::{
};
use core::fmt;
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
#[cfg(feature = "serde")]
@@ -58,6 +59,12 @@ pub enum Token {
SingleQuotedString(String),
/// Double quoted string: i.e: "string"
DoubleQuotedString(String),
/// Triple single quoted strings: Example '''abc'''
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedString(String),
/// Triple double quoted strings: Example """abc"""
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedString(String),
/// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
DollarQuotedString(DollarQuotedString),
/// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
@@ -65,8 +72,24 @@ pub enum Token {
SingleQuotedByteStringLiteral(String),
/// Byte string literal: i.e: b"string" or B"string"
DoubleQuotedByteStringLiteral(String),
/// Raw string literal: i.e: r'string' or R'string' or r"string" or R"string"
RawStringLiteral(String),
/// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedByteStringLiteral(String),
/// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedByteStringLiteral(String),
/// Single quoted literal with raw string prefix. Example `R'abc'`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
SingleQuotedRawStringLiteral(String),
/// Double quoted literal with raw string prefix. Example `R"abc"`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
DoubleQuotedRawStringLiteral(String),
/// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedRawStringLiteral(String),
/// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedRawStringLiteral(String),
/// "National" string literal: i.e: N'string'
NationalStringLiteral(String),
/// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
@@ -218,14 +241,21 @@ impl fmt::Display for Token {
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
Token::Char(ref c) => write!(f, "{c}"),
Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{ws}"),
Token::DoubleEq => f.write_str("=="),
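Each new `Display` arm re-wraps the stored contents in its original quoting, so formatting a token reproduces the source literal. A quick round-trip check, assuming the sqlparser crate with this commit applied:

use sqlparser::tokenizer::Token;

fn main() {
    // Display re-adds the three quotes around the stored contents.
    let tok = Token::TripleSingleQuotedString("abc".to_string());
    assert_eq!(tok.to_string(), "'''abc'''");
    // Raw literals keep their R prefix on round-trip.
    let tok = Token::TripleDoubleQuotedRawStringLiteral("abc".to_string());
    assert_eq!(tok.to_string(), "R\"\"\"abc\"\"\"");
}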
@@ -490,6 +520,32 @@ impl<'a> State<'a> {
}
}
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
/// e.g. `"abc"`, `'abc'`, `r'abc'`
One,
/// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
Many(NonZeroU8),
}
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
/// The character used to quote the string.
quote_style: char,
/// Represents how many quote characters enclose the string literal.
num_quote_chars: NumStringQuoteChars,
/// The number of opening quotes still left to consume before parsing
/// the remaining string literal.
/// For example: given the initial string `"""abc"""`, if the caller
/// has already consumed the first quote, then this value is set to 2,
/// indicating that 2 leading quotes remain to be consumed.
num_opening_quotes_to_consume: u8,
/// True if the string uses backslash escaping of special characters
/// e.g. `'abc\ndef\'ghi'`
backslash_escape: bool,
}
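For orientation: the two entry points later in this diff only ever build these settings in two shapes. A condensed, illustrative mirror of the private types (local stand-in names, not new API):

use core::num::NonZeroU8;

// Local stand-ins for the patch's private types, for illustration only.
enum NumQuotes {
    One,
    Many(NonZeroU8),
}
struct Settings {
    quote_style: char,
    num_quote_chars: NumQuotes,
    num_opening_quotes_to_consume: u8,
    backslash_escape: bool,
}

fn main() {
    // Plain 'abc': the single opening quote has not been consumed yet.
    let _single = Settings {
        quote_style: '\'',
        num_quote_chars: NumQuotes::One,
        num_opening_quotes_to_consume: 1,
        backslash_escape: true,
    };
    // Raw R'''abc''': the caller counted (and consumed) all three
    // opening quotes, so none remain; raw paths disable backslash escapes.
    let _triple = Settings {
        quote_style: '\'',
        num_quote_chars: NumQuotes::Many(NonZeroU8::new(3).unwrap()),
        num_opening_quotes_to_consume: 0,
        backslash_escape: false,
    };
}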
/// SQL Tokenizer
pub struct Tokenizer<'a> {
dialect: &'a dyn Dialect,
@@ -639,11 +695,31 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'', false)?;
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"', false)?;
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
@@ -657,14 +733,22 @@ impl<'a> Tokenizer<'a> {
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"', false)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
@@ -678,7 +762,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_quoted_string(chars, '\'', true)?;
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
@@ -712,7 +796,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_quoted_string(chars, '\'', true)?;
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
@@ -724,7 +808,17 @@ impl<'a> Tokenizer<'a> {
}
// single quoted string
'\'' => {
let s = self.tokenize_quoted_string(
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
@@ -736,7 +830,17 @@ impl<'a> Tokenizer<'a> {
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
let s = self.tokenize_quoted_string(
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
@@ -1246,23 +1350,128 @@ impl<'a> Tokenizer<'a> {
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
}
/// Read a single quoted string, starting with the opening quote.
fn tokenize_quoted_string(
/// Reads a string literal quoted by single or triple quote characters.
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
fn tokenize_single_or_triple_quoted_string<F>(
&self,
chars: &mut State,
quote_style: char,
allow_escape: bool,
backslash_escape: bool,
single_quote_token: F,
triple_quote_token: F,
) -> Result<Option<Token>, TokenizerError>
where
F: Fn(String) -> Token,
{
let error_loc = chars.location();
let mut num_opening_quotes = 0u8;
for _ in 0..3 {
if Some(&quote_style) == chars.peek() {
chars.next(); // Consume quote.
num_opening_quotes += 1;
} else {
break;
}
}
let (token_fn, num_quote_chars) = match num_opening_quotes {
1 => (single_quote_token, NumStringQuoteChars::One),
2 => {
// If we matched exactly two quote characters, this is an empty string.
return Ok(Some(single_quote_token("".into())));
}
3 => {
let Some(num_quote_chars) = NonZeroU8::new(3) else {
return self.tokenizer_error(error_loc, "invalid number of opening quotes");
};
(
triple_quote_token,
NumStringQuoteChars::Many(num_quote_chars),
)
}
_ => {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
};
let settings = TokenizeQuotedStringSettings {
quote_style,
num_quote_chars,
num_opening_quotes_to_consume: 0,
backslash_escape,
};
self.tokenize_quoted_string(chars, settings)
.map(token_fn)
.map(Some)
}
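The two-versus-three quote disambiguation above is observable from the public API: exactly two quotes produce an empty single-quoted string, while six produce an empty triple-quoted one. A small check mirroring the tests further down:

use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = BigQueryDialect {};
    // Two quotes: an empty single-quoted string.
    let tokens = Tokenizer::new(&dialect, "''").tokenize().unwrap();
    assert_eq!(tokens, vec![Token::SingleQuotedString("".to_string())]);
    // Six quotes: an empty triple-quoted string.
    let tokens = Tokenizer::new(&dialect, "''''''").tokenize().unwrap();
    assert_eq!(tokens, vec![Token::TripleSingleQuotedString("".to_string())]);
}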
/// Reads a string literal quoted by a single quote character.
fn tokenize_single_quoted_string(
&self,
chars: &mut State,
quote_style: char,
backslash_escape: bool,
) -> Result<String, TokenizerError> {
self.tokenize_quoted_string(
chars,
TokenizeQuotedStringSettings {
quote_style,
num_quote_chars: NumStringQuoteChars::One,
num_opening_quotes_to_consume: 1,
backslash_escape,
},
)
}
/// Reads a quoted string according to the given settings.
fn tokenize_quoted_string(
&self,
chars: &mut State,
settings: TokenizeQuotedStringSettings,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();
chars.next(); // consume the opening quote
// Consume any opening quotes.
for _ in 0..settings.num_opening_quotes_to_consume {
if Some(settings.quote_style) != chars.next() {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
}
let mut num_consecutive_quotes = 0;
while let Some(&ch) = chars.peek() {
let pending_final_quote = match settings.num_quote_chars {
NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
n @ NumStringQuoteChars::Many(count)
if num_consecutive_quotes + 1 == count.get() =>
{
Some(n)
}
NumStringQuoteChars::Many(_) => None,
};
match ch {
char if char == quote_style => {
char if char == settings.quote_style && pending_final_quote.is_some() => {
chars.next(); // consume
if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
// For an initial string like `"""abc"""`, at this point we have
// `abc""` in the buffer and have now matched the final `"`.
// However, the string to return is simply `abc`, so we strip off
// the trailing quotes before returning.
let mut buf = s.chars();
for _ in 1..count.get() {
buf.next_back();
}
return Ok(buf.as_str().to_string());
} else if chars
.peek()
.map(|c| *c == settings.quote_style)
.unwrap_or(false)
{
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
@@ -1273,10 +1482,12 @@ impl<'a> Tokenizer<'a> {
return Ok(s);
}
}
'\\' if allow_escape => {
'\\' if settings.backslash_escape => {
// consume backslash
chars.next();
num_consecutive_quotes = 0;
if let Some(next) = chars.peek() {
if !self.unescape {
// In no-escape mode, the given query has to be saved completely including backslashes.
@@ -1300,8 +1511,15 @@ impl<'a> Tokenizer<'a> {
}
}
}
_ => {
chars.next(); // consume
ch => {
chars.next(); // consume ch
if ch == settings.quote_style {
num_consecutive_quotes += 1;
} else {
num_consecutive_quotes = 0;
}
s.push(ch);
}
}
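To make the quote-counting loop concrete, a hand trace (not part of the patch) of the remainder of `'''abc'''` once the caller has consumed the three opening quotes:

// Remaining input: abc'''
//   'a', 'b', 'c' -> pushed; num_consecutive_quotes stays 0
//   '\''          -> 0 + 1 != 3, no pending final quote; pushed, count = 1
//   '\''          -> 1 + 1 != 3; pushed, count = 2
//   '\''          -> 2 + 1 == 3, final quote matched; the buffer holds
//                    "abc''", the 2 trailing quotes are stripped, and
//                    "abc" is returned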
@@ -2466,6 +2684,7 @@ mod tests {
#[test]
fn tokenize_quoted_string_escape() {
let dialect = SnowflakeDialect {};
for (sql, expected, expected_unescaped) in [
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
@@ -2480,8 +2699,6 @@ mod tests {
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
] {
let dialect = BigQueryDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
@@ -2498,7 +2715,6 @@ mod tests {
}
for sql in [r#"'\'"#, r#"'ab\'"#] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql);
assert_eq!(
"Unterminated string literal",
@@ -2516,4 +2732,124 @@ mod tests {
compare(expected, tokens);
}
}
#[test]
fn tokenize_triple_quoted_string() {
fn check<F>(
q: char, // The quote character to test
r: char, // An alternate quote character.
quote_token: F,
) where
F: Fn(String) -> Token,
{
let dialect = BigQueryDialect {};
for (sql, expected, expected_unescaped) in [
// Empty string
(format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
// Should not count escaped quote as end of string.
(
format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
format!(r#"ab{q}{q}\{q}{q}cd"#),
format!(r#"ab{q}{q}{q}{q}cd"#),
),
// Simple string
(
format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
"abc".into(),
"abc".into(),
),
// Mix single-double quotes unescaped.
(
format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
),
// Escaped quote.
(
format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
),
// backslash-escaped quote characters.
(
format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
r#"a\'\'b\'c\'d"#.into(),
r#"a''b'c'd"#.into(),
),
// backslash-escaped characters
(
format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
r#"abc\0\n\rdef"#.into(),
"abc\0\n\rdef".into(),
),
] {
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [
format!(r#"{q}{q}{q}{q}{q}\{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
format!(r#"{q}{q}{q}{q}"#),
format!(r#"{q}{q}{q}{r}{r}"#),
format!(r#"{q}{q}{q}abc{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}"#),
format!(r#"{q}{q}{q}abc"#),
] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
}
check('"', '\'', Token::TripleDoubleQuotedString);
check('\'', '"', Token::TripleSingleQuotedString);
let dialect = BigQueryDialect {};
let sql = r#"""''"#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::DoubleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string()),
];
compare(expected, tokens);
let sql = r#"''"""#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::SingleQuotedString("".to_string()),
Token::DoubleQuotedString("".to_string()),
];
compare(expected, tokens);
// Non-triple quoted string dialect
let dialect = SnowflakeDialect {};
let sql = r#"''''''"#;
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("''".to_string())];
compare(expected, tokens);
}
}
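Finally, an end-to-end sketch combining the pieces, again assuming the sqlparser crate with this commit applied:

use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = BigQueryDialect {};
    // Quote runs shorter than three do not terminate a triple-quoted
    // literal; they are kept as part of the contents.
    let tokens = Tokenizer::new(&dialect, "'''ab''cd'''").tokenize().unwrap();
    assert_eq!(
        tokens,
        vec![Token::TripleSingleQuotedString("ab''cd".to_string())]
    );
    // Raw triple-quoted literals keep backslashes verbatim.
    let tokens = Tokenizer::new(&dialect, r"R'''a\nb'''").tokenize().unwrap();
    assert_eq!(
        tokens,
        vec![Token::TripleSingleQuotedRawStringLiteral(r"a\nb".to_string())]
    );
}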