Mirror of https://github.com/apache/datafusion-sqlparser-rs.git (synced 2025-10-09 13:40:22 +00:00)

Support triple quoted strings (#1262)

This commit is contained in:
parent e3692f4681
commit 036a4120b4
6 changed files with 592 additions and 62 deletions
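A minimal usage sketch (not part of this commit), assuming a sqlparser build that includes this change: with BigQueryDialect, single- and double-quoted triple-quoted literals should tokenize, parse, and print back unchanged.

use sqlparser::dialect::BigQueryDialect;
use sqlparser::parser::Parser;

fn main() {
    // Embedded quotes need no escaping inside a triple-quoted literal.
    let sql = "SELECT '''it's a string''', \"\"\"another \"quoted\" string\"\"\"";
    let statements = Parser::parse_sql(&BigQueryDialect {}, sql).unwrap();
    assert_eq!(statements[0].to_string(), sql);
}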
@@ -42,6 +42,12 @@ pub enum Value {
    SingleQuotedString(String),
    // $<tag_name>$string value$<tag_name>$ (postgres syntax)
    DollarQuotedString(DollarQuotedString),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// e'string value' (postgres extension)
    /// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
    /// for more details.

@@ -50,9 +56,24 @@ pub enum Value {
    SingleQuotedByteStringLiteral(String),
    /// B"string value"
    DoubleQuotedByteStringLiteral(String),
    /// R'string value' or r'string value' or r"string value"
    /// <https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals>
    RawStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// N'string value'
    NationalStringLiteral(String),
    /// X'hex value'

@@ -73,6 +94,12 @@ impl fmt::Display for Value {
            Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
            Value::DoubleQuotedString(v) => write!(f, "\"{}\"", escape_double_quote_string(v)),
            Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
            Value::TripleSingleQuotedString(v) => {
                write!(f, "'''{v}'''")
            }
            Value::TripleDoubleQuotedString(v) => {
                write!(f, r#""""{v}""""#)
            }
            Value::DollarQuotedString(v) => write!(f, "{v}"),
            Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
            Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),

@@ -80,7 +107,12 @@ impl fmt::Display for Value {
            Value::Boolean(v) => write!(f, "{v}"),
            Value::SingleQuotedByteStringLiteral(v) => write!(f, "B'{v}'"),
            Value::DoubleQuotedByteStringLiteral(v) => write!(f, "B\"{v}\""),
            Value::RawStringLiteral(v) => write!(f, "R'{v}'"),
            Value::TripleSingleQuotedByteStringLiteral(v) => write!(f, "B'''{v}'''"),
            Value::TripleDoubleQuotedByteStringLiteral(v) => write!(f, r#"B"""{v}""""#),
            Value::SingleQuotedRawStringLiteral(v) => write!(f, "R'{v}'"),
            Value::DoubleQuotedRawStringLiteral(v) => write!(f, "R\"{v}\""),
            Value::TripleSingleQuotedRawStringLiteral(v) => write!(f, "R'''{v}'''"),
            Value::TripleDoubleQuotedRawStringLiteral(v) => write!(f, r#"R"""{v}""""#),
            Value::Null => write!(f, "NULL"),
            Value::Placeholder(v) => write!(f, "{v}"),
        }
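A short sketch (not from the diff) of how the new Value variants round-trip through Display, following the arms above:

use sqlparser::ast::Value;

fn main() {
    assert_eq!(
        Value::TripleSingleQuotedString("abc".into()).to_string(),
        "'''abc'''"
    );
    assert_eq!(
        Value::TripleDoubleQuotedRawStringLiteral("abc".into()).to_string(),
        r#"R"""abc""""#
    );
}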
@@ -30,6 +30,11 @@ impl Dialect for BigQueryDialect {
        ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
    }

    /// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    fn supports_triple_quoted_string(&self) -> bool {
        true
    }

    /// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/navigation_functions#first_value)
    fn supports_window_function_null_treatment_arg(&self) -> bool {
        true
@@ -231,6 +231,11 @@ pub trait Dialect: Debug + Any {
    fn convert_type_before_value(&self) -> bool {
        false
    }
    /// Returns true if the dialect supports triple quoted string
    /// e.g. `"""abc"""`
    fn supports_triple_quoted_string(&self) -> bool {
        false
    }
    /// Dialect-specific prefix parser override
    fn parse_prefix(&self, _parser: &mut Parser) -> Option<Result<Expr, ParserError>> {
        // return None to fall back to the default behavior
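Since the new trait method defaults to false, existing dialects are unaffected; a third-party dialect could opt in by overriding it. A sketch (MyDialect is hypothetical; only the two required identifier methods plus the new override are shown):

use sqlparser::dialect::Dialect;

#[derive(Debug)]
struct MyDialect;

impl Dialect for MyDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }
    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }
    // Opt in to '''...''' and """...""" literals.
    fn supports_triple_quoted_string(&self) -> bool {
        true
    }
}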
@@ -1187,10 +1187,17 @@ impl<'a> Parser<'a> {
            Token::Number(_, _)
            | Token::SingleQuotedString(_)
            | Token::DoubleQuotedString(_)
            | Token::TripleSingleQuotedString(_)
            | Token::TripleDoubleQuotedString(_)
            | Token::DollarQuotedString(_)
            | Token::SingleQuotedByteStringLiteral(_)
            | Token::DoubleQuotedByteStringLiteral(_)
            | Token::RawStringLiteral(_)
            | Token::TripleSingleQuotedByteStringLiteral(_)
            | Token::TripleDoubleQuotedByteStringLiteral(_)
            | Token::SingleQuotedRawStringLiteral(_)
            | Token::DoubleQuotedRawStringLiteral(_)
            | Token::TripleSingleQuotedRawStringLiteral(_)
            | Token::TripleDoubleQuotedRawStringLiteral(_)
            | Token::NationalStringLiteral(_)
            | Token::HexStringLiteral(_) => {
                self.prev_token();

@@ -6425,6 +6432,12 @@ impl<'a> Parser<'a> {
            },
            Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
            Token::DoubleQuotedString(ref s) => Ok(Value::DoubleQuotedString(s.to_string())),
            Token::TripleSingleQuotedString(ref s) => {
                Ok(Value::TripleSingleQuotedString(s.to_string()))
            }
            Token::TripleDoubleQuotedString(ref s) => {
                Ok(Value::TripleDoubleQuotedString(s.to_string()))
            }
            Token::DollarQuotedString(ref s) => Ok(Value::DollarQuotedString(s.clone())),
            Token::SingleQuotedByteStringLiteral(ref s) => {
                Ok(Value::SingleQuotedByteStringLiteral(s.clone()))

@@ -6432,7 +6445,24 @@ impl<'a> Parser<'a> {
            Token::DoubleQuotedByteStringLiteral(ref s) => {
                Ok(Value::DoubleQuotedByteStringLiteral(s.clone()))
            }
            Token::RawStringLiteral(ref s) => Ok(Value::RawStringLiteral(s.clone())),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => {
                Ok(Value::TripleSingleQuotedByteStringLiteral(s.clone()))
            }
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => {
                Ok(Value::TripleDoubleQuotedByteStringLiteral(s.clone()))
            }
            Token::SingleQuotedRawStringLiteral(ref s) => {
                Ok(Value::SingleQuotedRawStringLiteral(s.clone()))
            }
            Token::DoubleQuotedRawStringLiteral(ref s) => {
                Ok(Value::DoubleQuotedRawStringLiteral(s.clone()))
            }
            Token::TripleSingleQuotedRawStringLiteral(ref s) => {
                Ok(Value::TripleSingleQuotedRawStringLiteral(s.clone()))
            }
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => {
                Ok(Value::TripleDoubleQuotedRawStringLiteral(s.clone()))
            }
            Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
            Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
            Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
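A sketch (not part of the diff) of the AST shape the parser now produces for a triple-quoted literal, following the parse_value arms above:

use sqlparser::ast::{Expr, SelectItem, SetExpr, Statement, Value};
use sqlparser::dialect::BigQueryDialect;
use sqlparser::parser::Parser;

fn main() {
    let stmts = Parser::parse_sql(&BigQueryDialect {}, "SELECT '''abc'''").unwrap();
    if let Statement::Query(query) = &stmts[0] {
        if let SetExpr::Select(select) = query.body.as_ref() {
            if let SelectItem::UnnamedExpr(Expr::Value(v)) = &select.projection[0] {
                // The literal arrives as the new triple-quoted variant.
                assert_eq!(*v, Value::TripleSingleQuotedString("abc".to_string()));
            }
        }
    }
}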
394 src/tokenizer.rs
@@ -26,6 +26,7 @@ use alloc::{
};
use core::fmt;
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;

#[cfg(feature = "serde")]

@@ -58,6 +59,12 @@ pub enum Token {
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as

@@ -65,8 +72,24 @@ pub enum Token {
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Raw string literal: i.e: r'string' or R'string' or r"string" or R"string"
    RawStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'

@@ -218,14 +241,21 @@ impl fmt::Display for Token {
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::RawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
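A sketch (not from the diff) of the tokens the new variants correspond to, assuming BigQueryDialect, which enables the feature:

use sqlparser::dialect::BigQueryDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = BigQueryDialect {};
    let tokens = Tokenizer::new(&dialect, r#"'''abc''' """def""""#)
        .tokenize()
        .unwrap();
    assert_eq!(tokens[0], Token::TripleSingleQuotedString("abc".to_string()));
    // tokens[1] is the whitespace between the two literals.
    assert_eq!(tokens[2], Token::TripleDoubleQuotedString("def".to_string()));
}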
@@ -490,6 +520,32 @@ impl<'a> State<'a> {
    }
}

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quotes characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example: given initial string `"""abc"""`. If the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 1, flagging to look to consume only 2 leading quotes.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters
    /// e.g `'abc\ndef\'ghi'
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
@@ -639,11 +695,31 @@ impl<'a> Tokenizer<'a> {
                chars.next(); // consume
                match chars.peek() {
                    Some('\'') => {
                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                        if self.dialect.supports_triple_quoted_string() {
                            return self
                                .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                    chars,
                                    '\'',
                                    false,
                                    Token::SingleQuotedByteStringLiteral,
                                    Token::TripleSingleQuotedByteStringLiteral,
                                );
                        }
                        let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                        Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                    }
                    Some('\"') => {
                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                        if self.dialect.supports_triple_quoted_string() {
                            return self
                                .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                    chars,
                                    '"',
                                    false,
                                    Token::DoubleQuotedByteStringLiteral,
                                    Token::TripleDoubleQuotedByteStringLiteral,
                                );
                        }
                        let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                        Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                    }
                    _ => {

@@ -657,14 +733,22 @@ impl<'a> Tokenizer<'a> {
            b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                chars.next(); // consume
                match chars.peek() {
                    Some('\'') => {
                        let s = self.tokenize_quoted_string(chars, '\'', false)?;
                        Ok(Some(Token::RawStringLiteral(s)))
                    }
                    Some('\"') => {
                        let s = self.tokenize_quoted_string(chars, '\"', false)?;
                        Ok(Some(Token::RawStringLiteral(s)))
                    }
                    Some('\'') => self
                        .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                            chars,
                            '\'',
                            false,
                            Token::SingleQuotedRawStringLiteral,
                            Token::TripleSingleQuotedRawStringLiteral,
                        ),
                    Some('\"') => self
                        .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                            chars,
                            '"',
                            false,
                            Token::DoubleQuotedRawStringLiteral,
                            Token::TripleDoubleQuotedRawStringLiteral,
                        ),
                    _ => {
                        // regular identifier starting with an "r" or "R"
                        let s = self.tokenize_word(b, chars);

@@ -678,7 +762,7 @@ impl<'a> Tokenizer<'a> {
                match chars.peek() {
                    Some('\'') => {
                        // N'...' - a <national character string literal>
                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                        let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                        Ok(Some(Token::NationalStringLiteral(s)))
                    }
                    _ => {

@@ -712,7 +796,7 @@ impl<'a> Tokenizer<'a> {
                match chars.peek() {
                    Some('\'') => {
                        // X'...' - a <binary string literal>
                        let s = self.tokenize_quoted_string(chars, '\'', true)?;
                        let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                        Ok(Some(Token::HexStringLiteral(s)))
                    }
                    _ => {

@@ -724,7 +808,17 @@ impl<'a> Tokenizer<'a> {
            }
            // single quoted string
            '\'' => {
                let s = self.tokenize_quoted_string(
                if self.dialect.supports_triple_quoted_string() {
                    return self
                        .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                            chars,
                            '\'',
                            self.dialect.supports_string_literal_backslash_escape(),
                            Token::SingleQuotedString,
                            Token::TripleSingleQuotedString,
                        );
                }
                let s = self.tokenize_single_quoted_string(
                    chars,
                    '\'',
                    self.dialect.supports_string_literal_backslash_escape(),

@@ -736,7 +830,17 @@ impl<'a> Tokenizer<'a> {
            '\"' if !self.dialect.is_delimited_identifier_start(ch)
                && !self.dialect.is_identifier_start(ch) =>
            {
                let s = self.tokenize_quoted_string(
                if self.dialect.supports_triple_quoted_string() {
                    return self
                        .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                            chars,
                            '"',
                            self.dialect.supports_string_literal_backslash_escape(),
                            Token::DoubleQuotedString,
                            Token::TripleDoubleQuotedString,
                        );
                }
                let s = self.tokenize_single_quoted_string(
                    chars,
                    '"',
                    self.dialect.supports_string_literal_backslash_escape(),
@@ -1246,23 +1350,128 @@ impl<'a> Tokenizer<'a> {
        self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
    }

    /// Read a single quoted string, starting with the opening quote.
    fn tokenize_quoted_string(
    /// Reads a string literal quoted by a single or triple quote characters.
    /// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
    fn tokenize_single_or_triple_quoted_string<F>(
        &self,
        chars: &mut State,
        quote_style: char,
        allow_escape: bool,
        backslash_escape: bool,
        single_quote_token: F,
        triple_quote_token: F,
    ) -> Result<Option<Token>, TokenizerError>
    where
        F: Fn(String) -> Token,
    {
        let error_loc = chars.location();

        let mut num_opening_quotes = 0u8;
        for _ in 0..3 {
            if Some(&quote_style) == chars.peek() {
                chars.next(); // Consume quote.
                num_opening_quotes += 1;
            } else {
                break;
            }
        }

        let (token_fn, num_quote_chars) = match num_opening_quotes {
            1 => (single_quote_token, NumStringQuoteChars::One),
            2 => {
                // If we matched double quotes, then this is an empty string.
                return Ok(Some(single_quote_token("".into())));
            }
            3 => {
                let Some(num_quote_chars) = NonZeroU8::new(3) else {
                    return self.tokenizer_error(error_loc, "invalid number of opening quotes");
                };
                (
                    triple_quote_token,
                    NumStringQuoteChars::Many(num_quote_chars),
                )
            }
            _ => {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        };

        let settings = TokenizeQuotedStringSettings {
            quote_style,
            num_quote_chars,
            num_opening_quotes_to_consume: 0,
            backslash_escape,
        };

        self.tokenize_quoted_string(chars, settings)
            .map(token_fn)
            .map(Some)
    }

    /// Reads a string literal quoted by a single quote character.
    fn tokenize_single_quoted_string(
        &self,
        chars: &mut State,
        quote_style: char,
        backslash_escape: bool,
    ) -> Result<String, TokenizerError> {
        self.tokenize_quoted_string(
            chars,
            TokenizeQuotedStringSettings {
                quote_style,
                num_quote_chars: NumStringQuoteChars::One,
                num_opening_quotes_to_consume: 1,
                backslash_escape,
            },
        )
    }

    /// Read a quoted string.
    fn tokenize_quoted_string(
        &self,
        chars: &mut State,
        settings: TokenizeQuotedStringSettings,
    ) -> Result<String, TokenizerError> {
        let mut s = String::new();
        let error_loc = chars.location();

        chars.next(); // consume the opening quote
        // Consume any opening quotes.
        for _ in 0..settings.num_opening_quotes_to_consume {
            if Some(settings.quote_style) != chars.next() {
                return self.tokenizer_error(error_loc, "invalid string literal opening");
            }
        }

        let mut num_consecutive_quotes = 0;
        while let Some(&ch) = chars.peek() {
            let pending_final_quote = match settings.num_quote_chars {
                NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
                n @ NumStringQuoteChars::Many(count)
                    if num_consecutive_quotes + 1 == count.get() =>
                {
                    Some(n)
                }
                NumStringQuoteChars::Many(_) => None,
            };

            match ch {
                char if char == quote_style => {
                char if char == settings.quote_style && pending_final_quote.is_some() => {
                    chars.next(); // consume
                    if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {

                    if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
                        // For an initial string like `"""abc"""`, at this point we have
                        // `abc""` in the buffer and have now matched the final `"`.
                        // However, the string to return is simply `abc`, so we strip off
                        // the trailing quotes before returning.
                        let mut buf = s.chars();
                        for _ in 1..count.get() {
                            buf.next_back();
                        }
                        return Ok(buf.as_str().to_string());
                    } else if chars
                        .peek()
                        .map(|c| *c == settings.quote_style)
                        .unwrap_or(false)
                    {
                        s.push(ch);
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely

@@ -1273,10 +1482,12 @@ impl<'a> Tokenizer<'a> {
                            return Ok(s);
                        }
                    }
                '\\' if allow_escape => {
                '\\' if settings.backslash_escape => {
                    // consume backslash
                    chars.next();

                    num_consecutive_quotes = 0;

                    if let Some(next) = chars.peek() {
                        if !self.unescape {
                            // In no-escape mode, the given query has to be saved completely including backslashes.

@@ -1300,8 +1511,15 @@ impl<'a> Tokenizer<'a> {
                        }
                    }
                }
                _ => {
                    chars.next(); // consume
                ch => {
                    chars.next(); // consume ch

                    if ch == settings.quote_style {
                        num_consecutive_quotes += 1;
                    } else {
                        num_consecutive_quotes = 0;
                    }

                    s.push(ch);
                }
            }
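A standalone sketch (not part of the diff) of the trailing-quote stripping idea used in tokenize_quoted_string above: when the final closing quote of a triple-quoted literal is matched, the buffer still holds the previously pushed closing quotes, so the last count - 1 characters are dropped before returning.

fn strip_trailing_quotes(buf: &str, num_quote_chars: u8) -> String {
    let mut chars = buf.chars();
    // Drop the closing quotes that were already pushed into the buffer.
    for _ in 1..num_quote_chars {
        chars.next_back();
    }
    chars.as_str().to_string()
}

fn main() {
    // For input `"""abc"""`, the loop has accumulated `abc""` when the last quote is seen.
    assert_eq!(strip_trailing_quotes("abc\"\"", 3), "abc");
}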
@@ -2466,6 +2684,7 @@ mod tests {

    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),

@@ -2480,8 +2699,6 @@ mod tests {
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
        ] {
            let dialect = BigQueryDialect {};

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()

@@ -2498,7 +2715,6 @@ mod tests {
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let dialect = BigQueryDialect {};
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",

@@ -2516,4 +2732,124 @@ mod tests {
            compare(expected, tokens);
        }
    }

    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
}
@@ -18,14 +18,30 @@ use std::ops::Deref;

use sqlparser::ast::*;
use sqlparser::dialect::{BigQueryDialect, GenericDialect};
use sqlparser::parser::ParserError;
use sqlparser::parser::{ParserError, ParserOptions};
use test_utils::*;

#[test]
fn parse_literal_string() {
    let sql = r#"SELECT 'single', "double""#;
    let select = bigquery().verified_only_select(sql);
    assert_eq!(2, select.projection.len());
    let sql = concat!(
        "SELECT ",
        "'single', ",
        r#""double", "#,
        "'''triple-single''', ",
        r#""""triple-double""", "#,
        r#"'single\'escaped', "#,
        r#"'''triple-single\'escaped''', "#,
        r#"'''triple-single'unescaped''', "#,
        r#""double\"escaped", "#,
        r#""""triple-double\"escaped""", "#,
        r#""""triple-double"unescaped""""#,
    );
    let dialect = TestedDialects {
        dialects: vec![Box::new(BigQueryDialect {})],
        options: Some(ParserOptions::new().with_unescape(false)),
    };
    let select = dialect.verified_only_select(sql);
    assert_eq!(10, select.projection.len());
    assert_eq!(
        &Expr::Value(Value::SingleQuotedString("single".to_string())),
        expr_from_projection(&select.projection[0])

@@ -34,56 +50,162 @@ fn parse_literal_string() {
        &Expr::Value(Value::DoubleQuotedString("double".to_string())),
        expr_from_projection(&select.projection[1])
    );
    assert_eq!(
        &Expr::Value(Value::TripleSingleQuotedString("triple-single".to_string())),
        expr_from_projection(&select.projection[2])
    );
    assert_eq!(
        &Expr::Value(Value::TripleDoubleQuotedString("triple-double".to_string())),
        expr_from_projection(&select.projection[3])
    );
    assert_eq!(
        &Expr::Value(Value::SingleQuotedString(r#"single\'escaped"#.to_string())),
        expr_from_projection(&select.projection[4])
    );
    assert_eq!(
        &Expr::Value(Value::TripleSingleQuotedString(
            r#"triple-single\'escaped"#.to_string()
        )),
        expr_from_projection(&select.projection[5])
    );
    assert_eq!(
        &Expr::Value(Value::TripleSingleQuotedString(
            r#"triple-single'unescaped"#.to_string()
        )),
        expr_from_projection(&select.projection[6])
    );
    assert_eq!(
        &Expr::Value(Value::DoubleQuotedString(r#"double\"escaped"#.to_string())),
        expr_from_projection(&select.projection[7])
    );
    assert_eq!(
        &Expr::Value(Value::TripleDoubleQuotedString(
            r#"triple-double\"escaped"#.to_string()
        )),
        expr_from_projection(&select.projection[8])
    );
    assert_eq!(
        &Expr::Value(Value::TripleDoubleQuotedString(
            r#"triple-double"unescaped"#.to_string()
        )),
        expr_from_projection(&select.projection[9])
    );
}

#[test]
fn parse_byte_literal() {
    let sql = r#"SELECT B'abc', B"abc""#;
    let select = bigquery().verified_only_select(sql);
    assert_eq!(2, select.projection.len());
    assert_eq!(
        &Expr::Value(Value::SingleQuotedByteStringLiteral("abc".to_string())),
        expr_from_projection(&select.projection[0])
    );
    assert_eq!(
        &Expr::Value(Value::DoubleQuotedByteStringLiteral("abc".to_string())),
        expr_from_projection(&select.projection[1])
    let sql = concat!(
        "SELECT ",
        "B'abc', ",
        r#"B"abc", "#,
        r#"B'f\(abc,(.*),def\)', "#,
        r#"B"f\(abc,(.*),def\)", "#,
        r#"B'''abc''', "#,
        r#"B"""abc""""#,
    );
    let stmt = bigquery().verified_stmt(sql);
    if let Statement::Query(query) = stmt {
        if let SetExpr::Select(select) = *query.body {
            assert_eq!(6, select.projection.len());
            assert_eq!(
                &Expr::Value(Value::SingleQuotedByteStringLiteral("abc".to_string())),
                expr_from_projection(&select.projection[0])
            );
            assert_eq!(
                &Expr::Value(Value::DoubleQuotedByteStringLiteral("abc".to_string())),
                expr_from_projection(&select.projection[1])
            );
            assert_eq!(
                &Expr::Value(Value::SingleQuotedByteStringLiteral(
                    r"f\(abc,(.*),def\)".to_string()
                )),
                expr_from_projection(&select.projection[2])
            );
            assert_eq!(
                &Expr::Value(Value::DoubleQuotedByteStringLiteral(
                    r"f\(abc,(.*),def\)".to_string()
                )),
                expr_from_projection(&select.projection[3])
            );
            assert_eq!(
                &Expr::Value(Value::TripleSingleQuotedByteStringLiteral(
                    r"abc".to_string()
                )),
                expr_from_projection(&select.projection[4])
            );
            assert_eq!(
                &Expr::Value(Value::TripleDoubleQuotedByteStringLiteral(
                    r"abc".to_string()
                )),
                expr_from_projection(&select.projection[5])
            );
        }
    } else {
        panic!("invalid query");
    }

    let sql = r#"SELECT b'abc', b"abc""#;
    bigquery().one_statement_parses_to(sql, r#"SELECT B'abc', B"abc""#);
    bigquery().one_statement_parses_to(
        r#"SELECT b'123', b"123", b'''123''', b"""123""""#,
        r#"SELECT B'123', B"123", B'''123''', B"""123""""#,
    );
}

#[test]
fn parse_raw_literal() {
    let sql = r#"SELECT R'abc', R"abc", R'f\(abc,(.*),def\)', R"f\(abc,(.*),def\)""#;
    let stmt = bigquery().one_statement_parses_to(
        sql,
        r"SELECT R'abc', R'abc', R'f\(abc,(.*),def\)', R'f\(abc,(.*),def\)'",
    let sql = concat!(
        "SELECT ",
        "R'abc', ",
        r#"R"abc", "#,
        r#"R'f\(abc,(.*),def\)', "#,
        r#"R"f\(abc,(.*),def\)", "#,
        r#"R'''abc''', "#,
        r#"R"""abc""""#,
    );
    let stmt = bigquery().verified_stmt(sql);
    if let Statement::Query(query) = stmt {
        if let SetExpr::Select(select) = *query.body {
            assert_eq!(4, select.projection.len());
            assert_eq!(6, select.projection.len());
            assert_eq!(
                &Expr::Value(Value::RawStringLiteral("abc".to_string())),
                &Expr::Value(Value::SingleQuotedRawStringLiteral("abc".to_string())),
                expr_from_projection(&select.projection[0])
            );
            assert_eq!(
                &Expr::Value(Value::RawStringLiteral("abc".to_string())),
                &Expr::Value(Value::DoubleQuotedRawStringLiteral("abc".to_string())),
                expr_from_projection(&select.projection[1])
            );
            assert_eq!(
                &Expr::Value(Value::RawStringLiteral(r"f\(abc,(.*),def\)".to_string())),
                &Expr::Value(Value::SingleQuotedRawStringLiteral(
                    r"f\(abc,(.*),def\)".to_string()
                )),
                expr_from_projection(&select.projection[2])
            );
            assert_eq!(
                &Expr::Value(Value::RawStringLiteral(r"f\(abc,(.*),def\)".to_string())),
                &Expr::Value(Value::DoubleQuotedRawStringLiteral(
                    r"f\(abc,(.*),def\)".to_string()
                )),
                expr_from_projection(&select.projection[3])
            );
            return;
            assert_eq!(
                &Expr::Value(Value::TripleSingleQuotedRawStringLiteral(
                    r"abc".to_string()
                )),
                expr_from_projection(&select.projection[4])
            );
            assert_eq!(
                &Expr::Value(Value::TripleDoubleQuotedRawStringLiteral(
                    r"abc".to_string()
                )),
                expr_from_projection(&select.projection[5])
            );
        }
    } else {
        panic!("invalid query");
    }
    panic!("invalid query")

    bigquery().one_statement_parses_to(
        r#"SELECT r'123', r"123", r'''123''', r"""123""""#,
        r#"SELECT R'123', R"123", R'''123''', R"""123""""#,
    );
}

#[test]
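A sketch mirroring the one_statement_parses_to cases above (assumption: the same normalization applies outside the test helpers): lowercase b/r prefixes are printed back uppercase when the parsed statement is re-serialized.

use sqlparser::dialect::BigQueryDialect;
use sqlparser::parser::Parser;

fn main() {
    let stmts =
        Parser::parse_sql(&BigQueryDialect {}, r#"SELECT b'''123''', r"""abc""""#).unwrap();
    // The AST re-serializes with canonical uppercase prefixes.
    assert_eq!(stmts[0].to_string(), r#"SELECT B'''123''', R"""abc""""#);
}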