Mirror of https://github.com/apache/datafusion-sqlparser-rs.git, synced 2025-09-23 22:22:28 +00:00
Support for postgres String Constants with Unicode Escapes (#1355)
commit bc15f7b4ce
parent c3ba2f33c6
7 changed files with 180 additions and 0 deletions

@@ -52,6 +52,10 @@ pub enum Value {
     /// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
     /// for more details.
     EscapedStringLiteral(String),
+    /// u&'string value' (postgres extension)
+    /// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    /// for more details.
+    UnicodeStringLiteral(String),
     /// B'string value'
     SingleQuotedByteStringLiteral(String),
     /// B"string value"

@@ -102,6 +106,7 @@ impl fmt::Display for Value {
             }
             Value::DollarQuotedString(v) => write!(f, "{v}"),
             Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
+            Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)),
             Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
             Value::HexStringLiteral(v) => write!(f, "X'{v}'"),
             Value::Boolean(v) => write!(f, "{v}"),
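
Taken together, the two hunks above give U&'…' literals a clean parse/display round trip: escapes are decoded into the stored String, and Display re-encodes them. A minimal sketch of that round trip, not part of the diff, assuming a sqlparser release that includes this change (where Expr::Value still wraps Value directly):

```rust
use sqlparser::ast::{Expr, Value};
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    let dialect = PostgreSqlDialect {};
    // The tokenizer decodes the \XXXX escapes, so the AST holds the real string...
    let expr = Parser::new(&dialect)
        .try_with_sql(r#"U&'\0441\043B\043E\043D'"#)?
        .parse_expr()?;
    assert_eq!(
        expr,
        Expr::Value(Value::UnicodeStringLiteral("слон".to_string()))
    );
    // ...and Display re-escapes the non-ASCII characters back into the U&'...' form.
    assert_eq!(expr.to_string(), r#"U&'\0441\043B\043E\043D'"#);
    Ok(())
}
```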

@@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
     EscapeEscapedStringLiteral(s)
 }
 
+pub struct EscapeUnicodeStringLiteral<'a>(&'a str);
+
+impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for c in self.0.chars() {
+            match c {
+                '\'' => {
+                    write!(f, "''")?;
+                }
+                '\\' => {
+                    write!(f, r#"\\"#)?;
+                }
+                x if x.is_ascii() => {
+                    write!(f, "{}", c)?;
+                }
+                _ => {
+                    let codepoint = c as u32;
+                    // if the character fits in 16 bits, we can use the \XXXX format;
+                    // otherwise, we need to use the \+XXXXXX format
+                    if codepoint <= 0xFFFF {
+                        write!(f, "\\{:04X}", codepoint)?;
+                    } else {
+                        write!(f, "\\+{:06X}", codepoint)?;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> {
+    EscapeUnicodeStringLiteral(s)
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
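
The escaping rules above: ASCII passes through, quotes and backslashes are doubled, code points up to U+FFFF become \XXXX, and anything higher becomes \+XXXXXX. A quick sketch (not part of the diff) exercising them through Value's Display, which goes through escape_unicode_string:

```rust
use sqlparser::ast::Value;

fn main() {
    // 'ü' (U+00FC) and 'ß' (U+00DF) use the 4-digit form, the elephant
    // (U+1F418) needs the 6-digit \+ form, and the lone backslash is doubled.
    let v = Value::UnicodeStringLiteral("Grüße \\ 🐘".to_string());
    assert_eq!(v.to_string(), r#"U&'Gr\00FC\00DFe \\ \+01F418'"#);
}
```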

@@ -35,6 +35,10 @@ impl Dialect for GenericDialect {
             || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true
+    }
+
     fn supports_group_by_expr(&self) -> bool {
         true
     }

@@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any {
     fn supports_string_literal_backslash_escape(&self) -> bool {
         false
     }
+
+    /// Determine if the dialect supports string literals with `U&` prefix.
+    /// This is used to specify Unicode code points in string literals.
+    /// For example, in PostgreSQL, the following is a valid string literal:
+    /// ```sql
+    /// SELECT U&'\0061\0062\0063';
+    /// ```
+    /// This is equivalent to the string literal `'abc'`.
+    /// See
+    ///  - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    ///  - [H2 docs](http://www.h2database.com/html/grammar.html#string)
+    fn supports_unicode_string_literal(&self) -> bool {
+        false
+    }
+
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
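
Because the trait method defaults to false, existing dialects are unaffected; a dialect opts in by overriding it. A minimal sketch of a hypothetical custom dialect (MyDialect is illustrative and not part of the crate):

```rust
use sqlparser::dialect::Dialect;

#[derive(Debug)]
struct MyDialect;

impl Dialect for MyDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }

    // Opt in to U&'...' literals; every other behavior keeps the trait defaults.
    fn supports_unicode_string_literal(&self) -> bool {
        true
    }
}
```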

@@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect {
         ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true
+    }
+
     /// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
     fn is_custom_operator_part(&self, ch: char) -> bool {
         matches!(

@@ -1191,6 +1191,10 @@ impl<'a> Parser<'a> {
                 self.prev_token();
                 Ok(Expr::Value(self.parse_value()?))
             }
+            Token::UnicodeStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
             Token::Number(_, _)
             | Token::SingleQuotedString(_)
             | Token::DoubleQuotedString(_)

@@ -1868,6 +1872,7 @@ impl<'a> Parser<'a> {
             }
             Token::SingleQuotedString(_)
             | Token::EscapedStringLiteral(_)
+            | Token::UnicodeStringLiteral(_)
             | Token::NationalStringLiteral(_)
             | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
             _ => self.expected(

@@ -6965,6 +6970,7 @@ impl<'a> Parser<'a> {
             }
             Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
             Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
+            Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),
             Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
             Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
             tok @ Token::Colon | tok @ Token::AtSign => {

@@ -7056,6 +7062,7 @@ impl<'a> Parser<'a> {
             Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
                 Ok(s)
             }
+            Token::UnicodeStringLiteral(s) => Ok(s),
             _ => self.expected("literal string", next_token),
         }
     }

@@ -94,6 +94,8 @@ pub enum Token {
     NationalStringLiteral(String),
     /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
     EscapedStringLiteral(String),
+    /// Unicode string literal: i.e: U&'first \000A second'
+    UnicodeStringLiteral(String),
     /// Hexadecimal string literal: i.e.: X'deadbeef'
     HexStringLiteral(String),
     /// Comma

@@ -251,6 +253,7 @@ impl fmt::Display for Token {
             Token::DollarQuotedString(ref s) => write!(f, "{s}"),
             Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
             Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
+            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
             Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
             Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
             Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
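
Note the asymmetry with Value: the token stores the already-decoded string, and the token's Display only restores the U&'…' wrapper without re-escaping. A small sketch (not part of the diff):

```rust
use sqlparser::tokenizer::Token;

fn main() {
    let t = Token::UnicodeStringLiteral("слон".to_string());
    // No re-escaping at the token level, unlike Value::UnicodeStringLiteral.
    assert_eq!(t.to_string(), "U&'слон'");
}
```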

@@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
             }
+            // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
+            x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
+                chars.next(); // consume, to check the next char
+                if chars.peek() == Some(&'&') {
+                    // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
+                    let mut chars_clone = chars.peekable.clone();
+                    chars_clone.next(); // consume the '&' in the clone
+                    if chars_clone.peek() == Some(&'\'') {
+                        chars.next(); // consume the '&' in the original iterator
+                        let s = unescape_unicode_single_quoted_string(chars)?;
+                        return Ok(Some(Token::UnicodeStringLiteral(s)));
+                    }
+                }
+                // regular identifier starting with an "U" or "u"
+                let s = self.tokenize_word(x, chars);
+                Ok(Some(Token::make_word(&s, None)))
+            }
             // The spec only allows an uppercase 'X' to introduce a hex
             // string, but PostgreSQL, at least, allows a lowercase 'x' too.
             x @ 'x' | x @ 'X' => {
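
The lookahead above is what keeps identifiers beginning with u/U working: only U& immediately followed by a single quote becomes a Unicode literal. A sketch of the observable tokenizer behavior, assuming a release that includes this change:

```rust
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = PostgreSqlDialect {};

    // U& followed by a quote: tokenized as a Unicode string literal, already decoded.
    let tokens = Tokenizer::new(&dialect, r#"SELECT U&'\0041'"#).tokenize().unwrap();
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::UnicodeStringLiteral(s) if s == "A")));

    // A plain identifier starting with 'u' still tokenizes as a word.
    let tokens = Tokenizer::new(&dialect, "SELECT usage FROM t").tokenize().unwrap();
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::Word(w) if w.value == "usage")));
}
```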

@@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
     }
 }
+
+fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
+    let mut unescaped = String::new();
+    chars.next(); // consume the opening quote
+    while let Some(c) = chars.next() {
+        match c {
+            '\'' => {
+                if chars.peek() == Some(&'\'') {
+                    chars.next();
+                    unescaped.push('\'');
+                } else {
+                    return Ok(unescaped);
+                }
+            }
+            '\\' => match chars.peek() {
+                Some('\\') => {
+                    chars.next();
+                    unescaped.push('\\');
+                }
+                Some('+') => {
+                    chars.next();
+                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
+                }
+                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
+            },
+            _ => {
+                unescaped.push(c);
+            }
+        }
+    }
+    Err(TokenizerError {
+        message: "Unterminated unicode encoded string literal".to_string(),
+        location: chars.location(),
+    })
+}
+
+fn take_char_from_hex_digits(
+    chars: &mut State<'_>,
+    max_digits: usize,
+) -> Result<char, TokenizerError> {
+    let mut result = 0u32;
+    for _ in 0..max_digits {
+        let next_char = chars.next().ok_or_else(|| TokenizerError {
+            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
+                .to_string(),
+            location: chars.location(),
+        })?;
+        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
+            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
+            location: chars.location(),
+        })?;
+        result = result * 16 + digit;
+    }
+    char::from_u32(result).ok_or_else(|| TokenizerError {
+        message: format!("Invalid unicode character: {:x}", result),
+        location: chars.location(),
+    })
+}
 
 #[cfg(test)]
 mod tests {
     use super::*;
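
take_char_from_hex_digits consumes exactly 4 or 6 hex digits and folds them into a code point, and char::from_u32 then rejects invalid values such as lone surrogates. The same folding, sketched standalone without the tokenizer's State (hex_to_char is illustrative only and not part of the crate):

```rust
// Standalone sketch of the digit folding done by take_char_from_hex_digits.
fn hex_to_char(digits: &str) -> Option<char> {
    let mut result: u32 = 0;
    for c in digits.chars() {
        result = result * 16 + c.to_digit(16)?;
    }
    char::from_u32(result)
}

fn main() {
    assert_eq!(hex_to_char("0061"), Some('a'));    // U&'\0061'    -> "a"
    assert_eq!(hex_to_char("01F418"), Some('🐘')); // U&'\+01F418' -> elephant
    assert_eq!(hex_to_char("D800"), None);         // lone surrogate is rejected
}
```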

@@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() {
         _ => panic!("Expecting TableFactor::UNNEST with ordinality"),
     }
 }
+
+#[test]
+fn test_escaped_string_literal() {
+    match pg().verified_expr(r#"E'\n'"#) {
+        Expr::Value(Value::EscapedStringLiteral(s)) => {
+            assert_eq!("\n", s);
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[test]
+fn test_unicode_string_literal() {
+    let pairs = [
+        // Example from the postgres docs
+        (r#"U&'\0441\043B\043E\043D'"#, "слон"),
+        // High unicode code point (> 0xFFFF)
+        (r#"U&'\+01F418'"#, "🐘"),
+        // Escaped backslash
+        (r#"U&'\\'"#, r#"\"#),
+        // Escaped single quote
+        (r#"U&''''"#, "'"),
+    ];
+    for (input, expected) in pairs {
+        match pg_and_generic().verified_expr(input) {
+            Expr::Value(Value::UnicodeStringLiteral(s)) => {
+                assert_eq!(expected, s);
+            }
+            _ => unreachable!(),
+        }
+    }
+}