Mirror of https://github.com/apache/datafusion-sqlparser-rs.git, synced 2025-09-23 22:22:28 +00:00
Support for postgres String Constants with Unicode Escapes (#1355)
commit bc15f7b4ce
parent c3ba2f33c6
7 changed files with 180 additions and 0 deletions

@@ -52,6 +52,10 @@ pub enum Value {
     /// See [Postgres docs](https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS)
     /// for more details.
     EscapedStringLiteral(String),
+    /// u&'string value' (postgres extension)
+    /// See [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    /// for more details.
+    UnicodeStringLiteral(String),
     /// B'string value'
     SingleQuotedByteStringLiteral(String),
     /// B"string value"

@@ -102,6 +106,7 @@ impl fmt::Display for Value {
             }
             Value::DollarQuotedString(v) => write!(f, "{v}"),
             Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
+            Value::UnicodeStringLiteral(v) => write!(f, "U&'{}'", escape_unicode_string(v)),
             Value::NationalStringLiteral(v) => write!(f, "N'{v}'"),
             Value::HexStringLiteral(v) => write!(f, "X'{v}'"),
             Value::Boolean(v) => write!(f, "{v}"),
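
Taken together, the two hunks above give U&'…' literals a clean parse/display round trip: escapes are decoded into the stored String, and Display re-encodes them. A minimal sketch of that round trip, not part of the diff, assuming a sqlparser release that includes this change (where Expr::Value still wraps Value directly):

```rust
use sqlparser::ast::{Expr, Value};
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::parser::{Parser, ParserError};

fn main() -> Result<(), ParserError> {
    let dialect = PostgreSqlDialect {};
    // The tokenizer decodes the \XXXX escapes, so the AST holds the real string...
    let expr = Parser::new(&dialect)
        .try_with_sql(r#"U&'\0441\043B\043E\043D'"#)?
        .parse_expr()?;
    assert_eq!(
        expr,
        Expr::Value(Value::UnicodeStringLiteral("слон".to_string()))
    );
    // ...and Display re-escapes the non-ASCII characters back into the U&'...' form.
    assert_eq!(expr.to_string(), r#"U&'\0441\043B\043E\043D'"#);
    Ok(())
}
```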

@@ -347,6 +352,41 @@ pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
     EscapeEscapedStringLiteral(s)
 }
 
+pub struct EscapeUnicodeStringLiteral<'a>(&'a str);
+
+impl<'a> fmt::Display for EscapeUnicodeStringLiteral<'a> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        for c in self.0.chars() {
+            match c {
+                '\'' => {
+                    write!(f, "''")?;
+                }
+                '\\' => {
+                    write!(f, r#"\\"#)?;
+                }
+                x if x.is_ascii() => {
+                    write!(f, "{}", c)?;
+                }
+                _ => {
+                    let codepoint = c as u32;
+                    // if the character fits in 16 bits, we can use the \XXXX format;
+                    // otherwise, we need to use the \+XXXXXX format
+                    if codepoint <= 0xFFFF {
+                        write!(f, "\\{:04X}", codepoint)?;
+                    } else {
+                        write!(f, "\\+{:06X}", codepoint)?;
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+pub fn escape_unicode_string(s: &str) -> EscapeUnicodeStringLiteral<'_> {
+    EscapeUnicodeStringLiteral(s)
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
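
The escaping rules above: ASCII passes through, quotes and backslashes are doubled, code points up to U+FFFF become \XXXX, and anything higher becomes \+XXXXXX. A quick sketch (not part of the diff) exercising them through Value's Display, which goes through escape_unicode_string:

```rust
use sqlparser::ast::Value;

fn main() {
    // 'ü' (U+00FC) and 'ß' (U+00DF) use the 4-digit form, the elephant
    // (U+1F418) needs the 6-digit \+ form, and the lone backslash is doubled.
    let v = Value::UnicodeStringLiteral("Grüße \\ 🐘".to_string());
    assert_eq!(v.to_string(), r#"U&'Gr\00FC\00DFe \\ \+01F418'"#);
}
```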

@@ -35,6 +35,10 @@ impl Dialect for GenericDialect {
             || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true
+    }
+
     fn supports_group_by_expr(&self) -> bool {
         true
     }

@@ -145,6 +145,21 @@ pub trait Dialect: Debug + Any {
     fn supports_string_literal_backslash_escape(&self) -> bool {
         false
     }
+
+    /// Determine if the dialect supports string literals with `U&` prefix.
+    /// This is used to specify Unicode code points in string literals.
+    /// For example, in PostgreSQL, the following is a valid string literal:
+    /// ```sql
+    /// SELECT U&'\0061\0062\0063';
+    /// ```
+    /// This is equivalent to the string literal `'abc'`.
+    /// See
+    ///  - [Postgres docs](https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS-UESCAPE)
+    ///  - [H2 docs](http://www.h2database.com/html/grammar.html#string)
+    fn supports_unicode_string_literal(&self) -> bool {
+        false
+    }
+
     /// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
     fn supports_filter_during_aggregation(&self) -> bool {
         false
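
Because the trait method defaults to false, existing dialects are unaffected; a dialect opts in by overriding it. A minimal sketch of a hypothetical custom dialect (MyDialect is illustrative and not part of the crate):

```rust
use sqlparser::dialect::Dialect;

#[derive(Debug)]
struct MyDialect;

impl Dialect for MyDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_'
    }

    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }

    // Opt in to U&'...' literals; every other behavior keeps the trait defaults.
    fn supports_unicode_string_literal(&self) -> bool {
        true
    }
}
```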

@@ -40,6 +40,10 @@ impl Dialect for PostgreSqlDialect {
         ch.is_alphabetic() || ch.is_ascii_digit() || ch == '$' || ch == '_'
     }
 
+    fn supports_unicode_string_literal(&self) -> bool {
+        true
+    }
+
     /// See <https://www.postgresql.org/docs/current/sql-createoperator.html>
     fn is_custom_operator_part(&self, ch: char) -> bool {
         matches!(

@@ -1191,6 +1191,10 @@ impl<'a> Parser<'a> {
                 self.prev_token();
                 Ok(Expr::Value(self.parse_value()?))
             }
+            Token::UnicodeStringLiteral(_) => {
+                self.prev_token();
+                Ok(Expr::Value(self.parse_value()?))
+            }
             Token::Number(_, _)
             | Token::SingleQuotedString(_)
             | Token::DoubleQuotedString(_)

@@ -1868,6 +1872,7 @@ impl<'a> Parser<'a> {
             }
             Token::SingleQuotedString(_)
             | Token::EscapedStringLiteral(_)
+            | Token::UnicodeStringLiteral(_)
             | Token::NationalStringLiteral(_)
             | Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
             _ => self.expected(

@@ -6965,6 +6970,7 @@ impl<'a> Parser<'a> {
             }
             Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
             Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
+            Token::UnicodeStringLiteral(ref s) => Ok(Value::UnicodeStringLiteral(s.to_string())),
             Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
             Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
             tok @ Token::Colon | tok @ Token::AtSign => {

@@ -7056,6 +7062,7 @@ impl<'a> Parser<'a> {
             Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
                 Ok(s)
             }
+            Token::UnicodeStringLiteral(s) => Ok(s),
             _ => self.expected("literal string", next_token),
         }
     }

@@ -94,6 +94,8 @@ pub enum Token {
     NationalStringLiteral(String),
     /// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
     EscapedStringLiteral(String),
+    /// Unicode string literal: i.e: U&'first \000A second'
+    UnicodeStringLiteral(String),
     /// Hexadecimal string literal: i.e.: X'deadbeef'
     HexStringLiteral(String),
     /// Comma

@@ -251,6 +253,7 @@ impl fmt::Display for Token {
             Token::DollarQuotedString(ref s) => write!(f, "{s}"),
             Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
             Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
+            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
             Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
             Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
             Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
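
Note the asymmetry with Value: the token stores the already-decoded string, and the token's Display only restores the U&'…' wrapper without re-escaping. A small sketch (not part of the diff):

```rust
use sqlparser::tokenizer::Token;

fn main() {
    let t = Token::UnicodeStringLiteral("слон".to_string());
    // No re-escaping at the token level, unlike Value::UnicodeStringLiteral.
    assert_eq!(t.to_string(), "U&'слон'");
}
```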

@@ -794,6 +797,23 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
             }
+            // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
+            x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
+                chars.next(); // consume, to check the next char
+                if chars.peek() == Some(&'&') {
+                    // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
+                    let mut chars_clone = chars.peekable.clone();
+                    chars_clone.next(); // consume the '&' in the clone
+                    if chars_clone.peek() == Some(&'\'') {
+                        chars.next(); // consume the '&' in the original iterator
+                        let s = unescape_unicode_single_quoted_string(chars)?;
+                        return Ok(Some(Token::UnicodeStringLiteral(s)));
+                    }
+                }
+                // regular identifier starting with an "U" or "u"
+                let s = self.tokenize_word(x, chars);
+                Ok(Some(Token::make_word(&s, None)))
+            }
             // The spec only allows an uppercase 'X' to introduce a hex
             // string, but PostgreSQL, at least, allows a lowercase 'x' too.
             x @ 'x' | x @ 'X' => {
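
The lookahead above is what keeps identifiers beginning with u/U working: only U& immediately followed by a single quote becomes a Unicode literal. A sketch of the observable tokenizer behavior, assuming a release that includes this change:

```rust
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let dialect = PostgreSqlDialect {};

    // U& followed by a quote: tokenized as a Unicode string literal, already decoded.
    let tokens = Tokenizer::new(&dialect, r#"SELECT U&'\0041'"#).tokenize().unwrap();
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::UnicodeStringLiteral(s) if s == "A")));

    // A plain identifier starting with 'u' still tokenizes as a word.
    let tokens = Tokenizer::new(&dialect, "SELECT usage FROM t").tokenize().unwrap();
    assert!(tokens
        .iter()
        .any(|t| matches!(t, Token::Word(w) if w.value == "usage")));
}
```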

@@ -1797,6 +1817,64 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
     }
 }
+
+fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
+    let mut unescaped = String::new();
+    chars.next(); // consume the opening quote
+    while let Some(c) = chars.next() {
+        match c {
+            '\'' => {
+                if chars.peek() == Some(&'\'') {
+                    chars.next();
+                    unescaped.push('\'');
+                } else {
+                    return Ok(unescaped);
+                }
+            }
+            '\\' => match chars.peek() {
+                Some('\\') => {
+                    chars.next();
+                    unescaped.push('\\');
+                }
+                Some('+') => {
+                    chars.next();
+                    unescaped.push(take_char_from_hex_digits(chars, 6)?);
+                }
+                _ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
+            },
+            _ => {
+                unescaped.push(c);
+            }
+        }
+    }
+    Err(TokenizerError {
+        message: "Unterminated unicode encoded string literal".to_string(),
+        location: chars.location(),
+    })
+}
+
+fn take_char_from_hex_digits(
+    chars: &mut State<'_>,
+    max_digits: usize,
+) -> Result<char, TokenizerError> {
+    let mut result = 0u32;
+    for _ in 0..max_digits {
+        let next_char = chars.next().ok_or_else(|| TokenizerError {
+            message: "Unexpected EOF while parsing hex digit in escaped unicode string."
+                .to_string(),
+            location: chars.location(),
+        })?;
+        let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
+            message: format!("Invalid hex digit in escaped unicode string: {}", next_char),
+            location: chars.location(),
+        })?;
+        result = result * 16 + digit;
+    }
+    char::from_u32(result).ok_or_else(|| TokenizerError {
+        message: format!("Invalid unicode character: {:x}", result),
+        location: chars.location(),
+    })
+}
 
 #[cfg(test)]
 mod tests {
     use super::*;
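
take_char_from_hex_digits consumes exactly 4 or 6 hex digits and folds them into a code point, and char::from_u32 then rejects invalid values such as lone surrogates. The same folding, sketched standalone without the tokenizer's State (hex_to_char is illustrative only and not part of the crate):

```rust
// Standalone sketch of the digit folding done by take_char_from_hex_digits.
fn hex_to_char(digits: &str) -> Option<char> {
    let mut result: u32 = 0;
    for c in digits.chars() {
        result = result * 16 + c.to_digit(16)?;
    }
    char::from_u32(result)
}

fn main() {
    assert_eq!(hex_to_char("0061"), Some('a'));    // U&'\0061'    -> "a"
    assert_eq!(hex_to_char("01F418"), Some('🐘')); // U&'\+01F418' -> elephant
    assert_eq!(hex_to_char("D800"), None);         // lone surrogate is rejected
}
```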

@@ -4441,3 +4441,35 @@ fn test_table_unnest_with_ordinality() {
         _ => panic!("Expecting TableFactor::UNNEST with ordinality"),
     }
 }
+
+#[test]
+fn test_escaped_string_literal() {
+    match pg().verified_expr(r#"E'\n'"#) {
+        Expr::Value(Value::EscapedStringLiteral(s)) => {
+            assert_eq!("\n", s);
+        }
+        _ => unreachable!(),
+    }
+}
+
+#[test]
+fn test_unicode_string_literal() {
+    let pairs = [
+        // Example from the postgres docs
+        (r#"U&'\0441\043B\043E\043D'"#, "слон"),
+        // High unicode code point (> 0xFFFF)
+        (r#"U&'\+01F418'"#, "🐘"),
+        // Escaped backslash
+        (r#"U&'\\'"#, r#"\"#),
+        // Escaped single quote
+        (r#"U&''''"#, "'"),
+    ];
+    for (input, expected) in pairs {
+        match pg_and_generic().verified_expr(input) {
+            Expr::Value(Value::UnicodeStringLiteral(s)) => {
+                assert_eq!(expected, s);
+            }
+            _ => unreachable!(),
+        }
+    }
+}