Support MySQL Character Set Introducers (#788)

* MySQL Character Set Introducers
* Documentation fix
* Parsing string introducer from Token::Word
* Fixed lint
* fix clippy

---------

Co-authored-by: Maciej Skrzypkowski <maciej.skrzypkowski@satoricyber.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
2025-10-28 05:59:48 +00:00 · 2023-02-17 19:38:43 +01:00 · 2023-02-17 19:38:43 +01:00 · 488e8a8156
commit 488e8a8156
parent b31ede7733
4 changed files with 77 additions and 5 deletions
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@@ -437,6 +437,8 @@ pub enum Expr {
    Nested(Box<Expr>),
    /// A literal value, such as string, number, date or NULL
    Value(Value),
+    /// <https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html>
+    IntroducedString { introducer: String, value: Value },
    /// A constant of form `<data_type> 'value'`.
    /// This can represent ANSI SQL `DATE`, `TIME`, and `TIMESTAMP` literals (such as `DATE '2020-01-01'`),
    /// as well as constants of other types (a non-standard PostgreSQL extension).
@@ -696,6 +698,7 @@ impl fmt::Display for Expr {
            Expr::Collate { expr, collation } => write!(f, "{expr} COLLATE {collation}"),
            Expr::Nested(ast) => write!(f, "({ast})"),
            Expr::Value(v) => write!(f, "{v}"),
+            Expr::IntroducedString { introducer, value } => write!(f, "{introducer} {value}"),
            Expr::TypedString { data_type, value } => {
                write!(f, "{data_type}")?;
                write!(f, " '{}'", &value::escape_single_quote_string(value))
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -734,6 +734,17 @@ impl<'a> Parser<'a> {
                            Ok(Expr::CompoundIdentifier(id_parts))
                        }
                    }
+                    // string introducer https://dev.mysql.com/doc/refman/8.0/en/charset-introducer.html
+                    Token::SingleQuotedString(_)
+                    | Token::DoubleQuotedString(_)
+                    | Token::HexStringLiteral(_)
+                        if w.value.starts_with('_') =>
+                    {
+                        Ok(Expr::IntroducedString {
+                            introducer: w.value,
+                            value: self.parse_introduced_string_value()?,
+                        })
+                    }
                    _ => Ok(Expr::Identifier(w.to_ident())),
                },
            }, // End of Token::Word
@@ -784,7 +795,6 @@ impl<'a> Parser<'a> {
                self.prev_token();
                Ok(Expr::Value(self.parse_value()?))
            }
-
            Token::LParen => {
                let expr =
                    if self.parse_keyword(Keyword::SELECT) || self.parse_keyword(Keyword::WITH) {
@@ -4142,6 +4152,23 @@ impl<'a> Parser<'a> {
        }
    }

+    /// Consume the next token as a quoted or hex string `Value`; anything else goes to `expected`.
+    fn parse_introduced_string_value(&mut self) -> Result<Value, ParserError> {
+        let TokenWithLocation { token, location } = self.next_token();
+        match token {
+            Token::SingleQuotedString(s) => Ok(Value::SingleQuotedString(s)),
+            Token::DoubleQuotedString(s) => Ok(Value::DoubleQuotedString(s)),
+            Token::HexStringLiteral(s) => Ok(Value::HexStringLiteral(s)),
+            unexpected => self.expected(
+                "a string value",
+                TokenWithLocation {
+                    token: unexpected,
+                    location,
+                },
+            ),
+        }
+    }
+
    /// Parse an unsigned literal integer/long
    pub fn parse_literal_uint(&mut self) -> Result<u64, ParserError> {
        let next_token = self.next_token();
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -546,12 +546,12 @@ impl<'a> Tokenizer<'a> {
                // identifier or keyword
                ch if self.dialect.is_identifier_start(ch) => {
                    chars.next(); // consume the first char
-                    let s = self.tokenize_word(ch, chars);
+                    let word = self.tokenize_word(ch, chars);

                    // TODO: implement parsing of exponent here
-                    if s.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
+                    if word.chars().all(|x| ('0'..='9').contains(&x) || x == '.') {
                        let mut inner_state = State {
-                            peekable: s.chars().peekable(),
+                            peekable: word.chars().peekable(),
                            line: 0,
                            col: 0,
                        };
@@ -562,7 +562,8 @@ impl<'a> Tokenizer<'a> {
                        s += s2.as_str();
                        return Ok(Some(Token::Number(s, false)));
                    }
-                    Ok(Some(Token::make_word(&s, None)))
+
+                    Ok(Some(Token::make_word(&word, None)))
                }
                // single quoted string
                '\'' => {
--- a/tests/sqlparser_mysql.rs
+++ b/tests/sqlparser_mysql.rs
@@ -1264,3 +1264,44 @@ fn parse_values() {
    mysql().verified_stmt("VALUES ROW(1, true, 'a')");
    mysql().verified_stmt("SELECT a, c FROM (VALUES ROW(1, true, 'a'), ROW(2, false, 'b'), ROW(3, false, 'c')) AS t (a, b, c)");
 }
+
+#[test]
+fn parse_hex_string_introducer() {
+    // The projection must be a single `IntroducedString` pairing `_latin1` with the hex literal.
+    let select = Select {
+        distinct: false,
+        top: None,
+        projection: vec![SelectItem::UnnamedExpr(Expr::IntroducedString {
+            introducer: String::from("_latin1"),
+            value: Value::HexStringLiteral(String::from("4D7953514C")),
+        })],
+        from: vec![],
+        lateral_views: vec![],
+        selection: None,
+        group_by: vec![],
+        cluster_by: vec![],
+        distribute_by: vec![],
+        sort_by: vec![],
+        having: None,
+        qualify: None,
+        into: None,
+    };
+    let expected = Statement::Query(Box::new(Query {
+        with: None,
+        body: Box::new(SetExpr::Select(Box::new(select))),
+        order_by: vec![],
+        limit: None,
+        offset: None,
+        fetch: None,
+        locks: vec![],
+    }));
+    assert_eq!(mysql().verified_stmt("SELECT _latin1 X'4D7953514C'"), expected);
+}
+
+#[test]
+fn parse_string_introducers() { // inputs cover `_cs'abc'` (attached) and `_cs 'abc'` (spaced)
+    mysql().verified_stmt("SELECT _binary 'abc'"); // introducer and literal separated by a space
+    mysql().one_statement_parses_to("SELECT _utf8'abc'", "SELECT _utf8 'abc'");
+    mysql().one_statement_parses_to("SELECT _utf8mb4'abc'", "SELECT _utf8mb4 'abc'");
+    mysql().verified_stmt("SELECT _binary 'abc', _utf8mb4 'abc'"); // two introducers, one query
+}