Merge f96249cc1d into f84887d004

2025-12-23 11:12:51 +00:00 · 2025-12-16 16:03:41 -03:00 · 2025-12-16 16:03:41 -03:00 · 4bf7a7e1aa
commit 4bf7a7e1aa
parent f84887d004 f96249cc1d
4 changed files with 102 additions and 3 deletions
--- a/src/dialect/generic.rs
+++ b/src/dialect/generic.rs
@ -156,6 +156,10 @@ impl Dialect for GenericDialect {
        true
    }

+    fn supports_c_style_hints(&self) -> bool {
+        true
+    }
+
    fn supports_user_host_grantee(&self) -> bool {
        true
    }
--- a/src/dialect/mod.rs
+++ b/src/dialect/mod.rs
@ -900,6 +900,12 @@ pub trait Dialect: Debug + Any {
        false
    }

+    /// Returns true if the dialect supports hint and C-style comments
+    /// e.g. `/*! hint */`
+    fn supports_c_style_hints(&self) -> bool {
+        false
+    }
+
    /// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
    /// as an alias assignment operator, rather than a boolean expression.
    /// For example: the following statements are equivalent for such a dialect:
--- a/src/dialect/mysql.rs
+++ b/src/dialect/mysql.rs
@ -84,6 +84,11 @@ impl Dialect for MySqlDialect {
        true
    }

+    /// see <https://dev.mysql.com/doc/refman/8.4/en/comments.html>
+    fn supports_c_style_hints(&self) -> bool {
+        true
+    }
+
    fn parse_infix(
        &self,
        parser: &mut crate::parser::Parser,
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@ -821,6 +821,8 @@ pub struct Tokenizer<'a> {
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
+    /// Tokens injected back into the stream (e.g. from MySQL C-style hints)
+    pending_tokens: Vec<Token>,
 }

 impl<'a> Tokenizer<'a> {
@ -845,6 +847,7 @@ impl<'a> Tokenizer<'a> {
            dialect,
            query,
            unescape: true,
+            pending_tokens: Vec::new(),
        }
    }

@ -947,10 +950,16 @@ impl<'a> Tokenizer<'a> {

    /// Get the next token or return None
    fn next_token(
-        &self,
+        &mut self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
+        // Return any previously injected tokens first
+        {
+            if let Some(tok) = self.pending_tokens.pop() {
+                return Ok(Some(tok));
+            }
+        }
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@ -2190,13 +2199,14 @@ impl<'a> Tokenizer<'a> {
    }

    fn tokenize_multiline_comment(
-        &self,
+        &mut self,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut s = String::new();
        let mut nested = 1;
+        let mut c_style_comments = false;
        let supports_nested_comments = self.dialect.supports_nested_comments();
-
+        let supports_c_style_comments = self.dialect.supports_c_style_hints();
        loop {
            match chars.next() {
                Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
@ -2205,10 +2215,24 @@ impl<'a> Tokenizer<'a> {
                    s.push('*');
                    nested += 1;
                }
+                Some('!') if supports_c_style_comments => {
+                    c_style_comments = true;
+                    // consume only version digits (leave following whitespace/content intact)
+                    while let Some(&c) = chars.peek() {
+                        if c.is_ascii_digit() {
+                            chars.next();
+                        } else {
+                            break;
+                        }
+                    }
+                }
                Some('*') if matches!(chars.peek(), Some('/')) => {
                    chars.next(); // consume the '/'
                    nested -= 1;
                    if nested == 0 {
+                        if c_style_comments {
+                            break self.inject_tokens_from_c_style_hints_and_return_first(s);
+                        }
                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                    }
                    s.push('*');
@ -2227,6 +2251,26 @@ impl<'a> Tokenizer<'a> {
        }
    }

+    /// Tokenize the given string using the same dialect/unescape settings and inject
+    /// the resulting tokens back into this tokenizer so they are returned before
+    /// any further characters from the main stream. Returns the first injected token.
+    fn inject_tokens_from_c_style_hints_and_return_first(
+        &mut self,
+        inner_sql: String,
+    ) -> Result<Option<Token>, TokenizerError> {
+        let trimmed = inner_sql.trim();
+        if trimmed.is_empty() {
+            return Ok(None);
+        }
+        let mut inner = Tokenizer::new(self.dialect, trimmed).with_unescape(self.unescape);
+        let tokens = inner.tokenize()?;
+        // push in reverse so we can pop from the end efficiently
+        for t in tokens.into_iter().rev() {
+            self.pending_tokens.push(t);
+        }
+        Ok(self.pending_tokens.pop())
+    }
+
    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
        let mut last_char = None;
        let mut s = String::new();
@ -4147,4 +4191,44 @@ mod tests {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment() {
+        let sql = String::from("0/*! word */1");
+
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Word(Word {
+                value: "word".to_string(),
+                quote_style: None,
+                keyword: Keyword::NoKeyword,
+            }),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
+
+    #[test]
+    fn tokenize_multiline_comment_with_c_style_comment_and_version() {
+        let sql_multi = String::from("0 /*!50110 KEY_BLOCK_SIZE = 1024*/ 1");
+        let dialect = MySqlDialect {};
+        let tokens = Tokenizer::new(&dialect, &sql_multi).tokenize().unwrap();
+        let expected = vec![
+            Token::Number("0".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Word(Word {
+                value: "KEY_BLOCK_SIZE".to_string(),
+                quote_style: None,
+                keyword: Keyword::KEY_BLOCK_SIZE,
+            }),
+            Token::Whitespace(Whitespace::Space),
+            Token::Eq,
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1024".to_string(), false),
+            Token::Whitespace(Whitespace::Space),
+            Token::Number("1".to_string(), false),
+        ];
+        compare(expected, tokens);
+    }
 }