Correctly tokenize nested comments (#1629)

This commit is contained in:
Hans Ott 2025-01-05 15:37:34 +01:00 committed by GitHub
parent 94ea20628f
commit 8bc63f0e4a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 109 additions and 18 deletions

View file

@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
fn supports_empty_projections(&self) -> bool {
    true
}
/// The generic dialect accepts nested block comments,
/// e.g. `/* outer /* inner */ */` — see `Dialect::supports_nested_comments`.
fn supports_nested_comments(&self) -> bool {
true
}
}

View file

@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
false
}
/// Returns true if the dialect supports nested comments
/// e.g. `/* /* nested */ */`
///
/// Defaults to `false`. Dialects whose tokenizer should balance
/// nested `/* … */` pairs override this to return `true`.
fn supports_nested_comments(&self) -> bool {
false
}
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
/// as an alias assignment operator, rather than a boolean expression.
/// For example: the following statements are equivalent for such a dialect:

View file

@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
fn supports_empty_projections(&self) -> bool {
    true
}
/// PostgreSQL supports nested block comments,
/// e.g. `/* outer /* inner */ */` — see `Dialect::supports_nested_comments`.
fn supports_nested_comments(&self) -> bool {
true
}
}

pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {

View file

@ -1855,28 +1855,33 @@ impl<'a> Tokenizer<'a> {
) -> Result<Option<Token>, TokenizerError> {
    let mut s = String::new();
    let mut nested = 1;
    let supports_nested_comments = self.dialect.supports_nested_comments();

    loop {
        match chars.next() {
            Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                chars.next(); // consume the '*'
                s.push('/');
                s.push('*');
                nested += 1;
            }
            Some('*') if matches!(chars.peek(), Some('/')) => {
                chars.next(); // consume the '/'
                nested -= 1;
                if nested == 0 {
                    break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                }
                s.push('*');
                s.push('/');
            }
            Some(ch) => {
                s.push(ch);
            }
            None => {
                break self.tokenizer_error(
                    chars.location(),
                    "Unexpected EOF while in a multi-line comment",
                );
            }
        }
    }
@ -2718,19 +2723,91 @@ mod tests {
#[test]
fn tokenize_nested_multiline_comment() {
    let dialect = GenericDialect {};
    let test_cases = vec![
        (
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        ),
        (
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        ),
        (
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        ),
    ];

    for (sql, expected) in test_cases {
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(expected, tokens);
    }
}
#[test]
fn tokenize_nested_multiline_comment_empty() {
    // An empty nested comment `/**/` directly inside an outer `/* … */`:
    // the whole span must tokenize as a single multi-line comment.
    let dialect = GenericDialect {};
    let tokens = Tokenizer::new(&dialect, "select 1/*/**/*/0")
        .tokenize()
        .unwrap();
    compare(
        vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::MultiLineComment(String::from("/**/"))),
            Token::Number(String::from("0"), false),
        ],
        tokens,
    );
}
#[test]
fn tokenize_nested_comments_if_not_supported() {
    // SQLite does not support nested comments: the first `*/` closes the
    // comment, so the trailing `*/` tokenizes as separate Mul + Div tokens.
    let dialect = SQLiteDialect {};
    let tokens = Tokenizer::new(&dialect, "SELECT 1/*/* nested comment */*/0").tokenize();
    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("1"), false),
        Token::Whitespace(Whitespace::MultiLineComment(String::from(
            "/* nested comment ",
        ))),
        Token::Mul,
        Token::Div,
        Token::Number(String::from("0"), false),
    ];
    compare(expected, tokens.unwrap());
}
#[test] #[test]
fn tokenize_multiline_comment_with_even_asterisks() { fn tokenize_multiline_comment_with_even_asterisks() {