Correctly tokenize nested comments (#1629)

This commit is contained in:
Hans Ott 2025-01-05 15:37:34 +01:00 committed by GitHub
parent 94ea20628f
commit 8bc63f0e4a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 109 additions and 18 deletions

View file

@ -131,4 +131,8 @@ impl Dialect for GenericDialect {
fn supports_empty_projections(&self) -> bool {
    true
}
/// The generic dialect accepts nested block comments,
/// e.g. `/* outer /* inner */ */` — see `Dialect::supports_nested_comments`.
fn supports_nested_comments(&self) -> bool {
true
}
}

View file

@ -682,6 +682,12 @@ pub trait Dialect: Debug + Any {
false
}
/// Returns true if the dialect supports nested comments
/// e.g. `/* /* nested */ */`
///
/// Defaults to `false`. Dialects whose tokenizer should balance
/// nested `/* … */` pairs override this to return `true`.
fn supports_nested_comments(&self) -> bool {
false
}
/// Returns true if this dialect supports treating the equals operator `=` within a `SelectItem`
/// as an alias assignment operator, rather than a boolean expression.
/// For example: the following statements are equivalent for such a dialect:

View file

@ -241,6 +241,10 @@ impl Dialect for PostgreSqlDialect {
fn supports_empty_projections(&self) -> bool {
    true
}
/// PostgreSQL supports nested block comments,
/// e.g. `/* outer /* inner */ */` — see `Dialect::supports_nested_comments`.
fn supports_nested_comments(&self) -> bool {
true
}
}

pub fn parse_create(parser: &mut Parser) -> Option<Result<Statement, ParserError>> {

View file

@ -1855,28 +1855,33 @@ impl<'a> Tokenizer<'a> {
) -> Result<Option<Token>, TokenizerError> {
    let mut s = String::new();
    let mut nested = 1;
    let supports_nested_comments = self.dialect.supports_nested_comments();

    loop {
        match chars.next() {
            Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                chars.next(); // consume the '*'
                s.push('/');
                s.push('*');
                nested += 1;
            }
            Some('*') if matches!(chars.peek(), Some('/')) => {
                chars.next(); // consume the '/'
                nested -= 1;
                if nested == 0 {
                    break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
                }
                s.push('*');
                s.push('/');
            }
            Some(ch) => {
                s.push(ch);
            }
            None => {
                break self.tokenizer_error(
                    chars.location(),
                    "Unexpected EOF while in a multi-line comment",
                );
            }
        }
    }
@ -2718,19 +2723,91 @@ mod tests {
#[test]
fn tokenize_nested_multiline_comment() {
    let dialect = GenericDialect {};
    let test_cases = vec![
        (
            "0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
                )),
                Token::Whitespace(Whitespace::Space),
                Token::Div,
                Token::Word(Word {
                    value: "comment".to_string(),
                    quote_style: None,
                    keyword: Keyword::COMMENT,
                }),
                Token::Mul,
                Token::Div,
                Token::Number("1".to_string(), false),
            ],
        ),
        (
            "0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
            vec![
                Token::Number("0".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(
                    "multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
                )),
                Token::Number("1".to_string(), false),
            ],
        ),
        (
            "SELECT 1/* a /* b */ c */0",
            vec![
                Token::make_keyword("SELECT"),
                Token::Whitespace(Whitespace::Space),
                Token::Number("1".to_string(), false),
                Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
                Token::Number("0".to_string(), false),
            ],
        ),
    ];

    for (sql, expected) in test_cases {
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        compare(expected, tokens);
    }
}
#[test]
fn tokenize_nested_multiline_comment_empty() {
    // An empty nested comment `/**/` directly inside an outer `/* … */`:
    // the whole span must tokenize as a single multi-line comment.
    let dialect = GenericDialect {};
    let tokens = Tokenizer::new(&dialect, "select 1/*/**/*/0")
        .tokenize()
        .unwrap();
    compare(
        vec![
            Token::make_keyword("select"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
            Token::Whitespace(Whitespace::MultiLineComment(String::from("/**/"))),
            Token::Number(String::from("0"), false),
        ],
        tokens,
    );
}
#[test]
fn tokenize_nested_comments_if_not_supported() {
    // SQLite does not support nested comments: the first `*/` closes the
    // comment, so the trailing `*/` tokenizes as separate Mul + Div tokens.
    let dialect = SQLiteDialect {};
    let tokens = Tokenizer::new(&dialect, "SELECT 1/*/* nested comment */*/0").tokenize();
    let expected = vec![
        Token::make_keyword("SELECT"),
        Token::Whitespace(Whitespace::Space),
        Token::Number(String::from("1"), false),
        Token::Whitespace(Whitespace::MultiLineComment(String::from(
            "/* nested comment ",
        ))),
        Token::Mul,
        Token::Div,
        Token::Number(String::from("0"), false),
    ];
    compare(expected, tokens.unwrap());
}
#[test] #[test]
fn tokenize_multiline_comment_with_even_asterisks() { fn tokenize_multiline_comment_with_even_asterisks() {