Fix: parsing ident starting with underscore in certain dialects (#1835)

This commit is contained in:
Mohamed Abdeen 2025-05-10 01:14:25 +01:00 committed by GitHub
parent 2182f7ea71
commit 052ad4a759
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1191,6 +1191,22 @@ impl<'a> Tokenizer<'a> {
}
// numbers and period
'0'..='9' | '.' => {
// Special case: when `._` follows a word, that word is a table name and
// the `_` starts the column name, so the period is emitted as its own token.
// If the previous token is not a word, `._` cannot begin a valid SQL
// word or number, so an error is reported instead.
if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
if let Some(Token::Word(_)) = prev_token {
chars.next();
return Ok(Some(Token::Period));
}
return self.tokenizer_error(
chars.location(),
"Unexpected character '_'".to_string(),
);
}
// Some dialects support underscore as number separator
// There can only be one at a time and it must be followed by another digit
let is_number_separator = |ch: char, next_char: Option<char>| {
@ -4018,4 +4034,40 @@ mod tests {
],
);
}
// Verifies tokenization of a period that is immediately followed by an
// underscore: accepted after a word (`table._col`), rejected otherwise.
#[test]
fn tokenize_period_underscore() {
    // a dialect that supports underscores in numeric literals
    let dialect = PostgreSqlDialect {};

    // Accepted form: the word before the period is kept as one token, and the
    // underscore-prefixed name after it becomes a separate word token.
    let qualified = "SELECT table._col".to_string();
    let actual = Tokenizer::new(&dialect, &qualified).tokenize().unwrap();
    compare(
        vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ],
        actual,
    );

    // Rejected forms: here `._` follows whitespace rather than a word, so the
    // tokenizer is expected to return an error for each input.
    for sql in ["SELECT ._123", "SELECT ._abc"] {
        let outcome = Tokenizer::new(&dialect, sql).tokenize();
        if let Ok(tokens) = outcome {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}
}