Fix tokenization of qualified identifiers with numeric prefix. (#1803)

Co-authored-by: Roman Borschel <roman@cluvio.com>
This commit is contained in:
Roman Borschel 2025-04-11 20:58:43 +02:00 committed by GitHub
parent d090ad4ccf
commit bbc80d7537
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 185 additions and 11 deletions

View file

@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
};
let mut location = state.location();
while let Some(token) = self.next_token(&mut state)? {
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
let span = location.span_to(state.location());
buf.push(TokenWithSpan { token, span });
@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
}
/// Get the next token or return None
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
fn next_token(
&self,
chars: &mut State,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
Some(&ch) => match ch {
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
chars.next();
}
// If the dialect supports identifiers that start with a numeric prefix
// and we have now consumed a dot, check if the previous token was a Word.
// If so, what follows is definitely not part of a decimal number and
// we should yield the dot as a dedicated token so compound identifiers
// starting with digits can be parsed correctly.
if s == "." && self.dialect.supports_numeric_prefix() {
if let Some(Token::Word(_)) = prev_token {
return Ok(Some(Token::Period));
}
}
// Consume fractional digits.
s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});
// No number -> Token::Period
// No fraction -> Token::Period
if s == "." {
return Ok(Some(Token::Period));
}
let mut exponent_part = String::new();
// Parse exponent as number
let mut exponent_part = String::new();
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
let mut char_clone = chars.peekable.clone();
exponent_part.push(char_clone.next().unwrap());
@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
}
}
// mysql dialect supports identifiers that start with a numeric prefix,
// as long as they aren't an exponent number.
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
// If the dialect supports identifiers that start with a numeric prefix,
// we need to check if the value is in fact an identifier and must thus
// be tokenized as a word.
if self.dialect.supports_numeric_prefix() {
if exponent_part.is_empty() {
// If it is not a number with an exponent, it may be
// an identifier starting with digits.
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
if !word.is_empty() {
s += word.as_str();
if !word.is_empty() {
s += word.as_str();
return Ok(Some(Token::make_word(s.as_str(), None)));
}
} else if prev_token == Some(&Token::Period) {
// If the previous token was a period, thus not belonging to a number,
// the value we have is part of an identifier.
return Ok(Some(Token::make_word(s.as_str(), None)));
}
}
@ -3960,4 +3985,31 @@ mod tests {
],
);
}
#[test]
fn test_tokenize_identifiers_numeric_prefix() {
    // Each entry pairs an input with the exact token stream expected from every
    // dialect that reports support for identifiers with a numeric prefix.
    let cases: Vec<(&str, Vec<Token>)> = vec![
        // Digits immediately followed by identifier characters form one word.
        ("123abc", vec![Token::make_word("123abc", None)]),
        // On its own, digits + exponent is still a number.
        ("12e34", vec![Token::Number(String::from("12e34"), false)]),
        // After a qualifying `t.`, the same characters are a word instead.
        (
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        ),
        // Qualified identifier with letters in the middle of the digits.
        (
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        ),
    ];

    for (sql, expected) in cases {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to(sql, expected);
    }
}
}

View file

@ -1926,6 +1926,128 @@ fn parse_select_with_numeric_prefix_column_name() {
}
}
#[test]
fn parse_qualified_identifiers_with_numeric_prefix() {
    // Pulls the last projection item out of a parsed `SELECT`; any other
    // statement or body shape aborts the test with a debug dump.
    fn last_select_item(stmt: Statement) -> Option<SelectItem> {
        match stmt {
            Statement::Query(query) => match *query.body {
                SetExpr::Select(select) => select.projection.last().cloned(),
                body => panic!("Unexpected statement body: {:?}", body),
            },
            other => panic!("Unexpected statement: {:?}", other),
        }
    }

    // Round-trips `sql` through the MySQL dialect and checks that its last
    // projection is a compound identifier made of exactly `expected`.
    fn assert_compound_identifier(sql: &str, expected: &[Ident]) {
        match last_select_item(mysql().verified_stmt(sql)) {
            Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
                assert_eq!(expected, &parts[..]);
            }
            proj => panic!("Unexpected projection: {:?}", proj),
        }
    }

    // Case 1: Qualified column name that starts with digits.
    assert_compound_identifier(
        "SELECT t.15to29 FROM my_table AS t",
        &[Ident::new("t"), Ident::new("15to29")],
    );

    // Case 2: Qualified column name that starts with digits and on its own represents a number.
    assert_compound_identifier(
        "SELECT t.15e29 FROM my_table AS t",
        &[Ident::new("t"), Ident::new("15e29")],
    );

    // Case 3: Unqualified, the same token is parsed as a number.
    // This case only parses (no round-trip verification), so the statement
    // comes out of the parsed list as an `Option`.
    let unqualified = mysql()
        .parse_sql_statements("SELECT 15e29 FROM my_table")
        .unwrap()
        .pop();
    match unqualified {
        Some(stmt @ Statement::Query(_)) => match last_select_item(stmt) {
            Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
                assert_eq!(number("15e29"), value);
            }
            proj => panic!("Unexpected projection: {:?}", proj),
        },
        stmt => panic!("Unexpected statement: {:?}", stmt),
    }

    // Case 4: Quoted simple identifier.
    match last_select_item(mysql().verified_stmt("SELECT `15e29` FROM my_table")) {
        Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
            assert_eq!(Ident::with_quote('`', "15e29"), name);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 5: Quoted compound identifier.
    assert_compound_identifier(
        "SELECT t.`15e29` FROM my_table AS t",
        &[Ident::new("t"), Ident::with_quote('`', "15e29")],
    );

    // Case 6: Multi-level compound identifiers.
    assert_compound_identifier(
        "SELECT 1db.1table.1column",
        &[
            Ident::new("1db"),
            Ident::new("1table"),
            Ident::new("1column"),
        ],
    );

    // Case 7: Multi-level compound quoted identifiers.
    assert_compound_identifier(
        "SELECT `1`.`2`.`3`",
        &[
            Ident::with_quote('`', "1"),
            Ident::with_quote('`', "2"),
            Ident::with_quote('`', "3"),
        ],
    );
}
// Don't run with bigdecimal as it fails like this on rust beta:
//
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'