mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-07-07 17:04:59 +00:00
Fix tokenization of qualified identifiers with numeric prefix. (#1803)
Co-authored-by: Roman Borschel <roman@cluvio.com>
This commit is contained in:
parent
d090ad4ccf
commit
bbc80d7537
2 changed files with 185 additions and 11 deletions
|
@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
|
|||
};
|
||||
|
||||
let mut location = state.location();
|
||||
while let Some(token) = self.next_token(&mut state)? {
|
||||
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
|
||||
let span = location.span_to(state.location());
|
||||
|
||||
buf.push(TokenWithSpan { token, span });
|
||||
|
@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
|
||||
/// Get the next token or return None
|
||||
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
|
||||
fn next_token(
|
||||
&self,
|
||||
chars: &mut State,
|
||||
prev_token: Option<&Token>,
|
||||
) -> Result<Option<Token>, TokenizerError> {
|
||||
match chars.peek() {
|
||||
Some(&ch) => match ch {
|
||||
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
|
||||
|
@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
|
|||
chars.next();
|
||||
}
|
||||
|
||||
// If the dialect supports identifiers that start with a numeric prefix
|
||||
// and we have now consumed a dot, check if the previous token was a Word.
|
||||
// If so, what follows is definitely not part of a decimal number and
|
||||
// we should yield the dot as a dedicated token so compound identifiers
|
||||
// starting with digits can be parsed correctly.
|
||||
if s == "." && self.dialect.supports_numeric_prefix() {
|
||||
if let Some(Token::Word(_)) = prev_token {
|
||||
return Ok(Some(Token::Period));
|
||||
}
|
||||
}
|
||||
|
||||
// Consume fractional digits.
|
||||
s += &peeking_next_take_while(chars, |ch, next_ch| {
|
||||
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
|
||||
});
|
||||
|
||||
// No number -> Token::Period
|
||||
// No fraction -> Token::Period
|
||||
if s == "." {
|
||||
return Ok(Some(Token::Period));
|
||||
}
|
||||
|
||||
let mut exponent_part = String::new();
|
||||
// Parse exponent as number
|
||||
let mut exponent_part = String::new();
|
||||
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
|
||||
let mut char_clone = chars.peekable.clone();
|
||||
exponent_part.push(char_clone.next().unwrap());
|
||||
|
@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
// mysql dialect supports identifiers that start with a numeric prefix,
|
||||
// as long as they aren't an exponent number.
|
||||
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
|
||||
let word =
|
||||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
||||
// If the dialect supports identifiers that start with a numeric prefix,
|
||||
// we need to check if the value is in fact an identifier and must thus
|
||||
// be tokenized as a word.
|
||||
if self.dialect.supports_numeric_prefix() {
|
||||
if exponent_part.is_empty() {
|
||||
// If it is not a number with an exponent, it may be
|
||||
// an identifier starting with digits.
|
||||
let word =
|
||||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
||||
|
||||
if !word.is_empty() {
|
||||
s += word.as_str();
|
||||
if !word.is_empty() {
|
||||
s += word.as_str();
|
||||
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||
}
|
||||
} else if prev_token == Some(&Token::Period) {
|
||||
// If the previous token was a period, thus not belonging to a number,
|
||||
// the value we have is part of an identifier.
|
||||
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||
}
|
||||
}
|
||||
|
@ -3960,4 +3985,31 @@ mod tests {
|
|||
],
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks how digit-leading input is tokenized by every dialect that reports
/// `supports_numeric_prefix()`.
///
/// Each entry pairs an input string with the exact token sequence expected
/// from it; the entries are checked in order.
#[test]
fn test_tokenize_identifiers_numeric_prefix() {
    let cases: Vec<(&str, Vec<Token>)> = vec![
        // Digits followed by letters form a single word.
        ("123abc", vec![Token::make_word("123abc", None)]),
        // Standing alone, an exponent-shaped value is a number.
        ("12e34", vec![Token::Number("12e34".to_string(), false)]),
        // After a qualifying `t.`, the same characters are a word.
        (
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        ),
        // Qualified identifier that starts with a digit and contains letters.
        (
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        ),
    ];

    for (sql, expected) in cases {
        all_dialects_where(|d| d.supports_numeric_prefix()).tokenizes_to(sql, expected);
    }
}
|
||||
}
|
||||
|
|
|
@ -1926,6 +1926,128 @@ fn parse_select_with_numeric_prefix_column_name() {
|
|||
}
|
||||
}
|
||||
|
||||
/// Verifies that the MySQL dialect parses qualified identifiers whose parts
/// start with digits, and that the same characters are still parsed as a
/// number when they appear unqualified.
///
/// Every case inspects the last item of the `SELECT` projection. The
/// statement/body unwrapping that all cases share lives in the nested
/// helpers below so each case only states its SQL and its expectation.
#[test]
fn parse_qualified_identifiers_with_numeric_prefix() {
    /// Returns the last projection item of a plain `SELECT` statement.
    ///
    /// Panics if `stmt` is not a query or its body is not a `SELECT`.
    fn last_projection(stmt: Statement) -> Option<SelectItem> {
        match stmt {
            Statement::Query(q) => match *q.body {
                SetExpr::Select(s) => s.projection.into_iter().last(),
                body => panic!("Unexpected statement body: {:?}", body),
            },
            stmt => panic!("Unexpected statement: {:?}", stmt),
        }
    }

    /// Round-trips `sql` through `verified_stmt` and asserts that its last
    /// projection item is a compound identifier made of exactly `expected`.
    fn assert_compound_identifier(sql: &str, expected: &[Ident]) {
        match last_projection(mysql().verified_stmt(sql)) {
            Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
                assert_eq!(expected, &parts[..]);
            }
            proj => panic!("Unexpected projection: {:?}", proj),
        }
    }

    // Case 1: Qualified column name that starts with digits.
    assert_compound_identifier(
        "SELECT t.15to29 FROM my_table AS t",
        &[Ident::new("t"), Ident::new("15to29")],
    );

    // Case 2: Qualified column name that starts with digits and on its own represents a number.
    assert_compound_identifier(
        "SELECT t.15e29 FROM my_table AS t",
        &[Ident::new("t"), Ident::new("15e29")],
    );

    // Case 3: Unqualified, the same token is parsed as a number.
    // This case uses `parse_sql_statements` rather than `verified_stmt`,
    // exactly as before, so no round-trip check is applied to it.
    let unqualified = match mysql()
        .parse_sql_statements("SELECT 15e29 FROM my_table")
        .unwrap()
        .pop()
    {
        Some(stmt) => stmt,
        stmt => panic!("Unexpected statement: {:?}", stmt),
    };
    match last_projection(unqualified) {
        Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
            assert_eq!(number("15e29"), value);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 4: Quoted simple identifier.
    match last_projection(mysql().verified_stmt("SELECT `15e29` FROM my_table")) {
        Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
            assert_eq!(Ident::with_quote('`', "15e29"), name);
        }
        proj => panic!("Unexpected projection: {:?}", proj),
    }

    // Case 5: Quoted compound identifier.
    assert_compound_identifier(
        "SELECT t.`15e29` FROM my_table AS t",
        &[Ident::new("t"), Ident::with_quote('`', "15e29")],
    );

    // Case 6: Multi-level compound identifiers.
    assert_compound_identifier(
        "SELECT 1db.1table.1column",
        &[
            Ident::new("1db"),
            Ident::new("1table"),
            Ident::new("1column"),
        ],
    );

    // Case 7: Multi-level compound quoted identifiers.
    assert_compound_identifier(
        "SELECT `1`.`2`.`3`",
        &[
            Ident::with_quote('`', "1"),
            Ident::with_quote('`', "2"),
            Ident::with_quote('`', "3"),
        ],
    );
}
|
||||
|
||||
// Don't run with bigdecimal as it fails like this on rust beta:
|
||||
//
|
||||
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue