mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-09-26 15:39:12 +00:00
Fix tokenization of qualified identifiers with numeric prefix. (#1803)
Co-authored-by: Roman Borschel <roman@cluvio.com>
This commit is contained in:
parent
d090ad4ccf
commit
bbc80d7537
2 changed files with 185 additions and 11 deletions
|
@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut location = state.location();
|
let mut location = state.location();
|
||||||
while let Some(token) = self.next_token(&mut state)? {
|
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
|
||||||
let span = location.span_to(state.location());
|
let span = location.span_to(state.location());
|
||||||
|
|
||||||
buf.push(TokenWithSpan { token, span });
|
buf.push(TokenWithSpan { token, span });
|
||||||
|
@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Get the next token or return None
|
/// Get the next token or return None
|
||||||
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
|
fn next_token(
|
||||||
|
&self,
|
||||||
|
chars: &mut State,
|
||||||
|
prev_token: Option<&Token>,
|
||||||
|
) -> Result<Option<Token>, TokenizerError> {
|
||||||
match chars.peek() {
|
match chars.peek() {
|
||||||
Some(&ch) => match ch {
|
Some(&ch) => match ch {
|
||||||
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
|
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
|
||||||
|
@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
|
||||||
chars.next();
|
chars.next();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the dialect supports identifiers that start with a numeric prefix
|
||||||
|
// and we have now consumed a dot, check if the previous token was a Word.
|
||||||
|
// If so, what follows is definitely not part of a decimal number and
|
||||||
|
// we should yield the dot as a dedicated token so compound identifiers
|
||||||
|
// starting with digits can be parsed correctly.
|
||||||
|
if s == "." && self.dialect.supports_numeric_prefix() {
|
||||||
|
if let Some(Token::Word(_)) = prev_token {
|
||||||
|
return Ok(Some(Token::Period));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Consume fractional digits.
|
||||||
s += &peeking_next_take_while(chars, |ch, next_ch| {
|
s += &peeking_next_take_while(chars, |ch, next_ch| {
|
||||||
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
|
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
|
||||||
});
|
});
|
||||||
|
|
||||||
// No number -> Token::Period
|
// No fraction -> Token::Period
|
||||||
if s == "." {
|
if s == "." {
|
||||||
return Ok(Some(Token::Period));
|
return Ok(Some(Token::Period));
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut exponent_part = String::new();
|
|
||||||
// Parse exponent as number
|
// Parse exponent as number
|
||||||
|
let mut exponent_part = String::new();
|
||||||
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
|
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
|
||||||
let mut char_clone = chars.peekable.clone();
|
let mut char_clone = chars.peekable.clone();
|
||||||
exponent_part.push(char_clone.next().unwrap());
|
exponent_part.push(char_clone.next().unwrap());
|
||||||
|
@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// mysql dialect supports identifiers that start with a numeric prefix,
|
// If the dialect supports identifiers that start with a numeric prefix,
|
||||||
// as long as they aren't an exponent number.
|
// we need to check if the value is in fact an identifier and must thus
|
||||||
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
|
// be tokenized as a word.
|
||||||
let word =
|
if self.dialect.supports_numeric_prefix() {
|
||||||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
if exponent_part.is_empty() {
|
||||||
|
// If it is not a number with an exponent, it may be
|
||||||
|
// an identifier starting with digits.
|
||||||
|
let word =
|
||||||
|
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
||||||
|
|
||||||
if !word.is_empty() {
|
if !word.is_empty() {
|
||||||
s += word.as_str();
|
s += word.as_str();
|
||||||
|
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||||
|
}
|
||||||
|
} else if prev_token == Some(&Token::Period) {
|
||||||
|
// If the previous token was a period, thus not belonging to a number,
|
||||||
|
// the value we have is part of an identifier.
|
||||||
return Ok(Some(Token::make_word(s.as_str(), None)));
|
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3960,4 +3985,31 @@ mod tests {
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_tokenize_identifiers_numeric_prefix() {
|
||||||
|
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
|
||||||
|
.tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
|
||||||
|
|
||||||
|
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
|
||||||
|
.tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
|
||||||
|
|
||||||
|
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
|
||||||
|
"t.12e34",
|
||||||
|
vec![
|
||||||
|
Token::make_word("t", None),
|
||||||
|
Token::Period,
|
||||||
|
Token::make_word("12e34", None),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
|
||||||
|
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
|
||||||
|
"t.1two3",
|
||||||
|
vec![
|
||||||
|
Token::make_word("t", None),
|
||||||
|
Token::Period,
|
||||||
|
Token::make_word("1two3", None),
|
||||||
|
],
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1926,6 +1926,128 @@ fn parse_select_with_numeric_prefix_column_name() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_qualified_identifiers_with_numeric_prefix() {
|
||||||
|
// Case 1: Qualified column name that starts with digits.
|
||||||
|
match mysql().verified_stmt("SELECT t.15to29 FROM my_table AS t") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
|
||||||
|
assert_eq!(&[Ident::new("t"), Ident::new("15to29")], &parts[..]);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 2: Qualified column name that starts with digits and on its own represents a number.
|
||||||
|
match mysql().verified_stmt("SELECT t.15e29 FROM my_table AS t") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
|
||||||
|
assert_eq!(&[Ident::new("t"), Ident::new("15e29")], &parts[..]);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 3: Unqualified, the same token is parsed as a number.
|
||||||
|
match mysql()
|
||||||
|
.parse_sql_statements("SELECT 15e29 FROM my_table")
|
||||||
|
.unwrap()
|
||||||
|
.pop()
|
||||||
|
{
|
||||||
|
Some(Statement::Query(q)) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::Value(ValueWithSpan { value, .. }))) => {
|
||||||
|
assert_eq!(&number("15e29"), value);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 4: Quoted simple identifier.
|
||||||
|
match mysql().verified_stmt("SELECT `15e29` FROM my_table") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::Identifier(name))) => {
|
||||||
|
assert_eq!(&Ident::with_quote('`', "15e29"), name);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 5: Quoted compound identifier.
|
||||||
|
match mysql().verified_stmt("SELECT t.`15e29` FROM my_table AS t") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
|
||||||
|
assert_eq!(
|
||||||
|
&[Ident::new("t"), Ident::with_quote('`', "15e29")],
|
||||||
|
&parts[..]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 6: Multi-level compound identifiers.
|
||||||
|
match mysql().verified_stmt("SELECT 1db.1table.1column") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
|
||||||
|
assert_eq!(
|
||||||
|
&[
|
||||||
|
Ident::new("1db"),
|
||||||
|
Ident::new("1table"),
|
||||||
|
Ident::new("1column")
|
||||||
|
],
|
||||||
|
&parts[..]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
|
||||||
|
// Case 7: Multi-level compound quoted identifiers.
|
||||||
|
match mysql().verified_stmt("SELECT `1`.`2`.`3`") {
|
||||||
|
Statement::Query(q) => match *q.body {
|
||||||
|
SetExpr::Select(s) => match s.projection.last() {
|
||||||
|
Some(SelectItem::UnnamedExpr(Expr::CompoundIdentifier(parts))) => {
|
||||||
|
assert_eq!(
|
||||||
|
&[
|
||||||
|
Ident::with_quote('`', "1"),
|
||||||
|
Ident::with_quote('`', "2"),
|
||||||
|
Ident::with_quote('`', "3")
|
||||||
|
],
|
||||||
|
&parts[..]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
proj => panic!("Unexpected projection: {:?}", proj),
|
||||||
|
},
|
||||||
|
body => panic!("Unexpected statement body: {:?}", body),
|
||||||
|
},
|
||||||
|
stmt => panic!("Unexpected statement: {:?}", stmt),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Don't run with bigdecimal as it fails like this on rust beta:
|
// Don't run with bigdecimal as it fails like this on rust beta:
|
||||||
//
|
//
|
||||||
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'
|
// 'parse_select_with_concatenation_of_exp_number_and_numeric_prefix_column'
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue