mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-10-16 00:39:00 +00:00
Fix tokenization of qualified identifiers with numeric prefix. (#1803)
Co-authored-by: Roman Borschel <roman@cluvio.com>
This commit is contained in:
parent
d090ad4ccf
commit
bbc80d7537
2 changed files with 185 additions and 11 deletions
|
@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
|
|||
};
|
||||
|
||||
let mut location = state.location();
|
||||
while let Some(token) = self.next_token(&mut state)? {
|
||||
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
|
||||
let span = location.span_to(state.location());
|
||||
|
||||
buf.push(TokenWithSpan { token, span });
|
||||
|
@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
|
||||
/// Get the next token or return None
|
||||
fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
|
||||
fn next_token(
|
||||
&self,
|
||||
chars: &mut State,
|
||||
prev_token: Option<&Token>,
|
||||
) -> Result<Option<Token>, TokenizerError> {
|
||||
match chars.peek() {
|
||||
Some(&ch) => match ch {
|
||||
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
|
||||
|
@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
|
|||
chars.next();
|
||||
}
|
||||
|
||||
// If the dialect supports identifiers that start with a numeric prefix
|
||||
// and we have now consumed a dot, check if the previous token was a Word.
|
||||
// If so, what follows is definitely not part of a decimal number and
|
||||
// we should yield the dot as a dedicated token so compound identifiers
|
||||
// starting with digits can be parsed correctly.
|
||||
if s == "." && self.dialect.supports_numeric_prefix() {
|
||||
if let Some(Token::Word(_)) = prev_token {
|
||||
return Ok(Some(Token::Period));
|
||||
}
|
||||
}
|
||||
|
||||
// Consume fractional digits.
|
||||
s += &peeking_next_take_while(chars, |ch, next_ch| {
|
||||
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
|
||||
});
|
||||
|
||||
// No number -> Token::Period
|
||||
// No fraction -> Token::Period
|
||||
if s == "." {
|
||||
return Ok(Some(Token::Period));
|
||||
}
|
||||
|
||||
let mut exponent_part = String::new();
|
||||
// Parse exponent as number
|
||||
let mut exponent_part = String::new();
|
||||
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
|
||||
let mut char_clone = chars.peekable.clone();
|
||||
exponent_part.push(char_clone.next().unwrap());
|
||||
|
@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
// mysql dialect supports identifiers that start with a numeric prefix,
|
||||
// as long as they aren't an exponent number.
|
||||
if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
|
||||
let word =
|
||||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
||||
// If the dialect supports identifiers that start with a numeric prefix,
|
||||
// we need to check if the value is in fact an identifier and must thus
|
||||
// be tokenized as a word.
|
||||
if self.dialect.supports_numeric_prefix() {
|
||||
if exponent_part.is_empty() {
|
||||
// If it is not a number with an exponent, it may be
|
||||
// an identifier starting with digits.
|
||||
let word =
|
||||
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
|
||||
|
||||
if !word.is_empty() {
|
||||
s += word.as_str();
|
||||
if !word.is_empty() {
|
||||
s += word.as_str();
|
||||
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||
}
|
||||
} else if prev_token == Some(&Token::Period) {
|
||||
// If the previous token was a period, thus not belonging to a number,
|
||||
// the value we have is part of an identifier.
|
||||
return Ok(Some(Token::make_word(s.as_str(), None)));
|
||||
}
|
||||
}
|
||||
|
@ -3960,4 +3985,31 @@ mod tests {
|
|||
],
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks how digit-led input is tokenized by every dialect for which
/// `supports_numeric_prefix()` returns true: standalone values as well as
/// values that follow a `word.` qualifier.
#[test]
|
||||
fn test_tokenize_identifiers_numeric_prefix() {
|
||||
// Digits immediately followed by identifier characters form a single word.
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
|
||||
.tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
|
||||
|
||||
// A standalone value with an exponent is still expected to be a number.
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
|
||||
.tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
|
||||
|
||||
// After `t.`, the same `12e34` text is expected as a word (the second part
// of a qualified identifier), with the dot emitted as its own Period token.
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
|
||||
"t.12e34",
|
||||
vec![
|
||||
Token::make_word("t", None),
|
||||
Token::Period,
|
||||
Token::make_word("12e34", None),
|
||||
],
|
||||
);
|
||||
|
||||
// Same qualified shape, but with letters in the middle of the digit-led part.
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
|
||||
"t.1two3",
|
||||
vec![
|
||||
Token::make_word("t", None),
|
||||
Token::Period,
|
||||
Token::make_word("1two3", None),
|
||||
],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue