Fix tokenization of qualified identifiers with numeric prefix. (#1803)

Co-authored-by: Roman Borschel <roman@cluvio.com>
2025-10-16 00:39:00 +00:00 · 2025-04-11 20:58:43 +02:00 · 2025-04-11 20:58:43 +02:00 · bbc80d7537
commit bbc80d7537
parent d090ad4ccf
2 changed files with 185 additions and 11 deletions
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@ -895,7 +895,7 @@ impl<'a> Tokenizer<'a> {
        };

        let mut location = state.location();
-        while let Some(token) = self.next_token(&mut state)? {
+        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });
@ -932,7 +932,11 @@ impl<'a> Tokenizer<'a> {
    }

    /// Get the next token or return None
-    fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(
+        &self,
+        chars: &mut State,
+        prev_token: Option<&Token>,
+    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@ -1211,17 +1215,29 @@ impl<'a> Tokenizer<'a> {
                        chars.next();
                    }

+                    // If the dialect supports identifiers that start with a numeric prefix
+                    // and we have now consumed a dot, check if the previous token was a Word.
+                    // If so, what follows is definitely not part of a decimal number and
+                    // we should yield the dot as a dedicated token so compound identifiers
+                    // starting with digits can be parsed correctly.
+                    if s == "." && self.dialect.supports_numeric_prefix() {
+                        if let Some(Token::Word(_)) = prev_token {
+                            return Ok(Some(Token::Period));
+                        }
+                    }
+
+                    // Consume fractional digits.
                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

-                    // No number -> Token::Period
+                    // No fraction -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

-                    let mut exponent_part = String::new();
                    // Parse exponent as number
+                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());
@ -1250,14 +1266,23 @@ impl<'a> Tokenizer<'a> {
                        }
                    }

-                    // mysql dialect supports identifiers that start with a numeric prefix,
-                    // as long as they aren't an exponent number.
-                    if self.dialect.supports_numeric_prefix() && exponent_part.is_empty() {
-                        let word =
-                            peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
+                    // If the dialect supports identifiers that start with a numeric prefix,
+                    // we need to check if the value is in fact an identifier and must thus
+                    // be tokenized as a word.
+                    if self.dialect.supports_numeric_prefix() {
+                        if exponent_part.is_empty() {
+                            // If it is not a number with an exponent, it may be
+                            // an identifier starting with digits.
+                            let word =
+                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

-                        if !word.is_empty() {
-                            s += word.as_str();
+                            if !word.is_empty() {
+                                s += word.as_str();
+                                return Ok(Some(Token::make_word(s.as_str(), None)));
+                            }
+                        } else if prev_token == Some(&Token::Period) {
+                            // The previous token was a standalone period (i.e. not part of a
+                            // number), so this exponent-like value is part of an identifier.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }
@ -3960,4 +3985,31 @@ mod tests {
                ],
            );
    }
+
+    #[test]
+    fn test_tokenize_identifiers_numeric_prefix() { // every case filters dialects by `supports_numeric_prefix()`
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()) // digits then letters: one word
+            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()) // exponent form, no period before it: a number
+            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
+            "t.12e34", // exponent-like text after `t.`: expected as a word, not a number
+            vec![
+                Token::make_word("t", None),
+                Token::Period,
+                Token::make_word("12e34", None),
+            ],
+        );
+
+        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
+            "t.1two3", // digit-prefixed text after `t.`: expected as a word
+            vec![
+                Token::make_word("t", None),
+                Token::Period,
+                Token::make_word("1two3", None),
+            ],
+        );
+    }
 }