Extended placeholder syntax test and moved check in tokenizer

2025-12-23 11:12:51 +00:00 · 2025-10-28 16:26:35 +01:00 · 2025-10-28 16:26:35 +01:00 · c75f11bf47
commit c75f11bf47
parent 52338d6ef5
3 changed files with 29 additions and 4 deletions
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@ -18475,7 +18475,7 @@ mod tests {

    #[test]
    fn test_placeholder_invalid_whitespace() {
-        for w in ["  ", "/*invalid*/"] {
+        for w in [" ", "  ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] {
            let sql = format!("\nSELECT\n  :{w}fooBar");
            assert!(Parser::parse_sql(&GenericDialect, &sql).is_err());
        }
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@ -908,6 +908,22 @@ impl<'a> Tokenizer<'a> {
        Ok(Some(Token::make_word(&word, None)))
    }

+    /// Returns a standardized error if the previous token is a `:` and
+    /// the method is expected to be called when a space is found after it.
+    fn handle_colon_space_error(
+        &self,
+        chars: &State,
+        prev_token: Option<&Token>,
+    ) -> Result<Option<Token>, TokenizerError> {
+        if let Some(Token::Colon) = prev_token {
+            return Err(TokenizerError {
+                message: "Unexpected whitespace after ':'; did you mean ':placeholder' or '::'?".to_string(),
+                location: chars.location(),
+            });
+        }
+        Ok(None)
+    }
+
    /// Get the next token or return None
    fn next_token(
        &self,
@ -919,6 +935,7 @@ impl<'a> Tokenizer<'a> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' | '\t' | '\n' | '\r' => {
+                    self.handle_colon_space_error(chars, prev_token)?;
                    chars.next(); // consume
                    *location = chars.location();
                    self.next_token(location, chars, prev_token, true)
@ -1166,7 +1183,7 @@ impl<'a> Tokenizer<'a> {
                    // if the prev token is not a word, then this is not a valid sql
                    // word or number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
-                        if let Some(Token::Word(_)) = prev_token {
+                        if !preceded_by_whitespace {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }
@ -1210,7 +1227,7 @@ impl<'a> Tokenizer<'a> {
                    // we should yield the dot as a dedicated token so compound identifiers
                    // starting with digits can be parsed correctly.
                    if s == "." && self.dialect.supports_numeric_prefix() {
-                        if let Some(Token::Word(_)) = prev_token {
+                        if !preceded_by_whitespace {
                            return Ok(Some(Token::Period));
                        }
                    }
@ -1300,6 +1317,7 @@ impl<'a> Tokenizer<'a> {
                            }

                            if is_comment {
+                                self.handle_colon_space_error(chars, prev_token)?;
                                chars.next(); // consume second '-'
                                // Consume the rest of the line as comment
                                let _comment = self.tokenize_single_line_comment(chars);
@ -1324,12 +1342,14 @@ impl<'a> Tokenizer<'a> {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
+                            self.handle_colon_space_error(chars, prev_token)?;
                            chars.next(); // consume the '*', starting a multi-line comment
                            let _comment = self.consume_multiline_comment(chars)?;
                            *location = chars.location();
                            self.next_token(location, chars, prev_token, true)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
+                            self.handle_colon_space_error(chars, prev_token)?;
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            // Consume the rest of the line as comment
                            let _comment = self.tokenize_single_line_comment(chars);
@ -1534,6 +1554,7 @@ impl<'a> Tokenizer<'a> {
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
+                    self.handle_colon_space_error(chars, prev_token)?;
                    chars.next(); // consume the '#', starting a snowflake single-line comment
                    // Consume the rest of the line as comment
                    let _comment = self.tokenize_single_line_comment(chars);
@ -1668,6 +1689,7 @@ impl<'a> Tokenizer<'a> {

                // whitespace check (including unicode chars) should be last as it covers some of the chars above
                ch if ch.is_whitespace() => {
+                    self.handle_colon_space_error(chars, prev_token)?;
                    chars.next(); // consume
                    *location = chars.location();
                    self.next_token(location, chars, prev_token, true)
--- a/tests/sqlparser_bigquery.rs
+++ b/tests/sqlparser_bigquery.rs
@ -1567,7 +1567,10 @@ fn parse_table_identifiers() {

    fn test_table_ident_err(ident: &str) {
        let sql = format!("SELECT 1 FROM {ident}");
-        assert!(bigquery().parse_sql_statements(&sql).is_err());
+        assert!(
+            bigquery().parse_sql_statements(&sql).is_err(),
+            "Expected error parsing identifier: `{ident}`, within SQL: `{sql}`"
+        );
    }

    test_table_ident("`spa ce`", None, vec![Ident::with_quote('`', "spa ce")]);