Extended CSV STDIN tests and resolved more corner cases in the tokenizer

This commit is contained in:
Luca 2025-10-29 09:35:54 +01:00
parent b862dc7eab
commit 93ea5d2458
8 changed files with 884 additions and 825 deletions

View file

@ -4649,7 +4649,7 @@ impl fmt::Display for Statement {
let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
.map_err(|_| fmt::Error)?;
write!(f, "{}", data)?;
write!(f, "\n\\.")?;
write!(f, "\\.")?;
}
Ok(())
}

View file

@ -83,7 +83,11 @@ impl Dialect for BigQueryDialect {
}
fn is_identifier_part(&self, ch: char) -> bool {
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-'
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
}
/// BigQuery allows hyphens in unquoted identifiers (e.g. project IDs such
/// as `my-project.dataset.table`), so opt in to hyphenated-identifier
/// handling. Note that `is_identifier_part` itself does not accept `-`.
fn supports_hyphenated_identifiers(&self) -> bool {
true
}
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)

View file

@ -178,6 +178,11 @@ pub trait Dialect: Debug + Any {
/// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(&self, ch: char) -> bool;
/// Returns whether the dialect supports hyphenated identifiers — unquoted
/// identifiers containing `-`, as in BigQuery's `my-project.dataset`.
/// Defaults to `false`; dialects that allow them (e.g. `BigQueryDialect`)
/// override this to return `true`.
fn supports_hyphenated_identifiers(&self) -> bool {
false
}
/// Most dialects do not have custom operators. Override this method to provide custom operators.
fn is_custom_operator_part(&self, _ch: char) -> bool {
false

View file

@ -9539,13 +9539,11 @@ impl<'a> Parser<'a> {
legacy_options: &[CopyLegacyOption],
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
let Token::CopyFromStdin(body) = self.next_token().token else {
return self.expected(
"COPY ... FROM STDIN with CSV body",
self.peek_token(),
);
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
};
let mut reader_builder = csv::ReaderBuilder::new();
reader_builder.has_headers(false);
let mut null_symbol = "\\N";
@ -11336,80 +11334,69 @@ impl<'a> Parser<'a> {
/// Parse a single segment of a potentially hyphenated, unquoted identifier.
///
/// Return a tuple of the identifier and a boolean indicating it ends with a period.
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> {
    match self.peek_token().token {
        // The tokenizer now assembles a whole hyphenated identifier
        // (e.g. `foo-bar-123`) into a single token, so the full segment
        // arrives here at once and needs no further stitching.
        Token::UnquotedDashStringLiteral(lit) => {
            let span = self.next_token().span;
            Ok((
                Ident {
                    value: lit,
                    quote_style: None,
                    span,
                },
                false,
            ))
        }
        Token::Word(w) => {
            let quote_style_is_none = w.quote_style.is_none();
            let ident = w.into_ident(self.next_token().span);
            if quote_style_is_none {
                // An unquoted word must never be followed by a bare `-`
                // here: the tokenizer would have merged the two into an
                // `UnquotedDashStringLiteral` token (handled above). Keep
                // the guard so a tokenizer regression fails loudly.
                while matches!(self.peek_token().token, Token::Minus) {
                    unreachable!("Something went wrong in the tokenizer!");
                }
            }
            // On this path the segment can never end with a period, so the
            // continuation flag is always `false`.
            Ok((ident, false))
        }
        _ => Ok((self.parse_identifier()?, false)),
    }
}
@ -18530,9 +18517,17 @@ mod tests {
#[test]
fn test_placeholder_invalid_whitespace() {
    // Whitespace or a comment between `:` and the placeholder name must
    // produce a parse error rather than silently forming a placeholder.
    for w in [
        " ",
        "/*invalid*/",
        "\n",
        "\t\t",
        "\r\n",
        "--comment\n",
        "/* multi\nline\ncomment */",
    ] {
        let sql = format!("\nSELECT\n :{w}fooBar");
        assert!(
            Parser::parse_sql(&GenericDialect, &sql).is_err(),
            "Failed to error when inserting the whitespace {w:?} within the placeholder SQL: `{sql}`"
        );
    }
}
}

View file

@ -154,6 +154,7 @@ impl TestedDialects {
///
/// For multiple statements, use [`statements_parse_to`].
pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
println!("Testing SQL: {}", sql);
let mut statements = self.parse_sql_statements(sql).expect(sql);
assert_eq!(statements.len(), 1);
if !canonical.is_empty() && sql != canonical {

File diff suppressed because it is too large Load diff

View file

@ -3589,6 +3589,7 @@ fn test_double_value() {
for (input, expected) in test_cases {
for (i, expr) in input.iter().enumerate() {
println!("Testing expression: {}", expr);
if let Statement::Query(query) =
dialects.one_statement_parses_to(&format!("SELECT {expr}"), "")
{

View file

@ -1014,27 +1014,37 @@ fn parse_drop_schema_if_exists() {
#[test]
fn parse_copy_from_stdin() {
    // Default (text) format: columns are separated by literal tab
    // characters and the body is terminated by `\.` on its own line.
    // NOTE(review): the delimiters below must be real tabs — the timestamp
    // fields contain spaces, so spaces cannot be the separator.
    let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN;
1	PENELOPE	GUINESS	2006-02-15 09:34:33	0.11111
2	NICK	WAHLBERG	2006-02-15 09:34:33	0.22222
3	ED	CHASE	2006-02-15 09:34:33	0.312323
4	JENNIFER	DAVIS	2006-02-15 09:34:33	0.3232
5	JOHNNY	LOLLOBRIGIDA	2006-02-15 09:34:33	1.343
6	BETTE	NICHOLSON	2006-02-15 09:34:33	5.0
7	GRACE	MOSTEL	2006-02-15 09:34:33	6.0
8	MATTHEW	JOHANSSON	2006-02-15 09:34:33	7.0
9	JOE	SWANK	2006-02-15 09:34:33	8.0
10	CHRISTIAN	GABLE	2006-02-15 09:34:33	9.1
11	ZERO	CAGE	2006-02-15 09:34:33	10.001
12	KARL	BERRY	2017-11-02 19:15:42.308637+08	11.001
\."#;
    pg_and_generic().verified_stmt(sql);
    // CSV format with an explicit `,` delimiter; a quoted field may itself
    // contain the delimiter (see `"LOLLO,BRIGIDA"` below).
    let sql_comma_separated = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
2,NICK,WAHLBERG,2006-02-15 09:34:33,0.22222
3,ED,CHASE,2006-02-15 09:34:33,0.312323
4,JENNIFER,DAVIS,2006-02-15 09:34:33,0.3232
5,JOHNNY,"LOLLO,BRIGIDA",2006-02-15 09:34:33,1.343
6,BETTE,NICHOLSON,2006-02-15 09:34:33,5.0
7,GRACE,MOSTEL,2006-02-15 09:34:33,6.0
8,MATTHEW,JOHANSSON,2006-02-15 09:34:33,7.0
9,JOE,SWANK,2006-02-15 09:34:33,8.0
10,CHRISTIAN,GABLE,2006-02-15 09:34:33,9.1
11,ZERO,CAGE,2006-02-15 09:34:33,10.001
12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001
\."#;
    pg_and_generic().verified_stmt(sql_comma_separated);
}
#[test]