Extended CSV STDIN tests and resolved more corner cases in the tokenizer

This commit is contained in:
Luca 2025-10-29 09:35:54 +01:00
parent b862dc7eab
commit 93ea5d2458
8 changed files with 884 additions and 825 deletions

View file

@ -4649,7 +4649,7 @@ impl fmt::Display for Statement {
let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
.map_err(|_| fmt::Error)?;
write!(f, "{}", data)?;
write!(f, "\n\\.")?;
write!(f, "\\.")?;
}
Ok(())
}

View file

@ -83,7 +83,11 @@ impl Dialect for BigQueryDialect {
}
fn is_identifier_part(&self, ch: char) -> bool {
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-'
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
}
/// BigQuery allows hyphens in unquoted identifiers (e.g. project IDs such
/// as `my-project.dataset.table`), so opt in to hyphenated-identifier
/// handling. Note that `is_identifier_part` itself does not accept `-`.
fn supports_hyphenated_identifiers(&self) -> bool {
true
}
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)

View file

@ -178,6 +178,11 @@ pub trait Dialect: Debug + Any {
/// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(&self, ch: char) -> bool;
/// Returns whether the dialect supports hyphenated identifiers — unquoted
/// identifiers containing `-`, as in BigQuery's `my-project.dataset`.
/// Defaults to `false`; dialects that allow them (e.g. `BigQueryDialect`)
/// override this to return `true`.
fn supports_hyphenated_identifiers(&self) -> bool {
false
}
/// Most dialects do not have custom operators. Override this method to provide custom operators.
fn is_custom_operator_part(&self, _ch: char) -> bool {
false

View file

@ -9539,13 +9539,11 @@ impl<'a> Parser<'a> {
legacy_options: &[CopyLegacyOption],
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
let Token::CopyFromStdin(body) = self.next_token().token else {
return self.expected(
"COPY ... FROM STDIN with CSV body",
self.peek_token(),
);
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
};
let mut reader_builder = csv::ReaderBuilder::new();
reader_builder.has_headers(false);
let mut null_symbol = "\\N";
@ -11336,80 +11334,69 @@ impl<'a> Parser<'a> {
/// Parse a single segment of a potentially hyphenated, unquoted identifier.
///
/// Return a tuple of the identifier and a boolean indicating it ends with a period.
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> {
    match self.peek_token().token {
        // The tokenizer now assembles a whole hyphenated identifier
        // (e.g. `foo-bar-123`) into a single token, so the full segment
        // arrives here at once and needs no further stitching.
        Token::UnquotedDashStringLiteral(lit) => {
            let span = self.next_token().span;
            Ok((
                Ident {
                    value: lit,
                    quote_style: None,
                    span,
                },
                false,
            ))
        }
        Token::Word(w) => {
            let quote_style_is_none = w.quote_style.is_none();
            let ident = w.into_ident(self.next_token().span);
            if quote_style_is_none {
                // An unquoted word must never be followed by a bare `-`
                // here: the tokenizer would have merged the two into an
                // `UnquotedDashStringLiteral` token (handled above). Keep
                // the guard so a tokenizer regression fails loudly.
                while matches!(self.peek_token().token, Token::Minus) {
                    unreachable!("Something went wrong in the tokenizer!");
                }
            }
            // On this path the segment can never end with a period, so the
            // continuation flag is always `false`.
            Ok((ident, false))
        }
        _ => Ok((self.parse_identifier()?, false)),
    }
}
@ -18530,9 +18517,17 @@ mod tests {
#[test]
fn test_placeholder_invalid_whitespace() {
    // Whitespace or a comment between `:` and the placeholder name must
    // produce a parse error rather than silently forming a placeholder.
    for w in [
        " ",
        "/*invalid*/",
        "\n",
        "\t\t",
        "\r\n",
        "--comment\n",
        "/* multi\nline\ncomment */",
    ] {
        let sql = format!("\nSELECT\n :{w}fooBar");
        assert!(
            Parser::parse_sql(&GenericDialect, &sql).is_err(),
            "Failed to error when inserting the whitespace {w:?} within the placeholder SQL: `{sql}`"
        );
    }
}
}

View file

@ -154,6 +154,7 @@ impl TestedDialects {
///
/// For multiple statements, use [`statements_parse_to`].
pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
println!("Testing SQL: {}", sql);
let mut statements = self.parse_sql_statements(sql).expect(sql);
assert_eq!(statements.len(), 1);
if !canonical.is_empty() && sql != canonical {

File diff suppressed because it is too large Load diff

View file

@ -3589,6 +3589,7 @@ fn test_double_value() {
for (input, expected) in test_cases {
for (i, expr) in input.iter().enumerate() {
println!("Testing expression: {}", expr);
if let Statement::Query(query) =
dialects.one_statement_parses_to(&format!("SELECT {expr}"), "")
{

View file

@ -1014,27 +1014,37 @@ fn parse_drop_schema_if_exists() {
#[test]
fn parse_copy_from_stdin() {
    // Default (text) format: columns are separated by literal tab
    // characters and the body is terminated by `\.` on its own line.
    // NOTE(review): the delimiters below must be real tabs — the timestamp
    // fields contain spaces, so spaces cannot be the separator.
    let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN;
1	PENELOPE	GUINESS	2006-02-15 09:34:33	0.11111
2	NICK	WAHLBERG	2006-02-15 09:34:33	0.22222
3	ED	CHASE	2006-02-15 09:34:33	0.312323
4	JENNIFER	DAVIS	2006-02-15 09:34:33	0.3232
5	JOHNNY	LOLLOBRIGIDA	2006-02-15 09:34:33	1.343
6	BETTE	NICHOLSON	2006-02-15 09:34:33	5.0
7	GRACE	MOSTEL	2006-02-15 09:34:33	6.0
8	MATTHEW	JOHANSSON	2006-02-15 09:34:33	7.0
9	JOE	SWANK	2006-02-15 09:34:33	8.0
10	CHRISTIAN	GABLE	2006-02-15 09:34:33	9.1
11	ZERO	CAGE	2006-02-15 09:34:33	10.001
12	KARL	BERRY	2017-11-02 19:15:42.308637+08	11.001
\."#;
    pg_and_generic().verified_stmt(sql);
    // CSV format with an explicit `,` delimiter; a quoted field may itself
    // contain the delimiter (see `"LOLLO,BRIGIDA"` below).
    let sql_comma_separated = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
2,NICK,WAHLBERG,2006-02-15 09:34:33,0.22222
3,ED,CHASE,2006-02-15 09:34:33,0.312323
4,JENNIFER,DAVIS,2006-02-15 09:34:33,0.3232
5,JOHNNY,"LOLLO,BRIGIDA",2006-02-15 09:34:33,1.343
6,BETTE,NICHOLSON,2006-02-15 09:34:33,5.0
7,GRACE,MOSTEL,2006-02-15 09:34:33,6.0
8,MATTHEW,JOHANSSON,2006-02-15 09:34:33,7.0
9,JOE,SWANK,2006-02-15 09:34:33,8.0
10,CHRISTIAN,GABLE,2006-02-15 09:34:33,9.1
11,ZERO,CAGE,2006-02-15 09:34:33,10.001
12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001
\."#;
    pg_and_generic().verified_stmt(sql_comma_separated);
}
#[test]