mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-12-23 11:12:51 +00:00
Extended CSV STDIN tests and resolved more corner cases in tokenizer
This commit is contained in:
parent
b862dc7eab
commit
93ea5d2458
8 changed files with 884 additions and 825 deletions
|
|
@ -4649,7 +4649,7 @@ impl fmt::Display for Statement {
|
|||
let data = String::from_utf8(writer.into_inner().map_err(|_| fmt::Error)?)
|
||||
.map_err(|_| fmt::Error)?;
|
||||
write!(f, "{}", data)?;
|
||||
write!(f, "\n\\.")?;
|
||||
write!(f, "\\.")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -83,7 +83,11 @@ impl Dialect for BigQueryDialect {
|
|||
}
|
||||
|
||||
fn is_identifier_part(&self, ch: char) -> bool {
|
||||
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_' || ch == '-'
|
||||
ch.is_ascii_lowercase() || ch.is_ascii_uppercase() || ch.is_ascii_digit() || ch == '_'
|
||||
}
|
||||
|
||||
fn supports_hyphenated_identifiers(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// See [doc](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
|
||||
|
|
|
|||
|
|
@ -178,6 +178,11 @@ pub trait Dialect: Debug + Any {
|
|||
/// Determine if a character is a valid unquoted identifier character
|
||||
fn is_identifier_part(&self, ch: char) -> bool;
|
||||
|
||||
/// Returns whether the dialect supports hyphenated identifiers
|
||||
fn supports_hyphenated_identifiers(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Most dialects do not have custom operators. Override this method to provide custom operators.
|
||||
fn is_custom_operator_part(&self, _ch: char) -> bool {
|
||||
false
|
||||
|
|
|
|||
|
|
@ -9539,13 +9539,11 @@ impl<'a> Parser<'a> {
|
|||
legacy_options: &[CopyLegacyOption],
|
||||
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
|
||||
let Token::CopyFromStdin(body) = self.next_token().token else {
|
||||
return self.expected(
|
||||
"COPY ... FROM STDIN with CSV body",
|
||||
self.peek_token(),
|
||||
);
|
||||
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
|
||||
};
|
||||
|
||||
let mut reader_builder = csv::ReaderBuilder::new();
|
||||
reader_builder.has_headers(false);
|
||||
|
||||
let mut null_symbol = "\\N";
|
||||
|
||||
|
|
@ -11336,80 +11334,69 @@ impl<'a> Parser<'a> {
|
|||
/// Return a tuple of the identifier and a boolean indicating it ends with a period.
|
||||
fn parse_unquoted_hyphenated_identifier(&mut self) -> Result<(Ident, bool), ParserError> {
|
||||
match self.peek_token().token {
|
||||
Token::UnquotedDashStringLiteral(lit) => {
|
||||
let span = self.next_token().span;
|
||||
Ok((
|
||||
Ident {
|
||||
value: lit,
|
||||
quote_style: None,
|
||||
span,
|
||||
},
|
||||
false,
|
||||
))
|
||||
}
|
||||
Token::Word(w) => {
|
||||
let quote_style_is_none = w.quote_style.is_none();
|
||||
let mut requires_whitespace = false;
|
||||
let mut ident = w.into_ident(self.next_token().span);
|
||||
if quote_style_is_none {
|
||||
while matches!(self.peek_token().token, Token::Minus) {
|
||||
unreachable!("Something went wrong in the tokenizer!");
|
||||
// self.next_token();
|
||||
// ident.value.push('-');
|
||||
// Token::Word(w) => {
|
||||
// let quote_style_is_none = w.quote_style.is_none();
|
||||
// let mut requires_whitespace = false;
|
||||
// let mut ident = w.into_ident(self.next_token().span);
|
||||
// if quote_style_is_none {
|
||||
// while matches!(self.peek_token().token, Token::Minus) {
|
||||
// unreachable!("Something went wrong in the tokenizer!");
|
||||
// // self.next_token();
|
||||
// // ident.value.push('-');
|
||||
|
||||
// let token = self
|
||||
// .next_token_no_skip()
|
||||
// .cloned()
|
||||
// .unwrap_or(TokenWithSpan::wrap(Token::EOF));
|
||||
// requires_whitespace = match token.token {
|
||||
// Token::Word(next_word) if next_word.quote_style.is_none() => {
|
||||
// ident.value.push_str(&next_word.value);
|
||||
// false
|
||||
// }
|
||||
// Token::Number(s, false) => {
|
||||
// // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`.
|
||||
// // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`.
|
||||
// //
|
||||
// // If a number token is followed by a period, it is part of an [ObjectName].
|
||||
// // Return the identifier with `true` if the number token is followed by a period, indicating that
|
||||
// // parsing should continue for the next part of the hyphenated identifier.
|
||||
// if s.ends_with('.') {
|
||||
// let Some(s) = s.split('.').next().filter(|s| {
|
||||
// !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
|
||||
// }) else {
|
||||
// return self.expected(
|
||||
// "continuation of hyphenated identifier",
|
||||
// TokenWithSpan::new(Token::Number(s, false), token.span),
|
||||
// );
|
||||
// };
|
||||
// ident.value.push_str(s);
|
||||
// return Ok((ident, true));
|
||||
// } else {
|
||||
// ident.value.push_str(&s);
|
||||
// }
|
||||
// // If next token is period, then it is part of an ObjectName and we don't expect whitespace
|
||||
// // after the number.
|
||||
// !matches!(self.peek_token().token, Token::Period)
|
||||
// }
|
||||
// _ => {
|
||||
// return self
|
||||
// .expected("continuation of hyphenated identifier", token);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
// // let token = self
|
||||
// // .next_token_no_skip()
|
||||
// // .cloned()
|
||||
// // .unwrap_or(TokenWithSpan::wrap(Token::EOF));
|
||||
// // requires_whitespace = match token.token {
|
||||
// // Token::Word(next_word) if next_word.quote_style.is_none() => {
|
||||
// // ident.value.push_str(&next_word.value);
|
||||
// // false
|
||||
// // }
|
||||
// // Token::Number(s, false) => {
|
||||
// // // A number token can represent a decimal value ending with a period, e.g., `Number('123.')`.
|
||||
// // // However, for an [ObjectName], it is part of a hyphenated identifier, e.g., `foo-123.bar`.
|
||||
// // //
|
||||
// // // If a number token is followed by a period, it is part of an [ObjectName].
|
||||
// // // Return the identifier with `true` if the number token is followed by a period, indicating that
|
||||
// // // parsing should continue for the next part of the hyphenated identifier.
|
||||
// // if s.ends_with('.') {
|
||||
// // let Some(s) = s.split('.').next().filter(|s| {
|
||||
// // !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
|
||||
// // }) else {
|
||||
// // return self.expected(
|
||||
// // "continuation of hyphenated identifier",
|
||||
// // TokenWithSpan::new(Token::Number(s, false), token.span),
|
||||
// // );
|
||||
// // };
|
||||
// // ident.value.push_str(s);
|
||||
// // return Ok((ident, true));
|
||||
// // } else {
|
||||
// // ident.value.push_str(&s);
|
||||
// // }
|
||||
// // // If next token is period, then it is part of an ObjectName and we don't expect whitespace
|
||||
// // // after the number.
|
||||
// // !matches!(self.peek_token().token, Token::Period)
|
||||
// // }
|
||||
// // _ => {
|
||||
// // return self
|
||||
// // .expected("continuation of hyphenated identifier", token);
|
||||
// // }
|
||||
// // }
|
||||
// }
|
||||
|
||||
// If the last segment was a number, we must check that it's followed by whitespace,
|
||||
// otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
|
||||
if requires_whitespace {
|
||||
let token = self.next_token();
|
||||
if !matches!(token.token, Token::EOF) {
|
||||
return self
|
||||
.expected("whitespace following hyphenated identifier", token);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok((ident, false))
|
||||
}
|
||||
// // If the last segment was a number, we must check that it's followed by whitespace,
|
||||
// // otherwise foo-123a will be parsed as `foo-123` with the alias `a`.
|
||||
// if requires_whitespace {
|
||||
// let token = self.next_token();
|
||||
// if !matches!(token.token, Token::EOF) {
|
||||
// return self
|
||||
// .expected("whitespace following hyphenated identifier", token);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// Ok((ident, false))
|
||||
// }
|
||||
_ => Ok((self.parse_identifier()?, false)),
|
||||
}
|
||||
}
|
||||
|
|
@ -18530,9 +18517,17 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_placeholder_invalid_whitespace() {
|
||||
for w in [" ", " ", "/*invalid*/", "\n", "\t", "\r\n", "--comment\n"] {
|
||||
for w in [
|
||||
" ",
|
||||
"/*invalid*/",
|
||||
"\n",
|
||||
"\t\t",
|
||||
"\r\n",
|
||||
"--comment\n",
|
||||
"/* multi\nline\ncomment */",
|
||||
] {
|
||||
let sql = format!("\nSELECT\n :{w}fooBar");
|
||||
assert!(Parser::parse_sql(&GenericDialect, &sql).is_err());
|
||||
assert!(Parser::parse_sql(&GenericDialect, &sql).is_err(), "Failed to error on when inserting the whitespace {w:?} within the placeholder SQL: `{sql}`");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -154,6 +154,7 @@ impl TestedDialects {
|
|||
///
|
||||
/// For multiple statements, use [`statements_parse_to`].
|
||||
pub fn one_statement_parses_to(&self, sql: &str, canonical: &str) -> Statement {
|
||||
println!("Testing SQL: {}", sql);
|
||||
let mut statements = self.parse_sql_statements(sql).expect(sql);
|
||||
assert_eq!(statements.len(), 1);
|
||||
if !canonical.is_empty() && sql != canonical {
|
||||
|
|
|
|||
1491
src/tokenizer.rs
1491
src/tokenizer.rs
File diff suppressed because it is too large
Load diff
|
|
@ -3589,6 +3589,7 @@ fn test_double_value() {
|
|||
|
||||
for (input, expected) in test_cases {
|
||||
for (i, expr) in input.iter().enumerate() {
|
||||
println!("Testing expression: {}", expr);
|
||||
if let Statement::Query(query) =
|
||||
dialects.one_statement_parses_to(&format!("SELECT {expr}"), "")
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1014,27 +1014,37 @@ fn parse_drop_schema_if_exists() {
|
|||
|
||||
#[test]
|
||||
fn parse_copy_from_stdin() {
|
||||
let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM stdin;
|
||||
1 PENELOPE GUINESS 2006-02-15 09:34:33 0.11111
|
||||
2 NICK WAHLBERG 2006-02-15 09:34:33 0.22222
|
||||
3 ED CHASE 2006-02-15 09:34:33 0.312323
|
||||
4 JENNIFER DAVIS 2006-02-15 09:34:33 0.3232
|
||||
5 JOHNNY LOLLOBRIGIDA 2006-02-15 09:34:33 1.343
|
||||
6 BETTE NICHOLSON 2006-02-15 09:34:33 5.0
|
||||
7 GRACE MOSTEL 2006-02-15 09:34:33 6.0
|
||||
8 MATTHEW JOHANSSON 2006-02-15 09:34:33 7.0
|
||||
9 JOE SWANK 2006-02-15 09:34:33 8.0
|
||||
10 CHRISTIAN GABLE 2006-02-15 09:34:33 9.1
|
||||
11 ZERO CAGE 2006-02-15 09:34:33 10.001
|
||||
12 KARL BERRY 2017-11-02 19:15:42.308637+08 11.001
|
||||
A Fateful Reflection of a Waitress And a Boat who must Discover a Sumo Wrestler in Ancient China
|
||||
Kwara & Kogi
|
||||
{"Deleted Scenes","Behind the Scenes"}
|
||||
'awe':5 'awe-inspir':4 'barbarella':1 'cat':13 'conquer':16 'dog':18 'feminist':10 'inspir':6 'monasteri':21 'must':15 'stori':7 'streetcar':2
|
||||
PHP ₱ USD $
|
||||
\N Some other value
|
||||
\\."#;
|
||||
pg_and_generic().one_statement_parses_to(sql, "");
|
||||
let sql = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN;
|
||||
1 PENELOPE GUINESS 2006-02-15 09:34:33 0.11111
|
||||
2 NICK WAHLBERG 2006-02-15 09:34:33 0.22222
|
||||
3 ED CHASE 2006-02-15 09:34:33 0.312323
|
||||
4 JENNIFER DAVIS 2006-02-15 09:34:33 0.3232
|
||||
5 JOHNNY LOLLOBRIGIDA 2006-02-15 09:34:33 1.343
|
||||
6 BETTE NICHOLSON 2006-02-15 09:34:33 5.0
|
||||
7 GRACE MOSTEL 2006-02-15 09:34:33 6.0
|
||||
8 MATTHEW JOHANSSON 2006-02-15 09:34:33 7.0
|
||||
9 JOE SWANK 2006-02-15 09:34:33 8.0
|
||||
10 CHRISTIAN GABLE 2006-02-15 09:34:33 9.1
|
||||
11 ZERO CAGE 2006-02-15 09:34:33 10.001
|
||||
12 KARL BERRY 2017-11-02 19:15:42.308637+08 11.001
|
||||
\."#;
|
||||
pg_and_generic().verified_stmt(sql);
|
||||
|
||||
let sql_comma_separated = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
|
||||
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
|
||||
2,NICK,WAHLBERG,2006-02-15 09:34:33,0.22222
|
||||
3,ED,CHASE,2006-02-15 09:34:33,0.312323
|
||||
4,JENNIFER,DAVIS,2006-02-15 09:34:33,0.3232
|
||||
5,JOHNNY,"LOLLO,BRIGIDA",2006-02-15 09:34:33,1.343
|
||||
6,BETTE,NICHOLSON,2006-02-15 09:34:33,5.0
|
||||
7,GRACE,MOSTEL,2006-02-15 09:34:33,6.0
|
||||
8,MATTHEW,JOHANSSON,2006-02-15 09:34:33,7.0
|
||||
9,JOE,SWANK,2006-02-15 09:34:33,8.0
|
||||
10,CHRISTIAN,GABLE,2006-02-15 09:34:33,9.1
|
||||
11,ZERO,CAGE,2006-02-15 09:34:33,10.001
|
||||
12,KARL,BERRY,2017-11-02 19:15:42.308637+08,11.001
|
||||
\."#;
|
||||
pg_and_generic().verified_stmt(sql_comma_separated);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue