mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-11-25 00:19:37 +00:00
Removed CSV validation
This commit is contained in:
parent
48c476e62d
commit
38a7c6ba9d
4 changed files with 28 additions and 367 deletions
214
src/ast/dml.rs
214
src/ast/dml.rs
|
|
@ -33,10 +33,9 @@ use crate::display_utils::{indented_list, Indent, SpaceOrNewline};
|
|||
|
||||
use super::{
|
||||
display_comma_separated, display_separated, helpers::attached_token::AttachedToken,
|
||||
query::InputFormatClause, Assignment, CopyLegacyCsvOption, CopyLegacyOption, CopyOption,
|
||||
CopySource, CopyTarget, Expr, FromTable, Ident, InsertAliases, MysqlInsertPriority, ObjectName,
|
||||
OnInsert, OrderByExpr, Query, SelectItem, Setting, SqliteOnConflict, TableObject,
|
||||
TableWithJoins, UpdateTableFromKind,
|
||||
query::InputFormatClause, Assignment, CopyLegacyOption, CopyOption, CopySource, CopyTarget,
|
||||
Expr, FromTable, Ident, InsertAliases, MysqlInsertPriority, ObjectName, OnInsert, OrderByExpr,
|
||||
Query, SelectItem, Setting, SqliteOnConflict, TableObject, TableWithJoins, UpdateTableFromKind,
|
||||
};
|
||||
|
||||
/// INSERT statement.
|
||||
|
|
@ -317,191 +316,6 @@ impl Display for Update {
|
|||
}
|
||||
}
|
||||
|
||||
/// Delimiter, quoting, escaping and NULL-marker settings for inline COPY data.
///
/// The settings are collected from a COPY statement's modern [`CopyOption`]
/// list and its legacy [`CopyLegacyOption`] list, and are consulted when the
/// inline data is split into fields as well as when fields are written back out.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvFormatOptions {
    /// Character separating adjacent fields; a tab unless overridden.
    pub(crate) delimiter: char,
    /// Character that opens and closes a quoted field; `"` unless overridden.
    pub(crate) quote: char,
    /// Character that escapes the following character inside a quoted field;
    /// `\` unless overridden.
    pub(crate) escape: char,
    /// Text that stands for a NULL value; `\N` unless overridden.
    pub(crate) null_symbol: String,
}
|
||||
|
||||
impl Default for CsvFormatOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
delimiter: '\t',
|
||||
quote: '"',
|
||||
escape: '\\',
|
||||
null_symbol: "\\N".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CsvFormatOptions {
|
||||
/// Extract CSV format options from CopyOption and CopyLegacyOption lists.
|
||||
///
|
||||
/// This method processes both modern and legacy COPY options to determine
|
||||
/// the CSV formatting settings. Later options in the lists override earlier ones.
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `options` - Modern COPY options (PostgreSQL 9.0+)
|
||||
/// * `legacy_options` - Legacy COPY options (pre-PostgreSQL 9.0)
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `CsvFormatOptions` instance with the extracted settings, using defaults
|
||||
/// for any options not specified.
|
||||
pub(crate) fn from_copy_options(
|
||||
options: &[CopyOption],
|
||||
legacy_options: &[CopyLegacyOption],
|
||||
) -> Self {
|
||||
let mut csv_options = Self::default();
|
||||
|
||||
// Apply options
|
||||
for option in options {
|
||||
match option {
|
||||
CopyOption::Delimiter(c) => {
|
||||
csv_options.delimiter = *c;
|
||||
}
|
||||
CopyOption::Quote(c) => {
|
||||
csv_options.quote = *c;
|
||||
}
|
||||
CopyOption::Escape(c) => {
|
||||
csv_options.escape = *c;
|
||||
}
|
||||
CopyOption::Null(null) => {
|
||||
csv_options.null_symbol = null.clone();
|
||||
}
|
||||
// These options don't affect CSV formatting
|
||||
CopyOption::Format(_)
|
||||
| CopyOption::Freeze(_)
|
||||
| CopyOption::Header(_)
|
||||
| CopyOption::ForceQuote(_)
|
||||
| CopyOption::ForceNotNull(_)
|
||||
| CopyOption::ForceNull(_)
|
||||
| CopyOption::Encoding(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Apply legacy options
|
||||
for option in legacy_options {
|
||||
match option {
|
||||
CopyLegacyOption::Delimiter(c) => {
|
||||
csv_options.delimiter = *c;
|
||||
}
|
||||
CopyLegacyOption::Null(null) => {
|
||||
csv_options.null_symbol = null.clone();
|
||||
}
|
||||
CopyLegacyOption::Csv(csv_opts) => {
|
||||
for csv_option in csv_opts {
|
||||
match csv_option {
|
||||
CopyLegacyCsvOption::Quote(c) => {
|
||||
csv_options.quote = *c;
|
||||
}
|
||||
CopyLegacyCsvOption::Escape(c) => {
|
||||
csv_options.escape = *c;
|
||||
}
|
||||
// These CSV options don't affect CSV formatting
|
||||
CopyLegacyCsvOption::Header
|
||||
| CopyLegacyCsvOption::ForceQuote(_)
|
||||
| CopyLegacyCsvOption::ForceNotNull(_) => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
// These legacy options don't affect CSV formatting
|
||||
CopyLegacyOption::AcceptAnyDate
|
||||
| CopyLegacyOption::AcceptInvChars(_)
|
||||
| CopyLegacyOption::AddQuotes
|
||||
| CopyLegacyOption::AllowOverwrite
|
||||
| CopyLegacyOption::Binary
|
||||
| CopyLegacyOption::BlankAsNull
|
||||
| CopyLegacyOption::Bzip2
|
||||
| CopyLegacyOption::CleanPath
|
||||
| CopyLegacyOption::CompUpdate { .. }
|
||||
| CopyLegacyOption::DateFormat(_)
|
||||
| CopyLegacyOption::EmptyAsNull
|
||||
| CopyLegacyOption::Encrypted { .. }
|
||||
| CopyLegacyOption::Escape
|
||||
| CopyLegacyOption::Extension(_)
|
||||
| CopyLegacyOption::FixedWidth(_)
|
||||
| CopyLegacyOption::Gzip
|
||||
| CopyLegacyOption::Header
|
||||
| CopyLegacyOption::IamRole(_)
|
||||
| CopyLegacyOption::IgnoreHeader(_)
|
||||
| CopyLegacyOption::Json
|
||||
| CopyLegacyOption::Manifest { .. }
|
||||
| CopyLegacyOption::MaxFileSize(_)
|
||||
| CopyLegacyOption::Parallel(_)
|
||||
| CopyLegacyOption::Parquet
|
||||
| CopyLegacyOption::PartitionBy(_)
|
||||
| CopyLegacyOption::Region(_)
|
||||
| CopyLegacyOption::RemoveQuotes
|
||||
| CopyLegacyOption::RowGroupSize(_)
|
||||
| CopyLegacyOption::StatUpdate(_)
|
||||
| CopyLegacyOption::TimeFormat(_)
|
||||
| CopyLegacyOption::TruncateColumns
|
||||
| CopyLegacyOption::Zstd => {}
|
||||
}
|
||||
}
|
||||
|
||||
csv_options
|
||||
}
|
||||
|
||||
/// Format a single CSV field, adding quotes and escaping if necessary.
|
||||
///
|
||||
/// This method handles CSV field formatting according to the configured options:
|
||||
/// - Writes NULL values using the configured `null_symbol`
|
||||
/// - Adds quotes around fields containing delimiters, quotes, or newlines
|
||||
/// - Escapes quote characters by doubling them
|
||||
/// - Escapes escape characters
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `f` - The formatter to write to
|
||||
/// * `field` - The field value to format, or `None` for NULL
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// A `fmt::Result` indicating success or failure of the write operation.
|
||||
fn format_csv_field(&self, f: &mut fmt::Formatter, field: Option<&str>) -> fmt::Result {
|
||||
let field_value = field.unwrap_or(&self.null_symbol);
|
||||
|
||||
// Check if field needs quoting
|
||||
let needs_quoting = field_value.contains(self.delimiter)
|
||||
|| field_value.contains(self.quote)
|
||||
|| field_value.contains('\n')
|
||||
|| field_value.contains('\r');
|
||||
|
||||
if needs_quoting {
|
||||
write!(f, "{}", self.quote)?;
|
||||
for ch in field_value.chars() {
|
||||
if ch == self.quote {
|
||||
// Escape quote by doubling it
|
||||
write!(f, "{}{}", self.quote, self.quote)?;
|
||||
} else if ch == self.escape {
|
||||
// Escape escape character
|
||||
write!(f, "{}{}", self.escape, self.escape)?;
|
||||
} else {
|
||||
write!(f, "{}", ch)?;
|
||||
}
|
||||
}
|
||||
write!(f, "{}", self.quote)?;
|
||||
} else {
|
||||
write!(f, "{}", field_value)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// COPY statement.
|
||||
///
|
||||
/// Represents a PostgreSQL COPY statement for bulk data transfer between
|
||||
|
|
@ -550,7 +364,7 @@ pub struct Copy {
|
|||
/// CSV data rows for COPY FROM STDIN statements.
|
||||
/// Each row is a vector of optional strings (None represents NULL).
|
||||
/// Populated only when copying from STDIN with inline data.
|
||||
pub values: Vec<Vec<Option<String>>>,
|
||||
pub values: Option<String>,
|
||||
}
|
||||
|
||||
impl Display for Copy {
|
||||
|
|
@ -581,24 +395,8 @@ impl Display for Copy {
|
|||
write!(f, " {}", display_separated(&self.legacy_options, " "))?;
|
||||
}
|
||||
|
||||
if !self.values.is_empty() {
|
||||
writeln!(f, ";")?;
|
||||
|
||||
let csv_options =
|
||||
CsvFormatOptions::from_copy_options(&self.options, &self.legacy_options);
|
||||
|
||||
// Write CSV data
|
||||
for row in &self.values {
|
||||
for (idx, column) in row.iter().enumerate() {
|
||||
if idx > 0 {
|
||||
write!(f, "{}", csv_options.delimiter)?;
|
||||
}
|
||||
csv_options.format_csv_field(f, column.as_deref())?;
|
||||
}
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
write!(f, "\\.")?;
|
||||
if let Some(values) = &self.values {
|
||||
write!(f, ";{values}\\.")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -75,7 +75,7 @@ pub use self::ddl::{
|
|||
UserDefinedTypeInternalLength, UserDefinedTypeRangeOption, UserDefinedTypeRepresentation,
|
||||
UserDefinedTypeSqlDefinitionOption, UserDefinedTypeStorage, ViewColumnDef,
|
||||
};
|
||||
pub use self::dml::{Copy, CsvFormatOptions, Delete, Insert, Update};
|
||||
pub use self::dml::{Copy, Delete, Insert, Update};
|
||||
pub use self::operator::{BinaryOperator, UnaryOperator};
|
||||
pub use self::query::{
|
||||
AfterMatchSkip, ConnectBy, Cte, CteAsMaterialized, Distinct, EmptyMatchesMode,
|
||||
|
|
|
|||
|
|
@ -9554,134 +9554,6 @@ impl<'a> Parser<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
fn parse_csv_body(
|
||||
&mut self,
|
||||
options: &[CopyOption],
|
||||
legacy_options: &[CopyLegacyOption],
|
||||
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
|
||||
let Token::CopyFromStdin(body) = self.next_token().token else {
|
||||
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
|
||||
};
|
||||
|
||||
let csv_options = CsvFormatOptions::from_copy_options(options, legacy_options);
|
||||
let delimiter = csv_options.delimiter;
|
||||
let quote = csv_options.quote;
|
||||
let escape = csv_options.escape;
|
||||
let null_symbol = csv_options.null_symbol.as_str();
|
||||
|
||||
// Simple CSV parser
|
||||
let mut result = vec![];
|
||||
let mut current_row = vec![];
|
||||
let mut current_field = String::new();
|
||||
let mut in_quotes = false;
|
||||
let mut chars = body.chars().peekable();
|
||||
let mut expected_column_count: Option<usize> = None;
|
||||
let mut row_number = 0;
|
||||
|
||||
while let Some(ch) = chars.next() {
|
||||
if in_quotes {
|
||||
if ch == quote {
|
||||
// Check if it's an escaped quote
|
||||
if let Some(&next_ch) = chars.peek() {
|
||||
if next_ch == quote {
|
||||
// Escaped quote
|
||||
current_field.push(quote);
|
||||
chars.next();
|
||||
} else {
|
||||
// End of quoted field
|
||||
in_quotes = false;
|
||||
}
|
||||
} else {
|
||||
// End of quoted field at end of input
|
||||
in_quotes = false;
|
||||
}
|
||||
} else if ch == escape {
|
||||
// Escape character
|
||||
if let Some(next_ch) = chars.next() {
|
||||
current_field.push(next_ch);
|
||||
}
|
||||
} else {
|
||||
current_field.push(ch);
|
||||
}
|
||||
} else if ch == quote {
|
||||
in_quotes = true;
|
||||
} else if ch == delimiter {
|
||||
// End of field
|
||||
if current_field == null_symbol {
|
||||
current_row.push(None);
|
||||
} else {
|
||||
current_row.push(Some(current_field.clone()));
|
||||
}
|
||||
current_field.clear();
|
||||
} else if ch == '\n' || ch == '\r' {
|
||||
// End of record
|
||||
if ch == '\r' {
|
||||
// Skip \n if it follows \r
|
||||
if let Some(&'\n') = chars.peek() {
|
||||
chars.next();
|
||||
}
|
||||
}
|
||||
if !current_field.is_empty() || !current_row.is_empty() {
|
||||
if current_field == null_symbol {
|
||||
current_row.push(None);
|
||||
} else {
|
||||
current_row.push(Some(current_field.clone()));
|
||||
}
|
||||
current_field.clear();
|
||||
|
||||
// Validate column count
|
||||
row_number += 1;
|
||||
if let Some(expected) = expected_column_count {
|
||||
if current_row.len() != expected {
|
||||
return Err(ParserError::ParserError(format!(
|
||||
"CSV row {} has {} columns, but expected {} columns based on first row",
|
||||
row_number,
|
||||
current_row.len(),
|
||||
expected
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
// First row establishes the expected column count
|
||||
expected_column_count = Some(current_row.len());
|
||||
}
|
||||
|
||||
result.push(current_row.clone());
|
||||
current_row.clear();
|
||||
}
|
||||
} else {
|
||||
current_field.push(ch);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle remaining field/row
|
||||
if !current_field.is_empty() || !current_row.is_empty() {
|
||||
if current_field == null_symbol {
|
||||
current_row.push(None);
|
||||
} else {
|
||||
current_row.push(Some(current_field));
|
||||
}
|
||||
|
||||
// Validate column count for last row
|
||||
row_number += 1;
|
||||
if let Some(expected) = expected_column_count {
|
||||
if current_row.len() != expected {
|
||||
return Err(ParserError::ParserError(format!(
|
||||
"CSV row {} has {} columns, but expected {} columns based on first row",
|
||||
row_number,
|
||||
current_row.len(),
|
||||
expected
|
||||
)));
|
||||
}
|
||||
}
|
||||
// Note: if this is the first and only row, we don't need to set expected_column_count
|
||||
// since there's nothing to validate against
|
||||
|
||||
result.push(current_row);
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Parse a copy statement
|
||||
pub fn parse_copy(&mut self) -> Result<Statement, ParserError> {
|
||||
let source;
|
||||
|
|
@ -9735,9 +9607,12 @@ impl<'a> Parser<'a> {
|
|||
}
|
||||
let values = if let CopyTarget::Stdin = target {
|
||||
self.expect_token(&Token::SemiColon)?;
|
||||
self.parse_csv_body(&options, &legacy_options)?
|
||||
let Token::CopyFromStdin(body) = self.next_token().token else {
|
||||
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
|
||||
};
|
||||
Some(body)
|
||||
} else {
|
||||
vec![]
|
||||
None
|
||||
};
|
||||
Ok(Statement::Copy(Copy {
|
||||
source,
|
||||
|
|
|
|||
|
|
@ -1050,25 +1050,13 @@ fn parse_copy_from_stdin() {
|
|||
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
|
||||
2,NICK,WAHLBERG,2006-02-15 09:34:33
|
||||
\."#;
|
||||
let parsed = pg_and_generic().parse_sql_statements(incorrect_csv_sql);
|
||||
assert_eq!(
|
||||
parsed.unwrap_err(),
|
||||
ParserError::ParserError(
|
||||
"CSV row 2 has 4 columns, but expected 5 columns based on first row".to_string()
|
||||
)
|
||||
);
|
||||
pg_and_generic().verified_stmt(incorrect_csv_sql);
|
||||
|
||||
let mixed_incorrect_separators = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
|
||||
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
|
||||
2 NICK WAHLBERG 2006-02-15 09:34:33,0.22222
|
||||
\."#;
|
||||
let parsed = pg_and_generic().parse_sql_statements(mixed_incorrect_separators);
|
||||
assert_eq!(
|
||||
parsed.unwrap_err(),
|
||||
ParserError::ParserError(
|
||||
"CSV row 2 has 2 columns, but expected 5 columns based on first row".to_string()
|
||||
)
|
||||
);
|
||||
pg_and_generic().verified_stmt(mixed_incorrect_separators);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -1087,7 +1075,7 @@ fn test_copy_from() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1105,7 +1093,7 @@ fn test_copy_from() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![CopyLegacyOption::Delimiter(',')],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1126,7 +1114,7 @@ fn test_copy_from() {
|
|||
CopyLegacyOption::Delimiter(','),
|
||||
CopyLegacyOption::Csv(vec![CopyLegacyCsvOption::Header,])
|
||||
],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
|
@ -1147,7 +1135,7 @@ fn test_copy_to() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1165,7 +1153,7 @@ fn test_copy_to() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![CopyLegacyOption::Delimiter(',')],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1186,7 +1174,7 @@ fn test_copy_to() {
|
|||
CopyLegacyOption::Delimiter(','),
|
||||
CopyLegacyOption::Csv(vec![CopyLegacyCsvOption::Header,])
|
||||
],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
)
|
||||
}
|
||||
|
|
@ -1240,7 +1228,7 @@ fn parse_copy_from() {
|
|||
CopyOption::Encoding("utf8".into()),
|
||||
],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
|
@ -1270,7 +1258,7 @@ fn parse_copy_to() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1286,7 +1274,7 @@ fn parse_copy_to() {
|
|||
target: CopyTarget::Stdout,
|
||||
options: vec![CopyOption::Delimiter('|')],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1305,7 +1293,7 @@ fn parse_copy_to() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1373,7 +1361,7 @@ fn parse_copy_to() {
|
|||
},
|
||||
options: vec![],
|
||||
legacy_options: vec![],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
)
|
||||
}
|
||||
|
|
@ -1404,7 +1392,7 @@ fn parse_copy_from_before_v9_0() {
|
|||
CopyLegacyCsvOption::ForceNotNull(vec!["column".into()]),
|
||||
]),
|
||||
],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
|
||||
|
|
@ -1430,7 +1418,7 @@ fn parse_copy_from_before_v9_0() {
|
|||
CopyLegacyCsvOption::Escape('\\'),
|
||||
]),
|
||||
],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
);
|
||||
}
|
||||
|
|
@ -1461,7 +1449,7 @@ fn parse_copy_to_before_v9_0() {
|
|||
CopyLegacyCsvOption::ForceQuote(vec!["column".into()]),
|
||||
]),
|
||||
],
|
||||
values: vec![],
|
||||
values: None,
|
||||
})
|
||||
)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue