Removed CSV validation

This commit is contained in:
Luca 2025-11-18 13:06:35 +01:00
parent 48c476e62d
commit 38a7c6ba9d
4 changed files with 28 additions and 367 deletions

View file

@ -33,10 +33,9 @@ use crate::display_utils::{indented_list, Indent, SpaceOrNewline};
use super::{
display_comma_separated, display_separated, helpers::attached_token::AttachedToken,
query::InputFormatClause, Assignment, CopyLegacyCsvOption, CopyLegacyOption, CopyOption,
CopySource, CopyTarget, Expr, FromTable, Ident, InsertAliases, MysqlInsertPriority, ObjectName,
OnInsert, OrderByExpr, Query, SelectItem, Setting, SqliteOnConflict, TableObject,
TableWithJoins, UpdateTableFromKind,
query::InputFormatClause, Assignment, CopyLegacyOption, CopyOption, CopySource, CopyTarget,
Expr, FromTable, Ident, InsertAliases, MysqlInsertPriority, ObjectName, OnInsert, OrderByExpr,
Query, SelectItem, Setting, SqliteOnConflict, TableObject, TableWithJoins, UpdateTableFromKind,
};
/// INSERT statement.
@ -317,191 +316,6 @@ impl Display for Update {
}
}
/// Delimiter, quote, escape and NULL-marker settings that govern how the
/// inline data of a COPY statement is split into fields and written back.
///
/// The values are resolved from a statement's modern [`CopyOption`] list and
/// its legacy [`CopyLegacyOption`] list; anything the statement does not set
/// keeps the value supplied by the [`Default`] implementation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CsvFormatOptions {
    /// Character separating adjacent fields of a row (default: tab).
    pub(crate) delimiter: char,
    /// Character that opens and closes a quoted field (default: `"`).
    pub(crate) quote: char,
    /// Character that escapes the following character inside a quoted field
    /// (default: `\`).
    pub(crate) escape: char,
    /// Text that stands for a NULL value (default: the two characters `\N`).
    pub(crate) null_symbol: String,
}
impl Default for CsvFormatOptions {
fn default() -> Self {
Self {
delimiter: '\t',
quote: '"',
escape: '\\',
null_symbol: "\\N".to_string(),
}
}
}
impl CsvFormatOptions {
    /// Resolves the formatting settings of a COPY statement from its option lists.
    ///
    /// Starts from [`Default::default`] and then applies every entry of `options`
    /// followed by every entry of `legacy_options`, in list order. A later entry
    /// therefore overrides an earlier one, and a legacy option overrides a modern
    /// option that sets the same value.
    ///
    /// # Arguments
    ///
    /// * `options` - Modern-syntax COPY options
    /// * `legacy_options` - Legacy-syntax COPY options
    ///
    /// # Returns
    ///
    /// The resolved settings; anything neither list sets keeps its default.
    pub(crate) fn from_copy_options(
        options: &[CopyOption],
        legacy_options: &[CopyLegacyOption],
    ) -> Self {
        let mut resolved = Self::default();

        for opt in options {
            match opt {
                CopyOption::Delimiter(ch) => resolved.delimiter = *ch,
                CopyOption::Quote(ch) => resolved.quote = *ch,
                CopyOption::Escape(ch) => resolved.escape = *ch,
                CopyOption::Null(symbol) => resolved.null_symbol.clone_from(symbol),
                // Listed explicitly (no wildcard) so that a newly added variant
                // forces a decision here. None of these influence the settings.
                CopyOption::Format(_)
                | CopyOption::Freeze(_)
                | CopyOption::Header(_)
                | CopyOption::ForceQuote(_)
                | CopyOption::ForceNotNull(_)
                | CopyOption::ForceNull(_)
                | CopyOption::Encoding(_) => {}
            }
        }

        for opt in legacy_options {
            match opt {
                CopyLegacyOption::Delimiter(ch) => resolved.delimiter = *ch,
                CopyLegacyOption::Null(symbol) => resolved.null_symbol.clone_from(symbol),
                CopyLegacyOption::Csv(csv_opts) => {
                    // Quote and escape live in the nested CSV option list.
                    for csv_opt in csv_opts {
                        match csv_opt {
                            CopyLegacyCsvOption::Quote(ch) => resolved.quote = *ch,
                            CopyLegacyCsvOption::Escape(ch) => resolved.escape = *ch,
                            CopyLegacyCsvOption::Header
                            | CopyLegacyCsvOption::ForceQuote(_)
                            | CopyLegacyCsvOption::ForceNotNull(_) => {}
                        }
                    }
                }
                // Remaining legacy options leave the settings untouched; again
                // spelled out so the match stays exhaustive without a wildcard.
                CopyLegacyOption::AcceptAnyDate
                | CopyLegacyOption::AcceptInvChars(_)
                | CopyLegacyOption::AddQuotes
                | CopyLegacyOption::AllowOverwrite
                | CopyLegacyOption::Binary
                | CopyLegacyOption::BlankAsNull
                | CopyLegacyOption::Bzip2
                | CopyLegacyOption::CleanPath
                | CopyLegacyOption::CompUpdate { .. }
                | CopyLegacyOption::DateFormat(_)
                | CopyLegacyOption::EmptyAsNull
                | CopyLegacyOption::Encrypted { .. }
                | CopyLegacyOption::Escape
                | CopyLegacyOption::Extension(_)
                | CopyLegacyOption::FixedWidth(_)
                | CopyLegacyOption::Gzip
                | CopyLegacyOption::Header
                | CopyLegacyOption::IamRole(_)
                | CopyLegacyOption::IgnoreHeader(_)
                | CopyLegacyOption::Json
                | CopyLegacyOption::Manifest { .. }
                | CopyLegacyOption::MaxFileSize(_)
                | CopyLegacyOption::Parallel(_)
                | CopyLegacyOption::Parquet
                | CopyLegacyOption::PartitionBy(_)
                | CopyLegacyOption::Region(_)
                | CopyLegacyOption::RemoveQuotes
                | CopyLegacyOption::RowGroupSize(_)
                | CopyLegacyOption::StatUpdate(_)
                | CopyLegacyOption::TimeFormat(_)
                | CopyLegacyOption::TruncateColumns
                | CopyLegacyOption::Zstd => {}
            }
        }

        resolved
    }

    /// Writes one field to `f`, quoting and escaping it when required.
    ///
    /// * `None` is written as the configured `null_symbol`.
    /// * A value containing the delimiter, the quote character, `\n` or `\r`
    ///   is wrapped in quote characters; inside the quotes every quote
    ///   character and every escape character is written twice.
    /// * Any other value is written verbatim. This includes a value that
    ///   contains the escape character but none of the characters above.
    ///
    /// # Arguments
    ///
    /// * `f` - The formatter to write to
    /// * `field` - The field value, or `None` for NULL
    ///
    /// # Returns
    ///
    /// The result of the underlying write operations.
    fn format_csv_field(&self, f: &mut fmt::Formatter, field: Option<&str>) -> fmt::Result {
        let text = match field {
            Some(value) => value,
            None => self.null_symbol.as_str(),
        };

        let must_quote = text
            .chars()
            .any(|ch| ch == self.delimiter || ch == self.quote || ch == '\n' || ch == '\r');
        if !must_quote {
            return f.write_str(text);
        }

        write!(f, "{}", self.quote)?;
        for ch in text.chars() {
            if ch == self.quote || ch == self.escape {
                // Both special characters are escaped by doubling themselves.
                write!(f, "{}{}", ch, ch)?;
            } else {
                write!(f, "{}", ch)?;
            }
        }
        write!(f, "{}", self.quote)
    }
}
/// COPY statement.
///
/// Represents a PostgreSQL COPY statement for bulk data transfer between
@ -550,7 +364,7 @@ pub struct Copy {
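/// Raw inline data of a COPY ... FROM STDIN statement: the text that follows the
/// statement's `;`, stored verbatim and without the closing `\.` marker
/// (the `Display` implementation writes that marker back after the data).
/// `None` unless the statement copies from STDIN.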
pub values: Vec<Vec<Option<String>>>,
pub values: Option<String>,
}
impl Display for Copy {
@ -581,24 +395,8 @@ impl Display for Copy {
write!(f, " {}", display_separated(&self.legacy_options, " "))?;
}
if !self.values.is_empty() {
writeln!(f, ";")?;
let csv_options =
CsvFormatOptions::from_copy_options(&self.options, &self.legacy_options);
// Write CSV data
for row in &self.values {
for (idx, column) in row.iter().enumerate() {
if idx > 0 {
write!(f, "{}", csv_options.delimiter)?;
}
csv_options.format_csv_field(f, column.as_deref())?;
}
writeln!(f)?;
}
write!(f, "\\.")?;
if let Some(values) = &self.values {
write!(f, ";{values}\\.")?;
}
Ok(())
}

View file

@ -75,7 +75,7 @@ pub use self::ddl::{
UserDefinedTypeInternalLength, UserDefinedTypeRangeOption, UserDefinedTypeRepresentation,
UserDefinedTypeSqlDefinitionOption, UserDefinedTypeStorage, ViewColumnDef,
};
pub use self::dml::{Copy, CsvFormatOptions, Delete, Insert, Update};
pub use self::dml::{Copy, Delete, Insert, Update};
pub use self::operator::{BinaryOperator, UnaryOperator};
pub use self::query::{
AfterMatchSkip, ConnectBy, Cte, CteAsMaterialized, Distinct, EmptyMatchesMode,

View file

@ -9554,134 +9554,6 @@ impl<'a> Parser<'a> {
}
}
fn parse_csv_body(
&mut self,
options: &[CopyOption],
legacy_options: &[CopyLegacyOption],
) -> Result<Vec<Vec<Option<String>>>, ParserError> {
let Token::CopyFromStdin(body) = self.next_token().token else {
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
};
let csv_options = CsvFormatOptions::from_copy_options(options, legacy_options);
let delimiter = csv_options.delimiter;
let quote = csv_options.quote;
let escape = csv_options.escape;
let null_symbol = csv_options.null_symbol.as_str();
// Simple CSV parser
let mut result = vec![];
let mut current_row = vec![];
let mut current_field = String::new();
let mut in_quotes = false;
let mut chars = body.chars().peekable();
let mut expected_column_count: Option<usize> = None;
let mut row_number = 0;
while let Some(ch) = chars.next() {
if in_quotes {
if ch == quote {
// Check if it's an escaped quote
if let Some(&next_ch) = chars.peek() {
if next_ch == quote {
// Escaped quote
current_field.push(quote);
chars.next();
} else {
// End of quoted field
in_quotes = false;
}
} else {
// End of quoted field at end of input
in_quotes = false;
}
} else if ch == escape {
// Escape character
if let Some(next_ch) = chars.next() {
current_field.push(next_ch);
}
} else {
current_field.push(ch);
}
} else if ch == quote {
in_quotes = true;
} else if ch == delimiter {
// End of field
if current_field == null_symbol {
current_row.push(None);
} else {
current_row.push(Some(current_field.clone()));
}
current_field.clear();
} else if ch == '\n' || ch == '\r' {
// End of record
if ch == '\r' {
// Skip \n if it follows \r
if let Some(&'\n') = chars.peek() {
chars.next();
}
}
if !current_field.is_empty() || !current_row.is_empty() {
if current_field == null_symbol {
current_row.push(None);
} else {
current_row.push(Some(current_field.clone()));
}
current_field.clear();
// Validate column count
row_number += 1;
if let Some(expected) = expected_column_count {
if current_row.len() != expected {
return Err(ParserError::ParserError(format!(
"CSV row {} has {} columns, but expected {} columns based on first row",
row_number,
current_row.len(),
expected
)));
}
} else {
// First row establishes the expected column count
expected_column_count = Some(current_row.len());
}
result.push(current_row.clone());
current_row.clear();
}
} else {
current_field.push(ch);
}
}
// Handle remaining field/row
if !current_field.is_empty() || !current_row.is_empty() {
if current_field == null_symbol {
current_row.push(None);
} else {
current_row.push(Some(current_field));
}
// Validate column count for last row
row_number += 1;
if let Some(expected) = expected_column_count {
if current_row.len() != expected {
return Err(ParserError::ParserError(format!(
"CSV row {} has {} columns, but expected {} columns based on first row",
row_number,
current_row.len(),
expected
)));
}
}
// Note: if this is the first and only row, we don't need to set expected_column_count
// since there's nothing to validate against
result.push(current_row);
}
Ok(result)
}
/// Parse a copy statement
pub fn parse_copy(&mut self) -> Result<Statement, ParserError> {
let source;
@ -9735,9 +9607,12 @@ impl<'a> Parser<'a> {
}
let values = if let CopyTarget::Stdin = target {
self.expect_token(&Token::SemiColon)?;
self.parse_csv_body(&options, &legacy_options)?
let Token::CopyFromStdin(body) = self.next_token().token else {
return self.expected("COPY ... FROM STDIN with CSV body", self.peek_token());
};
Some(body)
} else {
vec![]
None
};
Ok(Statement::Copy(Copy {
source,

View file

@ -1050,25 +1050,13 @@ fn parse_copy_from_stdin() {
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
2,NICK,WAHLBERG,2006-02-15 09:34:33
\."#;
let parsed = pg_and_generic().parse_sql_statements(incorrect_csv_sql);
assert_eq!(
parsed.unwrap_err(),
ParserError::ParserError(
"CSV row 2 has 4 columns, but expected 5 columns based on first row".to_string()
)
);
pg_and_generic().verified_stmt(incorrect_csv_sql);
let mixed_incorrect_separators = r#"COPY public.actor (actor_id, first_name, last_name, last_update, value) FROM STDIN (FORMAT csv, DELIMITER ',');
1,PENELOPE,GUINESS,2006-02-15 09:34:33,0.11111
2 NICK WAHLBERG 2006-02-15 09:34:33,0.22222
\."#;
let parsed = pg_and_generic().parse_sql_statements(mixed_incorrect_separators);
assert_eq!(
parsed.unwrap_err(),
ParserError::ParserError(
"CSV row 2 has 2 columns, but expected 5 columns based on first row".to_string()
)
);
pg_and_generic().verified_stmt(mixed_incorrect_separators);
}
#[test]
@ -1087,7 +1075,7 @@ fn test_copy_from() {
},
options: vec![],
legacy_options: vec![],
values: vec![],
values: None,
})
);
@ -1105,7 +1093,7 @@ fn test_copy_from() {
},
options: vec![],
legacy_options: vec![CopyLegacyOption::Delimiter(',')],
values: vec![],
values: None,
})
);
@ -1126,7 +1114,7 @@ fn test_copy_from() {
CopyLegacyOption::Delimiter(','),
CopyLegacyOption::Csv(vec![CopyLegacyCsvOption::Header,])
],
values: vec![],
values: None,
})
);
}
@ -1147,7 +1135,7 @@ fn test_copy_to() {
},
options: vec![],
legacy_options: vec![],
values: vec![],
values: None,
})
);
@ -1165,7 +1153,7 @@ fn test_copy_to() {
},
options: vec![],
legacy_options: vec![CopyLegacyOption::Delimiter(',')],
values: vec![],
values: None,
})
);
@ -1186,7 +1174,7 @@ fn test_copy_to() {
CopyLegacyOption::Delimiter(','),
CopyLegacyOption::Csv(vec![CopyLegacyCsvOption::Header,])
],
values: vec![],
values: None,
})
)
}
@ -1240,7 +1228,7 @@ fn parse_copy_from() {
CopyOption::Encoding("utf8".into()),
],
legacy_options: vec![],
values: vec![],
values: None,
})
);
}
@ -1270,7 +1258,7 @@ fn parse_copy_to() {
},
options: vec![],
legacy_options: vec![],
values: vec![],
values: None,
})
);
@ -1286,7 +1274,7 @@ fn parse_copy_to() {
target: CopyTarget::Stdout,
options: vec![CopyOption::Delimiter('|')],
legacy_options: vec![],
values: vec![],
values: None,
})
);
@ -1305,7 +1293,7 @@ fn parse_copy_to() {
},
options: vec![],
legacy_options: vec![],
values: vec![],
values: None,
})
);
@ -1373,7 +1361,7 @@ fn parse_copy_to() {
},
options: vec![],
legacy_options: vec![],
values: vec![],
values: None,
})
)
}
@ -1404,7 +1392,7 @@ fn parse_copy_from_before_v9_0() {
CopyLegacyCsvOption::ForceNotNull(vec!["column".into()]),
]),
],
values: vec![],
values: None,
})
);
@ -1430,7 +1418,7 @@ fn parse_copy_from_before_v9_0() {
CopyLegacyCsvOption::Escape('\\'),
]),
],
values: vec![],
values: None,
})
);
}
@ -1461,7 +1449,7 @@ fn parse_copy_to_before_v9_0() {
CopyLegacyCsvOption::ForceQuote(vec!["column".into()]),
]),
],
values: vec![],
values: None,
})
)
}