mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-07-07 17:04:59 +00:00
Add support of parsing CLUSTERED BY clause for Hive (#1397)
This commit is contained in:
parent
222b7d127a
commit
7b4ac7ca9f
9 changed files with 166 additions and 36 deletions
|
@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
|
|||
use crate::ast::value::escape_single_quote_string;
|
||||
use crate::ast::{
|
||||
display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
|
||||
ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
|
||||
ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
|
||||
};
|
||||
use crate::tokenizer::Token;
|
||||
|
||||
|
@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Hive supports `CLUSTERED BY` statement in `CREATE TABLE`.
|
||||
/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
|
||||
///
|
||||
/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
|
||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
|
||||
pub struct ClusteredBy {
|
||||
pub columns: Vec<Ident>,
|
||||
pub sorted_by: Option<Vec<OrderByExpr>>,
|
||||
pub num_buckets: Value,
|
||||
}
|
||||
|
||||
impl fmt::Display for ClusteredBy {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"CLUSTERED BY ({})",
|
||||
display_comma_separated(&self.columns)
|
||||
)?;
|
||||
if let Some(ref sorted_by) = self.sorted_by {
|
||||
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
|
||||
}
|
||||
write!(f, " INTO {} BUCKETS", self.num_buckets)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
|
|||
pub use super::ddl::{ColumnDef, TableConstraint};
|
||||
|
||||
use super::{
|
||||
display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
|
||||
HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
|
||||
MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
|
||||
RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
|
||||
WrappedCollection,
|
||||
display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
|
||||
FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
|
||||
InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
|
||||
OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
|
||||
TableWithJoins, Tag, WrappedCollection,
|
||||
};
|
||||
|
||||
/// CREATE INDEX statement.
|
||||
|
@ -140,6 +140,9 @@ pub struct CreateTable {
|
|||
/// BigQuery: Table clustering column list.
|
||||
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
|
||||
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
|
||||
/// Hive: Table clustering column list.
|
||||
/// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
|
||||
pub clustered_by: Option<ClusteredBy>,
|
||||
/// BigQuery: Table options list.
|
||||
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
|
||||
pub options: Option<Vec<SqlOption>>,
|
||||
|
@ -236,19 +239,6 @@ impl Display for CreateTable {
|
|||
HiveDistributionStyle::PARTITIONED { columns } => {
|
||||
write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
|
||||
}
|
||||
HiveDistributionStyle::CLUSTERED {
|
||||
columns,
|
||||
sorted_by,
|
||||
num_buckets,
|
||||
} => {
|
||||
write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
|
||||
if !sorted_by.is_empty() {
|
||||
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
|
||||
}
|
||||
if *num_buckets > 0 {
|
||||
write!(f, " INTO {num_buckets} BUCKETS")?;
|
||||
}
|
||||
}
|
||||
HiveDistributionStyle::SKEWED {
|
||||
columns,
|
||||
on,
|
||||
|
@ -267,6 +257,10 @@ impl Display for CreateTable {
|
|||
_ => (),
|
||||
}
|
||||
|
||||
if let Some(clustered_by) = &self.clustered_by {
|
||||
write!(f, " {clustered_by}")?;
|
||||
}
|
||||
|
||||
if let Some(HiveFormat {
|
||||
row_format,
|
||||
serde_properties,
|
||||
|
|
|
@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};
|
|||
|
||||
use super::super::dml::CreateTable;
|
||||
use crate::ast::{
|
||||
ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
|
||||
OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
|
||||
TableEngine, Tag, WrappedCollection,
|
||||
ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
|
||||
ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
|
||||
TableConstraint, TableEngine, Tag, WrappedCollection,
|
||||
};
|
||||
use crate::parser::ParserError;
|
||||
|
||||
|
@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
|
|||
pub order_by: Option<OneOrManyWithParens<Expr>>,
|
||||
pub partition_by: Option<Box<Expr>>,
|
||||
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
|
||||
pub clustered_by: Option<ClusteredBy>,
|
||||
pub options: Option<Vec<SqlOption>>,
|
||||
pub strict: bool,
|
||||
pub copy_grants: bool,
|
||||
|
@ -125,6 +126,7 @@ impl CreateTableBuilder {
|
|||
order_by: None,
|
||||
partition_by: None,
|
||||
cluster_by: None,
|
||||
clustered_by: None,
|
||||
options: None,
|
||||
strict: false,
|
||||
copy_grants: false,
|
||||
|
@ -286,6 +288,11 @@ impl CreateTableBuilder {
|
|||
self
|
||||
}
|
||||
|
||||
pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
|
||||
self.clustered_by = clustered_by;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
|
||||
self.options = options;
|
||||
self
|
||||
|
@ -380,6 +387,7 @@ impl CreateTableBuilder {
|
|||
order_by: self.order_by,
|
||||
partition_by: self.partition_by,
|
||||
cluster_by: self.cluster_by,
|
||||
clustered_by: self.clustered_by,
|
||||
options: self.options,
|
||||
strict: self.strict,
|
||||
copy_grants: self.copy_grants,
|
||||
|
@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
|
|||
order_by,
|
||||
partition_by,
|
||||
cluster_by,
|
||||
clustered_by,
|
||||
options,
|
||||
strict,
|
||||
copy_grants,
|
||||
|
@ -476,6 +485,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
|
|||
order_by,
|
||||
partition_by,
|
||||
cluster_by,
|
||||
clustered_by,
|
||||
options,
|
||||
strict,
|
||||
copy_grants,
|
||||
|
|
|
@ -33,11 +33,11 @@ pub use self::data_type::{
|
|||
};
|
||||
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue, Use};
|
||||
pub use self::ddl::{
|
||||
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
|
||||
ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
|
||||
GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
|
||||
ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
|
||||
UserDefinedTypeRepresentation, ViewColumnDef,
|
||||
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
|
||||
ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
|
||||
GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
|
||||
Partition, ProcedureParam, ReferentialAction, TableConstraint,
|
||||
UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
|
||||
};
|
||||
pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
|
||||
pub use self::operator::{BinaryOperator, UnaryOperator};
|
||||
|
@ -5398,11 +5398,6 @@ pub enum HiveDistributionStyle {
|
|||
PARTITIONED {
|
||||
columns: Vec<ColumnDef>,
|
||||
},
|
||||
CLUSTERED {
|
||||
columns: Vec<Ident>,
|
||||
sorted_by: Vec<ColumnDef>,
|
||||
num_buckets: i32,
|
||||
},
|
||||
SKEWED {
|
||||
columns: Vec<ColumnDef>,
|
||||
on: Vec<ColumnDef>,
|
||||
|
|
|
@ -125,6 +125,7 @@ define_keywords!(
|
|||
BOTH,
|
||||
BROWSE,
|
||||
BTREE,
|
||||
BUCKETS,
|
||||
BY,
|
||||
BYPASSRLS,
|
||||
BYTEA,
|
||||
|
@ -156,6 +157,7 @@ define_keywords!(
|
|||
CLONE,
|
||||
CLOSE,
|
||||
CLUSTER,
|
||||
CLUSTERED,
|
||||
COALESCE,
|
||||
COLLATE,
|
||||
COLLATION,
|
||||
|
@ -675,6 +677,7 @@ define_keywords!(
|
|||
SNAPSHOT,
|
||||
SOME,
|
||||
SORT,
|
||||
SORTED,
|
||||
SOURCE,
|
||||
SPATIAL,
|
||||
SPECIFIC,
|
||||
|
|
|
@ -5377,7 +5377,7 @@ impl<'a> Parser<'a> {
|
|||
})
|
||||
}
|
||||
|
||||
//TODO: Implement parsing for Skewed and Clustered
|
||||
//TODO: Implement parsing for Skewed
|
||||
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
|
||||
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
|
||||
self.expect_token(&Token::LParen)?;
|
||||
|
@ -5574,6 +5574,7 @@ impl<'a> Parser<'a> {
|
|||
let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);
|
||||
|
||||
let hive_distribution = self.parse_hive_distribution()?;
|
||||
let clustered_by = self.parse_optional_clustered_by()?;
|
||||
let hive_formats = self.parse_hive_formats()?;
|
||||
// PostgreSQL supports `WITH ( options )`, before `AS`
|
||||
let with_options = self.parse_options(Keyword::WITH)?;
|
||||
|
@ -5720,6 +5721,7 @@ impl<'a> Parser<'a> {
|
|||
.collation(collation)
|
||||
.on_commit(on_commit)
|
||||
.on_cluster(on_cluster)
|
||||
.clustered_by(clustered_by)
|
||||
.partition_by(create_table_config.partition_by)
|
||||
.cluster_by(create_table_config.cluster_by)
|
||||
.options(create_table_config.options)
|
||||
|
@ -6099,6 +6101,35 @@ impl<'a> Parser<'a> {
|
|||
}))
|
||||
}
|
||||
|
||||
pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
|
||||
let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
|
||||
&& self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
|
||||
{
|
||||
let columns = self.parse_parenthesized_column_list(Mandatory, false)?;
|
||||
|
||||
let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
|
||||
self.expect_token(&Token::LParen)?;
|
||||
let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
|
||||
self.expect_token(&Token::RParen)?;
|
||||
Some(sorted_by_columns)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.expect_keyword(Keyword::INTO)?;
|
||||
let num_buckets = self.parse_number_value()?;
|
||||
self.expect_keyword(Keyword::BUCKETS)?;
|
||||
Some(ClusteredBy {
|
||||
columns,
|
||||
sorted_by,
|
||||
num_buckets,
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(clustered_by)
|
||||
}
|
||||
|
||||
pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
|
||||
if self.parse_keyword(Keyword::RESTRICT) {
|
||||
Ok(ReferentialAction::Restrict)
|
||||
|
|
|
@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
|
|||
order_by: Default::default(),
|
||||
partition_by: Default::default(),
|
||||
cluster_by: Default::default(),
|
||||
clustered_by: Default::default(),
|
||||
options: Default::default(),
|
||||
strict: Default::default(),
|
||||
copy_grants: Default::default(),
|
||||
|
|
|
@ -16,9 +16,9 @@
|
|||
//! is also tested (on the inputs it can handle).
|
||||
|
||||
use sqlparser::ast::{
|
||||
CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
|
||||
FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
|
||||
UnaryOperator, Use, Value,
|
||||
ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
|
||||
FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
|
||||
SelectItem, Statement, TableFactor, UnaryOperator, Use, Value,
|
||||
};
|
||||
use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
|
||||
use sqlparser::parser::ParserError;
|
||||
|
@ -115,6 +115,74 @@ fn create_table_like() {
|
|||
hive().verified_stmt(like);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn create_table_with_clustered_by() {
|
||||
let sql = concat!(
|
||||
"CREATE TABLE db.table_name (a INT, b STRING)",
|
||||
" PARTITIONED BY (a INT, b STRING)",
|
||||
" CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
|
||||
" INTO 4 BUCKETS"
|
||||
);
|
||||
match hive_and_generic().verified_stmt(sql) {
|
||||
Statement::CreateTable(CreateTable { clustered_by, .. }) => {
|
||||
assert_eq!(
|
||||
clustered_by.unwrap(),
|
||||
ClusteredBy {
|
||||
columns: vec![Ident::new("a"), Ident::new("b")],
|
||||
sorted_by: Some(vec![
|
||||
OrderByExpr {
|
||||
expr: Expr::Identifier(Ident::new("a")),
|
||||
asc: Some(true),
|
||||
nulls_first: None,
|
||||
with_fill: None,
|
||||
},
|
||||
OrderByExpr {
|
||||
expr: Expr::Identifier(Ident::new("b")),
|
||||
asc: Some(false),
|
||||
nulls_first: None,
|
||||
with_fill: None,
|
||||
},
|
||||
]),
|
||||
num_buckets: Value::Number("4".parse().unwrap(), false),
|
||||
}
|
||||
)
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
|
||||
// SORTED BY is optional
|
||||
hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");
|
||||
|
||||
// missing INTO BUCKETS
|
||||
assert_eq!(
|
||||
hive_and_generic().parse_sql_statements(
|
||||
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)"
|
||||
).unwrap_err(),
|
||||
ParserError::ParserError("Expected: INTO, found: EOF".to_string())
|
||||
);
|
||||
// missing CLUSTER BY columns
|
||||
assert_eq!(
|
||||
hive_and_generic().parse_sql_statements(
|
||||
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS"
|
||||
).unwrap_err(),
|
||||
ParserError::ParserError("Expected: identifier, found: )".to_string())
|
||||
);
|
||||
// missing SORT BY columns
|
||||
assert_eq!(
|
||||
hive_and_generic().parse_sql_statements(
|
||||
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS"
|
||||
).unwrap_err(),
|
||||
ParserError::ParserError("Expected: (, found: INTO".to_string())
|
||||
);
|
||||
// missing number BUCKETS
|
||||
assert_eq!(
|
||||
hive_and_generic().parse_sql_statements(
|
||||
"CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO"
|
||||
).unwrap_err(),
|
||||
ParserError::ParserError("Expected: a value, found: EOF".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
// Turning off this test until we can parse identifiers starting with numbers :(
|
||||
#[test]
|
||||
fn test_identifier() {
|
||||
|
|
|
@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
|
|||
order_by: None,
|
||||
partition_by: None,
|
||||
cluster_by: None,
|
||||
clustered_by: None,
|
||||
options: None,
|
||||
strict: false,
|
||||
copy_grants: false,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue