Add support for parsing the CLUSTERED BY clause for Hive (#1397)

This commit is contained in:
hulk 2024-09-01 19:21:26 +08:00 committed by GitHub
parent 222b7d127a
commit 7b4ac7ca9f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 166 additions and 36 deletions

View file

@ -26,7 +26,7 @@ use sqlparser_derive::{Visit, VisitMut};
use crate::ast::value::escape_single_quote_string;
use crate::ast::{
display_comma_separated, display_separated, DataType, Expr, Ident, MySQLColumnPosition,
ObjectName, ProjectionSelect, SequenceOptions, SqlOption,
ObjectName, OrderByExpr, ProjectionSelect, SequenceOptions, SqlOption, Value,
};
use crate::tokenizer::Token;
@ -1417,3 +1417,30 @@ impl fmt::Display for Deduplicate {
}
}
}
/// Hive supports `CLUSTERED BY` statement in `CREATE TABLE`.
/// Syntax: `CLUSTERED BY (col_name, ...) [SORTED BY (col_name [ASC|DESC], ...)] INTO num_buckets BUCKETS`
///
/// [Hive](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable)
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct ClusteredBy {
    /// Columns listed in `CLUSTERED BY (col_name, ...)`.
    pub columns: Vec<Ident>,
    /// Optional `SORTED BY (...)` ordering; `None` when the clause is absent.
    pub sorted_by: Option<Vec<OrderByExpr>>,
    /// Bucket count from `INTO num_buckets BUCKETS`, kept as a numeric literal.
    pub num_buckets: Value,
}
/// Renders the clause as `CLUSTERED BY (...) [SORTED BY (...)] INTO n BUCKETS`,
/// matching the Hive `CREATE TABLE` syntax it was parsed from.
impl fmt::Display for ClusteredBy {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "CLUSTERED BY ({})", display_comma_separated(&self.columns))?;
        match self.sorted_by.as_deref() {
            // `SORTED BY` is optional and omitted entirely when absent.
            Some(order_exprs) => {
                write!(f, " SORTED BY ({})", display_comma_separated(order_exprs))?
            }
            None => {}
        }
        write!(f, " INTO {} BUCKETS", self.num_buckets)
    }
}

View file

@ -22,11 +22,11 @@ use sqlparser_derive::{Visit, VisitMut};
pub use super::ddl::{ColumnDef, TableConstraint};
use super::{
display_comma_separated, display_separated, CommentDef, Expr, FileFormat, FromTable,
HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident, InsertAliases,
MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens, OrderByExpr, Query,
RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine, TableWithJoins, Tag,
WrappedCollection,
display_comma_separated, display_separated, ClusteredBy, CommentDef, Expr, FileFormat,
FromTable, HiveDistributionStyle, HiveFormat, HiveIOFormat, HiveRowFormat, Ident,
InsertAliases, MysqlInsertPriority, ObjectName, OnCommit, OnInsert, OneOrManyWithParens,
OrderByExpr, Query, RowAccessPolicy, SelectItem, SqlOption, SqliteOnConflict, TableEngine,
TableWithJoins, Tag, WrappedCollection,
};
/// CREATE INDEX statement.
@ -140,6 +140,9 @@ pub struct CreateTable {
/// BigQuery: Table clustering column list.
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
/// Hive: Table clustering column list.
/// <https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-CreateTable>
pub clustered_by: Option<ClusteredBy>,
/// BigQuery: Table options list.
/// <https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language#table_option_list>
pub options: Option<Vec<SqlOption>>,
@ -236,19 +239,6 @@ impl Display for CreateTable {
HiveDistributionStyle::PARTITIONED { columns } => {
write!(f, " PARTITIONED BY ({})", display_comma_separated(columns))?;
}
HiveDistributionStyle::CLUSTERED {
columns,
sorted_by,
num_buckets,
} => {
write!(f, " CLUSTERED BY ({})", display_comma_separated(columns))?;
if !sorted_by.is_empty() {
write!(f, " SORTED BY ({})", display_comma_separated(sorted_by))?;
}
if *num_buckets > 0 {
write!(f, " INTO {num_buckets} BUCKETS")?;
}
}
HiveDistributionStyle::SKEWED {
columns,
on,
@ -267,6 +257,10 @@ impl Display for CreateTable {
_ => (),
}
if let Some(clustered_by) = &self.clustered_by {
write!(f, " {clustered_by}")?;
}
if let Some(HiveFormat {
row_format,
serde_properties,

View file

@ -9,9 +9,9 @@ use sqlparser_derive::{Visit, VisitMut};
use super::super::dml::CreateTable;
use crate::ast::{
ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident, ObjectName,
OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement, TableConstraint,
TableEngine, Tag, WrappedCollection,
ClusteredBy, ColumnDef, CommentDef, Expr, FileFormat, HiveDistributionStyle, HiveFormat, Ident,
ObjectName, OnCommit, OneOrManyWithParens, Query, RowAccessPolicy, SqlOption, Statement,
TableConstraint, TableEngine, Tag, WrappedCollection,
};
use crate::parser::ParserError;
@ -78,6 +78,7 @@ pub struct CreateTableBuilder {
pub order_by: Option<OneOrManyWithParens<Expr>>,
pub partition_by: Option<Box<Expr>>,
pub cluster_by: Option<WrappedCollection<Vec<Ident>>>,
pub clustered_by: Option<ClusteredBy>,
pub options: Option<Vec<SqlOption>>,
pub strict: bool,
pub copy_grants: bool,
@ -125,6 +126,7 @@ impl CreateTableBuilder {
order_by: None,
partition_by: None,
cluster_by: None,
clustered_by: None,
options: None,
strict: false,
copy_grants: false,
@ -286,6 +288,11 @@ impl CreateTableBuilder {
self
}
pub fn clustered_by(mut self, clustered_by: Option<ClusteredBy>) -> Self {
self.clustered_by = clustered_by;
self
}
pub fn options(mut self, options: Option<Vec<SqlOption>>) -> Self {
self.options = options;
self
@ -380,6 +387,7 @@ impl CreateTableBuilder {
order_by: self.order_by,
partition_by: self.partition_by,
cluster_by: self.cluster_by,
clustered_by: self.clustered_by,
options: self.options,
strict: self.strict,
copy_grants: self.copy_grants,
@ -434,6 +442,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
order_by,
partition_by,
cluster_by,
clustered_by,
options,
strict,
copy_grants,
@ -476,6 +485,7 @@ impl TryFrom<Statement> for CreateTableBuilder {
order_by,
partition_by,
cluster_by,
clustered_by,
options,
strict,
copy_grants,

View file

@ -33,11 +33,11 @@ pub use self::data_type::{
};
pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue, Use};
pub use self::ddl::{
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ColumnDef, ColumnOption,
ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial, GeneratedAs,
GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner, Partition,
ProcedureParam, ReferentialAction, TableConstraint, UserDefinedTypeCompositeAttributeDef,
UserDefinedTypeRepresentation, ViewColumnDef,
AlterColumnOperation, AlterIndexOperation, AlterTableOperation, ClusteredBy, ColumnDef,
ColumnOption, ColumnOptionDef, ConstraintCharacteristics, Deduplicate, DeferrableInitial,
GeneratedAs, GeneratedExpressionMode, IndexOption, IndexType, KeyOrIndexDisplay, Owner,
Partition, ProcedureParam, ReferentialAction, TableConstraint,
UserDefinedTypeCompositeAttributeDef, UserDefinedTypeRepresentation, ViewColumnDef,
};
pub use self::dml::{CreateIndex, CreateTable, Delete, Insert};
pub use self::operator::{BinaryOperator, UnaryOperator};
@ -5398,11 +5398,6 @@ pub enum HiveDistributionStyle {
PARTITIONED {
columns: Vec<ColumnDef>,
},
CLUSTERED {
columns: Vec<Ident>,
sorted_by: Vec<ColumnDef>,
num_buckets: i32,
},
SKEWED {
columns: Vec<ColumnDef>,
on: Vec<ColumnDef>,

View file

@ -125,6 +125,7 @@ define_keywords!(
BOTH,
BROWSE,
BTREE,
BUCKETS,
BY,
BYPASSRLS,
BYTEA,
@ -156,6 +157,7 @@ define_keywords!(
CLONE,
CLOSE,
CLUSTER,
CLUSTERED,
COALESCE,
COLLATE,
COLLATION,
@ -675,6 +677,7 @@ define_keywords!(
SNAPSHOT,
SOME,
SORT,
SORTED,
SOURCE,
SPATIAL,
SPECIFIC,

View file

@ -5377,7 +5377,7 @@ impl<'a> Parser<'a> {
})
}
//TODO: Implement parsing for Skewed and Clustered
//TODO: Implement parsing for Skewed
pub fn parse_hive_distribution(&mut self) -> Result<HiveDistributionStyle, ParserError> {
if self.parse_keywords(&[Keyword::PARTITIONED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
@ -5574,6 +5574,7 @@ impl<'a> Parser<'a> {
let without_rowid = self.parse_keywords(&[Keyword::WITHOUT, Keyword::ROWID]);
let hive_distribution = self.parse_hive_distribution()?;
let clustered_by = self.parse_optional_clustered_by()?;
let hive_formats = self.parse_hive_formats()?;
// PostgreSQL supports `WITH ( options )`, before `AS`
let with_options = self.parse_options(Keyword::WITH)?;
@ -5720,6 +5721,7 @@ impl<'a> Parser<'a> {
.collation(collation)
.on_commit(on_commit)
.on_cluster(on_cluster)
.clustered_by(clustered_by)
.partition_by(create_table_config.partition_by)
.cluster_by(create_table_config.cluster_by)
.options(create_table_config.options)
@ -6099,6 +6101,35 @@ impl<'a> Parser<'a> {
}))
}
pub fn parse_optional_clustered_by(&mut self) -> Result<Option<ClusteredBy>, ParserError> {
let clustered_by = if dialect_of!(self is HiveDialect|GenericDialect)
&& self.parse_keywords(&[Keyword::CLUSTERED, Keyword::BY])
{
let columns = self.parse_parenthesized_column_list(Mandatory, false)?;
let sorted_by = if self.parse_keywords(&[Keyword::SORTED, Keyword::BY]) {
self.expect_token(&Token::LParen)?;
let sorted_by_columns = self.parse_comma_separated(|p| p.parse_order_by_expr())?;
self.expect_token(&Token::RParen)?;
Some(sorted_by_columns)
} else {
None
};
self.expect_keyword(Keyword::INTO)?;
let num_buckets = self.parse_number_value()?;
self.expect_keyword(Keyword::BUCKETS)?;
Some(ClusteredBy {
columns,
sorted_by,
num_buckets,
})
} else {
None
};
Ok(clustered_by)
}
pub fn parse_referential_action(&mut self) -> Result<ReferentialAction, ParserError> {
if self.parse_keyword(Keyword::RESTRICT) {
Ok(ReferentialAction::Restrict)

View file

@ -741,6 +741,7 @@ fn test_duckdb_union_datatype() {
order_by: Default::default(),
partition_by: Default::default(),
cluster_by: Default::default(),
clustered_by: Default::default(),
options: Default::default(),
strict: Default::default(),
copy_grants: Default::default(),

View file

@ -16,9 +16,9 @@
//! is also tested (on the inputs it can handle).
use sqlparser::ast::{
CreateFunctionBody, CreateFunctionUsing, Expr, Function, FunctionArgumentList,
FunctionArguments, Ident, ObjectName, OneOrManyWithParens, SelectItem, Statement, TableFactor,
UnaryOperator, Use, Value,
ClusteredBy, CreateFunctionBody, CreateFunctionUsing, CreateTable, Expr, Function,
FunctionArgumentList, FunctionArguments, Ident, ObjectName, OneOrManyWithParens, OrderByExpr,
SelectItem, Statement, TableFactor, UnaryOperator, Use, Value,
};
use sqlparser::dialect::{GenericDialect, HiveDialect, MsSqlDialect};
use sqlparser::parser::ParserError;
@ -115,6 +115,74 @@ fn create_table_like() {
hive().verified_stmt(like);
}
#[test]
fn create_table_with_clustered_by() {
    // Full form: CLUSTERED BY + optional SORTED BY + mandatory INTO ... BUCKETS.
    let sql = concat!(
        "CREATE TABLE db.table_name (a INT, b STRING)",
        " PARTITIONED BY (a INT, b STRING)",
        " CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC)",
        " INTO 4 BUCKETS"
    );
    let expected = ClusteredBy {
        columns: vec![Ident::new("a"), Ident::new("b")],
        sorted_by: Some(vec![
            OrderByExpr {
                expr: Expr::Identifier(Ident::new("a")),
                asc: Some(true),
                nulls_first: None,
                with_fill: None,
            },
            OrderByExpr {
                expr: Expr::Identifier(Ident::new("b")),
                asc: Some(false),
                nulls_first: None,
                with_fill: None,
            },
        ]),
        num_buckets: Value::Number("4".parse().unwrap(), false),
    };
    match hive_and_generic().verified_stmt(sql) {
        Statement::CreateTable(CreateTable { clustered_by, .. }) => {
            assert_eq!(clustered_by, Some(expected));
        }
        _ => unreachable!(),
    }

    // SORTED BY is optional
    hive_and_generic().verified_stmt("CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) INTO 4 BUCKETS");

    // Each malformed variant must fail with a precise parser error.
    let expect_parse_error = |sql: &str, message: &str| {
        assert_eq!(
            hive_and_generic().parse_sql_statements(sql).unwrap_err(),
            ParserError::ParserError(message.to_string())
        );
    };
    // missing INTO BUCKETS
    expect_parse_error(
        "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b)",
        "Expected: INTO, found: EOF",
    );
    // missing CLUSTER BY columns
    expect_parse_error(
        "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY () INTO 4 BUCKETS",
        "Expected: identifier, found: )",
    );
    // missing SORT BY columns
    expect_parse_error(
        "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY INTO 4 BUCKETS",
        "Expected: (, found: INTO",
    );
    // missing number BUCKETS
    expect_parse_error(
        "CREATE TABLE db.table_name (a INT, b STRING) PARTITIONED BY (a INT, b STRING) CLUSTERED BY (a, b) SORTED BY (a ASC, b DESC) INTO",
        "Expected: a value, found: EOF",
    );
}
// Turning off this test until we can parse identifiers starting with numbers :(
#[test]
fn test_identifier() {

View file

@ -4838,6 +4838,7 @@ fn parse_trigger_related_functions() {
order_by: None,
partition_by: None,
cluster_by: None,
clustered_by: None,
options: None,
strict: false,
copy_grants: false,