Support for BigQuery struct, array and bytes, int64, float64 datatypes (#1003)

2025-10-21 19:21:46 +00:00 · 2023-10-25 18:57:33 +02:00 · 2023-10-25 18:57:33 +02:00 · 2f437db2a6
commit 2f437db2a6
parent 65317edcb9
8 changed files with 901 additions and 65 deletions
--- a/src/ast/data_type.rs
+++ b/src/ast/data_type.rs
@ -20,7 +20,7 @@ use serde::{Deserialize, Serialize};
 #[cfg(feature = "visitor")]
 use sqlparser_derive::{Visit, VisitMut};

-use crate::ast::ObjectName;
+use crate::ast::{display_comma_separated, ObjectName, StructField};

 use super::value::escape_single_quote_string;

@ -71,6 +71,10 @@ pub enum DataType {
    /// [standard]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#binary-large-object-string-type
    /// [Oracle]: https://docs.oracle.com/javadb/10.8.3.0/ref/rrefblob.html
    Blob(Option<u64>),
+    /// Variable-length binary data with optional length.
+    ///
+    /// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#bytes_type
+    Bytes(Option<u64>),
    /// Numeric type with optional precision and scale e.g. NUMERIC(10,2), [standard][1]
    ///
    /// [1]: https://jakewheat.github.io/sql-overview/sql-2016-foundation-grammar.html#exact-numeric-type
@ -125,6 +129,10 @@ pub enum DataType {
    ///
    /// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
    Int4(Option<u64>),
+    /// Integer type in [bigquery]
+    ///
+    /// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
+    Int64,
    /// Integer with optional display width e.g. INTEGER or INTEGER(11)
    Integer(Option<u64>),
    /// Unsigned int with optional display width e.g. INT UNSIGNED or INT(11) UNSIGNED
@ -149,6 +157,10 @@ pub enum DataType {
    ///
    /// [postgresql]: https://www.postgresql.org/docs/15/datatype.html
    Float4,
+    /// Floating point in [bigquery]
+    ///
+    /// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
+    Float64,
    /// Floating point e.g. REAL
    Real,
    /// Float8 as alias for Double in [postgresql]
@ -190,18 +202,23 @@ pub enum DataType {
    Regclass,
    /// Text
    Text,
-    /// String
-    String,
+    /// String with optional length.
+    String(Option<u64>),
    /// Bytea
    Bytea,
    /// Custom type such as enums
    Custom(ObjectName, Vec<String>),
    /// Arrays
-    Array(Option<Box<DataType>>),
+    Array(ArrayElemTypeDef),
    /// Enums
    Enum(Vec<String>),
    /// Set
    Set(Vec<String>),
+    /// Struct
+    ///
+    /// [hive]: https://docs.cloudera.com/cdw-runtime/cloud/impala-sql-reference/topics/impala-struct.html
+    /// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
+    Struct(Vec<StructField>),
 }

 impl fmt::Display for DataType {
@ -231,6 +248,7 @@ impl fmt::Display for DataType {
                format_type_with_optional_length(f, "VARBINARY", size, false)
            }
            DataType::Blob(size) => format_type_with_optional_length(f, "BLOB", size, false),
+            DataType::Bytes(size) => format_type_with_optional_length(f, "BYTES", size, false),
            DataType::Numeric(info) => {
                write!(f, "NUMERIC{info}")
            }
@ -274,6 +292,9 @@ impl fmt::Display for DataType {
            DataType::Int4(zerofill) => {
                format_type_with_optional_length(f, "INT4", zerofill, false)
            }
+            DataType::Int64 => {
+                write!(f, "INT64")
+            }
            DataType::UnsignedInt4(zerofill) => {
                format_type_with_optional_length(f, "INT4", zerofill, true)
            }
@ -297,6 +318,7 @@ impl fmt::Display for DataType {
            }
            DataType::Real => write!(f, "REAL"),
            DataType::Float4 => write!(f, "FLOAT4"),
+            DataType::Float64 => write!(f, "FLOAT64"),
            DataType::Double => write!(f, "DOUBLE"),
            DataType::Float8 => write!(f, "FLOAT8"),
            DataType::DoublePrecision => write!(f, "DOUBLE PRECISION"),
@ -316,15 +338,13 @@ impl fmt::Display for DataType {
            DataType::JSON => write!(f, "JSON"),
            DataType::Regclass => write!(f, "REGCLASS"),
            DataType::Text => write!(f, "TEXT"),
-            DataType::String => write!(f, "STRING"),
+            DataType::String(size) => format_type_with_optional_length(f, "STRING", size, false),
            DataType::Bytea => write!(f, "BYTEA"),
-            DataType::Array(ty) => {
-                if let Some(t) = &ty {
-                    write!(f, "{t}[]")
-                } else {
-                    write!(f, "ARRAY")
-                }
-            }
+            DataType::Array(ty) => match ty {
+                ArrayElemTypeDef::None => write!(f, "ARRAY"),
+                ArrayElemTypeDef::SquareBracket(t) => write!(f, "{t}[]"),
+                ArrayElemTypeDef::AngleBracket(t) => write!(f, "ARRAY<{t}>"),
+            },
            DataType::Custom(ty, modifiers) => {
                if modifiers.is_empty() {
                    write!(f, "{ty}")
@ -352,6 +372,13 @@ impl fmt::Display for DataType {
                }
                write!(f, ")")
            }
+            DataType::Struct(fields) => {
+                if !fields.is_empty() {
+                    write!(f, "STRUCT<{}>", display_comma_separated(fields))
+                } else {
+                    write!(f, "STRUCT")
+                }
+            }
        }
    }
 }
@ -533,3 +560,19 @@ impl fmt::Display for CharLengthUnits {
        }
    }
 }
+
+/// Represents the data type of the elements in an array (if any) as well as
+/// the syntax used to declare the array.
+///
+/// For example: BigQuery/Hive use `ARRAY<INT>` whereas Snowflake uses a bare `ARRAY`.
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+pub enum ArrayElemTypeDef {
+    /// `ARRAY`: no element type is given.
+    None,
+    /// `ARRAY<INT>`: the element type is written inside angle brackets.
+    AngleBracket(Box<DataType>),
+    /// `INT[]`: the element type is followed by square brackets.
+    SquareBracket(Box<DataType>),
+}
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -26,7 +26,7 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};

 pub use self::data_type::{
-    CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
+    ArrayElemTypeDef, CharLengthUnits, CharacterLength, DataType, ExactNumberInfo, TimezoneInfo,
 };
 pub use self::dcl::{AlterRoleOperation, ResetConfig, RoleOption, SetConfigValue};
 pub use self::ddl::{
@ -323,6 +323,27 @@ impl fmt::Display for JsonOperator {
    }
 }

+/// A field definition within a struct: an optional name plus a data type (see [bigquery]).
+///
+/// [bigquery]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
+pub struct StructField {
+    pub field_name: Option<Ident>, // `None` when the field is anonymous (type only)
+    pub field_type: DataType, // data type declared for the field
+}
+
+impl fmt::Display for StructField {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        if let Some(name) = &self.field_name {
+            write!(f, "{name} {}", self.field_type) // named field: `name TYPE`
+        } else {
+            write!(f, "{}", self.field_type) // anonymous field: `TYPE` only
+        }
+    }
+}
+
 /// Options for `CAST` / `TRY_CAST`
 /// BigQuery: <https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#formatting_syntax>
 #[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
@ -597,6 +618,26 @@ pub enum Expr {
    Rollup(Vec<Vec<Expr>>),
    /// ROW / TUPLE a single value, such as `SELECT (1, 2)`
    Tuple(Vec<Expr>),
+    /// `BigQuery` specific `Struct` literal expression [1]
+    /// Syntax:
+    /// ```sql
+    /// STRUCT<[field_name] field_type, ...>( expr1 [, ... ])
+    /// ```
+    /// [1]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
+    Struct {
+        /// Struct values.
+        values: Vec<Expr>,
+        /// Struct field definitions.
+        fields: Vec<StructField>,
+    },
+    /// `BigQuery` specific: A named expression in a typeless struct [1]
+    ///
+    /// Syntax
+    /// ```sql
+    /// 1 AS A
+    /// ```
+    /// [1]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#struct_type
+    Named { expr: Box<Expr>, name: Ident },
    /// An array index expression e.g. `(ARRAY[1, 2])[1]` or `(current_schemas(FALSE))[1]`
    ArrayIndex { obj: Box<Expr>, indexes: Vec<Expr> },
    /// An array expression e.g. `ARRAY[1, 2]`
@ -997,6 +1038,21 @@ impl fmt::Display for Expr {
            Expr::Tuple(exprs) => {
                write!(f, "({})", display_comma_separated(exprs))
            }
+            Expr::Struct { values, fields } => {
+                if !fields.is_empty() {
+                    write!(
+                        f,
+                        "STRUCT<{}>({})",
+                        display_comma_separated(fields),
+                        display_comma_separated(values)
+                    )
+                } else {
+                    write!(f, "STRUCT({})", display_comma_separated(values))
+                }
+            }
+            Expr::Named { expr, name } => {
+                write!(f, "{} AS {}", expr, name)
+            }
            Expr::ArrayIndex { obj, indexes } => {
                write!(f, "{obj}")?;
                for i in indexes {
--- a/src/keywords.rs
+++ b/src/keywords.rs
@ -120,6 +120,7 @@ define_keywords!(
    BY,
    BYPASSRLS,
    BYTEA,
+    BYTES,
    CACHE,
    CALL,
    CALLED,
@ -270,6 +271,7 @@ define_keywords!(
    FIRST_VALUE,
    FLOAT,
    FLOAT4,
+    FLOAT64,
    FLOAT8,
    FLOOR,
    FOLLOWING,
@ -293,6 +295,7 @@ define_keywords!(
    FUSION,
    GENERATE,
    GENERATED,
+    GEOGRAPHY,
    GET,
    GLOBAL,
    GRANT,
@ -328,6 +331,7 @@ define_keywords!(
    INT,
    INT2,
    INT4,
+    INT64,
    INT8,
    INTEGER,
    INTERSECT,
@ -584,6 +588,7 @@ define_keywords!(
    STORED,
    STRICT,
    STRING,
+    STRUCT,
    SUBMULTISET,
    SUBSTRING,
    SUBSTRING_REGEX,
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@ -30,7 +30,7 @@ use IsOptional::*;
 use crate::ast::helpers::stmt_create_table::CreateTableBuilder;
 use crate::ast::*;
 use crate::dialect::*;
-use crate::keywords::{self, Keyword};
+use crate::keywords::{self, Keyword, ALL_KEYWORDS};
 use crate::tokenizer::*;

 mod alter;
@ -197,6 +197,26 @@ impl std::error::Error for ParserError {}
 // By default, allow expressions up to this deep before erroring
 const DEFAULT_REMAINING_DEPTH: usize = 50;

+/// Composite type declarations using angle bracket syntax can be arbitrarily
+/// nested such that the following declaration is possible:
+///      `ARRAY<ARRAY<INT>>`
+/// But the tokenizer recognizes the `>>` as a ShiftRight token.
+/// We work around that limitation when parsing a data type by accepting
+/// either a `>` or `>>` token in such cases, remembering which variant we
+/// matched.
+/// In the latter case, having matched a `>>`, the parent type will not look to
+/// match its closing `>` as a result, since that will have taken place at the
+/// child type.
+///
+/// See [Parser::parse_data_type] for details
+struct MatchedTrailingBracket(bool); // `true` when a `>>` token was matched
+
+impl From<bool> for MatchedTrailingBracket {
+    fn from(value: bool) -> Self {
+        Self(value) // lets call sites write `true.into()` / `false.into()`
+    }
+}
+
 /// Options that control how the [`Parser`] parses SQL text
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct ParserOptions {
@ -833,6 +853,10 @@ impl<'a> Parser<'a> {
                Keyword::MATCH if dialect_of!(self is MySqlDialect | GenericDialect) => {
                    self.parse_match_against()
                }
+                Keyword::STRUCT if dialect_of!(self is BigQueryDialect | GenericDialect) => {
+                    self.prev_token();
+                    self.parse_bigquery_struct_literal()
+                }
                // Here `w` is a word, check if it's a part of a multi-part
                // identifier, a function call, or a simple identifier:
                _ => match self.peek_token().token {
@ -1798,6 +1822,172 @@ impl<'a> Parser<'a> {
        }))
    }

+    /// BigQuery specific: Parse a struct literal, starting at the `STRUCT` keyword.
+    /// Syntax
+    /// ```sql
+    /// -- typed
+    /// STRUCT<[field_name] field_type, ...>( expr1 [, ... ])
+    /// -- typeless
+    /// STRUCT( expr1 [AS field_name] [, ... ])
+    /// ```
+    fn parse_bigquery_struct_literal(&mut self) -> Result<Expr, ParserError> {
+        let (fields, trailing_bracket) =
+            self.parse_struct_type_def(Self::parse_big_query_struct_field_def)?;
+        if trailing_bracket.0 { // the field list was closed by `>>`: one `>` too many here
+            return parser_err!("unmatched > in STRUCT literal", self.peek_token().location);
+        }
+
+        self.expect_token(&Token::LParen)?;
+        let values = self
+            .parse_comma_separated(|parser| parser.parse_struct_field_expr(!fields.is_empty()))?; // typed syntax iff field types were declared
+        self.expect_token(&Token::RParen)?;
+
+        Ok(Expr::Struct { values, fields })
+    }
+
+    /// Parse an expression value for a BigQuery struct [1]
+    /// Syntax
+    /// ```sql
+    /// expr [AS name]
+    /// ```
+    ///
+    /// Parameter `typed_syntax` is set to true if the expression
+    /// is to be parsed as a field expression declared using typed
+    /// struct syntax [2] (where `AS` is an error), and false if using typeless struct syntax [3].
+    ///
+    /// [1]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#constructing_a_struct
+    /// [2]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#typed_struct_syntax
+    /// [3]: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#typeless_struct_syntax
+    fn parse_struct_field_expr(&mut self, typed_syntax: bool) -> Result<Expr, ParserError> {
+        let expr = self.parse_expr()?;
+        if self.parse_keyword(Keyword::AS) {
+            if typed_syntax {
+                return parser_err!("Typed syntax does not allow AS", {
+                    self.prev_token(); // step back so the location is presumably that of `AS` - TODO confirm
+                    self.peek_token().location
+                });
+            }
+            let field_name = self.parse_identifier()?;
+            Ok(Expr::Named {
+                expr: expr.into(), // boxes the value expression
+                name: field_name,
+            })
+        } else {
+            Ok(expr) // no alias: the bare expression is returned unchanged
+        }
+    }
+
+    /// Parse a Struct type definition as a comma-separated sequence of field definitions.
+    /// The syntax of the Struct elem differs by dialect so it is customised
+    /// by the `elem_parser` argument.
+    ///
+    /// Syntax
+    /// ```sql
+    /// Hive:
+    /// STRUCT<field_name: field_type>
+    ///
+    /// BigQuery:
+    /// STRUCT<[field_name] field_type>
+    /// ```
+    fn parse_struct_type_def<F>(
+        &mut self,
+        mut elem_parser: F,
+    ) -> Result<(Vec<StructField>, MatchedTrailingBracket), ParserError>
+    where
+        F: FnMut(&mut Parser<'a>) -> Result<(StructField, MatchedTrailingBracket), ParserError>,
+    {
+        let start_token = self.peek_token(); // kept only for the error location below
+        self.expect_keyword(Keyword::STRUCT)?;
+
+        // Nothing to do if we have no type information, i.e. no `<` follows `STRUCT`.
+        if Token::Lt != self.peek_token() {
+            return Ok((Default::default(), false.into()));
+        }
+        self.next_token(); // consume the `<`
+
+        let mut field_defs = vec![];
+        let trailing_bracket = loop {
+            let (def, trailing_bracket) = elem_parser(self)?;
+            field_defs.push(def);
+            if !self.consume_token(&Token::Comma) {
+                break trailing_bracket; // last field: carry its `>>` state to the closing check
+            }
+
+            // Angle brackets are balanced so we only expect the trailing `>>` after
+            // we've matched all field types for the current struct.
+            // e.g. this is invalid syntax `STRUCT<STRUCT<INT>>>, INT>(NULL)`
+            if trailing_bracket.0 {
+                return parser_err!("unmatched > in STRUCT definition", start_token.location);
+            }
+        };
+
+        Ok((
+            field_defs,
+            self.expect_closing_angle_bracket(trailing_bracket)?, // consumes this struct's own `>` unless already matched
+        ))
+    }
+
+    /// Parse a field definition in a BigQuery struct.
+    /// Syntax:
+    ///
+    /// ```sql
+    /// [field_name] field_type
+    /// ```
+    fn parse_big_query_struct_field_def(
+        &mut self,
+    ) -> Result<(StructField, MatchedTrailingBracket), ParserError> {
+        let is_anonymous_field = if let Token::Word(w) = self.peek_token().token { // a leading keyword is read as the type, so no name
+            ALL_KEYWORDS // assumes this table is sorted, as `binary_search` requires - TODO confirm
+                .binary_search(&w.value.to_uppercase().as_str()) // NOTE(review): compares the word's text, so a name spelled like a keyword counts as a type
+                .is_ok()
+        } else {
+            false
+        };
+
+        let field_name = if is_anonymous_field {
+            None
+        } else {
+            Some(self.parse_identifier()?)
+        };
+
+        let (field_type, trailing_bracket) = self.parse_data_type_helper()?; // the helper also reports whether the type ended on `>>`
+
+        Ok((
+            StructField {
+                field_name,
+                field_type,
+            },
+            trailing_bracket,
+        ))
+    }
+
+    /// For nested types that use the angle bracket syntax, this matches either
+    /// `>`, `>>` or nothing depending on which variant is expected (specified by the previously
+    /// matched `trailing_bracket` argument). It returns whether `>>` was matched here,
+    /// i.e. whether the enclosing type's closing `>` has already been consumed.
+    fn expect_closing_angle_bracket(
+        &mut self,
+        trailing_bracket: MatchedTrailingBracket,
+    ) -> Result<MatchedTrailingBracket, ParserError> {
+        let trailing_bracket = if !trailing_bracket.0 {
+            match self.peek_token().token {
+                Token::Gt => {
+                    self.next_token(); // plain `>` closes only the current type
+                    false.into()
+                }
+                Token::ShiftRight => {
+                    self.next_token(); // `>>` closes the current type and its parent
+                    true.into()
+                }
+                _ => return self.expected(">", self.peek_token()),
+            }
+        } else {
+            false.into() // the child already matched our `>` as part of `>>`: consume nothing
+        };
+
+        Ok(trailing_bracket)
+    }
+
    /// Parse an operator following an expression
    pub fn parse_infix(&mut self, expr: Expr, precedence: u8) -> Result<Expr, ParserError> {
        // allow the dialect to override infix parsing
@ -4876,7 +5066,22 @@ impl<'a> Parser<'a> {

    /// Parse a SQL datatype (in the context of a CREATE TABLE statement for example)
    pub fn parse_data_type(&mut self) -> Result<DataType, ParserError> {
+        let (ty, trailing_bracket) = self.parse_data_type_helper()?;
+        if trailing_bracket.0 { // a `>>` was matched but no enclosing type owns the second `>`
+            return parser_err!(
+                format!("unmatched > after parsing data type {ty}"),
+                self.peek_token() // NOTE(review): other parser_err! calls in this file pass `.location` - confirm intended
+            );
+        }
+
+        Ok(ty)
+    }
+
+    fn parse_data_type_helper(
+        &mut self,
+    ) -> Result<(DataType, MatchedTrailingBracket), ParserError> {
        let next_token = self.next_token();
+        let mut trailing_bracket = false.into();
        let mut data = match next_token.token {
            Token::Word(w) => match w.keyword {
                Keyword::BOOLEAN => Ok(DataType::Boolean),
@ -4884,6 +5089,7 @@ impl<'a> Parser<'a> {
                Keyword::FLOAT => Ok(DataType::Float(self.parse_optional_precision()?)),
                Keyword::REAL => Ok(DataType::Real),
                Keyword::FLOAT4 => Ok(DataType::Float4),
+                Keyword::FLOAT64 => Ok(DataType::Float64),
                Keyword::FLOAT8 => Ok(DataType::Float8),
                Keyword::DOUBLE => {
                    if self.parse_keyword(Keyword::PRECISION) {
@ -4940,6 +5146,7 @@ impl<'a> Parser<'a> {
                        Ok(DataType::Int4(optional_precision?))
                    }
                }
+                Keyword::INT64 => Ok(DataType::Int64),
                Keyword::INTEGER => {
                    let optional_precision = self.parse_optional_precision();
                    if self.parse_keyword(Keyword::UNSIGNED) {
@ -4994,6 +5201,7 @@ impl<'a> Parser<'a> {
                Keyword::BINARY => Ok(DataType::Binary(self.parse_optional_precision()?)),
                Keyword::VARBINARY => Ok(DataType::Varbinary(self.parse_optional_precision()?)),
                Keyword::BLOB => Ok(DataType::Blob(self.parse_optional_precision()?)),
+                Keyword::BYTES => Ok(DataType::Bytes(self.parse_optional_precision()?)),
                Keyword::UUID => Ok(DataType::Uuid),
                Keyword::DATE => Ok(DataType::Date),
                Keyword::DATETIME => Ok(DataType::Datetime(self.parse_optional_precision()?)),
@ -5037,7 +5245,7 @@ impl<'a> Parser<'a> {
                Keyword::INTERVAL => Ok(DataType::Interval),
                Keyword::JSON => Ok(DataType::JSON),
                Keyword::REGCLASS => Ok(DataType::Regclass),
-                Keyword::STRING => Ok(DataType::String),
+                Keyword::STRING => Ok(DataType::String(self.parse_optional_precision()?)),
                Keyword::TEXT => Ok(DataType::Text),
                Keyword::BYTEA => Ok(DataType::Bytea),
                Keyword::NUMERIC => Ok(DataType::Numeric(
@ -5059,17 +5267,23 @@ impl<'a> Parser<'a> {
                Keyword::SET => Ok(DataType::Set(self.parse_string_values()?)),
                Keyword::ARRAY => {
                    if dialect_of!(self is SnowflakeDialect) {
-                        Ok(DataType::Array(None))
+                        Ok(DataType::Array(ArrayElemTypeDef::None))
                    } else {
-                        // Hive array syntax. Note that nesting arrays - or other Hive syntax
-                        // that ends with > will fail due to "C++" problem - >> is parsed as
-                        // Token::ShiftRight
                        self.expect_token(&Token::Lt)?;
-                        let inside_type = self.parse_data_type()?;
-                        self.expect_token(&Token::Gt)?;
-                        Ok(DataType::Array(Some(Box::new(inside_type))))
+                        let (inside_type, _trailing_bracket) = self.parse_data_type_helper()?;
+                        trailing_bracket = self.expect_closing_angle_bracket(_trailing_bracket)?;
+                        Ok(DataType::Array(ArrayElemTypeDef::AngleBracket(Box::new(
+                            inside_type,
+                        ))))
                    }
                }
+                Keyword::STRUCT if dialect_of!(self is BigQueryDialect) => {
+                    self.prev_token();
+                    let (field_defs, _trailing_bracket) =
+                        self.parse_struct_type_def(Self::parse_big_query_struct_field_def)?;
+                    trailing_bracket = _trailing_bracket;
+                    Ok(DataType::Struct(field_defs))
+                }
                _ => {
                    self.prev_token();
                    let type_name = self.parse_object_name()?;
@ -5087,9 +5301,9 @@ impl<'a> Parser<'a> {
        // Keyword::ARRAY syntax from above
        while self.consume_token(&Token::LBracket) {
            self.expect_token(&Token::RBracket)?;
-            data = DataType::Array(Some(Box::new(data)))
+            data = DataType::Array(ArrayElemTypeDef::SquareBracket(Box::new(data)))
        }
-        Ok(data)
+        Ok((data, trailing_bracket))
    }

    pub fn parse_string_values(&mut self) -> Result<Vec<String>, ParserError> {