Support general "typed string" literals (#187)

Fixes #168 by enabling `DATE` and other keywords to be used as identifiers when not followed by a string literal. A "typed string" is our term for a generalized version of the `DATE '...'`/`TIME '...'`/`TIMESTAMP '...'` literals, represented as `TypedString { data_type, value }` in the AST. Unlike the standard DATE/TIME/TIMESTAMP literals, the generalized form is a non-standard extension, supported by at least PostgreSQL. This is a port of MaterializeInc/materialize#3146. Co-authored-by: Nikhil Benesch <nikhil.benesch@gmail.com> Co-authored-by: Nickolay Ponomarev <asqueella@gmail.com>
2025-10-09 21:42:05 +00:00 · 2020-06-11 14:04:43 -07:00 · 2020-06-11 14:04:43 -07:00 · 6cdd4a146d
commit 6cdd4a146d
parent 34548e890b
5 changed files with 116 additions and 43 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,7 +13,9 @@ Check https://github.com/andygrove/sqlparser-rs/commits/master for undocumented
 - Change `Ident` (previously a simple `String`) to store the parsed (unquoted) `value` of the identifier and the `quote_style` separately (#143) - thanks @apparebit!
 - Support Snowflake's `FROM (table_name)` (#155) - thanks @eyalleshem!
 - Add line and column number to TokenizerError (#194) - thanks @Dandandan!
+- Use Token::EOF instead of Option<Token> (#195)
 - Make the units keyword following `INTERVAL '...'` optional (#184) - thanks @maxcountryman!
+- Generalize `DATE`/`TIME`/`TIMESTAMP` literals representation in the AST (`TypedString { data_type, value }`) and allow `DATE` and other keywords to be used as identifiers when not followed by a string (#187) - thanks @maxcountryman!

 ### Added
 - Support MSSQL `TOP (<N>) [ PERCENT ] [ WITH TIES ]` (#150) - thanks @alexkyllo!
@ -26,6 +28,7 @@ Check https://github.com/andygrove/sqlparser-rs/commits/master for undocumented
 - Support `LISTAGG()` (#174) - thanks @maxcountryman!
 - Support the string concatentation operator `||` (#178) - thanks @Dandandan!
 - Support bitwise AND (`&`), OR (`|`), XOR (`^`) (#181) - thanks @Dandandan!
+- Add serde support to AST structs and enums (#196) - thanks @panarch!

 ### Fixed
 - Report an error for unterminated string literals (#165)
--- a/src/ast/mod.rs
+++ b/src/ast/mod.rs
@ -210,6 +210,10 @@ pub enum Expr {
    Nested(Box<Expr>),
    /// A literal value, such as string, number, date or NULL
    Value(Value),
+    /// A constant of form `<data_type> 'value'`.
+    /// This can represent ANSI SQL `DATE`, `TIME`, and `TIMESTAMP` literals (such as `DATE '2020-01-01'`),
+    /// as well as constants of other types (a non-standard PostgreSQL extension).
+    TypedString { data_type: DataType, value: String },
    /// Scalar function call e.g. `LEFT(foo, 5)`
    Function(Function),
    /// `CASE [<operand>] WHEN <condition> THEN <result> ... [ELSE <result>] END`
@ -284,6 +288,10 @@ impl fmt::Display for Expr {
            Expr::Collate { expr, collation } => write!(f, "{} COLLATE {}", expr, collation),
            Expr::Nested(ast) => write!(f, "({})", ast),
            Expr::Value(v) => write!(f, "{}", v),
+            Expr::TypedString { data_type, value } => {
+                write!(f, "{}", data_type)?;
+                write!(f, " '{}'", &value::escape_single_quote_string(value))
+            }
            Expr::Function(fun) => write!(f, "{}", fun),
            Expr::Case {
                operand,
--- a/src/ast/value.rs
+++ b/src/ast/value.rs
@ -33,12 +33,6 @@ pub enum Value {
    HexStringLiteral(String),
    /// Boolean value true or false
    Boolean(bool),
-    /// `DATE '...'` literals
-    Date(String),
-    /// `TIME '...'` literals
-    Time(String),
-    /// `TIMESTAMP '...'` literals
-    Timestamp(String),
    /// INTERVAL literals, roughly in the following format:
    /// `INTERVAL '<value>' [ <leading_field> [ (<leading_precision>) ] ]
    /// [ TO <last_field> [ (<fractional_seconds_precision>) ] ]`,
@ -70,9 +64,6 @@ impl fmt::Display for Value {
            Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
            Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
            Value::Boolean(v) => write!(f, "{}", v),
-            Value::Date(v) => write!(f, "DATE '{}'", escape_single_quote_string(v)),
-            Value::Time(v) => write!(f, "TIME '{}'", escape_single_quote_string(v)),
-            Value::Timestamp(v) => write!(f, "TIMESTAMP '{}'", escape_single_quote_string(v)),
            Value::Interval {
                value,
                leading_field: Some(DateTimeField::Second),
--- a/src/parser.rs
+++ b/src/parser.rs
@ -35,6 +35,15 @@ macro_rules! parser_err {
    };
 }

+// Returns early from the enclosing function with `Ok(v)` if the optional expression is `Some(v)`; otherwise execution continues.
+macro_rules! return_ok_if_some {
+    ($e:expr) => {{
+        if let Some(v) = $e {
+            return Ok(v);
+        }
+    }};
+}
+
 #[derive(PartialEq)]
 pub enum IsOptional {
    Optional,
@ -172,6 +181,40 @@ impl Parser {

    /// Parse an expression prefix
    pub fn parse_prefix(&mut self) -> Result<Expr, ParserError> {
+        // PostgreSQL allows any string literal to be preceded by a type name, indicating that the
+        // string literal represents a literal of that type. Some examples:
+        //
+        //      DATE '2020-05-20'
+        //      TIMESTAMP WITH TIME ZONE '2020-05-20 7:43:54'
+        //      BOOL 'true'
+        //
+        // The first two are standard SQL, while the last is a PostgreSQL extension. Complicating
+        // matters is the fact that INTERVAL string literals may optionally be followed by special
+        // keywords, e.g.:
+        //
+        //      INTERVAL '7' DAY
+        //
+        // Note also that naively `SELECT date` looks like a syntax error because the `date` type
+        // name is not followed by a string literal, but in fact in PostgreSQL it is a valid
+        // expression that should parse as the column name "date".
+        return_ok_if_some!(self.maybe_parse(|parser| {
+            match parser.parse_data_type()? {
+                DataType::Interval => parser.parse_literal_interval(),
+                // PostgreSQL allows almost any identifier to be used as a custom data type name,
+                // and we support that in `parse_data_type()`. But unlike Postgres we don't
+                // have a list of globally reserved keywords (since they vary across dialects),
+                // so given `NOT 'a' LIKE 'b'`, we'd accept `NOT` as a possible custom data type
+                // name, resulting in `NOT 'a'` being recognized as a `TypedString` instead of
+                // a unary negation `NOT ('a' LIKE 'b')`. To solve this, we don't accept the
+                // `type 'string'` syntax for the custom data types at all.
+                DataType::Custom(..) => parser_err!("dummy"),
+                data_type => Ok(Expr::TypedString {
+                    data_type,
+                    value: parser.parse_literal_string()?,
+                }),
+            }
+        }));
+
        let expr = match self.next_token() {
            Token::Word(w) => match w.keyword {
                Keyword::TRUE | Keyword::FALSE | Keyword::NULL => {
@ -180,7 +223,6 @@ impl Parser {
                }
                Keyword::CASE => self.parse_case_expr(),
                Keyword::CAST => self.parse_cast_expr(),
-                Keyword::DATE => Ok(Expr::Value(Value::Date(self.parse_literal_string()?))),
                Keyword::EXISTS => self.parse_exists_expr(),
                Keyword::EXTRACT => self.parse_extract_expr(),
                Keyword::INTERVAL => self.parse_literal_interval(),
@ -189,10 +231,6 @@ impl Parser {
                    op: UnaryOperator::Not,
                    expr: Box::new(self.parse_subexpr(Self::UNARY_NOT_PREC)?),
                }),
-                Keyword::TIME => Ok(Expr::Value(Value::Time(self.parse_literal_string()?))),
-                Keyword::TIMESTAMP => {
-                    Ok(Expr::Value(Value::Timestamp(self.parse_literal_string()?)))
-                }
                // Here `w` is a word, check if it's a part of a multi-part
                // identifier, a function call, or a simple identifier:
                _ => match self.peek_token() {
@ -907,6 +945,22 @@ impl Parser {
        Ok(values)
    }

+    /// Run a parser method `f`, reverting back to the current position
+    /// if unsuccessful.
+    #[must_use]
+    fn maybe_parse<T, F>(&mut self, mut f: F) -> Option<T>
+    where
+        F: FnMut(&mut Parser) -> Result<T, ParserError>,
+    {
+        let index = self.index;
+        if let Ok(t) = f(self) {
+            Some(t)
+        } else {
+            self.index = index;
+            None
+        }
+    }
+
    /// Parse either `ALL` or `DISTINCT`. Returns `true` if `DISTINCT` is parsed and results in a
    /// `ParserError` if both `ALL` and `DISTINCT` are fround.
    pub fn parse_all_or_distinct(&mut self) -> Result<bool, ParserError> {
@ -1898,7 +1952,6 @@ impl Parser {
        }

        if self.consume_token(&Token::LParen) {
-            let index = self.index;
            // A left paren introduces either a derived table (i.e., a subquery)
            // or a nested join. It's nearly impossible to determine ahead of
            // time which it is... so we just try to parse both.
@ -1915,18 +1968,17 @@ impl Parser {
            //                   | (2) starts a nested join
            //                   (1) an additional set of parens around a nested join
            //
-            match self.parse_derived_table_factor(NotLateral) {
-                // The recently consumed '(' started a derived table, and we've
-                // parsed the subquery, followed by the closing ')', and the
-                // alias of the derived table. In the example above this is
-                // case (3), and the next token would be `NATURAL`.
-                Ok(table_factor) => Ok(table_factor),
-                Err(_) => {
-                    // A parsing error from `parse_derived_table_factor` indicates that
-                    // the '(' we've recently consumed does not start a derived table
-                    // (cases 1, 2, or 4). Ignore the error and back up to where we
-                    // were before - right after the opening '('.
-                    self.index = index;
+
+            // If the recently consumed '(' starts a derived table, the call to
+            // `parse_derived_table_factor` below will return success after parsing the
+            // subquery, followed by the closing ')', and the alias of the derived table.
+            // In the example above this is case (3).
+            return_ok_if_some!(
+                self.maybe_parse(|parser| parser.parse_derived_table_factor(NotLateral))
+            );
+            // A parsing error from `parse_derived_table_factor` indicates that the '(' we've
+            // recently consumed does not start a derived table (cases 1, 2, or 4).
+            // `maybe_parse` will ignore such an error and rewind to just after the opening '('.

            // Inside the parentheses we expect to find a table factor
            // followed by some joins or another level of nesting.
@ -1935,10 +1987,7 @@ impl Parser {
            // The SQL spec prohibits derived and bare tables from appearing
            // alone in parentheses. We don't enforce this as some databases
            // (e.g. Snowflake) allow such syntax.
-
            Ok(TableFactor::NestedJoin(Box::new(table_and_joins)))
-                }
-            }
        } else {
            let name = self.parse_object_name()?;
            // Postgres, MSSQL: table-valued functions:
--- a/tests/sqlparser_common.rs
+++ b/tests/sqlparser_common.rs
@ -413,6 +413,19 @@ fn parse_null_in_select() {
    );
 }

+#[test]
+fn parse_select_with_date_column_name() {
+    let sql = "SELECT date";
+    let select = verified_only_select(sql);
+    assert_eq!(
+        &Expr::Identifier(Ident {
+            value: "date".into(),
+            quote_style: None
+        }),
+        expr_from_projection(only(&select.projection)),
+    );
+}
+
 #[test]
 fn parse_escaped_single_quote_string_predicate() {
    use self::BinaryOperator::*;
@ -1426,30 +1439,39 @@ fn parse_literal_string() {

 #[test]
 fn parse_literal_date() {
-    let sql = "SELECT DATE '1999-01-01'";
+    let sql = "SELECT date '1999-01-01'";
    let select = verified_only_select(sql);
    assert_eq!(
-        &Expr::Value(Value::Date("1999-01-01".into())),
+        &Expr::TypedString {
+            data_type: DataType::Date,
+            value: "1999-01-01".into()
+        },
        expr_from_projection(only(&select.projection)),
    );
 }

 #[test]
 fn parse_literal_time() {
-    let sql = "SELECT TIME '01:23:34'";
+    let sql = "SELECT time '01:23:34'";
    let select = verified_only_select(sql);
    assert_eq!(
-        &Expr::Value(Value::Time("01:23:34".into())),
+        &Expr::TypedString {
+            data_type: DataType::Time,
+            value: "01:23:34".into()
+        },
        expr_from_projection(only(&select.projection)),
    );
 }

 #[test]
 fn parse_literal_timestamp() {
-    let sql = "SELECT TIMESTAMP '1999-01-01 01:23:34'";
+    let sql = "SELECT timestamp '1999-01-01 01:23:34'";
    let select = verified_only_select(sql);
    assert_eq!(
-        &Expr::Value(Value::Timestamp("1999-01-01 01:23:34".into())),
+        &Expr::TypedString {
+            data_type: DataType::Timestamp,
+            value: "1999-01-01 01:23:34".into()
+        },
        expr_from_projection(only(&select.projection)),
    );
 }