mirror of
https://github.com/apache/datafusion-sqlparser-rs.git
synced 2025-09-22 05:32:29 +00:00
feat: Support escaped string literals (PostgreSQL) (#502)
* feat: Support escaped string literals (PostgreSQL) Signed-off-by: Dmitry Patsura <talk@dmtry.me> * lint * escape ', \r, \t * Update src/ast/value.rs Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * Update src/tokenizer.rs Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org> * test: two slashes * remove dead code * test: parsing error * support generic dialect too (for DF) Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
This commit is contained in:
parent
4070f3ec6e
commit
2c0886d9fe
4 changed files with 167 additions and 0 deletions
|
@ -30,6 +30,9 @@ pub enum Value {
|
||||||
Number(BigDecimal, bool),
|
Number(BigDecimal, bool),
|
||||||
/// 'string value'
|
/// 'string value'
|
||||||
SingleQuotedString(String),
|
SingleQuotedString(String),
|
||||||
|
/// e'string value' (postgres extension)
|
||||||
|
/// <https://www.postgresql.org/docs/8.3/sql-syntax-lexical.html#SQL-SYNTAX-STRINGS>
|
||||||
|
EscapedStringLiteral(String),
|
||||||
/// N'string value'
|
/// N'string value'
|
||||||
NationalStringLiteral(String),
|
NationalStringLiteral(String),
|
||||||
/// X'hex value'
|
/// X'hex value'
|
||||||
|
@ -69,6 +72,7 @@ impl fmt::Display for Value {
|
||||||
Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
|
Value::Number(v, l) => write!(f, "{}{long}", v, long = if *l { "L" } else { "" }),
|
||||||
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
|
Value::DoubleQuotedString(v) => write!(f, "\"{}\"", v),
|
||||||
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
|
Value::SingleQuotedString(v) => write!(f, "'{}'", escape_single_quote_string(v)),
|
||||||
|
Value::EscapedStringLiteral(v) => write!(f, "E'{}'", escape_escaped_string(v)),
|
||||||
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
|
Value::NationalStringLiteral(v) => write!(f, "N'{}'", v),
|
||||||
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
|
Value::HexStringLiteral(v) => write!(f, "X'{}'", v),
|
||||||
Value::Boolean(v) => write!(f, "{}", v),
|
Value::Boolean(v) => write!(f, "{}", v),
|
||||||
|
@ -193,6 +197,40 @@ pub fn escape_single_quote_string(s: &str) -> EscapeSingleQuoteString<'_> {
|
||||||
EscapeSingleQuoteString(s)
|
EscapeSingleQuoteString(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Display adapter that renders a string as the body of an escape string
/// constant, i.e. the text that goes between the quotes of `E'...'`.
///
/// Single quotes, backslashes, newlines, tabs and carriage returns are written
/// as backslash escapes; every other character is written unchanged.
pub struct EscapeEscapedStringLiteral<'a>(&'a str);

impl<'a> fmt::Display for EscapeEscapedStringLiteral<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Stop at the first write error, exactly like `?` inside a loop would.
        self.0.chars().try_for_each(|ch| match ch {
            '\'' => f.write_str(r"\'"),
            '\\' => f.write_str(r"\\"),
            '\n' => f.write_str(r"\n"),
            '\t' => f.write_str(r"\t"),
            '\r' => f.write_str(r"\r"),
            other => write!(f, "{}", other),
        })
    }
}

/// Wraps `s` in an [`EscapeEscapedStringLiteral`] so that it can be formatted
/// with backslash escapes applied. The surrounding `E'` and `'` are not added.
pub fn escape_escaped_string(s: &str) -> EscapeEscapedStringLiteral<'_> {
    EscapeEscapedStringLiteral(s)
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||||
pub enum TrimWhereField {
|
pub enum TrimWhereField {
|
||||||
|
|
|
@ -497,6 +497,11 @@ impl<'a> Parser<'a> {
|
||||||
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
|
expr: Box::new(self.parse_subexpr(Self::PLUS_MINUS_PREC)?),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
Token::EscapedStringLiteral(_) if dialect_of!(self is PostgreSqlDialect | GenericDialect) =>
|
||||||
|
{
|
||||||
|
self.prev_token();
|
||||||
|
Ok(Expr::Value(self.parse_value()?))
|
||||||
|
}
|
||||||
Token::Number(_, _)
|
Token::Number(_, _)
|
||||||
| Token::SingleQuotedString(_)
|
| Token::SingleQuotedString(_)
|
||||||
| Token::NationalStringLiteral(_)
|
| Token::NationalStringLiteral(_)
|
||||||
|
@ -902,6 +907,7 @@ impl<'a> Parser<'a> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
Token::SingleQuotedString(_)
|
Token::SingleQuotedString(_)
|
||||||
|
| Token::EscapedStringLiteral(_)
|
||||||
| Token::NationalStringLiteral(_)
|
| Token::NationalStringLiteral(_)
|
||||||
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
|
| Token::HexStringLiteral(_) => Some(Box::new(self.parse_expr()?)),
|
||||||
unexpected => {
|
unexpected => {
|
||||||
|
@ -2576,6 +2582,7 @@ impl<'a> Parser<'a> {
|
||||||
},
|
},
|
||||||
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
|
Token::SingleQuotedString(ref s) => Ok(Value::SingleQuotedString(s.to_string())),
|
||||||
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
|
Token::NationalStringLiteral(ref s) => Ok(Value::NationalStringLiteral(s.to_string())),
|
||||||
|
Token::EscapedStringLiteral(ref s) => Ok(Value::EscapedStringLiteral(s.to_string())),
|
||||||
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
|
Token::HexStringLiteral(ref s) => Ok(Value::HexStringLiteral(s.to_string())),
|
||||||
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
|
Token::Placeholder(ref s) => Ok(Value::Placeholder(s.to_string())),
|
||||||
unexpected => self.expected("a value", unexpected),
|
unexpected => self.expected("a value", unexpected),
|
||||||
|
@ -2607,6 +2614,9 @@ impl<'a> Parser<'a> {
|
||||||
match self.next_token() {
|
match self.next_token() {
|
||||||
Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value),
|
Token::Word(Word { value, keyword, .. }) if keyword == Keyword::NoKeyword => Ok(value),
|
||||||
Token::SingleQuotedString(s) => Ok(s),
|
Token::SingleQuotedString(s) => Ok(s),
|
||||||
|
Token::EscapedStringLiteral(s) if dialect_of!(self is PostgreSqlDialect | GenericDialect) => {
|
||||||
|
Ok(s)
|
||||||
|
}
|
||||||
unexpected => self.expected("literal string", unexpected),
|
unexpected => self.expected("literal string", unexpected),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,6 +51,8 @@ pub enum Token {
|
||||||
SingleQuotedString(String),
|
SingleQuotedString(String),
|
||||||
/// "National" string literal: i.e: N'string'
|
/// "National" string literal: i.e: N'string'
|
||||||
NationalStringLiteral(String),
|
NationalStringLiteral(String),
|
||||||
|
/// "Escaped" string literal, a PostgreSQL extension to the SQL standard: e.g. e'first \n second' or E'first \n second'
|
||||||
|
EscapedStringLiteral(String),
|
||||||
/// Hexadecimal string literal: i.e.: X'deadbeef'
|
/// Hexadecimal string literal: i.e.: X'deadbeef'
|
||||||
HexStringLiteral(String),
|
HexStringLiteral(String),
|
||||||
/// Comma
|
/// Comma
|
||||||
|
@ -160,6 +162,7 @@ impl fmt::Display for Token {
|
||||||
Token::Char(ref c) => write!(f, "{}", c),
|
Token::Char(ref c) => write!(f, "{}", c),
|
||||||
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
|
Token::SingleQuotedString(ref s) => write!(f, "'{}'", s),
|
||||||
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
|
Token::NationalStringLiteral(ref s) => write!(f, "N'{}'", s),
|
||||||
|
Token::EscapedStringLiteral(ref s) => write!(f, "E'{}'", s),
|
||||||
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
|
Token::HexStringLiteral(ref s) => write!(f, "X'{}'", s),
|
||||||
Token::Comma => f.write_str(","),
|
Token::Comma => f.write_str(","),
|
||||||
Token::Whitespace(ws) => write!(f, "{}", ws),
|
Token::Whitespace(ws) => write!(f, "{}", ws),
|
||||||
|
@ -392,6 +395,21 @@ impl<'a> Tokenizer<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
|
||||||
|
x @ 'e' | x @ 'E' => {
|
||||||
|
chars.next(); // consume, to check the next char
|
||||||
|
match chars.peek() {
|
||||||
|
Some('\'') => {
|
||||||
|
let s = self.tokenize_escaped_single_quoted_string(chars)?;
|
||||||
|
Ok(Some(Token::EscapedStringLiteral(s)))
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
// regular identifier starting with an "E" or "e"
|
||||||
|
let s = self.tokenize_word(x, chars);
|
||||||
|
Ok(Some(Token::make_word(&s, None)))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
// The spec only allows an uppercase 'X' to introduce a hex
|
// The spec only allows an uppercase 'X' to introduce a hex
|
||||||
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
|
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
|
||||||
x @ 'x' | x @ 'X' => {
|
x @ 'x' | x @ 'X' => {
|
||||||
|
@ -690,6 +708,66 @@ impl<'a> Tokenizer<'a> {
|
||||||
s
|
s
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Read a single quoted string, starting with the opening quote.
|
||||||
|
fn tokenize_escaped_single_quoted_string(
|
||||||
|
&self,
|
||||||
|
chars: &mut Peekable<Chars<'_>>,
|
||||||
|
) -> Result<String, TokenizerError> {
|
||||||
|
let mut s = String::new();
|
||||||
|
chars.next(); // consume the opening quote
|
||||||
|
|
||||||
|
// slash escaping
|
||||||
|
let mut is_escaped = false;
|
||||||
|
while let Some(&ch) = chars.peek() {
|
||||||
|
macro_rules! escape_control_character {
|
||||||
|
($ESCAPED:expr) => {{
|
||||||
|
if is_escaped {
|
||||||
|
s.push($ESCAPED);
|
||||||
|
is_escaped = false;
|
||||||
|
} else {
|
||||||
|
s.push(ch);
|
||||||
|
}
|
||||||
|
|
||||||
|
chars.next();
|
||||||
|
}};
|
||||||
|
}
|
||||||
|
|
||||||
|
match ch {
|
||||||
|
'\'' => {
|
||||||
|
chars.next(); // consume
|
||||||
|
if is_escaped {
|
||||||
|
s.push(ch);
|
||||||
|
is_escaped = false;
|
||||||
|
} else if chars.peek().map(|c| *c == '\'').unwrap_or(false) {
|
||||||
|
s.push(ch);
|
||||||
|
chars.next();
|
||||||
|
} else {
|
||||||
|
return Ok(s);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
'\\' => {
|
||||||
|
if is_escaped {
|
||||||
|
s.push('\\');
|
||||||
|
is_escaped = false;
|
||||||
|
} else {
|
||||||
|
is_escaped = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
'r' => escape_control_character!('\r'),
|
||||||
|
'n' => escape_control_character!('\n'),
|
||||||
|
't' => escape_control_character!('\t'),
|
||||||
|
_ => {
|
||||||
|
is_escaped = false;
|
||||||
|
chars.next(); // consume
|
||||||
|
s.push(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.tokenizer_error("Unterminated encoded string literal")
|
||||||
|
}
|
||||||
|
|
||||||
/// Read a single quoted string, starting with the opening quote.
|
/// Read a single quoted string, starting with the opening quote.
|
||||||
fn tokenize_single_quoted_string(
|
fn tokenize_single_quoted_string(
|
||||||
&self,
|
&self,
|
||||||
|
|
|
@ -1467,3 +1467,44 @@ fn pg_and_generic() -> TestedDialects {
|
||||||
dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
|
dialects: vec![Box::new(PostgreSqlDialect {}), Box::new(GenericDialect {})],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn parse_escaped_literal_string() {
    // One projection per escape form; `verified_only_select` parses the
    // statement with the PostgreSQL and generic dialects.
    let query =
        r#"SELECT E's1 \n s1', E's2 \\n s2', E's3 \\\n s3', E's4 \\\\n s4', E'\'', E'foo \\'"#;
    let parsed = pg_and_generic().verified_only_select(query);

    // Decoded value expected for each projection, in order.
    let decoded = [
        "s1 \n s1",
        "s2 \\n s2",
        "s3 \\\n s3",
        "s4 \\\\n s4",
        "'",
        "foo \\",
    ];
    assert_eq!(6, parsed.projection.len());
    for (item, text) in parsed.projection.iter().zip(decoded.iter()) {
        assert_eq!(
            &Expr::Value(Value::EscapedStringLiteral(text.to_string())),
            expr_from_projection(item)
        );
    }

    // `\'` escapes the quote, so this literal is never closed.
    let unterminated = r#"SELECT E'\'"#;
    let message = pg_and_generic()
        .parse_sql_statements(unterminated)
        .unwrap_err()
        .to_string();
    assert_eq!(
        message,
        "sql parser error: Unterminated encoded string literal at Line: 1, Column 8"
    );
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue