// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).

#[cfg(not(feature = "std"))]
use alloc::{
    borrow::ToOwned,
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};

use crate::dialect::Dialect;
use crate::dialect::{
    BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
    SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};

/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
    /// An end-of-file marker, not a real token
    EOF,
    /// A keyword (like SELECT) or an optionally quoted SQL identifier
    Word(Word),
    /// An unsigned numeric literal
    Number(String, bool),
    /// A character that could not be tokenized
    Char(char),
    /// Single quoted string: i.e: 'string'
    SingleQuotedString(String),
    /// Double quoted string: i.e: "string"
    DoubleQuotedString(String),
    /// Triple single quoted strings: Example '''abc'''
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedString(String),
    /// Triple double quoted strings: Example """abc"""
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedString(String),
    /// Dollar quoted string: i.e: $$string$$ or $tag_name$string$tag_name$
    DollarQuotedString(DollarQuotedString),
    /// Byte string literal: i.e: b'string' or B'string' (note that some backends, such as
    /// PostgreSQL, may treat this syntax as a bit string literal instead, i.e: b'10010101')
    SingleQuotedByteStringLiteral(String),
    /// Byte string literal: i.e: b"string" or B"string"
    DoubleQuotedByteStringLiteral(String),
    /// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedByteStringLiteral(String),
    /// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedByteStringLiteral(String),
    /// Single quoted literal with raw string prefix. Example `R'abc'`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    SingleQuotedRawStringLiteral(String),
    /// Double quoted literal with raw string prefix. Example `R"abc"`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    DoubleQuotedRawStringLiteral(String),
    /// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleSingleQuotedRawStringLiteral(String),
    /// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
    /// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
    TripleDoubleQuotedRawStringLiteral(String),
    /// "National" string literal: i.e: N'string'
    NationalStringLiteral(String),
    /// "escaped" string literal, which is an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
    EscapedStringLiteral(String),
    /// Unicode string literal: i.e: U&'first \000A second'
    UnicodeStringLiteral(String),
    /// Hexadecimal string literal: i.e.: X'deadbeef'
    HexStringLiteral(String),
    /// Comma
    Comma,
    /// Whitespace (space, tab, etc)
    Whitespace(Whitespace),
    /// Double equals sign `==`
    DoubleEq,
    /// Equality operator `=`
    Eq,
    /// Not Equals operator `<>` (or `!=` in some dialects)
    Neq,
    /// Less Than operator `<`
    Lt,
    /// Greater Than operator `>`
    Gt,
    /// Less Than Or Equals operator `<=`
    LtEq,
    /// Greater Than Or Equals operator `>=`
    GtEq,
    /// Spaceship operator <=>
    Spaceship,
    /// Plus operator `+`
    Plus,
    /// Minus operator `-`
    Minus,
    /// Multiplication operator `*`
    Mul,
    /// Division operator `/`
    Div,
    /// Integer division operator `//` in DuckDB
    DuckIntDiv,
    /// Modulo Operator `%`
    Mod,
    /// String concatenation `||`
    StringConcat,
    /// Left parenthesis `(`
    LParen,
    /// Right parenthesis `)`
    RParen,
    /// Period (used for compound identifiers or projections into nested types)
    Period,
    /// Colon `:`
    Colon,
    /// DoubleColon `::` (used for casting in PostgreSQL)
    DoubleColon,
    /// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
    Assignment,
    /// SemiColon `;` used as separator for COPY and payload
    SemiColon,
    /// Backslash `\` used in terminating the COPY payload with `\.`
    Backslash,
    /// Left bracket `[`
    LBracket,
    /// Right bracket `]`
    RBracket,
    /// Ampersand `&`
    Ampersand,
    /// Pipe `|`
    Pipe,
    /// Caret `^`
    Caret,
    /// Left brace `{`
    LBrace,
    /// Right brace `}`
    RBrace,
    /// Right Arrow `=>`
    RArrow,
    /// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
    Sharp,
    /// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
    DoubleSharp,
    /// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
    Tilde,
    /// `~*` , a case insensitive match regular expression operator in PostgreSQL
    TildeAsterisk,
    /// `!~` , a case sensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTilde,
    /// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
    ExclamationMarkTildeAsterisk,
    /// `~~`, a case sensitive match pattern operator in PostgreSQL
    DoubleTilde,
    /// `~~*`, a case insensitive match pattern operator in PostgreSQL
    DoubleTildeAsterisk,
    /// `!~~`, a case sensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTilde,
    /// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
    ExclamationMarkDoubleTildeAsterisk,
    /// `<<`, a bitwise shift left operator in PostgreSQL
    ShiftLeft,
    /// `>>`, a bitwise shift right operator in PostgreSQL
    ShiftRight,
    /// `&&`, an overlap operator in PostgreSQL
    Overlap,
    /// Exclamation Mark `!` used for PostgreSQL factorial operator
    ExclamationMark,
    /// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
    DoubleExclamationMark,
    /// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
    AtSign,
    /// `^@`, a "starts with" string operator in PostgreSQL
    CaretAt,
    /// `|/`, a square root math operator in PostgreSQL
    PGSquareRoot,
    /// `||/`, a cube root math operator in PostgreSQL
    PGCubeRoot,
    /// `?` or `$` , a prepared statement arg placeholder
    Placeholder(String),
    /// `->`, used as an operator to extract json field in PostgreSQL
    Arrow,
    /// `->>`, used as an operator to extract json field as text in PostgreSQL
    LongArrow,
    /// `#>`, extracts JSON sub-object at the specified path
    HashArrow,
    /// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
    AtDashAt,
    /// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
    QuestionMarkDash,
    /// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
    AmpersandLeftAngleBracket,
    /// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
    AmpersandRightAngleBracket,
    /// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
    AmpersandLeftAngleBracketVerticalBar,
    /// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
    VerticalBarAmpersandRightAngleBracket,
    /// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
    TwoWayArrow,
    /// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
    LeftAngleBracketCaret,
    /// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
    RightAngleBracketCaret,
    /// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
    QuestionMarkSharp,
    /// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
    QuestionMarkDashVerticalBar,
    /// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
    QuestionMarkDoubleVerticalBar,
    /// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
    TildeEqual,
    /// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
    ShiftLeftVerticalBar,
    /// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
    VerticalBarShiftRight,
    /// `|>` BigQuery pipe operator
    VerticalBarRightAngleBracket,
    /// `#>>`, extracts JSON sub-object at the specified path as text
    HashLongArrow,
    /// jsonb @> jsonb -> boolean: Test whether left json contains the right json
    AtArrow,
    /// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
    ArrowAt,
    /// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
    /// path, where path elements can be either field keys or array indexes.
    HashMinus,
    /// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
    /// JSON value?
    AtQuestion,
    /// jsonb @@ jsonpath → boolean: Returns the result of a JSON path predicate check
    /// for the specified JSON value. Only the first item of the result is taken into
    /// account. If the result is not Boolean, then NULL is returned.
    AtAt,
    /// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
    /// jsonb object
    Question,
    /// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
    /// keys within the jsonb object
    QuestionAnd,
    /// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
    /// keys within the jsonb object
    QuestionPipe,
    /// Custom binary operator
    /// This is used to represent any custom binary operator that is not part of the SQL standard.
    /// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
    CustomBinaryOperator(String),
}

impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Token::EOF => f.write_str("EOF"),
            Token::Word(ref w) => write!(f, "{w}"),
            Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
            Token::Char(ref c) => write!(f, "{c}"),
            Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
            Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
            Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
            Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
            Token::DollarQuotedString(ref s) => write!(f, "{s}"),
            Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
            Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
            Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
            Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
            Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
            Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
            Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
            Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
            Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
            Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
            Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
            Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
            Token::Comma => f.write_str(","),
            Token::Whitespace(ws) => write!(f, "{ws}"),
            Token::DoubleEq => f.write_str("=="),
            Token::Spaceship => f.write_str("<=>"),
            Token::Eq => f.write_str("="),
            Token::Neq => f.write_str("<>"),
            Token::Lt => f.write_str("<"),
            Token::Gt => f.write_str(">"),
            Token::LtEq => f.write_str("<="),
            Token::GtEq => f.write_str(">="),
            Token::Plus => f.write_str("+"),
            Token::Minus => f.write_str("-"),
            Token::Mul => f.write_str("*"),
            Token::Div => f.write_str("/"),
            Token::DuckIntDiv => f.write_str("//"),
            Token::StringConcat => f.write_str("||"),
            Token::Mod => f.write_str("%"),
            Token::LParen => f.write_str("("),
            Token::RParen => f.write_str(")"),
            Token::Period => f.write_str("."),
            Token::Colon => f.write_str(":"),
            Token::DoubleColon => f.write_str("::"),
            Token::Assignment => f.write_str(":="),
            Token::SemiColon => f.write_str(";"),
            Token::Backslash => f.write_str("\\"),
            Token::LBracket => f.write_str("["),
            Token::RBracket => f.write_str("]"),
            Token::Ampersand => f.write_str("&"),
            Token::Caret => f.write_str("^"),
            Token::Pipe => f.write_str("|"),
            Token::LBrace => f.write_str("{"),
            Token::RBrace => f.write_str("}"),
            Token::RArrow => f.write_str("=>"),
            Token::Sharp => f.write_str("#"),
            Token::DoubleSharp => f.write_str("##"),
            Token::ExclamationMark => f.write_str("!"),
            Token::DoubleExclamationMark => f.write_str("!!"),
            Token::Tilde => f.write_str("~"),
            Token::TildeAsterisk => f.write_str("~*"),
            Token::ExclamationMarkTilde => f.write_str("!~"),
            Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
            Token::DoubleTilde => f.write_str("~~"),
            Token::DoubleTildeAsterisk => f.write_str("~~*"),
            Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
            Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
            Token::AtSign => f.write_str("@"),
            Token::CaretAt => f.write_str("^@"),
            Token::ShiftLeft => f.write_str("<<"),
            Token::ShiftRight => f.write_str(">>"),
            Token::Overlap => f.write_str("&&"),
            Token::PGSquareRoot => f.write_str("|/"),
            Token::PGCubeRoot => f.write_str("||/"),
            Token::AtDashAt => f.write_str("@-@"),
            Token::QuestionMarkDash => f.write_str("?-"),
            Token::AmpersandLeftAngleBracket => f.write_str("&<"),
            Token::AmpersandRightAngleBracket => f.write_str("&>"),
            Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
            Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
            Token::VerticalBarRightAngleBracket => f.write_str("|>"),
            Token::TwoWayArrow => f.write_str("<->"),
            Token::LeftAngleBracketCaret => f.write_str("<^"),
            Token::RightAngleBracketCaret => f.write_str(">^"),
            Token::QuestionMarkSharp => f.write_str("?#"),
            Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
            Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
            Token::TildeEqual => f.write_str("~="),
            Token::ShiftLeftVerticalBar => f.write_str("<<|"),
            Token::VerticalBarShiftRight => f.write_str("|>>"),
            Token::Placeholder(ref s) => write!(f, "{s}"),
            Token::Arrow => write!(f, "->"),
            Token::LongArrow => write!(f, "->>"),
            Token::HashArrow => write!(f, "#>"),
            Token::HashLongArrow => write!(f, "#>>"),
            Token::AtArrow => write!(f, "@>"),
            Token::ArrowAt => write!(f, "<@"),
            Token::HashMinus => write!(f, "#-"),
            Token::AtQuestion => write!(f, "@?"),
            Token::AtAt => write!(f, "@@"),
            Token::Question => write!(f, "?"),
            Token::QuestionAnd => write!(f, "?&"),
            Token::QuestionPipe => write!(f, "?|"),
            Token::CustomBinaryOperator(s) => f.write_str(s),
        }
    }
}

impl Token {
    pub fn make_keyword(keyword: &str) -> Self {
        Token::make_word(keyword, None)
    }

    pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
        let word_uppercase = word.to_uppercase();
        Token::Word(Word {
            value: word.to_string(),
            quote_style,
            keyword: if quote_style.is_none() {
                let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
                keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
            } else {
                Keyword::NoKeyword
            },
        })
    }
}
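
// A minimal usage sketch: `make_word` classifies an unquoted word against the
// keyword table, while a quoted word is never treated as a keyword.
//
//     let kw = Token::make_keyword("SELECT");          // Word with keyword == Keyword::SELECT
//     let id = Token::make_word("SELECT", Some('"'));  // Word with keyword == Keyword::NoKeyword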

/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
    /// The value of the token, without the enclosing quotes, and with the
    /// escape sequences (if any) processed (TODO: escapes are not handled)
    pub value: String,
    /// An identifier can be "quoted" (<delimited identifier> in ANSI parlance).
    /// The standard and most implementations allow using double quotes for this,
    /// but some implementations support other quoting styles as well (e.g. \[MS SQL])
    pub quote_style: Option<char>,
    /// If the word was not quoted and it matched one of the known keywords,
    /// this will have one of the values from dialect::keywords, otherwise `Keyword::NoKeyword`
    pub keyword: Keyword,
}

impl fmt::Display for Word {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.quote_style {
            Some(s) if s == '"' || s == '[' || s == '`' => {
                write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
            }
            None => f.write_str(&self.value),
            _ => panic!("Unexpected quote_style!"),
        }
    }
}

impl Word {
    fn matching_end_quote(ch: char) -> char {
        match ch {
            '"' => '"', // ANSI and most dialects
            '[' => ']', // MS SQL
            '`' => '`', // MySQL
            _ => panic!("unexpected quoting style!"),
        }
    }
}
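
// Orientation sketch: the quote style stored on a `Word` is re-applied by its
// `Display` impl via `matching_end_quote`, so for example a value quoted with
// '[' (MS SQL style) renders with the matching ']' terminator:
//
//     Word { value: "my col".into(), quote_style: Some('['), keyword: Keyword::NoKeyword }
//         .to_string() == "[my col]"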

#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
    Space,
    Newline,
    Tab,
    SingleLineComment { comment: String, prefix: String },
    MultiLineComment(String),
}

impl fmt::Display for Whitespace {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Whitespace::Space => f.write_str(" "),
            Whitespace::Newline => f.write_str("\n"),
            Whitespace::Tab => f.write_str("\t"),
            Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
            Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
        }
    }
}
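
// Round-trip sketch: `Display` re-emits whitespace tokens verbatim, e.g. a
// single-line comment keeps its prefix and body, and a multi-line comment is
// re-wrapped in `/*` ... `*/`:
//
//     Whitespace::SingleLineComment { prefix: "--".into(), comment: " note\n".into() }
//         .to_string() == "-- note\n"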

/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
    /// Line number, starting from 1.
    ///
    /// Note: Line 0 is used for empty spans
    pub line: u64,
    /// Line column, starting from 1.
    ///
    /// Note: Column 0 is used for empty spans
    pub column: u64,
}

impl fmt::Display for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.line == 0 {
            return Ok(());
        }
        write!(f, " at Line: {}, Column: {}", self.line, self.column)
    }
}

impl fmt::Debug for Location {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Location({},{})", self.line, self.column)
    }
}

impl Location {
    /// Return an "empty" / unknown location
    pub fn empty() -> Self {
        Self { line: 0, column: 0 }
    }

    /// Create a new `Location` for a given line and column
    pub fn new(line: u64, column: u64) -> Self {
        Self { line, column }
    }

    /// Create a new location for a given line and column
    ///
    /// Alias for [`Self::new`]
    // TODO: remove / deprecate in favor of `new` for consistency?
    pub fn of(line: u64, column: u64) -> Self {
        Self::new(line, column)
    }

    /// Combine self and `end` into a new `Span`
    pub fn span_to(self, end: Self) -> Span {
        Span { start: self, end }
    }
}

impl From<(u64, u64)> for Location {
    fn from((line, column): (u64, u64)) -> Self {
        Self { line, column }
    }
}

/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
    pub start: Location,
    pub end: Location,
}

impl fmt::Debug for Span {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Span({:?}..{:?})", self.start, self.end)
    }
}

impl Span {
    // An empty span (0, 0) -> (0, 0)
    // We need a const instance for pattern matching
    const EMPTY: Span = Self::empty();

    /// Create a new span from a start and end [`Location`]
    pub fn new(start: Location, end: Location) -> Span {
        Span { start, end }
    }

    /// Returns an empty span `(0, 0) -> (0, 0)`
    ///
    /// Empty spans represent no knowledge of source location
    /// See [Spanned](crate::ast::Spanned) for more information.
    pub const fn empty() -> Span {
        Span {
            start: Location { line: 0, column: 0 },
            end: Location { line: 0, column: 0 },
        }
    }

    /// Returns the smallest Span that contains both `self` and `other`
    /// If either span is [Span::empty], the other span is returned
    ///
    /// # Examples
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// // line 1, column 1 -> line 2, column 5
    /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
    /// // line 2, column 3 -> line 3, column 7
    /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
    /// // Union of the two is the min/max of the two spans
    /// // line 1, column 1 -> line 3, column 7
    /// let union = span1.union(&span2);
    /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
    /// ```
    pub fn union(&self, other: &Span) -> Span {
        // If either span is empty, return the other
        // this prevents propagating (0, 0) through the tree
        match (self, other) {
            (&Span::EMPTY, _) => *other,
            (_, &Span::EMPTY) => *self,
            _ => Span {
                start: cmp::min(self.start, other.start),
                end: cmp::max(self.end, other.end),
            },
        }
    }

    /// Same as [Span::union] for `Option<Span>`
    ///
    /// If `other` is `None`, `self` is returned
    pub fn union_opt(&self, other: &Option<Span>) -> Span {
        match other {
            Some(other) => self.union(other),
            None => *self,
        }
    }

    /// Return the [Span::union] of all spans in the iterator
    ///
    /// If the iterator is empty, an empty span is returned
    ///
    /// # Example
    /// ```
    /// # use sqlparser::tokenizer::{Span, Location};
    /// let spans = vec![
    ///     Span::new(Location::new(1, 1), Location::new(2, 5)),
    ///     Span::new(Location::new(2, 3), Location::new(3, 7)),
    ///     Span::new(Location::new(3, 1), Location::new(4, 2)),
    /// ];
    /// // line 1, column 1 -> line 4, column 2
    /// assert_eq!(
    ///     Span::union_iter(spans),
    ///     Span::new(Location::new(1, 1), Location::new(4, 2))
    /// );
    /// ```
    pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
        iter.into_iter()
            .reduce(|acc, item| acc.union(&item))
            .unwrap_or(Span::empty())
    }
}

/// Backwards compatibility struct for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;

/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // commas @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
///     Token::Comma,
///     Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // commas @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
///     Token::Comma,
///     Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
    pub token: Token,
    pub span: Span,
}

impl TokenWithSpan {
    /// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
    pub fn new(token: Token, span: Span) -> Self {
        Self { token, span }
    }

    /// Wrap a token with an empty span
    pub fn wrap(token: Token) -> Self {
        Self::new(token, Span::empty())
    }

    /// Wrap a token with a location from `start` to `end`
    pub fn at(token: Token, start: Location, end: Location) -> Self {
        Self::new(token, Span::new(start, end))
    }

    /// Return an EOF token with no location
    pub fn new_eof() -> Self {
        Self::wrap(Token::EOF)
    }
}

impl PartialEq<Token> for TokenWithSpan {
    fn eq(&self, other: &Token) -> bool {
        &self.token == other
    }
}

impl PartialEq<TokenWithSpan> for Token {
    fn eq(&self, other: &TokenWithSpan) -> bool {
        self == &other.token
    }
}

impl fmt::Display for TokenWithSpan {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        self.token.fmt(f)
    }
}

/// Tokenizer error
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
    pub message: String,
    pub location: Location,
}

impl fmt::Display for TokenizerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}{}", self.message, self.location,)
    }
}

#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    pub line: u64,
    pub col: u64,
}

impl State<'_> {
    /// return the next character and advance the stream
    pub fn next(&mut self) -> Option<char> {
        match self.peekable.next() {
            None => None,
            Some(s) => {
                if s == '\n' {
                    self.line += 1;
                    self.col = 1;
                } else {
                    self.col += 1;
                }
                Some(s)
            }
        }
    }

    /// return the next character but do not advance the stream
    pub fn peek(&mut self) -> Option<&char> {
        self.peekable.peek()
    }

    pub fn location(&self) -> Location {
        Location {
            line: self.line,
            column: self.col,
        }
    }
}
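
// Behaviour sketch: `next` keeps the 1-based line/column counters in step with
// the character stream, e.g. consuming "a\nb" reports location (1, 2) after
// 'a', (2, 1) after the newline, and (2, 2) after 'b'. The tokenizer below uses
// these locations to build the `Span` attached to each token.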

/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
    /// e.g. `"abc"`, `'abc'`, `r'abc'`
    One,
    /// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
    Many(NonZeroU8),
}

/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
    /// The character used to quote the string.
    quote_style: char,
    /// Represents how many quote characters enclose the string literal.
    num_quote_chars: NumStringQuoteChars,
    /// The number of opening quotes left to consume, before parsing
    /// the remaining string literal.
    /// For example, given the initial string `"""abc"""`: if the caller has
    /// already parsed the first quote for some reason, then this value
    /// is set to 2, flagging that only the 2 remaining leading quotes should be consumed.
    num_opening_quotes_to_consume: u8,
    /// True if the string uses backslash escaping of special characters
    /// e.g. `'abc\ndef\'ghi'`
    backslash_escape: bool,
}

/// SQL Tokenizer
pub struct Tokenizer<'a> {
    dialect: &'a dyn Dialect,
    query: &'a str,
    /// If true (the default), the tokenizer will un-escape literal
    /// SQL strings. See [`Tokenizer::with_unescape`] for more details.
    unescape: bool,
}

impl<'a> Tokenizer<'a> {
    /// Create a new SQL tokenizer for the specified SQL statement
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#"SELECT 'foo'"#;
    ///
    /// // Parsing the query
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    ///
    /// assert_eq!(tokens, vec![
    ///   Token::make_word("SELECT", None),
    ///   Token::Whitespace(Whitespace::Space),
    ///   Token::SingleQuotedString("foo".to_string()),
    /// ]);
    /// ```
    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
        Self {
            dialect,
            query,
            unescape: true,
        }
    }

    /// Set unescape mode
    ///
    /// When true (default) the tokenizer unescapes literal values
    /// (for example, `""` in SQL is unescaped to the literal `"`).
    ///
    /// When false, the tokenizer provides the raw strings as provided
    /// in the query. This can be helpful for programs that wish to
    /// recover the *exact* original query text without normalizing
    /// the escaping
    ///
    /// # Example
    ///
    /// ```
    /// # use sqlparser::tokenizer::{Token, Tokenizer};
    /// # use sqlparser::dialect::GenericDialect;
    /// # let dialect = GenericDialect{};
    /// let query = r#""Foo "" Bar""#;
    /// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
    /// let original = Token::make_word(r#"Foo "" Bar"#, Some('"'));
    ///
    /// // Parsing with unescaping (default)
    /// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
    /// assert_eq!(tokens, vec![unescaped]);
    ///
    /// // Parsing with unescape = false
    /// let tokens = Tokenizer::new(&dialect, &query)
    ///     .with_unescape(false)
    ///     .tokenize().unwrap();
    /// assert_eq!(tokens, vec![original]);
    /// ```
    pub fn with_unescape(mut self, unescape: bool) -> Self {
        self.unescape = unescape;
        self
    }

    /// Tokenize the statement and produce a vector of tokens
    pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
        let twl = self.tokenize_with_location()?;
        Ok(twl.into_iter().map(|t| t.token).collect())
    }

    /// Tokenize the statement and produce a vector of tokens with location information
    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
        let mut tokens: Vec<TokenWithSpan> = vec![];
        self.tokenize_with_location_into_buf(&mut tokens)
            .map(|_| tokens)
    }

    /// Tokenize the statement and append tokens with location information into the provided buffer.
    /// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
    pub fn tokenize_with_location_into_buf(
        &mut self,
        buf: &mut Vec<TokenWithSpan>,
    ) -> Result<(), TokenizerError> {
        let mut state = State {
            peekable: self.query.chars().peekable(),
            line: 1,
            col: 1,
        };

        let mut location = state.location();
        while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
            let span = location.span_to(state.location());

            buf.push(TokenWithSpan { token, span });

            location = state.location();
        }
        Ok(())
    }
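
    // Usage sketch (mirroring the doc examples above, which assume `GenericDialect`):
    //
    //     let dialect = GenericDialect {};
    //     let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize_with_location()?;
    //     // tokens[0] is the word `SELECT` with span Location(1,1)..Location(1,7)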

    // Tokenize the identifier or keywords in `ch`
    fn tokenize_identifier_or_keyword(
        &self,
        ch: impl IntoIterator<Item = char>,
        chars: &mut State,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        let ch: String = ch.into_iter().collect();
        let word = self.tokenize_word(ch, chars);

        // TODO: implement parsing of exponent here
        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
            let mut inner_state = State {
                peekable: word.chars().peekable(),
                line: 0,
                col: 0,
            };
            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
            s += s2.as_str();
            return Ok(Some(Token::Number(s, false)));
        }

        Ok(Some(Token::make_word(&word, None)))
    }

    /// Get the next token or return None
    fn next_token(
        &self,
        chars: &mut State,
        prev_token: Option<&Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        match chars.peek() {
            Some(&ch) => match ch {
                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
                '\r' => {
                    // Emit a single Whitespace::Newline token for \r and \r\n
                    chars.next();
                    if let Some('\n') = chars.peek() {
                        chars.next();
                    }
                    Ok(Some(Token::Whitespace(Whitespace::Newline)))
                }
                // BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
                b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
                {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '\'',
                                        false,
                                        Token::SingleQuotedByteStringLiteral,
                                        Token::TripleSingleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
                            Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
                        }
                        Some('\"') => {
                            if self.dialect.supports_triple_quoted_string() {
                                return self
                                    .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                        chars,
                                        '"',
                                        false,
                                        Token::DoubleQuotedByteStringLiteral,
                                        Token::TripleDoubleQuotedByteStringLiteral,
                                    );
                            }
                            let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
                            Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with a "b" or "B"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // BigQuery uses r or R for raw string literal
                b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('\'') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                false,
                                Token::SingleQuotedRawStringLiteral,
                                Token::TripleSingleQuotedRawStringLiteral,
                            ),
                        Some('\"') => self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                false,
                                Token::DoubleQuotedRawStringLiteral,
                                Token::TripleDoubleQuotedRawStringLiteral,
                            ),
                        _ => {
                            // regular identifier starting with an "r" or "R"
                            let s = self.tokenize_word(b, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Redshift uses lower case n for national string literal
                n @ 'N' | n @ 'n' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // N'...' - a <national character string literal>
                            let backslash_escape =
                                self.dialect.supports_string_literal_backslash_escape();
                            let s =
                                self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
                            Ok(Some(Token::NationalStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "N"
                            let s = self.tokenize_word(n, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
                x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
                    let starting_loc = chars.location();
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            let s =
                                self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
                            Ok(Some(Token::EscapedStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "E" or "e"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
                x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
                    chars.next(); // consume, to check the next char
                    if chars.peek() == Some(&'&') {
                        // we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
                        let mut chars_clone = chars.peekable.clone();
                        chars_clone.next(); // consume the '&' in the clone
                        if chars_clone.peek() == Some(&'\'') {
                            chars.next(); // consume the '&' in the original iterator
                            let s = unescape_unicode_single_quoted_string(chars)?;
                            return Ok(Some(Token::UnicodeStringLiteral(s)));
                        }
                    }
                    // regular identifier starting with a "U" or "u"
                    let s = self.tokenize_word(x, chars);
                    Ok(Some(Token::make_word(&s, None)))
                }
                // The spec only allows an uppercase 'X' to introduce a hex
                // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                x @ 'x' | x @ 'X' => {
                    chars.next(); // consume, to check the next char
                    match chars.peek() {
                        Some('\'') => {
                            // X'...' - a <binary string literal>
                            let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
                            Ok(Some(Token::HexStringLiteral(s)))
                        }
                        _ => {
                            // regular identifier starting with an "X"
                            let s = self.tokenize_word(x, chars);
                            Ok(Some(Token::make_word(&s, None)))
                        }
                    }
                }
                // single quoted string
                '\'' => {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '\'',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::SingleQuotedString,
                                Token::TripleSingleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '\'',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::SingleQuotedString(s)))
                }
                // double quoted string
                '\"' if !self.dialect.is_delimited_identifier_start(ch)
                    && !self.dialect.is_identifier_start(ch) =>
                {
                    if self.dialect.supports_triple_quoted_string() {
                        return self
                            .tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
                                chars,
                                '"',
                                self.dialect.supports_string_literal_backslash_escape(),
                                Token::DoubleQuotedString,
                                Token::TripleDoubleQuotedString,
                            );
                    }
                    let s = self.tokenize_single_quoted_string(
                        chars,
                        '"',
                        self.dialect.supports_string_literal_backslash_escape(),
                    )?;

                    Ok(Some(Token::DoubleQuotedString(s)))
                }
                // delimited (quoted) identifier
                quote_start if self.dialect.is_delimited_identifier_start(ch) => {
                    let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                    Ok(Some(Token::make_word(&word, Some(quote_start))))
                }
                // Potentially nested delimited (quoted) identifier
                quote_start
                    if self
                        .dialect
                        .is_nested_delimited_identifier_start(quote_start)
                        && self
                            .dialect
                            .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                            .is_some() =>
                {
                    let Some((quote_start, nested_quote_start)) = self
                        .dialect
                        .peek_nested_delimited_identifier_quotes(chars.peekable.clone())
                    else {
                        return self.tokenizer_error(
                            chars.location(),
                            format!("Expected nested delimiter '{quote_start}' before EOF."),
                        );
                    };

                    let Some(nested_quote_start) = nested_quote_start else {
                        let word = self.tokenize_quoted_identifier(quote_start, chars)?;
                        return Ok(Some(Token::make_word(&word, Some(quote_start))));
                    };

                    let mut word = vec![];
                    let quote_end = Word::matching_end_quote(quote_start);
                    let nested_quote_end = Word::matching_end_quote(nested_quote_start);
                    let error_loc = chars.location();

                    chars.next(); // skip the first delimiter
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&nested_quote_start) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
                        );
                    }
                    word.push(nested_quote_start.into());
                    word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
                    word.push(nested_quote_end.into());
                    peeking_take_while(chars, |ch| ch.is_whitespace());
                    if chars.peek() != Some(&quote_end) {
                        return self.tokenizer_error(
                            error_loc,
                            format!("Expected close delimiter '{quote_end}' before EOF."),
                        );
                    }
                    chars.next(); // skip close delimiter

                    Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
                }
                // numbers and period
                '0'..='9' | '.' => {
                    // special case where if ._ is encountered after a word then that word
                    // is a table and the _ is the start of the col name.
                    // if the prev token is not a word, then this is not a valid sql
                    // word or number.
                    if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
                        if let Some(Token::Word(_)) = prev_token {
                            chars.next();
                            return Ok(Some(Token::Period));
                        }

                        return self.tokenizer_error(
                            chars.location(),
                            "Unexpected character '_'".to_string(),
                        );
                    }

                    // Some dialects support underscore as number separator
                    // There can only be one at a time and it must be followed by another digit
                    let is_number_separator = |ch: char, next_char: Option<char>| {
                        self.dialect.supports_numeric_literal_underscores()
                            && ch == '_'
                            && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
                    };

                    let mut s = peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // match binary literal that starts with 0x
                    if s == "0" && chars.peek() == Some(&'x') {
                        chars.next();
                        let s2 = peeking_next_take_while(chars, |ch, next_ch| {
                            ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
                        });
                        return Ok(Some(Token::HexStringLiteral(s2)));
                    }

                    // match one period
                    if let Some('.') = chars.peek() {
                        s.push('.');
                        chars.next();
                    }

                    // If the dialect supports identifiers that start with a numeric prefix
                    // and we have now consumed a dot, check if the previous token was a Word.
                    // If so, what follows is definitely not part of a decimal number and
                    // we should yield the dot as a dedicated token so compound identifiers
                    // starting with digits can be parsed correctly.
                    if s == "." && self.dialect.supports_numeric_prefix() {
                        if let Some(Token::Word(_)) = prev_token {
                            return Ok(Some(Token::Period));
                        }
                    }

                    // Consume fractional digits.
                    s += &peeking_next_take_while(chars, |ch, next_ch| {
                        ch.is_ascii_digit() || is_number_separator(ch, next_ch)
                    });

                    // No fraction -> Token::Period
                    if s == "." {
                        return Ok(Some(Token::Period));
                    }

                    // Parse exponent as number
                    let mut exponent_part = String::new();
                    if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
                        let mut char_clone = chars.peekable.clone();
                        exponent_part.push(char_clone.next().unwrap());

                        // Optional sign
                        match char_clone.peek() {
                            Some(&c) if matches!(c, '+' | '-') => {
                                exponent_part.push(c);
                                char_clone.next();
                            }
                            _ => (),
                        }

                        match char_clone.peek() {
                            // Definitely an exponent, get original iterator up to speed and use it
                            Some(&c) if c.is_ascii_digit() => {
                                for _ in 0..exponent_part.len() {
                                    chars.next();
                                }
                                exponent_part +=
                                    &peeking_take_while(chars, |ch| ch.is_ascii_digit());
                                s += exponent_part.as_str();
                            }
                            // Not an exponent, discard the work done
                            _ => (),
                        }
                    }

                    // If the dialect supports identifiers that start with a numeric prefix,
                    // we need to check if the value is in fact an identifier and must thus
                    // be tokenized as a word.
                    if self.dialect.supports_numeric_prefix() {
                        if exponent_part.is_empty() {
                            // If it is not a number with an exponent, it may be
                            // an identifier starting with digits.
                            let word =
                                peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));

                            if !word.is_empty() {
                                s += word.as_str();
                                return Ok(Some(Token::make_word(s.as_str(), None)));
                            }
                        } else if prev_token == Some(&Token::Period) {
                            // If the previous token was a period, thus not belonging to a number,
                            // the value we have is part of an identifier.
                            return Ok(Some(Token::make_word(s.as_str(), None)));
                        }
                    }

                    let long = if chars.peek() == Some(&'L') {
                        chars.next();
                        true
                    } else {
                        false
                    };
                    Ok(Some(Token::Number(s, long)))
                }
                // punctuation
                '(' => self.consume_and_return(chars, Token::LParen),
                ')' => self.consume_and_return(chars, Token::RParen),
                ',' => self.consume_and_return(chars, Token::Comma),
                // operators
                '-' => {
                    chars.next(); // consume the '-'

                    match chars.peek() {
                        Some('-') => {
                            let mut is_comment = true;
                            if self.dialect.requires_single_line_comment_whitespace() {
                                is_comment = Some(' ') == chars.peekable.clone().nth(1);
                            }

                            if is_comment {
                                chars.next(); // consume second '-'
                                let comment = self.tokenize_single_line_comment(chars);
                                return Ok(Some(Token::Whitespace(
                                    Whitespace::SingleLineComment {
                                        prefix: "--".to_owned(),
                                        comment,
                                    },
                                )));
                            }

                            self.start_binop(chars, "-", Token::Minus)
                        }
                        Some('>') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
                                _ => self.start_binop(chars, "->", Token::Arrow),
                            }
                        }
                        // a regular '-' operator
                        _ => self.start_binop(chars, "-", Token::Minus),
                    }
                }
                '/' => {
                    chars.next(); // consume the '/'
                    match chars.peek() {
                        Some('*') => {
                            chars.next(); // consume the '*', starting a multi-line comment
                            self.tokenize_multiline_comment(chars)
                        }
                        Some('/') if dialect_of!(self is SnowflakeDialect) => {
                            chars.next(); // consume the second '/', starting a snowflake single-line comment
                            let comment = self.tokenize_single_line_comment(chars);
                            Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                prefix: "//".to_owned(),
                                comment,
                            })))
                        }
                        Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
                            self.consume_and_return(chars, Token::DuckIntDiv)
                        }
                        // a regular '/' operator
                        _ => Ok(Some(Token::Div)),
                    }
                }
                '+' => self.consume_and_return(chars, Token::Plus),
                '*' => self.consume_and_return(chars, Token::Mul),
                '%' => {
                    chars.next(); // advance past '%'
                    match chars.peek() {
                        Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                        Some(sch) if self.dialect.is_identifier_start('%') => {
                            self.tokenize_identifier_or_keyword([ch, *sch], chars)
                        }
                        _ => self.start_binop(chars, "%", Token::Mod),
                    }
                }
                '|' => {
                    chars.next(); // consume the '|'
                    match chars.peek() {
                        Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
                        Some('|') => {
                            chars.next(); // consume the second '|'
                            match chars.peek() {
                                Some('/') => {
                                    self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
                                }
                                _ => self.start_binop(chars, "||", Token::StringConcat),
                            }
                        }
                        Some('&') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|&>",
                                    Token::VerticalBarAmpersandRightAngleBracket,
                                ),
                                _ => self.start_binop_opt(chars, "|&", None),
                            }
                        }
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(
                                    chars,
                                    "|>>",
                                    Token::VerticalBarShiftRight,
                                ),
                                _ => self.start_binop_opt(chars, "|>", None),
                            }
                        }
                        Some('>') if self.dialect.supports_pipe_operator() => {
                            self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
                        }
                        // Bitshift '|' operator
                        _ => self.start_binop(chars, "|", Token::Pipe),
                    }
                }
                '=' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('>') => self.consume_and_return(chars, Token::RArrow),
                        Some('=') => self.consume_and_return(chars, Token::DoubleEq),
                        _ => Ok(Some(Token::Eq)),
                    }
                }
                '!' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_and_return(chars, Token::Neq),
                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
                        Some('~') => {
                            chars.next();
                            match chars.peek() {
                                Some('*') => self
                                    .consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
                                Some('~') => {
                                    chars.next();
                                    match chars.peek() {
                                        Some('*') => self.consume_and_return(
                                            chars,
                                            Token::ExclamationMarkDoubleTildeAsterisk,
                                        ),
                                        _ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
                                    }
                                }
                                _ => Ok(Some(Token::ExclamationMarkTilde)),
                            }
                        }
                        _ => Ok(Some(Token::ExclamationMark)),
                    }
                }
                '<' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => {
                            chars.next();
                            match chars.peek() {
                                Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
                                _ => self.start_binop(chars, "<=", Token::LtEq),
                            }
                        }
                        Some('|') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
                        }
                        Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_for_binop(
                                    chars,
                                    "<<|",
                                    Token::ShiftLeftVerticalBar,
                                ),
                                _ => self.start_binop(chars, "<<", Token::ShiftLeft),
                            }
                        }
                        Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
                        Some('-') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('>') => {
                                    self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
                                }
                                _ => self.start_binop_opt(chars, "<-", None),
                            }
                        }
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
                        }
                        Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
                        _ => self.start_binop(chars, "<", Token::Lt),
                    }
                }
                '>' => {
                    chars.next(); // consume
                    match chars.peek() {
                        Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
                        Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
                        Some('^') if self.dialect.supports_geometric_types() => {
                            self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
                        }
                        _ => self.start_binop(chars, ">", Token::Gt),
                    }
                }
                ':' => {
                    chars.next();
                    match chars.peek() {
                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
                        Some('=') => self.consume_and_return(chars, Token::Assignment),
                        _ => Ok(Some(Token::Colon)),
                    }
                }
                ';' => self.consume_and_return(chars, Token::SemiColon),
                '\\' => self.consume_and_return(chars, Token::Backslash),
                '[' => self.consume_and_return(chars, Token::LBracket),
                ']' => self.consume_and_return(chars, Token::RBracket),
                '&' => {
                    chars.next(); // consume the '&'
                    match chars.peek() {
                        Some('>') if self.dialect.supports_geometric_types() => {
                            chars.next();
                            self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
                        }
                        Some('<') if self.dialect.supports_geometric_types() => {
                            chars.next(); // consume
                            match chars.peek() {
                                Some('|') => self.consume_and_return(
                                    chars,
                                    Token::AmpersandLeftAngleBracketVerticalBar,
                                ),
                                _ => {
                                    self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
                                }
                            }
                        }
                        Some('&') => {
                            chars.next(); // consume the second '&'
                            self.start_binop(chars, "&&", Token::Overlap)
                        }
                        // Bitshift '&' operator
                        _ => self.start_binop(chars, "&", Token::Ampersand),
                    }
                }
                '^' => {
                    chars.next(); // consume the '^'
                    match chars.peek() {
                        Some('@') => self.consume_and_return(chars, Token::CaretAt),
                        _ => Ok(Some(Token::Caret)),
                    }
                }
                '{' => self.consume_and_return(chars, Token::LBrace),
                '}' => self.consume_and_return(chars, Token::RBrace),
                '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
                {
chars.next(); // consume the '#', starting a '#'-prefixed single-line comment (Snowflake, BigQuery, MySQL, Hive)
|
||
let comment = self.tokenize_single_line_comment(chars);
|
||
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "#".to_owned(),
|
||
comment,
|
||
})))
|
||
}
|
||
'~' => {
|
||
chars.next(); // consume
|
||
match chars.peek() {
|
||
Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
|
||
Some('=') if self.dialect.supports_geometric_types() => {
|
||
self.consume_for_binop(chars, "~=", Token::TildeEqual)
|
||
}
|
||
Some('~') => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('*') => {
|
||
self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
|
||
}
|
||
_ => self.start_binop(chars, "~~", Token::DoubleTilde),
|
||
}
|
||
}
|
||
_ => self.start_binop(chars, "~", Token::Tilde),
|
||
}
|
||
}
|
||
'#' => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
|
||
Some('>') => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('>') => {
|
||
self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
|
||
}
|
||
_ => self.start_binop(chars, "#>", Token::HashArrow),
|
||
}
|
||
}
|
||
Some(' ') => Ok(Some(Token::Sharp)),
|
||
Some('#') if self.dialect.supports_geometric_types() => {
|
||
self.consume_for_binop(chars, "##", Token::DoubleSharp)
|
||
}
|
||
Some(sch) if self.dialect.is_identifier_start('#') => {
|
||
self.tokenize_identifier_or_keyword([ch, *sch], chars)
|
||
}
|
||
_ => self.start_binop(chars, "#", Token::Sharp),
|
||
}
|
||
}
|
||
'@' => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('@') if self.dialect.supports_geometric_types() => {
|
||
self.consume_and_return(chars, Token::AtAt)
|
||
}
|
||
Some('-') if self.dialect.supports_geometric_types() => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('@') => self.consume_and_return(chars, Token::AtDashAt),
|
||
_ => self.start_binop_opt(chars, "@-", None),
|
||
}
|
||
}
|
||
Some('>') => self.consume_and_return(chars, Token::AtArrow),
|
||
Some('?') => self.consume_and_return(chars, Token::AtQuestion),
|
||
Some('@') => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some(' ') => Ok(Some(Token::AtAt)),
|
||
Some(tch) if self.dialect.is_identifier_start('@') => {
|
||
self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
|
||
}
|
||
_ => Ok(Some(Token::AtAt)),
|
||
}
|
||
}
|
||
Some(' ') => Ok(Some(Token::AtSign)),
|
||
// We break on quotes here, because no dialect allows identifiers starting
|
||
// with @ and containing quotation marks (e.g. `@'foo'`) unless they are
|
||
// quoted, which is tokenized as a quoted string, not here (e.g.
|
||
// `"@'foo'"`). Further, at least two dialects parse `@` followed by a
|
||
// quoted string as two separate tokens, which this allows. For example,
|
||
// Postgres parses `@'1'` as the absolute value of '1' which is implicitly
|
||
// cast to a numeric type. And when parsing MySQL-style grantees (e.g.
|
||
// `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
|
||
// for the user, the `@`, and the host.
|
||
Some('\'') => Ok(Some(Token::AtSign)),
|
||
Some('\"') => Ok(Some(Token::AtSign)),
|
||
Some('`') => Ok(Some(Token::AtSign)),
|
||
Some(sch) if self.dialect.is_identifier_start('@') => {
|
||
self.tokenize_identifier_or_keyword([ch, *sch], chars)
|
||
}
|
||
_ => Ok(Some(Token::AtSign)),
|
||
}
|
||
}
|
||
// Postgres uses ? for jsonb operators, not prepared statements
|
||
'?' if self.dialect.supports_geometric_types() => {
|
||
chars.next(); // consume
|
||
match chars.peek() {
|
||
Some('|') => {
|
||
chars.next();
|
||
match chars.peek() {
|
||
Some('|') => self.consume_and_return(
|
||
chars,
|
||
Token::QuestionMarkDoubleVerticalBar,
|
||
),
|
||
_ => Ok(Some(Token::QuestionPipe)),
|
||
}
|
||
}
|
||
|
||
Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
|
||
Some('-') => {
|
||
chars.next(); // consume
|
||
match chars.peek() {
|
||
Some('|') => self
|
||
.consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
|
||
_ => Ok(Some(Token::QuestionMarkDash)),
|
||
}
|
||
}
|
||
Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
|
||
_ => self.consume_and_return(chars, Token::Question),
|
||
}
|
||
}
|
||
'?' => {
|
||
chars.next();
|
||
let s = peeking_take_while(chars, |ch| ch.is_numeric());
|
||
Ok(Some(Token::Placeholder(String::from("?") + &s)))
|
||
}
|
||
|
||
// identifier or keyword
|
||
ch if self.dialect.is_identifier_start(ch) => {
|
||
self.tokenize_identifier_or_keyword([ch], chars)
|
||
}
|
||
'$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
|
||
|
||
// whitespace check (including unicode chars) should be last as it covers some of the chars above
|
||
ch if ch.is_whitespace() => {
|
||
self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
|
||
}
|
||
other => self.consume_and_return(chars, Token::Char(other)),
|
||
},
|
||
None => Ok(None),
|
||
}
|
||
}

    /// Consume the next character, then parse a custom binary operator.
    /// The consumed character must already be included in `prefix`.
    fn consume_for_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        chars.next(); // consume the first char
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator, falling back to `default` when no
    /// further custom operator characters follow `prefix`.
    fn start_binop(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Token,
    ) -> Result<Option<Token>, TokenizerError> {
        self.start_binop_opt(chars, prefix, Some(default))
    }

    /// Parse a custom binary operator, returning an error when no custom
    /// operator characters follow `prefix` and no `default` token is provided.
    fn start_binop_opt(
        &self,
        chars: &mut State,
        prefix: &str,
        default: Option<Token>,
    ) -> Result<Option<Token>, TokenizerError> {
        let mut custom = None;
        while let Some(&ch) = chars.peek() {
            if !self.dialect.is_custom_operator_part(ch) {
                break;
            }

            custom.get_or_insert_with(|| prefix.to_string()).push(ch);
            chars.next();
        }
        match (custom, default) {
            (Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
            (None, Some(tok)) => Ok(Some(tok)),
            (None, None) => self.tokenizer_error(
                chars.location(),
                format!("Expected a valid binary operator after '{prefix}'"),
            ),
        }
    }
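
    // For example (assuming a dialect whose `is_custom_operator_part` accepts '>'),
    // tokenizing `a ~> b` reaches `start_binop` with prefix "~" and default
    // `Token::Tilde`; the trailing '>' is folded in, producing
    // `Token::CustomBinaryOperator("~>".to_string())`. With no trailing operator
    // characters, the `default` token is returned unchanged.
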
/// Tokenize dollar preceded value (i.e: a string/placeholder)
|
||
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
|
||
let mut s = String::new();
|
||
let mut value = String::new();
|
||
|
||
chars.next();
|
||
|
||
// Unless the dialect treats `$$` as a placeholder (`supports_dollar_placeholder`), a second '$' here starts an untagged dollar-quoted string.
|
||
if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
|
||
chars.next();
|
||
|
||
let mut is_terminated = false;
|
||
let mut prev: Option<char> = None;
|
||
|
||
while let Some(&ch) = chars.peek() {
|
||
if prev == Some('$') {
|
||
if ch == '$' {
|
||
chars.next();
|
||
is_terminated = true;
|
||
break;
|
||
} else {
|
||
s.push('$');
|
||
s.push(ch);
|
||
}
|
||
} else if ch != '$' {
|
||
s.push(ch);
|
||
}
|
||
|
||
prev = Some(ch);
|
||
chars.next();
|
||
}
|
||
|
||
return if chars.peek().is_none() && !is_terminated {
|
||
self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
|
||
} else {
|
||
Ok(Token::DollarQuotedString(DollarQuotedString {
|
||
value: s,
|
||
tag: None,
|
||
}))
|
||
};
|
||
} else {
|
||
value.push_str(&peeking_take_while(chars, |ch| {
|
||
ch.is_alphanumeric()
|
||
|| ch == '_'
|
||
// Allow $ as a placeholder character if the dialect supports it
|
||
|| matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
|
||
}));
|
||
|
||
// Only look for the end delimiter when the tag is followed by '$' and the dialect does not treat `$...` as a placeholder; otherwise this is a placeholder such as `$1` or `$name`.
|
||
if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
|
||
chars.next();
|
||
|
||
let mut temp = String::new();
|
||
let end_delimiter = format!("${value}$");
|
||
|
||
loop {
|
||
match chars.next() {
|
||
Some(ch) => {
|
||
temp.push(ch);
|
||
|
||
if temp.ends_with(&end_delimiter) {
|
||
if let Some(temp) = temp.strip_suffix(&end_delimiter) {
|
||
s.push_str(temp);
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
None => {
|
||
if temp.ends_with(&end_delimiter) {
|
||
if let Some(temp) = temp.strip_suffix(&end_delimiter) {
|
||
s.push_str(temp);
|
||
}
|
||
break;
|
||
}
|
||
|
||
return self.tokenizer_error(
|
||
chars.location(),
|
||
"Unterminated dollar-quoted, expected $",
|
||
);
|
||
}
|
||
}
|
||
}
|
||
} else {
|
||
return Ok(Token::Placeholder(String::from("$") + &value));
|
||
}
|
||
}
|
||
|
||
Ok(Token::DollarQuotedString(DollarQuotedString {
|
||
value: s,
|
||
tag: if value.is_empty() { None } else { Some(value) },
|
||
}))
|
||
}
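
    // Illustrative outcomes of the branches above (mirroring the tests later in
    // this file): `$$abc$$` yields an untagged DollarQuotedString, `$tag$abc$tag$`
    // yields one tagged "tag", and in dialects where `supports_dollar_placeholder()`
    // is true an input such as `$1` or `$ABC` becomes `Token::Placeholder`.
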
|
||
|
||
fn tokenizer_error<R>(
|
||
&self,
|
||
loc: Location,
|
||
message: impl Into<String>,
|
||
) -> Result<R, TokenizerError> {
|
||
Err(TokenizerError {
|
||
message: message.into(),
|
||
location: loc,
|
||
})
|
||
}
|
||
|
||
// Consume characters until newline
|
||
fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
|
||
let mut comment = peeking_take_while(chars, |ch| match ch {
|
||
'\n' => false, // Always stop at \n
|
||
'\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
|
||
_ => true, // Keep consuming for other characters
|
||
});
|
||
|
||
if let Some(ch) = chars.next() {
|
||
assert!(ch == '\n' || ch == '\r');
|
||
comment.push(ch);
|
||
}
|
||
|
||
comment
|
||
}
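
    // Note that the terminating newline (or '\r' for Postgres) is kept as part of
    // the returned comment text: e.g. `--x\n1` yields a comment of "x\n" followed
    // by the number token, as exercised by `tokenize_comment` in the tests below.
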
|
||
|
||
/// Tokenize an identifier or keyword, after the first char is already consumed.
|
||
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
|
||
let mut s = first_chars.into();
|
||
s.push_str(&peeking_take_while(chars, |ch| {
|
||
self.dialect.is_identifier_part(ch)
|
||
}));
|
||
s
|
||
}
|
||
|
||
/// Read a quoted identifier
|
||
fn tokenize_quoted_identifier(
|
||
&self,
|
||
quote_start: char,
|
||
chars: &mut State,
|
||
) -> Result<String, TokenizerError> {
|
||
let error_loc = chars.location();
|
||
chars.next(); // consume the opening quote
|
||
let quote_end = Word::matching_end_quote(quote_start);
|
||
let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
|
||
|
||
if last_char == Some(quote_end) {
|
||
Ok(s)
|
||
} else {
|
||
self.tokenizer_error(
|
||
error_loc,
|
||
format!("Expected close delimiter '{quote_end}' before EOF."),
|
||
)
|
||
}
|
||
}
|
||
|
||
/// Read a single quoted string, starting with the opening quote.
|
||
fn tokenize_escaped_single_quoted_string(
|
||
&self,
|
||
starting_loc: Location,
|
||
chars: &mut State,
|
||
) -> Result<String, TokenizerError> {
|
||
if let Some(s) = unescape_single_quoted_string(chars) {
|
||
return Ok(s);
|
||
}
|
||
|
||
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
|
||
}
|
||
|
||
/// Reads a string literal quoted by a single or triple quote characters.
|
||
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
|
||
fn tokenize_single_or_triple_quoted_string<F>(
|
||
&self,
|
||
chars: &mut State,
|
||
quote_style: char,
|
||
backslash_escape: bool,
|
||
single_quote_token: F,
|
||
triple_quote_token: F,
|
||
) -> Result<Option<Token>, TokenizerError>
|
||
where
|
||
F: Fn(String) -> Token,
|
||
{
|
||
let error_loc = chars.location();
|
||
|
||
let mut num_opening_quotes = 0u8;
|
||
for _ in 0..3 {
|
||
if Some("e_style) == chars.peek() {
|
||
chars.next(); // Consume quote.
|
||
num_opening_quotes += 1;
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
|
||
let (token_fn, num_quote_chars) = match num_opening_quotes {
|
||
1 => (single_quote_token, NumStringQuoteChars::One),
|
||
2 => {
|
||
// If we matched double quotes, then this is an empty string.
|
||
return Ok(Some(single_quote_token("".into())));
|
||
}
|
||
3 => {
|
||
let Some(num_quote_chars) = NonZeroU8::new(3) else {
|
||
return self.tokenizer_error(error_loc, "invalid number of opening quotes");
|
||
};
|
||
(
|
||
triple_quote_token,
|
||
NumStringQuoteChars::Many(num_quote_chars),
|
||
)
|
||
}
|
||
_ => {
|
||
return self.tokenizer_error(error_loc, "invalid string literal opening");
|
||
}
|
||
};
|
||
|
||
let settings = TokenizeQuotedStringSettings {
|
||
quote_style,
|
||
num_quote_chars,
|
||
num_opening_quotes_to_consume: 0,
|
||
backslash_escape,
|
||
};
|
||
|
||
self.tokenize_quoted_string(chars, settings)
|
||
.map(token_fn)
|
||
.map(Some)
|
||
}
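
    // For example, with BigQuery-style literals `'abc'` goes through the
    // single-quote token constructor and `'''abc'''` through the triple-quote
    // one, while exactly two opening quotes (`''`) are returned immediately as
    // an empty single-quoted string.
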
|
||
|
||
/// Reads a string literal quoted by a single quote character.
|
||
fn tokenize_single_quoted_string(
|
||
&self,
|
||
chars: &mut State,
|
||
quote_style: char,
|
||
backslash_escape: bool,
|
||
) -> Result<String, TokenizerError> {
|
||
self.tokenize_quoted_string(
|
||
chars,
|
||
TokenizeQuotedStringSettings {
|
||
quote_style,
|
||
num_quote_chars: NumStringQuoteChars::One,
|
||
num_opening_quotes_to_consume: 1,
|
||
backslash_escape,
|
||
},
|
||
)
|
||
}
|
||
|
||
/// Read a quoted string.
|
||
fn tokenize_quoted_string(
|
||
&self,
|
||
chars: &mut State,
|
||
settings: TokenizeQuotedStringSettings,
|
||
) -> Result<String, TokenizerError> {
|
||
let mut s = String::new();
|
||
let error_loc = chars.location();
|
||
|
||
// Consume any opening quotes.
|
||
for _ in 0..settings.num_opening_quotes_to_consume {
|
||
if Some(settings.quote_style) != chars.next() {
|
||
return self.tokenizer_error(error_loc, "invalid string literal opening");
|
||
}
|
||
}
|
||
|
||
let mut num_consecutive_quotes = 0;
|
||
while let Some(&ch) = chars.peek() {
|
||
let pending_final_quote = match settings.num_quote_chars {
|
||
NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
|
||
n @ NumStringQuoteChars::Many(count)
|
||
if num_consecutive_quotes + 1 == count.get() =>
|
||
{
|
||
Some(n)
|
||
}
|
||
NumStringQuoteChars::Many(_) => None,
|
||
};
|
||
|
||
match ch {
|
||
char if char == settings.quote_style && pending_final_quote.is_some() => {
|
||
chars.next(); // consume
|
||
|
||
if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
|
||
// For an initial string like `"""abc"""`, at this point we have
|
||
// `abc""` in the buffer and have now matched the final `"`.
|
||
// However, the string to return is simply `abc`, so we strip off
|
||
// the trailing quotes before returning.
|
||
let mut buf = s.chars();
|
||
for _ in 1..count.get() {
|
||
buf.next_back();
|
||
}
|
||
return Ok(buf.as_str().to_string());
|
||
} else if chars
|
||
.peek()
|
||
.map(|c| *c == settings.quote_style)
|
||
.unwrap_or(false)
|
||
{
|
||
s.push(ch);
|
||
if !self.unescape {
|
||
// In no-escape mode, the given query has to be saved completely
|
||
s.push(ch);
|
||
}
|
||
chars.next();
|
||
} else {
|
||
return Ok(s);
|
||
}
|
||
}
|
||
'\\' if settings.backslash_escape => {
|
||
// consume backslash
|
||
chars.next();
|
||
|
||
num_consecutive_quotes = 0;
|
||
|
||
if let Some(next) = chars.peek() {
|
||
if !self.unescape
|
||
|| (self.dialect.ignores_wildcard_escapes()
|
||
&& (*next == '%' || *next == '_'))
|
||
{
|
||
// In no-escape mode, the given query has to be saved completely
|
||
// including backslashes. Similarly, with ignore_like_wildcard_escapes,
|
||
// the backslash is not stripped.
|
||
s.push(ch);
|
||
s.push(*next);
|
||
chars.next(); // consume next
|
||
} else {
|
||
let n = match next {
|
||
'0' => '\0',
|
||
'a' => '\u{7}',
|
||
'b' => '\u{8}',
|
||
'f' => '\u{c}',
|
||
'n' => '\n',
|
||
'r' => '\r',
|
||
't' => '\t',
|
||
'Z' => '\u{1a}',
|
||
_ => *next,
|
||
};
|
||
s.push(n);
|
||
chars.next(); // consume next
|
||
}
|
||
}
|
||
}
|
||
ch => {
|
||
chars.next(); // consume ch
|
||
|
||
if ch == settings.quote_style {
|
||
num_consecutive_quotes += 1;
|
||
} else {
|
||
num_consecutive_quotes = 0;
|
||
}
|
||
|
||
s.push(ch);
|
||
}
|
||
}
|
||
}
|
||
self.tokenizer_error(error_loc, "Unterminated string literal")
|
||
}
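
    // For example, `'it''s'` is returned as `it's` when unescaping is enabled;
    // with `Tokenizer::with_unescape(false)` the doubled quote is preserved
    // verbatim, as the no-escape tests below demonstrate for quoted identifiers.
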
|
||
|
||
fn tokenize_multiline_comment(
|
||
&self,
|
||
chars: &mut State,
|
||
) -> Result<Option<Token>, TokenizerError> {
|
||
let mut s = String::new();
|
||
let mut nested = 1;
|
||
let supports_nested_comments = self.dialect.supports_nested_comments();
|
||
|
||
loop {
|
||
match chars.next() {
|
||
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
|
||
chars.next(); // consume the '*'
|
||
s.push('/');
|
||
s.push('*');
|
||
nested += 1;
|
||
}
|
||
Some('*') if matches!(chars.peek(), Some('/')) => {
|
||
chars.next(); // consume the '/'
|
||
nested -= 1;
|
||
if nested == 0 {
|
||
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
|
||
}
|
||
s.push('*');
|
||
s.push('/');
|
||
}
|
||
Some(ch) => {
|
||
s.push(ch);
|
||
}
|
||
None => {
|
||
break self.tokenizer_error(
|
||
chars.location(),
|
||
"Unexpected EOF while in a multi-line comment",
|
||
);
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
|
||
let mut last_char = None;
|
||
let mut s = String::new();
|
||
while let Some(ch) = chars.next() {
|
||
if ch == quote_end {
|
||
if chars.peek() == Some("e_end) {
|
||
chars.next();
|
||
s.push(ch);
|
||
if !self.unescape {
|
||
// In no-escape mode, the given query has to be saved completely
|
||
s.push(ch);
|
||
}
|
||
} else {
|
||
last_char = Some(quote_end);
|
||
break;
|
||
}
|
||
} else {
|
||
s.push(ch);
|
||
}
|
||
}
|
||
(s, last_char)
|
||
}
|
||
|
||
#[allow(clippy::unnecessary_wraps)]
|
||
fn consume_and_return(
|
||
&self,
|
||
chars: &mut State,
|
||
t: Token,
|
||
) -> Result<Option<Token>, TokenizerError> {
|
||
chars.next();
|
||
Ok(Some(t))
|
||
}
|
||
}

/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
    let mut s = String::new();
    while let Some(&ch) = chars.peek() {
        if predicate(ch) {
            chars.next(); // consume
            s.push(ch);
        } else {
            break;
        }
    }
    s
}
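
// For example, with remaining input "123abc" and a predicate of
// `|ch| ch.is_ascii_digit()`, this returns "123" and leaves 'a' as the next
// character to be read from `chars`.
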
/// Same as peeking_take_while, but also passes the next character to the predicate.
|
||
fn peeking_next_take_while(
|
||
chars: &mut State,
|
||
mut predicate: impl FnMut(char, Option<char>) -> bool,
|
||
) -> String {
|
||
let mut s = String::new();
|
||
while let Some(&ch) = chars.peek() {
|
||
let next_char = chars.peekable.clone().nth(1);
|
||
if predicate(ch, next_char) {
|
||
chars.next(); // consume
|
||
s.push(ch);
|
||
} else {
|
||
break;
|
||
}
|
||
}
|
||
s
|
||
}
|
||
|
||
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
|
||
Unescape::new(chars).unescape()
|
||
}
|
||
|
||
struct Unescape<'a: 'b, 'b> {
|
||
chars: &'b mut State<'a>,
|
||
}
|
||
|
||
impl<'a: 'b, 'b> Unescape<'a, 'b> {
|
||
fn new(chars: &'b mut State<'a>) -> Self {
|
||
Self { chars }
|
||
}
|
||
fn unescape(mut self) -> Option<String> {
|
||
let mut unescaped = String::new();
|
||
|
||
self.chars.next();
|
||
|
||
while let Some(c) = self.chars.next() {
|
||
if c == '\'' {
|
||
// case: ''''
|
||
if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
|
||
self.chars.next();
|
||
unescaped.push('\'');
|
||
continue;
|
||
}
|
||
return Some(unescaped);
|
||
}
|
||
|
||
if c != '\\' {
|
||
unescaped.push(c);
|
||
continue;
|
||
}
|
||
|
||
let c = match self.chars.next()? {
|
||
'b' => '\u{0008}',
|
||
'f' => '\u{000C}',
|
||
'n' => '\n',
|
||
'r' => '\r',
|
||
't' => '\t',
|
||
'u' => self.unescape_unicode_16()?,
|
||
'U' => self.unescape_unicode_32()?,
|
||
'x' => self.unescape_hex()?,
|
||
c if c.is_digit(8) => self.unescape_octal(c)?,
|
||
c => c,
|
||
};
|
||
|
||
unescaped.push(Self::check_null(c)?);
|
||
}
|
||
|
||
None
|
||
}
|
||
|
||
#[inline]
|
||
fn check_null(c: char) -> Option<char> {
|
||
if c == '\0' {
|
||
None
|
||
} else {
|
||
Some(c)
|
||
}
|
||
}
|
||
|
||
#[inline]
|
||
fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
|
||
// u32 is used here because Pg has an overflow operation rather than throwing an exception directly.
|
||
match u32::from_str_radix(s, RADIX) {
|
||
Err(_) => None,
|
||
Ok(n) => {
|
||
let n = n & 0xFF;
|
||
if n <= 127 {
|
||
char::from_u32(n)
|
||
} else {
|
||
None
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// Hexadecimal byte value. \xh, \xhh (h = 0–9, A–F)
|
||
fn unescape_hex(&mut self) -> Option<char> {
|
||
let mut s = String::new();
|
||
|
||
for _ in 0..2 {
|
||
match self.next_hex_digit() {
|
||
Some(c) => s.push(c),
|
||
None => break,
|
||
}
|
||
}
|
||
|
||
if s.is_empty() {
|
||
return Some('x');
|
||
}
|
||
|
||
Self::byte_to_char::<16>(&s)
|
||
}
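
    // For example, `\x4B` decodes to 'K' (0x4B = 75), a bare `\x` with no hex
    // digit falls back to a literal 'x', and values above 0x7F are rejected by
    // `byte_to_char` (see `test_unescape` below).
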
|
||
|
||
#[inline]
|
||
fn next_hex_digit(&mut self) -> Option<char> {
|
||
match self.chars.peek() {
|
||
Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
// Octal byte value. \o, \oo, \ooo (o = 0–7)
|
||
fn unescape_octal(&mut self, c: char) -> Option<char> {
|
||
let mut s = String::new();
|
||
|
||
s.push(c);
|
||
for _ in 0..2 {
|
||
match self.next_octal_digit() {
|
||
Some(c) => s.push(c),
|
||
None => break,
|
||
}
|
||
}
|
||
|
||
Self::byte_to_char::<8>(&s)
|
||
}
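
    // For example, `\123` decodes to 'S' (0o123 = 83); as with hex escapes, the
    // value is masked to one byte and must stay within the ASCII range.
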
|
||
|
||
#[inline]
|
||
fn next_octal_digit(&mut self) -> Option<char> {
|
||
match self.chars.peek() {
|
||
Some(c) if c.is_digit(8) => self.chars.next(),
|
||
_ => None,
|
||
}
|
||
}
|
||
|
||
// 16-bit hexadecimal Unicode character value. \uxxxx (x = 0–9, A–F)
|
||
fn unescape_unicode_16(&mut self) -> Option<char> {
|
||
self.unescape_unicode::<4>()
|
||
}
|
||
|
||
// 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0–9, A–F)
|
||
fn unescape_unicode_32(&mut self) -> Option<char> {
|
||
self.unescape_unicode::<8>()
|
||
}
|
||
|
||
fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
|
||
let mut s = String::new();
|
||
for _ in 0..NUM {
|
||
s.push(self.chars.next()?);
|
||
}
|
||
match u32::from_str_radix(&s, 16) {
|
||
Err(_) => None,
|
||
Ok(n) => char::from_u32(n),
|
||
}
|
||
}
|
||
}
|
||
|
||
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
|
||
let mut unescaped = String::new();
|
||
chars.next(); // consume the opening quote
|
||
while let Some(c) = chars.next() {
|
||
match c {
|
||
'\'' => {
|
||
if chars.peek() == Some(&'\'') {
|
||
chars.next();
|
||
unescaped.push('\'');
|
||
} else {
|
||
return Ok(unescaped);
|
||
}
|
||
}
|
||
'\\' => match chars.peek() {
|
||
Some('\\') => {
|
||
chars.next();
|
||
unescaped.push('\\');
|
||
}
|
||
Some('+') => {
|
||
chars.next();
|
||
unescaped.push(take_char_from_hex_digits(chars, 6)?);
|
||
}
|
||
_ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
|
||
},
|
||
_ => {
|
||
unescaped.push(c);
|
||
}
|
||
}
|
||
}
|
||
Err(TokenizerError {
|
||
message: "Unterminated unicode encoded string literal".to_string(),
|
||
location: chars.location(),
|
||
})
|
||
}
|
||
|
||
fn take_char_from_hex_digits(
|
||
chars: &mut State<'_>,
|
||
max_digits: usize,
|
||
) -> Result<char, TokenizerError> {
|
||
let mut result = 0u32;
|
||
for _ in 0..max_digits {
|
||
let next_char = chars.next().ok_or_else(|| TokenizerError {
|
||
message: "Unexpected EOF while parsing hex digit in escaped unicode string."
|
||
.to_string(),
|
||
location: chars.location(),
|
||
})?;
|
||
let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
|
||
message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
|
||
location: chars.location(),
|
||
})?;
|
||
result = result * 16 + digit;
|
||
}
|
||
char::from_u32(result).ok_or_else(|| TokenizerError {
|
||
message: format!("Invalid unicode character: {result:x}"),
|
||
location: chars.location(),
|
||
})
|
||
}
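
// For example, reading four digits from "0041..." yields 'A' (0x41 = 65); this
// is how escapes such as `\0041` inside `U&'...'` literals are decoded above.
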
|
||
|
||
#[cfg(test)]
|
||
mod tests {
|
||
use super::*;
|
||
use crate::dialect::{
|
||
BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
|
||
};
|
||
use crate::test_utils::all_dialects_where;
|
||
use core::fmt::Debug;
|
||
|
||
#[test]
|
||
fn tokenizer_error_impl() {
|
||
let err = TokenizerError {
|
||
message: "test".into(),
|
||
location: Location { line: 1, column: 1 },
|
||
};
|
||
#[cfg(feature = "std")]
|
||
{
|
||
use std::error::Error;
|
||
assert!(err.source().is_none());
|
||
}
|
||
assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_select_1() {
|
||
let sql = String::from("SELECT 1");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_select_float() {
|
||
let sql = String::from("SELECT .1");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from(".1"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_clickhouse_double_equal() {
|
||
let sql = String::from("SELECT foo=='1'");
|
||
let dialect = ClickHouseDialect {};
|
||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||
let tokens = tokenizer.tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Word(Word {
|
||
value: "foo".to_string(),
|
||
quote_style: None,
|
||
keyword: Keyword::NoKeyword,
|
||
}),
|
||
Token::DoubleEq,
|
||
Token::SingleQuotedString("1".to_string()),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_numeric_literal_underscore() {
|
||
let dialect = GenericDialect {};
|
||
let sql = String::from("SELECT 10_000");
|
||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||
let tokens = tokenizer.tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("10".to_string(), false),
|
||
Token::make_word("_000", None),
|
||
];
|
||
compare(expected, tokens);
|
||
|
||
all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
|
||
"SELECT 10_000, _10_000, 10_00_, 10___0",
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("10_000".to_string(), false),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("10_00".to_string(), false),
|
||
Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("10".to_string(), false),
|
||
Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
|
||
],
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_select_exponent() {
|
||
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1e10"), false),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1e-10"), false),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1e+10"), false),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1"), false),
|
||
Token::make_word("ea", None),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1e-10"), false),
|
||
Token::make_word("a", None),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1e-10"), false),
|
||
Token::Minus,
|
||
Token::Number(String::from("10"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_scalar_function() {
|
||
let sql = String::from("SELECT sqrt(1)");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("sqrt", None),
|
||
Token::LParen,
|
||
Token::Number(String::from("1"), false),
|
||
Token::RParen,
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_string_string_concat() {
|
||
let sql = String::from("SELECT 'a' || 'b'");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString(String::from("a")),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::StringConcat,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString(String::from("b")),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
#[test]
|
||
fn tokenize_bitwise_op() {
|
||
let sql = String::from("SELECT one | two ^ three");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("one", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Pipe,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("two", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Caret,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("three", None),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_logical_xor() {
|
||
let sql =
|
||
String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("true"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("XOR"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("true"),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("false"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("XOR"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("false"),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("true"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("XOR"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("false"),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("false"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("XOR"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("true"),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_simple_select() {
|
||
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Mul,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("customer", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("WHERE"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("id", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Eq,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1"), false),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("LIMIT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("5"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_explain_select() {
|
||
let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("EXPLAIN"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Mul,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("customer", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("WHERE"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("id", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Eq,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_explain_analyze_select() {
|
||
let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("EXPLAIN"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("ANALYZE"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Mul,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("customer", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("WHERE"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("id", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Eq,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("1"), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_string_predicate() {
|
||
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Mul,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("customer", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("WHERE"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("salary", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Neq,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString(String::from("Not Provided")),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_invalid_string() {
|
||
let sql = String::from("\n💝مصطفىh");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
// println!("tokens: {:#?}", tokens);
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::Char('💝'),
|
||
Token::make_word("مصطفىh", None),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_newline_in_string_literal() {
|
||
let sql = String::from("'foo\r\nbar\nbaz'");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_unterminated_string_literal() {
|
||
let sql = String::from("select 'foo");
|
||
|
||
let dialect = GenericDialect {};
|
||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||
assert_eq!(
|
||
tokenizer.tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Unterminated string literal".to_string(),
|
||
location: Location { line: 1, column: 8 },
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_unterminated_string_literal_utf8() {
|
||
let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
|
||
|
||
let dialect = GenericDialect {};
|
||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||
assert_eq!(
|
||
tokenizer.tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Unterminated string literal".to_string(),
|
||
location: Location {
|
||
line: 1,
|
||
column: 35
|
||
}
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_invalid_string_cols() {
|
||
let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
// println!("tokens: {:#?}", tokens);
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Mul,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("table"),
|
||
Token::Whitespace(Whitespace::Tab),
|
||
Token::Char('💝'),
|
||
Token::make_word("مصطفىh", None),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_tagged() {
|
||
let test_cases = vec![
|
||
(
|
||
String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
|
||
tag: Some("tag".into()),
|
||
})
|
||
]
|
||
),
|
||
(
|
||
String::from("SELECT $abc$x$ab$abc$"),
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "x$ab".into(),
|
||
tag: Some("abc".into()),
|
||
})
|
||
]
|
||
),
|
||
(
|
||
String::from("SELECT $abc$$abc$"),
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "".into(),
|
||
tag: Some("abc".into()),
|
||
})
|
||
]
|
||
),
|
||
(
|
||
String::from("0$abc$$abc$1"),
|
||
vec![
|
||
Token::Number("0".into(), false),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "".into(),
|
||
tag: Some("abc".into()),
|
||
}),
|
||
Token::Number("1".into(), false),
|
||
]
|
||
),
|
||
(
|
||
String::from("$function$abc$q$data$q$$function$"),
|
||
vec![
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "abc$q$data$q$".into(),
|
||
tag: Some("function".into()),
|
||
}),
|
||
]
|
||
),
|
||
];
|
||
|
||
let dialect = GenericDialect {};
|
||
for (sql, expected) in test_cases {
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
compare(expected, tokens);
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_tagged_unterminated() {
|
||
let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
|
||
let dialect = GenericDialect {};
|
||
assert_eq!(
|
||
Tokenizer::new(&dialect, &sql).tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Unterminated dollar-quoted, expected $".into(),
|
||
location: Location {
|
||
line: 1,
|
||
column: 91
|
||
}
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
|
||
let sql = String::from("SELECT $abc$abc$");
|
||
let dialect = GenericDialect {};
|
||
assert_eq!(
|
||
Tokenizer::new(&dialect, &sql).tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Unterminated dollar-quoted, expected $".into(),
|
||
location: Location {
|
||
line: 1,
|
||
column: 17
|
||
}
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_placeholder() {
|
||
let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
|
||
let dialect = SQLiteDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
assert_eq!(
|
||
tokens,
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Placeholder("$$".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Placeholder("$$ABC$$".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Placeholder("$ABC$".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Placeholder("$ABC".into()),
|
||
]
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_nested_dollar_quoted_strings() {
|
||
let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "dollar $nested$ string".into(),
|
||
tag: Some("tag".into()),
|
||
}),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_untagged_empty() {
|
||
let sql = String::from("SELECT $$$$");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "".into(),
|
||
tag: None,
|
||
}),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_untagged() {
|
||
let sql =
|
||
String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DollarQuotedString(DollarQuotedString {
|
||
value: "within dollar '$' quoted strings have $tags like this$ ".into(),
|
||
tag: None,
|
||
}),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_dollar_quoted_string_untagged_unterminated() {
|
||
let sql = String::from(
|
||
"SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
|
||
);
|
||
let dialect = GenericDialect {};
|
||
assert_eq!(
|
||
Tokenizer::new(&dialect, &sql).tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Unterminated dollar-quoted string".into(),
|
||
location: Location {
|
||
line: 1,
|
||
column: 86
|
||
}
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_right_arrow() {
|
||
let sql = String::from("FUNCTION(key=>value)");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_word("FUNCTION", None),
|
||
Token::LParen,
|
||
Token::make_word("key", None),
|
||
Token::RArrow,
|
||
Token::make_word("value", None),
|
||
Token::RParen,
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_is_null() {
|
||
let sql = String::from("a IS NULL");
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
|
||
let expected = vec![
|
||
Token::make_word("a", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("IS"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("NULL"),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_comment() {
|
||
let test_cases = vec![
|
||
(
|
||
String::from("0--this is a comment\n1"),
|
||
vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "--".to_string(),
|
||
comment: "this is a comment\n".to_string(),
|
||
}),
|
||
Token::Number("1".to_string(), false),
|
||
],
|
||
),
|
||
(
|
||
String::from("0--this is a comment\r1"),
|
||
vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "--".to_string(),
|
||
comment: "this is a comment\r1".to_string(),
|
||
}),
|
||
],
|
||
),
|
||
(
|
||
String::from("0--this is a comment\r\n1"),
|
||
vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "--".to_string(),
|
||
comment: "this is a comment\r\n".to_string(),
|
||
}),
|
||
Token::Number("1".to_string(), false),
|
||
],
|
||
),
|
||
];
|
||
|
||
let dialect = GenericDialect {};
|
||
|
||
for (sql, expected) in test_cases {
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
compare(expected, tokens);
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_comment_postgres() {
|
||
let sql = String::from("1--\r0");
|
||
|
||
let dialect = PostgreSqlDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::Number("1".to_string(), false),
|
||
Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "--".to_string(),
|
||
comment: "\r".to_string(),
|
||
}),
|
||
Token::Number("0".to_string(), false),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_comment_at_eof() {
|
||
let sql = String::from("--this is a comment");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
|
||
prefix: "--".to_string(),
|
||
comment: "this is a comment".to_string(),
|
||
})];
|
||
compare(expected, tokens);
|
||
}
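
    // A minimal illustrative sketch (not an upstream test): in dialects that use
    // '#' for single-line comments (e.g. MySQL), the '#' match arm produces a
    // SingleLineComment whitespace token with prefix "#". The expected tokens
    // below are this sketch's assumption about that behavior.
    #[test]
    fn tokenize_hash_comment_mysql_sketch() {
        let sql = String::from("1 # trailing comment");
        let dialect = MySqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
        let expected = vec![
            Token::Number("1".to_string(), false),
            Token::Whitespace(Whitespace::Space),
            Token::Whitespace(Whitespace::SingleLineComment {
                prefix: "#".to_string(),
                comment: " trailing comment".to_string(),
            }),
        ];
        compare(expected, tokens);
    }
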
|
||
|
||
#[test]
|
||
fn tokenize_multiline_comment() {
|
||
let sql = String::from("0/*multi-line\n* /comment*/1");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment(
|
||
"multi-line\n* /comment".to_string(),
|
||
)),
|
||
Token::Number("1".to_string(), false),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_nested_multiline_comment() {
|
||
let dialect = GenericDialect {};
|
||
let test_cases = vec![
|
||
(
|
||
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
|
||
vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment(
|
||
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
|
||
)),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Div,
|
||
Token::Word(Word {
|
||
value: "comment".to_string(),
|
||
quote_style: None,
|
||
keyword: Keyword::COMMENT,
|
||
}),
|
||
Token::Mul,
|
||
Token::Div,
|
||
Token::Number("1".to_string(), false),
|
||
],
|
||
),
|
||
(
|
||
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
|
||
vec![
|
||
Token::Number("0".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment(
|
||
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
|
||
)),
|
||
Token::Number("1".to_string(), false),
|
||
],
|
||
),
|
||
(
|
||
"SELECT 1/* a /* b */ c */0",
|
||
vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("1".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
|
||
Token::Number("0".to_string(), false),
|
||
],
|
||
),
|
||
];
|
||
|
||
for (sql, expected) in test_cases {
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
compare(expected, tokens);
|
||
}
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_nested_multiline_comment_empty() {
|
||
let sql = "select 1/*/**/*/0";
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("select"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("1".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
|
||
Token::Number("0".to_string(), false),
|
||
];
|
||
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_nested_comments_if_not_supported() {
|
||
let dialect = SQLiteDialect {};
|
||
let sql = "SELECT 1/*/* nested comment */*/0";
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number("1".to_string(), false),
|
||
Token::Whitespace(Whitespace::MultiLineComment(
|
||
"/* nested comment ".to_string(),
|
||
)),
|
||
Token::Mul,
|
||
Token::Div,
|
||
Token::Number("0".to_string(), false),
|
||
];
|
||
|
||
compare(expected, tokens.unwrap());
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_multiline_comment_with_even_asterisks() {
|
||
let sql = String::from("\n/** Comment **/\n");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_unicode_whitespace() {
|
||
let sql = String::from(" \u{2003}\n");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_mismatched_quotes() {
|
||
let sql = String::from("\"foo");
|
||
|
||
let dialect = GenericDialect {};
|
||
let mut tokenizer = Tokenizer::new(&dialect, &sql);
|
||
assert_eq!(
|
||
tokenizer.tokenize(),
|
||
Err(TokenizerError {
|
||
message: "Expected close delimiter '\"' before EOF.".to_string(),
|
||
location: Location { line: 1, column: 1 },
|
||
})
|
||
);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_newlines() {
|
||
let sql = String::from("line1\nline2\rline3\r\nline4\r");
|
||
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_word("line1", None),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::make_word("line2", None),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::make_word("line3", None),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
Token::make_word("line4", None),
|
||
Token::Whitespace(Whitespace::Newline),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_mssql_top() {
|
||
let sql = "SELECT TOP 5 [bar] FROM foo";
|
||
let dialect = MsSqlDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("TOP"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Number(String::from("5"), false),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("bar", Some('[')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_keyword("FROM"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("foo", None),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_pg_regex_match() {
|
||
let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::Tilde,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("^a".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::TildeAsterisk,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("^a".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::ExclamationMarkTilde,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("^a".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::ExclamationMarkTildeAsterisk,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("^a".into()),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_pg_like_match() {
|
||
let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_keyword("SELECT"),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DoubleTilde,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("_a%".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::DoubleTildeAsterisk,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("_a%".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::ExclamationMarkDoubleTilde,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("_a%".into()),
|
||
Token::Comma,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word("col", None),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::ExclamationMarkDoubleTildeAsterisk,
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::SingleQuotedString("_a%".into()),
|
||
];
|
||
compare(expected, tokens);
|
||
}
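
    // A minimal illustrative sketch (not an upstream test), assuming the Postgres
    // dialect accepts '>' in `is_custom_operator_part`: operator characters
    // following '~' are folded into a single CustomBinaryOperator token via
    // `start_binop_opt`.
    #[test]
    fn tokenize_pg_custom_operator_sketch() {
        let sql = "SELECT a ~> b";
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("a", None),
            Token::Whitespace(Whitespace::Space),
            Token::CustomBinaryOperator("~>".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("b", None),
        ];
        compare(expected, tokens);
    }
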
|
||
|
||
#[test]
|
||
fn tokenize_quoted_identifier() {
|
||
let sql = r#" "a "" b" "a """ "c """"" "#;
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"a " b"#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"a ""#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"c """#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_snowflake_div() {
|
||
let sql = r#"field/1000"#;
|
||
let dialect = SnowflakeDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
|
||
let expected = vec![
|
||
Token::make_word(r#"field"#, None),
|
||
Token::Div,
|
||
Token::Number("1000".to_string(), false),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
#[test]
|
||
fn tokenize_quoted_identifier_with_no_escape() {
|
||
let sql = r#" "a "" b" "a """ "c """"" "#;
|
||
let dialect = GenericDialect {};
|
||
let tokens = Tokenizer::new(&dialect, sql)
|
||
.with_unescape(false)
|
||
.tokenize()
|
||
.unwrap();
|
||
let expected = vec![
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"a "" b"#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"a """#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
Token::make_word(r#"c """""#, Some('"')),
|
||
Token::Whitespace(Whitespace::Space),
|
||
];
|
||
compare(expected, tokens);
|
||
}
|
||
|
||
    #[test]
    fn tokenize_with_location() {
        let sql = "SELECT a,\n b";
        let dialect = GenericDialect {};
        let tokens = Tokenizer::new(&dialect, sql)
            .tokenize_with_location()
            .unwrap();
        let expected = vec![
            TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (1, 7).into(),
                (1, 8).into(),
            ),
            TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
            TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Newline),
                (1, 10).into(),
                (2, 1).into(),
            ),
            TokenWithSpan::at(
                Token::Whitespace(Whitespace::Space),
                (2, 1).into(),
                (2, 2).into(),
            ),
            TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
        ];
        compare(expected, tokens);
    }
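
    // Assert that the tokenizer output matches the expected token sequence.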
    fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
        //println!("------------------------------");
        //println!("tokens = {:?}", actual);
        //println!("expected = {:?}", expected);
        //println!("------------------------------");
        assert_eq!(expected, actual);
    }
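
    // Wrap `s` in single quotes and assert that unescaping yields `expected`
    // (`None` means the escape sequence is rejected).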
    fn check_unescape(s: &str, expected: Option<&str>) {
        let s = format!("'{s}'");
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };

        assert_eq!(
            unescape_single_quoted_string(&mut state),
            expected.map(|s| s.to_string())
        );
    }
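
    // Escape sequences recognized by `unescape_single_quoted_string`: simple escapes,
    // Unicode (\u / \U), hexadecimal (\x) and octal byte values.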
    #[test]
    fn test_unescape() {
        check_unescape(r"\b", Some("\u{0008}"));
        check_unescape(r"\f", Some("\u{000C}"));
        check_unescape(r"\t", Some("\t"));
        check_unescape(r"\r\n", Some("\r\n"));
        check_unescape(r"\/", Some("/"));
        check_unescape(r"/", Some("/"));
        check_unescape(r"\\", Some("\\"));

        // 16 and 32-bit hexadecimal Unicode character value
        check_unescape(r"\u0001", Some("\u{0001}"));
        check_unescape(r"\u4c91", Some("\u{4c91}"));
        check_unescape(r"\u4c916", Some("\u{4c91}6"));
        check_unescape(r"\u4c", None);
        check_unescape(r"\u0000", None);
        check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
        check_unescape(r"\U00110000", None);
        check_unescape(r"\U00000000", None);
        check_unescape(r"\u", None);
        check_unescape(r"\U", None);
        check_unescape(r"\U1010FFFF", None);

        // hexadecimal byte value
        check_unescape(r"\x4B", Some("\u{004b}"));
        check_unescape(r"\x4", Some("\u{0004}"));
        check_unescape(r"\x4L", Some("\u{0004}L"));
        check_unescape(r"\x", Some("x"));
        check_unescape(r"\xP", Some("xP"));
        check_unescape(r"\x0", None);
        check_unescape(r"\xCAD", None);
        check_unescape(r"\xA9", None);

        // octal byte value
        check_unescape(r"\1", Some("\u{0001}"));
        check_unescape(r"\12", Some("\u{000a}"));
        check_unescape(r"\123", Some("\u{0053}"));
        check_unescape(r"\1232", Some("\u{0053}2"));
        check_unescape(r"\4", Some("\u{0004}"));
        check_unescape(r"\45", Some("\u{0025}"));
        check_unescape(r"\450", Some("\u{0028}"));
        check_unescape(r"\603", None);
        check_unescape(r"\0", None);
        check_unescape(r"\080", None);

        // others
        check_unescape(r"\9", Some("9"));
        check_unescape(r"''", Some("'"));
        check_unescape(
            r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
            Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
        );
        check_unescape(r"Hello\0", None);
        check_unescape(r"Hello\xCADRust", None);
    }

    #[test]
    fn tokenize_numeric_prefix_trait() {
        #[derive(Debug)]
        struct NumericPrefixDialect;

        impl Dialect for NumericPrefixDialect {
            fn is_identifier_start(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '$'
            }

            fn is_identifier_part(&self, ch: char) -> bool {
                ch.is_ascii_lowercase()
                    || ch.is_ascii_uppercase()
                    || ch.is_ascii_digit()
                    || ch == '_'
                    || ch == '$'
                    || ch == '{'
                    || ch == '}'
            }

            fn supports_numeric_prefix(&self) -> bool {
                true
            }
        }

        tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
        tokenize_numeric_prefix_inner(&HiveDialect {});
        tokenize_numeric_prefix_inner(&MySqlDialect {});
    }
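
    // Even in dialects where identifiers may start with a digit, a standalone `1`
    // after FROM must still tokenize as a number.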
    fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
        let sql = r#"SELECT * FROM 1"#;
        let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Mul,
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::Number(String::from("1"), false),
        ];
        compare(expected, tokens);
    }
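
    // Backslash escapes in single-quoted strings (Snowflake-style), checked both with
    // and without unescaping, plus unterminated-literal errors and the behaviour of
    // dialects that do not interpret backslash escapes.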
    #[test]
    fn tokenize_quoted_string_escape() {
        let dialect = SnowflakeDialect {};
        for (sql, expected, expected_unescaped) in [
            (r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
            (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
            (r#"'\\'"#, r#"\\"#, r#"\"#),
            (
                r#"'\0\a\b\f\n\r\t\Z'"#,
                r#"\0\a\b\f\n\r\t\Z"#,
                "\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
            ),
            (r#"'\"'"#, r#"\""#, "\""),
            (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
            (r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
            (r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
            (r#"'\q'"#, r#"\q"#, r#"q"#),
            (r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
            (r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
        ] {
            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(false)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected.to_string())];
            compare(expected, tokens);

            let tokens = Tokenizer::new(&dialect, sql)
                .with_unescape(true)
                .tokenize()
                .unwrap();
            let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
            compare(expected, tokens);
        }

        for sql in [r#"'\'"#, r#"'ab\'"#] {
            let mut tokenizer = Tokenizer::new(&dialect, sql);
            assert_eq!(
                "Unterminated string literal",
                tokenizer.tokenize().unwrap_err().message.as_str(),
            );
        }

        // Non-escape dialect
        for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
            let dialect = GenericDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }

        // MySQL special case for LIKE escapes
        for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
            let dialect = MySqlDialect {};
            let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();

            let expected = vec![Token::SingleQuotedString(expected.to_string())];

            compare(expected, tokens);
        }
    }
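
    // BigQuery triple-quoted strings, exercised for both quote characters with and
    // without unescaping; a dialect without triple-quote support treats `''''''` as a
    // single-quoted string containing two quotes.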
    #[test]
    fn tokenize_triple_quoted_string() {
        fn check<F>(
            q: char, // The quote character to test
            r: char, // An alternate quote character.
            quote_token: F,
        ) where
            F: Fn(String) -> Token,
        {
            let dialect = BigQueryDialect {};

            for (sql, expected, expected_unescaped) in [
                // Empty string
                (format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
                // Should not count escaped quote as end of string.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
                    format!(r#"ab{q}{q}\{q}{q}cd"#),
                    format!(r#"ab{q}{q}{q}{q}cd"#),
                ),
                // Simple string
                (
                    format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
                    "abc".into(),
                    "abc".into(),
                ),
                // Mix single-double quotes unescaped.
                (
                    format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                    format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
                ),
                // Escaped quote.
                (
                    format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
                    format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
                    format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
                ),
                // backslash-escaped quote characters.
                (
                    format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
                    r#"a\'\'b\'c\'d"#.into(),
                    r#"a''b'c'd"#.into(),
                ),
                // backslash-escaped characters
                (
                    format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
                    r#"abc\0\n\rdef"#.into(),
                    "abc\0\n\rdef".into(),
                ),
            ] {
                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(false)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected.to_string())];
                compare(expected, tokens);

                let tokens = Tokenizer::new(&dialect, sql.as_str())
                    .with_unescape(true)
                    .tokenize()
                    .unwrap();
                let expected = vec![quote_token(expected_unescaped.to_string())];
                compare(expected, tokens);
            }

            for sql in [
                format!(r#"{q}{q}{q}{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
                format!(r#"{q}{q}{q}{q}"#),
                format!(r#"{q}{q}{q}{r}{r}"#),
                format!(r#"{q}{q}{q}abc{q}"#),
                format!(r#"{q}{q}{q}abc{q}{q}"#),
                format!(r#"{q}{q}{q}abc"#),
            ] {
                let dialect = BigQueryDialect {};
                let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
                assert_eq!(
                    "Unterminated string literal",
                    tokenizer.tokenize().unwrap_err().message.as_str(),
                );
            }
        }

        check('"', '\'', Token::TripleDoubleQuotedString);

        check('\'', '"', Token::TripleSingleQuotedString);

        let dialect = BigQueryDialect {};

        let sql = r#"""''"#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::DoubleQuotedString("".to_string()),
            Token::SingleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        let sql = r#"''"""#;
        let tokens = Tokenizer::new(&dialect, sql)
            .with_unescape(true)
            .tokenize()
            .unwrap();
        let expected = vec![
            Token::SingleQuotedString("".to_string()),
            Token::DoubleQuotedString("".to_string()),
        ];
        compare(expected, tokens);

        // Non-triple quoted string dialect
        let dialect = SnowflakeDialect {};
        let sql = r#"''''''"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![Token::SingleQuotedString("''".to_string())];
        compare(expected, tokens);
    }
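
    // MySQL account names: backquoted user and host parts joined by `@`.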
    #[test]
    fn test_mysql_users_grantees() {
        let dialect = MySqlDialect {};

        let sql = "CREATE USER `root`@`%`";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("CREATE"),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("USER"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("root", Some('`')),
            Token::AtSign,
            Token::make_word("%", Some('`')),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_string_literal() {
        let dialect = MySqlDialect {};

        let sql = "SELECT @'1'";
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::SingleQuotedString("1".to_string()),
        ];
        compare(expected, tokens);
    }

    #[test]
    fn test_postgres_abs_without_space_and_quoted_column() {
        let dialect = MySqlDialect {};

        let sql = r#"SELECT @"bar" FROM foo"#;
        let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::AtSign,
            Token::DoubleQuotedString("bar".to_string()),
            Token::Whitespace(Whitespace::Space),
            Token::make_keyword("FROM"),
            Token::Whitespace(Whitespace::Space),
            Token::make_word("foo", None),
        ];
        compare(expected, tokens);
    }
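
    // National string literals (n'...'): whether the backslash escapes the following
    // quote depends on the dialect's backslash-escape support.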
    #[test]
    fn test_national_strings_backslash_escape_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\'",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("''\\".to_string()),
                ],
            );
    }

    #[test]
    fn test_national_strings_backslash_escape_supported() {
        all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
            .tokenizes_to(
                "select n'''''\\''",
                vec![
                    Token::make_keyword("select"),
                    Token::Whitespace(Whitespace::Space),
                    Token::NationalStringLiteral("'''".to_string()),
                ],
            );
    }

    #[test]
    fn test_string_escape_constant_not_supported() {
        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("e", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );

        all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'...'",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::make_word("E", None),
                Token::SingleQuotedString("...".to_string()),
            ],
        );
    }

    #[test]
    fn test_string_escape_constant_supported() {
        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select e'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
            "select E'\\''",
            vec![
                Token::make_keyword("select"),
                Token::Whitespace(Whitespace::Space),
                Token::EscapedStringLiteral("'".to_string()),
            ],
        );
    }
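
    // When a dialect requires whitespace after `--`, the input `--'abc'` is two minus
    // tokens followed by a string literal rather than a comment.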
    #[test]
    fn test_whitespace_required_after_single_line_comment() {
        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                    Token::SingleQuotedString("abc".to_string()),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Minus,
                    Token::Minus,
                ],
            );
    }

    #[test]
    fn test_whitespace_not_required_after_single_line_comment() {
        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT -- 'abc'",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: " 'abc'".to_string(),
                    }),
                ],
            );

        all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
            .tokenizes_to(
                "SELECT --",
                vec![
                    Token::make_keyword("SELECT"),
                    Token::Whitespace(Whitespace::Space),
                    Token::Whitespace(Whitespace::SingleLineComment {
                        prefix: "--".to_string(),
                        comment: "".to_string(),
                    }),
                ],
            );
    }
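
    // Dialects with numeric-prefix support keep `123abc` as a single word, while a bare
    // `12e34` still tokenizes as a number; after a period even `12e34` becomes a word.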
    #[test]
    fn test_tokenize_identifiers_numeric_prefix() {
        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix())
            .tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.12e34",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("12e34", None),
            ],
        );

        all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
            "t.1two3",
            vec![
                Token::make_word("t", None),
                Token::Period,
                Token::make_word("1two3", None),
            ],
        );
    }

    #[test]
    fn tokenize_period_underscore() {
        let sql = String::from("SELECT table._col");
        // a dialect that supports underscores in numeric literals
        let dialect = PostgreSqlDialect {};
        let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();

        let expected = vec![
            Token::make_keyword("SELECT"),
            Token::Whitespace(Whitespace::Space),
            Token::Word(Word {
                value: "table".to_string(),
                quote_style: None,
                keyword: Keyword::TABLE,
            }),
            Token::Period,
            Token::Word(Word {
                value: "_col".to_string(),
                quote_style: None,
                keyword: Keyword::NoKeyword,
            }),
        ];

        compare(expected, tokens);

        let sql = String::from("SELECT ._123");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }

        let sql = String::from("SELECT ._abc");
        if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
            panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
        }
    }
}