// datafusion-sqlparse/src/tokenizer.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! SQL Tokenizer
//!
//! The tokenizer (a.k.a. lexer) converts a string into a sequence of tokens.
//!
//! The tokens then form the input for the parser, which outputs an Abstract Syntax Tree (AST).
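//!
//! # Example
//!
//! A minimal end-to-end sketch (using `GenericDialect`; the expected token list
//! mirrors what the tokenizer actually produces, including whitespace tokens):
//!
//! ```
//! # use sqlparser::dialect::GenericDialect;
//! # use sqlparser::tokenizer::{Token, Tokenizer, Whitespace};
//! let dialect = GenericDialect {};
//! let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize().unwrap();
//! assert_eq!(tokens, vec![
//!     Token::make_word("SELECT", None),
//!     Token::Whitespace(Whitespace::Space),
//!     Token::Number("1".to_string(), false),
//! ]);
//! ```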
#[cfg(not(feature = "std"))]
use alloc::{
borrow::ToOwned,
format,
string::{String, ToString},
vec,
vec::Vec,
};
use core::iter::Peekable;
use core::num::NonZeroU8;
use core::str::Chars;
use core::{cmp, fmt};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
#[cfg(feature = "visitor")]
use sqlparser_derive::{Visit, VisitMut};
use crate::dialect::Dialect;
use crate::dialect::{
BigQueryDialect, DuckDbDialect, GenericDialect, MySqlDialect, PostgreSqlDialect,
SnowflakeDialect,
};
use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
use crate::{ast::DollarQuotedString, dialect::HiveDialect};
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Token {
/// An end-of-file marker, not a real token
EOF,
/// A keyword (like SELECT) or an optionally quoted SQL identifier
Word(Word),
/// An unsigned numeric literal
Number(String, bool),
/// A character that could not be tokenized
Char(char),
/// Single quoted string: e.g. `'string'`
SingleQuotedString(String),
/// Double quoted string: e.g. `"string"`
DoubleQuotedString(String),
/// Triple single quoted string: e.g. `'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedString(String),
/// Triple double quoted string: e.g. `"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedString(String),
/// Dollar quoted string: e.g. `$$string$$` or `$tag_name$string$tag_name$`
DollarQuotedString(DollarQuotedString),
/// Byte string literal: e.g. `b'string'` or `B'string'` (note that some backends, such as
/// PostgreSQL, may treat this syntax as a bit string literal instead, e.g. `b'10010101'`)
SingleQuotedByteStringLiteral(String),
/// Byte string literal: e.g. `b"string"` or `B"string"`
DoubleQuotedByteStringLiteral(String),
/// Triple single quoted literal with byte string prefix. Example `B'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedByteStringLiteral(String),
/// Triple double quoted literal with byte string prefix. Example `B"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedByteStringLiteral(String),
/// Single quoted literal with raw string prefix. Example `R'abc'`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
SingleQuotedRawStringLiteral(String),
/// Double quoted literal with raw string prefix. Example `R"abc"`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
DoubleQuotedRawStringLiteral(String),
/// Triple single quoted literal with raw string prefix. Example `R'''abc'''`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleSingleQuotedRawStringLiteral(String),
/// Triple double quoted literal with raw string prefix. Example `R"""abc"""`
/// [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#quoted_literals)
TripleDoubleQuotedRawStringLiteral(String),
/// "National" string literal: i.e: N'string'
NationalStringLiteral(String),
/// "escaped" string literal, which are an extension to the SQL standard: i.e: e'first \n second' or E 'first \n second'
EscapedStringLiteral(String),
/// Unicode string literal: i.e: U&'first \000A second'
UnicodeStringLiteral(String),
/// Hexadecimal string literal: i.e.: X'deadbeef'
HexStringLiteral(String),
/// Comma
Comma,
/// Whitespace (space, tab, etc)
Whitespace(Whitespace),
/// Double equals sign `==`
DoubleEq,
/// Equality operator `=`
Eq,
/// Not Equals operator `<>` (or `!=` in some dialects)
Neq,
/// Less Than operator `<`
Lt,
/// Greater Than operator `>`
Gt,
/// Less Than Or Equals operator `<=`
LtEq,
/// Greater Than Or Equals operator `>=`
GtEq,
/// Spaceship operator `<=>`
Spaceship,
/// Plus operator `+`
Plus,
/// Minus operator `-`
Minus,
/// Multiplication operator `*`
Mul,
/// Division operator `/`
Div,
/// Integer division operator `//` in DuckDB
DuckIntDiv,
/// Modulo Operator `%`
Mod,
/// String concatenation `||`
StringConcat,
/// Left parenthesis `(`
LParen,
/// Right parenthesis `)`
RParen,
/// Period (used for compound identifiers or projections into nested types)
Period,
/// Colon `:`
Colon,
/// DoubleColon `::` (used for casting in PostgreSQL)
DoubleColon,
/// Assignment `:=` (used for keyword argument in DuckDB macros and some functions, and for variable declarations in DuckDB and Snowflake)
Assignment,
/// SemiColon `;` used as separator for COPY and payload
SemiColon,
/// Backslash `\` used in terminating the COPY payload with `\.`
Backslash,
/// Left bracket `[`
LBracket,
/// Right bracket `]`
RBracket,
/// Ampersand `&`
Ampersand,
/// Pipe `|`
Pipe,
/// Caret `^`
Caret,
/// Left brace `{`
LBrace,
/// Right brace `}`
RBrace,
/// Right Arrow `=>`
RArrow,
/// Sharp `#` used for PostgreSQL Bitwise XOR operator, also PostgreSQL/Redshift geometrical unary/binary operator (Number of points in path or polygon/Intersection)
Sharp,
/// `##` PostgreSQL/Redshift geometrical binary operator (Point of closest proximity)
DoubleSharp,
/// Tilde `~` used for PostgreSQL Bitwise NOT operator or case sensitive match regular expression operator
Tilde,
/// `~*` , a case insensitive match regular expression operator in PostgreSQL
TildeAsterisk,
/// `!~` , a case sensitive not match regular expression operator in PostgreSQL
ExclamationMarkTilde,
/// `!~*` , a case insensitive not match regular expression operator in PostgreSQL
ExclamationMarkTildeAsterisk,
/// `~~`, a case sensitive match pattern operator in PostgreSQL
DoubleTilde,
/// `~~*`, a case insensitive match pattern operator in PostgreSQL
DoubleTildeAsterisk,
/// `!~~`, a case sensitive not match pattern operator in PostgreSQL
ExclamationMarkDoubleTilde,
/// `!~~*`, a case insensitive not match pattern operator in PostgreSQL
ExclamationMarkDoubleTildeAsterisk,
/// `<<`, a bitwise shift left operator in PostgreSQL
ShiftLeft,
/// `>>`, a bitwise shift right operator in PostgreSQL
ShiftRight,
/// `&&`, an overlap operator in PostgreSQL
Overlap,
/// Exclamation Mark `!` used for PostgreSQL factorial operator
ExclamationMark,
/// Double Exclamation Mark `!!` used for PostgreSQL prefix factorial operator
DoubleExclamationMark,
/// AtSign `@` used for PostgreSQL abs operator, also PostgreSQL/Redshift geometrical unary/binary operator (Center, Contained or on)
AtSign,
/// `^@`, a "starts with" string operator in PostgreSQL
CaretAt,
/// `|/`, a square root math operator in PostgreSQL
PGSquareRoot,
/// `||/`, a cube root math operator in PostgreSQL
PGCubeRoot,
/// `?` or `$`, a prepared statement argument placeholder
Placeholder(String),
/// `->`, used as an operator to extract a JSON field in PostgreSQL
Arrow,
/// `->>`, used as an operator to extract a JSON field as text in PostgreSQL
LongArrow,
/// `#>`, extracts JSON sub-object at the specified path
HashArrow,
/// `@-@` PostgreSQL/Redshift geometrical unary operator (Length or circumference)
AtDashAt,
/// `?-` PostgreSQL/Redshift geometrical unary/binary operator (Is horizontal?/Are horizontally aligned?)
QuestionMarkDash,
/// `&<` PostgreSQL/Redshift geometrical binary operator (Overlaps to left?)
AmpersandLeftAngleBracket,
/// `&>` PostgreSQL/Redshift geometrical binary operator (Overlaps to right?)
AmpersandRightAngleBracket,
/// `&<|` PostgreSQL/Redshift geometrical binary operator (Does not extend above?)
AmpersandLeftAngleBracketVerticalBar,
/// `|&>` PostgreSQL/Redshift geometrical binary operator (Does not extend below?)
VerticalBarAmpersandRightAngleBracket,
/// `<->` PostgreSQL/Redshift geometrical binary operator (Distance between)
TwoWayArrow,
/// `<^` PostgreSQL/Redshift geometrical binary operator (Is below?)
LeftAngleBracketCaret,
/// `>^` PostgreSQL/Redshift geometrical binary operator (Is above?)
RightAngleBracketCaret,
/// `?#` PostgreSQL/Redshift geometrical binary operator (Intersects or overlaps)
QuestionMarkSharp,
/// `?-|` PostgreSQL/Redshift geometrical binary operator (Is perpendicular?)
QuestionMarkDashVerticalBar,
/// `?||` PostgreSQL/Redshift geometrical binary operator (Are parallel?)
QuestionMarkDoubleVerticalBar,
/// `~=` PostgreSQL/Redshift geometrical binary operator (Same as)
TildeEqual,
/// `<<|` PostgreSQL/Redshift geometrical binary operator (Is strictly below?)
ShiftLeftVerticalBar,
/// `|>>` PostgreSQL/Redshift geometrical binary operator (Is strictly above?)
VerticalBarShiftRight,
/// `|>` BigQuery pipe operator
VerticalBarRightAngleBracket,
/// `#>>`, extracts JSON sub-object at the specified path as text
HashLongArrow,
/// jsonb @> jsonb -> boolean: Test whether left json contains the right json
AtArrow,
/// jsonb <@ jsonb -> boolean: Test whether right json contains the left json
ArrowAt,
/// jsonb #- text[] -> jsonb: Deletes the field or array element at the specified
/// path, where path elements can be either field keys or array indexes.
HashMinus,
/// jsonb @? jsonpath -> boolean: Does JSON path return any item for the specified
/// JSON value?
AtQuestion,
/// jsonb @@ jsonpath -> boolean: Returns the result of a JSON path predicate check
/// for the specified JSON value. Only the first item of the result is taken into
/// account. If the result is not Boolean, then NULL is returned.
AtAt,
/// jsonb ? text -> boolean: Checks whether the string exists as a top-level key within the
/// jsonb object
Question,
/// jsonb ?& text[] -> boolean: Check whether all members of the text array exist as top-level
/// keys within the jsonb object
QuestionAnd,
/// jsonb ?| text[] -> boolean: Check whether any member of the text array exists as top-level
/// keys within the jsonb object
QuestionPipe,
/// Custom binary operator
/// This is used to represent any custom binary operator that is not part of the SQL standard.
/// PostgreSQL allows defining custom binary operators using CREATE OPERATOR.
CustomBinaryOperator(String),
}
impl fmt::Display for Token {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Token::EOF => f.write_str("EOF"),
Token::Word(ref w) => write!(f, "{w}"),
Token::Number(ref n, l) => write!(f, "{}{long}", n, long = if *l { "L" } else { "" }),
Token::Char(ref c) => write!(f, "{c}"),
Token::SingleQuotedString(ref s) => write!(f, "'{s}'"),
Token::TripleSingleQuotedString(ref s) => write!(f, "'''{s}'''"),
Token::DoubleQuotedString(ref s) => write!(f, "\"{s}\""),
Token::TripleDoubleQuotedString(ref s) => write!(f, "\"\"\"{s}\"\"\""),
Token::DollarQuotedString(ref s) => write!(f, "{s}"),
Token::NationalStringLiteral(ref s) => write!(f, "N'{s}'"),
Token::EscapedStringLiteral(ref s) => write!(f, "E'{s}'"),
Token::UnicodeStringLiteral(ref s) => write!(f, "U&'{s}'"),
Token::HexStringLiteral(ref s) => write!(f, "X'{s}'"),
Token::SingleQuotedByteStringLiteral(ref s) => write!(f, "B'{s}'"),
Token::TripleSingleQuotedByteStringLiteral(ref s) => write!(f, "B'''{s}'''"),
Token::DoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"{s}\""),
Token::TripleDoubleQuotedByteStringLiteral(ref s) => write!(f, "B\"\"\"{s}\"\"\""),
Token::SingleQuotedRawStringLiteral(ref s) => write!(f, "R'{s}'"),
Token::DoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"{s}\""),
Token::TripleSingleQuotedRawStringLiteral(ref s) => write!(f, "R'''{s}'''"),
Token::TripleDoubleQuotedRawStringLiteral(ref s) => write!(f, "R\"\"\"{s}\"\"\""),
Token::Comma => f.write_str(","),
Token::Whitespace(ws) => write!(f, "{ws}"),
Token::DoubleEq => f.write_str("=="),
Token::Spaceship => f.write_str("<=>"),
Token::Eq => f.write_str("="),
Token::Neq => f.write_str("<>"),
Token::Lt => f.write_str("<"),
Token::Gt => f.write_str(">"),
Token::LtEq => f.write_str("<="),
Token::GtEq => f.write_str(">="),
Token::Plus => f.write_str("+"),
Token::Minus => f.write_str("-"),
Token::Mul => f.write_str("*"),
Token::Div => f.write_str("/"),
Token::DuckIntDiv => f.write_str("//"),
Token::StringConcat => f.write_str("||"),
Token::Mod => f.write_str("%"),
Token::LParen => f.write_str("("),
Token::RParen => f.write_str(")"),
Token::Period => f.write_str("."),
Token::Colon => f.write_str(":"),
Token::DoubleColon => f.write_str("::"),
Token::Assignment => f.write_str(":="),
Token::SemiColon => f.write_str(";"),
Token::Backslash => f.write_str("\\"),
Token::LBracket => f.write_str("["),
Token::RBracket => f.write_str("]"),
Token::Ampersand => f.write_str("&"),
Token::Caret => f.write_str("^"),
Token::Pipe => f.write_str("|"),
Token::LBrace => f.write_str("{"),
Token::RBrace => f.write_str("}"),
Token::RArrow => f.write_str("=>"),
Token::Sharp => f.write_str("#"),
Token::DoubleSharp => f.write_str("##"),
Token::ExclamationMark => f.write_str("!"),
Token::DoubleExclamationMark => f.write_str("!!"),
Token::Tilde => f.write_str("~"),
Token::TildeAsterisk => f.write_str("~*"),
Token::ExclamationMarkTilde => f.write_str("!~"),
Token::ExclamationMarkTildeAsterisk => f.write_str("!~*"),
Token::DoubleTilde => f.write_str("~~"),
Token::DoubleTildeAsterisk => f.write_str("~~*"),
Token::ExclamationMarkDoubleTilde => f.write_str("!~~"),
Token::ExclamationMarkDoubleTildeAsterisk => f.write_str("!~~*"),
Token::AtSign => f.write_str("@"),
Token::CaretAt => f.write_str("^@"),
Token::ShiftLeft => f.write_str("<<"),
Token::ShiftRight => f.write_str(">>"),
Token::Overlap => f.write_str("&&"),
Token::PGSquareRoot => f.write_str("|/"),
Token::PGCubeRoot => f.write_str("||/"),
Token::AtDashAt => f.write_str("@-@"),
Token::QuestionMarkDash => f.write_str("?-"),
Token::AmpersandLeftAngleBracket => f.write_str("&<"),
Token::AmpersandRightAngleBracket => f.write_str("&>"),
Token::AmpersandLeftAngleBracketVerticalBar => f.write_str("&<|"),
Token::VerticalBarAmpersandRightAngleBracket => f.write_str("|&>"),
Token::VerticalBarRightAngleBracket => f.write_str("|>"),
Token::TwoWayArrow => f.write_str("<->"),
Token::LeftAngleBracketCaret => f.write_str("<^"),
Token::RightAngleBracketCaret => f.write_str(">^"),
Token::QuestionMarkSharp => f.write_str("?#"),
Token::QuestionMarkDashVerticalBar => f.write_str("?-|"),
Token::QuestionMarkDoubleVerticalBar => f.write_str("?||"),
Token::TildeEqual => f.write_str("~="),
Token::ShiftLeftVerticalBar => f.write_str("<<|"),
Token::VerticalBarShiftRight => f.write_str("|>>"),
Token::Placeholder(ref s) => write!(f, "{s}"),
Token::Arrow => write!(f, "->"),
Token::LongArrow => write!(f, "->>"),
Token::HashArrow => write!(f, "#>"),
Token::HashLongArrow => write!(f, "#>>"),
Token::AtArrow => write!(f, "@>"),
Token::ArrowAt => write!(f, "<@"),
Token::HashMinus => write!(f, "#-"),
Token::AtQuestion => write!(f, "@?"),
Token::AtAt => write!(f, "@@"),
Token::Question => write!(f, "?"),
Token::QuestionAnd => write!(f, "?&"),
Token::QuestionPipe => write!(f, "?|"),
Token::CustomBinaryOperator(s) => f.write_str(s),
}
}
}
impl Token {
pub fn make_keyword(keyword: &str) -> Self {
Token::make_word(keyword, None)
}
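/// Create a word token from `word`, resolving it to a known [`Keyword`]
/// only when it is unquoted; quoted identifiers are never keywords.
///
/// A minimal sketch of the behavior (uses the crate's public `keywords` module):
///
/// ```
/// # use sqlparser::keywords::Keyword;
/// # use sqlparser::tokenizer::Token;
/// if let Token::Word(w) = Token::make_word("SELECT", None) {
///     assert_eq!(w.keyword, Keyword::SELECT);
/// }
/// // A quoted word is never resolved to a keyword
/// if let Token::Word(w) = Token::make_word("SELECT", Some('"')) {
///     assert_eq!(w.keyword, Keyword::NoKeyword);
/// }
/// ```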
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
let word_uppercase = word.to_uppercase();
Token::Word(Word {
value: word.to_string(),
quote_style,
keyword: if quote_style.is_none() {
let keyword = ALL_KEYWORDS.binary_search(&word_uppercase.as_str());
keyword.map_or(Keyword::NoKeyword, |x| ALL_KEYWORDS_INDEX[x])
} else {
Keyword::NoKeyword
},
})
}
}
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Word {
/// The value of the token, without the enclosing quotes, and with the
/// escape sequences (if any) processed (TODO: escapes are not handled)
pub value: String,
/// An identifier can be "quoted" (`<delimited identifier>` in ANSI parlance).
/// The standard and most implementations allow using double quotes for this,
/// but some implementations support other quoting styles as well (e.g. [MS SQL])
pub quote_style: Option<char>,
/// If the word was not quoted and it matched one of the known keywords,
/// this will have one of the values from `dialect::keywords`, otherwise `Keyword::NoKeyword`
pub keyword: Keyword,
}
impl fmt::Display for Word {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.quote_style {
Some(s) if s == '"' || s == '[' || s == '`' => {
write!(f, "{}{}{}", s, self.value, Word::matching_end_quote(s))
}
None => f.write_str(&self.value),
_ => panic!("Unexpected quote_style!"),
}
}
}
impl Word {
fn matching_end_quote(ch: char) -> char {
match ch {
'"' => '"', // ANSI and most dialects
'[' => ']', // MS SQL
'`' => '`', // MySQL
_ => panic!("unexpected quoting style!"),
}
}
}
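/// Whitespace and comments, which the tokenizer emits as tokens rather than discarding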
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub enum Whitespace {
Space,
Newline,
Tab,
SingleLineComment { comment: String, prefix: String },
MultiLineComment(String),
}
impl fmt::Display for Whitespace {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Whitespace::Space => f.write_str(" "),
Whitespace::Newline => f.write_str("\n"),
Whitespace::Tab => f.write_str("\t"),
Whitespace::SingleLineComment { prefix, comment } => write!(f, "{prefix}{comment}"),
Whitespace::MultiLineComment(s) => write!(f, "/*{s}*/"),
}
}
}
/// Location in input string
///
/// # Create an "empty" (unknown) `Location`
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::empty();
/// ```
///
/// # Create a `Location` from a line and column
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::new(1, 1);
/// ```
///
/// # Create a `Location` from a pair
/// ```
/// # use sqlparser::tokenizer::Location;
/// let location = Location::from((1, 1));
/// ```
#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Location {
/// Line number, starting from 1.
///
/// Note: Line 0 is used for empty spans
pub line: u64,
/// Line column, starting from 1.
///
/// Note: Column 0 is used for empty spans
pub column: u64,
}
impl fmt::Display for Location {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.line == 0 {
return Ok(());
}
write!(f, " at Line: {}, Column: {}", self.line, self.column)
}
}
impl fmt::Debug for Location {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Location({},{})", self.line, self.column)
}
}
impl Location {
/// Return an "empty" / unknown location
pub fn empty() -> Self {
Self { line: 0, column: 0 }
}
/// Create a new `Location` for a given line and column
pub fn new(line: u64, column: u64) -> Self {
Self { line, column }
}
/// Create a new location for a given line and column
///
/// Alias for [`Self::new`]
// TODO: remove / deprecate in favor of `new` for consistency?
pub fn of(line: u64, column: u64) -> Self {
Self::new(line, column)
}
/// Combine self and `end` into a new `Span`
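///
/// A short sketch:
///
/// ```
/// # use sqlparser::tokenizer::{Location, Span};
/// let span = Location::new(1, 1).span_to(Location::new(1, 5));
/// assert_eq!(span, Span::new(Location::new(1, 1), Location::new(1, 5)));
/// ```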
pub fn span_to(self, end: Self) -> Span {
Span { start: self, end }
}
}
impl From<(u64, u64)> for Location {
fn from((line, column): (u64, u64)) -> Self {
Self { line, column }
}
}
/// A span represents a linear portion of the input string (start, end)
///
/// See [Spanned](crate::ast::Spanned) for more information.
#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct Span {
pub start: Location,
pub end: Location,
}
impl fmt::Debug for Span {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "Span({:?}..{:?})", self.start, self.end)
}
}
impl Span {
// An empty span (0, 0) -> (0, 0)
// We need a const instance for pattern matching
const EMPTY: Span = Self::empty();
/// Create a new span from a start and end [`Location`]
pub fn new(start: Location, end: Location) -> Span {
Span { start, end }
}
/// Returns an empty span `(0, 0) -> (0, 0)`
///
/// Empty spans represent no knowledge of source location
/// See [Spanned](crate::ast::Spanned) for more information.
pub const fn empty() -> Span {
Span {
start: Location { line: 0, column: 0 },
end: Location { line: 0, column: 0 },
}
}
/// Returns the smallest Span that contains both `self` and `other`
/// If either span is [Span::empty], the other span is returned
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Span, Location};
/// // line 1, column 1 -> line 2, column 5
/// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5));
/// // line 2, column 3 -> line 3, column 7
/// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7));
/// // Union of the two is the min/max of the two spans
/// // line 1, column 1 -> line 3, column 7
/// let union = span1.union(&span2);
/// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7)));
/// ```
pub fn union(&self, other: &Span) -> Span {
// If either span is empty, return the other
// this prevents propagating (0, 0) through the tree
match (self, other) {
(&Span::EMPTY, _) => *other,
(_, &Span::EMPTY) => *self,
_ => Span {
start: cmp::min(self.start, other.start),
end: cmp::max(self.end, other.end),
},
}
}
/// Same as [Span::union] for `Option<Span>`
///
/// If `other` is `None`, `self` is returned
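///
/// A short sketch:
///
/// ```
/// # use sqlparser::tokenizer::{Span, Location};
/// let span = Span::new(Location::new(1, 1), Location::new(1, 2));
/// assert_eq!(span.union_opt(&None), span);
/// assert_eq!(span.union_opt(&Some(span)), span);
/// ```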
pub fn union_opt(&self, other: &Option<Span>) -> Span {
match other {
Some(other) => self.union(other),
None => *self,
}
}
/// Return the [Span::union] of all spans in the iterator
///
/// If the iterator is empty, an empty span is returned
///
/// # Example
/// ```
/// # use sqlparser::tokenizer::{Span, Location};
/// let spans = vec![
/// Span::new(Location::new(1, 1), Location::new(2, 5)),
/// Span::new(Location::new(2, 3), Location::new(3, 7)),
/// Span::new(Location::new(3, 1), Location::new(4, 2)),
/// ];
/// // line 1, column 1 -> line 4, column 2
/// assert_eq!(
/// Span::union_iter(spans),
/// Span::new(Location::new(1, 1), Location::new(4, 2))
/// );
/// ```
pub fn union_iter<I: IntoIterator<Item = Span>>(iter: I) -> Span {
iter.into_iter()
.reduce(|acc, item| acc.union(&item))
.unwrap_or(Span::empty())
}
}
/// Backwards compatibility alias for [`TokenWithSpan`]
#[deprecated(since = "0.53.0", note = "please use `TokenWithSpan` instead")]
pub type TokenWithLocation = TokenWithSpan;
/// A [Token] with [Span] attached to it
///
/// This is used to track the location of a token in the input string
///
/// # Examples
/// ```
/// # use sqlparser::tokenizer::{Location, Span, Token, TokenWithSpan};
/// // comma @ line 1, column 10
/// let tok1 = TokenWithSpan::new(
/// Token::Comma,
/// Span::new(Location::new(1, 10), Location::new(1, 11)),
/// );
/// assert_eq!(tok1, Token::Comma); // can compare the token
///
/// // comma @ line 2, column 20
/// let tok2 = TokenWithSpan::new(
/// Token::Comma,
/// Span::new(Location::new(2, 20), Location::new(2, 21)),
/// );
/// // same token but different locations are not equal
/// assert_ne!(tok1, tok2);
/// ```
#[derive(Debug, Clone, Hash, Ord, PartialOrd, Eq, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
pub struct TokenWithSpan {
pub token: Token,
pub span: Span,
}
impl TokenWithSpan {
/// Create a new [`TokenWithSpan`] from a [`Token`] and a [`Span`]
pub fn new(token: Token, span: Span) -> Self {
Self { token, span }
}
/// Wrap a token with an empty span
pub fn wrap(token: Token) -> Self {
Self::new(token, Span::empty())
}
/// Wrap a token with a location from `start` to `end`
pub fn at(token: Token, start: Location, end: Location) -> Self {
Self::new(token, Span::new(start, end))
}
/// Return an EOF token with no location
pub fn new_eof() -> Self {
Self::wrap(Token::EOF)
}
}
impl PartialEq<Token> for TokenWithSpan {
fn eq(&self, other: &Token) -> bool {
&self.token == other
}
}
impl PartialEq<TokenWithSpan> for Token {
fn eq(&self, other: &TokenWithSpan) -> bool {
self == &other.token
}
}
impl fmt::Display for TokenWithSpan {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
self.token.fmt(f)
}
}
/// Tokenizer error
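///
/// A sketch of how an error displays (the fields are public, so one can be
/// constructed directly):
///
/// ```
/// # use sqlparser::tokenizer::{TokenizerError, Location};
/// let err = TokenizerError {
///     message: "Unterminated string literal".to_string(),
///     location: Location::new(1, 7),
/// };
/// assert_eq!(
///     err.to_string(),
///     "Unterminated string literal at Line: 1, Column: 7"
/// );
/// ```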
#[derive(Debug, PartialEq, Eq)]
pub struct TokenizerError {
pub message: String,
pub location: Location,
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}{}", self.message, self.location,)
}
}
#[cfg(feature = "std")]
impl std::error::Error for TokenizerError {}
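/// Internal tokenizer cursor: a peekable character stream that tracks the
/// current line and column as characters are consumed (a newline advances the
/// line and resets the column to 1).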
struct State<'a> {
peekable: Peekable<Chars<'a>>,
pub line: u64,
pub col: u64,
}
impl State<'_> {
/// return the next character and advance the stream
pub fn next(&mut self) -> Option<char> {
match self.peekable.next() {
None => None,
Some(s) => {
if s == '\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
Some(s)
}
}
}
/// return the next character but do not advance the stream
pub fn peek(&mut self) -> Option<&char> {
self.peekable.peek()
}
pub fn location(&self) -> Location {
Location {
line: self.line,
column: self.col,
}
}
}
/// Represents how many quote characters enclose a string literal.
#[derive(Copy, Clone)]
enum NumStringQuoteChars {
/// e.g. `"abc"`, `'abc'`, `r'abc'`
One,
/// e.g. `"""abc"""`, `'''abc'''`, `r'''abc'''`
Many(NonZeroU8),
}
/// Settings for tokenizing a quoted string literal.
struct TokenizeQuotedStringSettings {
/// The character used to quote the string.
quote_style: char,
/// Represents how many quote characters enclose the string literal.
num_quote_chars: NumStringQuoteChars,
/// The number of opening quotes left to consume before parsing
/// the remaining string literal.
/// For example: given the initial string `"""abc"""`, if the caller has
/// already consumed the first quote, this value is set to 2, flagging
/// that only the two remaining opening quotes need to be consumed.
num_opening_quotes_to_consume: u8,
/// True if the string uses backslash escaping of special characters,
/// e.g. `'abc\ndef\'ghi'`
backslash_escape: bool,
}
/// SQL Tokenizer
pub struct Tokenizer<'a> {
dialect: &'a dyn Dialect,
query: &'a str,
/// If true (the default), the tokenizer will un-escape literal
/// SQL strings. See [`Tokenizer::with_unescape`] for more details.
unescape: bool,
}
impl<'a> Tokenizer<'a> {
/// Create a new SQL tokenizer for the specified SQL statement
///
/// ```
/// # use sqlparser::tokenizer::{Token, Whitespace, Tokenizer};
/// # use sqlparser::dialect::GenericDialect;
/// # let dialect = GenericDialect{};
/// let query = r#"SELECT 'foo'"#;
///
/// // Parsing the query
/// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
///
/// assert_eq!(tokens, vec![
/// Token::make_word("SELECT", None),
/// Token::Whitespace(Whitespace::Space),
/// Token::SingleQuotedString("foo".to_string()),
/// ]);
/// ```
pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
Self {
dialect,
query,
unescape: true,
}
}
/// Set unescape mode
///
/// When true (default) the tokenizer unescapes literal values
/// (for example, `""` in SQL is unescaped to the literal `"`).
///
/// When false, the tokenizer provides the raw strings as provided
/// in the query. This can be helpful for programs that wish to
/// recover the *exact* original query text without normalizing
/// the escaping
///
/// # Example
///
/// ```
/// # use sqlparser::tokenizer::{Token, Tokenizer};
/// # use sqlparser::dialect::GenericDialect;
/// # let dialect = GenericDialect{};
/// let query = r#""Foo "" Bar""#;
/// let unescaped = Token::make_word(r#"Foo " Bar"#, Some('"'));
/// let original = Token::make_word(r#"Foo "" Bar"#, Some('"'));
///
/// // Parsing with unescaping (default)
/// let tokens = Tokenizer::new(&dialect, &query).tokenize().unwrap();
/// assert_eq!(tokens, vec![unescaped]);
///
/// // Parsing with unescape = false
/// let tokens = Tokenizer::new(&dialect, &query)
/// .with_unescape(false)
/// .tokenize().unwrap();
/// assert_eq!(tokens, vec![original]);
/// ```
pub fn with_unescape(mut self, unescape: bool) -> Self {
self.unescape = unescape;
self
}
/// Tokenize the statement and produce a vector of tokens
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
let twl = self.tokenize_with_location()?;
Ok(twl.into_iter().map(|t| t.token).collect())
}
/// Tokenize the statement and produce a vector of tokens with location information
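///
/// A short sketch of the location-tracking API (a doctest under the same
/// assumptions as [`Tokenizer::new`]):
///
/// ```
/// # use sqlparser::dialect::GenericDialect;
/// # use sqlparser::tokenizer::{Token, Tokenizer};
/// let dialect = GenericDialect {};
/// let tokens = Tokenizer::new(&dialect, "SELECT 1").tokenize_with_location().unwrap();
/// assert_eq!(tokens[0].token, Token::make_word("SELECT", None));
/// // `SELECT` starts at line 1, column 1 and ends just before column 7
/// assert_eq!(tokens[0].span.start.line, 1);
/// assert_eq!(tokens[0].span.start.column, 1);
/// assert_eq!(tokens[0].span.end.column, 7);
/// ```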
pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithSpan>, TokenizerError> {
let mut tokens: Vec<TokenWithSpan> = vec![];
self.tokenize_with_location_into_buf(&mut tokens)
.map(|_| tokens)
}
/// Tokenize the statement and append tokens with location information into the provided buffer.
/// If an error is thrown, the buffer will contain all tokens that were successfully parsed before the error.
pub fn tokenize_with_location_into_buf(
&mut self,
buf: &mut Vec<TokenWithSpan>,
) -> Result<(), TokenizerError> {
let mut state = State {
peekable: self.query.chars().peekable(),
line: 1,
col: 1,
};
let mut location = state.location();
while let Some(token) = self.next_token(&mut state, buf.last().map(|t| &t.token))? {
let span = location.span_to(state.location());
buf.push(TokenWithSpan { token, span });
location = state.location();
}
Ok(())
}
// Tokenize the identifier or keyword starting with `ch`
fn tokenize_identifier_or_keyword(
&self,
ch: impl IntoIterator<Item = char>,
chars: &mut State,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
let ch: String = ch.into_iter().collect();
let word = self.tokenize_word(ch, chars);
// TODO: implement parsing of exponent here
if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
let mut inner_state = State {
peekable: word.chars().peekable(),
line: 0,
col: 0,
};
let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
s += s2.as_str();
return Ok(Some(Token::Number(s, false)));
}
Ok(Some(Token::make_word(&word, None)))
}
/// Get the next token or return None
fn next_token(
&self,
chars: &mut State,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
match chars.peek() {
Some(&ch) => match ch {
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
'\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
'\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
'\r' => {
// Emit a single Whitespace::Newline token for \r and \r\n
chars.next();
if let Some('\n') = chars.peek() {
chars.next();
}
Ok(Some(Token::Whitespace(Whitespace::Newline)))
}
// BigQuery and MySQL use b or B for byte string literal, Postgres for bit strings
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
{
chars.next(); // consume
match chars.peek() {
Some('\'') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
// regular identifier starting with an "b" or "B"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// BigQuery uses r or R for raw string literal
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// Redshift uses lower case n for national string literal
n @ 'N' | n @ 'n' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let backslash_escape =
self.dialect.supports_string_literal_backslash_escape();
let s =
self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
let starting_loc = chars.location();
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
let s =
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
Ok(Some(Token::EscapedStringLiteral(s)))
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
chars.next(); // consume, to check the next char
if chars.peek() == Some(&'&') {
// we cannot advance the iterator here, as we need to consume the '&' later if the 'u' was an identifier
let mut chars_clone = chars.peekable.clone();
chars_clone.next(); // consume the '&' in the clone
if chars_clone.peek() == Some(&'\'') {
chars.next(); // consume the '&' in the original iterator
let s = unescape_unicode_single_quoted_string(chars)?;
return Ok(Some(Token::UnicodeStringLiteral(s)));
}
}
// regular identifier starting with an "U" or "u"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
x @ 'x' | x @ 'X' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// single quoted string
'\'' => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::SingleQuotedString(s)))
}
// double quoted string
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::DoubleQuotedString(s)))
}
// delimited (quoted) identifier
quote_start if self.dialect.is_delimited_identifier_start(ch) => {
let word = self.tokenize_quoted_identifier(quote_start, chars)?;
Ok(Some(Token::make_word(&word, Some(quote_start))))
}
// Potentially nested delimited (quoted) identifier
quote_start
if self
.dialect
.is_nested_delimited_identifier_start(quote_start)
&& self
.dialect
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
.is_some() =>
{
let Some((quote_start, nested_quote_start)) = self
.dialect
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
else {
return self.tokenizer_error(
chars.location(),
format!("Expected nested delimiter '{quote_start}' before EOF."),
);
};
let Some(nested_quote_start) = nested_quote_start else {
let word = self.tokenize_quoted_identifier(quote_start, chars)?;
return Ok(Some(Token::make_word(&word, Some(quote_start))));
};
let mut word = vec![];
let quote_end = Word::matching_end_quote(quote_start);
let nested_quote_end = Word::matching_end_quote(nested_quote_start);
let error_loc = chars.location();
chars.next(); // skip the first delimiter
peeking_take_while(chars, |ch| ch.is_whitespace());
if chars.peek() != Some(&nested_quote_start) {
return self.tokenizer_error(
error_loc,
format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
);
}
word.push(nested_quote_start.into());
word.push(self.tokenize_quoted_identifier(nested_quote_end, chars)?);
word.push(nested_quote_end.into());
peeking_take_while(chars, |ch| ch.is_whitespace());
if chars.peek() != Some(&quote_end) {
return self.tokenizer_error(
error_loc,
format!("Expected close delimiter '{quote_end}' before EOF."),
);
}
chars.next(); // skip close delimiter
Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
}
// numbers and period
'0'..='9' | '.' => {
// Special case: when `._` is encountered after a word, that word
// is a table name and the `_` starts the column name.
// If the previous token is not a word, then this is not valid
// SQL as either a word or a number.
if ch == '.' && chars.peekable.clone().nth(1) == Some('_') {
if let Some(Token::Word(_)) = prev_token {
chars.next();
return Ok(Some(Token::Period));
}
return self.tokenizer_error(
chars.location(),
"Unexpected character '_'".to_string(),
);
}
// Some dialects support underscore as number separator
// There can only be one at a time and it must be followed by another digit
let is_number_separator = |ch: char, next_char: Option<char>| {
self.dialect.supports_numeric_literal_underscores()
&& ch == '_'
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
};
let mut s = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});
// match binary literal that starts with 0x
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
});
return Ok(Some(Token::HexStringLiteral(s2)));
}
// match one period
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
}
// If the dialect supports identifiers that start with a numeric prefix
// and we have now consumed a dot, check if the previous token was a Word.
// If so, what follows is definitely not part of a decimal number and
// we should yield the dot as a dedicated token so compound identifiers
// starting with digits can be parsed correctly.
if s == "." && self.dialect.supports_numeric_prefix() {
if let Some(Token::Word(_)) = prev_token {
return Ok(Some(Token::Period));
}
}
// Consume fractional digits.
s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});
// No fraction -> Token::Period
if s == "." {
return Ok(Some(Token::Period));
}
// Parse exponent as number
let mut exponent_part = String::new();
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
let mut char_clone = chars.peekable.clone();
exponent_part.push(char_clone.next().unwrap());
// Optional sign
match char_clone.peek() {
Some(&c) if matches!(c, '+' | '-') => {
exponent_part.push(c);
char_clone.next();
}
_ => (),
}
match char_clone.peek() {
// Definitely an exponent, get original iterator up to speed and use it
Some(&c) if c.is_ascii_digit() => {
for _ in 0..exponent_part.len() {
chars.next();
}
exponent_part +=
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
s += exponent_part.as_str();
}
// Not an exponent, discard the work done
_ => (),
}
}
// If the dialect supports identifiers that start with a numeric prefix,
// we need to check if the value is in fact an identifier and must thus
// be tokenized as a word.
if self.dialect.supports_numeric_prefix() {
if exponent_part.is_empty() {
// If it is not a number with an exponent, it may be
// an identifier starting with digits.
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
if !word.is_empty() {
s += word.as_str();
return Ok(Some(Token::make_word(s.as_str(), None)));
}
} else if prev_token == Some(&Token::Period) {
// If the previous token was a period, thus not belonging to a number,
// the value we have is part of an identifier.
return Ok(Some(Token::make_word(s.as_str(), None)));
}
}
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
Ok(Some(Token::Number(s, long)))
}
// punctuation
'(' => self.consume_and_return(chars, Token::LParen),
')' => self.consume_and_return(chars, Token::RParen),
',' => self.consume_and_return(chars, Token::Comma),
// operators
'-' => {
chars.next(); // consume the '-'
match chars.peek() {
Some('-') => {
let mut is_comment = true;
if self.dialect.requires_single_line_comment_whitespace() {
is_comment = Some(' ') == chars.peekable.clone().nth(1);
}
if is_comment {
chars.next(); // consume second '-'
let comment = self.tokenize_single_line_comment(chars);
return Ok(Some(Token::Whitespace(
Whitespace::SingleLineComment {
prefix: "--".to_owned(),
comment,
},
)));
}
self.start_binop(chars, "-", Token::Minus)
}
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
_ => self.start_binop(chars, "->", Token::Arrow),
}
}
// a regular '-' operator
_ => self.start_binop(chars, "-", Token::Minus),
}
}
'/' => {
chars.next(); // consume the '/'
match chars.peek() {
Some('*') => {
chars.next(); // consume the '*', starting a multi-line comment
self.tokenize_multiline_comment(chars)
}
Some('/') if dialect_of!(self is SnowflakeDialect) => {
chars.next(); // consume the second '/', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_owned(),
comment,
})))
}
Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
self.consume_and_return(chars, Token::DuckIntDiv)
}
// a regular '/' operator
_ => Ok(Some(Token::Div)),
}
}
'+' => self.consume_and_return(chars, Token::Plus),
'*' => self.consume_and_return(chars, Token::Mul),
'%' => {
chars.next(); // advance past '%'
match chars.peek() {
Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "%", Token::Mod),
}
}
'|' => {
chars.next(); // consume the '|'
match chars.peek() {
Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
Some('|') => {
chars.next(); // consume the second '|'
match chars.peek() {
Some('/') => {
self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
}
_ => self.start_binop(chars, "||", Token::StringConcat),
}
}
Some('&') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_for_binop(
chars,
"|&>",
Token::VerticalBarAmpersandRightAngleBracket,
),
_ => self.start_binop_opt(chars, "|&", None),
}
}
Some('>') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_for_binop(
chars,
"|>>",
Token::VerticalBarShiftRight,
),
_ => self.start_binop_opt(chars, "|>", None),
}
}
Some('>') if self.dialect.supports_pipe_operator() => {
self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
}
// Bitshift '|' operator
_ => self.start_binop(chars, "|", Token::Pipe),
}
}
'=' => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::RArrow),
Some('=') => self.consume_and_return(chars, Token::DoubleEq),
_ => Ok(Some(Token::Eq)),
}
}
'!' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::Neq),
Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self
.consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self.consume_and_return(
chars,
Token::ExclamationMarkDoubleTildeAsterisk,
),
_ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
}
}
_ => Ok(Some(Token::ExclamationMarkTilde)),
}
}
_ => Ok(Some(Token::ExclamationMark)),
}
}
'<' => {
chars.next(); // consume
match chars.peek() {
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
_ => self.start_binop(chars, "<=", Token::LtEq),
}
}
Some('|') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
}
Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
Some('<') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => self.consume_for_binop(
chars,
"<<|",
Token::ShiftLeftVerticalBar,
),
_ => self.start_binop(chars, "<<", Token::ShiftLeft),
}
}
Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
Some('-') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => {
self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
}
_ => self.start_binop_opt(chars, "<-", None),
}
}
Some('^') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
}
Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
_ => self.start_binop(chars, "<", Token::Lt),
}
}
'>' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
Some('^') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
}
_ => self.start_binop(chars, ">", Token::Gt),
}
}
':' => {
chars.next();
match chars.peek() {
Some(':') => self.consume_and_return(chars, Token::DoubleColon),
Some('=') => self.consume_and_return(chars, Token::Assignment),
_ => Ok(Some(Token::Colon)),
}
}
';' => self.consume_and_return(chars, Token::SemiColon),
'\\' => self.consume_and_return(chars, Token::Backslash),
'[' => self.consume_and_return(chars, Token::LBracket),
']' => self.consume_and_return(chars, Token::RBracket),
'&' => {
chars.next(); // consume the '&'
match chars.peek() {
Some('>') if self.dialect.supports_geometric_types() => {
chars.next();
self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
}
Some('<') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => self.consume_and_return(
chars,
Token::AmpersandLeftAngleBracketVerticalBar,
),
_ => {
self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
}
}
}
Some('&') => {
chars.next(); // consume the second '&'
self.start_binop(chars, "&&", Token::Overlap)
}
// Bitshift '&' operator
_ => self.start_binop(chars, "&", Token::Ampersand),
}
}
'^' => {
chars.next(); // consume the '^'
match chars.peek() {
Some('@') => self.consume_and_return(chars, Token::CaretAt),
_ => Ok(Some(Token::Caret)),
}
}
'{' => self.consume_and_return(chars, Token::LBrace),
'}' => self.consume_and_return(chars, Token::RBrace),
'#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
{
chars.next(); // consume the '#', starting a single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_owned(),
comment,
})))
}
'~' => {
chars.next(); // consume
match chars.peek() {
Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
Some('=') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "~=", Token::TildeEqual)
}
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => {
self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
}
_ => self.start_binop(chars, "~~", Token::DoubleTilde),
}
}
_ => self.start_binop(chars, "~", Token::Tilde),
}
}
'#' => {
chars.next();
match chars.peek() {
Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => {
self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
}
_ => self.start_binop(chars, "#>", Token::HashArrow),
}
}
Some(' ') => Ok(Some(Token::Sharp)),
Some('#') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "##", Token::DoubleSharp)
}
Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "#", Token::Sharp),
}
}
'@' => {
chars.next();
match chars.peek() {
Some('@') if self.dialect.supports_geometric_types() => {
self.consume_and_return(chars, Token::AtAt)
}
Some('-') if self.dialect.supports_geometric_types() => {
chars.next();
match chars.peek() {
Some('@') => self.consume_and_return(chars, Token::AtDashAt),
_ => self.start_binop_opt(chars, "@-", None),
}
}
Some('>') => self.consume_and_return(chars, Token::AtArrow),
Some('?') => self.consume_and_return(chars, Token::AtQuestion),
Some('@') => {
chars.next();
match chars.peek() {
Some(' ') => Ok(Some(Token::AtAt)),
Some(tch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
}
_ => Ok(Some(Token::AtAt)),
}
}
Some(' ') => Ok(Some(Token::AtSign)),
// We break on quotes here, because no dialect allows identifiers starting
// with @ and containing quotation marks (e.g. `@'foo'`) unless they are
// quoted, which is tokenized as a quoted string, not here (e.g.
// `"@'foo'"`). Further, at least two dialects parse `@` followed by a
// quoted string as two separate tokens, which this allows. For example,
// Postgres parses `@'1'` as the absolute value of '1' which is implicitly
// cast to a numeric type. And when parsing MySQL-style grantees (e.g.
// `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
// for the user, the `@`, and the host.
Some('\'') => Ok(Some(Token::AtSign)),
Some('\"') => Ok(Some(Token::AtSign)),
Some('`') => Ok(Some(Token::AtSign)),
Some(sch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => Ok(Some(Token::AtSign)),
}
}
// Postgres uses ? for jsonb operators, not prepared statements
'?' if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => {
chars.next();
match chars.peek() {
Some('|') => self.consume_and_return(
chars,
Token::QuestionMarkDoubleVerticalBar,
),
_ => Ok(Some(Token::QuestionPipe)),
}
}
Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
Some('-') => {
chars.next(); // consume
match chars.peek() {
Some('|') => self
.consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
_ => Ok(Some(Token::QuestionMarkDash)),
}
}
Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
_ => self.consume_and_return(chars, Token::Question),
}
}
'?' => {
chars.next();
let s = peeking_take_while(chars, |ch| ch.is_numeric());
Ok(Some(Token::Placeholder(String::from("?") + &s)))
}
// identifier or keyword
ch if self.dialect.is_identifier_start(ch) => {
self.tokenize_identifier_or_keyword([ch], chars)
}
'$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
// whitespace check (including unicode chars) should be last as it covers some of the chars above
ch if ch.is_whitespace() => {
self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
}
other => self.consume_and_return(chars, Token::Char(other)),
},
None => Ok(None),
}
}
/// Consume the next character, then parse a custom binary operator. The next character should be included in the prefix
fn consume_for_binop(
&self,
chars: &mut State,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
chars.next(); // consume the first char
self.start_binop_opt(chars, prefix, Some(default))
}
/// Parse a custom binary operator starting with `prefix`, falling back to `default`
fn start_binop(
&self,
chars: &mut State,
prefix: &str,
default: Token,
) -> Result<Option<Token>, TokenizerError> {
self.start_binop_opt(chars, prefix, Some(default))
}
/// Parse a custom binary operator starting with `prefix`, falling back to `default` if one is given
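///
/// For example (illustrative input): given the already-consumed prefix `~` and
/// remaining input `>=`, a dialect whose `is_custom_operator_part` accepts `>`
/// and `=` produces `Token::CustomBinaryOperator("~>=")`; if no further
/// operator characters follow, `default` is returned (or an error if `None`).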
fn start_binop_opt(
&self,
chars: &mut State,
prefix: &str,
default: Option<Token>,
) -> Result<Option<Token>, TokenizerError> {
let mut custom = None;
while let Some(&ch) = chars.peek() {
if !self.dialect.is_custom_operator_part(ch) {
break;
}
custom.get_or_insert_with(|| prefix.to_string()).push(ch);
chars.next();
}
match (custom, default) {
(Some(custom), _) => Ok(Token::CustomBinaryOperator(custom).into()),
(None, Some(tok)) => Ok(Some(tok)),
(None, None) => self.tokenizer_error(
chars.location(),
format!("Expected a valid binary operator after '{prefix}'"),
),
}
}
/// Tokenize a dollar-prefixed value (i.e. a dollar-quoted string or a placeholder)
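///
/// Illustrative inputs and results (assuming a dialect without dollar
/// placeholders for the dollar-quoted cases):
/// - `$$abc$$` becomes `DollarQuotedString { value: "abc", tag: None }`
/// - `$tag$abc$tag$` becomes `DollarQuotedString { value: "abc", tag: Some("tag") }`
/// - `$1` becomes `Placeholder("$1")`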
fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
let mut s = String::new();
let mut value = String::new();
chars.next();
// If the dialect does not support dollar-quoted strings, then `$$` is a placeholder instead.
if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
chars.next();
let mut is_terminated = false;
let mut prev: Option<char> = None;
while let Some(&ch) = chars.peek() {
if prev == Some('$') {
if ch == '$' {
chars.next();
is_terminated = true;
break;
} else {
s.push('$');
s.push(ch);
}
} else if ch != '$' {
s.push(ch);
}
prev = Some(ch);
chars.next();
}
return if chars.peek().is_none() && !is_terminated {
self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
} else {
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: None,
}))
};
} else {
value.push_str(&peeking_take_while(chars, |ch| {
ch.is_alphanumeric()
|| ch == '_'
// Allow $ as a placeholder character if the dialect supports it
|| matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
}));
// If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
chars.next();
let mut temp = String::new();
let end_delimiter = format!("${value}$");
loop {
match chars.next() {
Some(ch) => {
temp.push(ch);
if temp.ends_with(&end_delimiter) {
if let Some(temp) = temp.strip_suffix(&end_delimiter) {
s.push_str(temp);
}
break;
}
}
None => {
if temp.ends_with(&end_delimiter) {
if let Some(temp) = temp.strip_suffix(&end_delimiter) {
s.push_str(temp);
}
break;
}
return self.tokenizer_error(
chars.location(),
"Unterminated dollar-quoted, expected $",
);
}
}
}
} else {
return Ok(Token::Placeholder(String::from("$") + &value));
}
}
Ok(Token::DollarQuotedString(DollarQuotedString {
value: s,
tag: if value.is_empty() { None } else { Some(value) },
}))
}
fn tokenizer_error<R>(
&self,
loc: Location,
message: impl Into<String>,
) -> Result<R, TokenizerError> {
Err(TokenizerError {
message: message.into(),
location: loc,
})
}
    // Consume characters up to and including the newline (or, for PostgreSQL,
    // a carriage return).
fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
let mut comment = peeking_take_while(chars, |ch| match ch {
'\n' => false, // Always stop at \n
'\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
_ => true, // Keep consuming for other characters
});
if let Some(ch) = chars.next() {
assert!(ch == '\n' || ch == '\r');
comment.push(ch);
}
comment
}
/// Tokenize an identifier or keyword, after the first char is already consumed.
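    /// For example (illustrative), having already consumed `f`, tokenizing the
    /// remaining input `oo_bar = 1` yields `"foo_bar"`, assuming the dialect
    /// treats `_` as an identifier part.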
fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
let mut s = first_chars.into();
s.push_str(&peeking_take_while(chars, |ch| {
self.dialect.is_identifier_part(ch)
}));
s
}
/// Read a quoted identifier
fn tokenize_quoted_identifier(
&self,
quote_start: char,
chars: &mut State,
) -> Result<String, TokenizerError> {
let error_loc = chars.location();
chars.next(); // consume the opening quote
let quote_end = Word::matching_end_quote(quote_start);
let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
if last_char == Some(quote_end) {
Ok(s)
} else {
self.tokenizer_error(
error_loc,
format!("Expected close delimiter '{quote_end}' before EOF."),
)
}
}
/// Read a single quoted string, starting with the opening quote.
fn tokenize_escaped_single_quoted_string(
&self,
starting_loc: Location,
chars: &mut State,
) -> Result<String, TokenizerError> {
if let Some(s) = unescape_single_quoted_string(chars) {
return Ok(s);
}
self.tokenizer_error(starting_loc, "Unterminated encoded string literal")
}
    /// Reads a string literal delimited by one or three quote characters.
/// Examples: `'abc'`, `'''abc'''`, `"""abc"""`.
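    /// One opening quote begins an ordinary string, exactly two form an empty
    /// string literal, and three begin a triple-quoted string that ends at the
    /// next run of three quote characters.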
fn tokenize_single_or_triple_quoted_string<F>(
&self,
chars: &mut State,
quote_style: char,
backslash_escape: bool,
single_quote_token: F,
triple_quote_token: F,
) -> Result<Option<Token>, TokenizerError>
where
F: Fn(String) -> Token,
{
let error_loc = chars.location();
let mut num_opening_quotes = 0u8;
for _ in 0..3 {
if Some(&quote_style) == chars.peek() {
chars.next(); // Consume quote.
num_opening_quotes += 1;
} else {
break;
}
}
let (token_fn, num_quote_chars) = match num_opening_quotes {
1 => (single_quote_token, NumStringQuoteChars::One),
2 => {
                // Exactly two quote characters form an empty string literal.
return Ok(Some(single_quote_token("".into())));
}
3 => {
let Some(num_quote_chars) = NonZeroU8::new(3) else {
return self.tokenizer_error(error_loc, "invalid number of opening quotes");
};
(
triple_quote_token,
NumStringQuoteChars::Many(num_quote_chars),
)
}
_ => {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
};
let settings = TokenizeQuotedStringSettings {
quote_style,
num_quote_chars,
num_opening_quotes_to_consume: 0,
backslash_escape,
};
self.tokenize_quoted_string(chars, settings)
.map(token_fn)
.map(Some)
}
/// Reads a string literal quoted by a single quote character.
fn tokenize_single_quoted_string(
&self,
chars: &mut State,
quote_style: char,
backslash_escape: bool,
) -> Result<String, TokenizerError> {
self.tokenize_quoted_string(
chars,
TokenizeQuotedStringSettings {
quote_style,
num_quote_chars: NumStringQuoteChars::One,
num_opening_quotes_to_consume: 1,
backslash_escape,
},
)
}
/// Read a quoted string.
fn tokenize_quoted_string(
&self,
chars: &mut State,
settings: TokenizeQuotedStringSettings,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();
// Consume any opening quotes.
for _ in 0..settings.num_opening_quotes_to_consume {
if Some(settings.quote_style) != chars.next() {
return self.tokenizer_error(error_loc, "invalid string literal opening");
}
}
let mut num_consecutive_quotes = 0;
while let Some(&ch) = chars.peek() {
let pending_final_quote = match settings.num_quote_chars {
NumStringQuoteChars::One => Some(NumStringQuoteChars::One),
n @ NumStringQuoteChars::Many(count)
if num_consecutive_quotes + 1 == count.get() =>
{
Some(n)
}
NumStringQuoteChars::Many(_) => None,
};
match ch {
char if char == settings.quote_style && pending_final_quote.is_some() => {
chars.next(); // consume
if let Some(NumStringQuoteChars::Many(count)) = pending_final_quote {
// For an initial string like `"""abc"""`, at this point we have
// `abc""` in the buffer and have now matched the final `"`.
// However, the string to return is simply `abc`, so we strip off
// the trailing quotes before returning.
let mut buf = s.chars();
for _ in 1..count.get() {
buf.next_back();
}
return Ok(buf.as_str().to_string());
} else if chars
.peek()
.map(|c| *c == settings.quote_style)
.unwrap_or(false)
{
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
s.push(ch);
}
chars.next();
} else {
return Ok(s);
}
}
'\\' if settings.backslash_escape => {
// consume backslash
chars.next();
num_consecutive_quotes = 0;
if let Some(next) = chars.peek() {
if !self.unescape
|| (self.dialect.ignores_wildcard_escapes()
&& (*next == '%' || *next == '_'))
{
// In no-escape mode, the given query has to be saved completely
// including backslashes. Similarly, with ignore_like_wildcard_escapes,
// the backslash is not stripped.
s.push(ch);
s.push(*next);
chars.next(); // consume next
} else {
let n = match next {
'0' => '\0',
'a' => '\u{7}',
'b' => '\u{8}',
'f' => '\u{c}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'Z' => '\u{1a}',
_ => *next,
};
s.push(n);
chars.next(); // consume next
}
}
}
ch => {
chars.next(); // consume ch
if ch == settings.quote_style {
num_consecutive_quotes += 1;
} else {
num_consecutive_quotes = 0;
}
s.push(ch);
}
}
}
self.tokenizer_error(error_loc, "Unterminated string literal")
}
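    /// Consumes a multi-line comment body, assuming the opening `/*` has
    /// already been read. When the dialect supports nested comments, inner
    /// `/* ... */` pairs must be balanced before the comment ends.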
fn tokenize_multiline_comment(
&self,
chars: &mut State,
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
let supports_nested_comments = self.dialect.supports_nested_comments();
loop {
match chars.next() {
Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
chars.next(); // consume the '*'
s.push('/');
s.push('*');
nested += 1;
}
Some('*') if matches!(chars.peek(), Some('/')) => {
chars.next(); // consume the '/'
nested -= 1;
if nested == 0 {
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
}
s.push('*');
s.push('/');
}
Some(ch) => {
s.push(ch);
}
None => {
break self.tokenizer_error(
chars.location(),
"Unexpected EOF while in a multi-line comment",
);
}
}
}
}
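    /// Reads the body of a quoted identifier until an unescaped closing quote,
    /// treating a doubled `quote_end` as an escaped quote character. Returns
    /// the text read and the last character consumed: `Some(quote_end)` when
    /// the closing delimiter was found, `None` if EOF was reached first.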
fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
let mut last_char = None;
let mut s = String::new();
while let Some(ch) = chars.next() {
if ch == quote_end {
if chars.peek() == Some(&quote_end) {
chars.next();
s.push(ch);
if !self.unescape {
// In no-escape mode, the given query has to be saved completely
s.push(ch);
}
} else {
last_char = Some(quote_end);
break;
}
} else {
s.push(ch);
}
}
(s, last_char)
}
#[allow(clippy::unnecessary_wraps)]
fn consume_and_return(
&self,
chars: &mut State,
t: Token,
) -> Result<Option<Token>, TokenizerError> {
chars.next();
Ok(Some(t))
}
}
/// Read from `chars` until `predicate` returns `false` or EOF is hit.
/// Return the characters read as String, and keep the first non-matching
/// char available as `chars.next()`.
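/// For example (illustrative), with input `abc123` and the predicate
/// `|ch| ch.is_alphabetic()`, this returns `"abc"` and leaves `'1'` as the
/// next character.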
fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
if predicate(ch) {
chars.next(); // consume
s.push(ch);
} else {
break;
}
}
s
}
/// Same as peeking_take_while, but also passes the next character to the predicate.
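/// This allows one character of lookahead; e.g. (illustrative) a predicate
/// such as `|ch, next| ch.is_ascii_digit() || (ch == '_' && next.map_or(false, |c| c.is_ascii_digit()))`
/// consumes digits and interior underscores of a numeric literal while
/// leaving a trailing underscore in the stream.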
fn peeking_next_take_while(
chars: &mut State,
mut predicate: impl FnMut(char, Option<char>) -> bool,
) -> String {
let mut s = String::new();
while let Some(&ch) = chars.peek() {
let next_char = chars.peekable.clone().nth(1);
if predicate(ch, next_char) {
chars.next(); // consume
s.push(ch);
} else {
break;
}
}
s
}
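/// Unescapes a backslash-escaped single-quoted string, assuming `chars` is
/// positioned at the opening quote. Returns `None` if the literal is
/// unterminated, contains an invalid escape, or unescapes to a NUL character.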
fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
Unescape::new(chars).unescape()
}
struct Unescape<'a: 'b, 'b> {
chars: &'b mut State<'a>,
}
impl<'a: 'b, 'b> Unescape<'a, 'b> {
fn new(chars: &'b mut State<'a>) -> Self {
Self { chars }
}
fn unescape(mut self) -> Option<String> {
let mut unescaped = String::new();
self.chars.next();
while let Some(c) = self.chars.next() {
if c == '\'' {
                // A doubled quote is an escaped quote, i.e: `''''` unescapes to `'`.
if self.chars.peek().map(|c| *c == '\'').unwrap_or(false) {
self.chars.next();
unescaped.push('\'');
continue;
}
return Some(unescaped);
}
if c != '\\' {
unescaped.push(c);
continue;
}
let c = match self.chars.next()? {
'b' => '\u{0008}',
'f' => '\u{000C}',
'n' => '\n',
'r' => '\r',
't' => '\t',
'u' => self.unescape_unicode_16()?,
'U' => self.unescape_unicode_32()?,
'x' => self.unescape_hex()?,
c if c.is_digit(8) => self.unescape_octal(c)?,
c => c,
};
unescaped.push(Self::check_null(c)?);
}
None
}
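    // An escape that yields NUL (e.g. `\0`) aborts unescaping; this mirrors
    // PostgreSQL, which does not allow NUL bytes in string values.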
#[inline]
fn check_null(c: char) -> Option<char> {
if c == '\0' {
None
} else {
Some(c)
}
}
#[inline]
fn byte_to_char<const RADIX: u32>(s: &str) -> Option<char> {
        // u32 is used here because PostgreSQL wraps the value on overflow
        // (hence the byte mask below) rather than raising an error.
match u32::from_str_radix(s, RADIX) {
Err(_) => None,
Ok(n) => {
let n = n & 0xFF;
if n <= 127 {
char::from_u32(n)
} else {
None
}
}
}
}
    // Hexadecimal byte value. \xh, \xhh (h = 0-9, A-F)
fn unescape_hex(&mut self) -> Option<char> {
let mut s = String::new();
for _ in 0..2 {
match self.next_hex_digit() {
Some(c) => s.push(c),
None => break,
}
}
if s.is_empty() {
return Some('x');
}
Self::byte_to_char::<16>(&s)
}
#[inline]
fn next_hex_digit(&mut self) -> Option<char> {
match self.chars.peek() {
Some(c) if c.is_ascii_hexdigit() => self.chars.next(),
_ => None,
}
}
    // Octal byte value. \o, \oo, \ooo (o = 0-7)
fn unescape_octal(&mut self, c: char) -> Option<char> {
let mut s = String::new();
s.push(c);
for _ in 0..2 {
            match self.next_octal_digit() {
Some(c) => s.push(c),
None => break,
}
}
Self::byte_to_char::<8>(&s)
}
#[inline]
    fn next_octal_digit(&mut self) -> Option<char> {
match self.chars.peek() {
Some(c) if c.is_digit(8) => self.chars.next(),
_ => None,
}
}
    // 16-bit hexadecimal Unicode character value. \uxxxx (x = 0-9, A-F)
fn unescape_unicode_16(&mut self) -> Option<char> {
self.unescape_unicode::<4>()
}
    // 32-bit hexadecimal Unicode character value. \Uxxxxxxxx (x = 0-9, A-F)
fn unescape_unicode_32(&mut self) -> Option<char> {
self.unescape_unicode::<8>()
}
fn unescape_unicode<const NUM: usize>(&mut self) -> Option<char> {
let mut s = String::new();
for _ in 0..NUM {
s.push(self.chars.next()?);
}
match u32::from_str_radix(&s, 16) {
Err(_) => None,
Ok(n) => char::from_u32(n),
}
}
}
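/// Unescapes a Unicode-escaped string literal (as in PostgreSQL's `U&'...'`
/// syntax): `\XXXX` encodes a 4-digit and `\+XXXXXX` a 6-digit hexadecimal
/// code point, `\\` a literal backslash, and `''` an escaped quote.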
fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
let mut unescaped = String::new();
chars.next(); // consume the opening quote
while let Some(c) = chars.next() {
match c {
'\'' => {
if chars.peek() == Some(&'\'') {
chars.next();
unescaped.push('\'');
} else {
return Ok(unescaped);
}
}
'\\' => match chars.peek() {
Some('\\') => {
chars.next();
unescaped.push('\\');
}
Some('+') => {
chars.next();
unescaped.push(take_char_from_hex_digits(chars, 6)?);
}
_ => unescaped.push(take_char_from_hex_digits(chars, 4)?),
},
_ => {
unescaped.push(c);
}
}
}
Err(TokenizerError {
message: "Unterminated unicode encoded string literal".to_string(),
location: chars.location(),
})
}
fn take_char_from_hex_digits(
chars: &mut State<'_>,
max_digits: usize,
) -> Result<char, TokenizerError> {
let mut result = 0u32;
for _ in 0..max_digits {
let next_char = chars.next().ok_or_else(|| TokenizerError {
message: "Unexpected EOF while parsing hex digit in escaped unicode string."
.to_string(),
location: chars.location(),
})?;
let digit = next_char.to_digit(16).ok_or_else(|| TokenizerError {
message: format!("Invalid hex digit in escaped unicode string: {next_char}"),
location: chars.location(),
})?;
result = result * 16 + digit;
}
char::from_u32(result).ok_or_else(|| TokenizerError {
message: format!("Invalid unicode character: {result:x}"),
location: chars.location(),
})
}
#[cfg(test)]
mod tests {
use super::*;
use crate::dialect::{
BigQueryDialect, ClickHouseDialect, HiveDialect, MsSqlDialect, MySqlDialect, SQLiteDialect,
};
use crate::test_utils::all_dialects_where;
use core::fmt::Debug;
#[test]
fn tokenizer_error_impl() {
let err = TokenizerError {
message: "test".into(),
location: Location { line: 1, column: 1 },
};
#[cfg(feature = "std")]
{
use std::error::Error;
assert!(err.source().is_none());
}
assert_eq!(err.to_string(), "test at Line: 1, Column: 1");
}
#[test]
fn tokenize_select_1() {
let sql = String::from("SELECT 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_select_float() {
let sql = String::from("SELECT .1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from(".1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_clickhouse_double_equal() {
let sql = String::from("SELECT foo=='1'");
let dialect = ClickHouseDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Word(Word {
value: "foo".to_string(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),
Token::DoubleEq,
Token::SingleQuotedString("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_numeric_literal_underscore() {
let dialect = GenericDialect {};
let sql = String::from("SELECT 10_000");
let mut tokenizer = Tokenizer::new(&dialect, &sql);
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("10".to_string(), false),
Token::make_word("_000", None),
];
compare(expected, tokens);
all_dialects_where(|dialect| dialect.supports_numeric_literal_underscores()).tokenizes_to(
"SELECT 10_000, _10_000, 10_00_, 10___0",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("10_000".to_string(), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("_10_000", None), // leading underscore tokenizes as a word (parsed as column identifier)
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number("10_00".to_string(), false),
Token::make_word("_", None), // trailing underscores tokenizes as a word (syntax error in some dialects)
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number("10".to_string(), false),
Token::make_word("___0", None), // multiple underscores tokenizes as a word (syntax error in some dialects)
],
);
}
#[test]
fn tokenize_select_exponent() {
let sql = String::from("SELECT 1e10, 1e-10, 1e+10, 1ea, 1e-10a, 1e-10-10");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e+10"), false),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
Token::make_word("ea", None),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::make_word("a", None),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1e-10"), false),
Token::Minus,
Token::Number(String::from("10"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_scalar_function() {
let sql = String::from("SELECT sqrt(1)");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1"), false),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_string_concat() {
let sql = String::from("SELECT 'a' || 'b'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("a")),
Token::Whitespace(Whitespace::Space),
Token::StringConcat,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("b")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_bitwise_op() {
let sql = String::from("SELECT one | two ^ three");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("one", None),
Token::Whitespace(Whitespace::Space),
Token::Pipe,
Token::Whitespace(Whitespace::Space),
Token::make_word("two", None),
Token::Whitespace(Whitespace::Space),
Token::Caret,
Token::Whitespace(Whitespace::Space),
Token::make_word("three", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_logical_xor() {
let sql =
String::from("SELECT true XOR true, false XOR false, true XOR false, false XOR true");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("false"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("XOR"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("true"),
];
compare(expected, tokens);
}
#[test]
fn tokenize_simple_select() {
let sql = String::from("SELECT * FROM customer WHERE id = 1 LIMIT 5");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_select() {
let sql = String::from("EXPLAIN SELECT * FROM customer WHERE id = 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_explain_analyze_select() {
let sql = String::from("EXPLAIN ANALYZE SELECT * FROM customer WHERE id = 1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("EXPLAIN"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("ANALYZE"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_string_predicate() {
let sql = String::from("SELECT * FROM customer WHERE salary != 'Not Provided'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::make_word("salary", None),
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString(String::from("Not Provided")),
];
compare(expected, tokens);
}
#[test]
fn tokenize_invalid_string() {
let sql = String::from("\n💝مصطفىh");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
// println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Char('💝'),
Token::make_word("مصطفىh", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_newline_in_string_literal() {
let sql = String::from("'foo\r\nbar\nbaz'");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("foo\r\nbar\nbaz".to_string())];
compare(expected, tokens);
}
#[test]
fn tokenize_unterminated_string_literal() {
let sql = String::from("select 'foo");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Unterminated string literal".to_string(),
location: Location { line: 1, column: 8 },
})
);
}
#[test]
fn tokenize_unterminated_string_literal_utf8() {
let sql = String::from("SELECT \"なにか\" FROM Y WHERE \"なにか\" = 'test;");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Unterminated string literal".to_string(),
location: Location {
line: 1,
column: 35
}
})
);
}
#[test]
fn tokenize_invalid_string_cols() {
let sql = String::from("\n\nSELECT * FROM table\t💝مصطفىh");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
// println!("tokens: {:#?}", tokens);
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::Newline),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("table"),
Token::Whitespace(Whitespace::Tab),
Token::Char('💝'),
Token::make_word("مصطفىh", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_tagged() {
let test_cases = vec![
(
String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$tag$"),
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "dollar '$' quoted strings have $tags like this$ or like this $$".into(),
tag: Some("tag".into()),
})
]
),
(
String::from("SELECT $abc$x$ab$abc$"),
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "x$ab".into(),
tag: Some("abc".into()),
})
]
),
(
String::from("SELECT $abc$$abc$"),
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "".into(),
tag: Some("abc".into()),
})
]
),
(
String::from("0$abc$$abc$1"),
vec![
Token::Number("0".into(), false),
Token::DollarQuotedString(DollarQuotedString {
value: "".into(),
tag: Some("abc".into()),
}),
Token::Number("1".into(), false),
]
),
(
String::from("$function$abc$q$data$q$$function$"),
vec![
Token::DollarQuotedString(DollarQuotedString {
value: "abc$q$data$q$".into(),
tag: Some("function".into()),
}),
]
),
];
let dialect = GenericDialect {};
for (sql, expected) in test_cases {
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
compare(expected, tokens);
}
}
#[test]
fn tokenize_dollar_quoted_string_tagged_unterminated() {
let sql = String::from("SELECT $tag$dollar '$' quoted strings have $tags like this$ or like this $$$different tag$");
let dialect = GenericDialect {};
assert_eq!(
Tokenizer::new(&dialect, &sql).tokenize(),
Err(TokenizerError {
message: "Unterminated dollar-quoted, expected $".into(),
location: Location {
line: 1,
column: 91
}
})
);
}
#[test]
fn tokenize_dollar_quoted_string_tagged_unterminated_mirror() {
let sql = String::from("SELECT $abc$abc$");
let dialect = GenericDialect {};
assert_eq!(
Tokenizer::new(&dialect, &sql).tokenize(),
Err(TokenizerError {
message: "Unterminated dollar-quoted, expected $".into(),
location: Location {
line: 1,
column: 17
}
})
);
}
#[test]
fn tokenize_dollar_placeholder() {
let sql = String::from("SELECT $$, $$ABC$$, $ABC$, $ABC");
let dialect = SQLiteDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
assert_eq!(
tokens,
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Placeholder("$$".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Placeholder("$$ABC$$".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Placeholder("$ABC$".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::Placeholder("$ABC".into()),
]
);
}
#[test]
fn tokenize_nested_dollar_quoted_strings() {
let sql = String::from("SELECT $tag$dollar $nested$ string$tag$");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "dollar $nested$ string".into(),
tag: Some("tag".into()),
}),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_untagged_empty() {
let sql = String::from("SELECT $$$$");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "".into(),
tag: None,
}),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_untagged() {
let sql =
String::from("SELECT $$within dollar '$' quoted strings have $tags like this$ $$");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::DollarQuotedString(DollarQuotedString {
value: "within dollar '$' quoted strings have $tags like this$ ".into(),
tag: None,
}),
];
compare(expected, tokens);
}
#[test]
fn tokenize_dollar_quoted_string_untagged_unterminated() {
let sql = String::from(
"SELECT $$dollar '$' quoted strings have $tags like this$ or like this $different tag$",
);
let dialect = GenericDialect {};
assert_eq!(
Tokenizer::new(&dialect, &sql).tokenize(),
Err(TokenizerError {
message: "Unterminated dollar-quoted string".into(),
location: Location {
line: 1,
column: 86
}
})
);
}
#[test]
fn tokenize_right_arrow() {
let sql = String::from("FUNCTION(key=>value)");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("FUNCTION", None),
Token::LParen,
Token::make_word("key", None),
Token::RArrow,
Token::make_word("value", None),
Token::RParen,
];
compare(expected, tokens);
}
#[test]
fn tokenize_is_null() {
let sql = String::from("a IS NULL");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("a", None),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("IS"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("NULL"),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment() {
let test_cases = vec![
(
String::from("0--this is a comment\n1"),
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\n".to_string(),
}),
Token::Number("1".to_string(), false),
],
),
(
String::from("0--this is a comment\r1"),
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\r1".to_string(),
}),
],
),
(
String::from("0--this is a comment\r\n1"),
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment\r\n".to_string(),
}),
Token::Number("1".to_string(), false),
],
),
];
let dialect = GenericDialect {};
for (sql, expected) in test_cases {
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
compare(expected, tokens);
}
}
#[test]
fn tokenize_comment_postgres() {
let sql = String::from("1--\r0");
let dialect = PostgreSqlDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "\r".to_string(),
}),
Token::Number("0".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_comment_at_eof() {
let sql = String::from("--this is a comment");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "this is a comment".to_string(),
})];
compare(expected, tokens);
}
#[test]
fn tokenize_multiline_comment() {
let sql = String::from("0/*multi-line\n* /comment*/1");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* /comment".to_string(),
)),
Token::Number("1".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_nested_multiline_comment() {
let dialect = GenericDialect {};
let test_cases = vec![
(
"0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1",
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
)),
Token::Whitespace(Whitespace::Space),
Token::Div,
Token::Word(Word {
value: "comment".to_string(),
quote_style: None,
keyword: Keyword::COMMENT,
}),
Token::Mul,
Token::Div,
Token::Number("1".to_string(), false),
],
),
(
"0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1",
vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
)),
Token::Number("1".to_string(), false),
],
),
(
"SELECT 1/* a /* b */ c */0",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
Token::Number("0".to_string(), false),
],
),
];
for (sql, expected) in test_cases {
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
compare(expected, tokens);
}
}
#[test]
fn tokenize_nested_multiline_comment_empty() {
let sql = "select 1/*/**/*/0";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
Token::Number("0".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_nested_comments_if_not_supported() {
let dialect = SQLiteDialect {};
let sql = "SELECT 1/*/* nested comment */*/0";
let tokens = Tokenizer::new(&dialect, sql).tokenize();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"/* nested comment ".to_string(),
)),
Token::Mul,
Token::Div,
Token::Number("0".to_string(), false),
];
compare(expected, tokens.unwrap());
}
#[test]
fn tokenize_multiline_comment_with_even_asterisks() {
let sql = String::from("\n/** Comment **/\n");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::MultiLineComment("* Comment *".to_string())),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_unicode_whitespace() {
let sql = String::from(" \u{2003}\n");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_mismatched_quotes() {
let sql = String::from("\"foo");
let dialect = GenericDialect {};
let mut tokenizer = Tokenizer::new(&dialect, &sql);
assert_eq!(
tokenizer.tokenize(),
Err(TokenizerError {
message: "Expected close delimiter '\"' before EOF.".to_string(),
location: Location { line: 1, column: 1 },
})
);
}
#[test]
fn tokenize_newlines() {
let sql = String::from("line1\nline2\rline3\r\nline4\r");
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_word("line1", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line2", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line3", None),
Token::Whitespace(Whitespace::Newline),
Token::make_word("line4", None),
Token::Whitespace(Whitespace::Newline),
];
compare(expected, tokens);
}
#[test]
fn tokenize_mssql_top() {
let sql = "SELECT TOP 5 [bar] FROM foo";
let dialect = MsSqlDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("TOP"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5"), false),
Token::Whitespace(Whitespace::Space),
Token::make_word("bar", Some('[')),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("foo", None),
];
compare(expected, tokens);
}
#[test]
fn tokenize_pg_regex_match() {
let sql = "SELECT col ~ '^a', col ~* '^a', col !~ '^a', col !~* '^a'";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::Tilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::TildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("^a".into()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_pg_like_match() {
let sql = "SELECT col ~~ '_a%', col ~~* '_a%', col !~~ '_a%', col !~~* '_a%'";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::DoubleTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::DoubleTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkDoubleTilde,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
Token::Comma,
Token::Whitespace(Whitespace::Space),
Token::make_word("col", None),
Token::Whitespace(Whitespace::Space),
Token::ExclamationMarkDoubleTildeAsterisk,
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("_a%".into()),
];
compare(expected, tokens);
}
#[test]
fn tokenize_quoted_identifier() {
let sql = r#" "a "" b" "a """ "c """"" "#;
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a " b"#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a ""#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"c """#, Some('"')),
Token::Whitespace(Whitespace::Space),
];
compare(expected, tokens);
}
#[test]
fn tokenize_snowflake_div() {
let sql = r#"field/1000"#;
let dialect = SnowflakeDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_word(r#"field"#, None),
Token::Div,
Token::Number("1000".to_string(), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_quoted_identifier_with_no_escape() {
let sql = r#" "a "" b" "a """ "c """"" "#;
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a "" b"#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"a """#, Some('"')),
Token::Whitespace(Whitespace::Space),
Token::make_word(r#"c """""#, Some('"')),
Token::Whitespace(Whitespace::Space),
];
compare(expected, tokens);
}
#[test]
fn tokenize_with_location() {
let sql = "SELECT a,\n b";
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql)
.tokenize_with_location()
.unwrap();
let expected = vec![
TokenWithSpan::at(Token::make_keyword("SELECT"), (1, 1).into(), (1, 7).into()),
TokenWithSpan::at(
Token::Whitespace(Whitespace::Space),
(1, 7).into(),
(1, 8).into(),
),
TokenWithSpan::at(Token::make_word("a", None), (1, 8).into(), (1, 9).into()),
TokenWithSpan::at(Token::Comma, (1, 9).into(), (1, 10).into()),
TokenWithSpan::at(
Token::Whitespace(Whitespace::Newline),
(1, 10).into(),
(2, 1).into(),
),
TokenWithSpan::at(
Token::Whitespace(Whitespace::Space),
(2, 1).into(),
(2, 2).into(),
),
TokenWithSpan::at(Token::make_word("b", None), (2, 2).into(), (2, 3).into()),
];
compare(expected, tokens);
}
fn compare<T: PartialEq + fmt::Debug>(expected: Vec<T>, actual: Vec<T>) {
//println!("------------------------------");
//println!("tokens = {:?}", actual);
//println!("expected = {:?}", expected);
//println!("------------------------------");
assert_eq!(expected, actual);
}
fn check_unescape(s: &str, expected: Option<&str>) {
let s = format!("'{s}'");
let mut state = State {
peekable: s.chars().peekable(),
line: 0,
col: 0,
};
assert_eq!(
unescape_single_quoted_string(&mut state),
expected.map(|s| s.to_string())
);
}
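    // A minimal sketch (not part of the original suite) exercising the
    // `peeking_take_while` helper directly: it consumes matching characters
    // and leaves the first non-matching one in the stream.
    #[test]
    fn test_peeking_take_while_leaves_non_matching_char() {
        let s = "abc123";
        let mut state = State {
            peekable: s.chars().peekable(),
            line: 0,
            col: 0,
        };
        assert_eq!(
            peeking_take_while(&mut state, |ch| ch.is_alphabetic()),
            "abc"
        );
        assert_eq!(state.peek(), Some(&'1'));
    }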
#[test]
fn test_unescape() {
check_unescape(r"\b", Some("\u{0008}"));
check_unescape(r"\f", Some("\u{000C}"));
check_unescape(r"\t", Some("\t"));
check_unescape(r"\r\n", Some("\r\n"));
check_unescape(r"\/", Some("/"));
check_unescape(r"/", Some("/"));
check_unescape(r"\\", Some("\\"));
        // 16- and 32-bit hexadecimal Unicode character values
check_unescape(r"\u0001", Some("\u{0001}"));
check_unescape(r"\u4c91", Some("\u{4c91}"));
check_unescape(r"\u4c916", Some("\u{4c91}6"));
check_unescape(r"\u4c", None);
check_unescape(r"\u0000", None);
check_unescape(r"\U0010FFFF", Some("\u{10FFFF}"));
check_unescape(r"\U00110000", None);
check_unescape(r"\U00000000", None);
check_unescape(r"\u", None);
check_unescape(r"\U", None);
check_unescape(r"\U1010FFFF", None);
// hexadecimal byte value
check_unescape(r"\x4B", Some("\u{004b}"));
check_unescape(r"\x4", Some("\u{0004}"));
check_unescape(r"\x4L", Some("\u{0004}L"));
check_unescape(r"\x", Some("x"));
check_unescape(r"\xP", Some("xP"));
check_unescape(r"\x0", None);
check_unescape(r"\xCAD", None);
check_unescape(r"\xA9", None);
// octal byte value
check_unescape(r"\1", Some("\u{0001}"));
check_unescape(r"\12", Some("\u{000a}"));
check_unescape(r"\123", Some("\u{0053}"));
check_unescape(r"\1232", Some("\u{0053}2"));
check_unescape(r"\4", Some("\u{0004}"));
check_unescape(r"\45", Some("\u{0025}"));
check_unescape(r"\450", Some("\u{0028}"));
check_unescape(r"\603", None);
check_unescape(r"\0", None);
check_unescape(r"\080", None);
// others
check_unescape(r"\9", Some("9"));
check_unescape(r"''", Some("'"));
check_unescape(
r"Hello\r\nRust/\u4c91 SQL Parser\U0010ABCD\1232",
Some("Hello\r\nRust/\u{4c91} SQL Parser\u{10abcd}\u{0053}2"),
);
check_unescape(r"Hello\0", None);
check_unescape(r"Hello\xCADRust", None);
}
#[test]
fn tokenize_numeric_prefix_trait() {
#[derive(Debug)]
struct NumericPrefixDialect;
impl Dialect for NumericPrefixDialect {
fn is_identifier_start(&self, ch: char) -> bool {
ch.is_ascii_lowercase()
|| ch.is_ascii_uppercase()
|| ch.is_ascii_digit()
|| ch == '$'
}
fn is_identifier_part(&self, ch: char) -> bool {
ch.is_ascii_lowercase()
|| ch.is_ascii_uppercase()
|| ch.is_ascii_digit()
|| ch == '_'
|| ch == '$'
|| ch == '{'
|| ch == '}'
}
fn supports_numeric_prefix(&self) -> bool {
true
}
}
tokenize_numeric_prefix_inner(&NumericPrefixDialect {});
tokenize_numeric_prefix_inner(&HiveDialect {});
tokenize_numeric_prefix_inner(&MySqlDialect {});
}
fn tokenize_numeric_prefix_inner(dialect: &dyn Dialect) {
let sql = r#"SELECT * FROM 1"#;
let tokens = Tokenizer::new(dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mul,
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1"), false),
];
compare(expected, tokens);
}
#[test]
fn tokenize_quoted_string_escape() {
let dialect = SnowflakeDialect {};
for (sql, expected, expected_unescaped) in [
(r#"'%a\'%b'"#, r#"%a\'%b"#, r#"%a'%b"#),
(r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#, r#"a''b'c'd"#),
(r#"'\\'"#, r#"\\"#, r#"\"#),
(
r#"'\0\a\b\f\n\r\t\Z'"#,
r#"\0\a\b\f\n\r\t\Z"#,
"\0\u{7}\u{8}\u{c}\n\r\t\u{1a}",
),
(r#"'\"'"#, r#"\""#, "\""),
(r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#, r#"\a\b'c"#),
(r#"'\'abcd'"#, r#"\'abcd"#, r#"'abcd"#),
(r#"'''a''b'"#, r#"''a''b"#, r#"'a'b"#),
(r#"'\q'"#, r#"\q"#, r#"q"#),
(r#"'\%\_'"#, r#"\%\_"#, r#"%_"#),
(r#"'\\%\\_'"#, r#"\\%\\_"#, r#"\%\_"#),
] {
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![Token::SingleQuotedString(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [r#"'\'"#, r#"'ab\'"#] {
let mut tokenizer = Tokenizer::new(&dialect, sql);
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
// Non-escape dialect
for (sql, expected) in [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)] {
let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
}
// MySQL special case for LIKE escapes
for (sql, expected) in [(r#"'\%'"#, r#"\%"#), (r#"'\_'"#, r#"\_"#)] {
let dialect = MySqlDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString(expected.to_string())];
compare(expected, tokens);
}
}
#[test]
fn tokenize_triple_quoted_string() {
fn check<F>(
q: char, // The quote character to test
r: char, // An alternate quote character.
quote_token: F,
) where
F: Fn(String) -> Token,
{
let dialect = BigQueryDialect {};
for (sql, expected, expected_unescaped) in [
// Empty string
(format!(r#"{q}{q}{q}{q}{q}{q}"#), "".into(), "".into()),
// Should not count escaped quote as end of string.
(
format!(r#"{q}{q}{q}ab{q}{q}\{q}{q}cd{q}{q}{q}"#),
format!(r#"ab{q}{q}\{q}{q}cd"#),
format!(r#"ab{q}{q}{q}{q}cd"#),
),
// Simple string
(
format!(r#"{q}{q}{q}abc{q}{q}{q}"#),
"abc".into(),
"abc".into(),
),
// Mix single-double quotes unescaped.
(
format!(r#"{q}{q}{q}ab{r}{r}{r}c{r}def{r}{r}{r}{q}{q}{q}"#),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
format!("ab{r}{r}{r}c{r}def{r}{r}{r}"),
),
// Escaped quote.
(
format!(r#"{q}{q}{q}ab{q}{q}c{q}{q}\{q}de{q}{q}f{q}{q}{q}"#),
format!(r#"ab{q}{q}c{q}{q}\{q}de{q}{q}f"#),
format!(r#"ab{q}{q}c{q}{q}{q}de{q}{q}f"#),
),
// backslash-escaped quote characters.
(
format!(r#"{q}{q}{q}a\'\'b\'c\'d{q}{q}{q}"#),
r#"a\'\'b\'c\'d"#.into(),
r#"a''b'c'd"#.into(),
),
// backslash-escaped characters
(
format!(r#"{q}{q}{q}abc\0\n\rdef{q}{q}{q}"#),
r#"abc\0\n\rdef"#.into(),
"abc\0\n\rdef".into(),
),
] {
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(false)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected.to_string())];
compare(expected, tokens);
let tokens = Tokenizer::new(&dialect, sql.as_str())
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![quote_token(expected_unescaped.to_string())];
compare(expected, tokens);
}
for sql in [
format!(r#"{q}{q}{q}{q}{q}\{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}\{q}"#),
format!(r#"{q}{q}{q}{q}"#),
format!(r#"{q}{q}{q}{r}{r}"#),
format!(r#"{q}{q}{q}abc{q}"#),
format!(r#"{q}{q}{q}abc{q}{q}"#),
format!(r#"{q}{q}{q}abc"#),
] {
let dialect = BigQueryDialect {};
let mut tokenizer = Tokenizer::new(&dialect, sql.as_str());
assert_eq!(
"Unterminated string literal",
tokenizer.tokenize().unwrap_err().message.as_str(),
);
}
}
check('"', '\'', Token::TripleDoubleQuotedString);
check('\'', '"', Token::TripleSingleQuotedString);
let dialect = BigQueryDialect {};
let sql = r#"""''"#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::DoubleQuotedString("".to_string()),
Token::SingleQuotedString("".to_string()),
];
compare(expected, tokens);
let sql = r#"''"""#;
let tokens = Tokenizer::new(&dialect, sql)
.with_unescape(true)
.tokenize()
.unwrap();
let expected = vec![
Token::SingleQuotedString("".to_string()),
Token::DoubleQuotedString("".to_string()),
];
compare(expected, tokens);
// Non-triple quoted string dialect
let dialect = SnowflakeDialect {};
let sql = r#"''''''"#;
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![Token::SingleQuotedString("''".to_string())];
compare(expected, tokens);
}
#[test]
fn test_mysql_users_grantees() {
let dialect = MySqlDialect {};
let sql = "CREATE USER `root`@`%`";
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("CREATE"),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("USER"),
Token::Whitespace(Whitespace::Space),
Token::make_word("root", Some('`')),
Token::AtSign,
Token::make_word("%", Some('`')),
];
compare(expected, tokens);
}
#[test]
fn test_postgres_abs_without_space_and_string_literal() {
let dialect = MySqlDialect {};
let sql = "SELECT @'1'";
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::AtSign,
Token::SingleQuotedString("1".to_string()),
];
compare(expected, tokens);
}
#[test]
fn test_postgres_abs_without_space_and_quoted_column() {
let dialect = MySqlDialect {};
let sql = r#"SELECT @"bar" FROM foo"#;
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::AtSign,
Token::DoubleQuotedString("bar".to_string()),
Token::Whitespace(Whitespace::Space),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::make_word("foo", None),
];
compare(expected, tokens);
}
#[test]
fn test_national_strings_backslash_escape_not_supported() {
all_dialects_where(|dialect| !dialect.supports_string_literal_backslash_escape())
.tokenizes_to(
"select n'''''\\'",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::NationalStringLiteral("''\\".to_string()),
],
);
}
#[test]
fn test_national_strings_backslash_escape_supported() {
all_dialects_where(|dialect| dialect.supports_string_literal_backslash_escape())
.tokenizes_to(
"select n'''''\\''",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::NationalStringLiteral("'''".to_string()),
],
);
}
#[test]
fn test_string_escape_constant_not_supported() {
all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
"select e'...'",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::make_word("e", None),
Token::SingleQuotedString("...".to_string()),
],
);
all_dialects_where(|dialect| !dialect.supports_string_escape_constant()).tokenizes_to(
"select E'...'",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::make_word("E", None),
Token::SingleQuotedString("...".to_string()),
],
);
}
#[test]
fn test_string_escape_constant_supported() {
all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
"select e'\\''",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::EscapedStringLiteral("'".to_string()),
],
);
all_dialects_where(|dialect| dialect.supports_string_escape_constant()).tokenizes_to(
"select E'\\''",
vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::EscapedStringLiteral("'".to_string()),
],
);
}
#[test]
fn test_whitespace_required_after_single_line_comment() {
all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT --'abc'",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Minus,
Token::Minus,
Token::SingleQuotedString("abc".to_string()),
],
);
all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT -- 'abc'",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: " 'abc'".to_string(),
}),
],
);
all_dialects_where(|dialect| dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT --",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Minus,
Token::Minus,
],
);
}
#[test]
fn test_whitespace_not_required_after_single_line_comment() {
all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT --'abc'",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "'abc'".to_string(),
}),
],
);
all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT -- 'abc'",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: " 'abc'".to_string(),
}),
],
);
all_dialects_where(|dialect| !dialect.requires_single_line_comment_whitespace())
.tokenizes_to(
"SELECT --",
vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::SingleLineComment {
prefix: "--".to_string(),
comment: "".to_string(),
}),
],
);
}
#[test]
fn test_tokenize_identifiers_numeric_prefix() {
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
.tokenizes_to("123abc", vec![Token::make_word("123abc", None)]);
all_dialects_where(|dialect| dialect.supports_numeric_prefix())
.tokenizes_to("12e34", vec![Token::Number("12e34".to_string(), false)]);
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
"t.12e34",
vec![
Token::make_word("t", None),
Token::Period,
Token::make_word("12e34", None),
],
);
all_dialects_where(|dialect| dialect.supports_numeric_prefix()).tokenizes_to(
"t.1two3",
vec![
Token::make_word("t", None),
Token::Period,
Token::make_word("1two3", None),
],
);
}
#[test]
fn tokenize_period_underscore() {
let sql = String::from("SELECT table._col");
// a dialect that supports underscores in numeric literals
let dialect = PostgreSqlDialect {};
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Word(Word {
value: "table".to_string(),
quote_style: None,
keyword: Keyword::TABLE,
}),
Token::Period,
Token::Word(Word {
value: "_col".to_string(),
quote_style: None,
keyword: Keyword::NoKeyword,
}),
];
compare(expected, tokens);
let sql = String::from("SELECT ._123");
if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
}
let sql = String::from("SELECT ._abc");
if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() {
panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}");
}
}
}