Rework keyword/identifier parsing (1/8)

Fold Token::{Keyword, Identifier, DoubleQuotedString} into one
Token::SQLWord, which has the necessary information (was it a
known keyword and/or was it quoted).

This lets the parser easily accept DoubleQuotedString (a quoted
identifier) everywhere it expects an Identifier in the same match
arm. (To complete support of quoted identifiers, or "delimited
identifiers" as the spec calls them, a TODO in parse_tablename()
ought to be addressed.)

    As an aside, per <https://en.wikibooks.org/wiki/SQL_Dialects_Reference/Data_structure_definition/Delimited_identifiers>
    sqlite seems to be the only one supporting 'identifier'
    (which is rather hairy, since it can also be a string
    literal), and `identifier` seems only to be supported by
    MySQL. I didn't implement either one.

This also allows the use of `parse`/`expect_keyword` machinery
for non-reserved keywords: previously they relied on the keyword
being a Token::Keyword, which wasn't a Token::Identifier, and so
wasn't accepted as one.

Now whether a keyword can be used as an identifier can be decided
by the parser. (I didn't add a blacklist of "reserved" keywords,
so that any keyword which doesn't have a special meaning in the
parser could be used as an identifier. The list of keywords in
the dialect could be re-used for that purpose at a later stage.)
This commit is contained in:
Nickolay Ponomarev 2019-01-12 19:45:08 +03:00
parent eb4b5bc686
commit 9a8b6a8e64
7 changed files with 164 additions and 131 deletions

View file

@ -26,18 +26,14 @@ use super::dialect::Dialect;
/// SQL Token enumeration
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
/// SQL identifier e.g. table or column name
Identifier(String),
/// SQL keyword e.g. Keyword("SELECT")
Keyword(String),
/// A keyword (like SELECT) or an optionally quoted SQL identifier
SQLWord(SQLWord),
/// Numeric literal
Number(String),
/// A character that could not be tokenized
Char(char),
/// Single quoted string: i.e: 'string'
SingleQuotedString(String),
/// Double quoted string: i.e: "string"
DoubleQuotedString(String),
/// Comma
Comma,
/// Whitespace (space, tab, etc)
@ -93,12 +89,10 @@ pub enum Token {
impl ToString for Token {
fn to_string(&self) -> String {
match self {
Token::Identifier(ref id) => id.to_string(),
Token::Keyword(ref k) => k.to_string(),
Token::SQLWord(ref w) => w.to_string(),
Token::Number(ref n) => n.to_string(),
Token::Char(ref c) => c.to_string(),
Token::SingleQuotedString(ref s) => format!("'{}'", s),
Token::DoubleQuotedString(ref s) => format!("\"{}\"", s),
Token::Comma => ",".to_string(),
Token::Whitespace(ws) => ws.to_string(),
Token::Eq => "=".to_string(),
@ -128,6 +122,49 @@ impl ToString for Token {
}
}
impl Token {
pub fn make_keyword(keyword: &str) -> Self {
Token::SQLWord(SQLWord {
value: keyword.to_string(),
quote_style: None,
keyword: keyword.to_uppercase().to_string(),
})
}
pub fn make_word(word: &str, quote_style: Option<char>) -> Self {
Token::SQLWord(SQLWord {
value: word.to_string(),
quote_style: quote_style,
keyword: "".to_string(),
})
}
}
/// A keyword (like SELECT) or an optionally quoted SQL identifier
#[derive(Debug, Clone, PartialEq)]
pub struct SQLWord {
/// The value of the token, without the enclosing quotes, and with the
/// escape sequences (if any) processed (TODO: escapes are not handled)
pub value: String,
/// An identifier can be "quoted" (&lt;delimited identifier> in ANSI parlance).
/// The standard and most implementations allow using double quotes for this,
/// but some implementations support other quoting styles as well (e.g. \[MS SQL])
pub quote_style: Option<char>,
/// If the word was not quoted and it matched one of the known keywords,
/// this will have one of the values from dialect::keywords, otherwise empty
pub keyword: String,
}
impl ToString for SQLWord {
fn to_string(&self) -> String {
match self.quote_style {
Some('"') => format!("\"{}\"", self.value),
Some('[') => format!("[{}]", self.value),
None => self.value.clone(),
_ => panic!("Unexpected quote_style!"),
}
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum Whitespace {
Space,
@ -189,11 +226,10 @@ impl<'a> Tokenizer<'a> {
}
Token::Whitespace(Whitespace::Tab) => self.col += 4,
Token::Identifier(s) => self.col += s.len() as u64,
Token::Keyword(s) => self.col += s.len() as u64,
Token::SQLWord(w) if w.quote_style == None => self.col += w.value.len() as u64,
Token::SQLWord(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
Token::Number(s) => self.col += s.len() as u64,
Token::SingleQuotedString(s) => self.col += s.len() as u64,
Token::DoubleQuotedString(s) => self.col += s.len() as u64,
_ => self.col += 1,
}
@ -234,14 +270,19 @@ impl<'a> Tokenizer<'a> {
}
let upper_str = s.to_uppercase();
if self.is_keyword(upper_str.as_str()) {
Ok(Some(Token::Keyword(upper_str)))
Ok(Some(Token::SQLWord(SQLWord {
value: s,
quote_style: None,
keyword: upper_str,
})))
} else {
Ok(Some(Token::Identifier(s)))
Ok(Some(Token::make_word(&s, None)))
}
}
// string
'\'' => {
//TODO: handle escaped quotes in string
//TODO: handle newlines in string
//TODO: handle EOF before terminating quote
let mut s = String::new();
chars.next(); // consume
@ -275,7 +316,7 @@ impl<'a> Tokenizer<'a> {
}
}
}
Ok(Some(Token::DoubleQuotedString(s)))
Ok(Some(Token::make_word(&s, Some('"'))))
}
// numbers
'0'...'9' => {
@ -389,7 +430,7 @@ mod tests {
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
];
@ -405,9 +446,9 @@ mod tests {
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Identifier(String::from("sqrt")),
Token::make_word("sqrt", None),
Token::LParen,
Token::Number(String::from("1")),
Token::RParen,
@ -424,23 +465,23 @@ mod tests {
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::Keyword(String::from("FROM")),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::Identifier(String::from("customer")),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::Keyword(String::from("WHERE")),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::Identifier(String::from("id")),
Token::make_word("id", None),
Token::Whitespace(Whitespace::Space),
Token::Eq,
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("1")),
Token::Whitespace(Whitespace::Space),
Token::Keyword(String::from("LIMIT")),
Token::make_keyword("LIMIT"),
Token::Whitespace(Whitespace::Space),
Token::Number(String::from("5")),
];
@ -456,17 +497,17 @@ mod tests {
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Keyword(String::from("SELECT")),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::Keyword(String::from("FROM")),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::Identifier(String::from("customer")),
Token::make_word("customer", None),
Token::Whitespace(Whitespace::Space),
Token::Keyword(String::from("WHERE")),
Token::make_keyword("WHERE"),
Token::Whitespace(Whitespace::Space),
Token::Identifier(String::from("salary")),
Token::make_word("salary", None),
Token::Whitespace(Whitespace::Space),
Token::Neq,
Token::Whitespace(Whitespace::Space),
@ -491,7 +532,7 @@ mod tests {
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::Identifier("h".to_string()),
Token::make_word("h", None),
];
compare(expected, tokens);
}
@ -507,20 +548,20 @@ mod tests {
let expected = vec![
Token::Whitespace(Whitespace::Newline),
Token::Whitespace(Whitespace::Newline),
Token::Keyword("SELECT".into()),
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Mult,
Token::Whitespace(Whitespace::Space),
Token::Keyword("FROM".into()),
Token::make_keyword("FROM"),
Token::Whitespace(Whitespace::Space),
Token::Keyword("TABLE".into()),
Token::make_keyword("table"),
Token::Whitespace(Whitespace::Tab),
Token::Char('م'),
Token::Char('ص'),
Token::Char('ط'),
Token::Char('ف'),
Token::Char('ى'),
Token::Identifier("h".to_string()),
Token::make_word("h", None),
];
compare(expected, tokens);
}
@ -533,11 +574,11 @@ mod tests {
let tokens = tokenizer.tokenize().unwrap();
let expected = vec![
Token::Identifier(String::from("a")),
Token::make_word("a", None),
Token::Whitespace(Whitespace::Space),
Token::Keyword("IS".to_string()),
Token::make_keyword("IS"),
Token::Whitespace(Whitespace::Space),
Token::Keyword("NULL".to_string()),
Token::make_keyword("NULL"),
];
compare(expected, tokens);