simplify lexer and tokenization by removing literal values (#5)

Author: Josh Thomas (committed by GitHub)
Date:   2024-10-13 17:14:15 -05:00
Commit: 03c0c19dd9
Parent: 433c17999c

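Summary: `Token` no longer carries a separate `literal: Option<String>` payload; a token's text is always the lexeme sliced from the source, so the `fmt::Display` impl that printed the literal is deleted. Whitespace gets its own `TokenType::Whitespace`, scanned by a simplified `handle_whitespace`, and is filtered out in `add_token`. Every `handle_*` helper now returns `Result<TokenType, LexerError>`, and a new `UnexpectedCharacter` error variant covers characters that `handle_single_char` does not recognize.

The sketch below is not part of the commit; it only illustrates how the trimmed-down API reads after this change, assuming the items visible in the diff (`Lexer::new`, `tokenize`, and `Token` deriving `Debug`). The template string is an arbitrary example input.

    fn main() {
        // Entry points as they appear in the diff below.
        let mut lexer = Lexer::new("{% if user.is_staff %}".to_string());
        match lexer.tokenize() {
            Ok(tokens) => {
                // Each token now carries only (token_type, lexeme, line);
                // whitespace runs were lexed as TokenType::Whitespace and
                // already dropped inside add_token.
                for token in &tokens {
                    println!("{:?}", token);
                }
            }
            // LexerError implements Display, so "{}" works here.
            Err(e) => eprintln!("lex error: {}", e),
        }
    }

Since every lexeme can be re-sliced from `self.source[self.start..self.current]` at the moment the token is added, the duplicate `literal` copy of that text had become redundant; removing it is what lets each `handle_*` method shrink to returning a bare `TokenType`.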

@@ -38,6 +38,7 @@ pub enum TokenType {
     DoubleEqual, // ==
     LeftAngleEqual, // <=
     RightAngleEqual, // >=
+    Whitespace, // special token to account for whitespace
     Text,
     Eof,
 }
@@ -46,36 +47,19 @@ pub enum TokenType {
 pub struct Token {
     token_type: TokenType,
     lexeme: String,
-    literal: Option<String>,
     line: usize,
 }
 
 impl Token {
-    fn new(token_type: TokenType, lexeme: String, literal: Option<String>, line: usize) -> Self {
+    fn new(token_type: TokenType, lexeme: String, line: usize) -> Self {
         Token {
             token_type,
             lexeme,
-            literal,
             line,
         }
     }
 }
 
-impl fmt::Display for Token {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{}{}",
-            self.lexeme,
-            if let Some(literal) = &self.literal {
-                format!(" ({})", literal)
-            } else {
-                String::new()
-            }
-        )
-    }
-}
-
 pub trait Tokenizer: Scanner {
     type Token: Debug;
     type TokenType: Debug;
@@ -83,18 +67,22 @@ pub trait Tokenizer: Scanner {
     fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
     fn scan_token(&mut self) -> Result<(), Self::Error>;
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
+    fn add_token(&mut self, token_type: Self::TokenType);
 }
 
 #[derive(Debug)]
 pub enum LexerError {
     EmptyToken(usize),
+    UnexpectedCharacter(char, usize),
 }
 
 impl fmt::Display for LexerError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
+            LexerError::UnexpectedCharacter(c, line) => {
+                write!(f, "Unexpected character '{}' at line {}", c, line)
+            }
         }
     }
 }
@@ -123,21 +111,10 @@ impl Lexer {
     fn scan_token(&mut self) -> Result<(), LexerError> {
         let c = self.advance();
-        let (token_type, literal) = match c {
-            '(' => (TokenType::LeftParen, None),
-            ')' => (TokenType::RightParen, None),
-            '[' => (TokenType::LeftBracket, None),
-            ']' => (TokenType::RightBracket, None),
-            ',' => (TokenType::Comma, None),
-            '.' => (TokenType::Dot, None),
-            '-' => (TokenType::Minus, None),
-            '+' => (TokenType::Plus, None),
-            ':' => (TokenType::Colon, None),
-            ';' => (TokenType::Semicolon, None),
-            '*' => (TokenType::Star, None),
-            '|' => (TokenType::Pipe, None),
-            '\'' => (TokenType::SingleQuote, None),
-            '"' => (TokenType::DoubleQuote, None),
+        let token_type = match c {
+            '(' | ')' | '[' | ']' | ',' | '.' | '-' | '+' | ':' | ';' | '*' | '|' | '\'' | '"' => {
+                self.handle_single_char(c)
+            }
             '{' => self.handle_left_brace(),
             '}' => self.handle_right_brace(),
             '%' => self.handle_percent(),
@@ -147,18 +124,37 @@ impl Lexer {
             '<' => self.handle_left_angle(),
             '>' => self.handle_right_angle(),
             '/' => self.handle_slash(),
-            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
-            _ => self.handle_text()?,
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(),
+            _ => self.handle_text(),
         };
-        if token_type != TokenType::Text || literal.is_some() {
-            self.add_token(token_type, literal);
-        }
+        self.add_token(token_type?);
         Ok(())
     }
 
-    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_single_char(&mut self, c: char) -> Result<TokenType, LexerError> {
+        let token_type = match c {
+            '(' => TokenType::LeftParen,
+            ')' => TokenType::RightParen,
+            '[' => TokenType::LeftBracket,
+            ']' => TokenType::RightBracket,
+            ',' => TokenType::Comma,
+            '.' => TokenType::Dot,
+            '-' => TokenType::Minus,
+            '+' => TokenType::Plus,
+            ':' => TokenType::Colon,
+            ';' => TokenType::Semicolon,
+            '*' => TokenType::Star,
+            '|' => TokenType::Pipe,
+            '\'' => TokenType::SingleQuote,
+            '"' => TokenType::DoubleQuote,
+            _ => return Err(LexerError::UnexpectedCharacter(c, self.line)),
+        };
+        Ok(token_type)
+    }
+
+    fn handle_left_brace(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('{') {
             TokenType::DoubleLeftBrace
         } else if self.advance_if_matches('%') {
@@ -168,138 +164,102 @@ impl Lexer {
         } else {
             TokenType::LeftBrace
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_brace(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::DoubleRightBrace
         } else {
             TokenType::RightBrace
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
+    fn handle_percent(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::PercentRightBrace
         } else {
             TokenType::Percent
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
+    fn handle_hash(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::HashRightBrace
         } else {
             TokenType::Hash
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
+    fn handle_bang(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::BangEqual
         } else {
             TokenType::Bang
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
+    fn handle_equal(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::DoubleEqual
         } else {
             TokenType::Equal
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_left_angle(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::LeftAngleEqual
         } else {
             TokenType::LeftAngle
         };
-        (token_type, None)
+        Ok(token_type)
    }
 
-    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_angle(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::RightAngleEqual
         } else {
             TokenType::RightAngle
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
-        if self.advance_if_matches('/') {
-            let start = self.current - 2;
+    fn handle_slash(&mut self) -> Result<TokenType, LexerError> {
+        let token_type = if self.advance_if_matches('/') {
             while self.peek() != '\n' && !self.is_at_end() {
                 self.advance();
             }
-            let comment = self.source[start..self.current].to_string();
-            (TokenType::Text, Some(comment))
+            TokenType::Text
         } else {
-            (TokenType::Slash, None)
-        }
+            TokenType::Slash
+        };
+        Ok(token_type)
     }
 
-    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
-        if c == '\n' {
-            self.line += 1;
-        }
-        (TokenType::Text, None)
-    }
-
-    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
-        while !self.is_at_end() && !self.is_delimiter(self.peek()) {
+    fn handle_whitespace(&mut self) -> Result<TokenType, LexerError> {
+        while !self.is_at_end() && self.peek().is_whitespace() {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
             self.advance();
         }
-        let text = self.source[self.start..self.current].to_string();
-        if text.is_empty() {
-            Err(LexerError::EmptyToken(self.line))
-        } else {
-            Ok((TokenType::Text, Some(text)))
-        }
+        Ok(TokenType::Whitespace)
     }
 
-    fn is_delimiter(&self, c: char) -> bool {
-        matches!(
-            c,
-            '(' | ')'
-                | '['
-                | ']'
-                | '{'
-                | '}'
-                | ','
-                | '.'
-                | '-'
-                | '+'
-                | ':'
-                | ';'
-                | '*'
-                | '|'
-                | '%'
-                | '#'
-                | '!'
-                | '='
-                | '<'
-                | '>'
-                | '/'
-                | ' '
-                | '\r'
-                | '\t'
-                | '\n'
-                | '"'
-                | '\''
-        )
+    fn handle_text(&mut self) -> Result<TokenType, LexerError> {
+        self.advance_while(|c| !Self::is_token_boundary(c));
+        if self.start == self.current {
+            Err(LexerError::EmptyToken(self.line))
+        } else {
+            Ok(TokenType::Text)
+        }
     }
 
     fn advance_if_matches(&mut self, expected: char) -> bool {
@@ -310,6 +270,27 @@ impl Lexer {
             true
         }
     }
+
+    fn advance_while<F>(&mut self, condition: F)
+    where
+        F: Fn(char) -> bool,
+    {
+        while !self.is_at_end() && condition(self.peek()) {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+            self.advance();
+        }
+    }
+
+    fn is_token_boundary(c: char) -> bool {
+        const TOKEN_BOUNDARIES: &[char] = &[
+            '(', ')', '[', ']', '{', '}', ',', '.', '-', '+', ':', ';', '*', '|', '%', '#', '!',
+            '=', '<', '>', '/', ' ', '\r', '\t', '\n', '"', '\'',
+        ];
+        TOKEN_BOUNDARIES.contains(&c)
+    }
 }
 
 impl Tokenizer for Lexer {
@@ -324,7 +305,7 @@ impl Tokenizer for Lexer {
         }
         self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+            .push(Token::new(TokenType::Eof, String::new(), self.line));
         Ok(self.tokens.clone())
     }
@@ -332,10 +313,11 @@ impl Tokenizer for Lexer {
         self.scan_token()
     }
 
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
+    fn add_token(&mut self, token_type: Self::TokenType) {
         let text = self.source[self.start..self.current].to_string();
-        self.tokens
-            .push(Token::new(token_type, text, literal, self.line));
+        if token_type != TokenType::Whitespace {
+            self.tokens.push(Token::new(token_type, text, self.line));
+        }
     }
 }
@@ -368,7 +350,14 @@ mod tests {
     fn tokenize(input: &str) -> Vec<Token> {
         let mut lexer = Lexer::new(input.to_string());
         match lexer.tokenize() {
-            Ok(tokens) => tokens,
+            Ok(tokens) => {
+                // Debug print all tokens
+                for (i, token) in tokens.iter().enumerate() {
+                    println!("{}: {:?}", i, token);
+                }
+                tokens
+            }
             Err(e) => {
                 eprintln!("Tokenization error: {:?}", e);
                 eprintln!("Input that caused the error: {}", input);