refactor lexer to support future parsing (#4)

Josh Thomas 2024-10-13 15:07:22 -05:00 committed by GitHub
parent 784252e6c7
commit 583b151c07
3 changed files with 218 additions and 139 deletions

src/lexer.rs

@@ -1,7 +1,10 @@
 use std::fmt;
+use std::fmt::Debug;
+
+use crate::scanner::Scanner;
 
 #[derive(Debug, Clone, PartialEq)]
-enum TokenType {
+pub enum TokenType {
     LeftParen,   // (
     RightParen,  // )
     LeftBrace,   // {
@@ -40,7 +43,7 @@ enum TokenType {
 }
 
 #[derive(Debug, Clone)]
-struct Token {
+pub struct Token {
     token_type: TokenType,
     lexeme: String,
     literal: Option<String>,
@@ -73,10 +76,31 @@ impl fmt::Display for Token {
     }
 }
 
-trait Tokenizer<T> {
-    fn tokenize(&mut self) -> Vec<T>;
+pub trait Tokenizer: Scanner {
+    type Token: Debug;
+    type TokenType: Debug;
+    type Error: std::error::Error;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
+    fn scan_token(&mut self) -> Result<(), Self::Error>;
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
 }
 
+#[derive(Debug)]
+pub enum LexerError {
+    EmptyToken(usize),
+}
+
+impl fmt::Display for LexerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
+        }
+    }
+}
+
+impl std::error::Error for LexerError {}
+
 pub struct Lexer {
     source: String,
     tokens: Vec<Token>,
@@ -96,24 +120,45 @@ impl Lexer {
         }
     }
 
-    fn scan_token(&mut self) {
+    fn scan_token(&mut self) -> Result<(), LexerError> {
         let c = self.advance();
-        match c {
-            '(' => self.add_token(TokenType::LeftParen),
-            ')' => self.add_token(TokenType::RightParen),
-            '[' => self.add_token(TokenType::LeftBracket),
-            ']' => self.add_token(TokenType::RightBracket),
-            ',' => self.add_token(TokenType::Comma),
-            '.' => self.add_token(TokenType::Dot),
-            '-' => self.add_token(TokenType::Minus),
-            '+' => self.add_token(TokenType::Plus),
-            ':' => self.add_token(TokenType::Colon),
-            ';' => self.add_token(TokenType::Semicolon),
-            '*' => self.add_token(TokenType::Star),
-            '|' => self.add_token(TokenType::Pipe),
-            '\'' => self.add_token(TokenType::SingleQuote),
-            '"' => self.add_token(TokenType::DoubleQuote),
-            '{' => {
+
+        let (token_type, literal) = match c {
+            '(' => (TokenType::LeftParen, None),
+            ')' => (TokenType::RightParen, None),
+            '[' => (TokenType::LeftBracket, None),
+            ']' => (TokenType::RightBracket, None),
+            ',' => (TokenType::Comma, None),
+            '.' => (TokenType::Dot, None),
+            '-' => (TokenType::Minus, None),
+            '+' => (TokenType::Plus, None),
+            ':' => (TokenType::Colon, None),
+            ';' => (TokenType::Semicolon, None),
+            '*' => (TokenType::Star, None),
+            '|' => (TokenType::Pipe, None),
+            '\'' => (TokenType::SingleQuote, None),
+            '"' => (TokenType::DoubleQuote, None),
+            '{' => self.handle_left_brace(),
+            '}' => self.handle_right_brace(),
+            '%' => self.handle_percent(),
+            '#' => self.handle_hash(),
+            '!' => self.handle_bang(),
+            '=' => self.handle_equal(),
+            '<' => self.handle_left_angle(),
+            '>' => self.handle_right_angle(),
+            '/' => self.handle_slash(),
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
+            _ => self.handle_text()?,
+        };
+
+        if token_type != TokenType::Text || literal.is_some() {
+            self.add_token(token_type, literal);
+        }
+
+        Ok(())
+    }
+
+    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('{') {
             TokenType::DoubleLeftBrace
         } else if self.match_char('%') {
@@ -123,80 +168,93 @@ impl Lexer {
         } else {
             TokenType::LeftBrace
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '}' => {
+
+    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::DoubleRightBrace
         } else {
             TokenType::RightBrace
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '%' => {
+
+    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::PercentRightBrace
         } else {
             TokenType::Percent
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '#' => {
+
+    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::HashRightBrace
         } else {
             TokenType::Hash
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '!' => {
+
+    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::BangEqual
         } else {
             TokenType::Bang
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '=' => {
+
+    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::DoubleEqual
         } else {
             TokenType::Equal
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '<' => {
+
+    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::LeftAngleEqual
         } else {
             TokenType::LeftAngle
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '>' => {
+
+    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::RightAngleEqual
         } else {
             TokenType::RightAngle
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '/' => {
+
+    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
         if self.match_char('/') {
+            let start = self.current - 2;
             while self.peek() != '\n' && !self.is_at_end() {
                 self.advance();
             }
+            let comment = self.source[start..self.current].to_string();
+            (TokenType::Text, Some(comment))
         } else {
-            self.add_token(TokenType::Slash);
+            (TokenType::Slash, None)
         }
     }
-    ' ' | '\r' | '\t' => {}
-    '\n' => self.line += 1,
-    _ => self.text(),
-        }
-    }
 
-    fn text(&mut self) {
+    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
+        if c == '\n' {
+            self.line += 1;
+        }
+        (TokenType::Text, None)
+    }
+
+    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
         while !self.is_at_end() && !self.is_delimiter(self.peek()) {
             if self.peek() == '\n' {
                 self.line += 1;
@@ -205,8 +263,10 @@ impl Lexer {
         }
 
         let text = self.source[self.start..self.current].to_string();
-        if !text.is_empty() {
-            self.add_token_literal(TokenType::Text, Some(text));
+        if text.is_empty() {
+            Err(LexerError::EmptyToken(self.line))
+        } else {
+            Ok((TokenType::Text, Some(text)))
         }
     }
@@ -246,59 +306,61 @@ impl Lexer {
         if self.is_at_end() {
             return false;
         }
-        if self.source.chars().nth(self.current) != Some(expected) {
+        if self.peek() != expected {
             return false;
         }
         self.current += 1;
         true
     }
-
-    fn peek(&self) -> char {
-        self.peek_ahead(0)
-    }
-
-    fn peek_next(&self) -> char {
-        self.peek_ahead(1)
-    }
-
-    fn peek_ahead(&self, offset: usize) -> char {
-        self.source
-            .chars()
-            .nth(self.current + offset)
-            .unwrap_or('\0')
-    }
-
-    fn is_at_end(&self) -> bool {
-        self.current >= self.source.len()
-    }
-
-    fn advance(&mut self) -> char {
-        let current_char = self.source.chars().nth(self.current).unwrap_or('\0');
-        self.current += 1;
-        current_char
-    }
-
-    fn add_token(&mut self, token_type: TokenType) {
-        self.add_token_literal(token_type, None);
-    }
-
-    fn add_token_literal(&mut self, token_type: TokenType, literal: Option<String>) {
+}
+
+impl Tokenizer for Lexer {
+    type Token = Token;
+    type TokenType = TokenType;
+    type Error = LexerError;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token()?;
+        }
+        self.tokens
+            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+        Ok(self.tokens.clone())
+    }
+
+    fn scan_token(&mut self) -> Result<(), LexerError> {
+        self.scan_token()
+    }
+
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
         let text = self.source[self.start..self.current].to_string();
         self.tokens
             .push(Token::new(token_type, text, literal, self.line));
     }
 }
 
-impl Tokenizer<Token> for Lexer {
-    fn tokenize(&mut self) -> Vec<Token> {
-        while !self.is_at_end() {
-            self.start = self.current;
-            self.scan_token();
-        }
-        self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
-        self.tokens.clone()
-    }
-}
+impl Scanner for Lexer {
+    type Item = char;
+
+    fn advance(&mut self) -> Self::Item {
+        let current_char = self.peek();
+        self.current += 1;
+        current_char
+    }
+
+    fn peek(&self) -> Self::Item {
+        self.source.chars().nth(self.current).unwrap_or('\0')
+    }
+
+    fn peek_next(&self) -> Self::Item {
+        self.source.chars().nth(self.current + 1).unwrap_or('\0')
+    }
+
+    fn is_at_end(&self) -> bool {
+        self.current >= self.source.len()
+    }
+}
@@ -308,7 +370,14 @@ mod tests {
     fn tokenize(input: &str) -> Vec<Token> {
         let mut lexer = Lexer::new(input.to_string());
-        lexer.tokenize()
+        match lexer.tokenize() {
+            Ok(tokens) => tokens,
+            Err(e) => {
+                eprintln!("Tokenization error: {:?}", e);
+                eprintln!("Input that caused the error: {}", input);
+                panic!("Tokenization failed. See error output above.");
+            }
+        }
     }
 
     #[test]

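A note on consuming the refactored interface: `tokenize` now returns `Result<Vec<Token>, LexerError>` instead of `Vec<Token>`, and the `Tokenizer` trait must be in scope for the method to resolve. A minimal sketch of a caller inside the crate (the `lex_template` helper is illustrative only, not part of this commit):

    use crate::lexer::{Lexer, LexerError, Tokenizer};

    // Hypothetical caller of the refactored API; `Token` implements
    // fmt::Display (see the diff above), so tokens print directly.
    fn lex_template(source: &str) -> Result<(), LexerError> {
        let mut lexer = Lexer::new(source.to_string());
        let tokens = lexer.tokenize()?; // propagates LexerError::EmptyToken
        for token in &tokens {
            println!("{}", token);
        }
        Ok(())
    }
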

@@ -1,2 +1,2 @@
 mod lexer;
+mod scanner;

src/scanner.rs (new file, 10 additions)

@@ -0,0 +1,10 @@
+use std::fmt::Debug;
+
+pub trait Scanner {
+    type Item: Debug;
+
+    fn advance(&mut self) -> Self::Item;
+    fn peek(&self) -> Self::Item;
+    fn peek_next(&self) -> Self::Item;
+    fn is_at_end(&self) -> bool;
+}
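
Pulling the cursor primitives into their own trait is what makes the "future parsing" in the title possible: anything that tracks a position over a sequence can implement `Scanner` and reuse the same advance/peek discipline. A rough sketch of a second implementor (hypothetical; nothing like it exists in this commit):

    use crate::scanner::Scanner;

    // Hypothetical scanner over raw bytes, mirroring how `Lexer`
    // implements `Scanner` over chars in src/lexer.rs.
    struct ByteScanner {
        bytes: Vec<u8>,
        current: usize,
    }

    impl Scanner for ByteScanner {
        type Item = u8; // u8 is Debug, satisfying the `Item: Debug` bound

        fn advance(&mut self) -> Self::Item {
            let b = self.peek();
            self.current += 1;
            b
        }

        fn peek(&self) -> Self::Item {
            self.bytes.get(self.current).copied().unwrap_or(0)
        }

        fn peek_next(&self) -> Self::Item {
            self.bytes.get(self.current + 1).copied().unwrap_or(0)
        }

        fn is_at_end(&self) -> bool {
            self.current >= self.bytes.len()
        }
    }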