From 583b151c07cf13972fc6d24e9cc553676b01f155 Mon Sep 17 00:00:00 2001
From: Josh Thomas
Date: Sun, 13 Oct 2024 15:07:22 -0500
Subject: [PATCH] refactor lexer to support future parsing (#4)

---
 src/lexer.rs   | 345 +++++++++++++++++++++++++++++--------------------
 src/lib.rs     |   2 +-
 src/scanner.rs |  10 ++
 3 files changed, 218 insertions(+), 139 deletions(-)
 create mode 100644 src/scanner.rs

diff --git a/src/lexer.rs b/src/lexer.rs
index 520a229..74e47bd 100644
--- a/src/lexer.rs
+++ b/src/lexer.rs
@@ -1,7 +1,10 @@
 use std::fmt;
+use std::fmt::Debug;
+
+use crate::scanner::Scanner;
 
 #[derive(Debug, Clone, PartialEq)]
-enum TokenType {
+pub enum TokenType {
     LeftParen,  // (
     RightParen, // )
     LeftBrace,  // {
@@ -40,7 +43,7 @@ enum TokenType {
 }
 
 #[derive(Debug, Clone)]
-struct Token {
+pub struct Token {
     token_type: TokenType,
     lexeme: String,
     literal: Option<String>,
@@ -73,10 +76,31 @@ impl fmt::Display for Token {
     }
 }
 
-trait Tokenizer {
-    fn tokenize(&mut self) -> Vec<Token>;
+pub trait Tokenizer: Scanner {
+    type Token: Debug;
+    type TokenType: Debug;
+    type Error: std::error::Error;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
+    fn scan_token(&mut self) -> Result<(), Self::Error>;
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
 }
 
+#[derive(Debug)]
+pub enum LexerError {
+    EmptyToken(usize),
+}
+
+impl fmt::Display for LexerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
+        }
+    }
+}
+
+impl std::error::Error for LexerError {}
+
 pub struct Lexer {
     source: String,
     tokens: Vec<Token>,
@@ -96,107 +120,141 @@ impl Lexer {
         }
     }
 
-    fn scan_token(&mut self) {
+    fn scan_token(&mut self) -> Result<(), LexerError> {
         let c = self.advance();
-        match c {
-            '(' => self.add_token(TokenType::LeftParen),
-            ')' => self.add_token(TokenType::RightParen),
-            '[' => self.add_token(TokenType::LeftBracket),
-            ']' => self.add_token(TokenType::RightBracket),
-            ',' => self.add_token(TokenType::Comma),
-            '.' => self.add_token(TokenType::Dot),
-            '-' => self.add_token(TokenType::Minus),
-            '+' => self.add_token(TokenType::Plus),
-            ':' => self.add_token(TokenType::Colon),
-            ';' => self.add_token(TokenType::Semicolon),
-            '*' => self.add_token(TokenType::Star),
-            '|' => self.add_token(TokenType::Pipe),
-            '\'' => self.add_token(TokenType::SingleQuote),
-            '"' => self.add_token(TokenType::DoubleQuote),
-            '{' => {
-                let token_type = if self.match_char('{') {
-                    TokenType::DoubleLeftBrace
-                } else if self.match_char('%') {
-                    TokenType::LeftBracePercent
-                } else if self.match_char('#') {
-                    TokenType::LeftBraceHash
-                } else {
-                    TokenType::LeftBrace
-                };
-                self.add_token(token_type);
+
+        let (token_type, literal) = match c {
+            '(' => (TokenType::LeftParen, None),
+            ')' => (TokenType::RightParen, None),
+            '[' => (TokenType::LeftBracket, None),
+            ']' => (TokenType::RightBracket, None),
+            ',' => (TokenType::Comma, None),
+            '.' => (TokenType::Dot, None),
+            '-' => (TokenType::Minus, None),
+            '+' => (TokenType::Plus, None),
+            ':' => (TokenType::Colon, None),
+            ';' => (TokenType::Semicolon, None),
+            '*' => (TokenType::Star, None),
+            '|' => (TokenType::Pipe, None),
+            '\'' => (TokenType::SingleQuote, None),
+            '"' => (TokenType::DoubleQuote, None),
+            '{' => self.handle_left_brace(),
+            '}' => self.handle_right_brace(),
+            '%' => self.handle_percent(),
+            '#' => self.handle_hash(),
+            '!' => self.handle_bang(),
+            '=' => self.handle_equal(),
+            '<' => self.handle_left_angle(),
+            '>' => self.handle_right_angle(),
+            '/' => self.handle_slash(),
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
+            _ => self.handle_text()?,
+        };
+
+        if token_type != TokenType::Text || literal.is_some() {
+            self.add_token(token_type, literal);
+        }
+
+        Ok(())
+    }
+
+    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('{') {
+            TokenType::DoubleLeftBrace
+        } else if self.match_char('%') {
+            TokenType::LeftBracePercent
+        } else if self.match_char('#') {
+            TokenType::LeftBraceHash
+        } else {
+            TokenType::LeftBrace
+        };
+        (token_type, None)
+    }
+
+    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('}') {
+            TokenType::DoubleRightBrace
+        } else {
+            TokenType::RightBrace
+        };
+        (token_type, None)
+    }
+
+    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('}') {
+            TokenType::PercentRightBrace
+        } else {
+            TokenType::Percent
+        };
+        (token_type, None)
+    }
+
+    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('}') {
+            TokenType::HashRightBrace
+        } else {
+            TokenType::Hash
+        };
+        (token_type, None)
+    }
+
+    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('=') {
+            TokenType::BangEqual
+        } else {
+            TokenType::Bang
+        };
+        (token_type, None)
+    }
+
+    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('=') {
+            TokenType::DoubleEqual
+        } else {
+            TokenType::Equal
+        };
+        (token_type, None)
+    }
+
+    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('=') {
+            TokenType::LeftAngleEqual
+        } else {
+            TokenType::LeftAngle
+        };
+        (token_type, None)
+    }
+
+    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
+        let token_type = if self.match_char('=') {
+            TokenType::RightAngleEqual
+        } else {
+            TokenType::RightAngle
+        };
+        (token_type, None)
+    }
+
+    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
+        if self.match_char('/') {
+            let start = self.current - 2;
+            while self.peek() != '\n' && !self.is_at_end() {
+                self.advance();
             }
-            '}' => {
-                let token_type = if self.match_char('}') {
-                    TokenType::DoubleRightBrace
-                } else {
-                    TokenType::RightBrace
-                };
-                self.add_token(token_type);
-            }
-            '%' => {
-                let token_type = if self.match_char('}') {
-                    TokenType::PercentRightBrace
-                } else {
-                    TokenType::Percent
-                };
-                self.add_token(token_type);
-            }
-            '#' => {
-                let token_type = if self.match_char('}') {
-                    TokenType::HashRightBrace
-                } else {
-                    TokenType::Hash
-                };
-                self.add_token(token_type);
-            }
-            '!' => {
-                let token_type = if self.match_char('=') {
-                    TokenType::BangEqual
-                } else {
-                    TokenType::Bang
-                };
-                self.add_token(token_type);
-            }
-            '=' => {
-                let token_type = if self.match_char('=') {
-                    TokenType::DoubleEqual
-                } else {
-                    TokenType::Equal
-                };
-                self.add_token(token_type);
-            }
-            '<' => {
-                let token_type = if self.match_char('=') {
-                    TokenType::LeftAngleEqual
-                } else {
-                    TokenType::LeftAngle
-                };
-                self.add_token(token_type);
-            }
-            '>' => {
-                let token_type = if self.match_char('=') {
-                    TokenType::RightAngleEqual
-                } else {
-                    TokenType::RightAngle
-                };
-                self.add_token(token_type);
-            }
-            '/' => {
-                if self.match_char('/') {
-                    while self.peek() != '\n' && !self.is_at_end() {
-                        self.advance();
-                    }
-                } else {
-                    self.add_token(TokenType::Slash);
-                }
-            }
-            ' ' | '\r' | '\t' => {}
-            '\n' => self.line += 1,
-            _ => self.text(),
+            let comment = self.source[start..self.current].to_string();
+            (TokenType::Text, Some(comment))
+        } else {
+            (TokenType::Slash, None)
         }
     }
 
-    fn text(&mut self) {
+    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
+        if c == '\n' {
+            self.line += 1;
+        }
+        (TokenType::Text, None)
+    }
+
+    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
         while !self.is_at_end() && !self.is_delimiter(self.peek()) {
             if self.peek() == '\n' {
                 self.line += 1;
@@ -205,8 +263,10 @@ impl Lexer {
         }
 
         let text = self.source[self.start..self.current].to_string();
-        if !text.is_empty() {
-            self.add_token_literal(TokenType::Text, Some(text));
+        if text.is_empty() {
+            Err(LexerError::EmptyToken(self.line))
+        } else {
+            Ok((TokenType::Text, Some(text)))
         }
     }
 
@@ -246,59 +306,61 @@ impl Lexer {
         if self.is_at_end() {
             return false;
         }
-        if self.source.chars().nth(self.current) != Some(expected) {
+        if self.peek() != expected {
            return false;
         }
         self.current += 1;
         true
     }
+}
 
-    fn peek(&self) -> char {
-        self.peek_ahead(0)
+impl Tokenizer for Lexer {
+    type Token = Token;
+    type TokenType = TokenType;
+    type Error = LexerError;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token()?;
+        }
+
+        self.tokens
+            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+        Ok(self.tokens.clone())
     }
 
-    fn peek_next(&self) -> char {
-        self.peek_ahead(1)
-    }
-
-    fn peek_ahead(&self, offset: usize) -> char {
-        self.source
-            .chars()
-            .nth(self.current + offset)
-            .unwrap_or('\0')
+    fn scan_token(&mut self) -> Result<(), LexerError> {
+        self.scan_token()
    }
 
-    fn is_at_end(&self) -> bool {
-        self.current >= self.source.len()
-    }
-
-    fn advance(&mut self) -> char {
-        let current_char = self.source.chars().nth(self.current).unwrap_or('\0');
-        self.current += 1;
-        current_char
-    }
-
-    fn add_token(&mut self, token_type: TokenType) {
-        self.add_token_literal(token_type, None);
-    }
-
-    fn add_token_literal(&mut self, token_type: TokenType, literal: Option<String>) {
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
         let text = self.source[self.start..self.current].to_string();
         self.tokens
             .push(Token::new(token_type, text, literal, self.line));
     }
 }
 
-impl Tokenizer for Lexer {
-    fn tokenize(&mut self) -> Vec<Token> {
-        while !self.is_at_end() {
-            self.start = self.current;
-            self.scan_token();
-        }
+impl Scanner for Lexer {
+    type Item = char;
 
-        self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
-        self.tokens.clone()
+    fn advance(&mut self) -> Self::Item {
+        let current_char = self.peek();
+        self.current += 1;
+        current_char
+    }
+
+    fn peek(&self) -> Self::Item {
+        self.source.chars().nth(self.current).unwrap_or('\0')
+    }
+
+    fn peek_next(&self) -> Self::Item {
+        self.source.chars().nth(self.current + 1).unwrap_or('\0')
+    }
+
+    fn is_at_end(&self) -> bool {
+        self.current >= self.source.len()
     }
 }
 
@@ -308,7 +370,14 @@ mod tests {
 
     fn tokenize(input: &str) -> Vec<Token> {
         let mut lexer = Lexer::new(input.to_string());
-        lexer.tokenize()
+        match lexer.tokenize() {
+            Ok(tokens) => tokens,
+            Err(e) => {
+                eprintln!("Tokenization error: {:?}", e);
+                eprintln!("Input that caused the error: {}", input);
+                panic!("Tokenization failed. See error output above.");
+            }
+        }
     }
 
     #[test]
diff --git a/src/lib.rs b/src/lib.rs
index b1da80f..605a466 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,2 @@
 mod lexer;
-
+mod scanner;
diff --git a/src/scanner.rs b/src/scanner.rs
new file mode 100644
index 0000000..d2dba76
--- /dev/null
+++ b/src/scanner.rs
@@ -0,0 +1,10 @@
+use std::fmt::Debug;
+
+pub trait Scanner {
+    type Item: Debug;
+
+    fn advance(&mut self) -> Self::Item;
+    fn peek(&self) -> Self::Item;
+    fn peek_next(&self) -> Self::Item;
+    fn is_at_end(&self) -> bool;
+}
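
Usage note: after this refactor, tokenize() returns Result<Vec<Token>, LexerError> rather than a bare Vec<Token>, and the Tokenizer trait must be in scope at the call site. A minimal sketch of driving the new API from inside the crate (the modules are private, so this only works internally; the template string is illustrative only, the patch ships no such example):

    use crate::lexer::{Lexer, Tokenizer};

    fn demo() {
        let mut lexer = Lexer::new("{% if user.is_authenticated %}".to_string());
        match lexer.tokenize() {
            // Token implements fmt::Display, so tokens print directly.
            Ok(tokens) => {
                for token in tokens {
                    println!("{}", token);
                }
            }
            Err(e) => eprintln!("lexing failed: {}", e),
        }
    }

The Scanner split is what the subject line's "future parsing" presumably refers to: Scanner is generic over an associated Item, so a parser could walk a token stream with the same cursor interface the lexer uses to walk chars. A hypothetical sketch, not part of this patch (no Parser type exists yet):

    use crate::lexer::Token;
    use crate::scanner::Scanner;

    // Hypothetical future parser over the lexer's output.
    pub struct Parser {
        tokens: Vec<Token>,
        current: usize,
    }

    impl Scanner for Parser {
        type Item = Token;

        fn advance(&mut self) -> Self::Item {
            let token = self.peek();
            self.current += 1;
            token
        }

        fn peek(&self) -> Self::Item {
            // tokenize() always appends an Eof token, so clamping to the
            // last index mirrors the lexer's unwrap_or('\0') fallback.
            let idx = self.current.min(self.tokens.len() - 1);
            self.tokens[idx].clone()
        }

        fn peek_next(&self) -> Self::Item {
            let idx = (self.current + 1).min(self.tokens.len() - 1);
            self.tokens[idx].clone()
        }

        fn is_at_end(&self) -> bool {
            self.current >= self.tokens.len()
        }
    }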