refactor lexer to support future parsing (#4)

Josh Thomas 2024-10-13 15:07:22 -05:00 committed by GitHub
parent 784252e6c7
commit 583b151c07
3 changed files with 218 additions and 139 deletions

src/lexer.rs

@@ -1,7 +1,10 @@
 use std::fmt;
+use std::fmt::Debug;
+
+use crate::scanner::Scanner;
 
 #[derive(Debug, Clone, PartialEq)]
-enum TokenType {
+pub enum TokenType {
     LeftParen,   // (
     RightParen,  // )
     LeftBrace,   // {
@@ -40,7 +43,7 @@ enum TokenType {
 }
 
 #[derive(Debug, Clone)]
-struct Token {
+pub struct Token {
     token_type: TokenType,
     lexeme: String,
     literal: Option<String>,
@@ -73,10 +76,31 @@ impl fmt::Display for Token {
     }
 }
 
-trait Tokenizer<T> {
-    fn tokenize(&mut self) -> Vec<T>;
+pub trait Tokenizer: Scanner {
+    type Token: Debug;
+    type TokenType: Debug;
+    type Error: std::error::Error;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
+    fn scan_token(&mut self) -> Result<(), Self::Error>;
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
 }
 
+#[derive(Debug)]
+pub enum LexerError {
+    EmptyToken(usize),
+}
+
+impl fmt::Display for LexerError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
+        }
+    }
+}
+
+impl std::error::Error for LexerError {}
+
 pub struct Lexer {
     source: String,
     tokens: Vec<Token>,
@@ -96,24 +120,45 @@ impl Lexer {
         }
     }
 
-    fn scan_token(&mut self) {
+    fn scan_token(&mut self) -> Result<(), LexerError> {
         let c = self.advance();
-        match c {
-            '(' => self.add_token(TokenType::LeftParen),
-            ')' => self.add_token(TokenType::RightParen),
-            '[' => self.add_token(TokenType::LeftBracket),
-            ']' => self.add_token(TokenType::RightBracket),
-            ',' => self.add_token(TokenType::Comma),
-            '.' => self.add_token(TokenType::Dot),
-            '-' => self.add_token(TokenType::Minus),
-            '+' => self.add_token(TokenType::Plus),
-            ':' => self.add_token(TokenType::Colon),
-            ';' => self.add_token(TokenType::Semicolon),
-            '*' => self.add_token(TokenType::Star),
-            '|' => self.add_token(TokenType::Pipe),
-            '\'' => self.add_token(TokenType::SingleQuote),
-            '"' => self.add_token(TokenType::DoubleQuote),
-            '{' => {
+
+        let (token_type, literal) = match c {
+            '(' => (TokenType::LeftParen, None),
+            ')' => (TokenType::RightParen, None),
+            '[' => (TokenType::LeftBracket, None),
+            ']' => (TokenType::RightBracket, None),
+            ',' => (TokenType::Comma, None),
+            '.' => (TokenType::Dot, None),
+            '-' => (TokenType::Minus, None),
+            '+' => (TokenType::Plus, None),
+            ':' => (TokenType::Colon, None),
+            ';' => (TokenType::Semicolon, None),
+            '*' => (TokenType::Star, None),
+            '|' => (TokenType::Pipe, None),
+            '\'' => (TokenType::SingleQuote, None),
+            '"' => (TokenType::DoubleQuote, None),
+            '{' => self.handle_left_brace(),
+            '}' => self.handle_right_brace(),
+            '%' => self.handle_percent(),
+            '#' => self.handle_hash(),
+            '!' => self.handle_bang(),
+            '=' => self.handle_equal(),
+            '<' => self.handle_left_angle(),
+            '>' => self.handle_right_angle(),
+            '/' => self.handle_slash(),
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
+            _ => self.handle_text()?,
+        };
+
+        if token_type != TokenType::Text || literal.is_some() {
+            self.add_token(token_type, literal);
+        }
+
+        Ok(())
+    }
+
+    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('{') {
             TokenType::DoubleLeftBrace
         } else if self.match_char('%') {
@@ -123,80 +168,93 @@ impl Lexer {
         } else {
             TokenType::LeftBrace
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '}' => {
+
+    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::DoubleRightBrace
         } else {
             TokenType::RightBrace
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '%' => {
+
+    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::PercentRightBrace
         } else {
             TokenType::Percent
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '#' => {
+
+    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('}') {
             TokenType::HashRightBrace
         } else {
             TokenType::Hash
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '!' => {
+
+    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::BangEqual
         } else {
             TokenType::Bang
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '=' => {
+
+    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::DoubleEqual
         } else {
             TokenType::Equal
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '<' => {
+
+    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::LeftAngleEqual
         } else {
             TokenType::LeftAngle
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '>' => {
+
+    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
         let token_type = if self.match_char('=') {
             TokenType::RightAngleEqual
         } else {
             TokenType::RightAngle
         };
-        self.add_token(token_type);
+        (token_type, None)
     }
-    '/' => {
+
+    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
         if self.match_char('/') {
+            let start = self.current - 2;
             while self.peek() != '\n' && !self.is_at_end() {
                 self.advance();
             }
+            let comment = self.source[start..self.current].to_string();
+            (TokenType::Text, Some(comment))
         } else {
-            self.add_token(TokenType::Slash);
+            (TokenType::Slash, None)
         }
     }
-    ' ' | '\r' | '\t' => {}
-    '\n' => self.line += 1,
-    _ => self.text(),
-        }
-    }
 
-    fn text(&mut self) {
+    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
+        if c == '\n' {
+            self.line += 1;
+        }
+        (TokenType::Text, None)
+    }
+
+    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
         while !self.is_at_end() && !self.is_delimiter(self.peek()) {
             if self.peek() == '\n' {
                 self.line += 1;
@@ -205,8 +263,10 @@ impl Lexer {
         }
 
         let text = self.source[self.start..self.current].to_string();
-        if !text.is_empty() {
-            self.add_token_literal(TokenType::Text, Some(text));
+        if text.is_empty() {
+            Err(LexerError::EmptyToken(self.line))
+        } else {
+            Ok((TokenType::Text, Some(text)))
         }
     }
@@ -246,59 +306,61 @@ impl Lexer {
         if self.is_at_end() {
             return false;
         }
-        if self.source.chars().nth(self.current) != Some(expected) {
+        if self.peek() != expected {
             return false;
         }
         self.current += 1;
         true
     }
-
-    fn peek(&self) -> char {
-        self.peek_ahead(0)
-    }
-
-    fn peek_next(&self) -> char {
-        self.peek_ahead(1)
-    }
-
-    fn peek_ahead(&self, offset: usize) -> char {
-        self.source
-            .chars()
-            .nth(self.current + offset)
-            .unwrap_or('\0')
-    }
-
-    fn is_at_end(&self) -> bool {
-        self.current >= self.source.len()
-    }
-
-    fn advance(&mut self) -> char {
-        let current_char = self.source.chars().nth(self.current).unwrap_or('\0');
-        self.current += 1;
-        current_char
-    }
-
-    fn add_token(&mut self, token_type: TokenType) {
-        self.add_token_literal(token_type, None);
-    }
-
-    fn add_token_literal(&mut self, token_type: TokenType, literal: Option<String>) {
+}
+
+impl Tokenizer for Lexer {
+    type Token = Token;
+    type TokenType = TokenType;
+    type Error = LexerError;
+
+    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error> {
+        while !self.is_at_end() {
+            self.start = self.current;
+            self.scan_token()?;
+        }
+        self.tokens
+            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+        Ok(self.tokens.clone())
+    }
+
+    fn scan_token(&mut self) -> Result<(), LexerError> {
+        self.scan_token()
+    }
+
+    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
         let text = self.source[self.start..self.current].to_string();
         self.tokens
             .push(Token::new(token_type, text, literal, self.line));
     }
 }
 
-impl Tokenizer<Token> for Lexer {
-    fn tokenize(&mut self) -> Vec<Token> {
-        while !self.is_at_end() {
-            self.start = self.current;
-            self.scan_token();
-        }
-        self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
-        self.tokens.clone()
-    }
-}
+impl Scanner for Lexer {
+    type Item = char;
+
+    fn advance(&mut self) -> Self::Item {
+        let current_char = self.peek();
+        self.current += 1;
+        current_char
+    }
+
+    fn peek(&self) -> Self::Item {
+        self.source.chars().nth(self.current).unwrap_or('\0')
+    }
+
+    fn peek_next(&self) -> Self::Item {
+        self.source.chars().nth(self.current + 1).unwrap_or('\0')
+    }
+
+    fn is_at_end(&self) -> bool {
+        self.current >= self.source.len()
+    }
+}
@@ -308,7 +370,14 @@ mod tests {
     fn tokenize(input: &str) -> Vec<Token> {
         let mut lexer = Lexer::new(input.to_string());
-        lexer.tokenize()
+        match lexer.tokenize() {
+            Ok(tokens) => tokens,
+            Err(e) => {
+                eprintln!("Tokenization error: {:?}", e);
+                eprintln!("Input that caused the error: {}", input);
+                panic!("Tokenization failed. See error output above.");
+            }
+        }
     }
 
     #[test]

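A note on consuming the refactored interface: `tokenize` now returns `Result<Vec<Token>, LexerError>` instead of `Vec<Token>`, and the `Tokenizer` trait must be in scope for the method to resolve. A minimal sketch of a caller inside the crate (the `lex_template` helper is illustrative only, not part of this commit):

    use crate::lexer::{Lexer, LexerError, Tokenizer};

    // Hypothetical caller of the refactored API; `Token` implements
    // fmt::Display (see the diff above), so tokens print directly.
    fn lex_template(source: &str) -> Result<(), LexerError> {
        let mut lexer = Lexer::new(source.to_string());
        let tokens = lexer.tokenize()?; // propagates LexerError::EmptyToken
        for token in &tokens {
            println!("{}", token);
        }
        Ok(())
    }
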

@@ -1,2 +1,2 @@
 mod lexer;
+mod scanner;

src/scanner.rs (new file, 10 additions)

@@ -0,0 +1,10 @@
+use std::fmt::Debug;
+
+pub trait Scanner {
+    type Item: Debug;
+
+    fn advance(&mut self) -> Self::Item;
+    fn peek(&self) -> Self::Item;
+    fn peek_next(&self) -> Self::Item;
+    fn is_at_end(&self) -> bool;
+}
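
Pulling the cursor primitives into their own trait is what makes the "future parsing" in the title possible: anything that tracks a position over a sequence can implement `Scanner` and reuse the same advance/peek discipline. A rough sketch of a second implementor (hypothetical; nothing like it exists in this commit):

    use crate::scanner::Scanner;

    // Hypothetical scanner over raw bytes, mirroring how `Lexer`
    // implements `Scanner` over chars in src/lexer.rs.
    struct ByteScanner {
        bytes: Vec<u8>,
        current: usize,
    }

    impl Scanner for ByteScanner {
        type Item = u8; // u8 is Debug, satisfying the `Item: Debug` bound

        fn advance(&mut self) -> Self::Item {
            let b = self.peek();
            self.current += 1;
            b
        }

        fn peek(&self) -> Self::Item {
            self.bytes.get(self.current).copied().unwrap_or(0)
        }

        fn peek_next(&self) -> Self::Item {
            self.bytes.get(self.current + 1).copied().unwrap_or(0)
        }

        fn is_at_end(&self) -> bool {
            self.current >= self.bytes.len()
        }
    }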