simplify lexer and tokenization by removing literal values (#5)

2025-09-26 18:19:09 +00:00 · 2024-10-13 17:14:15 -05:00 · 2024-10-13 17:14:15 -05:00 · 03c0c19dd9
commit 03c0c19dd9
parent 433c17999c
1 changed files with 102 additions and 113 deletions
--- a/src/lexer.rs
+++ b/src/lexer.rs
@ -38,6 +38,7 @@ pub enum TokenType {
    DoubleEqual,       // ==
    LeftAngleEqual,    // <=
    RightAngleEqual,   // >=
    Whitespace,        // special token to account for whitespace
    Text,
    Eof,
 }
@ -46,36 +47,19 @@ pub enum TokenType {
 /// A lexed token: its kind, the source text it came from, and the line it was recorded on.
 pub struct Token {
    /// Kind of token, one of the `TokenType` variants.
    token_type: TokenType,
    /// Source text of the token; `add_token` slices it from `source[start..current]`.
    lexeme: String,
    /// Optional literal value; the `Display` impl prints it in parentheses after the lexeme.
    // NOTE(review): the `+` side of this change drops the `literal` argument from
    // `Token::new` — confirm whether this field is still meant to exist.
    literal: Option<String>,
    /// Value of the lexer's line counter at the time the token was created.
    line: usize,
 }
 impl Token {
-    fn new(token_type: TokenType, lexeme: String, literal: Option<String>, line: usize) -> Self {
+    fn new(token_type: TokenType, lexeme: String, line: usize) -> Self {
        Token {
            token_type,
            lexeme,
            literal,
            line,
        }
    }
 }
 impl fmt::Display for Token {
    /// Writes the lexeme, followed by ` (literal)` when a literal value is attached.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Build the optional parenthesised suffix first so the final write stays simple.
        let suffix = match &self.literal {
            Some(value) => format!(" ({})", value),
            None => String::new(),
        };
        write!(f, "{}{}", self.lexeme, suffix)
    }
 }
 pub trait Tokenizer: Scanner {
    type Token: Debug;
    type TokenType: Debug;
@ -83,18 +67,22 @@ pub trait Tokenizer: Scanner {
    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
    fn scan_token(&mut self) -> Result<(), Self::Error>;
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
+    fn add_token(&mut self, token_type: Self::TokenType);
 }
 /// Errors the lexer can report while scanning source text.
 ///
 /// Both variants carry the line number the lexer was on when the error was raised.
 /// `Clone`, `PartialEq` and `Eq` are derived so callers and tests can copy and
 /// compare specific errors instead of matching on their debug output.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum LexerError {
    /// A text token turned out to be empty (nothing was consumed); holds the line number.
    EmptyToken(usize),
    /// A character that no rule accepts was encountered; holds the character and the line number.
    UnexpectedCharacter(char, usize),
 }
 impl fmt::Display for LexerError {
    /// Renders a one-line, human-readable description that includes the line number.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            Self::EmptyToken(line) => write!(f, "Empty token at line {}", line),
            Self::UnexpectedCharacter(ch, line) => {
                write!(f, "Unexpected character '{}' at line {}", ch, line)
            }
        }
    }
 }
@ -123,21 +111,10 @@ impl Lexer {
    fn scan_token(&mut self) -> Result<(), LexerError> {
        let c = self.advance();
-        let (token_type, literal) = match c {
+        let token_type = match c {
-            '(' => (TokenType::LeftParen, None),
+            '(' | ')' | '[' | ']' | ',' | '.' | '-' | '+' | ':' | ';' | '*' | '|' | '\'' | '"' => {
-            ')' => (TokenType::RightParen, None),
+                self.handle_single_char(c)
-            '[' => (TokenType::LeftBracket, None),
+            }
            ']' => (TokenType::RightBracket, None),
            ',' => (TokenType::Comma, None),
            '.' => (TokenType::Dot, None),
            '-' => (TokenType::Minus, None),
            '+' => (TokenType::Plus, None),
            ':' => (TokenType::Colon, None),
            ';' => (TokenType::Semicolon, None),
            '*' => (TokenType::Star, None),
            '|' => (TokenType::Pipe, None),
            '\'' => (TokenType::SingleQuote, None),
            '"' => (TokenType::DoubleQuote, None),
            '{' => self.handle_left_brace(),
            '}' => self.handle_right_brace(),
            '%' => self.handle_percent(),
@ -147,18 +124,37 @@ impl Lexer {
            '<' => self.handle_left_angle(),
            '>' => self.handle_right_angle(),
            '/' => self.handle_slash(),
-            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(),
-            _ => self.handle_text()?,
+            _ => self.handle_text(),
        };
-        if token_type != TokenType::Text || literal.is_some() {
+        self.add_token(token_type?);
            self.add_token(token_type, literal);
        }
        Ok(())
    }
-    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_single_char(&mut self, c: char) -> Result<TokenType, LexerError> {
        let token_type = match c {
            '(' => TokenType::LeftParen,
            ')' => TokenType::RightParen,
            '[' => TokenType::LeftBracket,
            ']' => TokenType::RightBracket,
            ',' => TokenType::Comma,
            '.' => TokenType::Dot,
            '-' => TokenType::Minus,
            '+' => TokenType::Plus,
            ':' => TokenType::Colon,
            ';' => TokenType::Semicolon,
            '*' => TokenType::Star,
            '|' => TokenType::Pipe,
            '\'' => TokenType::SingleQuote,
            '"' => TokenType::DoubleQuote,
            _ => return Err(LexerError::UnexpectedCharacter(c, self.line)),
        };
        Ok(token_type)
    }
    fn handle_left_brace(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('{') {
            TokenType::DoubleLeftBrace
        } else if self.advance_if_matches('%') {
@ -168,138 +164,102 @@ impl Lexer {
        } else {
            TokenType::LeftBrace
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_brace(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('}') {
            TokenType::DoubleRightBrace
        } else {
            TokenType::RightBrace
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
+    fn handle_percent(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('}') {
            TokenType::PercentRightBrace
        } else {
            TokenType::Percent
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
+    fn handle_hash(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('}') {
            TokenType::HashRightBrace
        } else {
            TokenType::Hash
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
+    fn handle_bang(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('=') {
            TokenType::BangEqual
        } else {
            TokenType::Bang
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
+    fn handle_equal(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('=') {
            TokenType::DoubleEqual
        } else {
            TokenType::Equal
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_left_angle(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('=') {
            TokenType::LeftAngleEqual
        } else {
            TokenType::LeftAngle
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_angle(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('=') {
            TokenType::RightAngleEqual
        } else {
            TokenType::RightAngle
        };
-        (token_type, None)
+        Ok(token_type)
    }
-    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
+    fn handle_slash(&mut self) -> Result<TokenType, LexerError> {
-        if self.advance_if_matches('/') {
+        let token_type = if self.advance_if_matches('/') {
            let start = self.current - 2;
            while self.peek() != '\n' && !self.is_at_end() {
                self.advance();
            }
-            let comment = self.source[start..self.current].to_string();
+            TokenType::Text
            (TokenType::Text, Some(comment))
        } else {
-            (TokenType::Slash, None)
+            TokenType::Slash
-        }
+        };
        Ok(token_type)
    }
-    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
+    fn handle_whitespace(&mut self) -> Result<TokenType, LexerError> {
-        if c == '\n' {
+        while !self.is_at_end() && self.peek().is_whitespace() {
            self.line += 1;
        }
        (TokenType::Text, None)
    }
    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
        while !self.is_at_end() && !self.is_delimiter(self.peek()) {
            if self.peek() == '\n' {
                self.line += 1;
            }
            self.advance();
        }
-
+        Ok(TokenType::Whitespace)
        let text = self.source[self.start..self.current].to_string();
        if text.is_empty() {
            Err(LexerError::EmptyToken(self.line))
        } else {
            Ok((TokenType::Text, Some(text)))
        }
    }
-    fn is_delimiter(&self, c: char) -> bool {
+    fn handle_text(&mut self) -> Result<TokenType, LexerError> {
-        matches!(
+        self.advance_while(|c| !Self::is_token_boundary(c));
-            c,
+
-            '(' | ')'
+        if self.start == self.current {
-                | '['
+            Err(LexerError::EmptyToken(self.line))
-                | ']'
+        } else {
-                | '{'
+            Ok(TokenType::Text)
-                | '}'
+        }
                | ','
                | '.'
                | '-'
                | '+'
                | ':'
                | ';'
                | '*'
                | '|'
                | '%'
                | '#'
                | '!'
                | '='
                | '<'
                | '>'
                | '/'
                | ' '
                | '\r'
                | '\t'
                | '\n'
                | '"'
                | '\''
        )
    }
    fn advance_if_matches(&mut self, expected: char) -> bool {
@ -310,6 +270,27 @@ impl Lexer {
            true
        }
    }
    /// Consumes characters for as long as `condition` accepts the next one.
    ///
    /// Stops at end of input or at the first character `condition` rejects
    /// (that character is left unconsumed). Every consumed `'\n'` bumps
    /// `self.line` so line tracking stays in step with the cursor.
    fn advance_while<F>(&mut self, condition: F)
    where
        F: Fn(char) -> bool,
    {
        loop {
            // Never peek past the end of the input.
            if self.is_at_end() {
                break;
            }
            if !condition(self.peek()) {
                break;
            }
            // Count the newline before stepping over it.
            if self.peek() == '\n' {
                self.line += 1;
            }
            self.advance();
        }
    }
    /// Returns `true` when `c` ends a run of plain text.
    ///
    /// The boundary set is the punctuation and operator characters listed
    /// below plus the four whitespace characters space, `\r`, `\t` and `\n`.
    fn is_token_boundary(c: char) -> bool {
        matches!(
            c,
            '(' | ')' | '[' | ']' | '{' | '}' | ',' | '.' | '-' | '+' | ':' | ';' | '*' | '|'
                | '%' | '#' | '!' | '=' | '<' | '>' | '/' | ' ' | '\r' | '\t' | '\n' | '"' | '\''
        )
    }
 }
 impl Tokenizer for Lexer {
@ -324,7 +305,7 @@ impl Tokenizer for Lexer {
        }
        self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+            .push(Token::new(TokenType::Eof, String::new(), self.line));
        Ok(self.tokens.clone())
    }
@ -332,10 +313,11 @@ impl Tokenizer for Lexer {
        self.scan_token()
    }
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
+    fn add_token(&mut self, token_type: Self::TokenType) {
        let text = self.source[self.start..self.current].to_string();
-        self.tokens
+        if token_type != TokenType::Whitespace {
-            .push(Token::new(token_type, text, literal, self.line));
+            self.tokens.push(Token::new(token_type, text, self.line));
        }
    }
 }
@ -368,7 +350,14 @@ mod tests {
    fn tokenize(input: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(input.to_string());
        match lexer.tokenize() {
-            Ok(tokens) => tokens,
+            Ok(tokens) => {
                // Debug print all tokens
                for (i, token) in tokens.iter().enumerate() {
                    println!("{:?}", token)
                }
                tokens
            }
            Err(e) => {
                eprintln!("Tokenization error: {:?}", e);
                eprintln!("Input that caused the error: {}", input);