move token matching from lexer to token
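
Token::from_input now owns the matching: it inspects the head of the remaining source and returns the matched token plus the number of bytes and line breaks it consumed, so the lexer is reduced to a cursor-advancing loop and the Tokenizer trait goes away. A rough sketch of the new contract (a hypothetical driver, not code from this commit; it uses only items this diff introduces):

    use crate::error::TokenError;
    use crate::token::Token;

    fn dump_tokens(source: &str) -> Result<(), TokenError> {
        let mut current = 0; // byte offset into `source`
        let mut line = 1;    // 1-based, like LexerState::line
        while current < source.len() {
            // one token matched at the head of the remaining input,
            // with the bytes and line breaks it consumed
            let (token, size, lines) = Token::from_input(&source[current..], line)?;
            current += size;
            line += lines;
            println!("{:?}", token);
        }
        Ok(())
    }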

This commit is contained in:
Josh Thomas 2024-10-16 23:14:44 -05:00
parent b4c7688f65
commit da5c34fd0f
4 changed files with 603 additions and 597 deletions


@@ -2,16 +2,53 @@ use thiserror::Error;
 #[derive(Error, Debug)]
 pub enum LexerError {
-    #[error("Empty token at line {line:?}")]
+    #[error("empty token at line {line:?}")]
     EmptyToken { line: usize },
-    #[error("Unexpected character '{character}' at line {line}")]
+    #[error("unexpected character '{character}' at line {line}")]
     UnexpectedCharacter { character: char, line: usize },
-    #[error("Source is empty")]
+    #[error("source is empty")]
     EmptySource,
-    #[error("At beginning of source")]
+    #[error("at beginning of source")]
    AtBeginningOfSource,
-    #[error("At end of source")]
+    #[error("at end of source")]
     AtEndOfSource,
-    #[error("Invalid character access")]
+    #[error("invalid character access")]
     InvalidCharacterAccess,
+    #[error(transparent)] // Display the inner TokenError directly
+    TokenError(#[from] TokenError), // This automatically implements From<TokenError>
 }
+#[derive(Error, Debug)]
+pub enum TokenError {
+    #[error("unexpected character '{character}'")]
+    UnexpectedCharacter { character: char },
+    #[error("string did not match a token")]
+    NoTokenMatch,
+    #[error("unexpected end of input, expected string literal")]
+    UnexpectedEndOfInput,
+}
+#[derive(Error, Debug)]
+pub enum NodeError {
+    #[error("Tag name cannot be empty")]
+    NoTagName,
+    #[error("Block name cannot be empty")]
+    NoBlockName,
+}
+#[derive(Error, Debug)]
+pub enum ParserError {
+    #[error("Token stream is empty")]
+    EmptyTokenStream,
+    #[error("At beginning of token stream")]
+    AtBeginningOfStream,
+    #[error("At end of token stream")]
+    AtEndOfStream,
+    #[error("Invalid token access")]
+    InvalidTokenAccess,
+    #[error("AST error: {0}")]
+    ASTError(#[from] ASTError),
+}
+#[derive(Error, Debug)]
+pub enum ASTError {}
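
Note: the new TokenError converts into LexerError through the From impl that #[from] generates, which is what lets the lexer's next_token use `?` directly on Token::from_input. An illustrative helper, not part of the commit:

    use crate::error::{LexerError, TokenError};
    use crate::token::Token;

    // #[from] gives From<TokenError> for LexerError, so `?` converts
    // the error type at the call site with no explicit map_err.
    fn first_token(source: &str, line: usize) -> Result<Token<'_>, LexerError> {
        let (token, _size, _lines) = Token::from_input(source, line)?;
        Ok(token)
    }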


@@ -1,6 +1,6 @@
 use crate::error::LexerError;
 use crate::scanner::{LexerState, Scanner};
-use crate::token::{Token, TokenType, Tokenizer};
+use crate::token::{Token, TokenType};
 
 pub struct Lexer<'a> {
     source: &'a str,
@@ -17,227 +17,33 @@ impl<'a> Lexer<'a> {
}
}
fn match_token_type(&mut self, c: char) -> Result<TokenType, LexerError> {
match c {
',' | '.' | '+' | ':' | '|' | '\'' | '"' => self.single_char(c),
'{' => self.left_brace(),
'}' => self.right_brace(),
'%' => self.percent(),
'#' => self.hash(),
'!' => self.bang(),
'=' => self.equal(),
'<' => self.left_angle(),
'>' => self.right_angle(),
'/' => self.slash(),
'-' => self.dash(),
'*' => self.star(),
' ' | '\r' | '\t' | '\n' => self.whitespace(c),
_ => self.text(),
}
}
fn single_char(&mut self, c: char) -> Result<TokenType, LexerError> {
let token_type = match c {
',' => TokenType::Comma,
'.' => TokenType::Dot,
'+' => TokenType::Plus,
':' => TokenType::Colon,
'|' => TokenType::Pipe,
'\'' => TokenType::SingleQuote,
'"' => TokenType::DoubleQuote,
_ => {
return Err(LexerError::UnexpectedCharacter {
character: c,
line: self.state.line,
})
}
};
Ok(token_type)
}
fn left_brace(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('{')? {
TokenType::DoubleLeftBrace
} else if self.advance_if_matches('%')? {
TokenType::LeftBracePercent
} else if self.advance_if_matches('#')? {
TokenType::LeftBraceHash
} else {
self.text()?
};
Ok(token_type)
}
fn right_brace(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('}')? {
TokenType::DoubleRightBrace
} else {
self.text()?
};
Ok(token_type)
}
fn percent(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('}')? {
TokenType::PercentRightBrace
} else {
TokenType::Percent
};
Ok(token_type)
}
fn hash(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('}')? {
TokenType::HashRightBrace
} else {
self.text()?
};
Ok(token_type)
}
fn bang(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('=')? {
TokenType::BangEqual
} else {
TokenType::Bang
};
Ok(token_type)
}
fn equal(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('=')? {
TokenType::DoubleEqual
} else {
TokenType::Equal
};
Ok(token_type)
}
fn left_angle(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('=')? {
TokenType::LeftAngleEqual
} else if self.advance_if_matches('!')? {
let start_pos = self.state.current;
self.advance_while(|c| c == '-')?;
if self.state.current - start_pos >= 2 {
TokenType::LeftAngleBangDashDash
} else {
self.state.current = start_pos;
TokenType::LeftAngle
}
} else {
TokenType::LeftAngle
};
Ok(token_type)
}
fn right_angle(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('=')? {
TokenType::RightAngleEqual
} else {
TokenType::RightAngle
};
Ok(token_type)
}
fn slash(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('>')? {
TokenType::SlashRightAngle
} else if self.advance_if_matches('/')? {
TokenType::DoubleSlash
} else if self.advance_if_matches('*')? {
TokenType::SlashStar
} else {
TokenType::Slash
};
Ok(token_type)
}
fn dash(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('-')? {
if self.advance_if_matches('>')? {
TokenType::DashDashRightAngle
} else {
self.text()?
}
} else {
TokenType::Dash
};
Ok(token_type)
}
fn star(&mut self) -> Result<TokenType, LexerError> {
let token_type = if self.advance_if_matches('/')? {
TokenType::StarSlash
} else {
self.text()?
};
Ok(token_type)
}
fn whitespace(&mut self, mut c: char) -> Result<TokenType, LexerError> {
while !self.is_at_end() && self.peek()?.is_whitespace() {
match c {
'\n' => {
self.state.line += 1;
}
'\r' if self.peek()? == '\n' => {
self.advance()?;
self.state.line += 1;
}
' ' | '\t' | '\r' => {}
_ => {
return Err(LexerError::UnexpectedCharacter {
character: c,
line: self.state.line,
})
}
}
c = self.advance()?;
}
Ok(TokenType::Whitespace)
}
fn text(&mut self) -> Result<TokenType, LexerError> {
self.advance_while(|c| !Self::is_token_boundary(c))?;
Ok(TokenType::Text)
}
fn advance_if_matches(&mut self, expected: char) -> Result<bool, LexerError> {
if self.is_at_end() || self.peek()? != expected {
Ok(false)
} else {
self.state.current += 1;
Ok(true)
}
}
-    fn advance_while<F>(&mut self, condition: F) -> Result<(), LexerError>
-    where
-        F: Fn(char) -> bool,
-    {
-        while !self.is_at_end() {
-            let current_char = self.peek()?;
-            if !condition(current_char) {
-                break;
-            }
-            if current_char == '\n' {
-                self.state.line += 1;
-            }
-            self.advance()?;
-        }
-        Ok(())
-    }
-    fn is_token_boundary(c: char) -> bool {
-        const TOKEN_BOUNDARIES: &[char] = &[
-            '(', ')', '[', ']', '{', '}', ',', '.', '-', '+', ':', ';', '*', '|', '%', '#', '!',
-            '=', '<', '>', '/', ' ', '\r', '\t', '\n', '"', '\'',
-        ];
-        TOKEN_BOUNDARIES.contains(&c)
-    }
+    pub fn tokenize(&mut self) -> Result<Vec<Token>, LexerError> {
+        while !self.is_at_end() {
+            self.state.start = self.state.current;
+            let (token, size, lines_consumed) = self.next_token()?;
+            self.add_token(token);
+            self.state.current += size;
+            self.state.line += lines_consumed;
+        }
+        self.add_token(Token::new(TokenType::Eof, "", self.state.line));
+        Ok(self.tokens.clone())
+    }
+    fn next_token(&mut self) -> Result<(Token<'a>, usize, usize), LexerError> {
+        let remaining_source = &self.source[self.state.current..];
+        let (token, size, lines_traversed) = Token::from_input(remaining_source, self.state.line)?;
+        Ok((token, size, lines_traversed))
+    }
+    fn add_token(&mut self, token: Token<'a>) {
+        if token.token_type != TokenType::Whitespace {
+            self.tokens.push(token);
+        }
+    }
 }
@@ -284,367 +90,3 @@ impl<'a> Scanner for Lexer<'a> {
self.state.current >= self.source.len()
}
}
impl<'a> Tokenizer<'a> for Lexer<'a> {
type Token = Token<'a>;
type TokenType = TokenType;
fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error> {
while !self.is_at_end() {
self.state.start = self.state.current;
let (token_type, text) = self.next_token()?;
self.add_token(token_type, text);
}
self.add_token(TokenType::Eof, "");
Ok(self.tokens.clone())
}
fn next_token(&mut self) -> Result<(Self::TokenType, &'a str), Self::Error> {
let c = self.advance()?;
let token_type = self.match_token_type(c)?;
let text = &self.source[self.state.start..self.state.current];
Ok((token_type, text))
}
fn add_token(&mut self, token_type: Self::TokenType, text: &'a str) {
if token_type != TokenType::Whitespace {
self.tokens
.push(Token::new(token_type, text, self.state.line));
}
}
}
#[cfg(test)]
mod tests {
use super::*;
mod lexer {
use super::*;
#[test]
fn test_lexer_new() {
let lexer = Lexer::new("");
assert_eq!(lexer.source, "");
assert_eq!(lexer.tokens.len(), 0);
assert_eq!(lexer.state.start, 0);
assert_eq!(lexer.state.current, 0);
assert_eq!(lexer.state.line, 1);
}
fn assert_token_type<F>(test_cases: Vec<(&str, TokenType)>, method: F)
where
F: Fn(&mut Lexer, Option<char>) -> Result<TokenType, LexerError>,
{
for (input, expected) in test_cases {
println!("Testing input: {:?}", input);
let mut chars = input.chars();
let first_char = chars.next().unwrap();
let rest: String = chars.collect();
let mut lexer = Lexer::new(&rest);
match method(&mut lexer, Some(first_char)) {
Ok(token_type) => assert_eq!(token_type, expected, "Input: {}", input),
Err(e) => panic!(
"Expected {:?}, but got Err({:?}) for input: {}",
expected, e, input
),
}
}
}
#[test]
fn test_match_token_type() {
let test_cases = vec![
("<", TokenType::LeftAngle),
(">", TokenType::RightAngle),
(",", TokenType::Comma),
(".", TokenType::Dot),
("-", TokenType::Dash),
("+", TokenType::Plus),
(":", TokenType::Colon),
("/", TokenType::Slash),
("!", TokenType::Bang),
("=", TokenType::Equal),
("|", TokenType::Pipe),
("%", TokenType::Percent),
("'", TokenType::SingleQuote),
("\"", TokenType::DoubleQuote),
("{{", TokenType::DoubleLeftBrace),
("}}", TokenType::DoubleRightBrace),
("{%", TokenType::LeftBracePercent),
("%}", TokenType::PercentRightBrace),
("{#", TokenType::LeftBraceHash),
("#}", TokenType::HashRightBrace),
("!=", TokenType::BangEqual),
("==", TokenType::DoubleEqual),
("<=", TokenType::LeftAngleEqual),
(">=", TokenType::RightAngleEqual),
("<!--", TokenType::LeftAngleBangDashDash),
("-->", TokenType::DashDashRightAngle),
("/>", TokenType::SlashRightAngle),
("//", TokenType::DoubleSlash),
("/*", TokenType::SlashStar),
("*/", TokenType::StarSlash),
(" ", TokenType::Whitespace),
("\r", TokenType::Whitespace),
("\t", TokenType::Whitespace),
("\n", TokenType::Whitespace),
(" ", TokenType::Whitespace),
(" \n", TokenType::Whitespace),
("a", TokenType::Text),
("1", TokenType::Text),
("Hello", TokenType::Text),
];
assert_token_type(test_cases, |lexer, c| lexer.match_token_type(c.unwrap()));
}
#[test]
fn test_left_brace() {
let test_cases = vec![
("{{", TokenType::DoubleLeftBrace),
("{%", TokenType::LeftBracePercent),
("{#", TokenType::LeftBraceHash),
("{", TokenType::Text),
];
assert_token_type(test_cases, |lexer, _| lexer.left_brace());
}
#[test]
fn test_right_brace() {
let test_cases = vec![("}}", TokenType::DoubleRightBrace), ("}", TokenType::Text)];
assert_token_type(test_cases, |lexer, _| lexer.right_brace());
}
#[test]
fn test_percent() {
let test_cases = vec![
("%", TokenType::Percent),
("%}", TokenType::PercentRightBrace),
];
assert_token_type(test_cases, |lexer, _| lexer.percent());
}
#[test]
fn test_bang() {
let test_cases = vec![("!", TokenType::Bang), ("!=", TokenType::BangEqual)];
assert_token_type(test_cases, |lexer, _| lexer.bang());
}
#[test]
fn test_equal() {
let test_cases = vec![("=", TokenType::Equal), ("==", TokenType::DoubleEqual)];
assert_token_type(test_cases, |lexer, _| lexer.equal());
}
#[test]
fn test_left_angle() {
let test_cases = vec![
("<", TokenType::LeftAngle),
("<=", TokenType::LeftAngleEqual),
("<!--", TokenType::LeftAngleBangDashDash),
("<!", TokenType::LeftAngle),
("<!-", TokenType::LeftAngle),
("<!---", TokenType::LeftAngleBangDashDash),
];
assert_token_type(test_cases, |lexer, _| lexer.left_angle());
}
#[test]
fn test_right_angle() {
let test_cases = vec![
(">", TokenType::RightAngle),
(">=", TokenType::RightAngleEqual),
];
assert_token_type(test_cases, |lexer, _| lexer.right_angle());
}
#[test]
fn test_slash() {
let test_cases = vec![
("/", TokenType::Slash),
("/>", TokenType::SlashRightAngle),
("//", TokenType::DoubleSlash),
("/*", TokenType::SlashStar),
];
assert_token_type(test_cases, |lexer, _| lexer.slash());
}
#[test]
fn test_dash() {
let test_cases = vec![
("-", TokenType::Dash),
("-->", TokenType::DashDashRightAngle),
("--", TokenType::Text),
];
assert_token_type(test_cases, |lexer, _| lexer.dash());
}
#[test]
fn test_star() {
let test_cases = vec![("*/", TokenType::StarSlash), ("*", TokenType::Text)];
assert_token_type(test_cases, |lexer, _| lexer.star());
}
#[test]
fn test_whitespace() {
let test_cases = vec![
(" ", TokenType::Whitespace),
("\r", TokenType::Whitespace),
("\t", TokenType::Whitespace),
("\n", TokenType::Whitespace),
(" ", TokenType::Whitespace),
(" \n", TokenType::Whitespace),
];
assert_token_type(test_cases, |lexer, c| lexer.whitespace(c.unwrap()));
}
#[test]
fn test_text() {
let test_cases = vec![
("a", TokenType::Text),
("1", TokenType::Text),
("Hello", TokenType::Text),
];
assert_token_type(test_cases, |lexer, _| lexer.text());
}
}
fn tokenize(input: &str) -> Vec<Token> {
let mut lexer = Lexer::new(input);
match lexer.tokenize() {
Ok(tokens) => {
// Debug print all tokens
for token in tokens.iter() {
println!("{:?}", token)
}
tokens
}
Err(e) => {
eprintln!("Tokenization error: {:?}", e);
eprintln!("Input that caused the error: {}", input);
panic!("Tokenization failed. See error output above.");
}
}
}
#[test]
fn test_opening_tag() {
let tokens = tokenize("<html>");
assert_eq!(tokens[0].token_type, TokenType::LeftAngle);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::RightAngle);
}
#[test]
fn test_closing_tag() {
let tokens = tokenize("</body>");
assert_eq!(tokens[0].token_type, TokenType::LeftAngle);
assert_eq!(tokens[1].token_type, TokenType::Slash);
assert_eq!(tokens[2].token_type, TokenType::Text);
assert_eq!(tokens[3].token_type, TokenType::RightAngle);
}
#[test]
fn test_html_attribute() {
let tokens = tokenize(r#"<a href="link">"#);
assert_eq!(tokens[0].token_type, TokenType::LeftAngle);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::Text);
assert_eq!(tokens[3].token_type, TokenType::Equal);
assert_eq!(tokens[4].token_type, TokenType::DoubleQuote);
assert_eq!(tokens[5].token_type, TokenType::Text);
assert_eq!(tokens[6].token_type, TokenType::DoubleQuote);
assert_eq!(tokens[7].token_type, TokenType::RightAngle);
}
#[test]
fn test_django_variable() {
let tokens = tokenize("{{ variable }}");
assert_eq!(tokens[0].token_type, TokenType::DoubleLeftBrace);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::DoubleRightBrace);
}
#[test]
fn test_django_templatetag() {
let tokens = tokenize("{% if condition %}");
assert_eq!(tokens[0].token_type, TokenType::LeftBracePercent);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::Text);
assert_eq!(tokens[3].token_type, TokenType::PercentRightBrace);
}
#[test]
fn test_django_comment() {
let tokens = tokenize("{# This is a comment #}");
assert_eq!(tokens[0].token_type, TokenType::LeftBraceHash);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::Text);
assert_eq!(tokens[3].token_type, TokenType::Text);
assert_eq!(tokens[4].token_type, TokenType::Text);
assert_eq!(tokens[5].token_type, TokenType::HashRightBrace);
}
#[test]
fn test_django_filter() {
let tokens = tokenize("{{ value|default:'default' }}");
assert_eq!(tokens[0].token_type, TokenType::DoubleLeftBrace);
assert_eq!(tokens[1].token_type, TokenType::Text);
assert_eq!(tokens[2].token_type, TokenType::Pipe);
assert_eq!(tokens[3].token_type, TokenType::Text);
assert_eq!(tokens[4].token_type, TokenType::Colon);
assert_eq!(tokens[5].token_type, TokenType::SingleQuote);
assert_eq!(tokens[6].token_type, TokenType::Text);
assert_eq!(tokens[7].token_type, TokenType::SingleQuote);
assert_eq!(tokens[8].token_type, TokenType::DoubleRightBrace);
}
#[test]
fn test_quoted_django_templatetag() {
let tokens = tokenize(r#"'{% url "api:index" %}'"#);
assert_eq!(tokens[0].token_type, TokenType::SingleQuote);
assert_eq!(tokens[1].token_type, TokenType::LeftBracePercent);
assert_eq!(tokens[2].token_type, TokenType::Text);
assert_eq!(tokens[3].token_type, TokenType::DoubleQuote);
assert_eq!(tokens[4].token_type, TokenType::Text);
assert_eq!(tokens[5].token_type, TokenType::Colon);
assert_eq!(tokens[6].token_type, TokenType::Text);
assert_eq!(tokens[7].token_type, TokenType::DoubleQuote);
assert_eq!(tokens[8].token_type, TokenType::PercentRightBrace);
assert_eq!(tokens[9].token_type, TokenType::SingleQuote);
}
#[test]
fn test_multiline_template() {
let template = r#"\
{% if user.is_authenticated %}
Hello, {{ user.name }}!
{% else %}
Please log in.
{% endif %}
"#;
let tokens = tokenize(template);
assert_eq!(tokens[0].line, 1);
assert_eq!(tokens[6].line, 2);
assert_eq!(tokens[14].line, 3);
assert_eq!(tokens[17].line, 4);
assert_eq!(tokens[21].line, 5);
}
}
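
The new add_token drops Whitespace tokens while tokenize still advances the line counter through them, so line numbers stay correct on the tokens that remain. A hypothetical test in the style of the ones above:

    #[test]
    fn whitespace_is_skipped_but_lines_advance() {
        let mut lexer = Lexer::new("a\nb");
        let tokens = lexer.tokenize().unwrap();
        assert_eq!(tokens[0].token_type, TokenType::Text); // "a" on line 1
        assert_eq!(tokens[1].token_type, TokenType::Text); // "b" on line 2
        assert_eq!(tokens[0].line, 1);
        assert_eq!(tokens[1].line, 2);
    }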


@@ -5,10 +5,11 @@ mod token;
 use lexer::Lexer;
 use std::error::Error;
-use token::Tokenizer;
 
 pub fn compile(template: &str) -> Result<String, Box<dyn Error>> {
-    let tokens = Lexer::new(template).tokenize()?;
+    let mut lexer = Lexer::new(template);
+    let tokens = lexer.tokenize()?;
+    let ast = Parser::new(tokens.clone()).parse()?;
     println!("{:?}", tokens);
     todo!("Implement compilation process")
 }


@@ -1,4 +1,4 @@
-use crate::scanner::Scanner;
+use crate::error::TokenError;
use std::fmt::Debug;
#[derive(Debug, Clone, PartialEq)]
@@ -38,7 +38,275 @@ pub enum TokenType {
Eof,
}
-#[derive(Debug, Clone)]
impl TokenType {
fn single_char(c: char) -> Result<(Self, usize), TokenError> {
let token_type;
let size = 1;
token_type = match c {
',' => Self::Comma,
'.' => Self::Dot,
'+' => Self::Plus,
':' => Self::Colon,
'|' => Self::Pipe,
'\'' => Self::SingleQuote,
'"' => Self::DoubleQuote,
'/' => Self::Slash,
'%' => Self::Percent,
_ => return Err(TokenError::UnexpectedCharacter { character: c }),
};
Ok((token_type, size))
}
fn left_brace(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("{{") {
token_type = Self::DoubleLeftBrace;
size = 2;
} else if s.starts_with("{%") {
token_type = Self::LeftBracePercent;
size = 2;
} else if s.starts_with("{#") {
token_type = Self::LeftBraceHash;
size = 2;
} else {
token_type = Self::Text;
size = 1;
}
Ok((token_type, size))
}
fn right_brace(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("}}") {
token_type = Self::DoubleRightBrace;
size = 2;
} else {
token_type = Self::Text;
size = 1;
}
Ok((token_type, size))
}
fn percent(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("%}") {
token_type = Self::PercentRightBrace;
size = 2;
} else {
token_type = Self::Percent;
size = 1;
}
Ok((token_type, size))
}
fn hash(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("#}") {
token_type = Self::HashRightBrace;
size = 2;
} else {
token_type = Self::Text;
size = 1;
}
Ok((token_type, size))
}
fn bang(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("!=") {
token_type = Self::BangEqual;
size = 2;
} else {
token_type = Self::Bang;
size = 1;
}
Ok((token_type, size))
}
fn equal(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("==") {
token_type = Self::DoubleEqual;
size = 2;
} else {
token_type = Self::Equal;
size = 1;
}
Ok((token_type, size))
}
fn left_angle(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("<=") {
token_type = Self::LeftAngleEqual;
size = 2;
} else if s.starts_with("<!--") {
token_type = Self::LeftAngleBangDashDash;
size = 4; // "<!--" is four bytes
} else {
token_type = Self::LeftAngle;
size = 1;
}
Ok((token_type, size))
}
fn right_angle(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with(">=") {
token_type = Self::RightAngleEqual;
size = 2;
} else {
token_type = Self::RightAngle;
size = 1;
}
Ok((token_type, size))
}
fn slash(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("/>") {
token_type = Self::SlashRightAngle;
size = 2;
} else if s.starts_with("//") {
token_type = Self::DoubleSlash;
size = 2;
} else if s.starts_with("/*") {
token_type = Self::SlashStar;
size = 2;
} else if s.starts_with("*/") {
token_type = Self::StarSlash;
size = 2;
} else {
token_type = Self::Slash;
size = 1;
}
Ok((token_type, size))
}
fn dash(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if let Some(rest) = s.strip_prefix("--") {
if rest.starts_with(">") {
token_type = Self::DashDashRightAngle;
size = 3;
} else {
token_type = Self::Text;
size = 2;
}
} else {
token_type = Self::Dash;
size = 1;
}
Ok((token_type, size))
}
fn star(s: &str) -> Result<(Self, usize), TokenError> {
let token_type;
let size;
if s.starts_with("*/") {
token_type = Self::StarSlash;
size = 2;
} else {
token_type = Self::Text;
size = 1;
}
Ok((token_type, size))
}
fn whitespace(s: &str) -> Result<(Self, usize, usize), TokenError> {
let mut size = 0;
let mut lines = 0;
let mut chars = s.chars().peekable();
while let Some(&c) = chars.peek() {
match c {
' ' | '\t' => {}
'\n' => {
lines += 1;
}
'\r' => {
// consume '\r' here, fold a following '\n' into the same line
// break, and size both bytes so the caller advances past them
size += c.len_utf8();
chars.next();
if chars.peek() == Some(&'\n') {
size += '\n'.len_utf8();
chars.next();
}
lines += 1;
continue;
}
_ => break,
}
size += c.len_utf8();
chars.next();
}
if size > 0 {
Ok((Self::Whitespace, size, lines))
} else {
Err(TokenError::NoTokenMatch)
}
}
fn text(s: &str) -> Result<(Self, usize), TokenError> {
let mut size = 0;
for c in s.chars() {
if Self::is_token_boundary(c) {
break;
}
// count bytes, not chars, so the caller can slice the input safely
size += c.len_utf8();
}
if size > 0 {
Ok((Self::Text, size))
} else {
Err(TokenError::NoTokenMatch)
}
}
fn is_token_boundary(c: char) -> bool {
const TOKEN_BOUNDARIES: &[char] = &[
'(', ')', '[', ']', '{', '}', ',', '.', '-', '+', ':', ';', '*', '|', '%', '#', '!',
'=', '<', '>', '/', ' ', '\r', '\t', '\n', '"', '\'',
];
TOKEN_BOUNDARIES.contains(&c)
}
}
+#[derive(Clone, Debug, PartialEq)]
pub struct Token<'a> {
pub token_type: TokenType,
pub lexeme: &'a str,
@@ -53,13 +321,271 @@ impl<'a> Token<'a> {
line,
}
}
pub fn from_input(input: &'a str, line: usize) -> Result<(Self, usize, usize), TokenError> {
let c = input.chars().next().ok_or(TokenError::NoTokenMatch)?;
if c.is_whitespace() {
let (token_type, size, lines_consumed) = TokenType::whitespace(input)?;
return Ok((
Self::new(token_type, &input[..size.min(input.len())], line),
size,
lines_consumed,
));
}
let (token_type, size) = match c {
',' | '.' | '+' | ':' | '|' | '\'' | '"' => TokenType::single_char(c)?,
'{' => TokenType::left_brace(input)?,
'}' => TokenType::right_brace(input)?,
'%' => TokenType::percent(input)?,
'#' => TokenType::hash(input)?,
'!' => TokenType::bang(input)?,
'=' => TokenType::equal(input)?,
'<' => TokenType::left_angle(input)?,
'>' => TokenType::right_angle(input)?,
'/' => TokenType::slash(input)?,
'-' => TokenType::dash(input)?,
'*' => TokenType::star(input)?,
_ => TokenType::text(input)?,
};
Ok((
Self::new(token_type, &input[..size.min(input.len())], line),
size,
0,
))
}
}
-pub trait Tokenizer<'a>: Scanner {
-    type Token: Debug;
-    type TokenType: Debug;
-
-    fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
-    fn next_token(&mut self) -> Result<(Self::TokenType, &'a str), Self::Error>;
-    fn add_token(&mut self, token_type: Self::TokenType, text: &'a str);
-}
+#[cfg(test)]
+mod tests {
+    use super::*;
fn assert_token_instance<F>(test_cases: Vec<(&str, TokenType)>, method: F)
where
F: Fn(&str) -> Result<(Token<'_>, usize, usize), TokenError>,
{
for (input, expected_token_type) in test_cases {
println!("Testing input: {:?}", input);
// Call the token-based method
match method(input) {
Ok((token, _size_consumed, _lines_consumed)) => {
assert_eq!(token.token_type, expected_token_type, "Input: {}", input);
}
Err(e) => panic!(
"Expected {:?}, but got Err({:?}) for input: {}",
expected_token_type, e, input,
),
}
}
}
#[test]
fn test_match_token() {
let test_cases = vec![
("<", TokenType::LeftAngle),
(">", TokenType::RightAngle),
(",", TokenType::Comma),
(".", TokenType::Dot),
("-", TokenType::Dash),
("+", TokenType::Plus),
(":", TokenType::Colon),
("/", TokenType::Slash),
("!", TokenType::Bang),
("=", TokenType::Equal),
("|", TokenType::Pipe),
("%", TokenType::Percent),
("'", TokenType::SingleQuote),
("\"", TokenType::DoubleQuote),
("{{", TokenType::DoubleLeftBrace),
("}}", TokenType::DoubleRightBrace),
("{%", TokenType::LeftBracePercent),
("%}", TokenType::PercentRightBrace),
("{#", TokenType::LeftBraceHash),
("#}", TokenType::HashRightBrace),
("!=", TokenType::BangEqual),
("==", TokenType::DoubleEqual),
("<=", TokenType::LeftAngleEqual),
(">=", TokenType::RightAngleEqual),
("<!--", TokenType::LeftAngleBangDashDash),
("-->", TokenType::DashDashRightAngle),
("/>", TokenType::SlashRightAngle),
("//", TokenType::DoubleSlash),
("/*", TokenType::SlashStar),
("*/", TokenType::StarSlash),
(" ", TokenType::Whitespace),
("\r", TokenType::Whitespace),
("\t", TokenType::Whitespace),
("\n", TokenType::Whitespace),
(" ", TokenType::Whitespace),
(" \n", TokenType::Whitespace),
("a", TokenType::Text),
("1", TokenType::Text),
("Hello", TokenType::Text),
];
assert_token_instance(test_cases, |input| Token::from_input(input, 0));
}
fn assert_token_type<F>(test_cases: Vec<(&str, TokenType)>, method: F)
where
F: Fn(&str) -> Result<(TokenType, usize), TokenError>,
{
for (input, expected_token_type) in test_cases {
println!("Testing input: {:?}", input);
match method(input) {
Ok((token_type, _size_consumed)) => {
assert_eq!(token_type, expected_token_type, "Input: {}", input);
}
Err(e) => panic!(
"Expected {:?}, but got Err({:?}) for input: {}",
expected_token_type, e, input,
),
}
}
}
#[test]
fn test_left_brace() {
let test_cases = vec![
("{{", TokenType::DoubleLeftBrace),
("{%", TokenType::LeftBracePercent),
("{#", TokenType::LeftBraceHash),
("{", TokenType::Text),
];
assert_token_type(test_cases, TokenType::left_brace);
}
#[test]
fn test_right_brace() {
let test_cases = vec![("}}", TokenType::DoubleRightBrace), ("}", TokenType::Text)];
assert_token_type(test_cases, TokenType::right_brace);
}
#[test]
fn test_percent() {
let test_cases = vec![
("%", TokenType::Percent),
("%}", TokenType::PercentRightBrace),
];
assert_token_type(test_cases, TokenType::percent);
}
#[test]
fn test_bang() {
let test_cases = vec![("!", TokenType::Bang), ("!=", TokenType::BangEqual)];
assert_token_type(test_cases, TokenType::bang);
}
#[test]
fn test_equal() {
let test_cases = vec![("=", TokenType::Equal), ("==", TokenType::DoubleEqual)];
assert_token_type(test_cases, TokenType::equal);
}
#[test]
fn test_left_angle() {
let test_cases = vec![
("<", TokenType::LeftAngle),
("<=", TokenType::LeftAngleEqual),
("<!--", TokenType::LeftAngleBangDashDash),
("<!", TokenType::LeftAngle),
("<!-", TokenType::LeftAngle),
("<!---", TokenType::LeftAngleBangDashDash),
];
assert_token_type(test_cases, TokenType::left_angle);
}
#[test]
fn test_right_angle() {
let test_cases = vec![
(">", TokenType::RightAngle),
(">=", TokenType::RightAngleEqual),
];
assert_token_type(test_cases, TokenType::right_angle);
}
#[test]
fn test_slash() {
let test_cases = vec![
("/", TokenType::Slash),
("/>", TokenType::SlashRightAngle),
("//", TokenType::DoubleSlash),
("/*", TokenType::SlashStar),
];
assert_token_type(test_cases, TokenType::slash);
}
#[test]
fn test_dash() {
let test_cases = vec![
("-", TokenType::Dash),
("-->", TokenType::DashDashRightAngle),
("--", TokenType::Text),
];
assert_token_type(test_cases, TokenType::dash);
}
#[test]
fn test_star() {
let test_cases = vec![("*/", TokenType::StarSlash), ("*", TokenType::Text)];
assert_token_type(test_cases, TokenType::star);
}
#[test]
fn test_text() {
let test_cases = vec![
("a", TokenType::Text),
("1", TokenType::Text),
("Hello", TokenType::Text),
];
assert_token_type(test_cases, TokenType::text);
}
fn assert_whitespace_token_type<F>(test_cases: Vec<(&str, usize)>, method: F)
where
F: Fn(&str) -> Result<(TokenType, usize, usize), TokenError>,
{
for (input, expected_lines) in test_cases {
println!("Testing input: {:?}", input);
// Call the token matcher
match method(input) {
Ok((token_type, _size_consumed, lines_consumed)) => {
assert_eq!(token_type, TokenType::Whitespace, "Input: {}", input);
assert_eq!(lines_consumed, expected_lines, "Input: {}", input);
}
Err(e) => panic!(
"Expected Whitespace, but got Err({:?}) for input: {}",
e, input
),
}
}
}
#[test]
fn test_whitespace_token_type() {
let test_cases = vec![
(" ", 0),
("\n", 1),
("\t", 0),
("\r", 1),
(" \n", 1),
("\r\n", 1),
];
assert_whitespace_token_type(test_cases, TokenType::whitespace);
}
}
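
With the \r\n handling in TokenType::whitespace, the matcher reports byte width and line breaks in one pass. An illustrative assertion one could add to the tests module above (it has to live in that module, since the matcher is private):

    assert_eq!(
        TokenType::whitespace("\r\n").unwrap(),
        (TokenType::Whitespace, 2, 1) // two bytes consumed, one line break
    );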