mirror of
https://github.com/joshuadavidthomas/django-template-ast.git
synced 2025-09-08 17:50:30 +00:00
simplify lexer and tokenization by removing literal values
The separate literal value isn't really needed, since we're only dealing with text in Django templates, at least for the moment; the lexeme already carries the exact slice of source text.
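To make the shape of the change concrete, here is a minimal before/after sketch of the token type; these are trimmed stand-ins for the real definitions in src/lexer.rs shown in the diff below, not the module itself:

```rust
#[allow(dead_code)]
#[derive(Debug)]
enum TokenType {
    Text,
    Eof,
}

// Before: every token dragged along an Option<String> literal that, for
// template text, always duplicated the lexeme.
#[allow(dead_code)]
struct OldToken {
    token_type: TokenType,
    lexeme: String,
    literal: Option<String>,
    line: usize,
}

// After: the lexeme (the exact slice of source text) is the only payload.
struct Token {
    token_type: TokenType,
    lexeme: String,
    line: usize,
}

fn main() {
    let token = Token {
        token_type: TokenType::Text,
        lexeme: "hello".to_string(),
        line: 1,
    };
    println!("{:?} {:?} (line {})", token.token_type, token.lexeme, token.line);
}
```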
This commit is contained in:
parent 433c17999c
commit d76378d6af
1 changed file with 102 additions and 113 deletions
src/lexer.rs +102 −113 (215 lines changed)
```diff
@@ -38,6 +38,7 @@ pub enum TokenType {
     DoubleEqual,     // ==
     LeftAngleEqual,  // <=
     RightAngleEqual, // >=
+    Whitespace, // special token to account for whitespace
     Text,
     Eof,
 }
```
```diff
@@ -46,36 +47,19 @@ pub enum TokenType {
 pub struct Token {
     token_type: TokenType,
     lexeme: String,
-    literal: Option<String>,
     line: usize,
 }
 
 impl Token {
-    fn new(token_type: TokenType, lexeme: String, literal: Option<String>, line: usize) -> Self {
+    fn new(token_type: TokenType, lexeme: String, line: usize) -> Self {
         Token {
             token_type,
             lexeme,
-            literal,
             line,
         }
     }
 }
 
-impl fmt::Display for Token {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(
-            f,
-            "{}{}",
-            self.lexeme,
-            if let Some(literal) = &self.literal {
-                format!(" ({})", literal)
-            } else {
-                String::new()
-            }
-        )
-    }
-}
-
 pub trait Tokenizer: Scanner {
     type Token: Debug;
     type TokenType: Debug;
```
```diff
@@ -83,18 +67,22 @@ pub trait Tokenizer: Scanner {
 
     fn tokenize(&mut self) -> Result<Vec<Self::Token>, Self::Error>;
     fn scan_token(&mut self) -> Result<(), Self::Error>;
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>);
+    fn add_token(&mut self, token_type: Self::TokenType);
 }
 
 #[derive(Debug)]
 pub enum LexerError {
     EmptyToken(usize),
     UnexpectedCharacter(char, usize),
 }
 
 impl fmt::Display for LexerError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
             LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
             LexerError::UnexpectedCharacter(c, line) => {
                 write!(f, "Unexpected character '{}' at line {}", c, line)
             }
         }
     }
 }
```
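Pulled out of the hunk above, here is what the two error variants render as; a self-contained copy of the LexerError code with a small check added:

```rust
use std::fmt;

#[derive(Debug)]
enum LexerError {
    EmptyToken(usize),
    UnexpectedCharacter(char, usize),
}

impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LexerError::EmptyToken(line) => write!(f, "Empty token at line {}", line),
            LexerError::UnexpectedCharacter(c, line) => {
                write!(f, "Unexpected character '{}' at line {}", c, line)
            }
        }
    }
}

fn main() {
    // Display is what callers see when a template fails to tokenize.
    assert_eq!(LexerError::EmptyToken(3).to_string(), "Empty token at line 3");
    assert_eq!(
        LexerError::UnexpectedCharacter('@', 7).to_string(),
        "Unexpected character '@' at line 7"
    );
    println!("ok");
}
```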
```diff
@@ -123,21 +111,10 @@ impl Lexer {
     fn scan_token(&mut self) -> Result<(), LexerError> {
         let c = self.advance();
 
-        let (token_type, literal) = match c {
-            '(' => (TokenType::LeftParen, None),
-            ')' => (TokenType::RightParen, None),
-            '[' => (TokenType::LeftBracket, None),
-            ']' => (TokenType::RightBracket, None),
-            ',' => (TokenType::Comma, None),
-            '.' => (TokenType::Dot, None),
-            '-' => (TokenType::Minus, None),
-            '+' => (TokenType::Plus, None),
-            ':' => (TokenType::Colon, None),
-            ';' => (TokenType::Semicolon, None),
-            '*' => (TokenType::Star, None),
-            '|' => (TokenType::Pipe, None),
-            '\'' => (TokenType::SingleQuote, None),
-            '"' => (TokenType::DoubleQuote, None),
+        let token_type = match c {
+            '(' | ')' | '[' | ']' | ',' | '.' | '-' | '+' | ':' | ';' | '*' | '|' | '\'' | '"' => {
+                self.handle_single_char(c)
+            }
             '{' => self.handle_left_brace(),
             '}' => self.handle_right_brace(),
             '%' => self.handle_percent(),
```
```diff
@@ -147,18 +124,37 @@ impl Lexer {
             '<' => self.handle_left_angle(),
             '>' => self.handle_right_angle(),
             '/' => self.handle_slash(),
-            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(c),
-            _ => self.handle_text()?,
+            ' ' | '\r' | '\t' | '\n' => self.handle_whitespace(),
+            _ => self.handle_text(),
         };
 
-        if token_type != TokenType::Text || literal.is_some() {
-            self.add_token(token_type, literal);
-        }
+        self.add_token(token_type?);
 
         Ok(())
     }
 
-    fn handle_left_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_single_char(&mut self, c: char) -> Result<TokenType, LexerError> {
+        let token_type = match c {
+            '(' => TokenType::LeftParen,
+            ')' => TokenType::RightParen,
+            '[' => TokenType::LeftBracket,
+            ']' => TokenType::RightBracket,
+            ',' => TokenType::Comma,
+            '.' => TokenType::Dot,
+            '-' => TokenType::Minus,
+            '+' => TokenType::Plus,
+            ':' => TokenType::Colon,
+            ';' => TokenType::Semicolon,
+            '*' => TokenType::Star,
+            '|' => TokenType::Pipe,
+            '\'' => TokenType::SingleQuote,
+            '"' => TokenType::DoubleQuote,
+            _ => return Err(LexerError::UnexpectedCharacter(c, self.line)),
+        };
+        Ok(token_type)
+    }
+
+    fn handle_left_brace(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('{') {
             TokenType::DoubleLeftBrace
         } else if self.advance_if_matches('%') {
```
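The core move of this refactor is visible at the add_token call site: every handler now returns Result<TokenType, LexerError>, so a single `?` propagates a failure from any branch. A reduced standalone sketch of that shape (not the real module, which threads state through &mut self):

```rust
#[derive(Debug)]
enum LexerError {
    UnexpectedCharacter(char, usize),
}

#[derive(Debug)]
enum TokenType {
    Star,
    Pipe,
}

// Stand-in for Lexer::handle_single_char: map a char to its token type or fail.
fn handle_single_char(c: char, line: usize) -> Result<TokenType, LexerError> {
    match c {
        '*' => Ok(TokenType::Star),
        '|' => Ok(TokenType::Pipe),
        _ => Err(LexerError::UnexpectedCharacter(c, line)),
    }
}

// Stand-in for scan_token: `token_type?` either yields the TokenType or
// returns the handler's error to the caller.
fn scan(c: char) -> Result<(), LexerError> {
    let token_type = handle_single_char(c, 1);
    println!("{:?}", token_type?);
    Ok(())
}

fn main() {
    scan('*').unwrap(); // prints "Star"
    assert!(matches!(scan('@'), Err(_))); // error propagated, nothing printed
}
```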
```diff
@@ -168,138 +164,102 @@ impl Lexer {
         } else {
             TokenType::LeftBrace
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_right_brace(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_brace(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::DoubleRightBrace
         } else {
             TokenType::RightBrace
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_percent(&mut self) -> (TokenType, Option<String>) {
+    fn handle_percent(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::PercentRightBrace
         } else {
             TokenType::Percent
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_hash(&mut self) -> (TokenType, Option<String>) {
+    fn handle_hash(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('}') {
             TokenType::HashRightBrace
         } else {
             TokenType::Hash
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_bang(&mut self) -> (TokenType, Option<String>) {
+    fn handle_bang(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::BangEqual
         } else {
             TokenType::Bang
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_equal(&mut self) -> (TokenType, Option<String>) {
+    fn handle_equal(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::DoubleEqual
         } else {
             TokenType::Equal
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_left_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_left_angle(&mut self) -> Result<TokenType, LexerError> {
        let token_type = if self.advance_if_matches('=') {
             TokenType::LeftAngleEqual
         } else {
             TokenType::LeftAngle
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_right_angle(&mut self) -> (TokenType, Option<String>) {
+    fn handle_right_angle(&mut self) -> Result<TokenType, LexerError> {
         let token_type = if self.advance_if_matches('=') {
             TokenType::RightAngleEqual
         } else {
             TokenType::RightAngle
         };
-        (token_type, None)
+        Ok(token_type)
     }
 
-    fn handle_slash(&mut self) -> (TokenType, Option<String>) {
-        if self.advance_if_matches('/') {
-            let start = self.current - 2;
+    fn handle_slash(&mut self) -> Result<TokenType, LexerError> {
+        let token_type = if self.advance_if_matches('/') {
             while self.peek() != '\n' && !self.is_at_end() {
                 self.advance();
             }
-            let comment = self.source[start..self.current].to_string();
-            (TokenType::Text, Some(comment))
+            TokenType::Text
         } else {
-            (TokenType::Slash, None)
-        }
+            TokenType::Slash
+        };
+        Ok(token_type)
     }
 
-    fn handle_whitespace(&mut self, c: char) -> (TokenType, Option<String>) {
-        if c == '\n' {
-            self.line += 1;
-        }
-        (TokenType::Text, None)
-    }
-
-    fn handle_text(&mut self) -> Result<(TokenType, Option<String>), LexerError> {
-        while !self.is_at_end() && !self.is_delimiter(self.peek()) {
-            self.advance();
-        }
-
-        let text = self.source[self.start..self.current].to_string();
-        if text.is_empty() {
-            Err(LexerError::EmptyToken(self.line))
-        } else {
-            Ok((TokenType::Text, Some(text)))
-        }
-    }
-
-    fn is_delimiter(&self, c: char) -> bool {
-        matches!(
-            c,
-            '(' | ')'
-                | '['
-                | ']'
-                | '{'
-                | '}'
-                | ','
-                | '.'
-                | '-'
-                | '+'
-                | ':'
-                | ';'
-                | '*'
-                | '|'
-                | '%'
-                | '#'
-                | '!'
-                | '='
-                | '<'
-                | '>'
-                | '/'
-                | ' '
-                | '\r'
-                | '\t'
-                | '\n'
-                | '"'
-                | '\''
-        )
-    }
+    fn handle_whitespace(&mut self) -> Result<TokenType, LexerError> {
+        while !self.is_at_end() && self.peek().is_whitespace() {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+            self.advance();
+        }
+        Ok(TokenType::Whitespace)
+    }
+
+    fn handle_text(&mut self) -> Result<TokenType, LexerError> {
+        self.advance_while(|c| !Self::is_token_boundary(c));
+
+        if self.start == self.current {
+            Err(LexerError::EmptyToken(self.line))
+        } else {
+            Ok(TokenType::Text)
+        }
+    }
 
     fn advance_if_matches(&mut self, expected: char) -> bool {
```
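handle_whitespace now consumes an entire whitespace run in one call instead of handling one character per scan, still bumping the line counter on each newline. A standalone sketch of that loop, assuming byte-index bookkeeping like the real Lexer's start/current cursor (the actual method drives this through peek/advance):

```rust
// Consume every whitespace char from `start`, tracking newlines, so the whole
// run becomes a single Whitespace token rather than one token per character.
fn consume_whitespace(source: &str, start: usize, line: &mut usize) -> usize {
    let mut current = start;
    for c in source[start..].chars() {
        if !c.is_whitespace() {
            break;
        }
        if c == '\n' {
            *line += 1;
        }
        current += c.len_utf8();
    }
    current // index one past the run; the lexeme is &source[start..current]
}

fn main() {
    let mut line = 1;
    let end = consume_whitespace("  \n\t x", 0, &mut line);
    assert_eq!(end, 5); // two spaces, newline, tab, space
    assert_eq!(line, 2); // the embedded '\n' bumped the line count
    println!("consumed {} bytes, now at line {}", end, line);
}
```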
```diff
@@ -310,6 +270,27 @@ impl Lexer {
             true
         }
     }
+
+    fn advance_while<F>(&mut self, condition: F)
+    where
+        F: Fn(char) -> bool,
+    {
+        while !self.is_at_end() && condition(self.peek()) {
+            if self.peek() == '\n' {
+                self.line += 1;
+            }
+            self.advance();
+        }
+    }
+
+    fn is_token_boundary(c: char) -> bool {
+        const TOKEN_BOUNDARIES: &[char] = &[
+            '(', ')', '[', ']', '{', '}', ',', '.', '-', '+', ':', ';', '*', '|', '%', '#', '!',
+            '=', '<', '>', '/', ' ', '\r', '\t', '\n', '"', '\'',
+        ];
+
+        TOKEN_BOUNDARIES.contains(&c)
+    }
 }
 
 impl Tokenizer for Lexer {
```
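is_token_boundary, added above, replaces the old is_delimiter and is the predicate handle_text scans against; a plain `contains` over a const slice keeps it allocation-free, and a linear scan over 27 candidates is cheap. Copied out with a small demonstration of how a text run stops at the first boundary:

```rust
fn is_token_boundary(c: char) -> bool {
    const TOKEN_BOUNDARIES: &[char] = &[
        '(', ')', '[', ']', '{', '}', ',', '.', '-', '+', ':', ';', '*', '|', '%', '#', '!',
        '=', '<', '>', '/', ' ', '\r', '\t', '\n', '"', '\'',
    ];
    TOKEN_BOUNDARIES.contains(&c)
}

fn main() {
    assert!(is_token_boundary('{'));
    assert!(!is_token_boundary('a'));
    // A Text run stops at any boundary char, e.g. "name|upper" splits at '|'.
    let text: String = "name|upper"
        .chars()
        .take_while(|&c| !is_token_boundary(c))
        .collect();
    assert_eq!(text, "name");
    println!("ok");
}
```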
```diff
@@ -324,7 +305,7 @@ impl Tokenizer for Lexer {
         }
 
         self.tokens
-            .push(Token::new(TokenType::Eof, String::new(), None, self.line));
+            .push(Token::new(TokenType::Eof, String::new(), self.line));
         Ok(self.tokens.clone())
     }
```
```diff
@@ -332,10 +313,11 @@ impl Tokenizer for Lexer {
         self.scan_token()
     }
 
-    fn add_token(&mut self, token_type: Self::TokenType, literal: Option<String>) {
+    fn add_token(&mut self, token_type: Self::TokenType) {
         let text = self.source[self.start..self.current].to_string();
-        self.tokens
-            .push(Token::new(token_type, text, literal, self.line));
+        if token_type != TokenType::Whitespace {
+            self.tokens.push(Token::new(token_type, text, self.line));
+        }
     }
 }
```
```diff
@@ -368,7 +350,14 @@ mod tests {
     fn tokenize(input: &str) -> Vec<Token> {
         let mut lexer = Lexer::new(input.to_string());
         match lexer.tokenize() {
-            Ok(tokens) => tokens,
+            Ok(tokens) => {
+                // Debug print all tokens
+                for (i, token) in tokens.iter().enumerate() {
+                    println!("{}: {:?}", i, token)
+                }
+
+                tokens
+            }
             Err(e) => {
                 eprintln!("Tokenization error: {:?}", e);
                 eprintln!("Input that caused the error: {}", input);
```
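For reference, a hypothetical driver in the spirit of the test helper above; the `lexer` module path and the public visibility of Lexer::new and Tokenizer::tokenize are assumptions for illustration, not something this diff establishes:

```rust
// Hypothetical usage sketch; module path and visibility are assumed.
use lexer::{Lexer, Tokenizer};

fn main() {
    let mut lexer = Lexer::new("{{ name|upper }}".to_string());
    match lexer.tokenize() {
        // With whitespace runs scanned but filtered in add_token, this should
        // print something like: DoubleLeftBrace, Text("name"), Pipe,
        // Text("upper"), DoubleRightBrace, Eof.
        Ok(tokens) => {
            for token in &tokens {
                println!("{:?}", token);
            }
        }
        Err(e) => eprintln!("Tokenization error: {:?}", e),
    }
}
```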