add a few more Django and HTML specific tokens (#23)

This commit is contained in:
Josh Thomas 2024-10-16 12:22:50 -05:00 committed by GitHub
parent f00192a8b7
commit b9d61b4478
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 104 additions and 43 deletions

View file

@@ -19,8 +19,9 @@ impl<'a> Lexer<'a> {
fn match_token_type(&mut self, c: char) -> Result<TokenType, LexerError> {
match c {
'(' | ')' | '[' | ']' | ',' | '.' | '-' | '+' | ':' | ';' | '/' | '*' | '|' | '\''
| '"' => self.single_char(c),
'(' | ')' | '[' | ']' | ',' | '-' | '+' | ':' | ';' | '*' | '|' | '\'' | '"' => {
self.single_char(c)
}
'{' => self.left_brace(),
'}' => self.right_brace(),
'%' => self.percent(),
@@ -29,6 +30,8 @@ impl<'a> Lexer<'a> {
'=' => self.equal(),
'<' => self.left_angle(),
'>' => self.right_angle(),
'/' => self.slash(),
'.' => self.dot(),
' ' | '\r' | '\t' | '\n' => self.whitespace(c),
_ => self.text(),
}
@@ -41,12 +44,10 @@ impl<'a> Lexer<'a> {
'[' => TokenType::LeftBracket,
']' => TokenType::RightBracket,
',' => TokenType::Comma,
'.' => TokenType::Dot,
'-' => TokenType::Minus,
'+' => TokenType::Plus,
':' => TokenType::Colon,
';' => TokenType::Semicolon,
'/' => TokenType::Slash,
'*' => TokenType::Star,
'|' => TokenType::Pipe,
'\'' => TokenType::SingleQuote,
@@ -122,9 +123,20 @@ impl<'a> Lexer<'a> {
/// Lexes a token that begins with '<'.
///
/// Recognizes "<=", the HTML comment opener "<!--" ('<', '!', then two or
/// more dashes), and plain '<'. When "<!" is followed by fewer than two
/// dashes, the scan position is rewound to just after the '!' — note the
/// '!' itself stays consumed — and a plain LeftAngle is produced.
fn left_angle(&mut self) -> Result<TokenType, LexerError> {
    if self.advance_if_matches('=')? {
        return Ok(TokenType::LeftAngleEqual);
    }
    if self.advance_if_matches('!')? {
        let dash_start = self.state.current;
        self.advance_while(|c| c == '-')?;
        if self.state.current - dash_start >= 2 {
            return Ok(TokenType::LeftAngleBangMinusMinus);
        }
        // Fewer than two dashes: not a comment opener, so rewind the dashes.
        self.state.current = dash_start;
    }
    Ok(TokenType::LeftAngle)
}
@@ -137,6 +149,26 @@ impl<'a> Lexer<'a> {
Ok(token_type)
}
/// Lexes a token that begins with '/'.
///
/// Distinguishes the self-closing tag terminator "/>" and the double
/// slash "//" from a lone '/'.
fn slash(&mut self) -> Result<TokenType, LexerError> {
    if self.advance_if_matches('>')? {
        return Ok(TokenType::SlashRightAngle);
    }
    if self.advance_if_matches('/')? {
        return Ok(TokenType::DoubleSlash);
    }
    Ok(TokenType::Slash)
}
/// Lexes a token that begins with '.': either ".." or a single '.'.
fn dot(&mut self) -> Result<TokenType, LexerError> {
    let tt = match self.advance_if_matches('.')? {
        true => TokenType::DoubleDot,
        false => TokenType::Dot,
    };
    Ok(tt)
}
fn whitespace(&mut self, mut c: char) -> Result<TokenType, LexerError> {
while !self.is_at_end() && self.peek()?.is_whitespace() {
match c {
@@ -298,12 +330,12 @@ mod tests {
F: Fn(&mut Lexer, Option<char>) -> Result<TokenType, LexerError>,
{
for (input, expected) in test_cases {
println!("Testing input: {:?}", input);
let mut chars = input.chars();
let first_char = chars.next().unwrap();
let second_char = chars.next();
let rest: String = chars.collect();
let source = second_char.map_or(String::new(), |c| c.to_string());
let mut lexer = Lexer::new(&source);
let mut lexer = Lexer::new(&rest);
match method(&mut lexer, Some(first_char)) {
Ok(token_type) => assert_eq!(token_type, expected, "Input: {}", input),
@@ -351,13 +383,16 @@ mod tests {
("==", TokenType::DoubleEqual),
("<=", TokenType::LeftAngleEqual),
(">=", TokenType::RightAngleEqual),
("..", TokenType::DoubleDot),
("<!--", TokenType::LeftAngleBangMinusMinus),
("/>", TokenType::SlashRightAngle),
("//", TokenType::DoubleSlash),
(" ", TokenType::Whitespace),
("\r", TokenType::Whitespace),
("\t", TokenType::Whitespace),
("\n", TokenType::Whitespace),
(" ", TokenType::Whitespace),
(" \n", TokenType::Whitespace),
(" \r\n", TokenType::Whitespace),
("a", TokenType::Text),
("1", TokenType::Text),
("Hello", TokenType::Text),
@@ -425,6 +460,10 @@ mod tests {
let test_cases = vec![
("<", TokenType::LeftAngle),
("<=", TokenType::LeftAngleEqual),
("<!--", TokenType::LeftAngleBangMinusMinus),
("<!", TokenType::LeftAngle),
("<!-", TokenType::LeftAngle),
("<!---", TokenType::LeftAngleBangMinusMinus),
];
assert_token_type(test_cases, |lexer, _| lexer.left_angle());
@@ -440,6 +479,24 @@ mod tests {
assert_token_type(test_cases, |lexer, _| lexer.right_angle());
}
#[test]
fn test_slash() {
    // Each entry is (remaining input after the leading '/', expected token).
    let cases = vec![
        ("//", TokenType::DoubleSlash),
        ("/>", TokenType::SlashRightAngle),
        ("/", TokenType::Slash),
    ];
    assert_token_type(cases, |lexer, _| lexer.slash());
}
#[test]
fn test_dot() {
    // A lone '.' lexes as Dot; '..' lexes as DoubleDot.
    let cases = vec![
        (".", TokenType::Dot),
        ("..", TokenType::DoubleDot),
    ];
    assert_token_type(cases, |lexer, _| lexer.dot());
}
#[test]
fn test_whitespace() {
let test_cases = vec![
@@ -449,7 +506,6 @@ mod tests {
("\n", TokenType::Whitespace),
(" ", TokenType::Whitespace),
(" \n", TokenType::Whitespace),
(" \r\n", TokenType::Whitespace),
];
assert_token_type(test_cases, |lexer, c| lexer.whitespace(c.unwrap()));

View file

@@ -3,40 +3,45 @@ use std::fmt::Debug;
/// The kinds of tokens the lexer produces for Django template and HTML source.
///
/// NOTE(review): the diff rendering had concatenated the pre- and post-commit
/// variant lists, duplicating every variant; this is the deduplicated
/// post-commit enum. The comment on `RightAngleEqual` is also corrected from
/// `=>` to `>=`, matching the lexer test case `(">=", TokenType::RightAngleEqual)`.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
    LeftParen,               // (
    RightParen,              // )
    LeftBrace,               // {
    RightBrace,              // }
    LeftBracket,             // [
    RightBracket,            // ]
    LeftAngle,               // <
    RightAngle,              // >
    Comma,                   // ,
    Dot,                     // .
    Minus,                   // -
    Plus,                    // +
    Colon,                   // :
    Semicolon,               // ;
    Slash,                   // /
    Star,                    // *
    Bang,                    // !
    Equal,                   // =
    Pipe,                    // |
    Percent,                 // %
    Hash,                    // #
    SingleQuote,             // '
    DoubleQuote,             // "
    DoubleLeftBrace,         // {{
    DoubleRightBrace,        // }}
    LeftBracePercent,        // {%
    PercentRightBrace,       // %}
    LeftBraceHash,           // {#
    HashRightBrace,          // #}
    BangEqual,               // !=
    DoubleEqual,             // ==
    LeftAngleEqual,          // <=
    RightAngleEqual,         // >=
    DoubleDot,               // ..
    LeftAngleBangMinusMinus, // <!-- (HTML comment open)
    MinusMinusRightAngle,    // --> (HTML comment close)
    SlashRightAngle,         // /> (self-closing tag end)
    DoubleSlash,             // //
    Whitespace,              // special token to account for whitespace
    Text,
    Eof,
}