From f96b3da951951db5ec3c91873ac93c3832af99ee Mon Sep 17 00:00:00 2001 From: Josh Thomas Date: Mon, 9 Dec 2024 17:48:24 -0600 Subject: [PATCH] create djls-ast crate and implement lexer and parser (#8) --- Cargo.toml | 1 + crates/djls-ast/Cargo.toml | 16 + crates/djls-ast/src/ast.rs | 225 ++++++ crates/djls-ast/src/lexer.rs | 414 ++++++++++++ crates/djls-ast/src/lib.rs | 4 + crates/djls-ast/src/parser.rs | 638 ++++++++++++++++++ ..._ast__lexer__tests__tokenize_comments.snap | 94 +++ ...__lexer__tests__tokenize_django_block.snap | 27 + ...exer__tests__tokenize_django_variable.snap | 11 + ...st__lexer__tests__tokenize_everything.snap | 369 ++++++++++ ...djls_ast__lexer__tests__tokenize_html.snap | 15 + ...ls_ast__lexer__tests__tokenize_script.snap | 68 ++ ...jls_ast__lexer__tests__tokenize_style.snap | 69 ++ ...ls_ast__parser__tests__parse_comments.snap | 28 + ...st__parser__tests__parse_django_block.snap | 20 + ..._parser__tests__parse_django_variable.snap | 16 + ...ls_ast__parser__tests__parse_html_tag.snap | 15 + ...djls_ast__parser__tests__parse_script.snap | 22 + .../djls_ast__parser__tests__parse_style.snap | 17 + crates/djls-ast/src/tokens.rs | 186 +++++ 20 files changed, 2255 insertions(+) create mode 100644 crates/djls-ast/Cargo.toml create mode 100644 crates/djls-ast/src/ast.rs create mode 100644 crates/djls-ast/src/lexer.rs create mode 100644 crates/djls-ast/src/lib.rs create mode 100644 crates/djls-ast/src/parser.rs create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_comments.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_block.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_variable.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_everything.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_html.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_script.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_style.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_comments.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_block.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_variable.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_html_tag.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_script.snap create mode 100644 crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_style.snap create mode 100644 crates/djls-ast/src/tokens.rs diff --git a/Cargo.toml b/Cargo.toml index 5504872..6701f14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ resolver = "2" [workspace.dependencies] djls = { path = "crates/djls" } +djls-ast = { path = "crates/djls-ast" } djls-django = { path = "crates/djls-django" } djls-python = { path = "crates/djls-python" } diff --git a/crates/djls-ast/Cargo.toml b/crates/djls-ast/Cargo.toml new file mode 100644 index 0000000..0387ae8 --- /dev/null +++ b/crates/djls-ast/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "djls-ast" +version = "0.0.0" +edition = "2021" + +[dependencies] +serde = { workspace = true } + +thiserror = "2.0.6" + +[dev-dependencies] +insta = { version = "1.41.1", features = ["yaml"] } + +[profile.dev.package] +insta.opt-level = 3 +similar.opt-level = 3 diff --git a/crates/djls-ast/src/ast.rs b/crates/djls-ast/src/ast.rs new file mode 100644 index 0000000..3bb034c --- /dev/null +++ b/crates/djls-ast/src/ast.rs @@ -0,0 +1,225 @@ +use serde::Serialize; +use std::collections::BTreeMap; +use std::str::FromStr; +use thiserror::Error; + +#[derive(Clone, Debug, Default, Serialize)] +pub struct Ast { + nodes: Vec, +} + +impl Ast { + pub fn nodes(&self) -> &Vec { + &self.nodes + } + + pub fn add_node(&mut self, node: Node) { + self.nodes.push(node); + } + + pub fn finalize(&mut self) -> Result { + if self.nodes.is_empty() { + return Err(AstError::EmptyAst); + } + Ok(self.clone()) + } +} + +#[derive(Clone, Debug, Serialize)] +pub enum Node { + Django(DjangoNode), + Html(HtmlNode), + Script(ScriptNode), + Style(StyleNode), + Text(String), +} + +#[derive(Clone, Debug, Serialize)] +pub enum DjangoNode { + Comment(String), + Tag { + kind: DjangoTagKind, + bits: Vec, + children: Vec, + }, + Variable { + bits: Vec, + filters: Vec, + }, +} + +#[derive(Clone, Debug, Serialize)] +pub enum DjangoTagKind { + Autoescape, + Block, + Comment, + CsrfToken, + Cycle, + Debug, + Elif, + Else, + Empty, + Extends, + Filter, + FirstOf, + For, + If, + IfChanged, + Include, + Load, + Lorem, + Now, + Other(String), + Querystring, // 5.1 + Regroup, + ResetCycle, + Spaceless, + TemplateTag, + Url, + Verbatim, + WidthRatio, + With, +} + +impl DjangoTagKind { + const AUTOESCAPE: &'static str = "autoescape"; + const BLOCK: &'static str = "block"; + const COMMENT: &'static str = "comment"; + const CSRF_TOKEN: &'static str = "csrf_token"; + const CYCLE: &'static str = "cycle"; + const DEBUG: &'static str = "debug"; + const ELIF: &'static str = "elif"; + const ELSE: &'static str = "else"; + const EMPTY: &'static str = "empty"; + const EXTENDS: &'static str = "extends"; + const FILTER: &'static str = "filter"; + const FIRST_OF: &'static str = "firstof"; + const FOR: &'static str = "for"; + const IF: &'static str = "if"; + const IF_CHANGED: &'static str = "ifchanged"; + const INCLUDE: &'static str = "include"; + const LOAD: &'static str = "load"; + const LOREM: &'static str = "lorem"; + const NOW: &'static str = "now"; + const QUERYSTRING: &'static str = "querystring"; + const REGROUP: &'static str = "regroup"; + const RESET_CYCLE: &'static str = "resetcycle"; + const SPACELESS: &'static str = "spaceless"; + const TEMPLATE_TAG: &'static str = "templatetag"; + const URL: &'static str = "url"; + const VERBATIM: &'static str = "verbatim"; + const WIDTH_RATIO: &'static str = "widthratio"; + const WITH: &'static str = "with"; +} + +impl FromStr for DjangoTagKind { + type Err = AstError; + + fn from_str(s: &str) -> Result { + if s.is_empty() { + return Err(AstError::EmptyTag); + } + + match s { + Self::AUTOESCAPE => Ok(Self::Autoescape), + Self::BLOCK => Ok(Self::Block), + Self::COMMENT => Ok(Self::Comment), + Self::CSRF_TOKEN => Ok(Self::CsrfToken), + Self::CYCLE => Ok(Self::Cycle), + Self::DEBUG => Ok(Self::Debug), + Self::ELIF => Ok(Self::Elif), + Self::ELSE => Ok(Self::Else), + Self::EMPTY => Ok(Self::Empty), + Self::EXTENDS => Ok(Self::Extends), + Self::FILTER => Ok(Self::Filter), + Self::FIRST_OF => Ok(Self::FirstOf), + Self::FOR => Ok(Self::For), + Self::IF => Ok(Self::If), + Self::IF_CHANGED => Ok(Self::IfChanged), + Self::INCLUDE => Ok(Self::Include), + Self::LOAD => Ok(Self::Load), + Self::LOREM => Ok(Self::Lorem), + Self::NOW => Ok(Self::Now), + Self::QUERYSTRING => Ok(Self::Querystring), + Self::REGROUP => Ok(Self::Regroup), + Self::RESET_CYCLE => Ok(Self::ResetCycle), + Self::SPACELESS => Ok(Self::Spaceless), + Self::TEMPLATE_TAG => Ok(Self::TemplateTag), + Self::URL => Ok(Self::Url), + Self::VERBATIM => Ok(Self::Verbatim), + Self::WIDTH_RATIO => Ok(Self::WidthRatio), + Self::WITH => Ok(Self::With), + other => Ok(Self::Other(other.to_string())), + } + } +} + +#[derive(Clone, Debug, Serialize)] +pub struct DjangoFilter { + name: String, + arguments: Vec, +} + +impl DjangoFilter { + pub fn new(name: String, arguments: Vec) -> Self { + Self { name, arguments } + } +} + +#[derive(Clone, Debug, Serialize)] +pub enum HtmlNode { + Comment(String), + Doctype(String), + Element { + tag_name: String, + attributes: Attributes, + children: Vec, + }, + Void { + tag_name: String, + attributes: Attributes, + }, +} + +#[derive(Clone, Debug, Serialize)] +pub enum ScriptNode { + Comment { + content: String, + kind: ScriptCommentKind, + }, + Element { + attributes: Attributes, + children: Vec, + }, +} + +#[derive(Clone, Debug, Serialize)] +pub enum ScriptCommentKind { + SingleLine, // // + MultiLine, // /* */ +} + +#[derive(Clone, Debug, Serialize)] +pub enum StyleNode { + Comment(String), + Element { + attributes: Attributes, + children: Vec, + }, +} + +#[derive(Clone, Debug, Serialize)] +pub enum AttributeValue { + Value(String), + Boolean, +} + +pub type Attributes = BTreeMap; + +#[derive(Error, Debug)] +pub enum AstError { + #[error("error parsing django tag, recieved empty tag name")] + EmptyTag, + #[error("empty ast")] + EmptyAst, +} diff --git a/crates/djls-ast/src/lexer.rs b/crates/djls-ast/src/lexer.rs new file mode 100644 index 0000000..83b55b8 --- /dev/null +++ b/crates/djls-ast/src/lexer.rs @@ -0,0 +1,414 @@ +use crate::tokens::{Token, TokenStream, TokenType}; +use thiserror::Error; + +pub struct Lexer { + source: String, + chars: Vec, + start: usize, + current: usize, + line: usize, +} + +impl Lexer { + pub fn new(source: &str) -> Self { + Lexer { + source: String::from(source), + chars: source.chars().collect(), + start: 0, + current: 0, + line: 1, + } + } + + pub fn tokenize(&mut self) -> Result { + let mut tokens = TokenStream::default(); + while !self.is_at_end() { + let token = self.next_token()?; + tokens.add_token(token); + } + tokens.finalize(self.line); + Ok(tokens) + } + + fn next_token(&mut self) -> Result { + self.start = self.current; + + let token_type = match self.peek()? { + '{' => match self.peek_next()? { + '%' => { + self.consume_n(2)?; // {% + let content = self.consume_until("%}")?; + self.consume_n(2)?; // %} + TokenType::DjangoBlock(content) + } + '{' => { + self.consume_n(2)?; // {{ + let content = self.consume_until("}}")?; + self.consume_n(2)?; // }} + TokenType::DjangoVariable(content) + } + '#' => { + self.consume_n(2)?; // {# + let content = self.consume_until("#}")?; + self.consume_n(2)?; // #} + TokenType::Comment(content, "{#".to_string(), Some("#}".to_string())) + } + _ => { + self.consume()?; // { + TokenType::Text(String::from("{")) + } + }, + + '<' => match self.peek_next()? { + '/' => { + self.consume_n(2)?; // ")?; + self.consume()?; // > + TokenType::HtmlTagClose(tag) + } + '!' if self.matches("")?; + self.consume_n(3)?; // --> + TokenType::Comment(content, "".to_string())) + } + _ => { + self.consume()?; // consume < + let tag = self.consume_until(">")?; + self.consume()?; // consume > + if tag.starts_with("script") { + TokenType::ScriptTagOpen(tag) + } else if tag.starts_with("style") { + TokenType::StyleTagOpen(tag) + } else if tag.ends_with("/") { + TokenType::HtmlTagVoid(tag.trim_end_matches("/").to_string()) + } else { + TokenType::HtmlTagOpen(tag) + } + } + }, + + '/' => match self.peek_next()? { + '/' => { + self.consume_n(2)?; // // + let content = self.consume_until("\n")?; + TokenType::Comment(content, "//".to_string(), None) + } + '*' => { + self.consume_n(2)?; // /* + let content = self.consume_until("*/")?; + self.consume_n(2)?; // */ + TokenType::Comment(content, "/*".to_string(), Some("*/".to_string())) + } + _ => { + self.consume()?; + TokenType::Text("/".to_string()) + } + }, + + c if c.is_whitespace() => { + if c == '\n' || c == '\r' { + self.consume()?; // \r or \n + if c == '\r' && self.peek()? == '\n' { + self.consume()?; // \n of \r\n + } + TokenType::Newline + } else { + self.consume()?; // Consume the first whitespace + while !self.is_at_end() && self.peek()?.is_whitespace() { + if self.peek()? == '\n' || self.peek()? == '\r' { + break; + } + self.consume()?; + } + let whitespace_count = self.current - self.start; + TokenType::Whitespace(whitespace_count) + } + } + + _ => { + let mut text = String::new(); + while !self.is_at_end() { + let c = self.peek()?; + if c == '{' || c == '<' || c == '\n' { + break; + } + text.push(c); + self.consume()?; + } + TokenType::Text(text) + } + }; + + let token = Token::new(token_type, self.line, Some(self.start)); + + match self.peek_previous()? { + '\n' => self.line += 1, + '\r' => { + self.line += 1; + if self.peek()? == '\n' { + self.current += 1; + } + } + _ => {} + } + + Ok(token) + } + + fn peek(&self) -> Result { + self.peek_at(0) + } + + fn peek_next(&self) -> Result { + self.peek_at(1) + } + + fn peek_previous(&self) -> Result { + self.peek_at(-1) + } + + fn peek_until(&self, end: &str) -> Result { + let mut index = self.current; + let end_chars: Vec = end.chars().collect(); + + while index < self.chars.len() { + if self.chars[index..].starts_with(&end_chars) { + return Ok(true); + } + index += 1; + } + Ok(false) + } + + fn peek_at(&self, offset: isize) -> Result { + let index = self.current as isize + offset; + self.item_at(index as usize) + } + + fn item_at(&self, index: usize) -> Result { + if index >= self.source.len() { + // Return a null character when past the end, a bit of a departure from + // idiomatic Rust code, but makes writing the matching above and testing + // much easier + Ok('\0') + } else { + Ok(self.source.chars().nth(index).unwrap()) + } + } + + fn matches(&mut self, pattern: &str) -> Result { + let mut i = self.current; + for c in pattern.chars() { + if i >= self.chars.len() || self.chars[i] != c { + return Ok(false); + } + i += 1; + } + Ok(true) + } + + fn is_at_end(&self) -> bool { + self.current >= self.source.len() + } + + fn consume(&mut self) -> Result { + if self.is_at_end() { + return Err(LexerError::AtEndOfSource); + } + self.current += 1; + self.peek_previous() + } + + fn consume_n(&mut self, count: usize) -> Result { + let start = self.current; + for _ in 0..count { + self.consume()?; + } + Ok(self.source[start..self.current].trim().to_string()) + } + + fn consume_chars(&mut self, s: &str) -> Result { + for c in s.chars() { + if c != self.peek()? { + return Err(LexerError::UnexpectedCharacter(c, self.line)); + } + self.consume()?; + } + self.peek_previous() + } + + fn consume_until(&mut self, s: &str) -> Result { + let start = self.current; + while !self.is_at_end() { + if self.chars[self.current..self.chars.len()] + .starts_with(s.chars().collect::>().as_slice()) + { + return Ok(self.source[start..self.current].trim().to_string()); + } + self.consume()?; + } + Err(LexerError::UnexpectedEndOfInput) + } +} + +#[derive(Error, Debug)] +pub enum LexerError { + #[error("empty token at line {0}")] + EmptyToken(usize), + + #[error("unexpected character '{0}' at line {1}")] + UnexpectedCharacter(char, usize), + + #[error("unexpected end of input")] + UnexpectedEndOfInput, + + #[error("source is empty")] + EmptySource, + + #[error("at beginning of source")] + AtBeginningOfSource, + + #[error("at end of source")] + AtEndOfSource, + + #[error("invalid character access")] + InvalidCharacterAccess, + + #[error("unexpected token type '{0:?}'")] + UnexpectedTokenType(TokenType), +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_tokenize_html() { + let source = r#"
"#; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_django_variable() { + let source = "{{ user.name|default:\"Anonymous\"|title }}"; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_django_block() { + let source = "{% if user.is_staff %}Admin{% else %}User{% endif %}"; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_comments() { + let source = r#" +{# Django comment #} + +"#; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_script() { + let source = r#""#; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_style() { + let source = r#""#; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } + + #[test] + fn test_tokenize_error_cases() { + // Unterminated tokens + assert!(Lexer::new("{{ user.name").tokenize().is_err()); // No closing }} + assert!(Lexer::new("{% if").tokenize().is_err()); // No closing %} + assert!(Lexer::new("{#").tokenize().is_err()); // No closing #} + assert!(Lexer::new(" + + // Invalid characters or syntax within tokens + assert!(Lexer::new("{{}}").tokenize().is_ok()); // Empty but valid + assert!(Lexer::new("{% %}").tokenize().is_ok()); // Empty but valid + assert!(Lexer::new("{##}").tokenize().is_ok()); // Empty but valid + } + + #[test] + fn test_tokenize_nested_delimiters() { + let source = r#"{{ user.name }} +{% if true %} +{# comment #} + +
text
"#; + assert!(Lexer::new(source).tokenize().is_ok()); + } + + #[test] + fn test_tokenize_everything() { + let source = r#" + + + + + + + +
+ {% if user.is_authenticated %} + {# Welcome message #} +

Welcome, {{ user.name|default:"Guest"|title }}!

+ {% if user.is_staff %} + Admin + {% else %} + User + {% endif %} + {% endif %} +
+ +"#; + let mut lexer = Lexer::new(source); + let tokens = lexer.tokenize().unwrap(); + insta::assert_yaml_snapshot!(tokens); + } +} diff --git a/crates/djls-ast/src/lib.rs b/crates/djls-ast/src/lib.rs new file mode 100644 index 0000000..55a85bc --- /dev/null +++ b/crates/djls-ast/src/lib.rs @@ -0,0 +1,4 @@ +mod ast; +mod lexer; +mod parser; +mod tokens; diff --git a/crates/djls-ast/src/parser.rs b/crates/djls-ast/src/parser.rs new file mode 100644 index 0000000..df59ddf --- /dev/null +++ b/crates/djls-ast/src/parser.rs @@ -0,0 +1,638 @@ +use crate::ast::{ + Ast, AstError, AttributeValue, DjangoFilter, DjangoNode, DjangoTagKind, HtmlNode, Node, + ScriptCommentKind, ScriptNode, StyleNode, +}; +use crate::tokens::{Token, TokenStream, TokenType}; +use std::collections::BTreeMap; +use std::str::FromStr; +use thiserror::Error; + +pub struct Parser { + tokens: TokenStream, + current: usize, +} + +impl Parser { + pub fn new(tokens: TokenStream) -> Self { + Parser { tokens, current: 0 } + } + + pub fn parse(&mut self) -> Result { + let mut ast = Ast::default(); + + while !self.is_at_end() { + let node = self.next_node(); + match node { + Ok(node) => { + ast.add_node(node); + } + Err(ParserError::AtEndOfStream) => { + if ast.nodes().is_empty() { + return Err(ParserError::UnexpectedEof); + } + break; + } + Err(_) => { + self.synchronize(&[ + TokenType::DjangoBlock(String::new()), + TokenType::HtmlTagOpen(String::new()), + TokenType::HtmlTagVoid(String::new()), + TokenType::ScriptTagOpen(String::new()), + TokenType::StyleTagOpen(String::new()), + TokenType::Newline, + TokenType::Eof, + ])?; + continue; + } + } + } + + Ok(ast.finalize()?) + } + + fn next_node(&mut self) -> Result { + let token = self.peek()?; + let node = match token.token_type() { + TokenType::Comment(s, start, end) => self.parse_comment(s, start, end.as_deref()), + TokenType::DjangoBlock(s) => self.parse_django_block(s), + TokenType::DjangoVariable(s) => self.parse_django_variable(s), + TokenType::Eof => self.parse_eof(), + TokenType::HtmlTagClose(tag) => Err(ParserError::ClosingTagFound(tag.to_string())), + TokenType::HtmlTagOpen(s) => self.parse_html_tag_open(s), + TokenType::HtmlTagVoid(s) => self.parse_html_tag_void(s), + TokenType::Newline => self.parse_newline(), + TokenType::ScriptTagOpen(s) => self.parse_script_tag_open(s), + TokenType::ScriptTagClose(_) => Err(ParserError::ClosingTagFound("script".to_string())), + TokenType::StyleTagOpen(s) => self.parse_style_tag_open(s), + TokenType::StyleTagClose(_) => Err(ParserError::ClosingTagFound("style".to_string())), + TokenType::Text(s) => self.parse_text(s), + TokenType::Whitespace(_) => self.parse_whitespace(), + }?; + Ok(node) + } + + fn parse_comment( + &mut self, + content: &str, + start: &str, + end: Option<&str>, + ) -> Result { + self.consume()?; + + match start { + "{#" => Ok(Node::Django(DjangoNode::Comment(content.to_string()))), + " +{# Django comment #} + +"#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + #[test] + fn test_parse_django_block() { + let source = r#"{% if user.is_staff %}Admin{% else %}User{% endif %}"#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + #[test] + fn test_parse_django_variable() { + let source = r#"{{ user.name|default:"Anonymous"|title }}"#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + #[test] + fn test_parse_html_tag() { + let source = r#"
"#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + #[test] + fn test_parse_script() { + let source = r#""#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + #[test] + fn test_parse_style() { + let source = r#""#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + fn test_parse_full() { + let source = r#" + + + + + + + +
+ {% if user.is_authenticated %} + {# Welcome message #} +

Welcome, {{ user.name|default:"Guest"|title }}!

+ {% if user.is_staff %} + Admin + {% else %} + User + {% endif %} + {% endif %} +
+ +"#; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse().unwrap(); + insta::assert_yaml_snapshot!(ast); + } + + #[test] + fn test_parse_unexpected_eof() { + let source = "
\n"; + let tokens = Lexer::new(source).tokenize().unwrap(); + let mut parser = Parser::new(tokens); + let ast = parser.parse(); + assert!(matches!(ast, Err(ParserError::UnexpectedEof))); + } +} diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_comments.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_comments.snap new file mode 100644 index 0000000..276552a --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_comments.snap @@ -0,0 +1,94 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + Comment: + - HTML comment + - "" + line: 1 + start: 0 +- token_type: Newline + line: 1 + start: 21 +- token_type: + Comment: + - Django comment + - "{#" + - "#}" + line: 2 + start: 22 +- token_type: Newline + line: 2 + start: 42 +- token_type: + ScriptTagOpen: script + line: 3 + start: 43 +- token_type: Newline + line: 3 + start: 51 +- token_type: + Whitespace: 4 + line: 4 + start: 52 +- token_type: + Comment: + - JS single line comment + - // + - ~ + line: 4 + start: 56 +- token_type: Newline + line: 4 + start: 81 +- token_type: + Whitespace: 4 + line: 5 + start: 82 +- token_type: + Comment: + - "JS multi-line\n comment" + - /* + - "*/" + line: 5 + start: 86 +- token_type: Newline + line: 5 + start: 120 +- token_type: + HtmlTagClose: script + line: 6 + start: 121 +- token_type: Newline + line: 6 + start: 130 +- token_type: + StyleTagOpen: style + line: 7 + start: 131 +- token_type: Newline + line: 7 + start: 138 +- token_type: + Whitespace: 4 + line: 8 + start: 139 +- token_type: + Comment: + - CSS comment + - /* + - "*/" + line: 8 + start: 143 +- token_type: Newline + line: 8 + start: 160 +- token_type: + HtmlTagClose: style + line: 9 + start: 161 +- token_type: Eof + line: 9 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_block.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_block.snap new file mode 100644 index 0000000..8e8a3bf --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_block.snap @@ -0,0 +1,27 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + DjangoBlock: if user.is_staff + line: 1 + start: 0 +- token_type: + Text: Admin + line: 1 + start: 22 +- token_type: + DjangoBlock: else + line: 1 + start: 27 +- token_type: + Text: User + line: 1 + start: 37 +- token_type: + DjangoBlock: endif + line: 1 + start: 41 +- token_type: Eof + line: 1 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_variable.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_variable.snap new file mode 100644 index 0000000..6daee9c --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_django_variable.snap @@ -0,0 +1,11 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + DjangoVariable: "user.name|default:\"Anonymous\"|title" + line: 1 + start: 0 +- token_type: Eof + line: 1 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_everything.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_everything.snap new file mode 100644 index 0000000..ee185d6 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_everything.snap @@ -0,0 +1,369 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + HtmlTagOpen: "!DOCTYPE html" + line: 1 + start: 0 +- token_type: Newline + line: 1 + start: 15 +- token_type: + HtmlTagOpen: html + line: 2 + start: 16 +- token_type: Newline + line: 2 + start: 22 +- token_type: + HtmlTagOpen: head + line: 3 + start: 23 +- token_type: Newline + line: 3 + start: 29 +- token_type: + Whitespace: 4 + line: 4 + start: 30 +- token_type: + StyleTagOpen: "style type=\"text/css\"" + line: 4 + start: 34 +- token_type: Newline + line: 4 + start: 57 +- token_type: + Whitespace: 8 + line: 5 + start: 58 +- token_type: + Comment: + - Style header + - /* + - "*/" + line: 5 + start: 66 +- token_type: Newline + line: 5 + start: 84 +- token_type: + Whitespace: 8 + line: 6 + start: 85 +- token_type: + Text: ".header " + line: 6 + start: 93 +- token_type: + Text: "{" + line: 6 + start: 101 +- token_type: + Whitespace: 1 + line: 6 + start: 102 +- token_type: + Text: "color: blue; }" + line: 6 + start: 103 +- token_type: Newline + line: 6 + start: 117 +- token_type: + Whitespace: 4 + line: 7 + start: 118 +- token_type: + HtmlTagClose: style + line: 7 + start: 122 +- token_type: Newline + line: 7 + start: 130 +- token_type: + Whitespace: 4 + line: 8 + start: 131 +- token_type: + ScriptTagOpen: "script type=\"text/javascript\"" + line: 8 + start: 135 +- token_type: Newline + line: 8 + start: 166 +- token_type: + Whitespace: 8 + line: 9 + start: 167 +- token_type: + Comment: + - Init app + - // + - ~ + line: 9 + start: 175 +- token_type: Newline + line: 9 + start: 186 +- token_type: + Whitespace: 8 + line: 10 + start: 187 +- token_type: + Text: "const app = " + line: 10 + start: 195 +- token_type: + Text: "{" + line: 10 + start: 207 +- token_type: Newline + line: 10 + start: 208 +- token_type: + Whitespace: 12 + line: 11 + start: 209 +- token_type: + Comment: + - Config + - /* + - "*/" + line: 11 + start: 221 +- token_type: Newline + line: 11 + start: 233 +- token_type: + Whitespace: 12 + line: 12 + start: 234 +- token_type: + Text: "debug: true" + line: 12 + start: 246 +- token_type: Newline + line: 12 + start: 257 +- token_type: + Whitespace: 8 + line: 13 + start: 258 +- token_type: + Text: "};" + line: 13 + start: 266 +- token_type: Newline + line: 13 + start: 268 +- token_type: + Whitespace: 4 + line: 14 + start: 269 +- token_type: + HtmlTagClose: script + line: 14 + start: 273 +- token_type: Newline + line: 14 + start: 282 +- token_type: + HtmlTagClose: head + line: 15 + start: 283 +- token_type: Newline + line: 15 + start: 290 +- token_type: + HtmlTagOpen: body + line: 16 + start: 291 +- token_type: Newline + line: 16 + start: 297 +- token_type: + Whitespace: 4 + line: 17 + start: 298 +- token_type: + Comment: + - Header section + - "" + line: 17 + start: 302 +- token_type: Newline + line: 17 + start: 325 +- token_type: + Whitespace: 4 + line: 18 + start: 326 +- token_type: + HtmlTagOpen: "div class=\"header\" id=\"main\" data-value=\"123\" disabled" + line: 18 + start: 330 +- token_type: Newline + line: 18 + start: 386 +- token_type: + Whitespace: 8 + line: 19 + start: 387 +- token_type: + DjangoBlock: if user.is_authenticated + line: 19 + start: 395 +- token_type: Newline + line: 19 + start: 425 +- token_type: + Whitespace: 12 + line: 20 + start: 426 +- token_type: + Comment: + - Welcome message + - "{#" + - "#}" + line: 20 + start: 438 +- token_type: Newline + line: 20 + start: 459 +- token_type: + Whitespace: 12 + line: 21 + start: 460 +- token_type: + HtmlTagOpen: h1 + line: 21 + start: 472 +- token_type: + Text: "Welcome, " + line: 21 + start: 476 +- token_type: + DjangoVariable: "user.name|default:\"Guest\"|title" + line: 21 + start: 485 +- token_type: + Text: "!" + line: 21 + start: 522 +- token_type: + HtmlTagClose: h1 + line: 21 + start: 523 +- token_type: Newline + line: 21 + start: 528 +- token_type: + Whitespace: 12 + line: 22 + start: 529 +- token_type: + DjangoBlock: if user.is_staff + line: 22 + start: 541 +- token_type: Newline + line: 22 + start: 563 +- token_type: + Whitespace: 16 + line: 23 + start: 564 +- token_type: + HtmlTagOpen: span + line: 23 + start: 580 +- token_type: + Text: Admin + line: 23 + start: 586 +- token_type: + HtmlTagClose: span + line: 23 + start: 591 +- token_type: Newline + line: 23 + start: 598 +- token_type: + Whitespace: 12 + line: 24 + start: 599 +- token_type: + DjangoBlock: else + line: 24 + start: 611 +- token_type: Newline + line: 24 + start: 621 +- token_type: + Whitespace: 16 + line: 25 + start: 622 +- token_type: + HtmlTagOpen: span + line: 25 + start: 638 +- token_type: + Text: User + line: 25 + start: 644 +- token_type: + HtmlTagClose: span + line: 25 + start: 648 +- token_type: Newline + line: 25 + start: 655 +- token_type: + Whitespace: 12 + line: 26 + start: 656 +- token_type: + DjangoBlock: endif + line: 26 + start: 668 +- token_type: Newline + line: 26 + start: 679 +- token_type: + Whitespace: 8 + line: 27 + start: 680 +- token_type: + DjangoBlock: endif + line: 27 + start: 688 +- token_type: Newline + line: 27 + start: 699 +- token_type: + Whitespace: 4 + line: 28 + start: 700 +- token_type: + HtmlTagClose: div + line: 28 + start: 704 +- token_type: Newline + line: 28 + start: 710 +- token_type: + HtmlTagClose: body + line: 29 + start: 711 +- token_type: Newline + line: 29 + start: 718 +- token_type: + HtmlTagClose: html + line: 30 + start: 719 +- token_type: Eof + line: 30 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_html.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_html.snap new file mode 100644 index 0000000..e03f6db --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_html.snap @@ -0,0 +1,15 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + HtmlTagOpen: "div class=\"container\" id=\"main\" disabled" + line: 1 + start: 0 +- token_type: + HtmlTagClose: div + line: 1 + start: 42 +- token_type: Eof + line: 1 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_script.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_script.snap new file mode 100644 index 0000000..3aed037 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_script.snap @@ -0,0 +1,68 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + ScriptTagOpen: "script type=\"text/javascript\"" + line: 1 + start: 0 +- token_type: Newline + line: 1 + start: 31 +- token_type: + Whitespace: 4 + line: 2 + start: 32 +- token_type: + Comment: + - Single line comment + - // + - ~ + line: 2 + start: 36 +- token_type: Newline + line: 2 + start: 58 +- token_type: + Whitespace: 4 + line: 3 + start: 59 +- token_type: + Text: const x = 1; + line: 3 + start: 63 +- token_type: Newline + line: 3 + start: 75 +- token_type: + Whitespace: 4 + line: 4 + start: 76 +- token_type: + Comment: + - "Multi-line\n comment" + - /* + - "*/" + line: 4 + start: 80 +- token_type: Newline + line: 4 + start: 111 +- token_type: + Whitespace: 4 + line: 5 + start: 112 +- token_type: + Text: console.log(x); + line: 5 + start: 116 +- token_type: Newline + line: 5 + start: 131 +- token_type: + HtmlTagClose: script + line: 6 + start: 132 +- token_type: Eof + line: 6 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_style.snap b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_style.snap new file mode 100644 index 0000000..dfab492 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__lexer__tests__tokenize_style.snap @@ -0,0 +1,69 @@ +--- +source: crates/djls-ast/src/lexer.rs +expression: tokens +--- +- token_type: + StyleTagOpen: "style type=\"text/css\"" + line: 1 + start: 0 +- token_type: Newline + line: 1 + start: 23 +- token_type: + Whitespace: 4 + line: 2 + start: 24 +- token_type: + Comment: + - Header styles + - /* + - "*/" + line: 2 + start: 28 +- token_type: Newline + line: 2 + start: 47 +- token_type: + Whitespace: 4 + line: 3 + start: 48 +- token_type: + Text: ".header " + line: 3 + start: 52 +- token_type: + Text: "{" + line: 3 + start: 60 +- token_type: Newline + line: 3 + start: 61 +- token_type: + Whitespace: 8 + line: 4 + start: 62 +- token_type: + Text: "color: blue;" + line: 4 + start: 70 +- token_type: Newline + line: 4 + start: 82 +- token_type: + Whitespace: 4 + line: 5 + start: 83 +- token_type: + Text: "}" + line: 5 + start: 87 +- token_type: Newline + line: 5 + start: 88 +- token_type: + HtmlTagClose: style + line: 6 + start: 89 +- token_type: Eof + line: 6 + start: ~ diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_comments.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_comments.snap new file mode 100644 index 0000000..1249821 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_comments.snap @@ -0,0 +1,28 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Html: + Comment: HTML comment + - Django: + Comment: Django comment + - Script: + Element: + attributes: + script: Boolean + children: + - Script: + Comment: + content: JS single line + kind: SingleLine + - Script: + Comment: + content: "JS multi\n line" + kind: MultiLine + - Style: + Element: + attributes: {} + children: + - Style: + Comment: CSS comment diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_block.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_block.snap new file mode 100644 index 0000000..237067e --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_block.snap @@ -0,0 +1,20 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Django: + Tag: + kind: If + bits: + - if + - user.is_staff + children: + - Text: Admin + - Django: + Tag: + kind: Else + bits: + - else + children: + - Text: User diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_variable.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_variable.snap new file mode 100644 index 0000000..fe16717 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_django_variable.snap @@ -0,0 +1,16 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Django: + Variable: + bits: + - user + - name + filters: + - name: default + arguments: + - Anonymous + - name: title + arguments: [] diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_html_tag.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_html_tag.snap new file mode 100644 index 0000000..b005093 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_html_tag.snap @@ -0,0 +1,15 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Html: + Element: + tag_name: div + attributes: + class: + Value: container + disabled: Boolean + id: + Value: main + children: [] diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_script.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_script.snap new file mode 100644 index 0000000..a2605cf --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_script.snap @@ -0,0 +1,22 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Script: + Element: + attributes: + script: Boolean + type: + Value: text/javascript + children: + - Script: + Comment: + content: Single line comment + kind: SingleLine + - Text: const x = 1; + - Script: + Comment: + content: "Multi-line\n comment" + kind: MultiLine + - Text: console.log(x); diff --git a/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_style.snap b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_style.snap new file mode 100644 index 0000000..2c0a587 --- /dev/null +++ b/crates/djls-ast/src/snapshots/djls_ast__parser__tests__parse_style.snap @@ -0,0 +1,17 @@ +--- +source: crates/djls-ast/src/parser.rs +expression: ast +--- +nodes: + - Style: + Element: + attributes: + type: + Value: text/css + children: + - Style: + Comment: Header styles + - Text: ".header " + - Text: "{" + - Text: "color: blue;" + - Text: "}" diff --git a/crates/djls-ast/src/tokens.rs b/crates/djls-ast/src/tokens.rs new file mode 100644 index 0000000..095907e --- /dev/null +++ b/crates/djls-ast/src/tokens.rs @@ -0,0 +1,186 @@ +use serde::Serialize; +use std::fmt; +use std::ops::{Deref, DerefMut}; + +#[derive(Clone, Debug, Serialize, PartialEq)] +pub enum TokenType { + Comment(String, String, Option), + DjangoBlock(String), + DjangoVariable(String), + Eof, + HtmlTagOpen(String), + HtmlTagClose(String), + HtmlTagVoid(String), + Newline, + ScriptTagOpen(String), + ScriptTagClose(String), + StyleTagOpen(String), + StyleTagClose(String), + Text(String), + Whitespace(usize), +} + +impl TokenType { + pub fn len(&self) -> Option { + match self { + TokenType::DjangoBlock(s) + | TokenType::DjangoVariable(s) + | TokenType::HtmlTagOpen(s) + | TokenType::HtmlTagClose(s) + | TokenType::HtmlTagVoid(s) + | TokenType::ScriptTagOpen(s) + | TokenType::ScriptTagClose(s) + | TokenType::StyleTagOpen(s) + | TokenType::StyleTagClose(s) + | TokenType::Text(s) => Some(s.len()), + TokenType::Comment(content, start, end) => { + Some(content.len() + start.len() + end.as_ref().map_or(0, |e| e.len())) + } + TokenType::Whitespace(len) => Some(len.clone()), + TokenType::Newline => Some(1), + TokenType::Eof => None, + } + } + + pub fn lexeme(&self) -> &str { + match self { + TokenType::DjangoBlock(s) + | TokenType::DjangoVariable(s) + | TokenType::HtmlTagOpen(s) + | TokenType::HtmlTagClose(s) + | TokenType::HtmlTagVoid(s) + | TokenType::ScriptTagOpen(s) + | TokenType::ScriptTagClose(s) + | TokenType::StyleTagOpen(s) + | TokenType::StyleTagClose(s) + | TokenType::Text(s) => s, + TokenType::Comment(content, _, _) => content, // Just return the content + TokenType::Whitespace(_) => " ", + TokenType::Newline => "\n", + TokenType::Eof => "", + } + } +} + +impl fmt::Display for TokenType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use TokenType::*; + + match self { + Comment(content, start, end) => match end { + Some(end) => write!(f, "{}{}{}", start, content, end), + None => write!(f, "{}{}", start, content), + }, + DjangoBlock(s) => write!(f, "{{% {} %}}", s), + DjangoVariable(s) => write!(f, "{{{{ {} }}}}", s), + Eof => Ok(()), + HtmlTagOpen(s) => write!(f, "<{}>", s), + HtmlTagClose(s) => write!(f, "", s), + HtmlTagVoid(s) => write!(f, "<{}/>", s), + Newline => f.write_str("\n"), + ScriptTagOpen(s) => write!(f, "", s), + ScriptTagClose(_) => f.write_str(""), + StyleTagOpen(s) => write!(f, "", s), + StyleTagClose(_) => f.write_str(""), + Text(s) => f.write_str(s), + Whitespace(len) => f.write_str(&" ".repeat(*len)), + } + } +} + +#[derive(Clone, Debug, Serialize, PartialEq)] +pub struct Token { + token_type: TokenType, + line: usize, + start: Option, +} + +impl Token { + pub fn new(token_type: TokenType, line: usize, start: Option) -> Self { + Self { + token_type, + line, + start, + } + } + + pub fn lexeme_from_source<'a>(&self, source: &'a str) -> Option<&'a str> { + match (self.start, self.token_type.len()) { + (Some(start), Some(len)) => Some(&source[start..start + len]), + _ => None, + } + } + + pub fn lexeme(&self) -> &str { + self.token_type.lexeme() + } + + pub fn token_type(&self) -> &TokenType { + &self.token_type + } + + pub fn is_token_type(&self, token_type: &TokenType) -> bool { + &self.token_type == token_type + } +} + +#[derive(Clone, Debug, Default, Serialize)] +pub struct TokenStream(Vec); + +impl TokenStream { + pub fn tokens(&self) -> &Vec { + &self.0 + } + + pub fn add_token(&mut self, token: Token) { + self.0.push(token); + } + + pub fn finalize(&mut self, line: usize) -> TokenStream { + let eof_token = Token { + token_type: TokenType::Eof, + line, + start: None, + }; + self.add_token(eof_token); + self.clone() + } +} + +impl AsRef<[Token]> for TokenStream { + fn as_ref(&self) -> &[Token] { + &self.0 + } +} + +impl Deref for TokenStream { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for TokenStream { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl IntoIterator for TokenStream { + type Item = Token; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} + +impl<'a> IntoIterator for &'a TokenStream { + type Item = &'a Token; + type IntoIter = std::slice::Iter<'a, Token>; + + fn into_iter(self) -> Self::IntoIter { + self.0.iter() + } +}