refactor: Patch Template.compile_nodelist with custom template parser (#908)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Juro Oravec 2025-01-15 22:34:32 +01:00 committed by GitHub
parent 8cd4b03286
commit 7ed4fd88f9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 810 additions and 204 deletions

View file

@ -15,14 +15,14 @@ class ComponentsConfig(AppConfig):
def ready(self) -> None:
from django_components.app_settings import app_settings
from django_components.autodiscovery import autodiscover, import_libraries
from django_components.component import monkeypatch_template
from django_components.component_registry import registry
from django_components.components.dynamic import DynamicComponent
from django_components.util.django_monkeypatch import monkeypatch_template_cls
# NOTE: This monkeypatch is applied here, before Django processes any requests.
# To make django-components work with django-debug-toolbar-template-profiler
# See https://github.com/EmilStenstrom/django-components/discussions/819
monkeypatch_template(Template)
monkeypatch_template_cls(Template)
# Import modules set in `COMPONENTS.libraries` setting
import_libraries()

View file

@ -68,6 +68,7 @@ from django_components.slots import (
resolve_fills,
)
from django_components.template import cached_template
from django_components.util.django_monkeypatch import is_template_cls_patched
from django_components.util.logger import trace_msg
from django_components.util.misc import gen_id
from django_components.util.template_tag import TagParams
@ -1272,54 +1273,6 @@ class ComponentNode(BaseNode):
return output
def monkeypatch_template(template_cls: Type[Template]) -> None:
    """
    Replace `template_cls.render` with a variant whose `isolated_context` is configurable.

    The replacement passes `isolated_context` to `render_context.push_state()` based on
    the per-instance attribute `Template._dc_is_component_nested`, instead of always
    using `True`.

    Part of the fix for https://github.com/EmilStenstrom/django-components/issues/508

    Why patch instead of subclassing `Template`:
      Subclassing would force users either to switch the template backend so that all
      templates are instances of our subclass, or to copy data from their own Template
      instance into ours. The latter could parse the source twice and could misbehave
      with more exotic Template subclasses. Replacing only `render` works with any
      user-provided subclass, needs no second parse, still forwards extra args/kwargs,
      and still lets users customise rendering by overriding `_render`.

    Why the flag lives on the instance:
      The value could have been passed to this function directly. It is deliberately
      read from `Template._dc_is_component_nested` at render time instead, so that
      the patched method is self-contained and can be copied elsewhere.

    Args:
        template_cls: The class whose `render` method gets replaced in place.
    """
    # Patching is a one-off operation. A class that already carries the marker
    # is left untouched (the original author notes this avoids a RecursionError).
    if hasattr(template_cls, "_dc_patched"):
        return

    def _template_render(self: Template, context: Context, *args: Any, **kwargs: Any) -> str:
        # ---------------- OUR CHANGES START ----------------
        # The original method always isolates the context. Here the context is isolated
        # unless the template was flagged as nested. The flag MUST be falsy (or absent)
        # for templates that are NOT imported with the `{% extends %}` tag, and truthy
        # otherwise.
        isolated_context = not getattr(self, "_dc_is_component_nested", False)
        # ---------------- OUR CHANGES END ----------------

        with context.render_context.push_state(self, isolated_context=isolated_context):
            if context.template is not None:
                return self._render(context, *args, **kwargs)

            # No template is bound to the context yet, so bind this one for the render.
            with context.bind_template(self):
                context.template_name = self.name
                return self._render(context, *args, **kwargs)

    template_cls.render = _template_render
    template_cls._dc_patched = True
@contextmanager
def _maybe_bind_template(context: Context, template: Template) -> Generator[None, Any, None]:
if context.template is None:
@ -1342,7 +1295,7 @@ def _prepare_template(
# And https://github.com/EmilStenstrom/django-components/issues/634
template = component._get_template(context)
if not getattr(template, "_dc_patched"):
if not is_template_cls_patched(template):
raise RuntimeError(
"Django-components received a Template instance which was not patched."
"If you are using Django's Template class, check if you added django-components"
@ -1350,10 +1303,11 @@ def _prepare_template(
"manually patch the class."
)
# Set `Template._dc_is_component_nested` based on whether we're currently INSIDE
# Set `Template._djc_is_component_nested` based on whether we're currently INSIDE
# the `{% extends %}` tag.
# Part of fix for https://github.com/EmilStenstrom/django-components/issues/508
template._dc_is_component_nested = bool(context.render_context.get(BLOCK_CONTEXT_KEY))
# See django_monkeypatch.py
template._djc_is_component_nested = bool(context.render_context.get(BLOCK_CONTEXT_KEY))
with _maybe_bind_template(context, template):
yield template

View file

@ -2,7 +2,9 @@ import re
from typing import TYPE_CHECKING, Any, Dict, List
from django.template import Context, Node, NodeList, TemplateSyntaxError
from django.template.base import Lexer, Parser, VariableNode
from django.template.base import Parser, VariableNode
from django_components.util.template_parser import parse_template
if TYPE_CHECKING:
from django_components.util.template_tag import TagParam
@ -48,8 +50,7 @@ class DynamicFilterExpression:
# Copy the Parser, and pass through the tags and filters available
# in the current context. Thus, if user calls `{% load %}` inside
# the expression, it won't spill outside.
lexer = Lexer(self.expr)
tokens = lexer.tokenize()
tokens = parse_template(self.expr)
expr_parser = Parser(tokens=tokens)
expr_parser.tags = {**parser.tags}
expr_parser.filters = {**parser.filters}

View file

@ -28,7 +28,7 @@ from django_components.slots import SLOT_DEFAULT_KEYWORD, SLOT_REQUIRED_KEYWORD,
from django_components.tag_formatter import get_tag_formatter
from django_components.util.logger import trace_msg
from django_components.util.misc import gen_id
from django_components.util.template_tag import TagSpec, fix_nested_tags, parse_template_tag, with_tag_spec
from django_components.util.template_tag import TagSpec, parse_template_tag, with_tag_spec
# NOTE: Variable name `register` is required by Django to recognize this as a template tag library
# See https://docs.djangoproject.com/en/dev/howto/custom-template-tags
@ -492,7 +492,6 @@ def component(
"""
tag_id = gen_id()
fix_nested_tags(parser, token)
bits = token.split_contents()
# Let the TagFormatter pre-process the tokens

View file

@ -0,0 +1,110 @@
from typing import Any, Type
from django.template import Context, NodeList, Template
from django.template.base import Parser
from django_components.util.template_parser import parse_template
# Some of Django's behavior cannot be adjusted from the outside,
# so in those cases the template class itself is patched in place.
def monkeypatch_template_cls(template_cls: Type[Template]) -> None:
    """
    Apply all django-components patches to `template_cls` and mark it as patched.

    Args:
        template_cls: The Template class to modify in place.
    """
    # The marker is written last: `monkeypatch_template_render` consults it
    # (via `is_template_cls_patched`) to decide whether it still has work to do.
    for apply_patch in (monkeypatch_template_compile_nodelist, monkeypatch_template_render):
        apply_patch(template_cls)
    setattr(template_cls, "_djc_patched", True)
# Swap `Template.compile_nodelist` for a version that tokenizes the source with our
# custom parser. That parser is what allows template tags to be used as inputs to
# the component tag:
#
# {% component "my-component" description="{% lorem 3 w %}" / %}
def monkeypatch_template_compile_nodelist(template_cls: Type[Template]) -> None:
    """
    Replace `template_cls.compile_nodelist` with a copy that calls `parse_template()`.

    Args:
        template_cls: The Template class to modify in place.
    """

    def _compile_nodelist(self: Template) -> NodeList:
        """
        Tokenize `self.source` with `parse_template()` and compile the tokens
        into a nodelist.

        When the engine runs in debug mode and parsing fails, the exception gets
        a `template_debug` attribute describing where in the source it happened.
        """
        # ---------------- ORIGINAL (Django v5.1.3) ----------------
        # if self.engine.debug:
        #     lexer = DebugLexer(self.source)
        # else:
        #     lexer = Lexer(self.source)
        # tokens = lexer.tokenize()
        # ---------------- OUR CHANGES START ----------------
        token_list = parse_template(self.source)
        # ---------------- OUR CHANGES END ----------------

        template_parser = Parser(
            token_list,
            libraries=self.engine.template_libraries,
            builtins=self.engine.template_builtins,
            origin=self.origin,
        )

        try:
            # ---------------- ADDED IN Django v5.1 - See https://github.com/django/django/commit/35bbb2c9c01882b1d77b0b8c737ac646144833d4 # noqa: E501
            compiled = template_parser.parse()
            # Falls back to an empty dict when the parser has no `extra_data` attribute.
            self.extra_data = getattr(template_parser, "extra_data", {})
            # ---------------- END OF ADDED IN Django v5.1 ----------------
            return compiled
        except Exception as exc:
            if self.engine.debug:
                exc.template_debug = self.get_exception_info(exc, exc.token)  # type: ignore
            raise

    template_cls.compile_nodelist = _compile_nodelist
def monkeypatch_template_render(template_cls: Type[Template]) -> None:
    """
    Replace `template_cls.render` so that the `isolated_context` kwarg of `push_state`
    follows our custom `Template._djc_is_component_nested` attribute.

    Part of the fix for https://github.com/EmilStenstrom/django-components/issues/508

    Why patch instead of subclassing `Template`:
      Subclassing would force users either to switch the template backend so that all
      templates are instances of our subclass, or to copy data from their own Template
      instance into ours. The latter could parse the source twice and could misbehave
      with more exotic Template subclasses. Replacing only `render` works with any
      user-provided subclass, needs no second parse, still forwards extra args/kwargs,
      and still lets users customise rendering by overriding `_render`.

    Why the flag lives on the instance:
      The value could have been passed to `monkeypatch_template_render` directly.
      It is deliberately read from `Template._djc_is_component_nested` at render time
      instead, so that the patched method is self-contained and can be copied elsewhere.

    Args:
        template_cls: The Template class to modify in place.
    """
    # A class already marked as patched is left alone
    # (the original author notes this avoids a RecursionError).
    if is_template_cls_patched(template_cls):
        return

    def _template_render(self: Template, context: Context, *args: Any, **kwargs: Any) -> str:
        "Display stage -- can be called many times"
        # ---------------- ORIGINAL (Django v5.1.3) ----------------
        # with context.render_context.push_state(self):
        # ---------------- OUR CHANGES START ----------------
        # `isolated_context` was always `True` in the original method. Here it is `True`
        # unless the template was flagged as nested. The flag MUST be falsy (or absent)
        # for templates that are NOT imported with the `{% extends %}` tag, and truthy
        # otherwise.
        isolated_context = not getattr(self, "_djc_is_component_nested", False)
        with context.render_context.push_state(self, isolated_context=isolated_context):
            # ---------------- OUR CHANGES END ----------------
            if context.template is not None:
                return self._render(context, *args, **kwargs)

            # No template is bound to the context yet, so bind this one for the render.
            with context.bind_template(self):
                context.template_name = self.name
                return self._render(context, *args, **kwargs)

    template_cls.render = _template_render
def is_template_cls_patched(template_cls: Type[Template]) -> bool:
    """
    Report whether `template_cls` carries the `_djc_patched` marker.

    The marker is read with a `False` fallback, so an object that was never
    patched yields `False` instead of raising `AttributeError`. Because plain
    attribute lookup is used, an instance of a patched class reports the same
    value as the class itself.
    """
    patched = getattr(template_cls, "_djc_patched", False)
    return patched

View file

@ -0,0 +1,227 @@
"""
Parser for Django template.
The parser reads a template file (usually HTML, but not necessarily), which may contain
"template tags" like this:
```django
{% component 'my_comp' key=val key2='val2 two' %}
{% endcomponent %}
{{ my_var }}
{# I am comment #}
```
and returns a list of Tokens:
```py
[
(TokenType.TEXT, '\n', (0, 1), 1),
(TokenType.BLOCK, "component 'my_comp' key=val key2='val2 two'", (1, 50), 2),
(TokenType.TEXT, '\n', (50, 51), 2),
(TokenType.BLOCK, 'endcomponent', (51, 69), 3),
(TokenType.TEXT, '\n\n', (69, 71), 3),
(TokenType.VAR, 'my_var', (71, 83), 5),
(TokenType.TEXT, '\n\n', (83, 85), 5),
(TokenType.COMMENT, 'I am comment', (85, 103), 7),
(TokenType.TEXT, '\n', (103, 104), 7),
]
```
See `parse_template()` for details.
"""
import re
from functools import lru_cache
from typing import List, Optional, Tuple
from django.template.base import DebugLexer, Token, TokenType
from django.template.exceptions import TemplateSyntaxError
# NOTE: As of 0.125, the strategy is to use Django's lexer, and use our own parser
# only when necessary, for the shortest time possible.
#
# Before I switched to this strategy, my initial parser was about 50x slower than Django's lexer.
# I (Juro) assume it was because I was walking character by character, instead of using a regex.
#
# The overall speed should then depend on the number of broken tokens in the template.
#
# Performance of the new strategy on a real-world example:
# - A template with about 110 lines and 6 components
# - Components spanning ~35 lines in total, so roughly 1/3 of the template
# - The custom parser is about 8x slower than Django's Debug lexer.
# - For a mid-sized project of 200 templates, it would take 7-8 seconds to load all the templates
# (from 1 second with Django's lexer).
# - However, thanks to django-component's lazy-loading, this should not be a problem.
#
# How it works is that:
# 1. We use Django's lexer to get the tokens.
# 2. We check them one-by-one, and if we find a broken token, we switch to our parser to fix it.
# 3. Once the broken token is fixed, we find its end position, and switch back to the Django lexer
# for the remaining text (step 1).
def parse_template(text: str) -> List[Token]:
    """
    Tokenize a Django template, allowing `%}` (and thus nested tags) inside quoted strings.

    Django's `DebugLexer` does the bulk of the work. Any BLOCK token whose contents
    include a quote character is treated as potentially "broken" (cut off at a `%}`
    that sits inside a string) and is re-parsed by `_detailed_tag_parser()`. Lexing
    then resumes with `DebugLexer` right after the end of the re-parsed tag.

    Args:
        text: The full template source.

    Returns:
        Tokens in source order. `Token.position` and `Token.lineno` are expressed
        relative to the whole `text`, not to the slice that was lexed.

    Raises:
        TemplateSyntaxError: Propagated from `_detailed_tag_parser()` when a tag
            or a string inside a tag is not terminated.
    """
    resolved_tokens: List[Token] = []
    index_start = 0
    index_end = len(text)
    # Number of newlines in `text[:index_start]`. Each lexer run numbers its lines
    # from 1, so this is what must be added to get absolute line numbers.
    lineno_offset = 0

    while index_start < index_end:
        broken_token: Optional[Token] = None

        # Do fast tokenization with regex - This is about 50x faster than our custom tokenizer.
        # We use DebugLexer because we need to get the position of the tokens.
        # DebugLexer and Lexer have very similar speeds, Debug is about 33% slower.
        lexer = DebugLexer(text[index_start:index_end])
        for token in lexer.tokenize():
            # Translate slice-relative coordinates into whole-text coordinates
            token.lineno += lineno_offset
            token.position = (token.position[0] + index_start, token.position[1] + index_start)

            if token.token_type == TokenType.BLOCK and ("'" in token.contents or '"' in token.contents):
                broken_token = token
                break
            resolved_tokens.append(token)

        # Everything up to the end of the text was lexed without any suspicious token
        if broken_token is None:
            break

        # We found a broken token, so we switch to our slow parser for this one tag
        broken_token_start = broken_token.position[0]
        fixed_token = _detailed_tag_parser(text[broken_token_start:], broken_token.lineno, broken_token_start)
        resolved_tokens.append(fixed_token)
        index_start = fixed_token.position[1]

        # `fixed_token.lineno` is already absolute, so the offset is ASSIGNED, not
        # accumulated. Accumulating would count the lines before the previous restart
        # point twice once a second broken token is met.
        #
        # Newlines are counted on the raw tag text rather than on `fixed_token.contents`,
        # because the contents are stripped and would miss newlines adjacent to `{%` / `%}`.
        lineno_offset = (
            fixed_token.lineno - 1  # -1 because lines are 1-indexed
            + text[broken_token_start:index_start].count("\n")
        )  # fmt: skip

    return resolved_tokens
# Handle parsing of `{% %}` tags, while allowing `%}` inside of strings
def _detailed_tag_parser(text: str, lineno: int, start_index: int) -> Token:
    """
    Parse a single `{% ... %}` tag found at the very start of `text`.

    Unlike Django's lexer, a `%}` found inside a single- or double-quoted string
    does not terminate the tag. Inside strings, a backslash escapes the next character.

    Args:
        text: Template source beginning with the tag's opening `{%`.
        lineno: Line number to store on the returned token.
        start_index: Position of `text[0]` within the full template. It is added to
            the local indices to build the token's `position`.

    Returns:
        A BLOCK token whose contents are the stripped text between `{%` and the
        matching `%}`, and whose position is `(start_index, start_index + <tag length>)`.

    Raises:
        TemplateSyntaxError: If a quoted string or the tag itself is not terminated
            before the end of `text`.
    """
    index = 0
    length = len(text)
    result_content: List[str] = []

    # Pre-compute common substrings
    QUOTE_CHARS = ("'", '"')
    QUOTE_OR_PERCENT = (*QUOTE_CHARS, "%")

    def take_char() -> str:
        # Consume and return the current character, or "" at the end of the text
        nonlocal index
        if index >= length:
            return ""
        char = text[index]
        index += 1
        return char

    def peek_char(offset: int = 0) -> str:
        # Return the character `offset` places ahead without consuming it, or "" past the end
        peek_index = index + offset
        if peek_index >= length:
            return ""
        return text[peek_index]

    def take_until_any(stop_chars: Tuple[str, ...], allow_escapes: bool = False) -> str:
        # Consume and return everything up to (not including) the first stop character.
        # With `allow_escapes=True`, a stop character preceded by a backslash is consumed
        # as ordinary content.
        #
        # This is the regex-based equivalent of walking the text one character at a time;
        # the pattern itself is built and cached by `_compile_take_until_pattern()`.
        nonlocal index
        pattern = _compile_take_until_pattern("".join(stop_chars), allow_escapes)
        match = pattern.match(text, index)
        if not match:
            return ""
        index = match.end()
        return match.group(0)

    # Given that this function is called only when there's a broken token,
    # we know that the first two characters are always "{%"
    take_char()  # {
    take_char()  # %

    # Main parsing loop
    while index < length:
        char = peek_char()

        # Handle strings within `{% %}`
        if char in QUOTE_CHARS:
            quote_char = take_char()
            result_content.append(quote_char)

            # Take content until matching quote, allowing escaped quotes
            result_content.append(take_until_any((quote_char,), allow_escapes=True))

            # Handle the closing quote
            if peek_char() != quote_char:
                raise TemplateSyntaxError(f"Unexpected end of text - unterminated {quote_char} string")
            result_content.append(take_char())
            continue

        # Check for closing tag
        if char == "%":
            if peek_char(1) == "}":
                take_char()  # %
                take_char()  # }
                break

            # False alarm, a lone `%` that does not close the tag. Consume ONLY this one
            # character. Taking everything up to the next quote here could run straight
            # past the real closing `%}` and swallow the template text that follows it.
            result_content.append(take_char())
            continue

        # Take regular content until we hit a quote or potential closing tag
        result_content.append(take_until_any(QUOTE_OR_PERCENT))
    else:
        # The loop ran out of text without ever reaching the closing `%}`
        raise TemplateSyntaxError("Unexpected end of text - unterminated {% tag")

    result_str = "".join(result_content).strip()  # Django's Lexer.tokenize() strips the whitespace
    return Token(TokenType.BLOCK, result_str, (start_index, index + start_index), lineno)
# Create a regex pattern that takes anything until any of the stop characters are found.
#
# If allow_escapes is True, also the stop characters are allowed, given that they are
# prefixed by a backslash.
@lru_cache(maxsize=128)
def _compile_take_until_pattern(stop_chars: str, allow_escapes: bool) -> re.Pattern:
escaped_stops = "".join(re.escape(c) for c in stop_chars)
if allow_escapes:
# Match either escaped characters or anything until stop chars
pattern = f"(?:\\\\.|[^{escaped_stops}])*"
else:
# Match anything until stop chars
pattern = f"[^{escaped_stops}]*"
return re.compile(pattern)

View file

@ -1,14 +1,14 @@
import functools
import inspect
from dataclasses import dataclass
from typing import Any, Callable, Dict, Iterable, List, Mapping, NamedTuple, Optional, Set, Tuple, cast
from typing import Any, Callable, Dict, Iterable, List, Mapping, NamedTuple, Optional, Set, Tuple
from django.template import Context, NodeList
from django.template.base import Parser, Token, TokenType
from django.template.base import Parser, Token
from django.template.exceptions import TemplateSyntaxError
from django_components.expression import process_aggregate_kwargs
from django_components.util.tag_parser import TagAttr, TagValue, parse_tag
from django_components.util.tag_parser import TagAttr, parse_tag
@dataclass
@ -97,6 +97,17 @@ class TagSpec:
# Set the signature on the function
validator.__signature__ = self.signature # type: ignore[attr-defined]
# Call the validator with our args and kwargs, in such a way to
# let the Python interpreter validate on repeated kwargs. E.g.
#
# ```
# args, kwargs = validator(
# *call_args,
# **call_kwargs[0],
# **call_kwargs[1],
# ...
# )
# ```
call_args = []
call_kwargs = []
for param in params:
@ -105,13 +116,12 @@ class TagSpec:
else:
call_kwargs.append({param.key: param.value})
# Call the validator with our args and kwargs, in such a way to
# let the Python interpreter validate on repeated kwargs.
#
# E.g. `args, kwargs = validator(*call_args, **call_kwargs[0], **call_kwargs[1])`
#
# NOTE: Although we use `exec()` here, it's safe, because we control the input -
# we pass in only the list index.
# we make dynamic only the list index.
#
# We MUST use the indices, because we can trust neither the param keys nor values,
# so we MUST NOT reference them directly in the exec script, otherwise we'd be at risk
# of injection attack.
validator_call_script = "args, kwargs = validator(*call_args, "
for kw_index, _ in enumerate(call_kwargs):
validator_call_script += f"**call_kwargs[{kw_index}], "
@ -229,8 +239,6 @@ def parse_template_tag(
token: Token,
tag_spec: TagSpec,
) -> ParsedTag:
fix_nested_tags(parser, token)
_, attrs = parse_tag(token.contents, parser)
# First token is tag name, e.g. `slot` in `{% slot <name> ... %}`
@ -340,138 +348,3 @@ def merge_repeated_kwargs(params: List[TagParam]) -> List[TagParam]:
params_by_key[param.key].value += " " + str(param.value)
return resolved_params
def fix_nested_tags(parser: Parser, block_token: Token) -> None:
    """
    Repair a BLOCK token that Django cut short at the `%}` of a nested tag.

    If the last attribute of the tag looks like a quoted string that was cut off
    in the middle of a nested tag, further tokens are pulled from `parser` and
    appended to `block_token.contents` until the real closing `%}` is found.
    Any text after that `%}` is pushed back to the parser as a TEXT token.

    Args:
        parser: Parser to take the follow-up tokens from. It is also passed to `parse_tag()`.
        block_token: The possibly truncated token. It is modified in place.

    Raises:
        TemplateSyntaxError: If the follow-up tokens do not alternate between
            TEXT and non-TEXT as expected, or a token has an unknown type.
    """
    # Since the nested tags MUST be wrapped in quotes, e.g.
    #    `{% component 'test' "{% lorem var_a w %}" %}`
    #    `{% component 'test' key="{% lorem var_a w %}" %}`
    #
    # We can parse the tag's tokens so we can find the last one, and so we consider
    # the unclosed `{%` only for the last bit.
    _, attrs = parse_tag(block_token.contents, parser)

    # If there are no attributes, then there are no nested tags
    if not attrs:
        return

    last_attr = attrs[-1]

    # TODO: Currently, using a nested template inside a list or dict
    #       e.g. `{% component ... key=["{% nested %}"] %}` is NOT supported.
    #       Hence why we leave if value is not "simple" (which means the value is list or dict).
    if last_attr.value.type != "simple":
        return

    last_attr_value = cast(TagValue, last_attr.value.entries[0])
    last_token = last_attr_value.parts[-1]

    # User probably forgot to wrap the nested tag in quotes, or this is the end of the input.
    # `{% component ... key={% nested %} %}`
    # `{% component ... key= %}`
    if not last_token.value:
        return

    # When our template tag contains a nested tag, e.g.:
    # `{% component 'test' "{% lorem var_a w %}" %}`
    #
    # Django parses this into:
    # `TokenType.BLOCK: 'component 'test' "{% lorem var_a w'`
    #
    # Above you can see that the token ends at the end of the NESTED tag,
    # and includes `{%`. So that's what we use to identify if we need to fix
    # nested tags or not.
    has_unclosed_tag = (
        (last_token.value.count("{%") > last_token.value.count("%}"))
        # Moreover we need to also check for unclosed quotes for this edge case:
        # `{% component 'test' "{%}" %}`
        #
        # Which Django parses into:
        # `TokenType.BLOCK: 'component 'test' "{'`
        #
        # Here we cannot see any unclosed tags, but there is an unclosed double quote at the end.
        or (last_token.value in ("'{", '"{'))
    )

    # We cannot naively search the full contents for unclosed quotes, but only
    # within the last 'bit'. Consider this:
    # `{% component 'test' '"' "{%}" %}`
    #
    # There are 3 double quotes, but if the contents get split at the first `%}`
    # then there will be a single unclosed double quote in the last bit.
    first_char_index = len(last_token.spread or "")
    has_unclosed_quote = (
        not last_token.quoted
        and last_token.value
        and last_token.value[first_char_index] in ('"', "'")
    )  # fmt: skip

    needs_fixing = has_unclosed_tag and has_unclosed_quote
    if not needs_fixing:
        return

    # Put back the `%}` that Django consumed as the end of the token. We get here
    # only when there is an unclosed quote, so the `%}` always belongs inside the
    # string and is appended without a separating space.
    block_token.contents += "%}"
    expects_text = True
    while True:
        # This is where we need to take parsing in our own hands, because Django parser parsed
        # only up to the first closing tag `%}`, but that closing tag corresponds to a nested tag,
        # and not to the end of the outer template tag.
        #
        # NOTE: If we run out of tokens, this will raise, and break out of the loop
        token = parser.next_token()

        # If there is a nested BLOCK `{% %}`, VAR `{{ }}`, or COMMENT `{# #}` tag inside the template tag,
        # then the way Django parses it results in alternating Tokens of TEXT and non-TEXT types.
        #
        # We use `expects_text` to know which type to handle.
        if expects_text:
            if token.token_type != TokenType.TEXT:
                raise TemplateSyntaxError(f"Template parser received TokenType '{token.token_type}' instead of 'TEXT'")

            expects_text = False

            # Once we come across a closing tag in the text, we know that's our original
            # end tag. Until then, append all the text to the block token and continue
            if "%}" not in token.contents:
                block_token.contents += token.contents
                continue

            # This is the ACTUAL end of the block template tag
            remaining_block_content, text_content = token.contents.split("%}", 1)
            block_token.contents += remaining_block_content

            # We put back into the Parser the remaining bit of the text.
            # NOTE: Looking at the implementation, `parser.prepend_token()` is the opposite
            #       of `parser.next_token()`.
            parser.prepend_token(Token(TokenType.TEXT, contents=text_content))
            break

        # In this case we've come across a next block tag `{% %}` inside the template tag.
        # This isn't the first occurrence, where the `{%` was ignored. And so, the content
        # between the `{% %}` is correctly captured, e.g.
        #
        # `{% firstof False 0 is_active %}`
        # gives
        # `TokenType.BLOCK: 'firstof False 0 is_active'`
        #
        # But we don't want to evaluate this as a standalone BLOCK tag, and instead append
        # it to the block tag that this nested block is part of
        else:
            if token.token_type == TokenType.TEXT:
                raise TemplateSyntaxError(
                    f"Template parser received TokenType '{token.token_type}' instead of 'BLOCK', 'VAR', 'COMMENT'"
                )

            if token.token_type == TokenType.BLOCK:
                block_token.contents += "{% " + token.contents + " %}"
            elif token.token_type == TokenType.VAR:
                block_token.contents += "{{ " + token.contents + " }}"
            elif token.token_type == TokenType.COMMENT:
                pass  # Comments are ignored
            else:
                raise TemplateSyntaxError(f"Unknown token type '{token.token_type}'")

            expects_text = True
            continue