refactor: replace selectolax with beautifulsoup (#823)

* refactor: replace selectolax with beautifulsoup

* refactor: add tests for html parser impl

* refactor: add missing import

* refactor: fix tests

* refactor: fix linter issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Juro Oravec 2024-12-08 08:42:48 +01:00 committed by GitHub
parent c61847d30d
commit 1cd545b986
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 391 additions and 351 deletions

View file

@ -29,7 +29,7 @@ classifiers = [
] ]
dependencies = [ dependencies = [
'Django>=4.2', 'Django>=4.2',
'selectolax>=0.3.24', 'beautifulsoup4>=4.12',
] ]
license = {text = "MIT"} license = {text = "MIT"}

View file

@ -11,4 +11,4 @@ playwright
requests requests
types-requests types-requests
whitenoise whitenoise
selectolax beautifulsoup4

View file

@ -6,7 +6,9 @@
# #
asgiref==3.8.1 asgiref==3.8.1
# via django # via django
black==24.8.0 beautifulsoup4==4.12.3
# via -r requirements-dev.in
black==24.10.0
# via -r requirements-dev.in # via -r requirements-dev.in
cachetools==5.5.0 cachetools==5.5.0
# via tox # via tox
@ -16,15 +18,15 @@ cfgv==3.4.0
# via pre-commit # via pre-commit
chardet==5.2.0 chardet==5.2.0
# via tox # via tox
charset-normalizer==3.3.2 charset-normalizer==3.4.0
# via requests # via requests
click==8.1.7 click==8.1.7
# via black # via black
colorama==0.4.6 colorama==0.4.6
# via tox # via tox
distlib==0.3.8 distlib==0.3.9
# via virtualenv # via virtualenv
django==5.1.1 django==5.1.4
# via -r requirements-dev.in # via -r requirements-dev.in
filelock==3.16.1 filelock==3.16.1
# via # via
@ -38,7 +40,7 @@ flake8-pyproject==1.2.3
# via -r requirements-dev.in # via -r requirements-dev.in
greenlet==3.1.1 greenlet==3.1.1
# via playwright # via playwright
identify==2.5.33 identify==2.6.3
# via pre-commit # via pre-commit
idna==3.10 idna==3.10
# via requests # via requests
@ -54,9 +56,9 @@ mypy-extensions==1.0.0
# via # via
# black # black
# mypy # mypy
nodeenv==1.8.0 nodeenv==1.9.1
# via pre-commit # via pre-commit
packaging==24.1 packaging==24.2
# via # via
# black # black
# pyproject-api # pyproject-api
@ -69,7 +71,7 @@ platformdirs==4.3.6
# black # black
# tox # tox
# virtualenv # virtualenv
playwright==1.48.0 playwright==1.49.0
# via -r requirements-dev.in # via -r requirements-dev.in
pluggy==1.5.0 pluggy==1.5.0
# via # via
@ -77,7 +79,7 @@ pluggy==1.5.0
# tox # tox
pre-commit==4.0.1 pre-commit==4.0.1
# via -r requirements-dev.in # via -r requirements-dev.in
pycodestyle==2.12.0 pycodestyle==2.12.1
# via flake8 # via flake8
pyee==12.0.0 pyee==12.0.0
# via playwright # via playwright
@ -87,19 +89,19 @@ pyproject-api==1.8.0
# via tox # via tox
pytest==8.3.4 pytest==8.3.4
# via -r requirements-dev.in # via -r requirements-dev.in
pyyaml==6.0.1 pyyaml==6.0.2
# via pre-commit # via pre-commit
requests==2.32.3 requests==2.32.3
# via -r requirements-dev.in # via -r requirements-dev.in
selectolax==0.3.26 soupsieve==2.6
# via -r requirements-dev.in # via beautifulsoup4
sqlparse==0.5.0 sqlparse==0.5.2
# via django # via django
tox==4.23.2 tox==4.23.2
# via -r requirements-dev.in # via -r requirements-dev.in
types-requests==2.32.0.20241016 types-requests==2.32.0.20241016
# via -r requirements-dev.in # via -r requirements-dev.in
typing-extensions==4.10.0 typing-extensions==4.12.2
# via # via
# mypy # mypy
# pyee # pyee
@ -107,12 +109,9 @@ urllib3==2.2.3
# via # via
# requests # requests
# types-requests # types-requests
virtualenv==20.26.6 virtualenv==20.28.0
# via # via
# pre-commit # pre-commit
# tox # tox
whitenoise==6.7.0 whitenoise==6.8.2
# via -r requirements-dev.in # via -r requirements-dev.in
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View file

@ -20,7 +20,7 @@
# - pymdown-extensions # - pymdown-extensions
# - black # - black
# - django>=4.2 # - django>=4.2
# - selectolax>=0.3.24 # - beautifulsoup4>=4.12
# #
asgiref==3.8.1 asgiref==3.8.1
@ -29,6 +29,8 @@ babel==2.16.0
# via # via
# mkdocs-git-revision-date-localized-plugin # mkdocs-git-revision-date-localized-plugin
# mkdocs-material # mkdocs-material
beautifulsoup4==4.12.3
# via hatch.envs.docs
black==24.10.0 black==24.10.0
# via hatch.envs.docs # via hatch.envs.docs
bracex==2.5.post1 bracex==2.5.post1
@ -207,12 +209,12 @@ regex==2024.11.6
# via mkdocs-material # via mkdocs-material
requests==2.32.3 requests==2.32.3
# via mkdocs-material # via mkdocs-material
selectolax==0.3.26
# via hatch.envs.docs
six==1.16.0 six==1.16.0
# via python-dateutil # via python-dateutil
smmap==5.0.1 smmap==5.0.1
# via gitdb # via gitdb
soupsieve==2.6
# via beautifulsoup4
sqlparse==0.5.2 sqlparse==0.5.2
# via django # via django
tinycss2==1.4.0 tinycss2==1.4.0

View file

@ -33,11 +33,10 @@ from django.templatetags.static import static
from django.urls import path, reverse from django.urls import path, reverse
from django.utils.decorators import sync_and_async_middleware from django.utils.decorators import sync_and_async_middleware
from django.utils.safestring import SafeString, mark_safe from django.utils.safestring import SafeString, mark_safe
from selectolax.lexbor import LexborHTMLParser
import django_components.types as types import django_components.types as types
from django_components.util.html import parse_document_or_nodes, parse_multiroot_html, parse_node from django_components.util.html import SoupNode
from django_components.util.misc import escape_js_string_literal, get_import_path from django_components.util.misc import _escape_js, get_import_path
if TYPE_CHECKING: if TYPE_CHECKING:
from django_components.component import Component from django_components.component import Component
@ -362,26 +361,14 @@ def render_dependencies(content: TContent, type: RenderType = "document") -> TCo
# then try to insert the JS scripts at the end of <body> and CSS sheets at the end # then try to insert the JS scripts at the end of <body> and CSS sheets at the end
# of <head> # of <head>
if type == "document" and (not did_find_js_placeholder or not did_find_css_placeholder): if type == "document" and (not did_find_js_placeholder or not did_find_css_placeholder):
tree = parse_document_or_nodes(content_.decode()) maybe_transformed = _insert_js_css_to_default_locations(
content_.decode(),
css_content=None if did_find_css_placeholder else css_dependencies.decode(),
js_content=None if did_find_js_placeholder else js_dependencies.decode(),
)
if isinstance(tree, LexborHTMLParser): if maybe_transformed is not None:
did_modify_html = False content_ = maybe_transformed.encode()
if not did_find_css_placeholder and tree.head:
css_elems = parse_multiroot_html(css_dependencies.decode())
for css_elem in css_elems:
tree.head.insert_child(css_elem) # type: ignore # TODO: Update to selectolax 0.3.25
did_modify_html = True
if not did_find_js_placeholder and tree.body:
js_elems = parse_multiroot_html(js_dependencies.decode())
for js_elem in js_elems:
tree.body.insert_child(js_elem) # type: ignore # TODO: Update to selectolax 0.3.25
did_modify_html = True
transformed = cast(str, tree.html)
if did_modify_html:
content_ = transformed.encode()
# Return the same type as we were given # Return the same type as we were given
output = content_.decode() if isinstance(content, str) else content_ output = content_.decode() if isinstance(content, str) else content_
@ -567,15 +554,15 @@ def _postprocess_media_tags(
tags_by_url: Dict[str, str] = {} tags_by_url: Dict[str, str] = {}
for tag in tags: for tag in tags:
node = parse_node(tag) node = SoupNode.from_fragment(tag.strip())[0]
# <script src="..."> vs <link href="..."> # <script src="..."> vs <link href="...">
attr = "src" if script_type == "js" else "href" attr = "src" if script_type == "js" else "href"
maybe_url = node.attrs.get(attr, None) maybe_url = node.get_attr(attr, None)
if not _is_nonempty_str(maybe_url): if not _is_nonempty_str(maybe_url):
raise RuntimeError( raise RuntimeError(
f"One of entries for `Component.Media.{script_type}` media is missing a " f"One of entries for `Component.Media.{script_type}` media is missing a "
f"value for attribute '{attr}'. If there is content inlined inside the `<{node.tag}>` tags, " f"value for attribute '{attr}'. If there is content inlined inside the `<{node.name()}>` tags, "
f"you must move the content to a `.{script_type}` file and reference it via '{attr}'.\nGot:\n{tag}" f"you must move the content to a `.{script_type}` file and reference it via '{attr}'.\nGot:\n{tag}"
) )
@ -739,11 +726,48 @@ def _gen_exec_script(
return exec_script return exec_script
def _escape_js(js: str, eval: bool = True) -> str: def _insert_js_css_to_default_locations(
escaped_js = escape_js_string_literal(js) html_content: str,
# `unescapeJs` is the function we call in the browser to parse the escaped JS js_content: Optional[str],
escaped_js = f"Components.unescapeJs(`{escaped_js}`)" css_content: Optional[str],
return f"eval({escaped_js})" if eval else escaped_js ) -> Optional[str]:
"""
This function tries to insert the JS and CSS content into the default locations.
JS is inserted at the end of `<body>`, and CSS is inserted at the end of `<head>`.
"""
elems = SoupNode.from_fragment(html_content)
if not elems:
return None
did_modify_html = False
if css_content is not None:
for elem in elems:
if not elem.is_element():
continue
head = elem.find_tag("head")
if head:
css_elems = SoupNode.from_fragment(css_content)
head.append_children(css_elems)
did_modify_html = True
if js_content is not None:
for elem in elems:
if not elem.is_element():
continue
body = elem.find_tag("body")
if body:
js_elems = SoupNode.from_fragment(js_content)
body.append_children(js_elems)
did_modify_html = True
if did_modify_html:
transformed = SoupNode.to_html_multiroot(elems)
return transformed
else:
return None # No changes made
######################################################### #########################################################
@ -802,27 +826,27 @@ class ComponentDependencyMiddleware:
""" """
def __init__(self, get_response: "Callable[[HttpRequest], HttpResponse]") -> None: def __init__(self, get_response: "Callable[[HttpRequest], HttpResponse]") -> None:
self.get_response = get_response self._get_response = get_response
# NOTE: Required to work with async # NOTE: Required to work with async
if iscoroutinefunction(self.get_response): if iscoroutinefunction(self._get_response):
markcoroutinefunction(self) markcoroutinefunction(self)
def __call__(self, request: HttpRequest) -> HttpResponseBase: def __call__(self, request: HttpRequest) -> HttpResponseBase:
if iscoroutinefunction(self): if iscoroutinefunction(self):
return self.__acall__(request) return self.__acall__(request)
response = self.get_response(request) response = self._get_response(request)
response = self.process_response(response) response = self._process_response(response)
return response return response
# NOTE: Required to work with async # NOTE: Required to work with async
async def __acall__(self, request: HttpRequest) -> HttpResponseBase: async def __acall__(self, request: HttpRequest) -> HttpResponseBase:
response = await self.get_response(request) response = await self._get_response(request)
response = self.process_response(response) response = self._process_response(response)
return response return response
def process_response(self, response: HttpResponse) -> HttpResponse: def _process_response(self, response: HttpResponse) -> HttpResponse:
if not isinstance(response, StreamingHttpResponse) and response.get("Content-Type", "").startswith( if not isinstance(response, StreamingHttpResponse) and response.get("Content-Type", "").startswith(
"text/html" "text/html"
): ):

View file

@ -1,100 +1,108 @@
from typing import List, Union from abc import ABC, abstractmethod
from typing import Any, List, Optional, Sequence
from selectolax.lexbor import LexborHTMLParser, LexborNode from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
def parse_node(html: str) -> LexborNode: class HTMLNode(ABC):
""" """
Use this when you know the given HTML is a single node like Interface for an HTML manipulation library. This allows us to potentially swap
between different libraries.
`<div> Hi </div>`
""" """
tree = LexborHTMLParser(html)
# NOTE: The parser automatically places <style> tags inside <head> @classmethod
# while <script> tags are inside <body>. @abstractmethod
return tree.body.child or tree.head.child # type: ignore[union-attr, return-value] def from_fragment(cls, html: str) -> Sequence["HTMLNode"]: ... # noqa: E704
@abstractmethod
def to_html(self) -> str: ... # noqa: E704
@abstractmethod
def name(self) -> str:
"""Get tag name"""
...
@abstractmethod
def find_tag(self, tag: str) -> Optional["HTMLNode"]: ... # noqa: E704
@abstractmethod
def append_children(self, children: Sequence[Any]) -> None: ... # noqa: E704
@abstractmethod
def get_attr(self, attr: str, default: Any = None) -> Any: ... # noqa: E704
@abstractmethod
def set_attr(self, attr: str, value: Any) -> None: ... # noqa: E704
@abstractmethod
def is_element(self) -> bool: ... # noqa: E704
"""Returns `False` if the node is a text, comment, or doctype node. `True` otherwise."""
@classmethod
def to_html_multiroot(cls, elems: Sequence["HTMLNode"]) -> str:
return "".join([elem.to_html() for elem in elems])
def parse_document_or_nodes(html: str) -> Union[List[LexborNode], LexborHTMLParser]: class SoupNode(HTMLNode):
""" """BeautifulSoup implementation of HTMLNode."""
Use this if you do NOT know whether the given HTML is a full document
with `<html>`, `<head>`, and `<body>` tags, or an HTML fragment.
"""
html = html.strip()
tree = LexborHTMLParser(html)
is_fragment = is_html_parser_fragment(html, tree)
if is_fragment: def __init__(self, node: Tag):
nodes = parse_multiroot_html(html) self.node = node
return nodes
else:
return tree
@classmethod
def from_fragment(cls, html: str) -> List["SoupNode"]:
soup = BeautifulSoup(html, "html.parser")
# Get top-level elements in the fragment
return [cls(elem) for elem in soup.contents]
def parse_multiroot_html(html: str) -> List[LexborNode]: def to_html(self) -> str:
""" if isinstance(self.node, CData):
Use this when you know the given HTML is a multiple nodes like return f"<![CDATA[{self.node}]]>"
elif isinstance(self.node, Comment):
return f"<!-- {self.node} -->"
elif isinstance(self.node, Doctype):
return f"<!DOCTYPE {self.node}>"
else:
return str(self.node)
`<div> Hi </div> <span> Hello </span>` def name(self) -> str:
""" return self.node.name
# NOTE: HTML / XML MUST have a single root. So, to support multiple
# top-level elements, we wrap them in a dummy singular root.
parser = LexborHTMLParser(f"<root>{html}</root>")
# Get all contents of the root def find_tag(self, tag: str) -> Optional["SoupNode"]:
root_elem = parser.css_first("root") if isinstance(self.node, Tag) and self.node.name == tag:
elems = [*root_elem.iter()] if root_elem else [] return self
return elems else:
match = self.node.select_one(tag)
if match:
return SoupNode(match)
return None
def append_children(self, children: Sequence["SoupNode"]) -> None:
if isinstance(self.node, Tag):
for child in children:
self.node.append(child.node)
def is_html_parser_fragment(html: str, tree: LexborHTMLParser) -> bool: def get_attr(self, attr: str, default: Any = None) -> Any:
# If we pass only an HTML fragment to the parser, like `<div>123</div>`, then if isinstance(self.node, Tag):
# the parser automatically wraps it in `<html>`, `<head>`, and `<body>` tags. res = self.node.get(attr, default)
# if isinstance(res, list):
# <html> return " ".join(res)
# <head> return res
# </head> return default
# <body>
# <div>123</div>
# </body>
# </html>
#
# But also, as described in Lexbor (https://github.com/lexbor/lexbor/issues/183#issuecomment-1611975340),
# if the parser first comes across HTML tags that could go into the `<head>`,
# it will put them there, and then put the rest in `<body>`.
#
# So `<link href="..." /><div></div>` will be parsed as
#
# <html>
# <head>
# <link href="..." />
# </head>
# <body>
# <div>123</div>
# </body>
# </html>
#
# BUT, if we're dealing with a fragment, we want to parse it correctly as
# a multi-root fragment:
#
# <link href="..." />
# <div>123</div>
#
# The way we do so is that we:
# 1. Take the original HTML string
# 2. Subtract the content of parsed `<head>` from the START of the original HTML
# 3. Subtract the content of parsed `<body>` from the END of the original HTML
# 4. Then, if we have an HTML fragment, we should be left with empty string (maybe whitespace?).
# 5. But if we have an HTML document, then the "space between" should contain text,
# because we didn't account for the length of `<html>`, `<head>`, `<body>` tags.
#
# TODO: Replace with fragment parser?
# See https://github.com/rushter/selectolax/issues/74#issuecomment-2404470344
parsed_head_html: str = tree.head.html # type: ignore
parsed_body_html: str = tree.body.html # type: ignore
head_content = parsed_head_html[len("<head>") : -len("</head>")] # noqa: E203
body_content = parsed_body_html[len("<body>") : -len("</body>")] # noqa: E203
between_content = html[len(head_content) : -len(body_content)].strip() # noqa: E203
is_fragment = not html or not between_content def set_attr(self, attr: str, value: Any) -> None:
return is_fragment if not isinstance(self.node, Tag):
return
if value is True:
# Set boolean attributes without a value
self.node[attr] = None
elif value is False:
# Remove the attribute
self.node.attrs.pop(attr, None)
else:
self.node[attr] = value
def is_element(self) -> bool:
return isinstance(self.node, Tag)

View file

@ -77,3 +77,10 @@ def get_last_index(lst: List, key: Callable[[Any], bool]) -> Optional[int]:
if key(item): if key(item):
return len(lst) - 1 - index return len(lst) - 1 - index
return None return None
def _escape_js(js: str, eval: bool = True) -> str:
escaped_js = escape_js_string_literal(js)
# `unescapeJs` is the function we call in the browser to parse the escaped JS
escaped_js = f"Components.unescapeJs(`{escaped_js}`)"
return f"eval({escaped_js})" if eval else escaped_js

View file

@ -2,11 +2,11 @@ from unittest.mock import Mock
from django.http import HttpResponseNotModified from django.http import HttpResponseNotModified
from django.template import Context, Template from django.template import Context, Template
from selectolax.lexbor import LexborHTMLParser
from django_components import Component, registry, render_dependencies, types from django_components import Component, registry, render_dependencies, types
from django_components.components.dynamic import DynamicComponent from django_components.components.dynamic import DynamicComponent
from django_components.middleware import ComponentDependencyMiddleware from django_components.middleware import ComponentDependencyMiddleware
from django_components.util.html import SoupNode
from .django_test_setup import setup_test_config from .django_test_setup import setup_test_config
from .testutils import BaseTestCase, create_and_process_template_response from .testutils import BaseTestCase, create_and_process_template_response
@ -224,7 +224,9 @@ class RenderDependenciesTests(BaseTestCase):
count=1, count=1,
) )
rendered_body = LexborHTMLParser(rendered).body.html # type: ignore[union-attr] # Nodes: [Doctype, whitespace, <html>]
nodes = SoupNode.from_fragment(rendered.strip())
rendered_body = nodes[2].find_tag("body").to_html() # type: ignore[union-attr]
self.assertInHTML( self.assertInHTML(
"""<script src="django_components/django_components.min.js">""", """<script src="django_components/django_components.min.js">""",
@ -274,7 +276,9 @@ class RenderDependenciesTests(BaseTestCase):
count=1, count=1,
) )
rendered_head = LexborHTMLParser(rendered).head.html # type: ignore[union-attr] # Nodes: [Doctype, whitespace, <html>]
nodes = SoupNode.from_fragment(rendered.strip())
rendered_head = nodes[2].find_tag("head").to_html() # type: ignore[union-attr]
self.assertInHTML( self.assertInHTML(
"""<script src="django_components/django_components.min.js">""", """<script src="django_components/django_components.min.js">""",
@ -287,6 +291,142 @@ class RenderDependenciesTests(BaseTestCase):
count=1, count=1,
) )
# NOTE: Some HTML parser libraries like selectolax or lxml try to "correct" the given HTML.
# We want to avoid this behavior, so that the user gets the exact same HTML back.
def test_does_not_try_to_add_close_tags(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
<thead>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
self.assertHTMLEqual(rendered, "<thead>")
def test_does_not_modify_html_when_no_component_used(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
{% for form in formset %}
{% with row_number=forloop.counter %}
<tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
aria-rowindex="{{ row_number }}">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
{{ row_number }}
</td>
</tr>
{% endwith %}
{% endfor %}
</tbody>
</table>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
expected = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
<tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
aria-rowindex="1">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
1
</td>
</tr>
</tbody>
</table>
"""
self.assertHTMLEqual(expected, rendered)
# Explanation: The component is used in the template, but the template doesn't use
# {% component_js_dependencies %} or {% component_css_dependencies %} tags,
# nor does it define a `<head>` or `<body>` tag. In that case, the dependencies are not rendered.
def test_does_not_modify_html_when_component_used_but_nowhere_to_insert(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
{% load component_tags %}
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
{% for form in formset %}
{% with row_number=forloop.counter %}
<tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
aria-rowindex="{{ row_number }}">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
{{ row_number }}
{% component "test" variable="hi" / %}
</td>
</tr>
{% endwith %}
{% endfor %}
</tbody>
</table>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
expected = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
<tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
aria-rowindex="1">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
1
Variable: <strong>hi</strong>
</td>
</tr>
</tbody>
</table>
"""
self.assertHTMLEqual(expected, rendered)
class MiddlewareTests(BaseTestCase): class MiddlewareTests(BaseTestCase):
def test_middleware_response_without_content_type(self): def test_middleware_response_without_content_type(self):

View file

@ -1,14 +1,6 @@
from typing import List, cast
from django.test import TestCase from django.test import TestCase
from selectolax.lexbor import LexborHTMLParser, LexborNode
from django_components.util.html import ( from django_components.util.html import SoupNode
is_html_parser_fragment,
parse_document_or_nodes,
parse_multiroot_html,
parse_node,
)
from .django_test_setup import setup_test_config from .django_test_setup import setup_test_config
@ -16,50 +8,26 @@ setup_test_config({"autodiscover": False})
class HtmlTests(TestCase): class HtmlTests(TestCase):
def test_parse_node(self): def test_beautifulsoup_impl(self):
node = parse_node( nodes = SoupNode.from_fragment(
""" """
<div class="abc xyz" data-id="123"> <div class="abc xyz" data-id="123">
<ul> <ul>
<li>Hi</li> <li>Hi</li>
</ul> </ul>
</div> </div>
""" <!-- I'M COMMENT -->
) <button>
node.attrs["id"] = "my-id" # type: ignore[index] Click me!
node.css("li")[0].attrs["class"] = "item" # type: ignore[index] </button>
""".strip()
self.assertHTMLEqual(
node.html,
"""
<div class="abc xyz" data-id="123" id="my-id">
<ul>
<li class="item">Hi</li>
</ul>
</div>
""",
) )
def test_parse_multiroot_html(self): # Items: <div>, whitespace, comment, whitespace, <button>
html = """ self.assertEqual(len(nodes), 5)
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
nodes = parse_multiroot_html(html)
self.assertHTMLEqual( self.assertHTMLEqual(
nodes[0].html, nodes[0].to_html(),
""" """
<div class="abc xyz" data-id="123"> <div class="abc xyz" data-id="123">
<ul> <ul>
@ -69,87 +37,37 @@ class HtmlTests(TestCase):
""", """,
) )
self.assertHTMLEqual( self.assertHTMLEqual(
nodes[1].html, nodes[2].to_html(),
""" "<!-- I&#x27;M COMMENT -->",
<main id="123" class="one">
<div>
42
</div>
</main>
""",
) )
self.assertHTMLEqual( self.assertHTMLEqual(
nodes[2].html, nodes[4].to_html(),
""" """
<span> <button>
Hello Click me!
</span> </button>
""", """,
) )
def test_is_html_parser_fragment(self): self.assertEqual(nodes[0].name(), "div")
fragment_html = """ self.assertEqual(nodes[4].name(), "button")
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_tree = LexborHTMLParser(fragment_html)
fragment_result = is_html_parser_fragment(fragment_html, fragment_tree)
self.assertEqual(fragment_result, True) self.assertEqual(nodes[0].is_element(), True)
self.assertEqual(nodes[2].is_element(), False)
self.assertEqual(nodes[4].is_element(), True)
doc_html = """ self.assertEqual(nodes[0].get_attr("class"), "abc xyz")
<!doctype html> self.assertEqual(nodes[4].get_attr("class"), None)
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
doc_tree = LexborHTMLParser(doc_html)
doc_result = is_html_parser_fragment(doc_html, doc_tree)
self.assertEqual(doc_result, False) nodes[0].set_attr("class", "123 456")
nodes[4].set_attr("class", "abc def")
def test_parse_document_or_nodes__fragment(self): self.assertEqual(nodes[0].get_attr("class"), "123 456")
fragment_html = """ self.assertEqual(nodes[4].get_attr("class"), "abc def")
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
self.assertHTMLEqual( self.assertHTMLEqual(
fragment_result[0].html, nodes[0].to_html(),
""" """
<div class="abc xyz" data-id="123"> <div class="123 456" data-id="123">
<ul> <ul>
<li>Hi</li> <li>Hi</li>
</ul> </ul>
@ -157,111 +75,53 @@ class HtmlTests(TestCase):
""", """,
) )
self.assertHTMLEqual( self.assertHTMLEqual(
fragment_result[1].html, nodes[4].to_html(),
""" """
<main id="123" class="one"> <button class="abc def">
<div> Click me!
42 </button>
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<span>
Hello
</span>
""", """,
) )
def test_parse_document_or_nodes__mixed(self): # Setting attr to `True` will set it to boolean attribute,
fragment_html = """ # while setting it to `False` will remove the attribute.
<link href="" /> nodes[4].set_attr("disabled", True)
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
self.assertHTMLEqual( self.assertHTMLEqual(
fragment_result[0].html, nodes[4].to_html(),
""" """
<link href="" /> <button class="abc def" disabled>
Click me!
</button>
""", """,
) )
nodes[4].set_attr("disabled", False)
self.assertHTMLEqual( self.assertHTMLEqual(
fragment_result[1].html, nodes[4].to_html(),
""" """
<div class="abc xyz" data-id="123"> <button class="abc def">
<ul> Click me!
<li>Hi</li> </button>
</ul>
</div>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[3].html,
"""
<span>
Hello
</span>
""", """,
) )
def test_parse_document_or_nodes__doc(self): # Return self
doc_html = """ self.assertEqual(nodes[0].node, nodes[0].find_tag("div").node) # type: ignore[union-attr]
<!doctype html> # Return descendant
<html> li = nodes[0].find_tag("li")
<head> self.assertHTMLEqual(li.to_html(), "<li>Hi</li>") # type: ignore[union-attr]
<link href="https://..." /> # Return None when not found
</head> self.assertEqual(nodes[0].find_tag("main"), None)
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
fragment_result = cast(LexborHTMLParser, parse_document_or_nodes(doc_html))
# Insert children
li.append_children([nodes[4]]) # type: ignore[union-attr]
self.assertHTMLEqual( self.assertHTMLEqual(
fragment_result.html, li.to_html(), # type: ignore[union-attr]
""" """
<!doctype html> <li>
<html> Hi
<head> <button class="abc def">
<link href="https://..." /> Click me!
</head> </button>
<body> </li>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
""", """,
) )