refactor: replace selectolax with beautifulsoup (#823)

* refactor: replace selectolax with beautifulsoup

* refactor: add tests for html parser impl

* refactor: add missing import

* refactor: fix tests

* refactor: fix linter issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Juro Oravec 2024-12-08 08:42:48 +01:00 committed by GitHub
parent c61847d30d
commit 1cd545b986
9 changed files with 391 additions and 351 deletions

View file

@@ -29,7 +29,7 @@ classifiers = [
]
dependencies = [
'Django>=4.2',
'selectolax>=0.3.24',
'beautifulsoup4>=4.12',
]
license = {text = "MIT"}

View file

@@ -11,4 +11,4 @@ playwright
requests
types-requests
whitenoise
selectolax
beautifulsoup4

View file

@@ -6,7 +6,9 @@
#
asgiref==3.8.1
# via django
black==24.8.0
beautifulsoup4==4.12.3
# via -r requirements-dev.in
black==24.10.0
# via -r requirements-dev.in
cachetools==5.5.0
# via tox
@@ -16,15 +18,15 @@ cfgv==3.4.0
# via pre-commit
chardet==5.2.0
# via tox
charset-normalizer==3.3.2
charset-normalizer==3.4.0
# via requests
click==8.1.7
# via black
colorama==0.4.6
# via tox
distlib==0.3.8
distlib==0.3.9
# via virtualenv
django==5.1.1
django==5.1.4
# via -r requirements-dev.in
filelock==3.16.1
# via
@@ -38,7 +40,7 @@ flake8-pyproject==1.2.3
# via -r requirements-dev.in
greenlet==3.1.1
# via playwright
identify==2.5.33
identify==2.6.3
# via pre-commit
idna==3.10
# via requests
@@ -54,9 +56,9 @@ mypy-extensions==1.0.0
# via
# black
# mypy
nodeenv==1.8.0
nodeenv==1.9.1
# via pre-commit
packaging==24.1
packaging==24.2
# via
# black
# pyproject-api
@@ -69,7 +71,7 @@ platformdirs==4.3.6
# black
# tox
# virtualenv
playwright==1.48.0
playwright==1.49.0
# via -r requirements-dev.in
pluggy==1.5.0
# via
@@ -77,7 +79,7 @@ pluggy==1.5.0
# tox
pre-commit==4.0.1
# via -r requirements-dev.in
pycodestyle==2.12.0
pycodestyle==2.12.1
# via flake8
pyee==12.0.0
# via playwright
@@ -87,19 +89,19 @@ pyproject-api==1.8.0
# via tox
pytest==8.3.4
# via -r requirements-dev.in
pyyaml==6.0.1
pyyaml==6.0.2
# via pre-commit
requests==2.32.3
# via -r requirements-dev.in
selectolax==0.3.26
# via -r requirements-dev.in
sqlparse==0.5.0
soupsieve==2.6
# via beautifulsoup4
sqlparse==0.5.2
# via django
tox==4.23.2
# via -r requirements-dev.in
types-requests==2.32.0.20241016
# via -r requirements-dev.in
typing-extensions==4.10.0
typing-extensions==4.12.2
# via
# mypy
# pyee
@@ -107,12 +109,9 @@ urllib3==2.2.3
# via
# requests
# types-requests
virtualenv==20.26.6
virtualenv==20.28.0
# via
# pre-commit
# tox
whitenoise==6.7.0
whitenoise==6.8.2
# via -r requirements-dev.in
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View file

@@ -20,7 +20,7 @@
# - pymdown-extensions
# - black
# - django>=4.2
# - selectolax>=0.3.24
# - beautifulsoup4>=4.12
#
asgiref==3.8.1
@@ -29,6 +29,8 @@ babel==2.16.0
# via
# mkdocs-git-revision-date-localized-plugin
# mkdocs-material
beautifulsoup4==4.12.3
# via hatch.envs.docs
black==24.10.0
# via hatch.envs.docs
bracex==2.5.post1
@@ -207,12 +209,12 @@ regex==2024.11.6
# via mkdocs-material
requests==2.32.3
# via mkdocs-material
selectolax==0.3.26
# via hatch.envs.docs
six==1.16.0
# via python-dateutil
smmap==5.0.1
# via gitdb
soupsieve==2.6
# via beautifulsoup4
sqlparse==0.5.2
# via django
tinycss2==1.4.0

View file

@@ -33,11 +33,10 @@ from django.templatetags.static import static
from django.urls import path, reverse
from django.utils.decorators import sync_and_async_middleware
from django.utils.safestring import SafeString, mark_safe
from selectolax.lexbor import LexborHTMLParser
import django_components.types as types
from django_components.util.html import parse_document_or_nodes, parse_multiroot_html, parse_node
from django_components.util.misc import escape_js_string_literal, get_import_path
from django_components.util.html import SoupNode
from django_components.util.misc import _escape_js, get_import_path
if TYPE_CHECKING:
from django_components.component import Component
@@ -362,26 +361,14 @@ def render_dependencies(content: TContent, type: RenderType = "document") -> TCo
# then try to insert the JS scripts at the end of <body> and CSS sheets at the end
# of <head>
if type == "document" and (not did_find_js_placeholder or not did_find_css_placeholder):
tree = parse_document_or_nodes(content_.decode())
maybe_transformed = _insert_js_css_to_default_locations(
content_.decode(),
css_content=None if did_find_css_placeholder else css_dependencies.decode(),
js_content=None if did_find_js_placeholder else js_dependencies.decode(),
)
if isinstance(tree, LexborHTMLParser):
did_modify_html = False
if not did_find_css_placeholder and tree.head:
css_elems = parse_multiroot_html(css_dependencies.decode())
for css_elem in css_elems:
tree.head.insert_child(css_elem) # type: ignore # TODO: Update to selectolax 0.3.25
did_modify_html = True
if not did_find_js_placeholder and tree.body:
js_elems = parse_multiroot_html(js_dependencies.decode())
for js_elem in js_elems:
tree.body.insert_child(js_elem) # type: ignore # TODO: Update to selectolax 0.3.25
did_modify_html = True
transformed = cast(str, tree.html)
if did_modify_html:
content_ = transformed.encode()
if maybe_transformed is not None:
content_ = maybe_transformed.encode()
# Return the same type as we were given
output = content_.decode() if isinstance(content, str) else content_
@@ -567,15 +554,15 @@ def _postprocess_media_tags(
tags_by_url: Dict[str, str] = {}
for tag in tags:
node = parse_node(tag)
node = SoupNode.from_fragment(tag.strip())[0]
# <script src="..."> vs <link href="...">
attr = "src" if script_type == "js" else "href"
maybe_url = node.attrs.get(attr, None)
maybe_url = node.get_attr(attr, None)
if not _is_nonempty_str(maybe_url):
raise RuntimeError(
f"One of entries for `Component.Media.{script_type}` media is missing a "
f"value for attribute '{attr}'. If there is content inlined inside the `<{node.tag}>` tags, "
f"value for attribute '{attr}'. If there is content inlined inside the `<{node.name()}>` tags, "
f"you must move the content to a `.{script_type}` file and reference it via '{attr}'.\nGot:\n{tag}"
)
@@ -739,11 +726,48 @@ def _gen_exec_script(
return exec_script
def _escape_js(js: str, eval: bool = True) -> str:
escaped_js = escape_js_string_literal(js)
# `unescapeJs` is the function we call in the browser to parse the escaped JS
escaped_js = f"Components.unescapeJs(`{escaped_js}`)"
return f"eval({escaped_js})" if eval else escaped_js
def _insert_js_css_to_default_locations(
html_content: str,
js_content: Optional[str],
css_content: Optional[str],
) -> Optional[str]:
"""
This function tries to insert the JS and CSS content into the default locations.
JS is inserted at the end of `<body>`, and CSS is inserted at the end of `<head>`.
"""
elems = SoupNode.from_fragment(html_content)
if not elems:
return None
did_modify_html = False
if css_content is not None:
for elem in elems:
if not elem.is_element():
continue
head = elem.find_tag("head")
if head:
css_elems = SoupNode.from_fragment(css_content)
head.append_children(css_elems)
did_modify_html = True
if js_content is not None:
for elem in elems:
if not elem.is_element():
continue
body = elem.find_tag("body")
if body:
js_elems = SoupNode.from_fragment(js_content)
body.append_children(js_elems)
did_modify_html = True
if did_modify_html:
transformed = SoupNode.to_html_multiroot(elems)
return transformed
else:
return None # No changes made
#########################################################
@@ -802,27 +826,27 @@ class ComponentDependencyMiddleware:
"""
def __init__(self, get_response: "Callable[[HttpRequest], HttpResponse]") -> None:
self.get_response = get_response
self._get_response = get_response
# NOTE: Required to work with async
if iscoroutinefunction(self.get_response):
if iscoroutinefunction(self._get_response):
markcoroutinefunction(self)
def __call__(self, request: HttpRequest) -> HttpResponseBase:
if iscoroutinefunction(self):
return self.__acall__(request)
response = self.get_response(request)
response = self.process_response(response)
response = self._get_response(request)
response = self._process_response(response)
return response
# NOTE: Required to work with async
async def __acall__(self, request: HttpRequest) -> HttpResponseBase:
response = await self.get_response(request)
response = self.process_response(response)
response = await self._get_response(request)
response = self._process_response(response)
return response
def process_response(self, response: HttpResponse) -> HttpResponse:
def _process_response(self, response: HttpResponse) -> HttpResponse:
if not isinstance(response, StreamingHttpResponse) and response.get("Content-Type", "").startswith(
"text/html"
):
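
An aside on the flow above: in "document" mode, when `render_dependencies` does not find the `{% component_js_dependencies %}` / `{% component_css_dependencies %}` placeholders, it now calls `_insert_js_css_to_default_locations`, which appends CSS at the end of `<head>` and JS at the end of `<body>`. A minimal sketch of that behavior follows; the document and tag strings are invented, and the import path assumes the helper lives in `django_components.dependencies`:

from django_components.dependencies import _insert_js_css_to_default_locations

document = "<html><head><title>Hi</title></head><body><p>Hello</p></body></html>"

result = _insert_js_css_to_default_locations(
    document,
    js_content='<script src="/static/app.js"></script>',
    css_content='<link rel="stylesheet" href="/static/app.css">',
)
# `result` is the serialized document with the <link> appended inside <head>
# and the <script> appended inside <body>. It is None when nothing was inserted
# (no parsed elements, no matching <head>/<body>, or both contents being None).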

View file

@@ -1,100 +1,108 @@
from typing import List, Union
from abc import ABC, abstractmethod
from typing import Any, List, Optional, Sequence
from selectolax.lexbor import LexborHTMLParser, LexborNode
from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
def parse_node(html: str) -> LexborNode:
class HTMLNode(ABC):
"""
Use this when you know the given HTML is a single node like
`<div> Hi </div>`
Interface for an HTML manipulation library. This allows us to potentially swap
between different libraries.
"""
tree = LexborHTMLParser(html)
# NOTE: The parser automatically places <style> tags inside <head>
# while <script> tags are inside <body>.
return tree.body.child or tree.head.child # type: ignore[union-attr, return-value]
@classmethod
@abstractmethod
def from_fragment(cls, html: str) -> Sequence["HTMLNode"]: ... # noqa: E704
@abstractmethod
def to_html(self) -> str: ... # noqa: E704
@abstractmethod
def name(self) -> str:
"""Get tag name"""
...
@abstractmethod
def find_tag(self, tag: str) -> Optional["HTMLNode"]: ... # noqa: E704
@abstractmethod
def append_children(self, children: Sequence[Any]) -> None: ... # noqa: E704
@abstractmethod
def get_attr(self, attr: str, default: Any = None) -> Any: ... # noqa: E704
@abstractmethod
def set_attr(self, attr: str, value: Any) -> None: ... # noqa: E704
@abstractmethod
def is_element(self) -> bool: ... # noqa: E704
"""Returns `False` if the node is a text, comment, or doctype node. `True` otherwise."""
@classmethod
def to_html_multiroot(cls, elems: Sequence["HTMLNode"]) -> str:
return "".join([elem.to_html() for elem in elems])
def parse_document_or_nodes(html: str) -> Union[List[LexborNode], LexborHTMLParser]:
"""
Use this if you do NOT know whether the given HTML is a full document
with `<html>`, `<head>`, and `<body>` tags, or an HTML fragment.
"""
html = html.strip()
tree = LexborHTMLParser(html)
is_fragment = is_html_parser_fragment(html, tree)
class SoupNode(HTMLNode):
"""BeautifulSoup implementation of HTMLNode."""
if is_fragment:
nodes = parse_multiroot_html(html)
return nodes
else:
return tree
def __init__(self, node: Tag):
self.node = node
@classmethod
def from_fragment(cls, html: str) -> List["SoupNode"]:
soup = BeautifulSoup(html, "html.parser")
# Get top-level elements in the fragment
return [cls(elem) for elem in soup.contents]
def parse_multiroot_html(html: str) -> List[LexborNode]:
"""
Use this when you know the given HTML is a multiple nodes like
def to_html(self) -> str:
if isinstance(self.node, CData):
return f"<![CDATA[{self.node}]]>"
elif isinstance(self.node, Comment):
return f"<!-- {self.node} -->"
elif isinstance(self.node, Doctype):
return f"<!DOCTYPE {self.node}>"
else:
return str(self.node)
`<div> Hi </div> <span> Hello </span>`
"""
# NOTE: HTML / XML MUST have a single root. So, to support multiple
# top-level elements, we wrap them in a dummy singular root.
parser = LexborHTMLParser(f"<root>{html}</root>")
def name(self) -> str:
return self.node.name
# Get all contents of the root
root_elem = parser.css_first("root")
elems = [*root_elem.iter()] if root_elem else []
return elems
def find_tag(self, tag: str) -> Optional["SoupNode"]:
if isinstance(self.node, Tag) and self.node.name == tag:
return self
else:
match = self.node.select_one(tag)
if match:
return SoupNode(match)
return None
def append_children(self, children: Sequence["SoupNode"]) -> None:
if isinstance(self.node, Tag):
for child in children:
self.node.append(child.node)
def is_html_parser_fragment(html: str, tree: LexborHTMLParser) -> bool:
# If we pass only an HTML fragment to the parser, like `<div>123</div>`, then
# the parser automatically wraps it in `<html>`, `<head>`, and `<body>` tags.
#
# <html>
# <head>
# </head>
# <body>
# <div>123</div>
# </body>
# </html>
#
# But also, as described in Lexbor (https://github.com/lexbor/lexbor/issues/183#issuecomment-1611975340),
# if the parser first comes across HTML tags that could go into the `<head>`,
# it will put them there, and then put the rest in `<body>`.
#
# So `<link href="..." /><div></div>` will be parsed as
#
# <html>
# <head>
# <link href="..." />
# </head>
# <body>
# <div>123</div>
# </body>
# </html>
#
# BUT, if we're dealing with a fragment, we want to parse it correctly as
# a multi-root fragment:
#
# <link href="..." />
# <div>123</div>
#
# The way do so is that we:
# 1. Take the original HTML string
# 2. Subtract the content of parsed `<head>` from the START of the original HTML
# 3. Subtract the content of parsed `<body>` from the END of the original HTML
# 4. Then, if we have an HTML fragment, we should be left with empty string (maybe whitespace?).
# 5. But if we have an HTML document, then the "space between" should contain text,
# because we didn't account for the length of `<html>`, `<head>`, `<body>` tags.
#
# TODO: Replace with fragment parser?
# See https://github.com/rushter/selectolax/issues/74#issuecomment-2404470344
parsed_head_html: str = tree.head.html # type: ignore
parsed_body_html: str = tree.body.html # type: ignore
head_content = parsed_head_html[len("<head>") : -len("</head>")] # noqa: E203
body_content = parsed_body_html[len("<body>") : -len("</body>")] # noqa: E203
between_content = html[len(head_content) : -len(body_content)].strip() # noqa: E203
def get_attr(self, attr: str, default: Any = None) -> Any:
if isinstance(self.node, Tag):
res = self.node.get(attr, default)
if isinstance(res, list):
return " ".join(res)
return res
return default
is_fragment = not html or not between_content
return is_fragment
def set_attr(self, attr: str, value: Any) -> None:
if not isinstance(self.node, Tag):
return
if value is True:
# Set boolean attributes without a value
self.node[attr] = None
elif value is False:
# Remove the attribute
self.node.attrs.pop(attr, None)
else:
self.node[attr] = value
def is_element(self) -> bool:
return isinstance(self.node, Tag)
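
To make the `HTMLNode` interface concrete, here is a minimal usage sketch of `SoupNode` built only from the methods defined above; the HTML strings and variable names are invented for illustration, and the module is internal to django_components:

from django_components.util.html import SoupNode

# Each top-level node of a fragment (elements, comments, text) becomes one SoupNode.
nodes = SoupNode.from_fragment('<div class="card">Hi</div><!-- note --><span>Bye</span>')

div = nodes[0]
print(div.is_element())       # True (comment and text nodes return False)
print(div.get_attr("class"))  # "card" (list-valued attributes are joined with spaces)

div.set_attr("hidden", True)   # True renders a boolean attribute without a value
div.set_attr("hidden", False)  # False removes the attribute again

span = nodes[2].find_tag("span")  # returns the node itself or a matching descendant
span.append_children(SoupNode.from_fragment("<b>!</b>"))

print(SoupNode.to_html_multiroot(nodes))  # serialize all roots back into a single string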

View file

@@ -77,3 +77,10 @@ def get_last_index(lst: List, key: Callable[[Any], bool]) -> Optional[int]:
if key(item):
return len(lst) - 1 - index
return None
def _escape_js(js: str, eval: bool = True) -> str:
escaped_js = escape_js_string_literal(js)
# `unescapeJs` is the function we call in the browser to parse the escaped JS
escaped_js = f"Components.unescapeJs(`{escaped_js}`)"
return f"eval({escaped_js})" if eval else escaped_js

View file

@@ -2,11 +2,11 @@ from unittest.mock import Mock
from django.http import HttpResponseNotModified
from django.template import Context, Template
from selectolax.lexbor import LexborHTMLParser
from django_components import Component, registry, render_dependencies, types
from django_components.components.dynamic import DynamicComponent
from django_components.middleware import ComponentDependencyMiddleware
from django_components.util.html import SoupNode
from .django_test_setup import setup_test_config
from .testutils import BaseTestCase, create_and_process_template_response
@@ -224,7 +224,9 @@ class RenderDependenciesTests(BaseTestCase):
count=1,
)
rendered_body = LexborHTMLParser(rendered).body.html # type: ignore[union-attr]
# Nodes: [Doctype, whitespace, <html>]
nodes = SoupNode.from_fragment(rendered.strip())
rendered_body = nodes[2].find_tag("body").to_html() # type: ignore[union-attr]
self.assertInHTML(
"""<script src="django_components/django_components.min.js">""",
@@ -274,7 +276,9 @@
count=1,
)
rendered_head = LexborHTMLParser(rendered).head.html # type: ignore[union-attr]
# Nodes: [Doctype, whitespace, <html>]
nodes = SoupNode.from_fragment(rendered.strip())
rendered_head = nodes[2].find_tag("head").to_html() # type: ignore[union-attr]
self.assertInHTML(
"""<script src="django_components/django_components.min.js">""",
@@ -287,6 +291,142 @@
count=1,
)
# NOTE: Some HTML parser libraries like selectolax or lxml try to "correct" the given HTML.
# We want to avoid this behavior, so user gets the exact same HTML back.
def test_does_not_try_to_add_close_tags(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
<thead>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
self.assertHTMLEqual(rendered, "<thead>")
def test_does_not_modify_html_when_no_component_used(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
{% for form in formset %}
{% with row_number=forloop.counter %}
<tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
aria-rowindex="{{ row_number }}">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
{{ row_number }}
</td>
</tr>
{% endwith %}
{% endfor %}
</tbody>
</table>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
expected = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
<tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
aria-rowindex="1">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
1
</td>
</tr>
</tbody>
</table>
"""
self.assertHTMLEqual(expected, rendered)
# Explanation: The component is used in the template, but the template doesn't use
# {% component_js_dependencies %} or {% component_css_dependencies %} tags,
# nor defines a `<head>` or `<body>` tag. In which case, the dependencies are not rendered.
def test_does_not_modify_html_when_component_used_but_nowhere_to_insert(self):
registry.register(name="test", component=SimpleComponent)
template_str: types.django_html = """
{% load component_tags %}
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
{% for form in formset %}
{% with row_number=forloop.counter %}
<tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
aria-rowindex="{{ row_number }}">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
{{ row_number }}
{% component "test" variable="hi" / %}
</td>
</tr>
{% endwith %}
{% endfor %}
</tbody>
</table>
"""
rendered_raw = Template(template_str).render(Context({"formset": [1]}))
rendered = render_dependencies(rendered_raw, type="fragment")
expected = """
<table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
<!-- Table head -->
<thead>
<tr class="py-0 my-0 h-7">
<!-- Empty row -->
<th class="min-w-12">#</th>
</tr>
</thead>
<!-- Table body -->
<tbody id="items" class="divide-y divide-slate-300">
<tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
aria-rowindex="1">
<!-- row num -->
<td class="whitespace-nowrap w-fit text-center px-4 w-px"
aria-colindex="1">
1
Variable: <strong>hi</strong>
</td>
</tr>
</tbody>
</table>
"""
self.assertHTMLEqual(expected, rendered)
class MiddlewareTests(BaseTestCase):
def test_middleware_response_without_content_type(self):

View file

@@ -1,14 +1,6 @@
from typing import List, cast
from django.test import TestCase
from selectolax.lexbor import LexborHTMLParser, LexborNode
from django_components.util.html import (
is_html_parser_fragment,
parse_document_or_nodes,
parse_multiroot_html,
parse_node,
)
from django_components.util.html import SoupNode
from .django_test_setup import setup_test_config
@@ -16,50 +8,26 @@ setup_test_config({"autodiscover": False})
class HtmlTests(TestCase):
def test_parse_node(self):
node = parse_node(
def test_beautifulsoup_impl(self):
nodes = SoupNode.from_fragment(
"""
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
"""
)
node.attrs["id"] = "my-id" # type: ignore[index]
node.css("li")[0].attrs["class"] = "item" # type: ignore[index]
self.assertHTMLEqual(
node.html,
"""
<div class="abc xyz" data-id="123" id="my-id">
<ul>
<li class="item">Hi</li>
</ul>
</div>
""",
<!-- I'M COMMENT -->
<button>
Click me!
</button>
""".strip()
)
def test_parse_multiroot_html(self):
html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
nodes = parse_multiroot_html(html)
# Items: <div>, whitespace, comment, whitespace, <button>
self.assertEqual(len(nodes), 5)
self.assertHTMLEqual(
nodes[0].html,
nodes[0].to_html(),
"""
<div class="abc xyz" data-id="123">
<ul>
@@ -69,87 +37,37 @@ """,
""",
)
self.assertHTMLEqual(
nodes[1].html,
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
nodes[2].to_html(),
"<!-- I&#x27;M COMMENT -->",
)
self.assertHTMLEqual(
nodes[2].html,
nodes[4].to_html(),
"""
<span>
Hello
</span>
<button>
Click me!
</button>
""",
)
def test_is_html_parser_fragment(self):
fragment_html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_tree = LexborHTMLParser(fragment_html)
fragment_result = is_html_parser_fragment(fragment_html, fragment_tree)
self.assertEqual(nodes[0].name(), "div")
self.assertEqual(nodes[4].name(), "button")
self.assertEqual(fragment_result, True)
self.assertEqual(nodes[0].is_element(), True)
self.assertEqual(nodes[2].is_element(), False)
self.assertEqual(nodes[4].is_element(), True)
doc_html = """
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
doc_tree = LexborHTMLParser(doc_html)
doc_result = is_html_parser_fragment(doc_html, doc_tree)
self.assertEqual(nodes[0].get_attr("class"), "abc xyz")
self.assertEqual(nodes[4].get_attr("class"), None)
self.assertEqual(doc_result, False)
def test_parse_document_or_nodes__fragment(self):
fragment_html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
nodes[0].set_attr("class", "123 456")
nodes[4].set_attr("class", "abc def")
self.assertEqual(nodes[0].get_attr("class"), "123 456")
self.assertEqual(nodes[4].get_attr("class"), "abc def")
self.assertHTMLEqual(
fragment_result[0].html,
nodes[0].to_html(),
"""
<div class="abc xyz" data-id="123">
<div class="123 456" data-id="123">
<ul>
<li>Hi</li>
</ul>
@@ -157,111 +75,53 @@ """,
""",
)
self.assertHTMLEqual(
fragment_result[1].html,
nodes[4].to_html(),
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<span>
Hello
</span>
<button class="abc def">
Click me!
</button>
""",
)
def test_parse_document_or_nodes__mixed(self):
fragment_html = """
<link href="" />
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
# Setting attr to `True` will set it to boolean attribute,
# while setting it to `False` will remove the attribute.
nodes[4].set_attr("disabled", True)
self.assertHTMLEqual(
fragment_result[0].html,
nodes[4].to_html(),
"""
<link href="" />
<button class="abc def" disabled>
Click me!
</button>
""",
)
nodes[4].set_attr("disabled", False)
self.assertHTMLEqual(
fragment_result[1].html,
nodes[4].to_html(),
"""
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[3].html,
"""
<span>
Hello
</span>
<button class="abc def">
Click me!
</button>
""",
)
def test_parse_document_or_nodes__doc(self):
doc_html = """
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
fragment_result = cast(LexborHTMLParser, parse_document_or_nodes(doc_html))
# Return self
self.assertEqual(nodes[0].node, nodes[0].find_tag("div").node) # type: ignore[union-attr]
# Return descendant
li = nodes[0].find_tag("li")
self.assertHTMLEqual(li.to_html(), "<li>Hi</li>") # type: ignore[union-attr]
# Return None when not found
self.assertEqual(nodes[0].find_tag("main"), None)
# Insert children
li.append_children([nodes[4]]) # type: ignore[union-attr]
self.assertHTMLEqual(
fragment_result.html,
li.to_html(), # type: ignore[union-attr]
"""
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
<li>
Hi
<button class="abc def">
Click me!
</button>
</li>
""",
)