refactor: replace selectolax with beautifulsoup (#823)

* refactor: replace selectolax with beautifulsoup * refactor: add tests for html parser impl * refactor: add missing import * refactor: fix tests * refactor: fix linter issues * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-08-18 13:10:13 +00:00 · 2024-12-08 08:42:48 +01:00 · 2024-12-08 08:42:48 +01:00 · 1cd545b986
commit 1cd545b986
parent c61847d30d
9 changed files with 391 additions and 351 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@ -29,7 +29,7 @@ classifiers = [
 ]
 dependencies = [
    'Django>=4.2',
-    'selectolax>=0.3.24',
+    'beautifulsoup4>=4.12',
 ]
 license = {text = "MIT"}
--- a/requirements-dev.in
+++ b/requirements-dev.in
@ -11,4 +11,4 @@ playwright
 requests
 types-requests
 whitenoise
-selectolax
+beautifulsoup4
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -6,7 +6,9 @@
 #
 asgiref==3.8.1
    # via django
-black==24.8.0
+beautifulsoup4==4.12.3
    # via -r requirements-dev.in
 black==24.10.0
    # via -r requirements-dev.in
 cachetools==5.5.0
    # via tox
@ -16,15 +18,15 @@ cfgv==3.4.0
    # via pre-commit
 chardet==5.2.0
    # via tox
-charset-normalizer==3.3.2
+charset-normalizer==3.4.0
    # via requests
 click==8.1.7
    # via black
 colorama==0.4.6
    # via tox
-distlib==0.3.8
+distlib==0.3.9
    # via virtualenv
-django==5.1.1
+django==5.1.4
    # via -r requirements-dev.in
 filelock==3.16.1
    # via
@ -38,7 +40,7 @@ flake8-pyproject==1.2.3
    # via -r requirements-dev.in
 greenlet==3.1.1
    # via playwright
-identify==2.5.33
+identify==2.6.3
    # via pre-commit
 idna==3.10
    # via requests
@ -54,9 +56,9 @@ mypy-extensions==1.0.0
    # via
    #   black
    #   mypy
-nodeenv==1.8.0
+nodeenv==1.9.1
    # via pre-commit
-packaging==24.1
+packaging==24.2
    # via
    #   black
    #   pyproject-api
@ -69,7 +71,7 @@ platformdirs==4.3.6
    #   black
    #   tox
    #   virtualenv
-playwright==1.48.0
+playwright==1.49.0
    # via -r requirements-dev.in
 pluggy==1.5.0
    # via
@ -77,7 +79,7 @@ pluggy==1.5.0
    #   tox
 pre-commit==4.0.1
    # via -r requirements-dev.in
-pycodestyle==2.12.0
+pycodestyle==2.12.1
    # via flake8
 pyee==12.0.0
    # via playwright
@ -87,19 +89,19 @@ pyproject-api==1.8.0
    # via tox
 pytest==8.3.4
    # via -r requirements-dev.in
-pyyaml==6.0.1
+pyyaml==6.0.2
    # via pre-commit
 requests==2.32.3
    # via -r requirements-dev.in
-selectolax==0.3.26
+soupsieve==2.6
-    # via -r requirements-dev.in
+    # via beautifulsoup4
-sqlparse==0.5.0
+sqlparse==0.5.2
    # via django
 tox==4.23.2
    # via -r requirements-dev.in
 types-requests==2.32.0.20241016
    # via -r requirements-dev.in
-typing-extensions==4.10.0
+typing-extensions==4.12.2
    # via
    #   mypy
    #   pyee
@ -107,12 +109,9 @@ urllib3==2.2.3
    # via
    #   requests
    #   types-requests
-virtualenv==20.26.6
+virtualenv==20.28.0
    # via
    #   pre-commit
    #   tox
-whitenoise==6.7.0
+whitenoise==6.8.2
    # via -r requirements-dev.in
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@ -20,7 +20,7 @@
 # - pymdown-extensions
 # - black
 # - django>=4.2
-# - selectolax>=0.3.24
+# - beautifulsoup4>=4.12
 #
 asgiref==3.8.1
@ -29,6 +29,8 @@ babel==2.16.0
    # via
    #   mkdocs-git-revision-date-localized-plugin
    #   mkdocs-material
 beautifulsoup4==4.12.3
    # via hatch.envs.docs
 black==24.10.0
    # via hatch.envs.docs
 bracex==2.5.post1
@ -207,12 +209,12 @@ regex==2024.11.6
    # via mkdocs-material
 requests==2.32.3
    # via mkdocs-material
 selectolax==0.3.26
    # via hatch.envs.docs
 six==1.16.0
    # via python-dateutil
 smmap==5.0.1
    # via gitdb
 soupsieve==2.6
    # via beautifulsoup4
 sqlparse==0.5.2
    # via django
 tinycss2==1.4.0
--- a/src/django_components/dependencies.py
+++ b/src/django_components/dependencies.py
@ -33,11 +33,10 @@ from django.templatetags.static import static
 from django.urls import path, reverse
 from django.utils.decorators import sync_and_async_middleware
 from django.utils.safestring import SafeString, mark_safe
 from selectolax.lexbor import LexborHTMLParser
 import django_components.types as types
-from django_components.util.html import parse_document_or_nodes, parse_multiroot_html, parse_node
+from django_components.util.html import SoupNode
-from django_components.util.misc import escape_js_string_literal, get_import_path
+from django_components.util.misc import _escape_js, get_import_path
 if TYPE_CHECKING:
    from django_components.component import Component
@ -362,26 +361,14 @@ def render_dependencies(content: TContent, type: RenderType = "document") -> TCo
    # then try to insert the JS scripts at the end of <body> and CSS sheets at the end
    # of <head>
    if type == "document" and (not did_find_js_placeholder or not did_find_css_placeholder):
-        tree = parse_document_or_nodes(content_.decode())
+        maybe_transformed = _insert_js_css_to_default_locations(
            content_.decode(),
            css_content=None if did_find_css_placeholder else css_dependencies.decode(),
            js_content=None if did_find_js_placeholder else js_dependencies.decode(),
        )
-        if isinstance(tree, LexborHTMLParser):
+        if maybe_transformed is not None:
-            did_modify_html = False
+            content_ = maybe_transformed.encode()
            if not did_find_css_placeholder and tree.head:
                css_elems = parse_multiroot_html(css_dependencies.decode())
                for css_elem in css_elems:
                    tree.head.insert_child(css_elem)  # type: ignore # TODO: Update to selectolax 0.3.25
                did_modify_html = True
            if not did_find_js_placeholder and tree.body:
                js_elems = parse_multiroot_html(js_dependencies.decode())
                for js_elem in js_elems:
                    tree.body.insert_child(js_elem)  # type: ignore # TODO: Update to selectolax 0.3.25
                did_modify_html = True
            transformed = cast(str, tree.html)
            if did_modify_html:
                content_ = transformed.encode()
    # Return the same type as we were given
    output = content_.decode() if isinstance(content, str) else content_
@ -567,15 +554,15 @@ def _postprocess_media_tags(
    tags_by_url: Dict[str, str] = {}
    for tag in tags:
-        node = parse_node(tag)
+        node = SoupNode.from_fragment(tag.strip())[0]
        # <script src="..."> vs <link href="...">
        attr = "src" if script_type == "js" else "href"
-        maybe_url = node.attrs.get(attr, None)
+        maybe_url = node.get_attr(attr, None)
        if not _is_nonempty_str(maybe_url):
            raise RuntimeError(
                f"One of entries for `Component.Media.{script_type}` media is missing a "
-                f"value for attribute '{attr}'. If there is content inlined inside the `<{node.tag}>` tags, "
+                f"value for attribute '{attr}'. If there is content inlined inside the `<{node.name()}>` tags, "
                f"you must move the content to a `.{script_type}` file and reference it via '{attr}'.\nGot:\n{tag}"
            )
@ -739,11 +726,48 @@ def _gen_exec_script(
    return exec_script
-def _escape_js(js: str, eval: bool = True) -> str:
+def _insert_js_css_to_default_locations(
-    escaped_js = escape_js_string_literal(js)
+    html_content: str,
-    # `unescapeJs` is the function we call in the browser to parse the escaped JS
+    js_content: Optional[str],
-    escaped_js = f"Components.unescapeJs(`{escaped_js}`)"
+    css_content: Optional[str],
-    return f"eval({escaped_js})" if eval else escaped_js
+) -> Optional[str]:
    """
    This function tries to insert the JS and CSS content into the default locations.
    JS is inserted at the end of `<body>`, and CSS is inserted at the end of `<head>`.
    """
    elems = SoupNode.from_fragment(html_content)
    if not elems:
        return None
    did_modify_html = False
    if css_content is not None:
        for elem in elems:
            if not elem.is_element():
                continue
            head = elem.find_tag("head")
            if head:
                css_elems = SoupNode.from_fragment(css_content)
                head.append_children(css_elems)
                did_modify_html = True
    if js_content is not None:
        for elem in elems:
            if not elem.is_element():
                continue
            body = elem.find_tag("body")
            if body:
                js_elems = SoupNode.from_fragment(js_content)
                body.append_children(js_elems)
                did_modify_html = True
    if did_modify_html:
        transformed = SoupNode.to_html_multiroot(elems)
        return transformed
    else:
        return None  # No changes made
 #########################################################
@ -802,27 +826,27 @@ class ComponentDependencyMiddleware:
    """
    def __init__(self, get_response: "Callable[[HttpRequest], HttpResponse]") -> None:
-        self.get_response = get_response
+        self._get_response = get_response
        # NOTE: Required to work with async
-        if iscoroutinefunction(self.get_response):
+        if iscoroutinefunction(self._get_response):
            markcoroutinefunction(self)
    def __call__(self, request: HttpRequest) -> HttpResponseBase:
        if iscoroutinefunction(self):
            return self.__acall__(request)
-        response = self.get_response(request)
+        response = self._get_response(request)
-        response = self.process_response(response)
+        response = self._process_response(response)
        return response
    # NOTE: Required to work with async
    async def __acall__(self, request: HttpRequest) -> HttpResponseBase:
-        response = await self.get_response(request)
+        response = await self._get_response(request)
-        response = self.process_response(response)
+        response = self._process_response(response)
        return response
-    def process_response(self, response: HttpResponse) -> HttpResponse:
+    def _process_response(self, response: HttpResponse) -> HttpResponse:
        if not isinstance(response, StreamingHttpResponse) and response.get("Content-Type", "").startswith(
            "text/html"
        ):
--- a/src/django_components/util/html.py
+++ b/src/django_components/util/html.py
@ -1,100 +1,108 @@
-from typing import List, Union
+from abc import ABC, abstractmethod
 from typing import Any, List, Optional, Sequence
-from selectolax.lexbor import LexborHTMLParser, LexborNode
+from bs4 import BeautifulSoup, CData, Comment, Doctype, Tag
-def parse_node(html: str) -> LexborNode:
+class HTMLNode(ABC):
    """
-    Use this when you know the given HTML is a single node like
+    Interface for an HTML manipulation library. This allows us to potentially swap
-
+    between different libraries.
    `<div> Hi </div>`
    """
-    tree = LexborHTMLParser(html)
+
-    # NOTE: The parser automatically places <style> tags inside <head>
+    @classmethod
-    # while <script> tags are inside <body>.
+    @abstractmethod
-    return tree.body.child or tree.head.child  # type: ignore[union-attr, return-value]
+    def from_fragment(cls, html: str) -> Sequence["HTMLNode"]: ...  # noqa: E704
    @abstractmethod
    def to_html(self) -> str:
        """Serialize this node back into an HTML string."""
        ...
    @abstractmethod
    def name(self) -> str:
        """
        Get tag name.

        For example, the `SoupNode` implementation returns the `name` of the node
        it wraps, so a node parsed from `<div>...</div>` yields `"div"`.
        """
        ...
    @abstractmethod
    def find_tag(self, tag: str) -> Optional["HTMLNode"]:
        """
        Look up a node by its tag name.

        In the `SoupNode` implementation this returns the node itself when its own
        tag name equals `tag`, otherwise the first matching descendant, and `None`
        when nothing matches.
        """
        ...
    @abstractmethod
    def append_children(self, children: Sequence[Any]) -> None:
        """Append the given nodes, in order, as children of this node."""
        ...
    @abstractmethod
    def get_attr(self, attr: str, default: Any = None) -> Any:
        """Return the value of the attribute `attr`, or `default` when it is not available."""
        ...
    @abstractmethod
    def set_attr(self, attr: str, value: Any) -> None:
        """
        Set the attribute `attr` on this node to `value`.

        In the `SoupNode` implementation, `True` produces a boolean attribute
        without a value and `False` removes the attribute.
        """
        ...
    @abstractmethod
    def is_element(self) -> bool:
        """Returns `False` if the node is a text, comment, or doctype node. `True` otherwise."""
        ...
    @classmethod
    def to_html_multiroot(cls, elems: Sequence["HTMLNode"]) -> str:
        return "".join([elem.to_html() for elem in elems])
-def parse_document_or_nodes(html: str) -> Union[List[LexborNode], LexborHTMLParser]:
+class SoupNode(HTMLNode):
-    """
+    """BeautifulSoup implementation of HTMLNode."""
    Use this if you do NOT know whether the given HTML is a full document
    with `<html>`, `<head>`, and `<body>` tags, or an HTML fragment.
    """
    html = html.strip()
    tree = LexborHTMLParser(html)
    is_fragment = is_html_parser_fragment(html, tree)
-    if is_fragment:
+    def __init__(self, node: Tag):
-        nodes = parse_multiroot_html(html)
+        self.node = node
        return nodes
    else:
        return tree
    @classmethod
    def from_fragment(cls, html: str) -> List["SoupNode"]:
        """
        Parse `html` with BeautifulSoup's `"html.parser"` and wrap every top-level node.

        The result mirrors the parsed document's `contents`, so besides elements
        it can also hold top-level text (e.g. whitespace), comment, or doctype nodes.
        """
        parsed = BeautifulSoup(html, "html.parser")
        # One wrapper per top-level node of the fragment
        return list(map(cls, parsed.contents))
-def parse_multiroot_html(html: str) -> List[LexborNode]:
+    def to_html(self) -> str:
-    """
+        if isinstance(self.node, CData):
-    Use this when you know the given HTML is a multiple nodes like
+            return f"<![CDATA[{self.node}]]>"
        elif isinstance(self.node, Comment):
            return f"<!-- {self.node} -->"
        elif isinstance(self.node, Doctype):
            return f"<!DOCTYPE {self.node}>"
        else:
            return str(self.node)
-    `<div> Hi </div> <span> Hello </span>`
+    def name(self) -> str:
-    """
+        return self.node.name
    # NOTE: HTML / XML MUST have a single root. So, to support multiple
    # top-level elements, we wrap them in a dummy singular root.
    parser = LexborHTMLParser(f"<root>{html}</root>")
-    # Get all contents of the root
+    def find_tag(self, tag: str) -> Optional["SoupNode"]:
-    root_elem = parser.css_first("root")
+        if isinstance(self.node, Tag) and self.node.name == tag:
-    elems = [*root_elem.iter()] if root_elem else []
+            return self
-    return elems
+        else:
            match = self.node.select_one(tag)
            if match:
                return SoupNode(match)
        return None
    def append_children(self, children: Sequence["SoupNode"]) -> None:
        """
        Append the wrapped node of each given child to this node.

        Does nothing when this node is not a `Tag`.
        """
        if not isinstance(self.node, Tag):
            return
        for new_child in children:
            self.node.append(new_child.node)
-def is_html_parser_fragment(html: str, tree: LexborHTMLParser) -> bool:
+    def get_attr(self, attr: str, default: Any = None) -> Any:
-    # If we pass only an HTML fragment to the parser, like `<div>123</div>`, then
+        if isinstance(self.node, Tag):
-    # the parser automatically wraps it in `<html>`, `<head>`, and `<body>` tags.
+            res = self.node.get(attr, default)
-    #
+            if isinstance(res, list):
-    # <html>
+                return " ".join(res)
-    #   <head>
+            return res
-    #   </head>
+        return default
    #   <body>
    #     <div>123</div>
    #   </body>
    # </html>
    #
    # But also, as described in Lexbor (https://github.com/lexbor/lexbor/issues/183#issuecomment-1611975340),
    # if the parser first comes across HTML tags that could go into the `<head>`,
    # it will put them there, and then put the rest in `<body>`.
    #
    # So `<link href="..." /><div></div>` will be parsed as
    #
    # <html>
    #   <head>
    #     <link href="..." />
    #   </head>
    #   <body>
    #     <div>123</div>
    #   </body>
    # </html>
    #
    # BUT, if we're dealing with a fragment, we want to parse it correctly as
    # a multi-root fragment:
    #
    # <link href="..." />
    # <div>123</div>
    #
    # The way do so is that we:
    # 1. Take the original HTML string
    # 2. Subtract the content of parsed `<head>` from the START of the original HTML
    # 3. Subtract the content of parsed `<body>` from the END of the original HTML
    # 4. Then, if we have an HTML fragment, we should be left with empty string (maybe whitespace?).
    # 5. But if we have an HTML document, then the "space between" should contain text,
    #    because we didn't account for the length of `<html>`, `<head>`, `<body>` tags.
    #
    # TODO: Replace with fragment parser?
    #       See https://github.com/rushter/selectolax/issues/74#issuecomment-2404470344
    parsed_head_html: str = tree.head.html  # type: ignore
    parsed_body_html: str = tree.body.html  # type: ignore
    head_content = parsed_head_html[len("<head>") : -len("</head>")]  # noqa: E203
    body_content = parsed_body_html[len("<body>") : -len("</body>")]  # noqa: E203
    between_content = html[len(head_content) : -len(body_content)].strip()  # noqa: E203
-    is_fragment = not html or not between_content
+    def set_attr(self, attr: str, value: Any) -> None:
-    return is_fragment
+        if not isinstance(self.node, Tag):
            return
        if value is True:
            # Set boolean attributes without a value
            self.node[attr] = None
        elif value is False:
            # Remove the attribute
            self.node.attrs.pop(attr, None)
        else:
            self.node[attr] = value
    def is_element(self) -> bool:
        """Return `True` when the wrapped node is a BeautifulSoup `Tag`, `False` otherwise."""
        return isinstance(self.node, Tag)
--- a/src/django_components/util/misc.py
+++ b/src/django_components/util/misc.py
@ -77,3 +77,10 @@ def get_last_index(lst: List, key: Callable[[Any], bool]) -> Optional[int]:
        if key(item):
            return len(lst) - 1 - index
    return None
 def _escape_js(js: str, eval: bool = True) -> str:
    """
    Escape `js` and wrap it in a `Components.unescapeJs(...)` call.

    When `eval` is true (the default), the call is additionally wrapped
    in `eval(...)`; otherwise only the `Components.unescapeJs(...)` expression
    is returned.
    """
    # `unescapeJs` is the function we call in the browser to parse the escaped JS
    unescape_call = f"Components.unescapeJs(`{escape_js_string_literal(js)}`)"
    if not eval:
        return unescape_call
    return f"eval({unescape_call})"
--- a/tests/test_dependencies.py
+++ b/tests/test_dependencies.py
@ -2,11 +2,11 @@ from unittest.mock import Mock
 from django.http import HttpResponseNotModified
 from django.template import Context, Template
 from selectolax.lexbor import LexborHTMLParser
 from django_components import Component, registry, render_dependencies, types
 from django_components.components.dynamic import DynamicComponent
 from django_components.middleware import ComponentDependencyMiddleware
 from django_components.util.html import SoupNode
 from .django_test_setup import setup_test_config
 from .testutils import BaseTestCase, create_and_process_template_response
@ -224,7 +224,9 @@ class RenderDependenciesTests(BaseTestCase):
            count=1,
        )
-        rendered_body = LexborHTMLParser(rendered).body.html  # type: ignore[union-attr]
+        # Nodes: [Doctype, whitespace, <html>]
        nodes = SoupNode.from_fragment(rendered.strip())
        rendered_body = nodes[2].find_tag("body").to_html()  # type: ignore[union-attr]
        self.assertInHTML(
            """<script src="django_components/django_components.min.js">""",
@ -274,7 +276,9 @@ class RenderDependenciesTests(BaseTestCase):
            count=1,
        )
-        rendered_head = LexborHTMLParser(rendered).head.html  # type: ignore[union-attr]
+        # Nodes: [Doctype, whitespace, <html>]
        nodes = SoupNode.from_fragment(rendered.strip())
        rendered_head = nodes[2].find_tag("head").to_html()  # type: ignore[union-attr]
        self.assertInHTML(
            """<script src="django_components/django_components.min.js">""",
@ -287,6 +291,142 @@ class RenderDependenciesTests(BaseTestCase):
            count=1,
        )
    # NOTE: Some HTML parser libraries like selectolax or lxml try to "correct" the given HTML.
    #       We want to avoid this behavior, so user gets the exact same HTML back.
    def test_does_not_try_to_add_close_tags(self):
        # A lone, unclosed `<thead>` rendered as a fragment must come back as `<thead>`,
        # i.e. `render_dependencies` must not "repair" it by adding a closing tag.
        registry.register(name="test", component=SimpleComponent)
        template_str: types.django_html = """
            <thead>
        """
        # NOTE: The `formset` context variable is not referenced by this template.
        rendered_raw = Template(template_str).render(Context({"formset": [1]}))
        rendered = render_dependencies(rendered_raw, type="fragment")
        self.assertHTMLEqual(rendered, "<thead>")
    def test_does_not_modify_html_when_no_component_used(self):
        # The component is registered but never used in the template below.
        registry.register(name="test", component=SimpleComponent)
        # Plain Django template (no `{% component %}` tags) that renders a table with one row per `formset` item.
        template_str: types.django_html = """
            <table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
                <!-- Table head -->
                <thead>
                    <tr class="py-0 my-0 h-7">
                        <!-- Empty row -->
                        <th class="min-w-12">#</th>
                    </tr>
                </thead>
                <!-- Table body -->
                <tbody id="items" class="divide-y divide-slate-300">
                    {% for form in formset %}
                        {% with row_number=forloop.counter %}
                            <tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
                                aria-rowindex="{{ row_number }}">
                                <!-- row num -->
                                <td class="whitespace-nowrap w-fit text-center px-4 w-px"
                                    aria-colindex="1">
                                    {{ row_number }}
                                </td>
                            </tr>
                        {% endwith %}
                    {% endfor %}
                </tbody>
            </table>
        """
        # Render with a single-item formset, then post-process the output as a fragment.
        rendered_raw = Template(template_str).render(Context({"formset": [1]}))
        rendered = render_dependencies(rendered_raw, type="fragment")
        # Expected output: the same table with the template tags resolved (one row, row number 1)
        # and nothing else added.
        expected = """
            <table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
                <!-- Table head -->
                <thead>
                    <tr class="py-0 my-0 h-7">
                        <!-- Empty row -->
                        <th class="min-w-12">#</th>
                    </tr>
                </thead>
                <!-- Table body -->
                <tbody id="items" class="divide-y divide-slate-300">
                    <tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
                        aria-rowindex="1">
                        <!-- row num -->
                        <td class="whitespace-nowrap w-fit text-center px-4 w-px"
                            aria-colindex="1">
                            1
                        </td>
                    </tr>
                </tbody>
            </table>
        """
        self.assertHTMLEqual(expected, rendered)
    # Explanation: The component is used in the template, but the template doesn't use
    # {% component_js_dependencies %} or {% component_css_dependencies %} tags,
    # nor defines a `<head>` or `<body>` tag. In which case, the dependencies are not rendered.
    def test_does_not_modify_html_when_component_used_but_nowhere_to_insert(self):
        registry.register(name="test", component=SimpleComponent)
        # The template renders the "test" component inside a table cell, but contains
        # no `<head>` / `<body>` tags and no dependency placeholder tags.
        template_str: types.django_html = """
            {% load component_tags %}
            <table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
                <!-- Table head -->
                <thead>
                    <tr class="py-0 my-0 h-7">
                        <!-- Empty row -->
                        <th class="min-w-12">#</th>
                    </tr>
                </thead>
                <!-- Table body -->
                <tbody id="items" class="divide-y divide-slate-300">
                    {% for form in formset %}
                        {% with row_number=forloop.counter %}
                            <tr class=" hover:bg-gray-200 py-0 {% cycle 'bg-white' 'bg-gray-50' %} divide-x "
                                aria-rowindex="{{ row_number }}">
                                <!-- row num -->
                                <td class="whitespace-nowrap w-fit text-center px-4 w-px"
                                    aria-colindex="1">
                                    {{ row_number }}
                                    {% component "test" variable="hi" / %}
                                </td>
                            </tr>
                        {% endwith %}
                    {% endfor %}
                </tbody>
            </table>
        """
        # Render with a single-item formset, then post-process the output as a fragment.
        rendered_raw = Template(template_str).render(Context({"formset": [1]}))
        rendered = render_dependencies(rendered_raw, type="fragment")
        # Expected output: the table with the component's own markup in the cell,
        # and no JS / CSS dependency tags.
        expected = """
            <table class="table-auto border-collapse divide-y divide-x divide-slate-300 w-full">
                <!-- Table head -->
                <thead>
                    <tr class="py-0 my-0 h-7">
                        <!-- Empty row -->
                        <th class="min-w-12">#</th>
                    </tr>
                </thead>
                <!-- Table body -->
                <tbody id="items" class="divide-y divide-slate-300">
                    <tr class=" hover:bg-gray-200 py-0 bg-white divide-x "
                        aria-rowindex="1">
                        <!-- row num -->
                        <td class="whitespace-nowrap w-fit text-center px-4 w-px"
                            aria-colindex="1">
                            1
                            Variable: <strong>hi</strong>
                        </td>
                    </tr>
                </tbody>
            </table>
        """
        self.assertHTMLEqual(expected, rendered)
 class MiddlewareTests(BaseTestCase):
    def test_middleware_response_without_content_type(self):
--- a/tests/test_html.py
+++ b/tests/test_html.py
@ -1,14 +1,6 @@
 from typing import List, cast
 from django.test import TestCase
 from selectolax.lexbor import LexborHTMLParser, LexborNode
-from django_components.util.html import (
+from django_components.util.html import SoupNode
    is_html_parser_fragment,
    parse_document_or_nodes,
    parse_multiroot_html,
    parse_node,
 )
 from .django_test_setup import setup_test_config
@ -16,50 +8,26 @@ setup_test_config({"autodiscover": False})
 class HtmlTests(TestCase):
-    def test_parse_node(self):
+    def test_beautifulsoup_impl(self):
-        node = parse_node(
+        nodes = SoupNode.from_fragment(
            """
            <div class="abc xyz" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
            </div>
-            """
+            <!-- I'M COMMENT -->
-        )
+            <button>
-        node.attrs["id"] = "my-id"  # type: ignore[index]
+                Click me!
-        node.css("li")[0].attrs["class"] = "item"  # type: ignore[index]
+            </button>
-
+            """.strip()
        self.assertHTMLEqual(
            node.html,
            """
            <div class="abc xyz" data-id="123" id="my-id">
                <ul>
                    <li class="item">Hi</li>
                </ul>
            </div>
            """,
        )
-    def test_parse_multiroot_html(self):
+        # Items: <div>, whitespace, comment, whitespace, <button>
-        html = """
+        self.assertEqual(len(nodes), 5)
            <div class="abc xyz" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
            </div>
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            <span>
                Hello
            </span>
        """
        nodes = parse_multiroot_html(html)
        self.assertHTMLEqual(
-            nodes[0].html,
+            nodes[0].to_html(),
            """
            <div class="abc xyz" data-id="123">
                <ul>
@ -69,87 +37,37 @@ class HtmlTests(TestCase):
            """,
        )
        self.assertHTMLEqual(
-            nodes[1].html,
+            nodes[2].to_html(),
-            """
+            "<!-- I&#x27;M COMMENT -->",
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            """,
        )
        self.assertHTMLEqual(
-            nodes[2].html,
+            nodes[4].to_html(),
            """
-            <span>
+            <button>
-                Hello
+                Click me!
-            </span>
+            </button>
            """,
        )
-    def test_is_html_parser_fragment(self):
+        self.assertEqual(nodes[0].name(), "div")
-        fragment_html = """
+        self.assertEqual(nodes[4].name(), "button")
            <div class="abc xyz" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
            </div>
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            <span>
                Hello
            </span>
        """
        fragment_tree = LexborHTMLParser(fragment_html)
        fragment_result = is_html_parser_fragment(fragment_html, fragment_tree)
-        self.assertEqual(fragment_result, True)
+        self.assertEqual(nodes[0].is_element(), True)
        self.assertEqual(nodes[2].is_element(), False)
        self.assertEqual(nodes[4].is_element(), True)
-        doc_html = """
+        self.assertEqual(nodes[0].get_attr("class"), "abc xyz")
-            <!doctype html>
+        self.assertEqual(nodes[4].get_attr("class"), None)
            <html>
              <head>
                <link href="https://..." />
              </head>
              <body>
                <div class="abc xyz" data-id="123">
                    <ul>
                        <li>Hi</li>
                    </ul>
                </div>
              </body>
            </html>
        """
        doc_tree = LexborHTMLParser(doc_html)
        doc_result = is_html_parser_fragment(doc_html, doc_tree)
-        self.assertEqual(doc_result, False)
+        nodes[0].set_attr("class", "123 456")
-
+        nodes[4].set_attr("class", "abc def")
-    def test_parse_document_or_nodes__fragment(self):
+        self.assertEqual(nodes[0].get_attr("class"), "123 456")
-        fragment_html = """
+        self.assertEqual(nodes[4].get_attr("class"), "abc def")
            <div class="abc xyz" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
            </div>
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            <span>
                Hello
            </span>
        """
        fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
        self.assertHTMLEqual(
-            fragment_result[0].html,
+            nodes[0].to_html(),
            """
-            <div class="abc xyz" data-id="123">
+            <div class="123 456" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
@ -157,111 +75,53 @@ class HtmlTests(TestCase):
            """,
        )
        self.assertHTMLEqual(
-            fragment_result[1].html,
+            nodes[4].to_html(),
            """
-            <main id="123" class="one">
+            <button class="abc def">
-                <div>
+                Click me!
-                    42
+            </button>
                </div>
            </main>
            """,
        )
        self.assertHTMLEqual(
            fragment_result[2].html,
            """
            <span>
                Hello
            </span>
            """,
        )
-    def test_parse_document_or_nodes__mixed(self):
+        # Setting attr to `True` will set it to boolean attribute,
-        fragment_html = """
+        # while setting it to `False` will remove the attribute.
-            <link href="" />
+        nodes[4].set_attr("disabled", True)
            <div class="abc xyz" data-id="123">
                <ul>
                    <li>Hi</li>
                </ul>
            </div>
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            <span>
                Hello
            </span>
        """
        fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
        self.assertHTMLEqual(
-            fragment_result[0].html,
+            nodes[4].to_html(),
            """
-            <link href="" />
+            <button class="abc def" disabled>
                Click me!
            </button>
            """,
        )
        nodes[4].set_attr("disabled", False)
        self.assertHTMLEqual(
-            fragment_result[1].html,
+            nodes[4].to_html(),
            """
-            <div class="abc xyz" data-id="123">
+            <button class="abc def">
-                <ul>
+                Click me!
-                    <li>Hi</li>
+            </button>
                </ul>
            </div>
            """,
        )
        self.assertHTMLEqual(
            fragment_result[2].html,
            """
            <main id="123" class="one">
                <div>
                    42
                </div>
            </main>
            """,
        )
        self.assertHTMLEqual(
            fragment_result[3].html,
            """
            <span>
                Hello
            </span>
            """,
        )
-    def test_parse_document_or_nodes__doc(self):
+        # Return self
-        doc_html = """
+        self.assertEqual(nodes[0].node, nodes[0].find_tag("div").node)  # type: ignore[union-attr]
-            <!doctype html>
+        # Return descendant
-            <html>
+        li = nodes[0].find_tag("li")
-              <head>
+        self.assertHTMLEqual(li.to_html(), "<li>Hi</li>")  # type: ignore[union-attr]
-                <link href="https://..." />
+        # Return None when not found
-              </head>
+        self.assertEqual(nodes[0].find_tag("main"), None)
              <body>
                <div class="abc xyz" data-id="123">
                    <ul>
                        <li>Hi</li>
                    </ul>
                </div>
              </body>
            </html>
        """
        fragment_result = cast(LexborHTMLParser, parse_document_or_nodes(doc_html))
        # Insert children
        li.append_children([nodes[4]])  # type: ignore[union-attr]
        self.assertHTMLEqual(
-            fragment_result.html,
+            li.to_html(),  # type: ignore[union-attr]
            """
-            <!doctype html>
+            <li>
-            <html>
+                Hi
-              <head>
+                <button class="abc def">
-                <link href="https://..." />
+                    Click me!
-              </head>
+                </button>
-              <body>
+            </li>
                <div class="abc xyz" data-id="123">
                    <ul>
                        <li>Hi</li>
                    </ul>
                </div>
              </body>
            </html>
            """,
        )