refactor: replace selectolax with beautifulsoup (#823)

* refactor: replace selectolax with beautifulsoup

* refactor: add tests for html parser impl

* refactor: add missing import

* refactor: fix tests

* refactor: fix linter issues

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Juro Oravec 2024-12-08 08:42:48 +01:00 committed by GitHub
parent c61847d30d
commit 1cd545b986
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 391 additions and 351 deletions

View file

@ -1,14 +1,6 @@
from typing import List, cast
from django.test import TestCase
from selectolax.lexbor import LexborHTMLParser, LexborNode
from django_components.util.html import (
is_html_parser_fragment,
parse_document_or_nodes,
parse_multiroot_html,
parse_node,
)
from django_components.util.html import SoupNode
from .django_test_setup import setup_test_config
@ -16,50 +8,26 @@ setup_test_config({"autodiscover": False})
class HtmlTests(TestCase):
def test_parse_node(self):
node = parse_node(
def test_beautifulsoup_impl(self):
nodes = SoupNode.from_fragment(
"""
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
"""
)
node.attrs["id"] = "my-id" # type: ignore[index]
node.css("li")[0].attrs["class"] = "item" # type: ignore[index]
self.assertHTMLEqual(
node.html,
"""
<div class="abc xyz" data-id="123" id="my-id">
<ul>
<li class="item">Hi</li>
</ul>
</div>
""",
<!-- I'M COMMENT -->
<button>
Click me!
</button>
""".strip()
)
def test_parse_multiroot_html(self):
html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
nodes = parse_multiroot_html(html)
# Items: <div>, whitespace, comment, whitespace, <button>
self.assertEqual(len(nodes), 5)
self.assertHTMLEqual(
nodes[0].html,
nodes[0].to_html(),
"""
<div class="abc xyz" data-id="123">
<ul>
@ -69,87 +37,37 @@ class HtmlTests(TestCase):
""",
)
self.assertHTMLEqual(
nodes[1].html,
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
nodes[2].to_html(),
"<!-- I&#x27;M COMMENT -->",
)
self.assertHTMLEqual(
nodes[2].html,
nodes[4].to_html(),
"""
<span>
Hello
</span>
<button>
Click me!
</button>
""",
)
def test_is_html_parser_fragment(self):
fragment_html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_tree = LexborHTMLParser(fragment_html)
fragment_result = is_html_parser_fragment(fragment_html, fragment_tree)
self.assertEqual(nodes[0].name(), "div")
self.assertEqual(nodes[4].name(), "button")
self.assertEqual(fragment_result, True)
self.assertEqual(nodes[0].is_element(), True)
self.assertEqual(nodes[2].is_element(), False)
self.assertEqual(nodes[4].is_element(), True)
doc_html = """
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
doc_tree = LexborHTMLParser(doc_html)
doc_result = is_html_parser_fragment(doc_html, doc_tree)
self.assertEqual(nodes[0].get_attr("class"), "abc xyz")
self.assertEqual(nodes[4].get_attr("class"), None)
self.assertEqual(doc_result, False)
def test_parse_document_or_nodes__fragment(self):
fragment_html = """
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
nodes[0].set_attr("class", "123 456")
nodes[4].set_attr("class", "abc def")
self.assertEqual(nodes[0].get_attr("class"), "123 456")
self.assertEqual(nodes[4].get_attr("class"), "abc def")
self.assertHTMLEqual(
fragment_result[0].html,
nodes[0].to_html(),
"""
<div class="abc xyz" data-id="123">
<div class="123 456" data-id="123">
<ul>
<li>Hi</li>
</ul>
@ -157,111 +75,53 @@ class HtmlTests(TestCase):
""",
)
self.assertHTMLEqual(
fragment_result[1].html,
nodes[4].to_html(),
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<span>
Hello
</span>
<button class="abc def">
Click me!
</button>
""",
)
def test_parse_document_or_nodes__mixed(self):
fragment_html = """
<link href="" />
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
<main id="123" class="one">
<div>
42
</div>
</main>
<span>
Hello
</span>
"""
fragment_result = cast(List[LexborNode], parse_document_or_nodes(fragment_html))
# Setting attr to `True` will set it to boolean attribute,
# while setting it to `False` will remove the attribute.
nodes[4].set_attr("disabled", True)
self.assertHTMLEqual(
fragment_result[0].html,
nodes[4].to_html(),
"""
<link href="" />
<button class="abc def" disabled>
Click me!
</button>
""",
)
nodes[4].set_attr("disabled", False)
self.assertHTMLEqual(
fragment_result[1].html,
nodes[4].to_html(),
"""
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
""",
)
self.assertHTMLEqual(
fragment_result[2].html,
"""
<main id="123" class="one">
<div>
42
</div>
</main>
""",
)
self.assertHTMLEqual(
fragment_result[3].html,
"""
<span>
Hello
</span>
<button class="abc def">
Click me!
</button>
""",
)
def test_parse_document_or_nodes__doc(self):
doc_html = """
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
"""
fragment_result = cast(LexborHTMLParser, parse_document_or_nodes(doc_html))
# Return self
self.assertEqual(nodes[0].node, nodes[0].find_tag("div").node) # type: ignore[union-attr]
# Return descendant
li = nodes[0].find_tag("li")
self.assertHTMLEqual(li.to_html(), "<li>Hi</li>") # type: ignore[union-attr]
# Return None when not found
self.assertEqual(nodes[0].find_tag("main"), None)
# Insert children
li.append_children([nodes[4]]) # type: ignore[union-attr]
self.assertHTMLEqual(
fragment_result.html,
li.to_html(), # type: ignore[union-attr]
"""
<!doctype html>
<html>
<head>
<link href="https://..." />
</head>
<body>
<div class="abc xyz" data-id="123">
<ul>
<li>Hi</li>
</ul>
</div>
</body>
</html>
<li>
Hi
<button class="abc def">
Click me!
</button>
</li>
""",
)