feat: replace python html parser with djc-core-html-parser (#929)

Co-authored-by: Emil Stenström <emil@emilstenstrom.se>
This commit is contained in:
Juro Oravec 2025-01-29 14:24:25 +01:00 committed by GitHub
parent 55bcc11f50
commit b69c6e6624
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 62 additions and 1501 deletions

View file

@ -29,6 +29,7 @@ classifiers = [
]
dependencies = [
'Django>=4.2',
'djc-core-html-parser>=1.0',
]
license = {text = "MIT"}
@ -118,6 +119,7 @@ requires = [
[tool.hatch.envs.default]
dependencies = [
"django",
"djc-core-html-parser",
"tox",
"pytest",
"flake8",

View file

@ -1,4 +1,5 @@
django
djc-core-html-parser
tox
pytest
flake8

View file

@ -26,6 +26,8 @@ distlib==0.3.9
# via virtualenv
django==5.1.5
# via -r requirements-dev.in
djc-core-html-parser==1.0.1
# via -r requirements-dev.in
filelock==3.16.1
# via
# tox

View file

@ -33,9 +33,9 @@ from django.templatetags.static import static
from django.urls import path, reverse
from django.utils.decorators import sync_and_async_middleware
from django.utils.safestring import SafeString, mark_safe
from djc_core_html_parser import set_html_attributes
from django_components.node import BaseNode
from django_components.util.html_parser import set_html_attributes
from django_components.util.misc import get_import_path, is_nonempty_str
if TYPE_CHECKING:
@ -438,9 +438,9 @@ SCRIPT_NAME_REGEX = re.compile(
rb"^(?P<comp_cls_hash>[\w\-\./]+?),(?P<id>[\w]+?),(?P<js>[0-9a-f]*?),(?P<css>[0-9a-f]*?)$"
)
# E.g. `data-djc-id-a1b2c3`
MAYBE_COMP_ID = r"(?: data-djc-id-\w{6})?"
MAYBE_COMP_ID = r'(?: data-djc-id-\w{6}="")?'
# E.g. `data-djc-css-99914b`
MAYBE_COMP_CSS_ID = r"(?: data-djc-css-\w{6})?"
MAYBE_COMP_CSS_ID = r'(?: data-djc-css-\w{6}="")?'
PLACEHOLDER_REGEX = re.compile(
r"{css_placeholder}|{js_placeholder}".format(

File diff suppressed because it is too large Load diff

View file

@ -184,7 +184,7 @@ class MainMediaTest(BaseTestCase):
rendered = render_dependencies(rendered_raw)
self.assertIn(
"Variable: <strong data-djc-id-a1bc41>test</strong>",
'Variable: <strong data-djc-id-a1bc41="">test</strong>',
rendered,
)
self.assertInHTML(

View file

@ -508,7 +508,7 @@ class MiddlewareTests(BaseTestCase):
assert_dependencies(rendered1)
self.assertEqual(
rendered1.count("Variable: <strong data-djc-id-a1bc41 data-djc-id-a1bc42>value</strong>"),
rendered1.count('Variable: <strong data-djc-id-a1bc41="" data-djc-id-a1bc42="">value</strong>'),
1,
)
@ -519,7 +519,7 @@ class MiddlewareTests(BaseTestCase):
assert_dependencies(rendered2)
self.assertEqual(
rendered2.count("Variable: <strong data-djc-id-a1bc43 data-djc-id-a1bc44>value</strong>"),
rendered2.count('Variable: <strong data-djc-id-a1bc43="" data-djc-id-a1bc44="">value</strong>'),
1,
)
@ -530,6 +530,6 @@ class MiddlewareTests(BaseTestCase):
assert_dependencies(rendered3)
self.assertEqual(
rendered3.count("Variable: <strong data-djc-id-a1bc45 data-djc-id-a1bc46>value</strong>"),
rendered3.count('Variable: <strong data-djc-id-a1bc45="" data-djc-id-a1bc46="">value</strong>'),
1,
)

View file

@ -266,10 +266,10 @@ class DynamicExprTests(BaseTestCase):
rendered.strip(),
(
"<!-- _RENDERED SimpleComponent_e258c0,a1bc3f,, -->\n"
" <div data-djc-id-a1bc3f></div>\n"
" <div data-djc-id-a1bc3f> abc</div>\n"
" <div data-djc-id-a1bc3f></div>\n"
" <div data-djc-id-a1bc3f> </div>"
' <div data-djc-id-a1bc3f=""></div>\n'
' <div data-djc-id-a1bc3f=""> abc</div>\n'
' <div data-djc-id-a1bc3f=""></div>\n'
' <div data-djc-id-a1bc3f=""> </div>'
),
)
@ -345,11 +345,11 @@ class DynamicExprTests(BaseTestCase):
rendered.strip(),
(
"<!-- _RENDERED SimpleComponent_6c8e94,a1bc3f,, -->\n"
" <div data-djc-id-a1bc3f> lorem ipsum dolor </div>\n"
" <div data-djc-id-a1bc3f> lorem ipsum dolor [{'a': 1}] </div>\n"
" <div data-djc-id-a1bc3f> True </div>\n"
" <div data-djc-id-a1bc3f> [{'a': 1}, {'a': 2}] </div>\n"
" <div data-djc-id-a1bc3f> {'a': 3} </div>"
' <div data-djc-id-a1bc3f=""> lorem ipsum dolor </div>\n'
' <div data-djc-id-a1bc3f=""> lorem ipsum dolor [{\'a\': 1}] </div>\n'
' <div data-djc-id-a1bc3f=""> True </div>\n'
' <div data-djc-id-a1bc3f=""> [{\'a\': 1}, {\'a\': 2}] </div>\n'
' <div data-djc-id-a1bc3f=""> {\'a\': 3} </div>'
),
)

View file

@ -1,7 +1,6 @@
from django.test import TestCase
from typing import List
from django_components.util.html_parser import HTMLTag, _parse_html as parse_html, set_html_attributes
from djc_core_html_parser import set_html_attributes
from .django_test_setup import setup_test_config
@ -14,14 +13,27 @@ class TestHTMLParser(TestCase):
def test_basic_transformation(self):
html = "<div><p>Hello</p></div>"
result, _ = set_html_attributes(html, root_attributes=["data-root"], all_attributes=["data-all"])
expected = "<div data-root data-all><p data-all>Hello</p></div>"
assert result == expected
self.assertHTMLEqual(
result,
"""
<div data-root data-all>
<p data-all>Hello</p>
</div>
""",
)
def test_multiple_roots(self):
html = "<div>First</div><span>Second</span>"
result, _ = set_html_attributes(html, root_attributes=["data-root"], all_attributes=["data-all"])
expected = "<div data-root data-all>First</div><span data-root data-all>Second</span>"
assert result == expected
self.assertHTMLEqual(
result,
"""
<div data-root data-all>First</div>
<span data-root data-all>Second</span>
""",
)
def test_complex_html(self):
html = """
@ -69,7 +81,7 @@ class TestHTMLParser(TestCase):
<p data-all data-v-123>&copy; 2024</p>
</footer>
""" # noqa: E501
assert result == expected
self.assertHTMLEqual(result, expected)
def test_void_elements(self):
test_cases = [
@ -81,7 +93,7 @@ class TestHTMLParser(TestCase):
for input_html, expected in test_cases:
result, _ = set_html_attributes(input_html, ["data-root"], ["data-v-123"])
assert result == expected
self.assertHTMLEqual(result, expected)
def test_html_head_with_meta(self):
html = """
@ -94,14 +106,17 @@ class TestHTMLParser(TestCase):
result, _ = set_html_attributes(html, ["data-root"], ["data-v-123"])
expected = """
self.assertHTMLEqual(
result,
"""
<head data-root data-v-123>
<meta charset="utf-8" data-v-123>
<title data-v-123>Test Page</title>
<link rel="stylesheet" href="style.css" data-v-123>
<meta name="description" content="Test" data-v-123>
</head>"""
assert result == expected
</head>
""",
)
def test_watch_attribute(self):
html = """
@ -112,24 +127,27 @@ class TestHTMLParser(TestCase):
</div>"""
result, captured = set_html_attributes(html, ["data-root"], ["data-v-123"], watch_on_attribute="data-id")
expected = """
self.assertHTMLEqual(
result,
"""
<div data-id="123" data-root data-v-123>
<p data-v-123>Regular element</p>
<span data-id="456" data-v-123>Nested element</span>
<img data-id="789" src="test.jpg" data-v-123/>
</div>"""
assert result == expected
</div>
""",
)
# Verify attribute capturing
assert len(captured) == 3
self.assertEqual(len(captured), 3)
# Root element should have both root and all attributes
assert captured["123"] == ["data-root", "data-v-123"]
self.assertEqual(captured["123"], ["data-root", "data-v-123"])
# Non-root elements should only have all attributes
assert captured["456"] == ["data-v-123"]
assert captured["789"] == ["data-v-123"]
self.assertEqual(captured["456"], ["data-v-123"])
self.assertEqual(captured["789"], ["data-v-123"])
def test_whitespace_preservation(self):
html = """<div>
@ -138,340 +156,9 @@ class TestHTMLParser(TestCase):
</div>"""
result, _ = set_html_attributes(html, ["data-root"], ["data-all"])
expected = """<div data-root data-all>
<p data-all> Hello World </p>
<span data-all> Text with spaces </span>
expected = """<div data-root="" data-all="">
<p data-all=""> Hello World </p>
<span data-all=""> Text with spaces </span>
</div>"""
assert result == expected
# This checks that the parser works irrespective of the main use case
class TestHTMLParserInternal(TestCase):
def test_parse_simple_tag(self):
processed_tags = []
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
processed_tags.append(tag)
html = "<div>Hello</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, html)
self.assertEqual(len(processed_tags), 1)
self.assertEqual(processed_tags[0].name, "div")
def test_parse_nested_tags(self):
processed_tags = []
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
processed_tags.append((tag.name, len(tag_stack)))
html = "<div><p>Hello</p></div>"
result = parse_html(html, on_tag)
self.assertEqual(result, html)
self.assertEqual(len(processed_tags), 2)
self.assertEqual(processed_tags[0], ("p", 2)) # p tag with stack depth 2
self.assertEqual(processed_tags[1], ("div", 1)) # div tag with stack depth 1
def test_parse_attributes(self):
processed_tags = []
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
processed_tags.append(tag)
html = '<div class="container" id="main">Hello</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, html)
self.assertEqual(len(processed_tags), 1)
self.assertEqual(len(processed_tags[0].attrs), 2)
self.assertEqual(processed_tags[0].attrs[0].key, "class")
self.assertEqual(processed_tags[0].attrs[0].value, "container")
self.assertEqual(processed_tags[0].attrs[1].key, "id")
self.assertEqual(processed_tags[0].attrs[1].value, "main")
def test_void_elements(self):
processed_tags = []
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
processed_tags.append(tag)
html = '<img src="test.jpg" />'
result = parse_html(html, on_tag)
self.assertEqual(result, html)
self.assertEqual(len(processed_tags), 1)
self.assertEqual(processed_tags[0].name, "img")
self.assertEqual(processed_tags[0].attrs[0].key, "src")
self.assertEqual(processed_tags[0].attrs[0].value, "test.jpg")
def test_add_attr(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.add_attr("data-test", "value", quoted=True)
tag.add_attr("hidden", None, quoted=False)
html = "<div>Content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, '<div data-test="value" hidden>Content</div>')
def test_rename_attr(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.rename_attr("class", "className")
html = '<div class="test">Content</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, '<div className="test">Content</div>')
def test_delete_attr(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.delete_attr("id")
html = '<div class="test" id="main">Content</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, '<div class="test" >Content</div>')
def test_clear_attrs(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.clear_attrs()
html = '<div class="test" id="main" data-value="123">Content</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, "<div >Content</div>")
def test_add_after_clearing_attrs(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.clear_attrs()
tag.add_attr("data-test", "value", quoted=True)
html = '<div class="test" id="main" data-value="123">Content</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, '<div data-test="value">Content</div>')
def test_insert_content(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.insert_content("Start ", 0)
tag.insert_content(" End", -1)
html = "<div>Content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "<div>Start Content End</div>")
def test_clear_content(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.clear_content()
html = "<div>Original content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "<div></div>")
def test_replace_content(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.replace_content("New content")
html = "<div>Original content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "<div>New content</div>")
def test_prepend_append(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.prepend("Before ")
tag.append(" after")
html = "<div>Content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "Before <div>Content</div> after")
def test_wrap(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.wrap('<section class="wrapper">', "</section>")
html = "<div>Content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, '<section class="wrapper"><div>Content</div></section>')
def test_unwrap(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
if tag.name == "span":
tag.unwrap()
html = "<div><span>Content</span></div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "<div>Content</div>")
def test_rename_tag(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
tag.rename_tag("article")
html = "<div>Content</div>"
result = parse_html(html, on_tag)
self.assertEqual(result, "<article>Content</article>")
def test_get_attr_has_attr(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
assert tag.has_attr("class")
assert not tag.has_attr("id")
attr = tag.get_attr("class")
assert attr is not None and attr.value == "test"
assert tag.get_attr("id") is None
html = '<div class="test">Content</div>'
result = parse_html(html, on_tag)
self.assertEqual(result, html)
def test_tag_manipulation_complex(self):
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
if tag.name == "div":
# Test add_attr
tag.add_attr("data-new", "value", quoted=True)
# Test rename_attr
tag.rename_attr("class", "className")
# Test delete_attr
tag.delete_attr("id")
# Test insert_content
tag.insert_content("<span>Start</span>", 0)
tag.insert_content("<span>End</span>", -1)
# Test wrap
tag.wrap("<section>", "</section>")
elif tag.name == "p":
# Test get_attr and has_attr
assert tag.has_attr("class")
attr = tag.get_attr("class")
assert attr is not None and attr.value == "inner"
# Test clear_attrs
tag.clear_attrs()
# Test clear_content and replace_content
tag.clear_content()
tag.replace_content("New content")
# Test prepend and append
tag.prepend("Before ")
tag.append(" after")
# Test rename_tag
tag.rename_tag("article")
# Test unwrap
tag.unwrap()
html = '<div class="test" id="main"><p class="inner">Original content</p></div>'
expected = '<section><div className="test" data-new="value"><span>Start</span>Before New content after<span>End</span></div></section>' # noqa: E501
result = parse_html(html, on_tag)
self.assertEqual(result, expected)
def test_complex_html(self):
processed_tags = []
def on_tag(tag: HTMLTag, tag_stack: List[HTMLTag]) -> None:
processed_tags.append(tag)
if tag.name == "body":
# Test attribute manipulation
tag.add_attr("data-modified", "true", quoted=True)
tag.rename_attr("class", "className")
elif tag.name == "div":
# Test content manipulation
tag.insert_content("<!-- Modified -->", 0)
tag.wrap('<div class="wrapper">', "</div>")
elif tag.name == "p":
# Test attribute without value
tag.add_attr("hidden", None, quoted=False)
html = """<!DOCTYPE html>
<html lang="en" data-theme="light">
<!-- Header section -->
<head>
<meta charset="UTF-8"/>
<title>Complex Test</title>
<link rel="stylesheet" href="style.css">
<script type="text/javascript">
// Single line comment with tags: <div></div>
/* Multi-line comment
</script>
*/
const template = `<div>${value}</div>`;
console.log('</script>');
</script>
</head>
<body class="main" id="content" data-loaded>
<![CDATA[
Some CDATA content with <tags> that should be preserved
]]>
<div class="container" style="display: flex">
<img src="test.jpg" alt="Test Image"/>
<p>Hello <strong>World</strong>!</p>
<input type="text" disabled value="test"/>
</div>
</body>
</html>"""
expected = """<!DOCTYPE html>
<html lang="en" data-theme="light">
<!-- Header section -->
<head>
<meta charset="UTF-8"/>
<title>Complex Test</title>
<link rel="stylesheet" href="style.css">
<script type="text/javascript">
// Single line comment with tags: <div></div>
/* Multi-line comment
</script>
*/
const template = `<div>${value}</div>`;
console.log('</script>');
</script>
</head>
<body className="main" id="content" data-loaded data-modified="true">
<![CDATA[
Some CDATA content with <tags> that should be preserved
]]>
<div class="wrapper"><div class="container" style="display: flex"><!-- Modified -->
<img src="test.jpg" alt="Test Image"/>
<p hidden>Hello <strong>World</strong>!</p>
<input type="text" disabled value="test"/>
</div></div>
</body>
</html>"""
result = parse_html(html, on_tag)
self.assertEqual(result, expected)
# Verify the structure of processed tags
self.assertEqual(len(processed_tags), 12) # Count all non-void elements
# Verify specific tag attributes
html_tag = next(tag for tag in processed_tags if tag.name == "html")
self.assertEqual(len(html_tag.attrs), 2)
self.assertEqual(html_tag.attrs[0].key, "lang")
self.assertEqual(html_tag.attrs[0].value, "en")
self.assertEqual(html_tag.attrs[1].key, "data-theme")
self.assertEqual(html_tag.attrs[1].value, "light")
# Verify void elements
img_tag = next(tag for tag in processed_tags if tag.name == "img")
self.assertEqual(len(img_tag.attrs), 2)
self.assertEqual(img_tag.attrs[0].key, "src")
self.assertEqual(img_tag.attrs[0].value, "test.jpg")
# Verify attribute without value
body_tag = next(tag for tag in processed_tags if tag.name == "body")
data_loaded_attr = next(attr for attr in body_tag.attrs if attr.key == "data-loaded")
self.assertIsNone(data_loaded_attr.value)
# Verify modified attributes
self.assertTrue(any(attr.key == "data-modified" and attr.value == "true" for attr in body_tag.attrs))
self.assertTrue(any(attr.key == "className" and attr.value == "main" for attr in body_tag.attrs))
# Verify p tag modifications
p_tag = next(tag for tag in processed_tags if tag.name == "p")
self.assertTrue(any(attr.key == "hidden" and attr.value is None for attr in p_tag.attrs))

View file

@ -31,6 +31,7 @@ deps =
django42: Django>=4.2,<4.3
django50: Django>=5.0,<5.1
django51: Django>=5.1,<5.2
djc-core-html-parser
pytest
pytest-xdist
# NOTE: Keep playwright in sync with the version in requirements-ci.txt