[3.12] gh-135462: Fix quadratic complexity in processing special input in HTMLParser (GH-135464) (GH-135483)
Some checks are pending
Tests / Change detection (push) Waiting to run
Tests / Docs (push) Blocked by required conditions
Tests / Check if the ABI has changed (push) Blocked by required conditions
Tests / Check if Autoconf files are up to date (push) Blocked by required conditions
Tests / (push) Blocked by required conditions
Tests / All required checks pass (push) Blocked by required conditions
Tests / Check if generated files are up to date (push) Blocked by required conditions
Tests / Windows MSI (push) Blocked by required conditions
Tests / Ubuntu SSL tests with OpenSSL (push) Blocked by required conditions
Tests / Hypothesis tests on Ubuntu (push) Blocked by required conditions
Tests / Address sanitizer (push) Blocked by required conditions
Lint / lint (push) Waiting to run

End-of-file errors are now handled according to the HTML5 specs --
comments and declarations are automatically closed, tags are ignored.
(cherry picked from commit 6eb6c5dbfb)
This commit is contained in:
Serhiy Storchaka 2025-07-04 01:12:10 +03:00 committed by GitHub
parent 033aa5cfd8
commit ab0893fd5c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 116 additions and 23 deletions

View file

@ -25,6 +25,7 @@ entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen = re.compile('<[a-zA-Z]')
endtagopen = re.compile('</[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
@ -177,7 +178,7 @@ class HTMLParser(_markupbase.ParserBase):
k = self.parse_pi(i)
elif startswith("<!", i):
k = self.parse_html_declaration(i)
elif (i + 1) < n:
elif (i + 1) < n or end:
self.handle_data("<")
k = i + 1
else:
@ -185,17 +186,35 @@ class HTMLParser(_markupbase.ParserBase):
if k < 0:
if not end:
break
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
if k < 0:
k = i + 1
if starttagopen.match(rawdata, i): # < + letter
pass
elif startswith("</", i):
if i + 2 == n:
self.handle_data("</")
elif endtagopen.match(rawdata, i): # </ + letter
pass
else:
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<!--", i):
j = n
for suffix in ("--!", "--", "-"):
if rawdata.endswith(suffix, i+4):
j -= len(suffix)
break
self.handle_comment(rawdata[i+4:j])
elif startswith("<![CDATA[", i):
self.unknown_decl(rawdata[i+3:])
elif rawdata[i:i+9].lower() == '<!doctype':
self.handle_decl(rawdata[i+2:])
elif startswith("<!", i):
# bogus comment
self.handle_comment(rawdata[i+2:])
elif startswith("<?", i):
self.handle_pi(rawdata[i+2:])
else:
k += 1
if self.convert_charrefs and not self.cdata_elem:
self.handle_data(unescape(rawdata[i:k]))
else:
self.handle_data(rawdata[i:k])
raise AssertionError("we should not get here!")
k = n
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)