gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)

According to the HTML5 spec, named character references in attribute values
should only be processed if they are not followed by an ASCII alphanumeric,
or an equals sign.

https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
This commit is contained in:
Sascha Ißbrücker 2025-05-07 17:49:49 +02:00 committed by GitHub
parent 3dfed23092
commit 77b14a6d58
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 57 additions and 9 deletions

View file

@ -12,6 +12,7 @@ import re
import _markupbase
from html import unescape
from html.entities import html5 as html5_entities
__all__ = ['HTMLParser']
@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
@ -57,6 +59,22 @@ endendtag = re.compile('>')
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
# Character reference processing logic specific to attribute values
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
def _replace_attr_charref(match):
ref = match.group(0)
# Numeric / hex char refs must always be unescaped
if ref.startswith('&#'):
return unescape(ref)
# Named character / entity references must only be unescaped
# if they are an exact match, and they are not followed by an equals sign
if not ref.endswith('=') and ref[1:] in html5_entities:
return unescape(ref)
# Otherwise do not unescape
return ref
def _unescape_attrvalue(s):
return attr_charref.sub(_replace_attr_charref, s)
class HTMLParser(_markupbase.ParserBase):
@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = unescape(attrvalue)
attrvalue = _unescape_attrvalue(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()