mirror of
https://github.com/python/cpython.git
synced 2025-08-04 08:59:19 +00:00
gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)
According to the HTML5 spec, a named character reference in an attribute value that is not terminated by a semicolon should only be processed if it is not followed by an ASCII alphanumeric or an equals sign. See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
This commit is contained in:
parent
3dfed23092
commit
77b14a6d58
3 changed files with 57 additions and 9 deletions
|
@ -12,6 +12,7 @@ import re
|
|||
import _markupbase
|
||||
|
||||
from html import unescape
|
||||
from html.entities import html5 as html5_entities
|
||||
|
||||
|
||||
__all__ = ['HTMLParser']
|
||||
|
@ -23,6 +24,7 @@ incomplete = re.compile('&[a-zA-Z#]')
|
|||
|
||||
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
|
||||
|
||||
starttagopen = re.compile('<[a-zA-Z]')
|
||||
piclose = re.compile('>')
|
||||
|
@ -57,6 +59,22 @@ endendtag = re.compile('>')
|
|||
# </ and the tag name, so maybe this should be fixed
|
||||
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
||||
|
||||
# Character reference processing logic specific to attribute values
|
||||
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
||||
def _replace_attr_charref(match):
    """Return the replacement text for one character reference match.

    *match* is a regex match whose full text (group 0) starts with '&'
    and may end with a trailing ';' or '='.

    Numeric and hexadecimal references ('&#...') are always unescaped.
    A named reference is unescaped only when the text after the '&' is
    an exact key of the HTML5 entity table and the matched text does not
    end with '='.  Anything else is returned unchanged.
    """
    text = match.group(0)
    if text[:2] == '&#' or (text[-1:] != '=' and text[1:] in html5_entities):
        return unescape(text)
    # Unknown name, or a name directly followed by '=': leave it verbatim.
    return text
|
||||
|
||||
def _unescape_attrvalue(s):
    """Return *s* with each character reference matched by attr_charref
    replaced by the result of _replace_attr_charref."""
    substitute = attr_charref.sub
    return substitute(_replace_attr_charref, s)
|
||||
|
||||
|
||||
class HTMLParser(_markupbase.ParserBase):
|
||||
|
@ -323,7 +341,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = unescape(attrvalue)
|
||||
attrvalue = _unescape_attrvalue(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue