mirror of
https://github.com/python/cpython.git
synced 2025-07-27 21:24:32 +00:00
Patch #912410: Replace HTML entity references for attribute values
in HTMLParser.
This commit is contained in:
parent
ff432e6f4a
commit
ab8a6bba25
4 changed files with 43 additions and 13 deletions
|
@ -358,12 +358,30 @@ class HTMLParser(markupbase.ParserBase):
|
|||
self.error("unknown declaration: %r" % (data,))
|
||||
|
||||
# Internal -- helper to remove special character quoting
|
||||
entitydefs = None
|
||||
def unescape(self, s):
|
||||
if '&' not in s:
|
||||
return s
|
||||
s = s.replace("<", "<")
|
||||
s = s.replace(">", ">")
|
||||
s = s.replace("'", "'")
|
||||
s = s.replace(""", '"')
|
||||
s = s.replace("&", "&") # Must be last
|
||||
return s
|
||||
def replaceEntities(s):
|
||||
s = s.groups()[0]
|
||||
if s[0] == "#":
|
||||
s = s[1:]
|
||||
if s[0] in ['x','X']:
|
||||
c = int(s[1:], 16)
|
||||
else:
|
||||
c = int(s)
|
||||
return unichr(c)
|
||||
else:
|
||||
# Cannot use name2codepoint directly, because HTMLParser supports apos,
|
||||
# which is not part of HTML 4
|
||||
import htmlentitydefs
|
||||
if HTMLParser.entitydefs is None:
|
||||
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
|
||||
for k, v in htmlentitydefs.name2codepoint.iteritems():
|
||||
entitydefs[k] = unichr(v)
|
||||
try:
|
||||
return self.entitydefs[s]
|
||||
except KeyError:
|
||||
return '&'+s+';'
|
||||
|
||||
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue