#2927: Added the unescape() function to the html module.

2025-11-17 01:25:57 +00:00 · 2013-11-19 20:28:45 +02:00 · 2013-11-19 20:28:45 +02:00 · 4a9ee26750
commit 4a9ee26750
parent 5160da1afc
7 changed files with 215 additions and 49 deletions
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -8,9 +8,12 @@
 # and CDATA (character data -- only end tags are special).


-import _markupbase
 import re
 import warnings
+import _markupbase
+
+from html import unescape
+

 __all__ = ['HTMLParser']

@@ -357,7 +360,7 @@ class HTMLParser(_markupbase.ParserBase):
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
-                attrvalue = self.unescape(attrvalue)
+                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

@@ -510,34 +513,3 @@ class HTMLParser(_markupbase.ParserBase):
    def unknown_decl(self, data):
        if self.strict:
            self.error("unknown declaration: %r" % (data,))
-
-    # Internal -- helper to remove special character quoting
-    def unescape(self, s):
-        if '&' not in s:
-            return s
-        def replaceEntities(s):
-            s = s.groups()[0]
-            try:
-                if s[0] == "#":
-                    s = s[1:]
-                    if s[0] in ['x','X']:
-                        c = int(s[1:].rstrip(';'), 16)
-                    else:
-                        c = int(s.rstrip(';'))
-                    return chr(c)
-            except ValueError:
-                return '&#' + s
-            else:
-                from html.entities import html5
-                if s in html5:
-                    return html5[s]
-                elif s.endswith(';'):
-                    return '&' + s
-                for x in range(2, len(s)):
-                    if s[:x] in html5:
-                        return html5[s[:x]] + s[x:]
-                else:
-                    return '&' + s
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
-                      replaceEntities, s, flags=re.ASCII)