diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index afdb305d08c..662e85575a4 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -30,8 +30,8 @@ attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
- r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
- r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
+ r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
+ r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
@@ -49,16 +49,16 @@ locatestarttagend = re.compile(r"""
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s* # optional whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
- (?:\s*=\s* # value indicator
+ (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
+ (?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
- |\"[^\"]*\" # LIT-enclosed value
- |[^'\">\s]+ # bare value
+ |"[^"]*" # LIT-enclosed value
+ |(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
- )?
- )
- )*
+ )?\s*
+ )*
+ )?
\s* # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
@@ -295,6 +295,7 @@ class HTMLParser(_markupbase.ParserBase):
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
+ if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index b587ab80d54..1ce4594a44a 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -196,60 +196,6 @@ DOCTYPE html [
("data", "this < text > contains < bare>pointy< brackets"),
])
- def test_attr_syntax(self):
- output = [
- ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
- ]
- self._run_check("""""", output)
- self._run_check("""""", output)
- self._run_check("""""", output)
- self._run_check("""""", output)
-
- def test_attr_values(self):
- self._run_check("""""",
- [("starttag", "a", [("b", "xxx\n\txxx"),
- ("c", "yyy\t\nyyy"),
- ("d", "\txyz\n")])
- ])
- self._run_check("""""", [
- ("starttag", "a", [("b", ""), ("c", "")]),
- ])
- # Regression test for SF patch #669683.
- self._run_check("", [
- ("starttag", "e", [("a", "rgb(1,2,3)")]),
- ])
- # Regression test for SF bug #921657.
- self._run_check("", [
- ("starttag", "a", [("href", "mailto:xyz@example.com")]),
- ])
-
- def test_attr_nonascii(self):
- # see issue 7311
- self._run_check("
", [
- ("starttag", "img", [("src", "/foo/bar.png"),
- ("alt", "\u4e2d\u6587")]),
- ])
- self._run_check("", [
- ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
- ("href", "\u30c6\u30b9\u30c8.html")]),
- ])
- self._run_check('', [
- ("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
- ("href", "\u30c6\u30b9\u30c8.html")]),
- ])
-
- def test_attr_entity_replacement(self):
- self._run_check("""""", [
- ("starttag", "a", [("b", "&><\"'")]),
- ])
-
- def test_attr_funky_names(self):
- self._run_check("""""", [
- ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
- ])
-
def test_illegal_declarations(self):
self._parse_error('')
@@ -295,13 +241,11 @@ DOCTYPE html [
self._parse_error("")
self._parse_error("")
self._parse_error("")
self._parse_error("'")
self._parse_error("",
- [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
-
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
@@ -371,15 +311,14 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
def test_tolerant_parsing(self):
self._run_check('te>>xt&a<\n'
'
/img>