mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
gh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (GH-135930)
* Whitespaces no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section. * Vertical tabulation (`\v`) and non-ASCII whitespaces no longer recognized as whitespaces. The only whitespaces are `\t\n\r\f `. * Null character (U+0000) no longer ends the tag name. * Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in quoted attribute value. E.g. `</script/foo=">"/>`. * Multiple slashes and whitespaces between the last attribute and closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`. * Multiple `=` between attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar". * Whitespaces between the `=` separator and attribute name or value are no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None. * Fix Sphinx errors. * Apply suggestions from code review Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com> * Address review comments. * Move to Security. --------- Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
This commit is contained in:
parent
938a5d7e62
commit
0243f97cba
3 changed files with 192 additions and 127 deletions
|
@ -31,15 +31,43 @@ endtagopen = re.compile('</[a-zA-Z]')
|
|||
piclose = re.compile('>')
|
||||
commentclose = re.compile(r'--\s*>')
|
||||
# Note:
|
||||
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
|
||||
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
|
||||
# 1) if you change tagfind/attrfind remember to update locatetagend too;
|
||||
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
|
||||
# explode, so don't do it.
|
||||
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
|
||||
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
|
||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
|
||||
attrfind_tolerant = re.compile(
|
||||
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
|
||||
# see the HTML5 specs section "13.2.5.6 Tag open state",
|
||||
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
||||
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
|
||||
attrfind_tolerant = re.compile(r"""
|
||||
(
|
||||
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
|
||||
)
|
||||
(= # value indicator
|
||||
('[^']*' # LITA-enclosed value
|
||||
|"[^"]*" # LIT-enclosed value
|
||||
|(?!['"])[^>\t\n\r\f ]* # bare value
|
||||
)
|
||||
)?
|
||||
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
|
||||
""", re.VERBOSE)
|
||||
locatetagend = re.compile(r"""
|
||||
[a-zA-Z][^\t\n\r\f />]* # tag name
|
||||
[\t\n\r\f /]* # optional whitespace before attribute name
|
||||
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
|
||||
(?:= # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|"[^"]*" # LIT-enclosed value
|
||||
|(?!['"])[^>\t\n\r\f ]* # bare value
|
||||
)
|
||||
)?
|
||||
[\t\n\r\f /]* # possibly followed by a space
|
||||
)*
|
||||
>?
|
||||
""", re.VERBOSE)
|
||||
# The following variables are not used, but are temporarily left for
|
||||
# backward compatibility.
|
||||
locatestarttagend_tolerant = re.compile(r"""
|
||||
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
|
||||
(?:[\s/]* # optional whitespace before attribute name
|
||||
|
@ -56,8 +84,6 @@ locatestarttagend_tolerant = re.compile(r"""
|
|||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
endendtag = re.compile('>')
|
||||
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
|
||||
# </ and the tag name, so maybe this should be fixed
|
||||
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
|
||||
|
||||
# Character reference processing logic specific to attribute values
|
||||
|
@ -141,7 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
|
||||
re.IGNORECASE|re.ASCII)
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
self.interesting = interesting_normal
|
||||
|
@ -166,7 +193,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
# & near the end and see if it's followed by a space or ;.
|
||||
amppos = rawdata.rfind('&', max(i, n-34))
|
||||
if (amppos >= 0 and
|
||||
not re.compile(r'[\s;]').search(rawdata, amppos)):
|
||||
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
|
||||
break # wait till we get all the text
|
||||
j = n
|
||||
else:
|
||||
|
@ -310,7 +337,7 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
return self.parse_bogus_comment(i)
|
||||
|
||||
# Internal -- parse bogus comment, return length or -1 if not terminated
|
||||
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
||||
def parse_bogus_comment(self, i, report=1):
|
||||
rawdata = self.rawdata
|
||||
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
||||
|
@ -336,6 +363,8 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
|
||||
# Internal -- handle starttag, return end or -1 if not terminated
|
||||
def parse_starttag(self, i):
|
||||
# See the HTML5 specs section "13.2.5.8 Tag name state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
|
@ -381,76 +410,42 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
# or -1 if incomplete.
|
||||
def check_for_whole_start_tag(self, i):
|
||||
rawdata = self.rawdata
|
||||
m = locatestarttagend_tolerant.match(rawdata, i)
|
||||
if m:
|
||||
j = m.end()
|
||||
next = rawdata[j:j+1]
|
||||
if next == ">":
|
||||
return j + 1
|
||||
if next == "/":
|
||||
if rawdata.startswith("/>", j):
|
||||
return j + 2
|
||||
if rawdata.startswith("/", j):
|
||||
# buffer boundary
|
||||
return -1
|
||||
# else bogus input
|
||||
if j > i:
|
||||
return j
|
||||
else:
|
||||
return i + 1
|
||||
if next == "":
|
||||
# end of input
|
||||
return -1
|
||||
if next in ("abcdefghijklmnopqrstuvwxyz=/"
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
|
||||
# end of input in or before attribute value, or we have the
|
||||
# '/' from a '/>' ending
|
||||
return -1
|
||||
if j > i:
|
||||
return j
|
||||
else:
|
||||
return i + 1
|
||||
raise AssertionError("we should not get here!")
|
||||
match = locatetagend.match(rawdata, i+1)
|
||||
assert match
|
||||
j = match.end()
|
||||
if rawdata[j-1] != ">":
|
||||
return -1
|
||||
return j
|
||||
|
||||
# Internal -- parse endtag, return end or -1 if incomplete
|
||||
def parse_endtag(self, i):
|
||||
# See the HTML5 specs section "13.2.5.7 End tag open state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
||||
rawdata = self.rawdata
|
||||
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
|
||||
match = endendtag.search(rawdata, i+1) # >
|
||||
if not match:
|
||||
if rawdata.find('>', i+2) < 0: # fast check
|
||||
return -1
|
||||
gtpos = match.end()
|
||||
match = endtagfind.match(rawdata, i) # </ + tag + >
|
||||
if not match:
|
||||
if self.cdata_elem is not None:
|
||||
self.handle_data(rawdata[i:gtpos])
|
||||
return gtpos
|
||||
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
|
||||
namematch = tagfind_tolerant.match(rawdata, i+2)
|
||||
if not namematch:
|
||||
# w3.org/TR/html5/tokenization.html#end-tag-open-state
|
||||
if rawdata[i:i+3] == '</>':
|
||||
return i+3
|
||||
else:
|
||||
return self.parse_bogus_comment(i)
|
||||
tagname = namematch.group(1).lower()
|
||||
# consume and ignore other stuff between the name and the >
|
||||
# Note: this is not 100% correct, since we might have things like
|
||||
# </tag attr=">">, but looking for > after the name should cover
|
||||
# most of the cases and is much simpler
|
||||
gtpos = rawdata.find('>', namematch.end())
|
||||
self.handle_endtag(tagname)
|
||||
return gtpos+1
|
||||
if not endtagopen.match(rawdata, i): # </ + letter
|
||||
if rawdata[i+2:i+3] == '>': # </> is ignored
|
||||
# "missing-end-tag-name" parser error
|
||||
return i+3
|
||||
else:
|
||||
return self.parse_bogus_comment(i)
|
||||
|
||||
elem = match.group(1).lower() # script or style
|
||||
if self.cdata_elem is not None:
|
||||
if elem != self.cdata_elem:
|
||||
self.handle_data(rawdata[i:gtpos])
|
||||
return gtpos
|
||||
match = locatetagend.match(rawdata, i+2)
|
||||
assert match
|
||||
j = match.end()
|
||||
if rawdata[j-1] != ">":
|
||||
return -1
|
||||
|
||||
self.handle_endtag(elem)
|
||||
# find the name: "13.2.5.8 Tag name state"
|
||||
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||||
match = tagfind_tolerant.match(rawdata, i+2)
|
||||
assert match
|
||||
tag = match.group(1).lower()
|
||||
self.handle_endtag(tag)
|
||||
self.clear_cdata_mode()
|
||||
return gtpos
|
||||
return j
|
||||
|
||||
# Overridable -- finish processing of start+end tag: <tag.../>
|
||||
def handle_startendtag(self, tag, attrs):
|
||||
|
|
|
@ -81,6 +81,13 @@ class EventCollectorCharrefs(EventCollector):
|
|||
self.fail('This should never be called with convert_charrefs=True')
|
||||
|
||||
|
||||
# The normal event collector normalizes the events in get_events,
|
||||
# so we override it to return the original list of events.
|
||||
class EventCollectorNoNormalize(EventCollector):
|
||||
def get_events(self):
|
||||
return self.events
|
||||
|
||||
|
||||
class TestCaseBase(unittest.TestCase):
|
||||
|
||||
def get_collector(self):
|
||||
|
@ -265,8 +272,7 @@ text
|
|||
("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
|
||||
("starttag_text", s)])
|
||||
|
||||
def test_cdata_content(self):
|
||||
contents = [
|
||||
@support.subTests('content', [
|
||||
'<!-- not a comment --> ¬-an-entity-ref;',
|
||||
"<not a='start tag'>",
|
||||
'<a href="" /> <p> <span></span>',
|
||||
|
@ -279,54 +285,83 @@ text
|
|||
'src="http://www.example.org/r=\'+new '
|
||||
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
|
||||
'\n<!-- //\nvar foo = 3.14;\n// -->\n',
|
||||
'foo = "</sty" + "le>";',
|
||||
'<!-- \u2603 -->',
|
||||
# these two should be invalid according to the HTML 5 spec,
|
||||
# section 8.1.2.2
|
||||
#'foo = </\nscript>',
|
||||
#'foo = </ script>',
|
||||
]
|
||||
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
|
||||
for content in contents:
|
||||
for element in elements:
|
||||
element_lower = element.lower()
|
||||
s = '<{element}>{content}</{element}>'.format(element=element,
|
||||
content=content)
|
||||
self._run_check(s, [("starttag", element_lower, []),
|
||||
("data", content),
|
||||
("endtag", element_lower)])
|
||||
'foo = "</ script>"',
|
||||
'foo = "</scripture>"',
|
||||
'foo = "</script\v>"',
|
||||
'foo = "</script\xa0>"',
|
||||
'foo = "</ſcript>"',
|
||||
'foo = "</scrıpt>"',
|
||||
])
|
||||
def test_script_content(self, content):
|
||||
s = f'<script>{content}</script>'
|
||||
self._run_check(s, [("starttag", "script", []),
|
||||
("data", content),
|
||||
("endtag", "script")])
|
||||
|
||||
def test_cdata_with_closing_tags(self):
|
||||
@support.subTests('content', [
|
||||
'a::before { content: "<!-- not a comment -->"; }',
|
||||
'a::before { content: "¬-an-entity-ref;"; }',
|
||||
'a::before { content: "<not a=\'start tag\'>"; }',
|
||||
'a::before { content: "\u2603"; }',
|
||||
'a::before { content: "< /style>"; }',
|
||||
'a::before { content: "</ style>"; }',
|
||||
'a::before { content: "</styled>"; }',
|
||||
'a::before { content: "</style\v>"; }',
|
||||
'a::before { content: "</style\xa0>"; }',
|
||||
'a::before { content: "</ſtyle>"; }',
|
||||
])
|
||||
def test_style_content(self, content):
|
||||
s = f'<style>{content}</style>'
|
||||
self._run_check(s, [("starttag", "style", []),
|
||||
("data", content),
|
||||
("endtag", "style")])
|
||||
|
||||
@support.subTests('endtag', ['script', 'SCRIPT', 'script ', 'script\n',
|
||||
'script/', 'script foo=bar', 'script foo=">"'])
|
||||
def test_script_closing_tag(self, endtag):
|
||||
# see issue #13358
|
||||
# make sure that HTMLParser calls handle_data only once for each CDATA.
|
||||
# The normal event collector normalizes the events in get_events,
|
||||
# so we override it to return the original list of events.
|
||||
class Collector(EventCollector):
|
||||
def get_events(self):
|
||||
return self.events
|
||||
|
||||
content = """<!-- not a comment --> ¬-an-entity-ref;
|
||||
<a href="" /> </p><p> <span></span></style>
|
||||
'</script' + '>'"""
|
||||
for element in [' script', 'script ', ' script ',
|
||||
'\nscript', 'script\n', '\nscript\n']:
|
||||
element_lower = element.lower().strip()
|
||||
s = '<script>{content}</{element}>'.format(element=element,
|
||||
content=content)
|
||||
self._run_check(s, [("starttag", element_lower, []),
|
||||
("data", content),
|
||||
("endtag", element_lower)],
|
||||
collector=Collector(convert_charrefs=False))
|
||||
s = f'<ScrIPt>{content}</{endtag}>'
|
||||
self._run_check(s, [("starttag", "script", []),
|
||||
("data", content),
|
||||
("endtag", "script")],
|
||||
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||
|
||||
def test_EOF_in_cdata(self):
|
||||
content = """<!-- not a comment --> ¬-an-entity-ref;
|
||||
<a href="" /> </p><p> <span></span></style>
|
||||
'</script' + '>'"""
|
||||
s = f'<script>{content}'
|
||||
self._run_check(s, [
|
||||
("starttag", 'script', []),
|
||||
("data", content)
|
||||
])
|
||||
@support.subTests('endtag', ['style', 'STYLE', 'style ', 'style\n',
|
||||
'style/', 'style foo=bar', 'style foo=">"'])
|
||||
def test_style_closing_tag(self, endtag):
|
||||
content = """
|
||||
b::before { content: "<!-- not a comment -->"; }
|
||||
p::before { content: "¬-an-entity-ref;"; }
|
||||
a::before { content: "<i>"; }
|
||||
a::after { content: "</i>"; }
|
||||
"""
|
||||
s = f'<StyLE>{content}</{endtag}>'
|
||||
self._run_check(s, [("starttag", "style", []),
|
||||
("data", content),
|
||||
("endtag", "style")],
|
||||
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||
|
||||
@support.subTests('tail,end', [
|
||||
('', False),
|
||||
('<', False),
|
||||
('</', False),
|
||||
('</s', False),
|
||||
('</script', False),
|
||||
('</script ', True),
|
||||
('</script foo=bar', True),
|
||||
('</script foo=">', True),
|
||||
])
|
||||
def test_eof_in_script(self, tail, end):
|
||||
content = "a = 123"
|
||||
s = f'<ScrIPt>{content}{tail}'
|
||||
self._run_check(s, [("starttag", "script", []),
|
||||
("data", content if end else content + tail)],
|
||||
collector=EventCollectorNoNormalize(convert_charrefs=False))
|
||||
|
||||
def test_comments(self):
|
||||
html = ("<!-- I'm a valid comment -->"
|
||||
|
@ -443,7 +478,7 @@ text
|
|||
self._run_check("</$>", [('comment', '$')])
|
||||
self._run_check("</", [('data', '</')])
|
||||
self._run_check("</a", [])
|
||||
self._run_check("</ a>", [('endtag', 'a')])
|
||||
self._run_check("</ a>", [('comment', ' a')])
|
||||
self._run_check("</ a", [('comment', ' a')])
|
||||
self._run_check("<a<a>", [('starttag', 'a<a', [])])
|
||||
self._run_check("</a<a>", [('endtag', 'a<a')])
|
||||
|
@ -491,6 +526,10 @@ text
|
|||
]
|
||||
self._run_check(html, expected)
|
||||
|
||||
def test_slashes_in_endtag(self):
|
||||
self._run_check('</a/>', [('endtag', 'a')])
|
||||
self._run_check('</a foo="var"/>', [('endtag', 'a')])
|
||||
|
||||
def test_declaration_junk_chars(self):
|
||||
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
|
||||
|
||||
|
@ -525,15 +564,11 @@ text
|
|||
self._run_check(html, expected)
|
||||
|
||||
def test_broken_invalid_end_tag(self):
|
||||
# This is technically wrong (the "> shouldn't be included in the 'data')
|
||||
# but is probably not worth fixing it (in addition to all the cases of
|
||||
# the previous test, it would require a full attribute parsing).
|
||||
# see #13993
|
||||
html = '<b>This</b attr=">"> confuses the parser'
|
||||
expected = [('starttag', 'b', []),
|
||||
('data', 'This'),
|
||||
('endtag', 'b'),
|
||||
('data', '"> confuses the parser')]
|
||||
('data', ' confuses the parser')]
|
||||
self._run_check(html, expected)
|
||||
|
||||
def test_correct_detection_of_start_tags(self):
|
||||
|
@ -560,7 +595,7 @@ text
|
|||
|
||||
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
|
||||
expected = [
|
||||
('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
|
||||
('starttag', 'div', [('style', ''), (',', None), ('foo', None), ('=', None), ('"bar"', None)]),
|
||||
('starttag', 'b', []),
|
||||
('data', 'The '),
|
||||
('starttag', 'a', [('href', 'some_url')]),
|
||||
|
@ -749,9 +784,15 @@ class AttributesTestCase(TestCaseBase):
|
|||
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
|
||||
]
|
||||
self._run_check("""<a b='v' c="v" d=v e>""", output)
|
||||
self._run_check("""<a b = 'v' c = "v" d = v e>""", output)
|
||||
self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
|
||||
self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
|
||||
self._run_check("<a foo==bar>", [('starttag', 'a', [('foo', '=bar')])])
|
||||
self._run_check("<a foo =bar>", [('starttag', 'a', [('foo', None), ('=bar', None)])])
|
||||
self._run_check("<a foo\t=bar>", [('starttag', 'a', [('foo', None), ('=bar', None)])])
|
||||
self._run_check("<a foo\v=bar>", [('starttag', 'a', [('foo\v', 'bar')])])
|
||||
self._run_check("<a foo\xa0=bar>", [('starttag', 'a', [('foo\xa0', 'bar')])])
|
||||
self._run_check("<a foo= bar>", [('starttag', 'a', [('foo', ''), ('bar', None)])])
|
||||
self._run_check("<a foo=\tbar>", [('starttag', 'a', [('foo', ''), ('bar', None)])])
|
||||
self._run_check("<a foo=\vbar>", [('starttag', 'a', [('foo', '\vbar')])])
|
||||
self._run_check("<a foo=\xa0bar>", [('starttag', 'a', [('foo', '\xa0bar')])])
|
||||
|
||||
def test_attr_values(self):
|
||||
self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
|
||||
|
@ -760,6 +801,10 @@ class AttributesTestCase(TestCaseBase):
|
|||
("d", "\txyz\n")])])
|
||||
self._run_check("""<a b='' c="">""",
|
||||
[("starttag", "a", [("b", ""), ("c", "")])])
|
||||
self._run_check("<a b=\t c=\n>",
|
||||
[("starttag", "a", [("b", ""), ("c", "")])])
|
||||
self._run_check("<a b=\v c=\xa0>",
|
||||
[("starttag", "a", [("b", "\v"), ("c", "\xa0")])])
|
||||
# Regression test for SF patch #669683.
|
||||
self._run_check("<e a=rgb(1,2,3)>",
|
||||
[("starttag", "e", [("a", "rgb(1,2,3)")])])
|
||||
|
@ -831,7 +876,7 @@ class AttributesTestCase(TestCaseBase):
|
|||
('data', 'test - bad2'), ('endtag', 'a'),
|
||||
('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
|
||||
('data', 'test - bad3'), ('endtag', 'a'),
|
||||
('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
|
||||
('starttag', 'a', [('href', None), ('=', None), ("test' style", 'color:red;bad4')]),
|
||||
('data', 'test - bad4'), ('endtag', 'a')
|
||||
]
|
||||
self._run_check(html, expected)
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
Fix parsing start and end tags in :class:`html.parser.HTMLParser`
|
||||
according to the HTML5 standard.
|
||||
|
||||
* Whitespace is no longer accepted between ``</`` and the tag name.
|
||||
E.g. ``</ script>`` does not end the script section.
|
||||
|
||||
* Vertical tabulation (``\v``) and non-ASCII whitespace characters are no longer recognized
|
||||
as whitespace. The only whitespace characters are ``\t\n\r\f`` and space.
|
||||
|
||||
* Null character (U+0000) no longer ends the tag name.
|
||||
|
||||
* Attributes and slashes after the tag name in end tags are now ignored,
|
||||
instead of terminating after the first ``>`` in a quoted attribute value.
|
||||
E.g. ``</script/foo=">"/>``.
|
||||
|
||||
* Multiple slashes and whitespace characters between the last attribute and the closing ``>``
|
||||
are now ignored in both start and end tags. E.g. ``<a foo=bar/ //>``.
|
||||
|
||||
* Multiple ``=`` between attribute name and value are no longer collapsed.
|
||||
E.g. ``<a foo==bar>`` produces attribute "foo" with value "=bar".
|
||||
|
||||
* Whitespace between the ``=`` separator and the attribute name or value is no
|
||||
longer ignored. E.g. ``<a foo =bar>`` produces two attributes "foo" and
|
||||
"=bar", both with value None; ``<a foo= bar>`` produces two attributes:
|
||||
"foo" with value "" and "bar" with value None.
|
Loading…
Add table
Add a link
Reference in a new issue