mirror of
https://github.com/python/cpython.git
synced 2025-08-01 15:43:13 +00:00
HTMLParser is allowed to be stricter than sgmllib, so let's not change
the basic behavior of either module: when parsing something that cannot possibly be valid in either HTML or XHTML, raise an exception.
This commit is contained in:
parent
a0ca3d611e
commit
7cf613dc77
2 changed files with 17 additions and 37 deletions
|
@ -269,17 +269,18 @@ class HTMLParser:
|
||||||
return -1
|
return -1
|
||||||
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
||||||
n = len(rawdata)
|
n = len(rawdata)
|
||||||
decltype = None
|
decltype, j = self.scan_name(j, i)
|
||||||
extrachars = ""
|
if j < 0:
|
||||||
|
return j
|
||||||
|
if decltype.lower() != "doctype":
|
||||||
|
raise HTMLParseError("unknown declaration: '%s'" % decltype,
|
||||||
|
self.getpos())
|
||||||
while j < n:
|
while j < n:
|
||||||
c = rawdata[j]
|
c = rawdata[j]
|
||||||
if c == ">":
|
if c == ">":
|
||||||
# end of declaration syntax
|
# end of declaration syntax
|
||||||
data = rawdata[i+2:j]
|
data = rawdata[i+2:j]
|
||||||
if decltype == "doctype":
|
self.handle_decl(data)
|
||||||
self.handle_decl(data)
|
|
||||||
else:
|
|
||||||
self.unknown_decl(data)
|
|
||||||
return j + 1
|
return j + 1
|
||||||
if c in "\"'":
|
if c in "\"'":
|
||||||
m = declstringlit.match(rawdata, j)
|
m = declstringlit.match(rawdata, j)
|
||||||
|
@ -287,30 +288,15 @@ class HTMLParser:
|
||||||
return -1 # incomplete
|
return -1 # incomplete
|
||||||
j = m.end()
|
j = m.end()
|
||||||
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||||
m = declname.match(rawdata, j)
|
name, j = self.scan_name(j, i)
|
||||||
if not m:
|
|
||||||
return -1 # incomplete
|
|
||||||
j = m.end()
|
|
||||||
if decltype is None:
|
|
||||||
decltype = m.group(0).rstrip().lower()
|
|
||||||
if decltype != "doctype":
|
|
||||||
extrachars = "="
|
|
||||||
elif c == "[" and decltype == "doctype":
|
elif c == "[" and decltype == "doctype":
|
||||||
j = self.parse_doctype_subset(j + 1, i)
|
j = self.parse_doctype_subset(j + 1, i)
|
||||||
if j < 0:
|
|
||||||
return j
|
|
||||||
elif c in extrachars:
|
|
||||||
j = j + 1
|
|
||||||
while j < n and rawdata[j] in string.whitespace:
|
|
||||||
j = j + 1
|
|
||||||
if j == n:
|
|
||||||
# end of buffer while in declaration
|
|
||||||
return -1
|
|
||||||
else:
|
else:
|
||||||
raise HTMLParseError(
|
raise HTMLParseError(
|
||||||
"unexpected char in declaration: %s" % `rawdata[j]`,
|
"unexpected char in declaration: %s" % `rawdata[j]`,
|
||||||
self.getpos())
|
self.getpos())
|
||||||
decltype = decltype or ''
|
if j < 0:
|
||||||
|
return j
|
||||||
return -1 # incomplete
|
return -1 # incomplete
|
||||||
|
|
||||||
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
|
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
|
||||||
|
@ -359,11 +345,9 @@ class HTMLParser:
|
||||||
if (j + 1) == n:
|
if (j + 1) == n:
|
||||||
# end of buffer; incomplete
|
# end of buffer; incomplete
|
||||||
return -1
|
return -1
|
||||||
m = declname.match(rawdata, j + 1)
|
s, j = self.scan_name(j + 1, declstartpos)
|
||||||
s = m.group()
|
if j < 0:
|
||||||
if s == rawdata[j+1:]:
|
return j
|
||||||
return -1
|
|
||||||
j = j + 1 + len(s.rstrip())
|
|
||||||
if rawdata[j] == ";":
|
if rawdata[j] == ";":
|
||||||
j = j + 1
|
j = j + 1
|
||||||
elif c == "]":
|
elif c == "]":
|
||||||
|
@ -383,8 +367,9 @@ class HTMLParser:
|
||||||
j = j + 1
|
j = j + 1
|
||||||
else:
|
else:
|
||||||
self.updatepos(declstartpos, j)
|
self.updatepos(declstartpos, j)
|
||||||
raise HTMLParseError("unexpected char in internal subset",
|
raise HTMLParseError(
|
||||||
self.getpos())
|
"unexpected char %s in internal subset" % `c`,
|
||||||
|
self.getpos())
|
||||||
# end of buffer reached
|
# end of buffer reached
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
|
@ -203,12 +203,7 @@ DOCTYPE html [
|
||||||
])
|
])
|
||||||
|
|
||||||
def test_illegal_declarations(self):
|
def test_illegal_declarations(self):
|
||||||
s = 'abc<!spacer type="block" height="25">def'
|
self._parse_error('<!spacer type="block" height="25">')
|
||||||
self._run_check(s, [
|
|
||||||
("data", "abc"),
|
|
||||||
("unknown decl", 'spacer type="block" height="25"'),
|
|
||||||
("data", "def"),
|
|
||||||
])
|
|
||||||
|
|
||||||
def test_starttag_end_boundary(self):
|
def test_starttag_end_boundary(self):
|
||||||
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
|
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue