mirror of
https://github.com/python/cpython.git
synced 2025-09-26 18:29:57 +00:00
Convert to using string methods instead of the string module.
In goahead(), use a bound version of rawdata.startswith() since we use the same method all the time and never change the value of rawdata. This can save a lot of bound method creation.
This commit is contained in:
parent
073148c4ef
commit
248b04383f
1 changed files with 25 additions and 29 deletions
|
@ -10,7 +10,6 @@
|
||||||
|
|
||||||
import markupbase
|
import markupbase
|
||||||
import re
|
import re
|
||||||
import string
|
|
||||||
|
|
||||||
# Regular expressions used for parsing
|
# Regular expressions used for parsing
|
||||||
|
|
||||||
|
@ -23,7 +22,6 @@ charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||||
|
|
||||||
starttagopen = re.compile('<[a-zA-Z]')
|
starttagopen = re.compile('<[a-zA-Z]')
|
||||||
piclose = re.compile('>')
|
piclose = re.compile('>')
|
||||||
endtagopen = re.compile('</')
|
|
||||||
commentclose = re.compile(r'--\s*>')
|
commentclose = re.compile(r'--\s*>')
|
||||||
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
|
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
|
||||||
attrfind = re.compile(
|
attrfind = re.compile(
|
||||||
|
@ -96,7 +94,6 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
def reset(self):
|
def reset(self):
|
||||||
"""Reset this instance. Loses all unprocessed data."""
|
"""Reset this instance. Loses all unprocessed data."""
|
||||||
self.rawdata = ''
|
self.rawdata = ''
|
||||||
self.stack = []
|
|
||||||
self.lasttag = '???'
|
self.lasttag = '???'
|
||||||
self.interesting = interesting_normal
|
self.interesting = interesting_normal
|
||||||
markupbase.ParserBase.reset(self)
|
markupbase.ParserBase.reset(self)
|
||||||
|
@ -145,18 +142,19 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
if i < j: self.handle_data(rawdata[i:j])
|
if i < j: self.handle_data(rawdata[i:j])
|
||||||
i = self.updatepos(i, j)
|
i = self.updatepos(i, j)
|
||||||
if i == n: break
|
if i == n: break
|
||||||
if rawdata[i] == '<':
|
startswith = rawdata.startswith
|
||||||
|
if startswith('<', i):
|
||||||
if starttagopen.match(rawdata, i): # < + letter
|
if starttagopen.match(rawdata, i): # < + letter
|
||||||
k = self.parse_starttag(i)
|
k = self.parse_starttag(i)
|
||||||
elif endtagopen.match(rawdata, i): # </
|
elif startswith("</", i):
|
||||||
k = self.parse_endtag(i)
|
k = self.parse_endtag(i)
|
||||||
if k >= 0:
|
if k >= 0:
|
||||||
self.clear_cdata_mode()
|
self.clear_cdata_mode()
|
||||||
elif rawdata.startswith("<!--", i): # <!--
|
elif startswith("<!--", i):
|
||||||
k = self.parse_comment(i)
|
k = self.parse_comment(i)
|
||||||
elif rawdata.startswith("<?", i): # <?
|
elif startswith("<?", i):
|
||||||
k = self.parse_pi(i)
|
k = self.parse_pi(i)
|
||||||
elif rawdata.startswith("<!", i): # <!
|
elif startswith("<!", i):
|
||||||
k = self.parse_declaration(i)
|
k = self.parse_declaration(i)
|
||||||
elif (i + 1) < n:
|
elif (i + 1) < n:
|
||||||
self.handle_data("<")
|
self.handle_data("<")
|
||||||
|
@ -168,33 +166,32 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
self.error("EOF in middle of construct")
|
self.error("EOF in middle of construct")
|
||||||
break
|
break
|
||||||
i = self.updatepos(i, k)
|
i = self.updatepos(i, k)
|
||||||
elif rawdata[i:i+2] == "&#":
|
elif startswith("&#", i):
|
||||||
match = charref.match(rawdata, i)
|
match = charref.match(rawdata, i)
|
||||||
if match:
|
if match:
|
||||||
name = match.group()[2:-1]
|
name = match.group()[2:-1]
|
||||||
self.handle_charref(name)
|
self.handle_charref(name)
|
||||||
k = match.end()
|
k = match.end()
|
||||||
if rawdata[k-1] != ';':
|
if not startswith(';', k-1):
|
||||||
k = k - 1
|
k = k - 1
|
||||||
i = self.updatepos(i, k)
|
i = self.updatepos(i, k)
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
elif rawdata[i] == '&':
|
elif startswith('&', i):
|
||||||
match = entityref.match(rawdata, i)
|
match = entityref.match(rawdata, i)
|
||||||
if match:
|
if match:
|
||||||
name = match.group(1)
|
name = match.group(1)
|
||||||
self.handle_entityref(name)
|
self.handle_entityref(name)
|
||||||
k = match.end()
|
k = match.end()
|
||||||
if rawdata[k-1] != ';':
|
if not startswith(';', k-1):
|
||||||
k = k - 1
|
k = k - 1
|
||||||
i = self.updatepos(i, k)
|
i = self.updatepos(i, k)
|
||||||
continue
|
continue
|
||||||
match = incomplete.match(rawdata, i)
|
match = incomplete.match(rawdata, i)
|
||||||
if match:
|
if match:
|
||||||
# match.group() will contain at least 2 chars
|
# match.group() will contain at least 2 chars
|
||||||
rest = rawdata[i:]
|
if end and match.group() == rawdata[i:]:
|
||||||
if end and match.group() == rest:
|
|
||||||
self.error("EOF in middle of entity or char ref")
|
self.error("EOF in middle of entity or char ref")
|
||||||
# incomplete
|
# incomplete
|
||||||
break
|
break
|
||||||
|
@ -252,7 +249,7 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
match = tagfind.match(rawdata, i+1)
|
match = tagfind.match(rawdata, i+1)
|
||||||
assert match, 'unexpected call to parse_starttag()'
|
assert match, 'unexpected call to parse_starttag()'
|
||||||
k = match.end()
|
k = match.end()
|
||||||
self.lasttag = tag = string.lower(rawdata[i+1:k])
|
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||||
|
|
||||||
while k < endpos:
|
while k < endpos:
|
||||||
m = attrfind.match(rawdata, k)
|
m = attrfind.match(rawdata, k)
|
||||||
|
@ -265,21 +262,21 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||||
attrvalue = attrvalue[1:-1]
|
attrvalue = attrvalue[1:-1]
|
||||||
attrvalue = self.unescape(attrvalue)
|
attrvalue = self.unescape(attrvalue)
|
||||||
attrs.append((string.lower(attrname), attrvalue))
|
attrs.append((attrname.lower(), attrvalue))
|
||||||
k = m.end()
|
k = m.end()
|
||||||
|
|
||||||
end = string.strip(rawdata[k:endpos])
|
end = rawdata[k:endpos].strip()
|
||||||
if end not in (">", "/>"):
|
if end not in (">", "/>"):
|
||||||
lineno, offset = self.getpos()
|
lineno, offset = self.getpos()
|
||||||
if "\n" in self.__starttag_text:
|
if "\n" in self.__starttag_text:
|
||||||
lineno = lineno + string.count(self.__starttag_text, "\n")
|
lineno = lineno + self.__starttag_text.count("\n")
|
||||||
offset = len(self.__starttag_text) \
|
offset = len(self.__starttag_text) \
|
||||||
- string.rfind(self.__starttag_text, "\n")
|
- self.__starttag_text.rfind("\n")
|
||||||
else:
|
else:
|
||||||
offset = offset + len(self.__starttag_text)
|
offset = offset + len(self.__starttag_text)
|
||||||
self.error("junk characters in start tag: %s"
|
self.error("junk characters in start tag: %s"
|
||||||
% `rawdata[k:endpos][:20]`)
|
% `rawdata[k:endpos][:20]`)
|
||||||
if end[-2:] == '/>':
|
if end.endswith('/>'):
|
||||||
# XHTML-style empty tag: <span attr="value" />
|
# XHTML-style empty tag: <span attr="value" />
|
||||||
self.handle_startendtag(tag, attrs)
|
self.handle_startendtag(tag, attrs)
|
||||||
else:
|
else:
|
||||||
|
@ -299,10 +296,9 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
if next == ">":
|
if next == ">":
|
||||||
return j + 1
|
return j + 1
|
||||||
if next == "/":
|
if next == "/":
|
||||||
s = rawdata[j:j+2]
|
if rawdata.startswith("/>", j):
|
||||||
if s == "/>":
|
|
||||||
return j + 2
|
return j + 2
|
||||||
if s == "/":
|
if rawdata.startswith("/", j):
|
||||||
# buffer boundary
|
# buffer boundary
|
||||||
return -1
|
return -1
|
||||||
# else bogus input
|
# else bogus input
|
||||||
|
@ -332,7 +328,7 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
if not match:
|
if not match:
|
||||||
self.error("bad end tag: %s" % `rawdata[i:j]`)
|
self.error("bad end tag: %s" % `rawdata[i:j]`)
|
||||||
tag = match.group(1)
|
tag = match.group(1)
|
||||||
self.handle_endtag(string.lower(tag))
|
self.handle_endtag(tag.lower())
|
||||||
return j
|
return j
|
||||||
|
|
||||||
# Overridable -- finish processing of start+end tag: <tag.../>
|
# Overridable -- finish processing of start+end tag: <tag.../>
|
||||||
|
@ -379,9 +375,9 @@ class HTMLParser(markupbase.ParserBase):
|
||||||
def unescape(self, s):
|
def unescape(self, s):
|
||||||
if '&' not in s:
|
if '&' not in s:
|
||||||
return s
|
return s
|
||||||
s = string.replace(s, "&lt;", "<")
|
s = s.replace("&lt;", "<")
|
||||||
s = string.replace(s, "&gt;", ">")
|
s = s.replace("&gt;", ">")
|
||||||
s = string.replace(s, "&apos;", "'")
|
s = s.replace("&apos;", "'")
|
||||||
s = string.replace(s, "&quot;", '"')
|
s = s.replace("&quot;", '"')
|
||||||
s = string.replace(s, "&amp;", "&") # Must be last
|
s = s.replace("&amp;", "&") # Must be last
|
||||||
return s
|
return s
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue