mirror of
https://github.com/python/cpython.git
synced 2025-11-02 11:08:57 +00:00
Major overhaul. Don't use global variables (e.g. verbose); use
instance variables instead. Make all global functions methods, for easy overriding. Restructure getpage() for easy overriding. Add save_pickle() method and load_pickle() global function to make it easier for other programs to emulate the top-level interface.
This commit is contained in:
parent
1ee492e5d4
commit
00756bd4a6
1 changed file with 191 additions and 130 deletions
|
|
@ -94,7 +94,7 @@ rooturl -- URL to start checking
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
__version__ = "0.5"
|
__version__ = "$Revision$"
|
||||||
|
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
|
@ -112,9 +112,17 @@ import sgmllib
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import robotparser
|
import robotparser
|
||||||
|
|
||||||
|
# Extract real version number if necessary
|
||||||
|
if __version__[0] == '$':
|
||||||
|
_v = string.split(__version__)
|
||||||
|
if len(_v) == 3:
|
||||||
|
__version__ = _v[1]
|
||||||
|
|
||||||
|
|
||||||
# Tunable parameters
|
# Tunable parameters
|
||||||
DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
|
DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
|
||||||
|
CHECKEXT = 1 # Check external references (1 deep)
|
||||||
|
VERBOSE = 1 # Verbosity level (0-3)
|
||||||
MAXPAGE = 150000 # Ignore files bigger than this
|
MAXPAGE = 150000 # Ignore files bigger than this
|
||||||
ROUNDSIZE = 50 # Number of links processed per round
|
ROUNDSIZE = 50 # Number of links processed per round
|
||||||
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
|
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
|
||||||
|
|
@ -122,16 +130,15 @@ AGENTNAME = "webchecker" # Agent name for robots.txt parser
|
||||||
|
|
||||||
|
|
||||||
# Global variables
|
# Global variables
|
||||||
verbose = 1
|
|
||||||
maxpage = MAXPAGE
|
|
||||||
roundsize = ROUNDSIZE
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global verbose, maxpage, roundsize
|
checkext = CHECKEXT
|
||||||
|
verbose = VERBOSE
|
||||||
|
maxpage = MAXPAGE
|
||||||
|
roundsize = ROUNDSIZE
|
||||||
dumpfile = DUMPFILE
|
dumpfile = DUMPFILE
|
||||||
restart = 0
|
restart = 0
|
||||||
checkext = 1
|
|
||||||
norun = 0
|
norun = 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -163,18 +170,15 @@ def main():
|
||||||
print AGENTNAME, "version", __version__
|
print AGENTNAME, "version", __version__
|
||||||
|
|
||||||
if restart:
|
if restart:
|
||||||
if verbose > 0:
|
c = load_pickle(dumpfile=dumpfile, verbose=verbose)
|
||||||
print "Loading checkpoint from %s ..." % dumpfile
|
|
||||||
f = open(dumpfile, "rb")
|
|
||||||
c = pickle.load(f)
|
|
||||||
f.close()
|
|
||||||
if verbose > 0:
|
|
||||||
print "Done."
|
|
||||||
print "Root:", string.join(c.roots, "\n ")
|
|
||||||
else:
|
else:
|
||||||
c = Checker(checkext)
|
c = Checker()
|
||||||
if not args:
|
|
||||||
args.append(DEFROOT)
|
c.setflags(checkext=checkext, verbose=verbose,
|
||||||
|
maxpage=maxpage, roundsize=roundsize)
|
||||||
|
|
||||||
|
if not restart and not args:
|
||||||
|
args.append(DEFROOT)
|
||||||
|
|
||||||
for arg in args:
|
for arg in args:
|
||||||
c.addroot(arg)
|
c.addroot(arg)
|
||||||
|
|
@ -192,40 +196,43 @@ def main():
|
||||||
if verbose > 0:
|
if verbose > 0:
|
||||||
print "[report interrupted]"
|
print "[report interrupted]"
|
||||||
|
|
||||||
if not c.changed:
|
if c.save_pickle(dumpfile):
|
||||||
if verbose > 0:
|
if dumpfile == DUMPFILE:
|
||||||
print
|
print "Use ``%s -R'' to restart." % sys.argv[0]
|
||||||
print "No need to save checkpoint"
|
else:
|
||||||
elif not dumpfile:
|
print "Use ``%s -R -d %s'' to restart." % (sys.argv[0], dumpfile)
|
||||||
if verbose > 0:
|
|
||||||
print "No dumpfile, won't save checkpoint"
|
|
||||||
else:
|
def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
|
||||||
if verbose > 0:
|
if verbose > 0:
|
||||||
print
|
print "Loading checkpoint from %s ..." % dumpfile
|
||||||
print "Saving checkpoint to %s ..." % dumpfile
|
f = open(dumpfile, "rb")
|
||||||
newfile = dumpfile + ".new"
|
c = pickle.load(f)
|
||||||
f = open(newfile, "wb")
|
f.close()
|
||||||
pickle.dump(c, f)
|
if verbose > 0:
|
||||||
f.close()
|
print "Done."
|
||||||
try:
|
print "Root:", string.join(c.roots, "\n ")
|
||||||
os.unlink(dumpfile)
|
return c
|
||||||
except os.error:
|
|
||||||
pass
|
|
||||||
os.rename(newfile, dumpfile)
|
|
||||||
if verbose > 0:
|
|
||||||
print "Done."
|
|
||||||
if dumpfile == DUMPFILE:
|
|
||||||
print "Use ``%s -R'' to restart." % sys.argv[0]
|
|
||||||
else:
|
|
||||||
print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
|
|
||||||
dumpfile)
|
|
||||||
|
|
||||||
|
|
||||||
class Checker:
|
class Checker:
|
||||||
|
|
||||||
def __init__(self, checkext=1):
|
checkext = CHECKEXT
|
||||||
|
verbose = VERBOSE
|
||||||
|
maxpage = MAXPAGE
|
||||||
|
roundsize = ROUNDSIZE
|
||||||
|
|
||||||
|
validflags = tuple(dir())
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
self.reset()
|
self.reset()
|
||||||
self.checkext = checkext
|
|
||||||
|
def setflags(self, **kw):
|
||||||
|
for key in kw.keys():
|
||||||
|
if key not in self.validflags:
|
||||||
|
raise NameError, "invalid keyword argument: %s" % str(key)
|
||||||
|
for key, value in kw.items():
|
||||||
|
setattr(self, key, value)
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
self.roots = []
|
self.roots = []
|
||||||
|
|
@ -243,6 +250,7 @@ class Checker:
|
||||||
return (self.roots, self.todo, self.done, self.bad, self.round)
|
return (self.roots, self.todo, self.done, self.bad, self.round)
|
||||||
|
|
||||||
def __setstate__(self, state):
|
def __setstate__(self, state):
|
||||||
|
self.reset()
|
||||||
(self.roots, self.todo, self.done, self.bad, self.round) = state
|
(self.roots, self.todo, self.done, self.bad, self.round) = state
|
||||||
for root in self.roots:
|
for root in self.roots:
|
||||||
self.addrobot(root)
|
self.addrobot(root)
|
||||||
|
|
@ -268,24 +276,24 @@ class Checker:
|
||||||
if self.robots.has_key(root): return
|
if self.robots.has_key(root): return
|
||||||
url = urlparse.urljoin(root, "/robots.txt")
|
url = urlparse.urljoin(root, "/robots.txt")
|
||||||
self.robots[root] = rp = robotparser.RobotFileParser()
|
self.robots[root] = rp = robotparser.RobotFileParser()
|
||||||
if verbose > 2:
|
if self.verbose > 2:
|
||||||
print "Parsing", url
|
print "Parsing", url
|
||||||
rp.debug = verbose > 3
|
rp.debug = self.verbose > 3
|
||||||
rp.set_url(url)
|
rp.set_url(url)
|
||||||
try:
|
try:
|
||||||
rp.read()
|
rp.read()
|
||||||
except IOError, msg:
|
except IOError, msg:
|
||||||
if verbose > 1:
|
if self.verbose > 1:
|
||||||
print "I/O error parsing", url, ":", msg
|
print "I/O error parsing", url, ":", msg
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while self.todo:
|
while self.todo:
|
||||||
self.round = self.round + 1
|
self.round = self.round + 1
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
print
|
print
|
||||||
print "Round %d (%s)" % (self.round, self.status())
|
print "Round %d (%s)" % (self.round, self.status())
|
||||||
print
|
print
|
||||||
urls = self.todo.keys()[:roundsize]
|
urls = self.todo.keys()[:self.roundsize]
|
||||||
for url in urls:
|
for url in urls:
|
||||||
self.dopage(url)
|
self.dopage(url)
|
||||||
|
|
||||||
|
|
@ -325,9 +333,9 @@ class Checker:
|
||||||
print " msg", msg
|
print " msg", msg
|
||||||
|
|
||||||
def dopage(self, url):
|
def dopage(self, url):
|
||||||
if verbose > 1:
|
if self.verbose > 1:
|
||||||
if verbose > 2:
|
if self.verbose > 2:
|
||||||
show("Check ", url, " from", self.todo[url])
|
self.show("Check ", url, " from", self.todo[url])
|
||||||
else:
|
else:
|
||||||
print "Check ", url
|
print "Check ", url
|
||||||
page = self.getpage(url)
|
page = self.getpage(url)
|
||||||
|
|
@ -346,17 +354,17 @@ class Checker:
|
||||||
|
|
||||||
def newdonelink(self, url, origin):
|
def newdonelink(self, url, origin):
|
||||||
self.done[url].append(origin)
|
self.done[url].append(origin)
|
||||||
if verbose > 3:
|
if self.verbose > 3:
|
||||||
print " Done link", url
|
print " Done link", url
|
||||||
|
|
||||||
def newtodolink(self, url, origin):
|
def newtodolink(self, url, origin):
|
||||||
if self.todo.has_key(url):
|
if self.todo.has_key(url):
|
||||||
self.todo[url].append(origin)
|
self.todo[url].append(origin)
|
||||||
if verbose > 3:
|
if self.verbose > 3:
|
||||||
print " Seen todo link", url
|
print " Seen todo link", url
|
||||||
else:
|
else:
|
||||||
self.todo[url] = [origin]
|
self.todo[url] = [origin]
|
||||||
if verbose > 3:
|
if self.verbose > 3:
|
||||||
print " New todo link", url
|
print " New todo link", url
|
||||||
|
|
||||||
def markdone(self, url):
|
def markdone(self, url):
|
||||||
|
|
@ -373,56 +381,79 @@ class Checker:
|
||||||
|
|
||||||
def getpage(self, url):
|
def getpage(self, url):
|
||||||
if url[:7] == 'mailto:' or url[:5] == 'news:':
|
if url[:7] == 'mailto:' or url[:5] == 'news:':
|
||||||
if verbose > 1: print " Not checking mailto/news URL"
|
if self.verbose > 1: print " Not checking mailto/news URL"
|
||||||
return None
|
return None
|
||||||
isint = self.inroots(url)
|
isint = self.inroots(url)
|
||||||
if not isint and not self.checkext:
|
if not isint:
|
||||||
if verbose > 1: print " Not checking ext link"
|
if not self.checkext:
|
||||||
|
if self.verbose > 1: print " Not checking ext link"
|
||||||
|
return None
|
||||||
|
f = self.openpage(url)
|
||||||
|
if f:
|
||||||
|
self.safeclose(f)
|
||||||
return None
|
return None
|
||||||
|
text, nurl = self.readhtml(url)
|
||||||
|
if nurl != url:
|
||||||
|
if self.verbose > 1:
|
||||||
|
print " Redirected to", nurl
|
||||||
|
url = nurl
|
||||||
|
if text:
|
||||||
|
return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
|
||||||
|
|
||||||
|
def readhtml(self, url):
|
||||||
|
text = None
|
||||||
|
f, url = self.openhtml(url)
|
||||||
|
if f:
|
||||||
|
text = f.read()
|
||||||
|
f.close()
|
||||||
|
return text, url
|
||||||
|
|
||||||
|
def openhtml(self, url):
|
||||||
|
f = self.openpage(url)
|
||||||
|
if f:
|
||||||
|
url = f.geturl()
|
||||||
|
info = f.info()
|
||||||
|
if not self.checkforhtml(info, url):
|
||||||
|
self.safeclose(f)
|
||||||
|
f = None
|
||||||
|
return f, url
|
||||||
|
|
||||||
|
def openpage(self, url):
|
||||||
try:
|
try:
|
||||||
f = self.urlopener.open(url)
|
return self.urlopener.open(url)
|
||||||
except IOError, msg:
|
except IOError, msg:
|
||||||
msg = sanitize(msg)
|
msg = self.sanitize(msg)
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
print "Error ", msg
|
print "Error ", msg
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
show(" HREF ", url, " from", self.todo[url])
|
self.show(" HREF ", url, " from", self.todo[url])
|
||||||
self.setbad(url, msg)
|
self.setbad(url, msg)
|
||||||
return None
|
return None
|
||||||
if not isint:
|
|
||||||
if verbose > 1: print " Not gathering links from ext URL"
|
def checkforhtml(self, info, url):
|
||||||
safeclose(f)
|
|
||||||
return None
|
|
||||||
nurl = f.geturl()
|
|
||||||
info = f.info()
|
|
||||||
if info.has_key('content-type'):
|
if info.has_key('content-type'):
|
||||||
ctype = string.lower(info['content-type'])
|
ctype = string.lower(info['content-type'])
|
||||||
else:
|
else:
|
||||||
ctype = None
|
if url[-1:] == "/":
|
||||||
if nurl != url:
|
return 1
|
||||||
if verbose > 1:
|
ctype, encoding = mimetypes.guess_type(url)
|
||||||
print " Redirected to", nurl
|
if ctype == 'text/html':
|
||||||
if not ctype:
|
return 1
|
||||||
ctype, encoding = mimetypes.guess_type(nurl)
|
else:
|
||||||
if ctype != 'text/html':
|
if self.verbose > 1:
|
||||||
safeclose(f)
|
|
||||||
if verbose > 1:
|
|
||||||
print " Not HTML, mime type", ctype
|
print " Not HTML, mime type", ctype
|
||||||
return None
|
return 0
|
||||||
text = f.read()
|
|
||||||
f.close()
|
|
||||||
return Page(text, nurl)
|
|
||||||
|
|
||||||
def setgood(self, url):
|
def setgood(self, url):
|
||||||
if self.bad.has_key(url):
|
if self.bad.has_key(url):
|
||||||
del self.bad[url]
|
del self.bad[url]
|
||||||
self.changed = 1
|
self.changed = 1
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
print "(Clear previously seen error)"
|
print "(Clear previously seen error)"
|
||||||
|
|
||||||
def setbad(self, url, msg):
|
def setbad(self, url, msg):
|
||||||
if self.bad.has_key(url) and self.bad[url] == msg:
|
if self.bad.has_key(url) and self.bad[url] == msg:
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
print "(Seen this error before)"
|
print "(Seen this error before)"
|
||||||
return
|
return
|
||||||
self.bad[url] = msg
|
self.bad[url] = msg
|
||||||
|
|
@ -444,23 +475,88 @@ class Checker:
|
||||||
except KeyError:
|
except KeyError:
|
||||||
self.errors[url] = [triple]
|
self.errors[url] = [triple]
|
||||||
|
|
||||||
|
# The following used to be toplevel functions; they have been
|
||||||
|
# changed into methods so they can be overridden in subclasses.
|
||||||
|
|
||||||
|
def show(self, p1, link, p2, origins):
|
||||||
|
print p1, link
|
||||||
|
i = 0
|
||||||
|
for source, rawlink in origins:
|
||||||
|
i = i+1
|
||||||
|
if i == 2:
|
||||||
|
p2 = ' '*len(p2)
|
||||||
|
print p2, source,
|
||||||
|
if rawlink != link: print "(%s)" % rawlink,
|
||||||
|
print
|
||||||
|
|
||||||
|
def sanitize(self, msg):
|
||||||
|
if isinstance(IOError, ClassType) and isinstance(msg, IOError):
|
||||||
|
# Do the other branch recursively
|
||||||
|
msg.args = self.sanitize(msg.args)
|
||||||
|
elif isinstance(msg, TupleType):
|
||||||
|
if len(msg) >= 4 and msg[0] == 'http error' and \
|
||||||
|
isinstance(msg[3], InstanceType):
|
||||||
|
# Remove the Message instance -- it may contain
|
||||||
|
# a file object which prevents pickling.
|
||||||
|
msg = msg[:3] + msg[4:]
|
||||||
|
return msg
|
||||||
|
|
||||||
|
def safeclose(self, f):
|
||||||
|
try:
|
||||||
|
url = f.geturl()
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if url[:4] == 'ftp:' or url[:7] == 'file://':
|
||||||
|
# Apparently ftp connections don't like to be closed
|
||||||
|
# prematurely...
|
||||||
|
text = f.read()
|
||||||
|
f.close()
|
||||||
|
|
||||||
|
def save_pickle(self, dumpfile=DUMPFILE):
|
||||||
|
if not self.changed:
|
||||||
|
if self.verbose > 0:
|
||||||
|
print
|
||||||
|
print "No need to save checkpoint"
|
||||||
|
elif not dumpfile:
|
||||||
|
if self.verbose > 0:
|
||||||
|
print "No dumpfile, won't save checkpoint"
|
||||||
|
else:
|
||||||
|
if self.verbose > 0:
|
||||||
|
print
|
||||||
|
print "Saving checkpoint to %s ..." % dumpfile
|
||||||
|
newfile = dumpfile + ".new"
|
||||||
|
f = open(newfile, "wb")
|
||||||
|
pickle.dump(self, f)
|
||||||
|
f.close()
|
||||||
|
try:
|
||||||
|
os.unlink(dumpfile)
|
||||||
|
except os.error:
|
||||||
|
pass
|
||||||
|
os.rename(newfile, dumpfile)
|
||||||
|
if self.verbose > 0:
|
||||||
|
print "Done."
|
||||||
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class Page:
|
class Page:
|
||||||
|
|
||||||
def __init__(self, text, url):
|
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE):
|
||||||
self.text = text
|
self.text = text
|
||||||
self.url = url
|
self.url = url
|
||||||
|
self.verbose = verbose
|
||||||
|
self.maxpage = maxpage
|
||||||
|
|
||||||
def getlinkinfos(self):
|
def getlinkinfos(self):
|
||||||
size = len(self.text)
|
size = len(self.text)
|
||||||
if size > maxpage:
|
if size > self.maxpage:
|
||||||
if verbose > 0:
|
if self.verbose > 0:
|
||||||
print "Skip huge file", self.url
|
print "Skip huge file", self.url
|
||||||
print " (%.0f Kbytes)" % (size*0.001)
|
print " (%.0f Kbytes)" % (size*0.001)
|
||||||
return []
|
return []
|
||||||
if verbose > 2:
|
if self.verbose > 2:
|
||||||
print " Parsing", self.url, "(%d bytes)" % size
|
print " Parsing", self.url, "(%d bytes)" % size
|
||||||
parser = MyHTMLParser()
|
parser = MyHTMLParser(verbose=self.verbose)
|
||||||
parser.feed(self.text)
|
parser.feed(self.text)
|
||||||
parser.close()
|
parser.close()
|
||||||
rawlinks = parser.getlinks()
|
rawlinks = parser.getlinks()
|
||||||
|
|
@ -529,10 +625,11 @@ class MyURLopener(urllib.FancyURLopener):
|
||||||
|
|
||||||
class MyHTMLParser(sgmllib.SGMLParser):
|
class MyHTMLParser(sgmllib.SGMLParser):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, verbose=VERBOSE):
|
||||||
self.base = None
|
self.base = None
|
||||||
self.links = {}
|
self.links = {}
|
||||||
sgmllib.SGMLParser.__init__ (self)
|
self.myverbose = verbose
|
||||||
|
sgmllib.SGMLParser.__init__(self)
|
||||||
|
|
||||||
def start_a(self, attributes):
|
def start_a(self, attributes):
|
||||||
self.link_attr(attributes, 'href')
|
self.link_attr(attributes, 'href')
|
||||||
|
|
@ -559,7 +656,7 @@ class MyHTMLParser(sgmllib.SGMLParser):
|
||||||
if name == 'href':
|
if name == 'href':
|
||||||
if value: value = string.strip(value)
|
if value: value = string.strip(value)
|
||||||
if value:
|
if value:
|
||||||
if verbose > 1:
|
if self.myverbose > 1:
|
||||||
print " Base", value
|
print " Base", value
|
||||||
self.base = value
|
self.base = value
|
||||||
|
|
||||||
|
|
@ -570,41 +667,5 @@ class MyHTMLParser(sgmllib.SGMLParser):
|
||||||
return self.base
|
return self.base
|
||||||
|
|
||||||
|
|
||||||
def show(p1, link, p2, origins):
|
|
||||||
print p1, link
|
|
||||||
i = 0
|
|
||||||
for source, rawlink in origins:
|
|
||||||
i = i+1
|
|
||||||
if i == 2:
|
|
||||||
p2 = ' '*len(p2)
|
|
||||||
print p2, source,
|
|
||||||
if rawlink != link: print "(%s)" % rawlink,
|
|
||||||
print
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize(msg):
|
|
||||||
if (type(msg) == TupleType and
|
|
||||||
len(msg) >= 4 and
|
|
||||||
msg[0] == 'http error' and
|
|
||||||
type(msg[3]) == InstanceType):
|
|
||||||
# Remove the Message instance -- it may contain
|
|
||||||
# a file object which prevents pickling.
|
|
||||||
msg = msg[:3] + msg[4:]
|
|
||||||
return msg
|
|
||||||
|
|
||||||
|
|
||||||
def safeclose(f):
|
|
||||||
try:
|
|
||||||
url = f.geturl()
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
if url[:4] == 'ftp:' or url[:7] == 'file://':
|
|
||||||
# Apparently ftp connections don't like to be closed
|
|
||||||
# prematurely...
|
|
||||||
text = f.read()
|
|
||||||
f.close()
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue