Mirror of https://github.com/python/cpython.git, synced 2025-09-27 10:50:04 +00:00

Adapt to new webchecker structure.  Due to the better structure of
getpage(), much less duplicate code is needed -- we only need to
override readhtml().

This commit is contained in:
parent 00756bd4a6
commit 1a7eae919a

1 changed file with 33 additions and 59 deletions
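The change the message describes is a textbook template-method split: in the
restructured webchecker, Checker.getpage() keeps all of the shared
bookkeeping, and a subclass redirects where page text comes from by
overriding a single hook, readhtml(), which returns a (text, url) pair. The
toy below (same Python 2 dialect as the file; a sketch of the pattern, not
the real webchecker source, and load_local() is a hypothetical helper) shows
why that one override is now enough:

class Checker:

    def getpage(self, url):
        # Shared bookkeeping lives here once; the fetch itself is
        # delegated to the readhtml() hook.
        text, nurl = self.readhtml(url)
        if text is None:
            return None
        return (text, nurl)              # stands in for webchecker.Page

    def readhtml(self, url):
        # Default hook: pretend this fetches over the network.
        return "<html><body>hello</body></html>", url

class Sucker(Checker):

    def readhtml(self, url):
        # Mirror variant: same signature, different source -- consult
        # the local mirror first, fall back to the base-class fetch.
        cached = self.load_local(url)    # hypothetical helper
        if cached is not None:
            return cached, url
        return Checker.readhtml(self, url)

    def load_local(self, url):
        return None                      # toy: nothing mirrored yet

print Sucker().getpage("http://example.com/")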
@@ -2,7 +2,7 @@
 
 """A variant on webchecker that creates a mirror copy of a remote site."""
 
-__version__ = "0.1"
+__version__ = "$Revision$"
 
 import os
 import sys
@@ -11,22 +11,28 @@ import urllib
 import getopt
 
 import webchecker
-verbose = webchecker.verbose
+
+# Extract real version number if necessary
+if __version__[0] == '$':
+    _v = string.split(__version__)
+    if len(_v) == 3:
+        __version__ = _v[1]
 
 def main():
-    global verbose
+    verbose = webchecker.VERBOSE
     try:
         opts, args = getopt.getopt(sys.argv[1:], "qv")
     except getopt.error, msg:
         print msg
-        print "usage:", sys.argv[0], "[-v] ... [rooturl] ..."
+        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
         return 2
     for o, a in opts:
         if o == "-q":
-            webchecker.verbose = verbose = 0
+            verbose = 0
         if o == "-v":
-            webchecker.verbose = verbose = verbose + 1
-    c = Sucker(0)
+            verbose = verbose + 1
+    c = Sucker()
+    c.setflags(verbose=verbose)
     c.urlopener.addheaders = [
         ('User-agent', 'websucker/%s' % __version__),
         ]
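The module-level block just added pairs with the first hunk's switch from
"0.1" to the RCS keyword "$Revision$": on checkout the keyword expands to
something like "$Revision: 1.5 $", and splitting on whitespace leaves the
bare number in the middle field. A worked example (the 1.5 is a made-up
sample value, using the file's own string-module idiom):

import string

__version__ = "$Revision: 1.5 $"     # sample expanded keyword

if __version__[0] == '$':
    _v = string.split(__version__)   # ['$Revision:', '1.5', '$']
    if len(_v) == 3:
        __version__ = _v[1]

print __version__                    # prints: 1.5

An unexpanded "$Revision$" splits into a single field, so the
len(_v) == 3 guard leaves it untouched.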
@@ -38,63 +44,31 @@ def main():
 
 class Sucker(webchecker.Checker):
 
-    # Alas, had to copy this to make one change...
-    def getpage(self, url):
-        if url[:7] == 'mailto:' or url[:5] == 'news:':
-            if verbose > 1: print " Not checking mailto/news URL"
-            return None
-        isint = self.inroots(url)
-        if not isint and not self.checkext:
-            if verbose > 1: print " Not checking ext link"
-            return None
+    checkext = 0
+
+    def readhtml(self, url):
+        text = None
         path = self.savefilename(url)
-        saved = 0
         try:
             f = open(path, "rb")
         except IOError:
-            try:
-                f = self.urlopener.open(url)
-            except IOError, msg:
-                msg = webchecker.sanitize(msg)
-                if verbose > 0:
-                    print "Error ", msg
-                    if verbose > 0:
-                        webchecker.show(" HREF ", url, " from", self.todo[url])
-                self.setbad(url, msg)
-                return None
-            if not isint:
-                if verbose > 1: print " Not gathering links from ext URL"
-                safeclose(f)
-                return None
-            nurl = f.geturl()
-            if nurl != url:
-                path = self.savefilename(nurl)
-            info = f.info()
+            f = self.openpage(url)
+            if f:
+                info = f.info()
+                nurl = f.geturl()
+                if nurl != url:
+                    url = nurl
+                    path = self.savefilename(url)
+                text = f.read()
+                f.close()
+                self.savefile(text, path)
+                if not self.checkforhtml(info, url):
+                    text = None
         else:
-            if verbose: print "Loading cached URL", url
-            saved = 1
-            nurl = url
-            info = {}
-            if url[-1:] == "/":
-                info["content-type"] = "text/html"
-        text = f.read()
-        if not saved: self.savefile(text, path)
-        if info.has_key('content-type'):
-            ctype = string.lower(info['content-type'])
-        else:
-            ctype = None
-        if nurl != url:
-            if verbose > 1:
-                print " Redirected to", nurl
-        if not ctype:
-            ctype, encoding = webchecker.mimetypes.guess_type(nurl)
-        if ctype != 'text/html':
-            webchecker.safeclose(f)
-            if verbose > 1:
-                print " Not HTML, mime type", ctype
-            return None
-        f.close()
-        return webchecker.Page(text, nurl)
+            if self.checkforhtml({}, url):
+                text = f.read()
+            f.close()
+        return text, url
 
     def savefile(self, text, path):
         dir, base = os.path.split(path)
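The rewritten method is a cache-first read: serve the file already mirrored
to disk when it exists, otherwise fetch the page, save it for next time, and
hand back None for anything checkforhtml() says is not worth parsing for
links. The standalone sketch below isolates that policy (fetch and save are
stand-in callables, not webchecker APIs):

def read_cached(path, fetch, save):
    try:
        f = open(path, "rb")
    except IOError:
        # Cache miss: go to the network, then persist the result so
        # the next run finds it locally.
        text = fetch()
        if text is not None:
            save(text, path)
        return text
    else:
        # Cache hit: the page was mirrored on an earlier run.
        text = f.read()
        f.close()
        return text

Per the new usage string, a run looks like
python websucker.py -v http://www.example.com/ (the host is a placeholder),
with -q silencing output and each extra -v raising webchecker's verbosity.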