mirror of
https://github.com/python/cpython.git
synced 2025-07-24 19:54:21 +00:00

(with one small bugfix in bgen/bgen/scantools.py) This replaces string module functions with string methods for the stuff in the Tools directory. Several uses of string.letters etc. still remain.
125 lines
3.3 KiB
Python
Executable file
125 lines
3.3 KiB
Python
Executable file
#! /usr/bin/env python
|
|
|
|
"""A variant on webchecker that creates a mirror copy of a remote site."""
|
|
|
|
__version__ = "$Revision$"
|
|
|
|
import os
|
|
import sys
|
|
import urllib
|
|
import getopt
|
|
|
|
import webchecker
|
|
|
|
# Reduce an expanded revision keyword to the bare number.  A value such
# as "$Revision: 1.23 $" splits into three tokens and the middle one is
# kept; the unexpanded "$Revision$" is a single token and stays as it is.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
|
|
|
|
def main():
    """Command-line entry point: mirror every root URL named in sys.argv.

    Options:
        -q  set the verbosity to 0
        -v  raise the verbosity by one (may be given more than once)

    Every remaining argument is added to a Sucker as a root URL and the
    Sucker is then run.

    Returns 2 when the options cannot be parsed (after writing the
    problem and a usage line to stderr) and None after a completed run.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error:
        # Fetch the exception through sys.exc_info() so the handler does
        # not depend on a particular spelling of the except clause.
        err = sys.exc_info()[1]
        # Diagnostics belong on stderr; stdout carries the progress
        # output of the run itself.
        sys.stderr.write(str(err) + "\n")
        sys.stderr.write("usage: %s [-qv] ... [rooturl] ...\n" % sys.argv[0])
        return 2
    for opt, _value in opts:
        if opt == "-q":
            verbose = 0
        elif opt == "-v":
            verbose = verbose + 1
    sucker = Sucker()
    sucker.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers it talks to.
    sucker.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        sucker.addroot(arg)
    print("Run...")
    sucker.run()
|
|
|
|
class Sucker(webchecker.Checker):
    """Checker that stores each page it reads in a local mirror tree.

    A page is looked up on disk first, under the name computed by
    savefilename(); only when it cannot be opened there is it fetched
    with openpage() and written out with savefile().
    """

    # Overrides of settings inherited from webchecker.Checker.
    # NOTE(review): checkext = 0 presumably turns off the checking of
    # external links -- confirm against webchecker.
    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.

    def readhtml(self, url_pair):
        """Return (text, url) for the page named by url_pair[0].

        text is the page content when checkforhtml() accepts the page
        and None otherwise (also None when the page cannot be opened).
        url is the requested URL, or the one reported by geturl() on the
        fetched page when that differs.

        A page fetched from the network is always written to the mirror
        with savefile(), whether or not it is HTML.
        """
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No local copy yet: fetch the page and store it.
            f = self.openpage(url_pair)
            if f:
                # try/finally so the connection is released even when
                # reading the page fails half-way.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        # File the page under the URL it really came from.
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy found: no headers are available, so
            # checkforhtml() gets an empty info mapping.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path, creating the parent directories first.

        Failure to create a directory or to write the file is reported
        through self.message() and otherwise ignored, so one unwritable
        page does not stop the whole run.
        """
        dirname = os.path.dirname(path)
        try:
            makedirs(dirname)
            f = open(path, "wb")
            # Close the file even when the write itself fails.
            try:
                f.write(text)
            finally:
                f.close()
        except (IOError, os.error):
            # Fetch the exception through sys.exc_info() so the handler
            # does not depend on a particular spelling of the except
            # clause.
            err = sys.exc_info()[1]
            self.message("didn't save %s: %s", path, str(err))
        else:
            self.message("saved %s", path)

    def savefilename(self, url):
        """Return the relative local file name under which url is stored.

        The name is "<host>/<path>" using the local path separator; user
        information and port are dropped from the host, which is
        lower-cased.  A URL that names a directory (empty path, trailing
        "/", or a final "." / ".." segment) maps to "index.html" inside
        it.  For example, with "/" as separator,
        "http://WWW.Example.com:8080/a/b/" gives
        "www.example.com/a/b/index.html".

        "." and ".." segments are resolved here and a ".." can never
        climb above the host directory, so a hostile or sloppy link such
        as "http://host/../../x" cannot place a file outside the mirror.
        Only "/"-delimited segments are examined.
        """
        rest = urllib.splittype(url)[1]
        host, path = urllib.splithost(rest)
        host = urllib.splituser(host)[1]
        host = urllib.splitnport(host)[0]
        host = host.lower()

        raw_segments = path.lstrip("/").split("/")
        segments = []
        for segment in raw_segments:
            if segment == "..":
                # Step back one level, but never past the host directory.
                del segments[-1:]
            elif segment and segment != ".":
                segments.append(segment)
        if raw_segments[-1] in ("", ".", ".."):
            # The URL refers to a directory rather than to a document.
            segments.append("index.html")

        path = os.sep.join(segments)
        if os.sep != "/" and os.name == "mac":
            # NOTE(review): presumably a leading separator is what marks
            # a relative path on this platform -- confirm.
            path = os.sep + path
        return os.path.join(host, path)
|
|
|
|
def makedirs(dir):
    """Create directory dir, together with any missing parents.

    * An empty name or an existing directory is left alone.
    * When dir exists but is not a directory, it is turned into one and
      the old file is moved to "index.html" inside it (best effort: any
      os.error during that conversion is ignored).
    * A name that ends in a path separator, e.g. "a/b/", is treated like
      the same name without it.
    * A name that has no final component and no distinct parent (such as
      a file system root that does not exist) cannot be created; a note
      is printed and nothing is done.

    Errors raised by os.mkdir() while creating a missing directory
    propagate to the caller.
    """
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            # A file occupies the name of a directory that is needed
            # now: move it aside, create the directory, and put the file
            # back inside it as index.html.
            backup = dir + ".bak"
            try:
                os.rename(dir, backup)
                os.mkdir(dir)
                os.rename(backup, os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Only a trailing separator was split off; the directory to
            # create is head itself.
            makedirs(head)
        else:
            print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    # os.mkdir's default mode is octal 777 (further limited by the
    # umask), so no explicit mode argument is needed.
    os.mkdir(dir)
|
|
|
|
if __name__ == '__main__':
    # main() hands back 2 after a usage error and nothing otherwise; a
    # missing status is reported to the shell as 0.
    status = main()
    sys.exit(status or 0)
|