Samuel L. Bayer:

- same fixes from webchecker.py
- incorporated small diff between current webchecker.py and 1.5.2
- fixed bug where "extra roots" added with the -t argument were being
  checked as real roots, not just as possible continuations
- added -a argument to suppress checking of name anchors

[And untabified --GvR]
This commit is contained in:
Guido van Rossum 1999-11-17 15:02:53 +00:00
parent dbd5c3e63b
commit f97eecccb7

View file

@ -124,6 +124,7 @@ Options:
-t root -- specify root dir which should be treated as internal (can repeat)
-v -- verbose operation; repeating -v will increase verbosity
-x -- don't check external links (these are often slow to check)
-a -- don't check name anchors
Arguments:
@ -166,6 +167,7 @@ MAXPAGE = 150000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser
NONAMES = 0 # Force name anchor checking
# Global variables
@ -183,7 +185,7 @@ def main():
try:
# Begin SLB 2/24/99: Added -t option here.
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx')
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
# End SLB 2/24/99
except getopt.error, msg:
@ -195,6 +197,7 @@ def main():
# Begin SLB 2/24/99: Added extra_roots variable to
# collect extra roots.
extra_roots = []
nonames = NONAMES
# End SLB 2/24/99
for o, a in opts:
@ -215,6 +218,8 @@ def main():
# -t option.
if o == '-t':
extra_roots.append(a)
if o == '-a':
nonames = not nonames
# End SLB 2/24/99
if o == '-v':
@ -231,7 +236,9 @@ def main():
c = Checker()
c.setflags(checkext=checkext, verbose=verbose,
maxpage=maxpage, roundsize=roundsize)
maxpage=maxpage, roundsize=roundsize,
nonames=nonames
)
if not restart and not args:
args.append(DEFROOT)
@ -249,7 +256,7 @@ def main():
# directory component.
if root[-1] != "/":
root = root + "/"
c.addroot(root)
c.addroot(root, add_to_do = 0)
# End SLB 2/24/99
try:
@ -294,6 +301,7 @@ class Checker:
verbose = VERBOSE
maxpage = MAXPAGE
roundsize = ROUNDSIZE
nonames = NONAMES
validflags = tuple(dir())
@ -348,7 +356,7 @@ class Checker:
for url in self.bad.keys():
self.markerror(url)
def addroot(self, root):
def addroot(self, root, add_to_do = 1):
if root not in self.roots:
troot = root
scheme, netloc, path, params, query, fragment = \
@ -363,6 +371,7 @@ class Checker:
# Begin SLB 2/24/99: Modified this call to respect
# the fact that the "done" and "todo" dictionaries
# are now (URL, fragment) pairs
if add_to_do:
self.newlink((root, ""), ("<root>", root))
# End SLB 2/24/99
@ -441,9 +450,12 @@ class Checker:
" from", self.todo[url_pair])
else:
self.message("Check %s", self.format_url(url_pair))
url, local_fragment = url_pair
if local_fragment and self.nonames:
self.markdone(url_pair)
return
page = self.getpage(url_pair)
if page:
url, local_fragment = url_pair
# Store the page which corresponds to this URL.
self.name_table[url] = page
# If there is a fragment in this url_pair, and it's not
@ -473,12 +485,23 @@ class Checker:
self.newtodolink(url, origin)
def newdonelink(self, url, origin):
    """Note one more origin for a link that is already in self.done.

    url is the (URL, fragment) pair used as the key into self.done
    and self.bad.  origin is a (source, rawlink) pair.  The origin is
    appended to the link's list of origins unless it is already
    there.  If the link is recorded in self.bad, the error is also
    reported against this origin's source.
    """
    origins = self.done[url]
    if origin not in origins:
        origins.append(origin)
    # url is a (URL, fragment) pair, so it goes through format_url()
    # before being printed.
    self.note(3, " Done link %s", self.format_url(url))
    # Nothing more to do unless the link is already known to be bad.
    if not self.bad.has_key(url):
        return
    # Known-bad link: make sure this origin gets the error as well.
    source, rawlink = origin
    self.seterror(source, (url, rawlink, self.bad[url]))
def newtodolink(self, url, origin):
@ -487,6 +510,7 @@ class Checker:
# to call self.format_url(), since the URL here
# is now a (URL, fragment) pair.
if self.todo.has_key(url):
if origin not in self.todo[url]:
self.todo[url].append(origin)
self.note(3, " Seen todo link %s", self.format_url(url))
else:
@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener):
def open_file(self, url):
path = urllib.url2pathname(urllib.unquote(url))
if os.path.isdir(path):
if path[-1] != os.sep:
url = url + '/'
if os.path.isdir(path):
indexpath = os.path.join(path, "index.html")
if os.path.exists(indexpath):
return self.open_file(url + "index.html")
@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener):
s.write('<A HREF="%s">%s</A>\n' % (q, q))
s.seek(0)
return s
return urllib.FancyURLopener.open_file(self, path)
return urllib.FancyURLopener.open_file(self, url)
class MyHTMLParser(sgmllib.SGMLParser):