mirror of
https://github.com/python/cpython.git
synced 2025-11-03 03:22:27 +00:00
Samuel L. Bayer:
- same fixes from webchecker.py
- incorporated small diff between current webchecker.py and 1.5.2
- fixed bug where "extra roots" added with the -t argument were being checked as real roots, not just as possible continuations
- added -a argument to suppress checking of name anchors

[And untabified --GvR]
This commit is contained in:
parent
dbd5c3e63b
commit
f97eecccb7
1 changed file with 178 additions and 154 deletions
|
|
@ -124,6 +124,7 @@ Options:
|
|||
-t root -- specify root dir which should be treated as internal (can repeat)
|
||||
-v -- verbose operation; repeating -v will increase verbosity
|
||||
-x -- don't check external links (these are often slow to check)
|
||||
-a -- don't check name anchors
|
||||
|
||||
Arguments:
|
||||
|
||||
|
|
@ -166,6 +167,7 @@ MAXPAGE = 150000 # Ignore files bigger than this
|
|||
ROUNDSIZE = 50 # Number of links processed per round
|
||||
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
|
||||
AGENTNAME = "webchecker" # Agent name for robots.txt parser
|
||||
NONAMES = 0 # Force name anchor checking
|
||||
|
||||
|
||||
# Global variables
|
||||
|
|
@ -183,7 +185,7 @@ def main():
|
|||
try:
|
||||
|
||||
# Begin SLB 2/24/99: Added -t option here.
|
||||
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx')
|
||||
opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
|
||||
# End SLB 2/24/99
|
||||
|
||||
except getopt.error, msg:
|
||||
|
|
@ -195,6 +197,7 @@ def main():
|
|||
# Begin SLB 2/24/99: Added extra_roots variable to
|
||||
# collect extra roots.
|
||||
extra_roots = []
|
||||
nonames = NONAMES
|
||||
# End SLB 2/24/99
|
||||
|
||||
for o, a in opts:
|
||||
|
|
@ -215,6 +218,8 @@ def main():
|
|||
# -t option.
|
||||
if o == '-t':
|
||||
extra_roots.append(a)
|
||||
if o == '-a':
|
||||
nonames = not nonames
|
||||
# End SLB 2/24/99
|
||||
|
||||
if o == '-v':
|
||||
|
|
@ -231,7 +236,9 @@ def main():
|
|||
c = Checker()
|
||||
|
||||
c.setflags(checkext=checkext, verbose=verbose,
|
||||
maxpage=maxpage, roundsize=roundsize)
|
||||
maxpage=maxpage, roundsize=roundsize,
|
||||
nonames=nonames
|
||||
)
|
||||
|
||||
if not restart and not args:
|
||||
args.append(DEFROOT)
|
||||
|
|
@ -249,7 +256,7 @@ def main():
|
|||
# directory component.
|
||||
if root[-1] != "/":
|
||||
root = root + "/"
|
||||
c.addroot(root)
|
||||
c.addroot(root, add_to_do = 0)
|
||||
# End SLB 2/24/99
|
||||
|
||||
try:
|
||||
|
|
@ -294,6 +301,7 @@ class Checker:
|
|||
verbose = VERBOSE
|
||||
maxpage = MAXPAGE
|
||||
roundsize = ROUNDSIZE
|
||||
nonames = NONAMES
|
||||
|
||||
validflags = tuple(dir())
|
||||
|
||||
|
|
@ -348,7 +356,7 @@ class Checker:
|
|||
for url in self.bad.keys():
|
||||
self.markerror(url)
|
||||
|
||||
def addroot(self, root):
|
||||
def addroot(self, root, add_to_do = 1):
|
||||
if root not in self.roots:
|
||||
troot = root
|
||||
scheme, netloc, path, params, query, fragment = \
|
||||
|
|
@ -363,6 +371,7 @@ class Checker:
|
|||
# Begin SLB 2/24/99: Modified this call to respect
|
||||
# the fact that the "done" and "todo" dictionaries
|
||||
# are now (URL, fragment) pairs
|
||||
if add_to_do:
|
||||
self.newlink((root, ""), ("<root>", root))
|
||||
# End SLB 2/24/99
|
||||
|
||||
|
|
@ -441,9 +450,12 @@ class Checker:
|
|||
" from", self.todo[url_pair])
|
||||
else:
|
||||
self.message("Check %s", self.format_url(url_pair))
|
||||
url, local_fragment = url_pair
|
||||
if local_fragment and self.nonames:
|
||||
self.markdone(url_pair)
|
||||
return
|
||||
page = self.getpage(url_pair)
|
||||
if page:
|
||||
url, local_fragment = url_pair
|
||||
# Store the page which corresponds to this URL.
|
||||
self.name_table[url] = page
|
||||
# If there is a fragment in this url_pair, and it's not
|
||||
|
|
@ -473,12 +485,23 @@ class Checker:
|
|||
self.newtodolink(url, origin)
|
||||
|
||||
def newdonelink(self, url, origin):
|
||||
|
||||
if origin not in self.done[url]:
|
||||
self.done[url].append(origin)
|
||||
|
||||
# Begin SLB 2/24/99: changed reference to URL
|
||||
# to call self.format_url(), since the URL here
|
||||
# is now a (URL, fragment) pair.
|
||||
self.note(3, " Done link %s", self.format_url(url))
|
||||
|
||||
# SLB 11/11/99: Make sure that if it's bad, that
|
||||
# the origin gets added.
|
||||
|
||||
if self.bad.has_key(url):
|
||||
source, rawlink = origin
|
||||
triple = url, rawlink, self.bad[url]
|
||||
self.seterror(source, triple)
|
||||
|
||||
# End SLB 2/24/99
|
||||
|
||||
def newtodolink(self, url, origin):
|
||||
|
|
@ -487,6 +510,7 @@ class Checker:
|
|||
# to call self.format_url(), since the URL here
|
||||
# is now a (URL, fragment) pair.
|
||||
if self.todo.has_key(url):
|
||||
if origin not in self.todo[url]:
|
||||
self.todo[url].append(origin)
|
||||
self.note(3, " Seen todo link %s", self.format_url(url))
|
||||
else:
|
||||
|
|
@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener):
|
|||
|
||||
def open_file(self, url):
|
||||
path = urllib.url2pathname(urllib.unquote(url))
|
||||
if os.path.isdir(path):
|
||||
if path[-1] != os.sep:
|
||||
url = url + '/'
|
||||
if os.path.isdir(path):
|
||||
indexpath = os.path.join(path, "index.html")
|
||||
if os.path.exists(indexpath):
|
||||
return self.open_file(url + "index.html")
|
||||
|
|
@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener):
|
|||
s.write('<A HREF="%s">%s</A>\n' % (q, q))
|
||||
s.seek(0)
|
||||
return s
|
||||
return urllib.FancyURLopener.open_file(self, path)
|
||||
return urllib.FancyURLopener.open_file(self, url)
|
||||
|
||||
|
||||
class MyHTMLParser(sgmllib.SGMLParser):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue