Samuel L. Bayer:

- same fixes from webchecker.py
- incorporated small diff between current webchecker.py and 1.5.2
- fixed bug where "extra roots" added with the -t argument were being checked as real roots, not just as possible continuations
- added -a argument to suppress checking of name anchors

[And untabified --GvR]
2025-11-03 03:22:27 +00:00 · 1999-11-17 15:02:53 +00:00 · 1999-11-17 15:02:53 +00:00 · f97eecccb7
commit f97eecccb7
parent dbd5c3e63b
1 changed files with 178 additions and 154 deletions
--- a/Tools/webchecker/wcnew.py
+++ b/Tools/webchecker/wcnew.py
@ -124,6 +124,7 @@ Options:
 -t root   -- specify root dir which should be treated as internal (can repeat)
 -v        -- verbose operation; repeating -v will increase verbosity
 -x        -- don't check external links (these are often slow to check)
+-a        -- don't check name anchors

 Arguments:

@ -166,6 +167,7 @@ MAXPAGE = 150000                        # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
 AGENTNAME = "webchecker"                # Agent name for robots.txt parser
+NONAMES = 0                             # Force name anchor checking


 # Global variables
@ -183,7 +185,7 @@ def main():
    try:

        # Begin SLB 2/24/99: Added -t option here.
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx')
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
        # End SLB 2/24/99

    except getopt.error, msg:
@ -195,6 +197,7 @@ def main():
    # Begin SLB 2/24/99: Added extra_roots variable to
    # collect extra roots.
    extra_roots = []
+    nonames = NONAMES
    # End SLB 2/24/99

    for o, a in opts:
@ -215,6 +218,8 @@ def main():
        # -t option.
        if o == '-t':
            extra_roots.append(a)
+        if o == '-a':
+            nonames = not nonames
        # End SLB 2/24/99

        if o == '-v':
@ -231,7 +236,9 @@ def main():
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
-               maxpage=maxpage, roundsize=roundsize)
+               maxpage=maxpage, roundsize=roundsize,
+               nonames=nonames
+               )

    if not restart and not args:
        args.append(DEFROOT)
@ -249,7 +256,7 @@ def main():
            # directory component.
            if root[-1] != "/":
                root = root + "/"
-	    c.addroot(root)
+            c.addroot(root, add_to_do = 0)
    # End SLB 2/24/99

    try:
@ -294,6 +301,7 @@ class Checker:
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
+    nonames = NONAMES

    validflags = tuple(dir())

@ -348,7 +356,7 @@ class Checker:
        for url in self.bad.keys():
            self.markerror(url)

-    def addroot(self, root):
+    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
@ -363,6 +371,7 @@ class Checker:
            # Begin SLB 2/24/99: Modified this call to respect
            # the fact that the "done" and "todo" dictionaries
            # are now (URL, fragment) pairs
+            if add_to_do:
                self.newlink((root, ""), ("<root>", root))
            # End SLB 2/24/99

@ -441,9 +450,12 @@ class Checker:
                          "  from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
+        url, local_fragment = url_pair
+        if local_fragment and self.nonames:
+            self.markdone(url_pair)
+            return
        page = self.getpage(url_pair)
        if page:
-	    url, local_fragment = url_pair
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
@ -473,12 +485,23 @@ class Checker:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
+
+        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Begin SLB 2/24/99: changed reference to URL
        # to call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))
+
+        # SLB 11/11/99: Make sure that if it's bad, that
+        # the origin gets added.
+
+        if self.bad.has_key(url):
+            source, rawlink = origin
+            triple = url, rawlink, self.bad[url]
+            self.seterror(source, triple)
+
        # End SLB 2/24/99

    def newtodolink(self, url, origin):
@ -487,6 +510,7 @@ class Checker:
        # to call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
+            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
        else:
@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener):

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
+        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
-        if os.path.isdir(path):
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener):
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
-        return urllib.FancyURLopener.open_file(self, path)
+        return urllib.FancyURLopener.open_file(self, url)


 class MyHTMLParser(sgmllib.SGMLParser):