mirror of https://github.com/python/cpython.git
synced 2025-11-03 19:34:08 +00:00
Samuel L. Bayer:

- same fixes from webchecker.py
- incorporated small diff between current webchecker.py and 1.5.2
- fixed bug where "extra roots" added with the -t argument were being
  checked as real roots, not just as possible continuations
- added -a argument to suppress checking of name anchors

[And untabified --GvR]
parent dbd5c3e63b
commit f97eecccb7
1 changed file with 178 additions and 154 deletions
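The heart of the change: every link is now tracked as a (URL, fragment) pair, and a fragment that matches no NAME anchor on the fetched page is reported as an error. A rough standalone sketch of that check, in modern Python with hypothetical names (the file itself is 1.5.2-era code and does this inside Checker.dopage()):

    from urllib.parse import urldefrag

    def check_anchor(link, page_names):
        # Split "doc.html#sec2" into ("doc.html", "sec2").
        url, fragment = urldefrag(link)
        # A non-empty fragment must name an anchor on the page.
        if fragment and fragment not in page_names:
            return "Missing name anchor `%s'" % fragment
        return None

    names = ["intro", "usage"]   # e.g. <A NAME="intro">, <A NAME="usage">
    check_anchor("http://example.com/doc.html#usage", names)  # -> None (ok)
    check_anchor("http://example.com/doc.html#sec2", names)   # -> error string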
@@ -124,6 +124,7 @@ Options:
 -t root   -- specify root dir which should be treated as internal (can repeat)
 -v        -- verbose operation; repeating -v will increase verbosity
 -x        -- don't check external links (these are often slow to check)
+-a        -- don't check name anchors
 
 Arguments:
 
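For orientation, a hypothetical invocation using the new flag (the script name and URLs are illustrative, since this page does not show the changed file's name; note that, per the main() change further down, -t roots are only honored when -x is given):

    python webchecker.py -x -a -t http://example.com/other/ http://example.com/

i.e. check http://example.com/ without following external links, additionally treat http://example.com/other/ as internal, and skip name-anchor checking.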
@@ -166,6 +167,7 @@ MAXPAGE = 150000                        # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
 AGENTNAME = "webchecker"                # Agent name for robots.txt parser
+NONAMES = 0                             # Force name anchor checking
 
 
 # Global variables
@@ -182,9 +184,9 @@ def main():
 
     try:
 
-	# Begin SLB 2/24/99: Added -t option here.
-        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vx')
-	# End SLB 2/24/99
+        # Begin SLB 2/24/99: Added -t option here.
+        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
+        # End SLB 2/24/99
 
     except getopt.error, msg:
         sys.stdout = sys.stderr
@@ -195,6 +197,7 @@ def main():
     # Begin SLB 2/24/99: Added extra_roots variable to
     # collect extra roots.
     extra_roots = []
+    nonames = NONAMES
     # End SLB 2/24/99
 
     for o, a in opts:
@@ -211,12 +214,14 @@ def main():
         if o == '-r':
             roundsize = string.atoi(a)
 
-	# Begin SLB 2/24/99: Added processing for
-	# -t option.
-	if o == '-t':
-	    extra_roots.append(a)
-	# End SLB 2/24/99
-	
+        # Begin SLB 2/24/99: Added processing for
+        # -t option.
+        if o == '-t':
+            extra_roots.append(a)
+        if o == '-a':
+            nonames = not nonames
+        # End SLB 2/24/99
+
         if o == '-v':
             verbose = verbose + 1
         if o == '-x':
@@ -231,7 +236,9 @@ def main():
         c = Checker()
 
     c.setflags(checkext=checkext, verbose=verbose,
-               maxpage=maxpage, roundsize=roundsize)
+               maxpage=maxpage, roundsize=roundsize,
+               nonames=nonames
+               )
 
     if not restart and not args:
         args.append(DEFROOT)
@@ -243,13 +250,13 @@ def main():
     # links are not to be checked. So -t values are ignored unless
     # -x was specified.
     if not checkext:
-	for root in extra_roots:
-	    # Make sure it's terminated by a slash,
-	    # so that addroot doesn't discard the last
-	    # directory component.
-	    if root[-1] != "/":
-		root = root + "/"
-	    c.addroot(root)
+        for root in extra_roots:
+            # Make sure it's terminated by a slash,
+            # so that addroot doesn't discard the last
+            # directory component.
+            if root[-1] != "/":
+                root = root + "/"
+            c.addroot(root, add_to_do = 0)
     # End SLB 2/24/99
 
     try:
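The trailing-slash fixup above matters because addroot() keeps only the path up to and including its last "/". A small sketch of that truncation (modern Python, illustrative names, assuming simple path-only roots):

    from urllib.parse import urlparse

    def effective_root(root):
        # Mimic addroot(): cut the path back to the last "/".
        path = urlparse(root).path
        return root[: len(root) - len(path)] + path[: path.rfind("/") + 1]

    effective_root("http://example.com/docs/api")   # -> ".../docs/" ("api" lost)
    effective_root("http://example.com/docs/api/")  # -> ".../docs/api/" (kept)

So an extra root given without the slash would silently cover less than intended.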
@@ -294,6 +301,7 @@ class Checker:
     verbose = VERBOSE
     maxpage = MAXPAGE
     roundsize = ROUNDSIZE
+    nonames = NONAMES
 
     validflags = tuple(dir())
 
@@ -313,11 +321,11 @@ class Checker:
         self.done = {}
         self.bad = {}
 
-	# Begin SLB 2/24/99: Added a name table, so that the
-	# name URLs can be checked. Also serves as an implicit
-	# cache for which URLs are done.
-	self.name_table = {}
-	# End SLB 2/24/99
+        # Begin SLB 2/24/99: Added a name table, so that the
+        # name URLs can be checked. Also serves as an implicit
+        # cache for which URLs are done.
+        self.name_table = {}
+        # End SLB 2/24/99
 
         self.round = 0
         # The following are not pickled:
@@ -325,13 +333,13 @@ class Checker:
         self.errors = {}
         self.urlopener = MyURLopener()
         self.changed = 0
-        
+
     def note(self, level, format, *args):
         if self.verbose > level:
             if args:
                 format = format%args
             self.message(format)
-    
+
     def message(self, format, *args):
         if args:
             format = format%args
@@ -348,7 +356,7 @@ class Checker:
         for url in self.bad.keys():
             self.markerror(url)
 
-    def addroot(self, root):
+    def addroot(self, root, add_to_do = 1):
         if root not in self.roots:
             troot = root
             scheme, netloc, path, params, query, fragment = \
@@ -360,11 +368,12 @@ class Checker:
                                              params, query, fragment))
             self.roots.append(troot)
             self.addrobot(root)
-	    # Begin SLB 2/24/99: Modified this call to respect
-	    # the fact that the "done" and "todo" dictionaries
-	    # are now (URL, fragment) pairs
-	    self.newlink((root, ""), ("<root>", root))
-	    # End SLB 2/24/99
+            # Begin SLB 2/24/99: Modified this call to respect
+            # the fact that the "done" and "todo" dictionaries
+            # are now (URL, fragment) pairs
+            if add_to_do:
+                self.newlink((root, ""), ("<root>", root))
+            # End SLB 2/24/99
 
     def addrobot(self, root):
         root = urlparse.urljoin(root, "/")
@@ -416,55 +425,58 @@ class Checker:
                 self.message("%d Errors in %s", len(triples), source)
             else:
                 self.message("Error in %s", source)
-	    # Begin SLB 2/24/99: Modified this loop to
-	    # call self.format_url() instead of referring
-	    # to the URL directly, since the URLs in these
-	    # triples is now a (URL, fragment) pair. The value
-	    # of the "source" variable comes from the list of
-	    # origins, and is a URL, not a pair.
-            for url, rawlink, msg in triples:
+            # Begin SLB 2/24/99: Modified this loop to
+            # call self.format_url() instead of referring
+            # to the URL directly, since the URLs in these
+            # triples is now a (URL, fragment) pair. The value
+            # of the "source" variable comes from the list of
+            # origins, and is a URL, not a pair.
+            for url, rawlink, msg in triples:
                 if rawlink != self.format_url(url): s = " (%s)" % rawlink
                 else: s = ""
                 self.message("  HREF %s%s\n    msg %s",
-			     self.format_url(url), s, msg)
-	    # End SLB 2/24/99.
+                             self.format_url(url), s, msg)
+            # End SLB 2/24/99.
 
     def dopage(self, url_pair):
-	
-	# Begin SLB 2/24/99: Substantial modification to reflect the change
-	# to dictionary entries being (URL, fragment) pairs. All
-	# printing of URLs uses format_url(); argument changed to
-	# url_pair for clarity.
+
+        # Begin SLB 2/24/99: Substantial modification to reflect the change
+        # to dictionary entries being (URL, fragment) pairs. All
+        # printing of URLs uses format_url(); argument changed to
+        # url_pair for clarity.
         if self.verbose > 1:
             if self.verbose > 2:
                 self.show("Check ", self.format_url(url_pair),
-			  "  from", self.todo[url_pair])
+                          "  from", self.todo[url_pair])
             else:
                 self.message("Check %s", self.format_url(url_pair))
+        url, local_fragment = url_pair
+        if local_fragment and self.nonames:
+            self.markdone(url_pair)
+            return
         page = self.getpage(url_pair)
         if page:
-	    url, local_fragment = url_pair
-	    # Store the page which corresponds to this URL.
-	    self.name_table[url] = page
-	    # If there is a fragment in this url_pair, and it's not
-	    # in the list of names for the page, call setbad(), since
-	    # it's a missing anchor.
-	    if local_fragment and local_fragment not in page.getnames():
-		self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
-	    for info in page.getlinkinfos():
-		# getlinkinfos() now returns the fragment as well,
-		# and we store that fragment here in the "todo" dictionary.
+            # Store the page which corresponds to this URL.
+            self.name_table[url] = page
+            # If there is a fragment in this url_pair, and it's not
+            # in the list of names for the page, call setbad(), since
+            # it's a missing anchor.
+            if local_fragment and local_fragment not in page.getnames():
+                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
+            for info in page.getlinkinfos():
+                # getlinkinfos() now returns the fragment as well,
+                # and we store that fragment here in the "todo" dictionary.
                 link, rawlink, fragment = info
-		# However, we don't want the fragment as the origin, since
-		# the origin is logically a page.
+                # However, we don't want the fragment as the origin, since
+                # the origin is logically a page.
                 origin = url, rawlink
                 self.newlink((link, fragment), origin)
-	else:
-	    # If no page has been created yet, we want to
-	    # record that fact.
-	    self.name_table[url_pair[0]] = None
+        else:
+            # If no page has been created yet, we want to
+            # record that fact.
+            self.name_table[url_pair[0]] = None
         self.markdone(url_pair)
-	# End SLB 2/24/99
+        # End SLB 2/24/99
 
     def newlink(self, url, origin):
         if self.done.has_key(url):
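Two behaviors worth noting in the rewritten dopage() above: with -a set (self.nonames), a pair that only adds a fragment is marked done without any anchor check; and name_table caches one fetched page per URL, so any number of (URL, fragment) pairs cost a single fetch. A stripped-down sketch of that caching idea, with hypothetical names rather than the script's own code:

    name_table = {}

    def get_page(url, fetch):
        # Answer repeat lookups for the same URL from the cache;
        # failures (None) are cached too, as dopage() records them.
        if url in name_table:
            return name_table[url]
        page = fetch(url)        # may return None on failure
        name_table[url] = page
        return page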
@@ -473,32 +485,44 @@ class Checker:
             self.newtodolink(url, origin)
 
     def newdonelink(self, url, origin):
-        self.done[url].append(origin)
-	
-	# Begin SLB 2/24/99: changed reference to URL
-	# to call self.format_url(), since the URL here
-	# is now a (URL, fragment) pair.
+
+        if origin not in self.done[url]:
+            self.done[url].append(origin)
+
+        # Begin SLB 2/24/99: changed reference to URL
+        # to call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
         self.note(3, "  Done link %s", self.format_url(url))
-	# End SLB 2/24/99
+
+        # SLB 11/11/99: Make sure that if it's bad, that
+        # the origin gets added.
+
+        if self.bad.has_key(url):
+            source, rawlink = origin
+            triple = url, rawlink, self.bad[url]
+            self.seterror(source, triple)
+
+        # End SLB 2/24/99
 
     def newtodolink(self, url, origin):
-	
-	# Begin SLB 2/24/99: changed reference to URL
-	# to call self.format_url(), since the URL here
-	# is now a (URL, fragment) pair.
+
+        # Begin SLB 2/24/99: changed reference to URL
+        # to call self.format_url(), since the URL here
+        # is now a (URL, fragment) pair.
         if self.todo.has_key(url):
-            self.todo[url].append(origin)
+            if origin not in self.todo[url]:
+                self.todo[url].append(origin)
             self.note(3, "  Seen todo link %s", self.format_url(url))
         else:
             self.todo[url] = [origin]
             self.note(3, "  New todo link %s", self.format_url(url))
-	# End SLB 2/24/99
+        # End SLB 2/24/99
 
     # Begin SLB 2/24/99: Added method format_url().
-    def format_url(self, url):
-	link, fragment = url
-	if fragment: return link + "#" + fragment
-	else: return link
+    def format_url(self, url):
+        link, fragment = url
+        if fragment: return link + "#" + fragment
+        else: return link
     # End SLB 2/24/99
 
     def markdone(self, url):
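The format_url() helper added here just re-attaches the fragment for display. Its behavior, restated outside the class for illustration:

    def format_url(pair):
        link, fragment = pair
        return link + "#" + fragment if fragment else link

    format_url(("http://example.com/a.html", "sec2"))  # -> "http://example.com/a.html#sec2"
    format_url(("http://example.com/a.html", ""))      # -> "http://example.com/a.html"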
@@ -511,30 +535,30 @@ class Checker:
             if url[:len(root)] == root:
                 return self.isallowed(root, url)
         return 0
-    
+
     def isallowed(self, root, url):
         root = urlparse.urljoin(root, "/")
         return self.robots[root].can_fetch(AGENTNAME, url)
 
     def getpage(self, url_pair):
 
-	# Begin SLB 2/24/99: changed incoming argument name to
-	# url_pair, since it's now a (URL, fragment) pair.
-	# The page may have been cached in the name_table variable.
-	url, fragment = url_pair
-	if self.name_table.has_key(url):
-	    return self.name_table[url]
-	# End SLB 2/24/99
-	
+        # Begin SLB 2/24/99: changed incoming argument name to
+        # url_pair, since it's now a (URL, fragment) pair.
+        # The page may have been cached in the name_table variable.
+        url, fragment = url_pair
+        if self.name_table.has_key(url):
+            return self.name_table[url]
+        # End SLB 2/24/99
+
         if url[:7] == 'mailto:' or url[:5] == 'news:':
             self.note(1, " Not checking mailto/news URL")
             return None
         isint = self.inroots(url)
-	
-	# Begin SLB 2/24/99: Changed calls to openpage and readhtml
-	# in order to ensure that openpage gets the URL pair to
-	# print out its error message and record the error pair
-	# correctly.
+
+        # Begin SLB 2/24/99: Changed calls to openpage and readhtml
+        # in order to ensure that openpage gets the URL pair to
+        # print out its error message and record the error pair
+        # correctly.
         if not isint:
             if not self.checkext:
                 self.note(1, " Not checking ext link")
@@ -544,7 +568,7 @@ class Checker:
                 self.safeclose(f)
             return None
         text, nurl = self.readhtml(url_pair)
-	# End SLB 2/24/99
+        # End SLB 2/24/99
 
         if nurl != url:
             self.note(1, " Redirected to %s", nurl)
@@ -556,7 +580,7 @@ class Checker:
     # to take (URL, fragment) pairs as arguments, so that openpage()
     # receives the appropriate tuple to record error messages.
     def readhtml(self, url_pair):
-	url, fragment = url_pair
+        url, fragment = url_pair
         text = None
         f, url = self.openhtml(url_pair)
         if f:
@@ -565,7 +589,7 @@ class Checker:
         return text, url
 
     def openhtml(self, url_pair):
-	url, fragment = url_pair
+        url, fragment = url_pair
         f = self.openpage(url_pair)
         if f:
             url = f.geturl()
@@ -576,7 +600,7 @@ class Checker:
         return f, url
 
     def openpage(self, url_pair):
-	url, fragment = url_pair
+        url, fragment = url_pair
         try:
             return self.urlopener.open(url)
         except IOError, msg:
@@ -614,7 +638,7 @@ class Checker:
         self.bad[url] = msg
         self.changed = 1
         self.markerror(url)
-        
+
     def markerror(self, url):
         try:
             origins = self.todo[url]
@@ -626,17 +650,17 @@ class Checker:
 
     def seterror(self, url, triple):
         try:
-	    
-	    # Begin SLB 2/24/99: Because of the way the
-	    # URLs are now processed, I need to check to make
-	    # sure the URL hasn't been entered in the error list.
-	    # The first element of the triple here is a (URL, fragment)
-	    # pair, but the URL key is not, since it's from the
-	    # list of origins.
-	    if triple not in self.errors[url]:
-		self.errors[url].append(triple)
-	    # End SLB 2/24/99
-	    
+
+            # Begin SLB 2/24/99: Because of the way the
+            # URLs are now processed, I need to check to make
+            # sure the URL hasn't been entered in the error list.
+            # The first element of the triple here is a (URL, fragment)
+            # pair, but the URL key is not, since it's from the
+            # list of origins.
+            if triple not in self.errors[url]:
+                self.errors[url].append(triple)
+            # End SLB 2/24/99
+
         except KeyError:
             self.errors[url] = [triple]
 
@@ -706,22 +730,22 @@ class Page:
         self.verbose = verbose
         self.maxpage = maxpage
         self.checker = checker
-	
-	# Begin SLB 2/24/99: Moved the parsing of the page into
-	# the __init__() routine in order to initialize the list of
-	# names the file contains. Stored the parser in an instance
-	# variable. Passed the URL to MyHTMLParser().
-	size = len(self.text)
+
+        # Begin SLB 2/24/99: Moved the parsing of the page into
+        # the __init__() routine in order to initialize the list of
+        # names the file contains. Stored the parser in an instance
+        # variable. Passed the URL to MyHTMLParser().
+        size = len(self.text)
         if size > self.maxpage:
             self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
             self.parser = None
-	    return
+            return
         self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
         self.parser = MyHTMLParser(url, verbose=self.verbose,
-				   checker=self.checker)
+                                   checker=self.checker)
         self.parser.feed(self.text)
         self.parser.close()
-	# End SLB 2/24/99
+        # End SLB 2/24/99
 
     def note(self, level, msg, *args):
         if self.checker:
@@ -734,32 +758,32 @@ class Page:
 
     # Begin SLB 2/24/99: Added method to retrieve names.
     def getnames(self):
-	return self.parser.names
+        return self.parser.names
     # End SLB 2/24/99
 
     def getlinkinfos(self):
 
-	# Begin SLB 2/24/99: Moved file reading to __init__() routine.
-	# Stored parser in local variable to indicate success of
-	# parsing.
-	
-	# If no parser was stored, fail.
-	if not self.parser: return []
-	
+        # Begin SLB 2/24/99: Moved file reading to __init__() routine.
+        # Stored parser in local variable to indicate success of
+        # parsing.
+
+        # If no parser was stored, fail.
+        if not self.parser: return []
+
         rawlinks = self.parser.getlinks()
         base = urlparse.urljoin(self.url, self.parser.getbase() or "")
         infos = []
         for rawlink in rawlinks:
             t = urlparse.urlparse(rawlink)
-	    # DON'T DISCARD THE FRAGMENT! Instead, include
-	    # it in the tuples which are returned. See Checker.dopage().
-	    fragment = t[-1]
+            # DON'T DISCARD THE FRAGMENT! Instead, include
+            # it in the tuples which are returned. See Checker.dopage().
+            fragment = t[-1]
             t = t[:-1] + ('',)
             rawlink = urlparse.urlunparse(t)
             link = urlparse.urljoin(base, rawlink)
-            infos.append((link, rawlink, fragment))
+            infos.append((link, rawlink, fragment))
         # End SLB 2/24/99
-	
+
         return infos
 
 
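The getlinkinfos() change above splits the fragment off with urlparse, blanks it in the parse tuple, rebuilds the fragmentless URL, and keeps all three pieces in the (link, rawlink, fragment) triples. The same round-trip in modern Python, for illustration:

    from urllib.parse import urljoin, urlparse, urlunparse

    base = "http://example.com/docs/"
    t = urlparse("guide.html#setup")
    fragment = t[-1]                      # "setup"
    rawlink = urlunparse(t[:-1] + ("",))  # "guide.html", fragment dropped
    link = urljoin(base, rawlink)         # "http://example.com/docs/guide.html"
    print((link, rawlink, fragment))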
@@ -793,9 +817,9 @@ class MyURLopener(urllib.FancyURLopener):
 
     def open_file(self, url):
         path = urllib.url2pathname(urllib.unquote(url))
-        if path[-1] != os.sep:
-            url = url + '/'
         if os.path.isdir(path):
+            if path[-1] != os.sep:
+                url = url + '/'
             indexpath = os.path.join(path, "index.html")
             if os.path.exists(indexpath):
                 return self.open_file(url + "index.html")
@@ -812,7 +836,7 @@ class MyURLopener(urllib.FancyURLopener):
                 s.write('<A HREF="%s">%s</A>\n' % (q, q))
             s.seek(0)
             return s
-        return urllib.FancyURLopener.open_file(self, path)
+        return urllib.FancyURLopener.open_file(self, url)
 
 
 class MyHTMLParser(sgmllib.SGMLParser):
@@ -822,30 +846,30 @@ class MyHTMLParser(sgmllib.SGMLParser):
         self.checker = checker
         self.base = None
         self.links = {}
-	
-	# Begin SLB 2/24/99: Added names instance variable.
-	# Modified arglist to take the URL as an argument.
-	self.names = []
-	self.url = url
-	# End SLB 2/24/99
-	
+
+        # Begin SLB 2/24/99: Added names instance variable.
+        # Modified arglist to take the URL as an argument.
+        self.names = []
+        self.url = url
+        # End SLB 2/24/99
+
         sgmllib.SGMLParser.__init__(self)
 
     def start_a(self, attributes):
         self.link_attr(attributes, 'href')
-	
-	# Begin SLB 2/24/99: We must rescue the NAME
-	# attributes from the anchor, in order to
-	# cache the internal anchors which are made
-	# available in the page.
-	for name, value in attributes:
-	    if name == "name":
-		if value in self.names:
-		    self.checker.message("WARNING: duplicate name %s in %s",
-					 value, self.url)
-		else: self.names.append(value)
-		break
-	# End SLB 2/24/99
+
+        # Begin SLB 2/24/99: We must rescue the NAME
+        # attributes from the anchor, in order to
+        # cache the internal anchors which are made
+        # available in the page.
+        for name, value in attributes:
+            if name == "name":
+                if value in self.names:
+                    self.checker.message("WARNING: duplicate name %s in %s",
+                                         value, self.url)
+                else: self.names.append(value)
+                break
+        # End SLB 2/24/99
 
     def end_a(self): pass
 