Mirror of https://github.com/python/cpython.git, synced 2025-08-04 17:08:35 +00:00
The bulk of the credit for these changes goes to Bastian Kleineidam:

* restores urllib as the file fetcher (closes bug #132000)
* allows checking URLs with empty paths (closes patches #103511 and #103721)
* properly handles user agents with versions (e.g., SpamMeister/1.5)
* adds several more tests
This commit is contained in:
parent
498cb15306
commit
5bba231d1e
1 changed file with 89 additions and 34 deletions
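Before the diff itself, a quick illustration of the behaviour the bullet points describe. This is a sketch, not part of the commit; it assumes the patched file is the standard library's robotparser module and runs under Python 2, like the code below:

    import robotparser

    rp = robotparser.RobotFileParser()
    # feed rules directly instead of fetching them over HTTP
    rp.parse(["User-agent: SpamMeister\n",
              "Disallow: /\n",
              "\n"])

    # user agents with versions are reduced to their name token, so the
    # versioned agent still matches the "SpamMeister" entry
    print rp.can_fetch("SpamMeister/1.5", "http://example.com/private")   # 0

    # a URL with an empty path is now treated as "/", so it can be checked too
    print rp.can_fetch("OtherBot", "http://example.com")                  # 1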
@@ -39,28 +39,19 @@ class RobotFileParser:
         self.host, self.path = urlparse.urlparse(url)[1:3]
 
     def read(self):
-        import httplib
-        tries = 0
-        while tries<5:
-            connection = httplib.HTTP(self.host)
-            connection.putrequest("GET", self.path)
-            connection.putheader("Host", self.host)
-            connection.endheaders()
-            status, text, mime = connection.getreply()
-            if status in [301,302] and mime:
-                tries = tries + 1
-                newurl = mime.get("Location", mime.get("Uri", ""))
-                newurl = urlparse.urljoin(self.url, newurl)
-                self.set_url(newurl)
-            else:
-                break
-        if status==401 or status==403:
-            self.disallow_all = 1
-        elif status>=400:
-            self.allow_all = 1
-        else:
-            # status < 400
-            self.parse(connection.getfile().readlines())
+        opener = URLopener()
+        f = opener.open(self.url)
+        lines = f.readlines()
+        self.errcode = opener.errcode
+        if self.errcode == 401 or self.errcode == 403:
+            self.disallow_all = 1
+            _debug("disallow all")
+        elif self.errcode >= 400:
+            self.allow_all = 1
+            _debug("allow all")
+        elif self.errcode == 200 and lines:
+            _debug("parse lines")
+            self.parse(lines)
 
     def parse(self, lines):
         """parse the input lines from a robot.txt file.
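The hunk above replaces the hand-rolled httplib redirect loop with a plain urllib fetch through the URLopener subclass added later in this diff, which records the final HTTP status in .errcode instead of raising. A rough offline sketch of the resulting decision ladder; StubOpener is not part of the patch, it only mimics the .open()/.errcode interface the real URLopener provides:

    import StringIO

    class StubOpener:
        def __init__(self, errcode, body=""):
            self.errcode = errcode
            self.body = body
        def open(self, url):
            return StringIO.StringIO(self.body)

    def describe_read(opener, url):
        # same decision ladder as the new RobotFileParser.read()
        lines = opener.open(url).readlines()
        if opener.errcode == 401 or opener.errcode == 403:
            return "disallow all"
        elif opener.errcode >= 400:
            return "allow all"
        elif opener.errcode == 200 and lines:
            return "parse %d line(s)" % len(lines)

    print describe_read(StubOpener(403), "http://example.com/robots.txt")  # disallow all
    print describe_read(StubOpener(404), "http://example.com/robots.txt")  # allow all
    print describe_read(StubOpener(200, "User-agent: *\nDisallow: /\n"),
                        "http://example.com/robots.txt")                   # parse 2 line(s)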
@@ -129,15 +120,15 @@ class RobotFileParser:
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
-        _debug("Checking robot.txt allowance for\n%s\n%s" % (useragent, url))
+        _debug("Checking robot.txt allowance for:\n  user agent: %s\n  url: %s" %
+               (useragent, url))
         if self.disallow_all:
             return 0
         if self.allow_all:
             return 1
         # search for given user agent matches
         # the first match counts
-        useragent = useragent.lower()
-        url = urllib.quote(urlparse.urlparse(url)[2])
+        url = urllib.quote(urlparse.urlparse(url)[2]) or "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
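The `or "/"` added above is what makes URLs with empty paths checkable (the second bullet of the commit message): urlparse returns an empty path component for a bare host URL, and the old code handed that empty string straight to the rule matcher. A two-line illustration (Python 2):

    import urllib, urlparse

    print repr(urlparse.urlparse("http://example.com")[2])                       # ''
    print repr(urllib.quote(urlparse.urlparse("http://example.com")[2]) or "/")  # '/'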
@@ -181,11 +172,16 @@ class Entry:
         return ret
 
     def applies_to(self, useragent):
-        "check if this entry applies to the specified agent"
+        """check if this entry applies to the specified agent"""
+        # split the name token and make it lower case
+        useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=="*":
+            if agent=='*':
+                # we have the catch-all agent
                 return 1
-            if re.match(agent, useragent):
+            agent = agent.lower()
+            # don't forget to re.escape
+            if re.search(re.escape(useragent), agent):
                 return 1
         return 0
 
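The rewritten applies_to is what makes versioned user agents work (third bullet of the commit message): the client's agent string is cut at the first "/", lower-cased, and then searched for inside the robots.txt agent name, with re.escape so any metacharacters in the name are taken literally. A small sketch, again assuming the patched file is importable as robotparser:

    import robotparser

    e = robotparser.Entry()
    e.useragents.append("CherryPickerSE/1.0")

    print e.applies_to("CherryPickerSE/1.5")   # 1 -- version suffix is ignored
    print e.applies_to("cherrypickerse")       # 1 -- comparison is case-insensitive
    print e.applies_to("GoogleBot")            # 0 -- no substring match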
@@ -194,25 +190,84 @@ class Entry:
         - our agent applies to this entry
         - filename is URL decoded"""
         for line in self.rulelines:
             _debug((filename, str(line), line.allowance))
             if line.applies_to(filename):
                 return line.allowance
         return 1
 
+class URLopener(urllib.FancyURLopener):
+    def __init__(self, *args):
+        apply(urllib.FancyURLopener.__init__, (self,) + args)
+        self.errcode = 200
+        self.tries = 0
+        self.maxtries = 10
+
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        self.errcode = errcode
+        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+                                                         errmsg, headers)
+
+    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
+        self.tries += 1
+        if self.tries >= self.maxtries:
+            return self.http_error_default(url, fp, 500,
+                                           "Internal Server Error: Redirect Recursion",
+                                           headers)
+        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
+                                                      errmsg, headers, data)
+        self.tries = 0
+        return result
+
+def _check(a,b):
+    if not b:
+        ac = "access denied"
+    else:
+        ac = "access allowed"
+    if a!=b:
+        print "failed"
+    else:
+        print "ok (%s)" % ac
+    print
+
 def _test():
     global debug
     import sys
     rp = RobotFileParser()
     debug = 1
-    if len(sys.argv) <= 1:
-        rp.set_url('http://www.musi-cal.com/robots.txt')
-        rp.read()
-    else:
-        rp.parse(open(sys.argv[1]).readlines())
-    print rp.can_fetch('*', 'http://www.musi-cal.com/')
-    print rp.can_fetch('Musi-Cal-Robot/1.0',
-                       'http://www.musi-cal.com/cgi-bin/event-search'
-                       '?city=San+Francisco')
+
+    # robots.txt that exists, gotten to by redirection
+    rp.set_url('http://www.musi-cal.com/robots.txt')
+    rp.read()
+
+    # test for re.escape
+    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
+    # this should match the first rule, which is a disallow
+    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
+    # various cherry pickers
+    _check(rp.can_fetch('CherryPickerSE',
+                        'http://www.musi-cal.com/cgi-bin/event-search'
+                        '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.0',
+                        'http://www.musi-cal.com/cgi-bin/event-search'
+                        '?city=San+Francisco'), 0)
+    _check(rp.can_fetch('CherryPickerSE/1.5',
+                        'http://www.musi-cal.com/cgi-bin/event-search'
+                        '?city=San+Francisco'), 0)
+    # case sensitivity
+    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
+    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
+    # substring test
+    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
+    # tests for catch-all * agent
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
+
+    # robots.txt that does not exist
+    rp.set_url('http://www.lycos.com/robots.txt')
+    rp.read()
+    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
 
 if __name__ == '__main__':
     _test()