Mirror of https://github.com/python/cpython.git, synced 2025-07-19 17:25:54 +00:00
Bug 3347: robotparser failed because it didn't convert bytes to string.
The solution is to convert bytes to text via UTF-8. I'm not entirely sure if this is safe, but it looks like robots.txt is expected to be ASCII.
This commit is contained in:
parent 48577d1944
commit 73fd46d24e
2 changed files with 18 additions and 5 deletions
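The gist of the fix: in Python 3, urllib.request.urlopen(...).read() returns bytes, so the robots.txt body must be decoded to str before it is split into lines for parsing. Below is a minimal sketch of the repaired flow; the byte payload and the example.com URLs are illustrative stand-ins, not taken from the commit.

import urllib.robotparser

# Hypothetical robots.txt body, as bytes -- the type urlopen().read() returns.
raw = b"User-agent: *\nDisallow: /private/\n"

# Decode first (robots.txt is nominally ASCII, and UTF-8 is a superset of
# ASCII), then split into str lines for the parser.
parser = urllib.robotparser.RobotFileParser()
parser.parse(raw.decode("utf-8").splitlines())

print(parser.can_fetch("*", "http://example.com/private/page"))  # False
print(parser.can_fetch("*", "http://example.com/public/page"))   # True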
Lib/test/test_robotparser.py

@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
-class TestCase(unittest.TestCase):
-    def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+    def testPasswordProtectedSite(self):
         support.requires('network')
         # whole site is password-protected.
         url = 'http://mueblesmoraleda.com'

@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase):
         parser.read()
         self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
 
+    def testPythonOrg(self):
+        support.requires('network')
+        parser = urllib.robotparser.RobotFileParser(
+            "http://www.python.org/robots.txt")
+        parser.read()
+        self.assertTrue(parser.can_fetch("*",
+                        "http://www.python.org/robots.txt"))
+
 def test_main():
+    support.run_unittest(NetworkTestCase)
     support.run_unittest(tests)
-    TestCase().run()
 
 if __name__=='__main__':
     support.Verbose = 1
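Both new NetworkTestCase methods above require network access. For a quick offline check of the same decode-then-parse path, parse() can be fed lines directly with no urlopen involved; the test class and rule set below are a hypothetical local variant, not part of the commit.

import unittest
import urllib.robotparser

class LocalRobotsTestCase(unittest.TestCase):
    # Offline stand-in for the network tests: the bytes body is decoded
    # and parsed locally instead of being fetched with read().
    def test_parse_decoded_lines(self):
        raw = b"User-agent: *\nDisallow: /\n"
        parser = urllib.robotparser.RobotFileParser()
        parser.parse(raw.decode("utf-8").splitlines())
        # "Disallow: /" blocks everything for every agent.
        self.assertFalse(parser.can_fetch("*", "http://example.com/robots.txt"))

if __name__ == '__main__':
    unittest.main()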
Lib/urllib/robotparser.py

@@ -60,7 +60,8 @@ class RobotFileParser:
             elif err.code >= 400:
                 self.allow_all = True
         else:
-            self.parse(f.read().splitlines())
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:

@@ -123,7 +124,10 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(
+            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+        if not url:
+            url = "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
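The second hunk above splits the old one-liner so the empty-path fallback is explicit rather than hidden behind `or "/"`. A standalone sketch of that normalization, with example.com as a placeholder host:

import urllib.parse

for url in ("http://example.com/a%20b", "http://example.com"):
    # Unquote the URL, take its path component, and re-quote it...
    path = urllib.parse.quote(
        urllib.parse.urlparse(urllib.parse.unquote(url))[2])
    # ...falling back to "/" when the path is empty (a bare host).
    if not path:
        path = "/"
    print(repr(path))  # prints '/a%20b', then '/'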