Mirror of https://github.com/python/cpython.git, synced 2025-07-19 17:25:54 +00:00
Bug 3347: robotparser failed because it didn't convert bytes to string.
The solution is to convert bytes to text via UTF-8. I'm not entirely sure if this is safe, but it looks like robots.txt is expected to be ASCII.
This commit is contained in:
parent 48577d1944
commit 73fd46d24e
2 changed files with 18 additions and 5 deletions
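The gist of the fix: in Python 3, urllib.request.urlopen(...).read() returns bytes, so the robots.txt body must be decoded to str before it is split into lines for parsing. Below is a minimal sketch of the repaired flow; the byte payload and the example.com URLs are illustrative stand-ins, not taken from the commit.

import urllib.robotparser

# Hypothetical robots.txt body, as bytes -- the type urlopen().read() returns.
raw = b"User-agent: *\nDisallow: /private/\n"

# Decode first (robots.txt is nominally ASCII, and UTF-8 is a superset of
# ASCII), then split into str lines for the parser.
parser = urllib.robotparser.RobotFileParser()
parser.parse(raw.decode("utf-8").splitlines())

print(parser.can_fetch("*", "http://example.com/private/page"))  # False
print(parser.can_fetch("*", "http://example.com/public/page"))   # True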
Lib/test/test_robotparser.py

@@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
 
 RobotTest(7, doc, good, bad)
 
-class TestCase(unittest.TestCase):
-    def runTest(self):
+class NetworkTestCase(unittest.TestCase):
+
+    def testPasswordProtectedSite(self):
         support.requires('network')
         # whole site is password-protected.
         url = 'http://mueblesmoraleda.com'

@@ -146,9 +147,17 @@ class TestCase(unittest.TestCase):
         parser.read()
         self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False)
 
+    def testPythonOrg(self):
+        support.requires('network')
+        parser = urllib.robotparser.RobotFileParser(
+            "http://www.python.org/robots.txt")
+        parser.read()
+        self.assertTrue(parser.can_fetch("*",
+                        "http://www.python.org/robots.txt"))
+
 def test_main():
+    support.run_unittest(NetworkTestCase)
     support.run_unittest(tests)
-    TestCase().run()
 
 if __name__=='__main__':
     support.Verbose = 1
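Both new NetworkTestCase methods above require network access. For a quick offline check of the same decode-then-parse path, parse() can be fed lines directly with no urlopen involved; the test class and rule set below are a hypothetical local variant, not part of the commit.

import unittest
import urllib.robotparser

class LocalRobotsTestCase(unittest.TestCase):
    # Offline stand-in for the network tests: the bytes body is decoded
    # and parsed locally instead of being fetched with read().
    def test_parse_decoded_lines(self):
        raw = b"User-agent: *\nDisallow: /\n"
        parser = urllib.robotparser.RobotFileParser()
        parser.parse(raw.decode("utf-8").splitlines())
        # "Disallow: /" blocks everything for every agent.
        self.assertFalse(parser.can_fetch("*", "http://example.com/robots.txt"))

if __name__ == '__main__':
    unittest.main()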
Lib/urllib/robotparser.py

@@ -60,7 +60,8 @@ class RobotFileParser:
             elif err.code >= 400:
                 self.allow_all = True
         else:
-            self.parse(f.read().splitlines())
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
 
     def _add_entry(self, entry):
         if "*" in entry.useragents:

@@ -123,7 +124,10 @@ class RobotFileParser:
             return True
         # search for given user agent matches
         # the first match counts
-        url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
+        url = urllib.parse.quote(
+            urllib.parse.urlparse(urllib.parse.unquote(url))[2])
+        if not url:
+            url = "/"
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.allowance(url)
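The second hunk above splits the old one-liner so the empty-path fallback is explicit rather than hidden behind `or "/"`. A standalone sketch of that normalization, with example.com as a placeholder host:

import urllib.parse

for url in ("http://example.com/a%20b", "http://example.com"):
    # Unquote the URL, take its path component, and re-quote it...
    path = urllib.parse.quote(
        urllib.parse.urlparse(urllib.parse.unquote(url))[2])
    # ...falling back to "/" when the path is empty (a bare host).
    if not path:
        path = "/"
    print(repr(path))  # prints '/a%20b', then '/'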