mirror of
https://github.com/python/cpython.git
synced 2025-07-25 12:14:38 +00:00

The new urllib package consists of code from urllib, urllib2, urlparse, and robotparser. The old modules have all been removed. The new package has five submodules: urllib.parse, urllib.request, urllib.response, urllib.error, and urllib.robotparser. The urllib.request.urlopen() function uses the url opener from urllib2. Note that the unit tests have not been renamed for the beta, but they will be renamed in the future. Joint work with Senthil Kumaran.
155 lines
3.5 KiB
Python
155 lines
3.5 KiB
Python
import io
|
|
import unittest
|
|
import urllib.robotparser
|
|
from test import support
|
|
|
|
class RobotTestCase(unittest.TestCase):
    """Check a single URL against an already-parsed robots.txt.

    Each instance asserts that ``parser.can_fetch(agent, url)`` is true
    (when ``good`` is truthy) or false (when ``good`` is falsy).

    Parameters:
        index: number of the robots.txt example; only used in the test
            label, where it is formatted with ``%d``.
        parser: object exposing ``can_fetch(agent, url)``.
        url: URL to check, or an ``(agent, url)`` tuple that overrides
            ``agent`` for this one check.
        good: truthy if the fetch must be allowed, falsy if it must be
            denied.
        agent: user-agent string used when ``url`` is not a tuple.
    """

    # Instances need constructor arguments and are added to a suite by hand,
    # so keep name-based collectors (pytest, nose) from instantiating the
    # class on their own.
    __test__ = False

    def __init__(self, index, parser, url, good, agent):
        super().__init__()
        verdict = "good" if good else "bad"
        # Label returned by __str__, e.g. "RobotTest(1, good, /test.html)".
        self.str = "RobotTest(%d, %s, %s)" % (index, verdict, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        """Assert that the parser's verdict matches the expected one."""
        if isinstance(self.url, tuple):
            # A tuple entry carries its own user agent.
            agent, url = self.url
        else:
            agent, url = self.agent, self.url
        allowed = self.parser.can_fetch(agent, url)
        # assertTrue/assertFalse replace the failUnless/failIf aliases,
        # which no longer exist on unittest.TestCase in Python 3.12+.
        if self.good:
            self.assertTrue(allowed)
        else:
            self.assertFalse(allowed)

    def __str__(self):
        return self.str
|
|
|
|
# Suite that accumulates every case registered through RobotTest().
tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse *robots_txt* once and queue one test case per listed URL.

    Parameters:
        index: number of the example, forwarded to each test case.
        robots_txt: text of the robots.txt file to parse.
        good_urls: entries that the parser is expected to allow.
        bad_urls: entries that the parser is expected to deny.
        agent: user agent forwarded to each test case.

    Cases are appended to the module-level ``tests`` suite, all of
    *good_urls* first and then all of *bad_urls*; every case shares the
    same parser instance.
    """
    robots = urllib.robotparser.RobotFileParser()
    robots.parse(io.StringIO(robots_txt).readlines())

    # 1 marks "must be allowed", 0 marks "must be denied".
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for target in urls:
            tests.addTest(RobotTestCase(index, robots, target, expected, agent))
|
|
|
|
# Cases 1-3: examples from http://www.robotstxt.org/wc/norobots.html
# (fetched 2002)

# 1. One wildcard record with three disallowed paths.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

RobotTest(1, doc, good, bad)

# 2. A wildcard record plus a record for one named agent whose Disallow
#    value is empty.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = [
    '/',
    '/test.html',
    ('cybermapper', '/cyberworld/map/index.html'),
]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3. Everything is disallowed for every agent.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html', '/', '/tmp/']

RobotTest(3, doc, good, bad)

# Cases 4-6: examples from http://www.robotstxt.org/wc/norobots-rfc.html
# (fetched 2002)

# 4. Record for a single named agent, with percent-encoded paths.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = [
    '/tmp',
    '/tmp.html',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a%2fb.html',
    '/~joe/index.html',
]

RobotTest(4, doc, good, bad, 'figtree')
# 5. Same file and expectations, checked under a longer agent string.
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6. Wildcard record with percent-encoded paths.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp']  # XFAIL: '/a%2fb.html'
bad = [
    '/tmp/',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a/b.html',
    '/%7Ejoe/index.html',
]

RobotTest(6, doc, good, bad)

# Case 7: from bug report #523041

# 7.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = []  # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
|
|
|
|
class TestCase(unittest.TestCase):
    """Live-network check against a site that is entirely password-protected."""

    def runTest(self):
        """Expect the parser to deny fetching the site's robots.txt for "*"."""
        # Ask the test harness for the 'network' resource before going online.
        support.requires('network')

        # The whole site is password-protected.
        site = 'http://mueblesmoraleda.com'

        robots = urllib.robotparser.RobotFileParser()
        robots.set_url(site)
        robots.read()

        allowed = robots.can_fetch("*", site + "/robots.txt")
        self.assertEqual(allowed, False)
|
|
|
|
def test_main():
    """Run the robots.txt suite, then the network check.

    Both are handed to ``support.run_unittest`` so that a failure in either
    one is reported by the test harness.
    """
    support.run_unittest(tests)
    # Previously this was ``TestCase().run()``, which records the outcome in
    # a temporary TestResult that nobody inspects, so a failing network check
    # went unnoticed. Passing the class to the harness reports it.
    support.run_unittest(TestCase)
|
|
|
|
if __name__ == '__main__':
    # The harness flag is spelled ``verbose``; the former ``support.Verbose``
    # only created a new, unused attribute on the module.
    support.verbose = 1
    test_main()
|