bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

Authored by Christopher Beacham on 2018-05-16 07:52:07 -07:00; committed by Ned Deily
parent 7a1c027501
commit 5db5c0669e
5 changed files with 47 additions and 0 deletions

Lib/test/test_robotparser.py

@@ -12,6 +12,7 @@ class BaseRobotTest:
     agent = 'test_robotparser'
     good = []
     bad = []
+    site_maps = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
             with self.subTest(url=url, agent=agent):
                 self.assertFalse(self.parser.can_fetch(agent, url))
 
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
     bad = ['/cyberworld/map/index.html']
 
 
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+    """
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+                 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
 class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 # go away
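
As context for the behavior under test: RobotFileParser.site_maps() returns
the list of Sitemap URLs declared in robots.txt, or None when none are
declared (hence the site_maps = None default on BaseRobotTest). A minimal
usage sketch, feeding the parser from a string via parse() the way the
test's setUp() does rather than fetching with set_url()/read(); the
robots.txt content mirrors SitemapTest above:

import urllib.robotparser

robots_txt = """\
User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Disallow: /cyberworld/map/
"""

parser = urllib.robotparser.RobotFileParser()
# parse() accepts an iterable of lines, so robots.txt content can be fed
# directly instead of being fetched over the network.
parser.parse(robots_txt.splitlines())

# The new accessor: declared Sitemap URLs, or None if there were none.
print(parser.site_maps())
# ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
#  'http://www.google.com/hostednews/sitemap_index.xml']

# Existing behavior is unchanged: Disallow rules still drive can_fetch().
print(parser.can_fetch('test_robotparser', '/cyberworld/map/index.html'))  # False
print(parser.can_fetch('test_robotparser', '/test.html'))                  # True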