Mirror of https://github.com/python/cpython.git (synced 2025-09-09 18:32:22 +00:00)
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
This commit is contained in:
Parent: 7a1c027501
Commit: 5db5c0669e
5 changed files with 47 additions and 0 deletions
|
@@ -12,6 +12,7 @@ class BaseRobotTest:
|
|||
agent = 'test_robotparser'
|
||||
good = []
|
||||
bad = []
|
||||
site_maps = None
|
||||
|
||||
def setUp(self):
|
||||
lines = io.StringIO(self.robots_txt).readlines()
|
||||
|
@@ -36,6 +37,9 @@ class BaseRobotTest:
|
|||
with self.subTest(url=url, agent=agent):
|
||||
self.assertFalse(self.parser.can_fetch(agent, url))
|
||||
|
||||
def test_site_maps(self):
|
||||
self.assertEqual(self.parser.site_maps(), self.site_maps)
|
||||
|
||||
|
||||
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
|
||||
robots_txt = """\
|
||||
|
@@ -65,6 +69,23 @@ Disallow:
|
|||
bad = ['/cyberworld/map/index.html']
|
||||
|
||||
|
||||
class SitemapTest(BaseRobotTest, unittest.TestCase):
|
||||
robots_txt = """\
|
||||
# robots.txt for http://www.example.com/
|
||||
|
||||
User-agent: *
|
||||
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
|
||||
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
|
||||
Request-rate: 3/15
|
||||
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
|
||||
|
||||
"""
|
||||
good = ['/', '/test.html']
|
||||
bad = ['/cyberworld/map/index.html']
|
||||
site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
|
||||
'http://www.google.com/hostednews/sitemap_index.xml']
|
||||
|
||||
|
||||
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
|
||||
robots_txt = """\
|
||||
# go away
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue