From 9a7bbb2e3f12faaf4110ecd15fb739e94f4bc8f6 Mon Sep 17 00:00:00 2001
From: Berker Peksag
Date: Sun, 18 Sep 2016 20:17:58 +0300
Subject: [PATCH] Issue #25400: RobotFileParser now correctly returns default
 values for crawl_delay and request_rate

Initial patch by Peter Wirtz.
---
 Lib/test/test_robotparser.py | 56 ++++++++++++++++++++++++------------
 Lib/urllib/robotparser.py    |  8 ++++--
 Misc/NEWS                    |  3 ++
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 51b48ce53c9..0f64ba8b060 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,7 +79,28 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
@@ -96,24 +117,6 @@ Disallow: /%7ejoe/index.html
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 85add1624ae..9dab4c1c3a8 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@ class RobotFileParser:
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
diff --git a/Misc/NEWS b/Misc/NEWS
index 671a9b41c96..e26a5c05aba 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -29,6 +29,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.
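
As a side note for reviewers, here is a minimal sketch of the behaviour the
patch is intended to produce. It is not part of the patch itself: the
robots.txt lines are the ones used by DefaultEntryTest above, and the agent
name 'ExampleBot' is an arbitrary placeholder for any agent that has no entry
of its own.

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()

    # No robots.txt has been read yet, so mtime() is 0 and both queries
    # return None (this matches the new NetworkTestCase assertions).
    assert parser.crawl_delay('*') is None
    assert parser.request_rate('*') is None

    parser.parse([
        'User-agent: *',
        'Crawl-delay: 1',
        'Request-rate: 3/15',
        'Disallow: /cyberworld/map/',
    ])

    # 'ExampleBot' matches no specific entry, so the values now come from
    # the default (*) entry; before the patch both calls returned None.
    print(parser.crawl_delay('ExampleBot'))    # 1
    rate = parser.request_rate('ExampleBot')
    print(rate.requests, rate.seconds)         # 3 15

The mtime() guard keeps the "nothing fetched yet" case separate from the
"no matching entry" case, so a parser that has never read a robots.txt still
returns None instead of raising AttributeError on the missing default entry.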