Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate extensions.

Patch by Nikolay Bogoychev.
Committed by Berker Peksag on 2015-10-08 12:27:06 +03:00
commit 960e848f0d (parent 2137dc1573)
6 changed files with 147 additions and 26 deletions

Lib/urllib/robotparser.py

@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                 if state != 0:
                     entry.rulelines.append(RuleLine(line[1], True))
                     state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
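
For reference, the two new branches recognize the nonstandard Crawl-delay and Request-rate directives, as in this illustrative robots.txt sample (not part of the patch):

    User-agent: figtree
    Crawl-delay: 3
    Request-rate: 9/30
    Disallow: /tmp

Crawl-delay takes an integer number of seconds; Request-rate takes a requests/seconds pair. A malformed value (e.g. "Crawl-delay: soon") fails the isdigit() checks and is silently ignored rather than raising.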
@@ -153,6 +174,18 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return None
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return None
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
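
A minimal usage sketch of the two new accessors; the robots.txt lines and the agent name are invented for illustration and are not part of this patch:

import urllib.robotparser

# Illustrative robots.txt content (matches the sample shown above).
lines = [
    "User-agent: figtree",
    "Crawl-delay: 3",
    "Request-rate: 9/30",
    "Disallow: /tmp",
]

rp = urllib.robotparser.RobotFileParser()
rp.parse(lines)

print(rp.crawl_delay("figtree"))            # -> 3
rate = rp.request_rate("figtree")           # object with .requests and .seconds
if rate is not None:
    print(rate.requests, rate.seconds)      # -> 9 30
print(rp.can_fetch("figtree", "http://example.com/tmp/x"))  # -> False

Both accessors return None when no parsed entry applies to the given user agent.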
@@ -180,6 +213,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []