mirror of
https://github.com/python/cpython.git
synced 2025-07-23 03:05:38 +00:00
Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate
extensions. Patch by Nikolay Bogoychev.
This commit is contained in:
parent
2137dc1573
commit
960e848f0d
6 changed files with 147 additions and 26 deletions
|
@ -10,7 +10,9 @@
|
|||
http://www.robotstxt.org/norobots-rfc.txt
|
||||
"""
|
||||
|
||||
import urllib.parse, urllib.request
|
||||
import collections
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
__all__ = ["RobotFileParser"]
|
||||
|
||||
|
@ -120,10 +122,29 @@ class RobotFileParser:
|
|||
if state != 0:
|
||||
entry.rulelines.append(RuleLine(line[1], True))
|
||||
state = 2
|
||||
elif line[0] == "crawl-delay":
|
||||
if state != 0:
|
||||
# before trying to convert to int we need to make
|
||||
# sure that robots.txt has valid syntax otherwise
|
||||
# it will crash
|
||||
if line[1].strip().isdigit():
|
||||
entry.delay = int(line[1])
|
||||
state = 2
|
||||
elif line[0] == "request-rate":
|
||||
if state != 0:
|
||||
numbers = line[1].split('/')
|
||||
# check if all values are sane
|
||||
if (len(numbers) == 2 and numbers[0].strip().isdigit()
|
||||
and numbers[1].strip().isdigit()):
|
||||
req_rate = collections.namedtuple('req_rate',
|
||||
'requests seconds')
|
||||
entry.req_rate = req_rate
|
||||
entry.req_rate.requests = int(numbers[0])
|
||||
entry.req_rate.seconds = int(numbers[1])
|
||||
state = 2
|
||||
if state == 2:
|
||||
self._add_entry(entry)
|
||||
|
||||
|
||||
def can_fetch(self, useragent, url):
|
||||
"""using the parsed robots.txt decide if useragent can fetch url"""
|
||||
if self.disallow_all:
|
||||
|
@ -153,6 +174,18 @@ class RobotFileParser:
|
|||
# agent not found ==> access granted
|
||||
return True
|
||||
|
||||
def crawl_delay(self, useragent):
    """Return the Crawl-delay value that applies to *useragent*.

    The first entry in ``self.entries`` whose ``applies_to(useragent)``
    is true decides the result: its ``delay`` attribute is returned,
    which is ``None`` when that entry carries no valid Crawl-delay line.
    When no specific entry matches, the catch-all entry's delay is used
    if such an entry is available; otherwise ``None`` is returned.
    """
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.delay
    # NOTE(review): assumes the "*" record is stored in self.default_entry
    # rather than in self.entries -- confirm against _add_entry().  getattr
    # preserves the previous "return None" result if the attribute is absent.
    default_entry = getattr(self, "default_entry", None)
    if default_entry is not None:
        return default_entry.delay
    return None
|
||||
|
||||
def request_rate(self, useragent):
    """Return the Request-rate value that applies to *useragent*.

    The first entry in ``self.entries`` whose ``applies_to(useragent)``
    is true decides the result: its ``req_rate`` attribute is returned,
    which is ``None`` when that entry carries no valid Request-rate line.
    When no specific entry matches, the catch-all entry's rate is used
    if such an entry is available; otherwise ``None`` is returned.
    """
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.req_rate
    # NOTE(review): assumes the "*" record is stored in self.default_entry
    # rather than in self.entries -- confirm against _add_entry().  getattr
    # preserves the previous "return None" result if the attribute is absent.
    default_entry = getattr(self, "default_entry", None)
    if default_entry is not None:
        return default_entry.req_rate
    return None
|
||||
|
||||
def __str__(self):
|
||||
return ''.join([str(entry) + "\n" for entry in self.entries])
|
||||
|
||||
|
@ -180,6 +213,8 @@ class Entry:
|
|||
def __init__(self):
|
||||
self.useragents = []
|
||||
self.rulelines = []
|
||||
self.delay = None
|
||||
self.req_rate = None
|
||||
|
||||
def __str__(self):
|
||||
ret = []
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue