Issue #16099: RobotFileParser now supports Crawl-delay and Request-rate extensions.

Patch by Nikolay Bogoychev.
Committed by Berker Peksag on 2015-10-08 12:27:06 +03:00
commit 960e848f0d (parent 2137dc1573)
6 changed files with 147 additions and 26 deletions

Lib/urllib/robotparser.py

@@ -10,7 +10,9 @@
     http://www.robotstxt.org/norobots-rfc.txt
 """
 
-import urllib.parse, urllib.request
+import collections
+import urllib.parse
+import urllib.request
 
 __all__ = ["RobotFileParser"]
 
@@ -120,10 +122,29 @@ class RobotFileParser:
                 if state != 0:
                     entry.rulelines.append(RuleLine(line[1], True))
                     state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        # before trying to convert to int we need to make
+                        # sure that robots.txt has valid syntax otherwise
+                        # it will crash
+                        if line[1].strip().isdigit():
+                            entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        # check if all values are sane
+                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
+                            and numbers[1].strip().isdigit()):
+                            req_rate = collections.namedtuple('req_rate',
+                                                              'requests seconds')
+                            entry.req_rate = req_rate
+                            entry.req_rate.requests = int(numbers[0])
+                            entry.req_rate.seconds = int(numbers[1])
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
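
For reference, the two new branches recognize the nonstandard Crawl-delay and Request-rate directives, as in this illustrative robots.txt sample (not part of the patch):

    User-agent: figtree
    Crawl-delay: 3
    Request-rate: 9/30
    Disallow: /tmp

Crawl-delay takes an integer number of seconds; Request-rate takes a requests/seconds pair. A malformed value (e.g. "Crawl-delay: soon") fails the isdigit() checks and is silently ignored rather than raising.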
@@ -153,6 +174,18 @@ class RobotFileParser:
         # agent not found ==> access granted
         return True
 
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.delay
+        return None
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.req_rate
+        return None
+
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
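
A minimal usage sketch of the two new accessors; the robots.txt lines and the agent name are invented for illustration and are not part of this patch:

import urllib.robotparser

# Illustrative robots.txt content (matches the sample shown above).
lines = [
    "User-agent: figtree",
    "Crawl-delay: 3",
    "Request-rate: 9/30",
    "Disallow: /tmp",
]

rp = urllib.robotparser.RobotFileParser()
rp.parse(lines)

print(rp.crawl_delay("figtree"))            # -> 3
rate = rp.request_rate("figtree")           # object with .requests and .seconds
if rate is not None:
    print(rate.requests, rate.seconds)      # -> 9 30
print(rp.can_fetch("figtree", "http://example.com/tmp/x"))  # -> False

Both accessors return None when no parsed entry applies to the given user agent.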
@@ -180,6 +213,8 @@ class Entry:
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = None
+        self.req_rate = None
 
     def __str__(self):
         ret = []