mirror of
https://github.com/python/cpython.git
synced 2025-07-24 19:54:21 +00:00
bpo-21475: Support the Sitemap extension in robotparser (GH-6883)
This commit is contained in:
parent
7a1c027501
commit
5db5c0669e
5 changed files with 47 additions and 0 deletions
|
@@ -27,6 +27,7 @@ class RobotFileParser:
|
|||
|
||||
def __init__(self, url=''):
|
||||
self.entries = []
|
||||
self.sitemaps = []
|
||||
self.default_entry = None
|
||||
self.disallow_all = False
|
||||
self.allow_all = False
|
||||
|
@@ -141,6 +142,12 @@ class RobotFileParser:
|
|||
and numbers[1].strip().isdigit()):
|
||||
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
|
||||
state = 2
|
||||
elif line[0] == "sitemap":
|
||||
# According to http://www.sitemaps.org/protocol.html
|
||||
# "This directive is independent of the user-agent line,
|
||||
# so it doesn't matter where you place it in your file."
|
||||
# Therefore we do not change the state of the parser.
|
||||
self.sitemaps.append(line[1])
|
||||
if state == 2:
|
||||
self._add_entry(entry)
|
||||
|
||||
|
@@ -189,6 +196,11 @@ class RobotFileParser:
|
|||
return entry.req_rate
|
||||
return self.default_entry.req_rate
|
||||
|
||||
def site_maps(self):
|
||||
if not self.sitemaps:
|
||||
return None
|
||||
return self.sitemaps
|
||||
|
||||
def __str__(self):
|
||||
entries = self.entries
|
||||
if self.default_entry is not None:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue