bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

This commit is contained in:
Christopher Beacham 2018-05-16 07:52:07 -07:00 committed by Ned Deily
parent 7a1c027501
commit 5db5c0669e
5 changed files with 47 additions and 0 deletions

View file

@ -27,6 +27,7 @@ class RobotFileParser:
def __init__(self, url=''):
self.entries = []
self.sitemaps = []
self.default_entry = None
self.disallow_all = False
self.allow_all = False
@ -141,6 +142,12 @@ class RobotFileParser:
and numbers[1].strip().isdigit()):
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
state = 2
elif line[0] == "sitemap":
# According to http://www.sitemaps.org/protocol.html
# "This directive is independent of the user-agent line,
# so it doesn't matter where you place it in your file."
# Therefore we do not change the state of the parser.
self.sitemaps.append(line[1])
if state == 2:
self._add_entry(entry)
@ -189,6 +196,11 @@ class RobotFileParser:
return entry.req_rate
return self.default_entry.req_rate
def site_maps(self):
    """Return the sitemap entries collected in ``self.sitemaps``.

    Returns ``None`` when no entries have been recorded; otherwise
    returns the stored list itself (not a copy), in the order the
    entries were appended.
    """
    # An empty collection is reported as None rather than as [].
    if not self.sitemaps:
        return None
    return self.sitemaps
def __str__(self):
entries = self.entries
if self.default_entry is not None: