mirror of
https://github.com/python/cpython.git
synced 2025-07-25 20:24:11 +00:00

* Repair the broken link to norobots-rfc.txt. * HTTP response codes >= 500 are treated as a failed read rather than as a not found. Not found means that we can assume the entire site is allowed. A 5xx server error tells us nothing. * A successful read() or parse() updates the mtime (which is defined to be "the time the robots.txt file was last fetched"). * The can_fetch() method returns False unless we've had a read() with a 2xx or 4xx response. This avoids false positives in the case where a user calls can_fetch() before calling read(). * I don't see any easy way to test this patch without hitting internet resources that might change or without use of mock objects that wouldn't provide much reassurance.
212 lines
6.8 KiB
Python
212 lines
6.8 KiB
Python
""" robotparser.py
|
|
|
|
Copyright (C) 2000 Bastian Kleineidam
|
|
|
|
You can choose between two licenses when using this package:
|
|
1) GNU GPLv2
|
|
2) PSF license for Python 2.2
|
|
|
|
The robots.txt Exclusion Protocol is implemented as specified in
|
|
http://www.robotstxt.org/norobots-rfc.txt
|
|
"""
|
|
|
|
import urllib.parse, urllib.request
|
|
|
|
__all__ = ["RobotFileParser"]
|
|
|
|
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    Typical use: construct with the robots.txt URL (or call set_url()),
    call read() (or feed lines to parse()), then query can_fetch().
    """

    def __init__(self, url=''):
        """Create a parser for the robots.txt file located at *url*."""
        # Entries for specific user agents, in file order.
        self.entries = []
        # The first "User-agent: *" entry, consulted after all others.
        self.default_entry = None
        # Set by read() on a 401/403 response: nothing may be fetched.
        self.disallow_all = False
        # Set by read() on any other 4xx response: everything may be fetched.
        self.allow_all = False
        self.set_url(url)
        # 0 means "never parsed"; can_fetch() relies on this sentinel.
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.  The value is 0
        until modified() has been called (parse() calls it).
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        # Elements 1 and 2 of the parse result are netloc and path.
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser.

        HTTP errors are translated into parser state instead of being
        raised: 401 and 403 set ``disallow_all``; any other 4xx status
        sets ``allow_all``; any other HTTP error status (e.g. 5xx)
        leaves the parser untouched.  Exceptions other than HTTPError
        raised by urlopen(), and decoding errors for a body that is not
        valid UTF-8, propagate to the caller.
        """
        try:
            response = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif 400 <= err.code < 500:
                self.allow_all = True
        else:
            # Close the connection as soon as the body has been read
            # rather than leaving it to the garbage collector.
            with response:
                raw = response.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        """File *entry* as the default entry or as an agent-specific one."""
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.

        *lines* is an iterable of strings.  The fetch time reported by
        mtime() is updated before any line is processed.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                # A blank line ends the current record.  Only a record
                # that already has at least one rule is kept.
                if state == 2:
                    self._add_entry(entry)
                if state != 0:
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            line = line.partition('#')[0].strip()
            if not line:
                continue
            field, colon, value = line.partition(':')
            if not colon:
                # Not a "field: value" line; ignore it.
                continue
            field = field.strip().lower()
            value = urllib.parse.unquote(value.strip())
            if field == "user-agent":
                if state == 2:
                    # A user-agent line directly after rules starts a
                    # new record even without a separating blank line.
                    self._add_entry(entry)
                    entry = Entry()
                entry.useragents.append(value)
                state = 1
            elif field == "disallow":
                # Rules seen before any user-agent line are ignored.
                if state != 0:
                    entry.rulelines.append(RuleLine(value, False))
                    state = 2
            elif field == "allow":
                if state != 0:
                    entry.rulelines.append(RuleLine(value, True))
                    state = 2
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if *useragent* can fetch *url*.

        Returns False when ``disallow_all`` is set, True when
        ``allow_all`` is set, and False when nothing has been parsed
        yet.  Otherwise the first agent-specific entry that applies to
        *useragent* decides; failing that the default ("*") entry
        decides; failing that access is granted.
        """
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        # Drop scheme and host: rules are matched against the rest only.
        url = urllib.parse.urlunparse(
            ('', '', parsed_url.path, parsed_url.params,
             parsed_url.query, parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        """Return the parsed entries as text, one block per entry.

        The default ("*") entry is stored separately from ``entries``;
        it is rendered last, mirroring the order can_fetch() consults it.
        """
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return ''.join([str(entry) + "\n" for entry in entries])
|
|
|
|
|
|
class RuleLine:
    """One "Allow:" (allowance==True) or "Disallow:" (allowance==False)
    rule together with the path it governs."""

    def __init__(self, path, allowance):
        """Store the normalised, percent-quoted *path* and its *allowance*."""
        normalised = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(normalised)
        # an empty Disallow value means allow all
        empty_disallow = (not allowance) and path == ''
        self.allowance = True if empty_disallow else allowance

    def applies_to(self, filename):
        """Return True if this rule covers *filename* (prefix match)."""
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        """Render the rule the way it would appear in robots.txt."""
        directive = "Allow" if self.allowance else "Disallow"
        return directive + ": " + self.path
|
|
|
|
|
|
class Entry:
    """A robots.txt record: one or more user-agents and zero or more
    rule lines."""

    def __init__(self):
        """Start with no user agents and no rules."""
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        """Render the record: user-agent lines first, then the rules."""
        agent_text = ["User-agent: " + agent + "\n"
                      for agent in self.useragents]
        rule_text = [str(rule) + "\n" for rule in self.rulelines]
        return ''.join(agent_text + rule_text)

    def applies_to(self, useragent):
        """Check whether this entry applies to the specified agent."""
        # Only the name token before the first "/" is compared,
        # case-insensitively.
        name_token = useragent.split("/")[0].lower()
        return any(
            # '*' is the catch-all agent; otherwise a substring match
            # against the name token is enough.
            candidate == '*' or candidate.lower() in name_token
            for candidate in self.useragents
        )

    def allowance(self, filename):
        """Return the allowance of the first rule covering *filename*.

        Preconditions stated by the original author:
        - our agent applies to this entry
        - filename is URL decoded

        When no rule applies, access is allowed (True).
        """
        matching = (rule.allowance for rule in self.rulelines
                    if rule.applies_to(filename))
        return next(matching, True)
|