mirror of
https://github.com/python/cpython.git
synced 2025-10-17 12:18:23 +00:00
Patch #499513: use readline() instead of readlines(). Removed the
unnecessary redirection limit code which is already in FancyURLopener.
This commit is contained in:
parent
73e618734d
commit
d22368ffb3
1 changed file with 6 additions and 16 deletions
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
You can choose between two licenses when using this package:
|
You can choose between two licenses when using this package:
|
||||||
1) GNU GPLv2
|
1) GNU GPLv2
|
||||||
2) PYTHON 2.0 OPEN SOURCE LICENSE
|
2) PSF license for Python 2.2
|
||||||
|
|
||||||
The robots.txt Exclusion Protocol is implemented as specified in
|
The robots.txt Exclusion Protocol is implemented as specified in
|
||||||
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
|
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
|
||||||
|
@ -42,7 +42,11 @@ class RobotFileParser:
|
||||||
def read(self):
|
def read(self):
|
||||||
opener = URLopener()
|
opener = URLopener()
|
||||||
f = opener.open(self.url)
|
f = opener.open(self.url)
|
||||||
lines = f.readlines()
|
lines = []
|
||||||
|
line = f.readline()
|
||||||
|
while line:
|
||||||
|
lines.append(line.strip())
|
||||||
|
line = f.readline()
|
||||||
self.errcode = opener.errcode
|
self.errcode = opener.errcode
|
||||||
if self.errcode == 401 or self.errcode == 403:
|
if self.errcode == 401 or self.errcode == 403:
|
||||||
self.disallow_all = 1
|
self.disallow_all = 1
|
||||||
|
@ -63,7 +67,6 @@ class RobotFileParser:
|
||||||
entry = Entry()
|
entry = Entry()
|
||||||
|
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = line.strip()
|
|
||||||
linenumber = linenumber + 1
|
linenumber = linenumber + 1
|
||||||
if not line:
|
if not line:
|
||||||
if state==1:
|
if state==1:
|
||||||
|
@ -209,25 +212,12 @@ class URLopener(urllib.FancyURLopener):
|
||||||
def __init__(self, *args):
|
def __init__(self, *args):
|
||||||
apply(urllib.FancyURLopener.__init__, (self,) + args)
|
apply(urllib.FancyURLopener.__init__, (self,) + args)
|
||||||
self.errcode = 200
|
self.errcode = 200
|
||||||
self.tries = 0
|
|
||||||
self.maxtries = 10
|
|
||||||
|
|
||||||
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
def http_error_default(self, url, fp, errcode, errmsg, headers):
|
||||||
self.errcode = errcode
|
self.errcode = errcode
|
||||||
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
|
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
|
||||||
errmsg, headers)
|
errmsg, headers)
|
||||||
|
|
||||||
def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
|
|
||||||
self.tries += 1
|
|
||||||
if self.tries >= self.maxtries:
|
|
||||||
return self.http_error_default(url, fp, 500,
|
|
||||||
"Internal Server Error: Redirect Recursion",
|
|
||||||
headers)
|
|
||||||
result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
|
|
||||||
errmsg, headers, data)
|
|
||||||
self.tries = 0
|
|
||||||
return result
|
|
||||||
|
|
||||||
def _check(a,b):
|
def _check(a,b):
|
||||||
if not b:
|
if not b:
|
||||||
ac = "access denied"
|
ac = "access denied"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue