Patch #499513: use readline() instead of readlines(). Removed the
unnecessary redirection limit code, which is already in FancyURLopener.
Martin v. Löwis 2002-03-18 10:41:20 +00:00
parent 73e618734d
commit d22368ffb3


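For context, the patch replaces a single readlines() call with an incremental readline() loop that strips each line as it arrives. A minimal, standalone sketch of that pattern (the URL and the urlopen call here are illustrative, not part of the patch):

import urllib

# Illustrative only: read a robots.txt response line by line instead of
# pulling the whole body into memory with readlines().
f = urllib.urlopen("http://www.example.com/robots.txt")  # hypothetical URL
lines = []
line = f.readline()
while line:
    lines.append(line.strip())   # strip whitespace as each line is read
    line = f.readline()
f.close()

Stripping during the read loop is what lets the later parse() code drop its own line.strip() call, as the diff below shows.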
@@ -4,7 +4,7 @@
 You can choose between two licenses when using this package:
 1) GNU GPLv2
-2) PYTHON 2.0 OPEN SOURCE LICENSE
+2) PSF license for Python 2.2

 The robots.txt Exclusion Protocol is implemented as specified in
 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
@@ -42,7 +42,11 @@ class RobotFileParser:
     def read(self):
         opener = URLopener()
         f = opener.open(self.url)
-        lines = f.readlines()
+        lines = []
+        line = f.readline()
+        while line:
+            lines.append(line.strip())
+            line = f.readline()
         self.errcode = opener.errcode
         if self.errcode == 401 or self.errcode == 403:
             self.disallow_all = 1
@@ -63,7 +67,6 @@ class RobotFileParser:
         entry = Entry()
         for line in lines:
-            line = line.strip()
             linenumber = linenumber + 1
             if not line:
                 if state==1:
@@ -209,25 +212,12 @@ class URLopener(urllib.FancyURLopener):
     def __init__(self, *args):
         apply(urllib.FancyURLopener.__init__, (self,) + args)
         self.errcode = 200
-        self.tries = 0
-        self.maxtries = 10

     def http_error_default(self, url, fp, errcode, errmsg, headers):
         self.errcode = errcode
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)

-    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
-        self.tries += 1
-        if self.tries >= self.maxtries:
-            return self.http_error_default(url, fp, 500,
-                                           "Internal Server Error: Redirect Recursion",
-                                           headers)
-        result = urllib.FancyURLopener.http_error_302(self, url, fp, errcode,
-                                                      errmsg, headers, data)
-        self.tries = 0
-        return result

 def _check(a,b):
     if not b:
         ac = "access denied"