mirror of
https://github.com/python/cpython.git
synced 2025-08-02 08:02:56 +00:00
Use bytes regex instead of decoding whole pages
This commit is contained in:
parent
c8f9c81cfa
commit
030cfe26a3
1 changed file with 10 additions and 12 deletions
|
@ -159,22 +159,20 @@ class Crawler(BaseClient):
|
||||||
|
|
||||||
Return a list of names.
|
Return a list of names.
|
||||||
"""
|
"""
|
||||||
with self._open_url(self.index_url) as index:
|
if '*' in name:
|
||||||
if '*' in name:
|
name.replace('*', '.*')
|
||||||
name.replace('*', '.*')
|
else:
|
||||||
else:
|
name = "%s%s%s" % ('*.?', name, '*.?')
|
||||||
name = "%s%s%s" % ('*.?', name, '*.?')
|
name = name.replace('*', '[^<]*') # avoid matching end tag
|
||||||
name = name.replace('*', '[^<]*') # avoid matching end tag
|
pattern = ('<a[^>]*>(%s)</a>' % name).encode('utf-8')
|
||||||
projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
|
projectname = re.compile(pattern, re.I)
|
||||||
matching_projects = []
|
matching_projects = []
|
||||||
|
|
||||||
|
with self._open_url(self.index_url) as index:
|
||||||
index_content = index.read()
|
index_content = index.read()
|
||||||
|
|
||||||
# FIXME should use bytes I/O and regexes instead of decoding
|
|
||||||
index_content = index_content.decode()
|
|
||||||
|
|
||||||
for match in projectname.finditer(index_content):
|
for match in projectname.finditer(index_content):
|
||||||
project_name = match.group(1)
|
project_name = match.group(1).decode('utf-8')
|
||||||
matching_projects.append(self._get_project(project_name))
|
matching_projects.append(self._get_project(project_name))
|
||||||
return matching_projects
|
return matching_projects
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue