initial import of the packaging package in the standard library

This commit is contained in:
Tarek Ziade 2011-05-19 13:07:25 +02:00
parent 566f8a646e
commit 1231a4e097
193 changed files with 30376 additions and 149 deletions

View file

@@ -0,0 +1,9 @@
"""Low-level and high-level APIs to interact with project indexes."""
__all__ = ['simple',
'xmlrpc',
'dist',
'errors',
'mirrors']
from packaging.pypi.dist import ReleaseInfo, ReleasesList, DistInfo

View file

@@ -0,0 +1,48 @@
"""Base class for index crawlers."""
from packaging.pypi.dist import ReleasesList
class BaseClient:
    """Base class containing common methods for the index crawlers/clients."""

    def __init__(self, prefer_final, prefer_source):
        self._prefer_final = prefer_final
        self._prefer_source = prefer_source
        # by default, the client is its own index
        self._index = self

    def _get_prefer_final(self, prefer_final=None):
        """Return the prefer_final internal parameter or the specified one if
        provided."""
        # compare against None, not truthiness: an explicit False argument
        # must override the internal default instead of being ignored
        if prefer_final is not None:
            return prefer_final
        return self._prefer_final

    def _get_prefer_source(self, prefer_source=None):
        """Return the prefer_source internal parameter or the specified one if
        provided."""
        if prefer_source is not None:
            return prefer_source
        return self._prefer_source

    def _get_project(self, project_name):
        """Return a project instance, create it if necessary."""
        # NOTE(review): self._projects is only defined by subclasses
        # (e.g. Crawler) — confirm before calling on a bare BaseClient
        return self._projects.setdefault(project_name.lower(),
                                         ReleasesList(project_name,
                                                      index=self._index))

    def download_distribution(self, requirements, temp_path=None,
                              prefer_source=None, prefer_final=None):
        """Download a distribution from the last release according to the
        requirements.

        If temp_path is provided, download to this path, otherwise, create a
        temporary location for the download and return it.
        """
        prefer_final = self._get_prefer_final(prefer_final)
        prefer_source = self._get_prefer_source(prefer_source)
        release = self.get_release(requirements, prefer_final)
        if release:
            dist = release.get_distribution(prefer_source=prefer_source)
            return dist.download(temp_path)

547
Lib/packaging/pypi/dist.py Normal file
View file

@@ -0,0 +1,547 @@
"""Classes representing releases and distributions retrieved from indexes.
A project (= unique name) can have several releases (= versions) and
each release can have several distributions (= sdist and bdists).
Release objects contain metadata-related information (see PEP 376);
distribution objects contain download-related information.
"""
import sys
import mimetypes
import re
import tempfile
import urllib.request
import urllib.parse
import urllib.error
import urllib.parse
import hashlib
from shutil import unpack_archive
from packaging.errors import IrrationalVersionError
from packaging.version import (suggest_normalized_version, NormalizedVersion,
get_version_predicate)
from packaging.metadata import Metadata
from packaging.pypi.errors import (HashDoesNotMatch, UnsupportedHashName,
CantParseArchiveName)
__all__ = ['ReleaseInfo', 'DistInfo', 'ReleasesList', 'get_infos_from_url']

# archive extensions recognized as downloadable distributions
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz .egg".split()
# matches an URL carrying an md5 checksum fragment, e.g. "...#md5=<hex>"
MD5_HASH = re.compile(r'^.*#md5=([a-f0-9]+)$')
# distribution kinds a release may contain
DIST_TYPES = ['bdist', 'sdist']
class IndexReference:
    """Mixin used to store the index reference"""
    def set_index(self, index=None):
        self._index = index


class ReleaseInfo(IndexReference):
    """Represent a release of a project (a project with a specific version).

    The release contains the metadata information related to this specific
    version, and is also a container for distribution related information.

    See the DistInfo class for more information about distributions.
    """

    def __init__(self, name, version, metadata=None, hidden=False,
                 index=None, **kwargs):
        """
        :param name: the name of the distribution
        :param version: the version of the distribution
        :param metadata: the metadata fields of the release.
        :type metadata: dict
        :param kwargs: optional arguments for a new distribution.
        """
        self.set_index(index)
        self.name = name
        self._version = None
        self.version = version  # validated through the property setter
        if metadata:
            self.metadata = Metadata(mapping=metadata)
        else:
            self.metadata = None
        self.dists = {}
        self.hidden = hidden

        if 'dist_type' in kwargs:
            dist_type = kwargs.pop('dist_type')
            self.add_distribution(dist_type, **kwargs)

    def set_version(self, version):
        try:
            self._version = NormalizedVersion(version)
        except IrrationalVersionError:
            suggestion = suggest_normalized_version(version)
            if suggestion:
                # recurse through the property with the normalized form
                self.version = suggestion
            else:
                raise IrrationalVersionError(version)

    def get_version(self):
        return self._version

    version = property(get_version, set_version)

    def fetch_metadata(self):
        """If the metadata is not set, use the indexes to get it"""
        if not self.metadata:
            self._index.get_metadata(self.name, str(self.version))
        return self.metadata

    @property
    def is_final(self):
        """proxy to version.is_final"""
        return self.version.is_final

    def fetch_distributions(self):
        """Fetch the distributions from the index when not known yet."""
        if self.dists is None:
            self._index.get_distributions(self.name, str(self.version))
            if self.dists is None:
                self.dists = {}
        return self.dists

    def add_distribution(self, dist_type='sdist', python_version=None,
                         **params):
        """Add distribution informations to this release.

        If distribution information is already set for this distribution
        type, add the given url paths to the distribution. This can be
        useful while some of them fails to download.

        :param dist_type: the distribution type (eg. "sdist", "bdist", etc.)
        :param params: the fields to be passed to the distribution object
                       (see the :class:DistInfo constructor).
        """
        if dist_type not in DIST_TYPES:
            raise ValueError(dist_type)
        if dist_type in self.dists:
            self.dists[dist_type].add_url(**params)
        else:
            self.dists[dist_type] = DistInfo(self, dist_type,
                                             index=self._index, **params)
        if python_version:
            self.dists[dist_type].python_version = python_version

    def get_distribution(self, dist_type=None, prefer_source=True):
        """Return a distribution.

        If dist_type is set, find first for this distribution type, and just
        act as an alias of __getitem__.

        If prefer_source is True, search first for a source distribution;
        otherwise (or when no sdist exists) return any existing distribution.

        Raise LookupError when the release has no distribution at all.
        """
        if len(self.dists) == 0:
            raise LookupError()
        if dist_type:
            return self[dist_type]
        if prefer_source and "sdist" in self.dists:
            return self["sdist"]
        # dict views are not iterators: wrap in iter() before next()
        # (this also gives prefer_source=False a defined result instead of
        # an unbound local)
        return next(iter(self.dists.values()))

    def unpack(self, path=None, prefer_source=True):
        """Unpack the distribution to the given path.

        If no destination is given, creates a temporary location.
        Returns the location of the extracted files (root).
        """
        return self.get_distribution(prefer_source=prefer_source)\
                   .unpack(path=path)

    def download(self, temp_path=None, prefer_source=True):
        """Download the distribution, using the requirements.

        If more than one distribution match the requirements, use the last
        version.
        Download the distribution, and put it in the temp_path. If no
        temp_path is given, creates and return one.

        Returns the complete absolute path to the downloaded archive.
        """
        return self.get_distribution(prefer_source=prefer_source)\
                   .download(path=temp_path)

    def set_metadata(self, metadata):
        if not self.metadata:
            self.metadata = Metadata()
        self.metadata.update(metadata)

    def __getitem__(self, item):
        """distributions are available using release["sdist"]"""
        return self.dists[item]

    def _check_is_comparable(self, other):
        # comparisons are only meaningful between releases of one project
        if not isinstance(other, ReleaseInfo):
            raise TypeError("cannot compare %s and %s"
                            % (type(self).__name__, type(other).__name__))
        elif self.name != other.name:
            raise TypeError("cannot compare %s and %s"
                            % (self.name, other.name))

    def __repr__(self):
        return "<%s %s>" % (self.name, self.version)

    def __eq__(self, other):
        self._check_is_comparable(other)
        return self.version == other.version

    def __lt__(self, other):
        self._check_is_comparable(other)
        return self.version < other.version

    def __ne__(self, other):
        return not self.__eq__(other)

    def __gt__(self, other):
        return not (self.__lt__(other) or self.__eq__(other))

    def __le__(self, other):
        return self.__eq__(other) or self.__lt__(other)

    def __ge__(self, other):
        return self.__eq__(other) or self.__gt__(other)

    # See http://docs.python.org/reference/datamodel#object.__hash__
    __hash__ = object.__hash__
class DistInfo(IndexReference):
    """Represents a distribution retrieved from an index (sdist, bdist, ...)
    """

    def __init__(self, release, dist_type=None, url=None, hashname=None,
                 hashval=None, is_external=True, python_version=None,
                 index=None):
        """Create a new instance of DistInfo.

        :param release: a DistInfo class is relative to a release.
        :param dist_type: the type of the dist (eg. source, bin-*, etc.)
        :param url: URL where we found this distribution
        :param hashname: the name of the hash we want to use. Refer to the
                         hashlib.new documentation for more information.
        :param hashval: the hash value.
        :param is_external: we need to know if the provided url comes from
                            an index browsing, or from an external resource.
        """
        self.set_index(index)
        self.release = release
        self.dist_type = dist_type
        self.python_version = python_version
        self._unpacked_dir = None
        # set the downloaded path to None by default. The goal here
        # is to not download distributions multiple times
        self.downloaded_location = None
        # We store urls in dicts, because we need a bit more information
        # than the simple URL. It will be used later to find the good url
        # to use.
        # We have two _url* attributes: _url and urls. urls contains a list
        # of dicts for the different urls, and _url contains the chosen
        # url, in order to not run the selection process multiple times.
        self.urls = []
        self._url = None
        self.add_url(url, hashname, hashval, is_external)

    def add_url(self, url=None, hashname=None, hashval=None, is_external=True):
        """Add a new url to the list of urls"""
        if hashname is not None:
            try:
                hashlib.new(hashname)
            except ValueError:
                raise UnsupportedHashName(hashname)

        if url not in [u['url'] for u in self.urls]:
            self.urls.append({
                'url': url,
                'hashname': hashname,
                'hashval': hashval,
                'is_external': is_external,
            })
            # reset the url selection process
            self._url = None

    @property
    def url(self):
        """Pick up the right url for the list of urls in self.urls"""
        # We return internal urls over externals.
        # If there is more than one internal or external, return the first
        # one.
        if self._url is None:
            if len(self.urls) > 1:
                internals_urls = [u for u in self.urls
                                  if u['is_external'] == False]
                if len(internals_urls) >= 1:
                    self._url = internals_urls[0]
            if self._url is None:
                self._url = self.urls[0]
        return self._url

    @property
    def is_source(self):
        """return if the distribution is a source one or not"""
        return self.dist_type == 'sdist'

    def download(self, path=None):
        """Download the distribution to a path, and return it.

        If the path is given in path, use this, otherwise, generates a new
        one. Return the download location.
        """
        if path is None:
            path = tempfile.mkdtemp()

        # if we have not downloaded it yet, do it.
        if self.downloaded_location is None:
            url = self.url['url']
            archive_name = urllib.parse.urlparse(url)[2].split('/')[-1]
            filename, headers = urllib.request.urlretrieve(
                url, path + "/" + archive_name)
            self.downloaded_location = filename
            self._check_md5(filename)
        return self.downloaded_location

    def unpack(self, path=None):
        """Unpack the distribution to the given path.

        If no destination is given, creates a temporary location.
        Returns the location of the extracted files (root).
        """
        if not self._unpacked_dir:
            if path is None:
                path = tempfile.mkdtemp()

            filename = self.download(path)
            # (an unused mimetypes.guess_type call was dropped here)
            unpack_archive(filename, path)
            self._unpacked_dir = path

        # always report where the files actually are: returning the bare
        # `path` argument gave None on repeated calls
        return self._unpacked_dir

    def _check_md5(self, filename):
        """Check that the md5 checksum of the given file matches the one in
        url param"""
        hashname = self.url['hashname']
        expected_hashval = self.url['hashval']
        if None not in (expected_hashval, hashname):
            with open(filename, 'rb') as f:
                hashval = hashlib.new(hashname)
                hashval.update(f.read())

            if hashval.hexdigest() != expected_hashval:
                raise HashDoesNotMatch("got %s instead of %s"
                                       % (hashval.hexdigest(),
                                          expected_hashval))

    def __repr__(self):
        if self.release is None:
            return "<? ? %s>" % self.dist_type

        return "<%s %s %s>" % (
            self.release.name, self.release.version, self.dist_type or "")
class ReleasesList(IndexReference):
    """A container of Release.

    Provides useful methods and facilities to sort and filter releases.
    """
    def __init__(self, name, releases=None, contains_hidden=False, index=None):
        # name: project name; releases: optional iterable of ReleaseInfo
        self.set_index(index)
        self.releases = []
        self.name = name
        self.contains_hidden = contains_hidden
        if releases:
            self.add_releases(releases)

    def fetch_releases(self):
        # the index populates self.releases as a side effect
        self._index.get_releases(self.name)
        return self.releases

    def filter(self, predicate):
        """Filter and return a subset of releases matching the given predicate.
        """
        return ReleasesList(self.name, [release for release in self.releases
                                        if predicate.match(release.version)],
                            index=self._index)

    def get_last(self, requirements, prefer_final=None):
        """Return the "last" release, that satisfy the given predicates.

        "last" is defined by the version number of the releases, you also
        could set prefer_final parameter to True or False to change the
        order results.
        """
        predicate = get_version_predicate(requirements)
        releases = self.filter(predicate)
        if len(releases) == 0:
            return None
        # highest-sorted release first, so [0] is the "last" one
        releases.sort_releases(prefer_final, reverse=True)
        return releases[0]

    def add_releases(self, releases):
        """Add releases in the release list.

        :param: releases is a list of ReleaseInfo objects.
        """
        for r in releases:
            self.add_release(release=r)

    def add_release(self, version=None, dist_type='sdist', release=None,
                    **dist_args):
        """Add a release to the list.

        The release can be passed in the `release` parameter, and in this
        case, it will be crawled to extract the useful informations if
        necessary, or the release informations can be directly passed in the
        `version` and `dist_type` arguments.

        Other keywords arguments can be provided, and will be forwarded to
        the distribution creation (eg. the arguments of the DistInfo
        constructor).
        """
        if release:
            if release.name.lower() != self.name.lower():
                raise ValueError("%s is not the same project as %s" %
                                 (release.name, self.name))
            version = str(release.version)

            if not version in self.get_versions():
                # append only if not already exists
                self.releases.append(release)
            # re-register every known distribution url under this project
            for dist in release.dists.values():
                for url in dist.urls:
                    self.add_release(version, dist.dist_type, **url)
        else:
            matches = [r for r in self.releases
                       if str(r.version) == version and r.name == self.name]
            if not matches:
                release = ReleaseInfo(self.name, version, index=self._index)
                self.releases.append(release)
            else:
                release = matches[0]

            release.add_distribution(dist_type=dist_type, **dist_args)

    def sort_releases(self, prefer_final=False, reverse=True, *args, **kwargs):
        """Sort the results with the given properties.

        The `prefer_final` argument can be used to specify if final
        distributions (eg. not dev, bet or alpha) would be prefered or not.

        Results can be inverted by using `reverse`.

        Any other parameter provided will be forwarded to the sorted call.
        You cannot redefine the key argument of "sorted" here, as it is used
        internally to sort the releases.
        """
        # sort primarily on is_final when requested, then on version
        sort_by = []
        if prefer_final:
            sort_by.append("is_final")
        sort_by.append("version")

        self.releases.sort(
            key=lambda i: tuple(getattr(i, arg) for arg in sort_by),
            reverse=reverse, *args, **kwargs)

    def get_release(self, version):
        """Return a release from its version."""
        matches = [r for r in self.releases if str(r.version) == version]
        if len(matches) != 1:
            raise KeyError(version)
        return matches[0]

    def get_versions(self):
        """Return a list of releases versions contained"""
        return [str(r.version) for r in self.releases]

    def __getitem__(self, key):
        return self.releases[key]

    def __len__(self):
        return len(self.releases)

    def __repr__(self):
        string = 'Project "%s"' % self.name
        if self.get_versions():
            string += ' versions: %s' % ', '.join(self.get_versions())
        return '<%s>' % string
def get_infos_from_url(url, probable_dist_name=None, is_external=True):
    """Get useful informations from an URL.

    Return a dict of (name, version, url, hashtype, hash, is_external)

    :param url: complete url of the distribution
    :param probable_dist_name: A probable name of the project.
    :param is_external: Tell if the url commes from an index or from
                        an external URL.
    """
    # extract the md5 checksum carried in the URL fragment, if any
    md5_hash = None
    match = MD5_HASH.match(url)
    if match is not None:
        md5_hash = match.group(1)
        # remove the hash
        url = url.replace("#md5=%s" % md5_hash, "")

    # the archive name is the last path component of the URL
    archive_name = urllib.parse.urlparse(url)[2].split('/')[-1]

    # strip any known archive extension from the name
    extension_matched = False
    for extension in EXTENSIONS:
        if archive_name.endswith(extension):
            archive_name = archive_name[:-len(extension)]
            extension_matched = True

    # NOTE(review): probable_dist_name is accepted but never forwarded to
    # split_archive_name — confirm whether it should be
    name, version = split_archive_name(archive_name)

    # links without a recognized extension implicitly return None
    if extension_matched is True:
        return {'name': name,
                'version': version,
                'url': url,
                'hashname': "md5",
                'hashval': md5_hash,
                'is_external': is_external,
                'dist_type': 'sdist'}
def split_archive_name(archive_name, probable_name=None):
    """Split an archive name into two parts: name and version.

    Return the tuple (name, version).

    :param archive_name: the archive name (extension already stripped)
    :param probable_name: a likely project name, used as a hint
    :raises CantParseArchiveName: when no plausible version can be found
    """
    # Try to determine which part is the name and which is the version
    # using the "-" separator. Take the larger part to be the version
    # number, then reduce if this does not work.
    def eager_split(text, maxsplit=2):
        # renamed from `str` to avoid shadowing the builtin
        splits = text.rsplit("-", maxsplit)
        name = splits[0]
        version = "-".join(splits[1:])
        if version.startswith("-"):
            version = version[1:]
        if suggest_normalized_version(version) is None and maxsplit >= 0:
            # we did not get a good version number: recurse!
            return eager_split(text, maxsplit - 1)
        else:
            return name, version

    if probable_name is not None:
        probable_name = probable_name.lower()
    name = None
    if probable_name is not None and probable_name in archive_name:
        # we get the name from probable_name, if given.
        name = probable_name
        # slice right after the name instead of str.lstrip(name):
        # lstrip removes a *set of characters*, not a prefix, and could
        # eat into the version string
        start = archive_name.index(name) + len(name)
        version = archive_name[start:].lstrip('-')
    else:
        name, version = eager_split(archive_name)

    version = suggest_normalized_version(version)
    if version is not None and name != "":
        return name.lower(), version
    else:
        raise CantParseArchiveName(archive_name)

View file

@@ -0,0 +1,39 @@
"""Exceptions raised by packaging.pypi code."""
from packaging.errors import PackagingPyPIError
class ProjectNotFound(PackagingPyPIError):
    """Project has not been found"""


class DistributionNotFound(PackagingPyPIError):
    """The distribution has not been found"""


class ReleaseNotFound(PackagingPyPIError):
    """The release has not been found"""


class CantParseArchiveName(PackagingPyPIError):
    """An archive name can't be parsed to find distribution name and version"""


class DownloadError(PackagingPyPIError):
    """An error has occurred while downloading"""


class HashDoesNotMatch(DownloadError):
    """Compared hashes do not match"""


class UnsupportedHashName(PackagingPyPIError):
    """An unsupported hashname has been used"""


class UnableToDownload(PackagingPyPIError):
    """All mirrors have been tried, without success"""


class InvalidSearchField(PackagingPyPIError):
    """An invalid search field has been used"""

View file

@@ -0,0 +1,52 @@
"""Utilities related to the mirror infrastructure defined in PEP 381."""
from string import ascii_lowercase
import socket
DEFAULT_MIRROR_URL = "last.pypi.python.org"
def get_mirrors(hostname=None):
    """Return the list of mirrors from the last record found on the DNS
    entry::

    >>> from packaging.pypi.mirrors import get_mirrors
    >>> get_mirrors()
    ['a.pypi.python.org', 'b.pypi.python.org', 'c.pypi.python.org',
    'd.pypi.python.org']
    """
    if hostname is None:
        hostname = DEFAULT_MIRROR_URL

    # the canonical name of this record points to the last registered
    # mirror; an unresolvable host means no mirrors at all
    try:
        hostname = socket.gethostbyname_ex(hostname)[0]
    except socket.gaierror:
        return []

    parts = hostname.split(".", 1)
    # enumerate every mirror name from "a.<domain>" up to the last one
    return ["%s.%s" % (prefix, parts[1]) for prefix in string_range(parts[0])]
def string_range(last):
    """Compute the range of strings between "a" and *last*, inclusive.

    This works for simple "a to z" lists, but also for "a to zz" lists.
    """
    # use the stdlib cartesian product instead of the hand-rolled helper
    from itertools import product

    for length in range(1, len(last) + 1):
        for letters in product(ascii_lowercase, repeat=length):
            result = ''.join(letters)
            yield result
            if result == last:
                return
def product(*args, **kwds):
    """Cartesian product of the input iterables.

    Thin wrapper around itertools.product (which this function used to
    re-implement by hand); only the ``repeat`` keyword is honoured.
    """
    from itertools import product as _itertools_product
    return _itertools_product(*args, repeat=kwds.get('repeat', 1))

View file

@@ -0,0 +1,452 @@
"""Spider using the screen-scraping "simple" PyPI API.
This module contains the class SimpleIndexCrawler, a simple spider that
can be used to find and retrieve distributions from a project index
(like the Python Package Index), using its so-called simple API (see
reference implementation available at http://pypi.python.org/simple/).
"""
import http.client
import re
import socket
import sys
import urllib.request
import urllib.parse
import urllib.error
import os
from fnmatch import translate
from packaging import logger
from packaging.metadata import Metadata
from packaging.version import get_version_predicate
from packaging import __version__ as packaging_version
from packaging.pypi.base import BaseClient
from packaging.pypi.dist import (ReleasesList, EXTENSIONS,
get_infos_from_url, MD5_HASH)
from packaging.pypi.errors import (PackagingPyPIError, DownloadError,
UnableToDownload, CantParseArchiveName,
ReleaseNotFound, ProjectNotFound)
from packaging.pypi.mirrors import get_mirrors
from packaging.metadata import Metadata
__all__ = ['Crawler', 'DEFAULT_SIMPLE_INDEX_URL']

# -- Constants -----------------------------------------------
DEFAULT_SIMPLE_INDEX_URL = "http://a.pypi.python.org/simple/"
DEFAULT_HOSTS = ("*",)  # allow every host by default
SOCKET_TIMEOUT = 15  # seconds
USER_AGENT = "Python-urllib/%s packaging/%s" % (
    sys.version[:3], packaging_version)

# -- Regexps -------------------------------------------------
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')
HREF = re.compile("""href\\s*=\\s*['"]?([^'"> ]+)""", re.I)
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):', re.I).match

# This pattern matches a character entity reference (a decimal numeric
# references, a hexadecimal numeric reference, or a named reference).
ENTITY_SUB = re.compile(r'&(#(\d+|x[\da-fA-F]+)|[\w.:-]+);?').sub
# matches a tag carrying a rel="..." attribute (group 2 is the rel value)
REL = re.compile("""<([^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*)>""", re.I)
def socket_timeout(timeout=SOCKET_TIMEOUT):
    """Decorator to add a socket timeout when requesting pages on PyPI.

    An instance attribute ``_timeout`` on the decorated method's object,
    when present, overrides the decorator argument.
    """
    def _socket_timeout(func):
        def _socket_timeout(self, *args, **kwargs):
            old_timeout = socket.getdefaulttimeout()
            # read the per-instance timeout with getattr: assigning to
            # `timeout` here made it a local variable and raised
            # UnboundLocalError when the attribute was missing
            effective_timeout = getattr(self, "_timeout", timeout)
            socket.setdefaulttimeout(effective_timeout)
            try:
                return func(self, *args, **kwargs)
            finally:
                socket.setdefaulttimeout(old_timeout)
        return _socket_timeout
    return _socket_timeout
def with_mirror_support():
    """Decorator that makes the mirroring support easier"""
    def wrapper(func):
        def wrapped(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except DownloadError:
                # the current index failed: either switch to the next
                # mirror or count one more retry, then start over
                if self._mirrors_tries >= self._mirrors_max_tries:
                    try:
                        self._switch_to_next_mirror()
                    except KeyError:
                        raise UnableToDownload("Tried all mirrors")
                else:
                    self._mirrors_tries += 1
                # drop cached project data and retry against the new index
                self._projects.clear()
                return wrapped(self, *args, **kwargs)
        return wrapped
    return wrapper
class Crawler(BaseClient):
    """Provides useful tools to request the Python Package Index simple API.

    You can specify both mirrors and mirrors_url, but mirrors_url will only
    be used if mirrors is set to None.

    :param index_url: the url of the simple index to search on.
    :param prefer_final: if the version is not mentioned, and the last
                         version is not a "final" one (alpha, beta, etc.),
                         pick up the last final version.
    :param prefer_source: if the distribution type is not mentioned, pick up
                          the source one if available.
    :param follow_externals: tell if following external links is needed or
                             not. Default is False.
    :param hosts: a list of hosts allowed to be processed while using
                  follow_externals=True. Default behavior is to follow all
                  hosts.
    :param mirrors_url: the url to look on for DNS records giving mirror
                        addresses.
    :param mirrors: a list of mirrors (see PEP 381).
    :param timeout: time in seconds to consider a url has timed out.
    :param mirrors_max_tries: number of times to try requesting informations
                              on mirrors before switching.
    """
    def __init__(self, index_url=DEFAULT_SIMPLE_INDEX_URL, prefer_final=False,
                 prefer_source=True, hosts=DEFAULT_HOSTS,
                 follow_externals=False, mirrors_url=None, mirrors=None,
                 timeout=SOCKET_TIMEOUT, mirrors_max_tries=0):
        # see the class docstring for the meaning of the parameters
        super(Crawler, self).__init__(prefer_final, prefer_source)
        self.follow_externals = follow_externals

        # mirroring attributes.
        if not index_url.endswith("/"):
            index_url += "/"
        # if no mirrors are defined, use the method described in PEP 381.
        if mirrors is None:
            mirrors = get_mirrors(mirrors_url)
        self._mirrors = set(mirrors)
        self._mirrors_used = set()
        self.index_url = index_url
        self._mirrors_max_tries = mirrors_max_tries
        self._mirrors_tries = 0
        self._timeout = timeout

        # create a regexp to match all given hosts
        self._allowed_hosts = re.compile('|'.join(map(translate, hosts))).match

        # we keep an index of pages we have processed, in order to avoid
        # scanning them multiple times (eg. if there are multiple pages
        # pointing on one)
        self._processed_urls = []
        self._projects = {}
@with_mirror_support()
def search_projects(self, name=None, **kwargs):
"""Search the index for projects containing the given name.
Return a list of names.
"""
with self._open_url(self.index_url) as index:
if '*' in name:
name.replace('*', '.*')
else:
name = "%s%s%s" % ('*.?', name, '*.?')
name = name.replace('*', '[^<]*') # avoid matching end tag
projectname = re.compile('<a[^>]*>(%s)</a>' % name, re.I)
matching_projects = []
index_content = index.read()
# FIXME should use bytes I/O and regexes instead of decoding
index_content = index_content.decode()
for match in projectname.finditer(index_content):
project_name = match.group(1)
matching_projects.append(self._get_project(project_name))
return matching_projects
    def get_releases(self, requirements, prefer_final=None,
                     force_update=False):
        """Search for releases and return a ReleasesList object containing
        the results.
        """
        predicate = get_version_predicate(requirements)
        if predicate.name.lower() in self._projects and not force_update:
            # project already crawled: serve the cached list
            return self._projects.get(predicate.name.lower())
        prefer_final = self._get_prefer_final(prefer_final)
        logger.info('reading info on PyPI about %s', predicate.name)
        self._process_index_page(predicate.name)

        if predicate.name.lower() not in self._projects:
            raise ProjectNotFound()

        releases = self._projects.get(predicate.name.lower())
        releases.sort_releases(prefer_final=prefer_final)
        return releases
def get_release(self, requirements, prefer_final=None):
"""Return only one release that fulfill the given requirements"""
predicate = get_version_predicate(requirements)
release = self.get_releases(predicate, prefer_final)\
.get_last(predicate)
if not release:
raise ReleaseNotFound("No release matches the given criterias")
return release
def get_distributions(self, project_name, version):
"""Return the distributions found on the index for the specific given
release"""
# as the default behavior of get_release is to return a release
# containing the distributions, just alias it.
return self.get_release("%s (%s)" % (project_name, version))
    def get_metadata(self, project_name, version):
        """Return the metadatas from the simple index.

        Currently, download one archive, extract it and use the PKG-INFO
        file.
        """
        release = self.get_distributions(project_name, version)
        if not release.metadata:
            # no metadata known yet: unpack an archive and read PKG-INFO
            location = release.get_distribution().unpack()
            pkg_info = os.path.join(location, 'PKG-INFO')
            release.metadata = Metadata(pkg_info)
        return release
def _switch_to_next_mirror(self):
"""Switch to the next mirror (eg. point self.index_url to the next
mirror url.
Raise a KeyError if all mirrors have been tried.
"""
self._mirrors_used.add(self.index_url)
index_url = self._mirrors.pop()
if not ("http://" or "https://" or "file://") in index_url:
index_url = "http://%s" % index_url
if not index_url.endswith("/simple"):
index_url = "%s/simple/" % index_url
self.index_url = index_url
def _is_browsable(self, url):
"""Tell if the given URL can be browsed or not.
It uses the follow_externals and the hosts list to tell if the given
url is browsable or not.
"""
# if _index_url is contained in the given URL, we are browsing the
# index, and it's always "browsable".
# local files are always considered browable resources
if self.index_url in url or urllib.parse.urlparse(url)[0] == "file":
return True
elif self.follow_externals:
if self._allowed_hosts(urllib.parse.urlparse(url)[1]): # 1 is netloc
return True
else:
return False
return False
def _is_distribution(self, link):
"""Tell if the given URL matches to a distribution name or not.
"""
#XXX find a better way to check that links are distributions
# Using a regexp ?
for ext in EXTENSIONS:
if ext in link:
return True
return False
def _register_release(self, release=None, release_info={}):
"""Register a new release.
Both a release or a dict of release_info can be provided, the prefered
way (eg. the quicker) is the dict one.
Return the list of existing releases for the given project.
"""
# Check if the project already has a list of releases (refering to
# the project name). If not, create a new release list.
# Then, add the release to the list.
if release:
name = release.name
else:
name = release_info['name']
if not name.lower() in self._projects:
self._projects[name.lower()] = ReleasesList(name, index=self._index)
if release:
self._projects[name.lower()].add_release(release=release)
else:
name = release_info.pop('name')
version = release_info.pop('version')
dist_type = release_info.pop('dist_type')
self._projects[name.lower()].add_release(version, dist_type,
**release_info)
return self._projects[name.lower()]
    def _process_url(self, url, project_name=None, follow_links=True):
        """Process an url and search for distributions packages.

        For each URL found, if it's a download, creates a PyPIdistribution
        object. If it's a homepage and we can follow links, process it too.

        :param url: the url to process
        :param project_name: the project name we are searching for.
        :param follow_links: Do not want to follow links more than from one
                             level. This parameter tells if we want to
                             follow the links we find (eg. run recursively
                             this method on it)
        """
        with self._open_url(url) as f:
            base_url = f.url
            if url not in self._processed_urls:
                self._processed_urls.append(url)
                link_matcher = self._get_link_matcher(url)
                for link, is_download in link_matcher(f.read().decode(),
                                                      base_url):
                    if link not in self._processed_urls:
                        if self._is_distribution(link) or is_download:
                            self._processed_urls.append(link)
                            # it's a distribution, so create a dist object
                            try:
                                infos = get_infos_from_url(link, project_name,
                                            is_external=not self.index_url in url)
                            except CantParseArchiveName as e:
                                # unparseable archive names are only logged,
                                # not fatal for the crawl
                                logger.warning(
                                    "version has not been parsed: %s", e)
                            else:
                                self._register_release(release_info=infos)
                        else:
                            # one recursion level only: recursive calls pass
                            # follow_links=False
                            if self._is_browsable(link) and follow_links:
                                self._process_url(link, project_name,
                                    follow_links=False)
def _get_link_matcher(self, url):
"""Returns the right link matcher function of the given url
"""
if self.index_url in url:
return self._simple_link_matcher
else:
return self._default_link_matcher
def _get_full_url(self, url, base_url):
return urllib.parse.urljoin(base_url, self._htmldecode(url))
    def _simple_link_matcher(self, content, base_url):
        """Yield all links with a rel="download" or rel="homepage".

        This matches the simple index requirements for matching links.
        If follow_externals is set to False, don't yield the external
        urls.

        :param content: the content of the page we want to parse
        :param base_url: the url of this page.
        """
        for match in HREF.finditer(content):
            url = self._get_full_url(match.group(1), base_url)
            if MD5_HASH.match(url):
                # an URL carrying an md5 fragment is a direct download link
                yield (url, True)

        for match in REL.finditer(content):
            # search for rel links.
            tag, rel = match.groups()
            rels = [s.strip() for s in rel.lower().split(',')]
            if 'homepage' in rels or 'download' in rels:
                for match in HREF.finditer(tag):
                    url = self._get_full_url(match.group(1), base_url)
                    if 'download' in rels or self._is_browsable(url):
                        # yield a list of (url, is_download)
                        yield (url, 'download' in rels)
def _default_link_matcher(self, content, base_url):
    """Yield (url, False) for every browsable link found on the page."""
    resolved = (self._get_full_url(m.group(1), base_url)
                for m in HREF.finditer(content))
    for candidate in resolved:
        if self._is_browsable(candidate):
            yield (candidate, False)
@with_mirror_support()
def _process_index_page(self, name):
    """Find and process a PyPI page for the given project name.

    :param name: the name of the project to find the page
    """
    # Simple-index project pages live at <index_url><name>/.
    project_url = "%s%s/" % (self.index_url, name)
    self._process_url(project_url, name)
@socket_timeout()
def _open_url(self, url):
    """Open a urllib2 request, handling HTTP authentication, and local
    files support.

    Return a file-like response object for *url*.  HTTP error responses
    are returned as-is (they are file-like too); network-level failures
    raise DownloadError, and malformed urls raise PackagingPyPIError.
    """
    import base64

    scheme, netloc, path, params, query, frag = urllib.parse.urlparse(url)

    # authentication stuff
    if scheme in ('http', 'https'):
        auth, host = urllib.parse.splituser(netloc)
    else:
        auth = None

    # add index.html automatically for filesystem paths
    if scheme == 'file':
        if url.endswith('/'):
            url += "index.html"

    # add authorization headers if auth is provided
    if auth:
        # BUGFIX: the original used ``str.encode('base64')``, a Python 2
        # codec idiom that raises LookupError on Python 3.  Build the
        # Basic credentials with the base64 module instead.
        credentials = urllib.parse.unquote(auth)
        auth = "Basic " + \
            base64.standard_b64encode(credentials.encode()).decode().strip()
        # strip the credentials from the netloc before issuing the request
        new_url = urllib.parse.urlunparse((
            scheme, host, path, params, query, frag))
        request = urllib.request.Request(new_url)
        request.add_header("Authorization", auth)
    else:
        request = urllib.request.Request(url)
    request.add_header('User-Agent', USER_AGENT)
    try:
        fp = urllib.request.urlopen(request)
    except (ValueError, http.client.InvalidURL) as v:
        msg = ' '.join([str(arg) for arg in v.args])
        raise PackagingPyPIError('%s %s' % (url, msg))
    except urllib.error.HTTPError as v:
        # an HTTP error response is still a readable page; let the
        # caller inspect it
        return v
    except urllib.error.URLError as v:
        raise DownloadError("Download error for %s: %s" % (url, v.reason))
    except http.client.BadStatusLine as v:
        raise DownloadError('%s returned a bad status line. '
                            'The server might be down, %s' % (url, v.line))
    except http.client.HTTPException as v:
        raise DownloadError("Download error for %s: %s" % (url, v))
    except socket.timeout:
        raise DownloadError("The server timeouted")

    if auth:
        # Put authentication info back into request URL if same host,
        # so that links found on the page will work
        s2, h2, path2, param2, query2, frag2 = \
            urllib.parse.urlparse(fp.url)
        if s2 == scheme and h2 == host:
            fp.url = urllib.parse.urlunparse(
                (s2, netloc, path2, param2, query2, frag2))
    return fp
def _decode_entity(self, match):
what = match.group(1)
if what.startswith('#x'):
what = int(what[2:], 16)
elif what.startswith('#'):
what = int(what[1:])
else:
from html.entities import name2codepoint
what = name2codepoint.get(what, match.group(0))
return chr(what)
def _htmldecode(self, text):
    """Decode HTML entities in the given text.

    Each entity match is replaced via _decode_entity.  ENTITY_SUB is a
    module-level callable taking (replacement_callable, text) —
    presumably the bound .sub of a compiled entity regex; confirm at
    its definition.
    """
    return ENTITY_SUB(self._decode_entity, text)

View file

@ -0,0 +1,99 @@
"""Convenient client for all PyPI APIs.
This module provides a ClientWrapper class which will use the "simple"
or XML-RPC API to request information or files from an index.
"""
from packaging.pypi import simple, xmlrpc
# For each public wrapper method, the name of the index implementation
# ("simple" or "xmlrpc") best suited to serve it.
_WRAPPER_MAPPINGS = {'get_release': 'simple',
                     'get_releases': 'simple',
                     'search_projects': 'simple',
                     'get_metadata': 'xmlrpc',
                     'get_distributions': 'simple'}

# Index client classes, keyed by the implementation names used above.
_WRAPPER_INDEXES = {'xmlrpc': xmlrpc.Client,
                    'simple': simple.Crawler}
def switch_index_if_fails(func, wrapper):
    """Decorator that falls back on the other indexes of *wrapper*
    (for instance from xmlrpc to simple) when the bound method *func*
    raises an exception.

    The same method name is looked up on every other index that
    provides it; the first call that does not raise wins.  If every
    candidate raises, the last exception is re-raised.
    """
    def decorated(*args, **kwargs):
        # Candidates: the original bound method first, then the same
        # method on every other index that provides it.
        candidates = [func]
        for index in wrapper._indexes.values():
            if index != func.__self__ and hasattr(index, func.__name__):
                candidates.append(getattr(index, func.__name__))

        last_exc = None
        for candidate in candidates:
            try:
                return candidate(*args, **kwargs)
            except Exception as exc:
                last_exc = exc
        raise last_exc
    return decorated
class ClientWrapper:
    """Wrapper around simple and xmlrpc clients.

    Choose the best implementation to use depending the needs, using the given
    mappings.
    If one of the indexes returns an error, tries to use others indexes.

    :param default_index: tell which index to rely on by default.
    :param index_classes: a dict of name:class to use as indexes.
    :param indexes: a dict of name:index already instantiated
    :param mappings: the mappings to use for this wrapper
    """

    def __init__(self, default_index='simple', index_classes=_WRAPPER_INDEXES,
                 indexes=None, mappings=_WRAPPER_MAPPINGS):
        self._projects = {}
        self._mappings = mappings
        # BUGFIX: the original default was ``indexes={}`` — a mutable
        # default argument, shared between instances and mutated by the
        # setdefault call below.  Create a fresh dict per instance.
        self._indexes = {} if indexes is None else indexes
        self._default_index = default_index

        # instantiate the classes and set their _projects attribute to the
        # one of the wrapper, so all indexes share the same release cache.
        for name, cls in index_classes.items():
            obj = self._indexes.setdefault(name, cls())
            obj._projects = self._projects
            obj._index = self

    def __getattr__(self, method_name):
        """When asking for methods of the wrapper, return the implementation
        of the wrapped classes, depending the mapping.

        Decorate the methods to switch of implementation if an error occurs.
        """
        real_method = None
        # BUGFIX: use the per-instance mappings stored in __init__ rather
        # than the module-level _WRAPPER_MAPPINGS, so a custom "mappings"
        # argument is actually honored.
        if method_name in self._mappings:
            obj = self._indexes[self._mappings[method_name]]
            real_method = getattr(obj, method_name)
        else:
            # the method is not defined in the mappings, so we try first to
            # get it via the default index, and rely on others if needed.
            try:
                real_method = getattr(self._indexes[self._default_index],
                                      method_name)
            except AttributeError:
                other_indexes = [i for i in self._indexes
                                 if i != self._default_index]
                for index in other_indexes:
                    real_method = getattr(self._indexes[index], method_name,
                                          None)
                    if real_method:
                        break
        if real_method:
            return switch_index_if_fails(real_method, self)
        else:
            raise AttributeError("No index has attribute '%s'" % method_name)

View file

@ -0,0 +1,200 @@
"""Spider using the XML-RPC PyPI API.
This module contains the class Client, a spider that can be used to find
and retrieve distributions from a project index (like the Python Package
Index), using its XML-RPC API (see documentation of the reference
implementation at http://wiki.python.org/moin/PyPiXmlRpc).
"""
import xmlrpc.client
from packaging import logger
from packaging.errors import IrrationalVersionError
from packaging.version import get_version_predicate
from packaging.pypi.base import BaseClient
from packaging.pypi.errors import (ProjectNotFound, InvalidSearchField,
ReleaseNotFound)
from packaging.pypi.dist import ReleaseInfo
__all__ = ['Client', 'DEFAULT_XMLRPC_INDEX_URL']

# Default endpoint of the PyPI XML-RPC API.
DEFAULT_XMLRPC_INDEX_URL = 'http://python.org/pypi'

# Metadata field names the XML-RPC "search" method accepts as criteria.
_SEARCH_FIELDS = ['name', 'version', 'author', 'author_email', 'maintainer',
                  'maintainer_email', 'home_page', 'license', 'summary',
                  'description', 'keywords', 'platform', 'download_url']
class Client(BaseClient):
    """Client to query indexes using XML-RPC method calls.

    If no server_url is specified, use the default PyPI XML-RPC URL,
    defined in the DEFAULT_XMLRPC_INDEX_URL constant::

        >>> client = Client()
        >>> client.server_url == DEFAULT_XMLRPC_INDEX_URL
        True
        >>> client = Client("http://someurl/")
        >>> client.server_url
        'http://someurl/'
    """

    def __init__(self, server_url=DEFAULT_XMLRPC_INDEX_URL, prefer_final=False,
                 prefer_source=True):
        super().__init__(prefer_final, prefer_source)
        self.server_url = server_url
        # Cache of ReleasesList objects, keyed by lowercased project name
        # (filled by BaseClient._get_project).
        self._projects = {}

    def get_release(self, requirements, prefer_final=False):
        """Return a release with all complete metadata and distribution
        related informations.
        """
        prefer_final = self._get_prefer_final(prefer_final)
        predicate = get_version_predicate(requirements)
        releases = self.get_releases(predicate.name)
        release = releases.get_last(predicate, prefer_final)
        # Populate metadata and distribution urls for the chosen release
        # so the caller gets a fully usable object.
        self.get_metadata(release.name, str(release.version))
        self.get_distributions(release.name, str(release.version))
        return release

    def get_releases(self, requirements, prefer_final=None, show_hidden=True,
                     force_update=False):
        """Return the list of existing releases for a specific project.

        Cache the results from one call to another.

        If show_hidden is True, return the hidden releases too.
        If force_update is True, reprocess the index to update the
        informations (eg. make a new XML-RPC call).
        ::

            >>> client = Client()
            >>> client.get_releases('Foo')
            ['1.1', '1.2', '1.3']

        If no such project exists, raise a ProjectNotFound exception::

            >>> client.get_releases('UnexistingProject')
            ProjectNotFound: UnexistingProject
        """
        def get_versions(project_name, show_hidden):
            return self.proxy.package_releases(project_name, show_hidden)

        predicate = get_version_predicate(requirements)
        prefer_final = self._get_prefer_final(prefer_final)
        project_name = predicate.name
        if not force_update and (project_name.lower() in self._projects):
            project = self._projects[project_name.lower()]
            if not project.contains_hidden and show_hidden:
                # if hidden releases are requested, and we have an existing
                # list of releases that does not contain hidden ones
                all_versions = get_versions(project_name, show_hidden)
                existing_versions = project.get_versions()
                hidden_versions = set(all_versions) - set(existing_versions)
                for version in hidden_versions:
                    project.add_release(release=ReleaseInfo(project_name,
                        version, index=self._index))
        else:
            versions = get_versions(project_name, show_hidden)
            if not versions:
                raise ProjectNotFound(project_name)
            project = self._get_project(project_name)
            project.add_releases([ReleaseInfo(project_name, version,
                                              index=self._index)
                                  for version in versions])
        project = project.filter(predicate)
        if len(project) == 0:
            raise ReleaseNotFound("%s" % predicate)
        project.sort_releases(prefer_final)
        return project

    def get_distributions(self, project_name, version):
        """Grab informations about distributions from XML-RPC.

        Return a ReleaseInfo object, with distribution-related informations
        filled in.
        """
        url_infos = self.proxy.release_urls(project_name, version)
        project = self._get_project(project_name)
        if version not in project.get_versions():
            project.add_release(release=ReleaseInfo(project_name, version,
                                                    index=self._index))
        release = project.get_release(version)
        for info in url_infos:
            packagetype = info['packagetype']
            dist_infos = {'url': info['url'],
                          'hashval': info['md5_digest'],
                          'hashname': 'md5',
                          'is_external': False,
                          'python_version': info['python_version']}
            release.add_distribution(packagetype, **dist_infos)
        return release

    def get_metadata(self, project_name, version):
        """Retrieve project metadata.

        Return a ReleaseInfo object, with metadata informations filled in.
        """
        # to be case-insensitive, get the informations from the XML-RPC API.
        # BUGFIX: both sides of the comparison must be lowercased — the
        # original compared against the raw project_name, so any caller
        # casing with uppercase letters never matched.
        projects = [d['name'] for d in
                    self.proxy.search({'name': project_name})
                    if d['name'].lower() == project_name.lower()]
        if len(projects) > 0:
            project_name = projects[0]

        metadata = self.proxy.release_data(project_name, version)
        project = self._get_project(project_name)
        if version not in project.get_versions():
            project.add_release(release=ReleaseInfo(project_name, version,
                                                    index=self._index))
        release = project.get_release(version)
        release.set_metadata(metadata)
        return release

    def search_projects(self, name=None, operator="or", **kwargs):
        """Find using the keys provided in kwargs.

        You can set operator to "and" or "or".
        """
        for key in kwargs:
            if key not in _SEARCH_FIELDS:
                raise InvalidSearchField(key)
        if name:
            kwargs["name"] = name
        projects = self.proxy.search(kwargs, operator)
        for p in projects:
            project = self._get_project(p['name'])
            try:
                project.add_release(release=ReleaseInfo(p['name'],
                    p['version'], metadata={'summary': p['summary']},
                    index=self._index))
            except IrrationalVersionError as e:
                logger.warning("Irrational version error found: %s", e)
        return [self._projects[p['name'].lower()] for p in projects]

    def get_all_projects(self):
        """Return the list of all projects registered in the package index"""
        projects = self.proxy.list_packages()
        for name in projects:
            self.get_releases(name, show_hidden=True)
        return [self._projects[name.lower()] for name in set(projects)]

    @property
    def proxy(self):
        """The XML-RPC server proxy, created lazily on first access::

            >>> client = Client()
            >>> client.proxy
            <ServerProxy for python.org/pypi>
        """
        if not hasattr(self, '_server_proxy'):
            self._server_proxy = xmlrpc.client.ServerProxy(self.server_url)
        return self._server_proxy