Check compatibility for cached unzipped wheels (#501)

**Motivation** Previously, we would install any wheel with the correct
package name and version from the cache, even if it doesn't match the
current python interpreter.

**Summary** The unzipped wheel cache for registries now uses the entire
wheel filename over the name-version (`editables-0.5-py3-none-any.whl`
over `editables-0.5`).

Built wheels are not stored in the `wheels-v0` unzipped wheels cache
anymore. For each source distribution, there can be multiple built
wheels (with different compatibility tags), so i argue that we need a
different cache structure for them (follow up PR).

For `all-kinds.in` with

```bash
rm -rf cache-all-kinds
virtualenv --clear -p 3.12 .venv
cargo run --bin puffin -- pip-sync --cache-dir cache-all-kinds target/all-kinds.txt
```

we get:

**Before**
```
cache-all-kinds/wheels-v0/
├── registry
│   ├── annotated_types-0.6.0
│   ├── asgiref-3.7.2
│   ├── blinker-1.7.0
│   ├── certifi-2023.11.17
│   ├── cffi-1.16.0
│   ├── [...]
│   ├── tzdata-2023.3
│   ├── urllib3-2.1.0
│   └── wheel-0.42.0
└── url
    ├── 4b8be67c801a7ecb
    │   ├── flask
    │   └── flask-3.0.0.dist-info
    ├── 6781bd6440ae72c2
    │   ├── werkzeug
    │   └── werkzeug-3.0.1.dist-info
    └── a67db8ed076e3814
        ├── pydantic_extra_types
        └── pydantic_extra_types-2.1.0.dist-info

48 directories, 0 files
```

**After**

```
cache-all-kinds/wheels-v0/
├── registry
│   ├── annotated_types-0.6.0-py3-none-any.whl
│   ├── asgiref-3.7.2-py3-none-any.whl
│   ├── blinker-1.7.0-py3-none-any.whl
│   ├── certifi-2023.11.17-py3-none-any.whl
│   ├── cffi-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
│   ├── [...]
│   ├── tzdata-2023.3-py2.py3-none-any.whl
│   ├── urllib3-2.1.0-py3-none-any.whl
│   └── wheel-0.42.0-py3-none-any.whl
└── url
    └── 4b8be67c801a7ecb
        └── flask-3.0.0-py3-none-any.whl

39 directories, 0 files
```

**Outlook** Part of #477 "Fix wheel caching". Further tasks:
* Replace the `CacheShard` with `WheelMetadataCache` which handles urls
properly.
* Delete unzipped wheels when their remote wheel changed
* Store built wheels next to the `metadata.json` in the source dist
directory; delete built wheels when their source dist changed (different
cache bucket, but it's the same problem of fixing wheel caching) I'll
make stacked PRs for those
This commit is contained in:
konsti 2023-11-28 01:03:58 +01:00 committed by GitHub
parent b455b08537
commit 1142a14f4d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 106 additions and 77 deletions

View file

@ -1,15 +1,15 @@
use std::path::{Path, PathBuf};
use std::str::FromStr;
use anyhow::{anyhow, Result};
use anyhow::Result;
use url::Url;
use crate::traits::Metadata;
use crate::{BuiltDist, Dist, SourceDist, VersionOrUrl};
use pep440_rs::Version;
use distribution_filename::WheelFilename;
use puffin_normalize::PackageName;
use crate::direct_url::DirectUrl;
use crate::traits::Metadata;
use crate::{BuiltDist, Dist, SourceDist, VersionOrUrl};
/// A built distribution (wheel) that exists in the local cache.
#[derive(Debug, Clone)]
@ -22,31 +22,30 @@ pub enum CachedDist {
#[derive(Debug, Clone)]
pub struct CachedRegistryDist {
pub name: PackageName,
pub version: Version,
pub filename: WheelFilename,
pub path: PathBuf,
}
#[derive(Debug, Clone)]
pub struct CachedDirectUrlDist {
pub name: PackageName,
pub filename: WheelFilename,
pub url: Url,
pub path: PathBuf,
}
impl Metadata for CachedRegistryDist {
fn name(&self) -> &PackageName {
&self.name
&self.filename.name
}
fn version_or_url(&self) -> VersionOrUrl {
VersionOrUrl::Version(&self.version)
VersionOrUrl::Version(&self.filename.version)
}
}
impl Metadata for CachedDirectUrlDist {
fn name(&self) -> &PackageName {
&self.name
&self.filename.name
}
fn version_or_url(&self) -> VersionOrUrl {
@ -72,40 +71,36 @@ impl Metadata for CachedDist {
impl CachedDist {
/// Initialize a [`CachedDist`] from a [`Dist`].
pub fn from_remote(remote: Dist, path: PathBuf) -> Self {
pub fn from_remote(remote: Dist, filename: WheelFilename, path: PathBuf) -> Self {
match remote {
Dist::Built(BuiltDist::Registry(dist)) => Self::Registry(CachedRegistryDist {
name: dist.name,
version: dist.version,
path,
}),
Dist::Built(BuiltDist::Registry(_dist)) => {
Self::Registry(CachedRegistryDist { filename, path })
}
Dist::Built(BuiltDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.filename.name,
filename,
url: dist.url,
path,
}),
Dist::Built(BuiltDist::Path(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.filename.name,
filename,
url: dist.url,
path,
}),
Dist::Source(SourceDist::Registry(dist)) => Self::Registry(CachedRegistryDist {
name: dist.name,
version: dist.version,
path,
}),
Dist::Source(SourceDist::Registry(_dist)) => {
Self::Registry(CachedRegistryDist { filename, path })
}
Dist::Source(SourceDist::DirectUrl(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.name,
filename,
url: dist.url,
path,
}),
Dist::Source(SourceDist::Git(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.name,
filename,
url: dist.url,
path,
}),
Dist::Source(SourceDist::Path(dist)) => Self::Url(CachedDirectUrlDist {
name: dist.name,
filename,
url: dist.url,
path,
}),
@ -130,8 +125,12 @@ impl CachedDist {
}
impl CachedDirectUrlDist {
pub fn from_url(name: PackageName, url: Url, path: PathBuf) -> Self {
Self { name, url, path }
pub fn from_url(filename: WheelFilename, url: Url, path: PathBuf) -> Self {
Self {
filename,
url,
path,
}
}
}
@ -144,18 +143,12 @@ impl CachedRegistryDist {
let Some(file_name) = file_name.to_str() else {
return Ok(None);
};
let Some((name, version)) = file_name.rsplit_once('-') else {
let Ok(filename) = WheelFilename::from_str(file_name) else {
return Ok(None);
};
let name = PackageName::from_str(name)?;
let version = Version::from_str(version).map_err(|err| anyhow!(err))?;
let path = path.to_path_buf();
Ok(Some(Self {
name,
version,
path,
}))
Ok(Some(Self { filename, path }))
}
}

View file

@ -76,6 +76,8 @@ pub(crate) async fn sync_requirements(
"Using Python interpreter: {}",
venv.python_executable().display()
);
// Determine the current environment markers.
let tags = Tags::from_interpreter(venv.interpreter())?;
// Partition into those that should be linked from the cache (`local`), those that need to be
// downloaded (`remote`), and those that should be removed (`extraneous`).
@ -83,7 +85,8 @@ pub(crate) async fn sync_requirements(
local,
remote,
extraneous,
} = InstallPlan::try_from_requirements(requirements, cache, &venv)?;
} = InstallPlan::try_from_requirements(requirements, cache, &venv, &tags)
.context("Failed to determine installation plan")?;
// Nothing to do.
if remote.is_empty() && local.is_empty() && extraneous.is_empty() {
@ -102,9 +105,6 @@ pub(crate) async fn sync_requirements(
return Ok(ExitStatus::Success);
}
// Determine the current environment markers.
let tags = Tags::from_interpreter(venv.interpreter())?;
// Instantiate a client.
let client = {
let mut builder = RegistryClientBuilder::new(cache);

View file

@ -115,13 +115,13 @@ impl BuildContext for BuildDispatch {
venv.root().display(),
);
let tags = Tags::from_interpreter(&self.interpreter)?;
let InstallPlan {
local,
remote,
extraneous,
} = InstallPlan::try_from_requirements(requirements, &self.cache, venv)?;
let tags = Tags::from_interpreter(&self.interpreter)?;
} = InstallPlan::try_from_requirements(requirements, &self.cache, venv, &tags)?;
// Resolve the dependencies.
let remote = if remote.is_empty() {

View file

@ -2,7 +2,9 @@ use std::path::{Path, PathBuf};
use fs_err as fs;
use distribution_types::{BuiltDist, Dist, Metadata, SourceDist};
use distribution_filename::WheelFilename;
use distribution_types::{BuiltDist, Dist, Metadata, SourceDist, VersionOrUrl};
use puffin_cache::{digest, CanonicalUrl};
static WHEEL_CACHE: &str = "wheels-v0";
@ -25,10 +27,13 @@ impl WheelCache {
}
/// Return the path at which a given [`Dist`] would be stored.
pub(crate) fn entry(&self, dist: &Dist) -> PathBuf {
self.root
.join(CacheShard::from(dist).segment())
.join(dist.package_id())
pub(crate) fn entry(&self, dist: &Dist, filename: &WheelFilename) -> PathBuf {
let mut path = self.root.join(CacheShard::from(dist).segment());
// TODO(konstin): Use `WheelMetadataCache` instead
if let VersionOrUrl::Url(url) = dist.version_or_url() {
path.push(digest(&CanonicalUrl::new(url)));
}
path.join(filename.to_string())
}
/// Returns a handle to the wheel cache directory.

View file

@ -1,11 +1,14 @@
use std::path::Path;
use std::str::FromStr;
use anyhow::{Context, Result};
use anyhow::{bail, Context, Result};
use distribution_filename::WheelFilename;
use tracing::debug;
use distribution_types::direct_url::DirectUrl;
use distribution_types::{CachedDist, InstalledDist};
use distribution_types::{CachedDist, InstalledDist, RemoteSource};
use pep508_rs::{Requirement, VersionOrUrl};
use platform_tags::Tags;
use puffin_interpreter::Virtualenv;
use crate::url_index::UrlIndex;
@ -33,13 +36,14 @@ impl InstallPlan {
requirements: &[Requirement],
cache: &Path,
venv: &Virtualenv,
tags: &Tags,
) -> Result<Self> {
// Index all the already-installed packages in site-packages.
let mut site_packages =
SitePackages::try_from_executable(venv).context("Failed to list installed packages")?;
// Index all the already-downloaded wheels in the cache.
let registry_index = RegistryIndex::try_from_directory(cache);
let registry_index = RegistryIndex::try_from_directory(cache, tags);
let url_index = UrlIndex::try_from_directory(cache);
let mut local = vec![];
@ -86,7 +90,7 @@ impl InstallPlan {
None | Some(VersionOrUrl::VersionSpecifier(_)) => {
if let Some(distribution) = registry_index
.get(&requirement.name)
.filter(|dist| requirement.is_satisfied_by(&dist.version))
.filter(|dist| requirement.is_satisfied_by(&dist.filename.version))
{
debug!("Requirement already cached: {distribution}");
local.push(CachedDist::Registry(distribution.clone()));
@ -94,10 +98,24 @@ impl InstallPlan {
}
}
Some(VersionOrUrl::Url(url)) => {
if let Some(distribution) = url_index.get(&requirement.name, url) {
debug!("Requirement already cached: {distribution}");
local.push(CachedDist::Url(distribution.clone()));
continue;
// Only consider wheel urls
if let Some(filename) = url
.filename()
.ok()
.and_then(|filename| WheelFilename::from_str(filename).ok())
{
if requirement.name != filename.name {
bail!(
"Given name `{}` does not match url name `{}`",
requirement.name,
url
);
}
if let Some(distribution) = url_index.get(filename, url) {
debug!("Requirement already cached: {distribution}");
local.push(CachedDist::Url(distribution.clone()));
continue;
}
}
}
}

View file

@ -5,6 +5,7 @@ use fs_err as fs;
use tracing::warn;
use distribution_types::{CachedRegistryDist, Metadata};
use platform_tags::Tags;
use puffin_normalize::PackageName;
use crate::cache::{CacheShard, WheelCache};
@ -15,7 +16,7 @@ pub struct RegistryIndex(HashMap<PackageName, CachedRegistryDist>);
impl RegistryIndex {
/// Build an index of cached distributions from a directory.
pub fn try_from_directory(path: &Path) -> Self {
pub fn try_from_directory(path: &Path, tags: &Tags) -> Self {
let mut index = HashMap::new();
let cache = WheelCache::new(path);
@ -36,21 +37,32 @@ impl RegistryIndex {
continue;
}
};
if file_type.is_dir() {
match CachedRegistryDist::try_from_path(&path) {
Ok(None) => {}
Ok(Some(dist_info)) => {
if !file_type.is_dir() {
continue;
}
match CachedRegistryDist::try_from_path(&path) {
Ok(None) => {}
Ok(Some(dist_info)) => {
// Pick the wheel with the highest priority
let compatibility = dist_info.filename.compatibility(tags);
if let Some(existing) = index.get_mut(dist_info.name()) {
// Override if we have better compatibility
if compatibility > existing.filename.compatibility(tags) {
*existing = dist_info;
}
} else if compatibility.is_some() {
index.insert(dist_info.name().clone(), dist_info);
}
Err(err) => {
warn!("Invalid cache entry at {}, removing. {err}", path.display());
let result = fs::remove_dir_all(&path);
if let Err(err) = result {
warn!(
"Failed to remove invalid cache entry at {}: {err}",
path.display()
);
}
}
Err(err) => {
warn!("Invalid cache entry at {}, removing. {err}", path.display());
let result = fs::remove_dir_all(&path);
if let Err(err) = result {
warn!(
"Failed to remove invalid cache entry at {}: {err}",
path.display()
);
}
}
}

View file

@ -44,6 +44,7 @@ impl Unzipper {
let mut wheels = Vec::with_capacity(downloads.len());
for download in downloads {
let remote = download.remote().clone();
let filename = download.filename().clone();
debug!("Unpacking wheel: {remote}");
@ -55,7 +56,7 @@ impl Unzipper {
.await??;
// Write the unzipped wheel to the target directory.
let target = wheel_cache.entry(&remote);
let target = wheel_cache.entry(&remote, &filename);
if let Some(parent) = target.parent() {
fs_err::create_dir_all(parent)?;
}
@ -65,7 +66,7 @@ impl Unzipper {
if let Err(err) = result {
// If the renaming failed because another instance was faster, that's fine
// (`DirectoryNotEmpty` is not stable so we can't match on it)
if !wheel_cache.entry(&remote).is_dir() {
if !wheel_cache.entry(&remote, &filename).is_dir() {
return Err(err.into());
}
}
@ -74,8 +75,8 @@ impl Unzipper {
reporter.on_unzip_progress(&remote);
}
let path = wheel_cache.entry(&remote);
wheels.push(CachedDist::from_remote(remote, path));
let path = wheel_cache.entry(&remote, &filename);
wheels.push(CachedDist::from_remote(remote, filename, path));
}
if let Some(reporter) = self.reporter.as_ref() {

View file

@ -4,8 +4,8 @@ use fxhash::FxHashMap;
use tracing::warn;
use url::Url;
use distribution_filename::WheelFilename;
use distribution_types::{CachedDirectUrlDist, Identifier};
use puffin_normalize::PackageName;
use crate::cache::{CacheShard, WheelCache};
@ -49,13 +49,13 @@ impl UrlIndex {
}
/// Returns a distribution from the index, if it exists.
pub(crate) fn get(&self, name: &PackageName, url: &Url) -> Option<CachedDirectUrlDist> {
pub(crate) fn get(&self, filename: WheelFilename, url: &Url) -> Option<CachedDirectUrlDist> {
// TODO(charlie): This takes advantage of the fact that for URL dependencies, the package ID
// and distribution ID are identical. We should either change the cache layout to use
// distribution IDs, or implement package ID for URL.
let path = self.0.get(&url.distribution_id())?;
Some(CachedDirectUrlDist::from_url(
name.clone(),
filename,
url.clone(),
path.clone(),
))