Respect direct URLs in puffin installer (#345)

We now write the `direct_url.json` when installing, and _skip_
installing if we find a package installed via the direct URL that the
user is requesting.

A lot of TODOs, especially around cleaning up the `Source` abstraction
and its relationship to `DirectUrl`. I'm gonna keep working on these
today, but this works and makes the requirements clear.

Closes #332.
This commit is contained in:
Charlie Marsh 2023-11-07 06:11:27 -08:00 committed by GitHub
parent 55ad1c89be
commit 2c32bc5a86
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 464 additions and 44 deletions

2
Cargo.lock generated
View file

@ -2326,11 +2326,13 @@ name = "puffin-distribution"
version = "0.1.0"
dependencies = [
"anyhow",
"fs-err",
"pep440_rs 0.3.12",
"puffin-cache",
"puffin-git",
"puffin-normalize",
"pypi-types",
"serde_json",
"url",
]

View file

@ -573,7 +573,53 @@ fn install_url() -> Result<()> {
/// Install a package into a virtual environment from a Git repository.
#[test]
#[cfg(feature = "git")]
fn install_git() -> Result<()> {
fn install_git_commit() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
Command::new(get_cargo_bin(BIN_NAME))
.arg("venv")
.arg(venv.as_os_str())
.arg("--cache-dir")
.arg(cache_dir.path())
.current_dir(&temp_dir)
.assert()
.success();
venv.assert(predicates::path::is_dir());
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug @ git+https://github.com/pallets/werkzeug.git@af160e0b6b7ddd81c22f1652c728ff5ac72d5c74")?;
insta::with_settings!({
filters => vec![
(r"(\d|\.)+(ms|s)", "[TIME]"),
]
}, {
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir));
});
Command::new(venv.join("bin").join("python"))
.arg("-c")
.arg("import werkzeug")
.current_dir(&temp_dir)
.assert()
.success();
Ok(())
}
/// Install a package into a virtual environment from a Git repository.
#[test]
#[cfg(feature = "git")]
fn install_git_tag() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
@ -706,3 +752,179 @@ fn install_sdist() -> Result<()> {
Ok(())
}
/// Attempt to re-install a package into a virtual environment from a URL. The second install
/// should be a no-op.
#[test]
fn install_url_then_install_url() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
Command::new(get_cargo_bin(BIN_NAME))
.arg("venv")
.arg(venv.as_os_str())
.arg("--cache-dir")
.arg(cache_dir.path())
.current_dir(&temp_dir)
.assert()
.success();
venv.assert(predicates::path::is_dir());
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug @ https://files.pythonhosted.org/packages/ff/1d/960bb4017c68674a1cb099534840f18d3def3ce44aed12b5ed8b78e0153e/Werkzeug-2.0.0-py3-none-any.whl")?;
Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir)
.assert()
.success();
insta::with_settings!({
filters => vec![
(r"(\d|\.)+(ms|s)", "[TIME]"),
]
}, {
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir));
});
Command::new(venv.join("bin").join("python"))
.arg("-c")
.arg("import werkzeug")
.current_dir(&temp_dir)
.assert()
.success();
Ok(())
}
/// Install a package via a URL, then via a registry version. The second install _should_ remove the
/// URL-based version, but doesn't right now.
#[test]
fn install_url_then_install_version() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
Command::new(get_cargo_bin(BIN_NAME))
.arg("venv")
.arg(venv.as_os_str())
.arg("--cache-dir")
.arg(cache_dir.path())
.current_dir(&temp_dir)
.assert()
.success();
venv.assert(predicates::path::is_dir());
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug @ https://files.pythonhosted.org/packages/ff/1d/960bb4017c68674a1cb099534840f18d3def3ce44aed12b5ed8b78e0153e/Werkzeug-2.0.0-py3-none-any.whl")?;
Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir)
.assert()
.success();
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug==2.0.0")?;
insta::with_settings!({
filters => vec![
(r"(\d|\.)+(ms|s)", "[TIME]"),
]
}, {
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir));
});
Command::new(venv.join("bin").join("python"))
.arg("-c")
.arg("import werkzeug")
.current_dir(&temp_dir)
.assert()
.success();
Ok(())
}
/// Install a package via a registry version, then via a direct URL version. The second install
/// should remove the registry-based version.
#[test]
fn install_version_then_install_url() -> Result<()> {
let temp_dir = assert_fs::TempDir::new()?;
let cache_dir = assert_fs::TempDir::new()?;
let venv = temp_dir.child(".venv");
Command::new(get_cargo_bin(BIN_NAME))
.arg("venv")
.arg(venv.as_os_str())
.arg("--cache-dir")
.arg(cache_dir.path())
.current_dir(&temp_dir)
.assert()
.success();
venv.assert(predicates::path::is_dir());
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug==2.0.0")?;
Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir)
.assert()
.success();
let requirements_txt = temp_dir.child("requirements.txt");
requirements_txt.touch()?;
requirements_txt.write_str("werkzeug @ https://files.pythonhosted.org/packages/ff/1d/960bb4017c68674a1cb099534840f18d3def3ce44aed12b5ed8b78e0153e/Werkzeug-2.0.0-py3-none-any.whl")?;
insta::with_settings!({
filters => vec![
(r"(\d|\.)+(ms|s)", "[TIME]"),
]
}, {
assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME))
.arg("pip-sync")
.arg("requirements.txt")
.arg("--cache-dir")
.arg(cache_dir.path())
.env("VIRTUAL_ENV", venv.as_os_str())
.current_dir(&temp_dir));
});
Command::new(venv.join("bin").join("python"))
.arg("-c")
.arg("import werkzeug")
.current_dir(&temp_dir)
.assert()
.success();
Ok(())
}

View file

@ -0,0 +1,24 @@
---
source: crates/puffin-cli/tests/pip_sync.rs
info:
program: puffin
args:
- pip-sync
- requirements.txt
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpMEVMlO
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpevmjat/.venv
---
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Downloaded 1 package in [TIME]
Built 1 package in [TIME]
Unzipped 1 package in [TIME]
Installed 1 package in [TIME]
+ werkzeug @ git+https://github.com/pallets/werkzeug.git@af160e0b6b7ddd81c22f1652c728ff5ac72d5c74

View file

@ -6,9 +6,9 @@ info:
- pip-sync
- requirements.txt
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpZNXBuT
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpaUnvqN
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpBYANH7/.venv
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpd1OThU/.venv
---
success: true
exit_code: 0

View file

@ -0,0 +1,19 @@
---
source: crates/puffin-cli/tests/pip_sync.rs
info:
program: puffin
args:
- pip-sync
- requirements.txt
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmppJc9Sz
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpf8JjJa/.venv
---
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Audited 1 package in [TIME]

View file

@ -0,0 +1,19 @@
---
source: crates/puffin-cli/tests/pip_sync.rs
info:
program: puffin
args:
- pip-sync
- requirements.txt
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmp83DdhQ
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpWIocF0/.venv
---
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Audited 1 package in [TIME]

View file

@ -0,0 +1,25 @@
---
source: crates/puffin-cli/tests/pip_sync.rs
info:
program: puffin
args:
- pip-sync
- requirements.txt
- "--cache-dir"
- /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmp4yAvnn
env:
VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpAIWfM9/.venv
---
success: true
exit_code: 0
----- stdout -----
----- stderr -----
Resolved 1 package in [TIME]
Downloaded 1 package in [TIME]
Unzipped 1 package in [TIME]
Uninstalled 1 package in [TIME]
Installed 1 package in [TIME]
- werkzeug==2.0.0
+ werkzeug @ https://files.pythonhosted.org/packages/ff/1d/960bb4017c68674a1cb099534840f18d3def3ce44aed12b5ed8b78e0153e/Werkzeug-2.0.0-py3-none-any.whl

View file

@ -17,4 +17,6 @@ puffin-normalize = { path = "../puffin-normalize" }
pypi-types = { path = "../pypi-types" }
anyhow = { workspace = true }
fs-err = { workspace = true }
serde_json = { workspace = true }
url = { workspace = true }

View file

@ -8,7 +8,7 @@ use url::Url;
use pep440_rs::Version;
use puffin_cache::CanonicalUrl;
use puffin_normalize::PackageName;
use pypi_types::File;
use pypi_types::{DirectUrl, File};
pub mod source;
@ -267,15 +267,6 @@ pub struct InstalledDistribution {
}
impl InstalledDistribution {
/// Initialize a new installed distribution.
pub fn new(name: PackageName, version: Version, path: PathBuf) -> Self {
Self {
name,
version,
path,
}
}
/// Try to parse a distribution from a `.dist-info` directory name (like `django-5.0a1.dist-info`).
///
/// See: <https://packaging.python.org/en/latest/specifications/recording-installed-packages/#recording-installed-packages>
@ -310,6 +301,7 @@ impl InstalledDistribution {
&self.name
}
/// Return the [`Version`] of the distribution.
pub fn version(&self) -> &Version {
&self.version
}
@ -322,8 +314,20 @@ impl InstalledDistribution {
/// Return a [`Version`], for registry-based distributions, or a [`Url`], for URL-based
/// distributions.
pub fn version_or_url(&self) -> VersionOrUrl {
// TODO(charlie): If this dependency was installed via a direct URL, return it here, rather
// than the version.
VersionOrUrl::Version(&self.version)
}
/// Return the [`DirectUrl`] metadata for this distribution, if it exists.
pub fn direct_url(&self) -> Result<Option<DirectUrl>> {
let path = self.path.join("direct_url.json");
let Ok(file) = fs_err::File::open(path) else {
return Ok(None);
};
let direct_url = serde_json::from_reader::<fs_err::File, DirectUrl>(file)?;
Ok(Some(direct_url))
}
}
impl std::fmt::Display for InstalledDistribution {

View file

@ -1,9 +1,10 @@
use std::path::PathBuf;
use anyhow::{Error, Result};
use anyhow::{anyhow, Error, Result};
use url::Url;
use puffin_git::Git;
use pypi_types::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind};
use crate::RemoteDistributionRef;
@ -30,24 +31,32 @@ impl<'a> TryFrom<&'a RemoteDistributionRef<'_>> for Source<'a> {
// If a distribution is specified via a direct URL, it could be a URL to a hosted file,
// or a URL to a Git repository.
RemoteDistributionRef::Url(_, url) => {
// If the URL points to a subdirectory, extract it, as in:
// `https://git.example.com/MyProject.git@v1.0#subdirectory=pkg_dir`
// `https://git.example.com/MyProject.git@v1.0#egg=pkg&subdirectory=pkg_dir`
let subdirectory = url.fragment().and_then(|fragment| {
fragment.split('&').find_map(|fragment| {
fragment.strip_prefix("subdirectory=").map(PathBuf::from)
})
});
RemoteDistributionRef::Url(_, url) => Self::try_from(*url),
}
}
}
if let Some(url) = url.as_str().strip_prefix("git+") {
let url = Url::parse(url)?;
let git = Git::try_from(url)?;
Ok(Self::Git(git, subdirectory))
} else {
Ok(Self::RemoteUrl(url, subdirectory))
}
}
impl<'a> TryFrom<&'a Url> for Source<'a> {
type Error = Error;
fn try_from(url: &'a Url) -> Result<Self, Self::Error> {
// If the URL points to a subdirectory, extract it, as in:
// `https://git.example.com/MyProject.git@v1.0#subdirectory=pkg_dir`
// `https://git.example.com/MyProject.git@v1.0#egg=pkg&subdirectory=pkg_dir`
let subdirectory = url.fragment().and_then(|fragment| {
fragment
.split('&')
.find_map(|fragment| fragment.strip_prefix("subdirectory=").map(PathBuf::from))
});
// If a distribution is specified via a direct URL, it could be a URL to a hosted file,
// or a URL to a Git repository.
if let Some(url) = url.as_str().strip_prefix("git+") {
let url = Url::parse(url)?;
let git = Git::try_from(url)?;
Ok(Self::Git(git, subdirectory))
} else {
Ok(Self::RemoteUrl(url, subdirectory))
}
}
}
@ -76,3 +85,32 @@ impl From<Source<'_>> for Url {
}
}
}
impl TryFrom<Source<'_>> for DirectUrl {
type Error = Error;
fn try_from(value: Source<'_>) -> Result<Self, Self::Error> {
match value {
Source::RegistryUrl(_) => Err(anyhow!("Registry dependencies have no direct URL")),
Source::RemoteUrl(url, subdirectory) => Ok(DirectUrl::ArchiveUrl {
url: url.to_string(),
archive_info: ArchiveInfo {
hash: None,
hashes: None,
},
subdirectory,
}),
Source::Git(git, subdirectory) => Ok(DirectUrl::VcsUrl {
url: git.url().to_string(),
vcs_info: VcsInfo {
vcs: VcsKind::Git,
// TODO(charlie): In `pip-sync`, we should `.precise` our Git dependencies,
// even though we expect it to be a no-op.
commit_id: git.precise().map(|oid| oid.to_string()),
requested_revision: git.reference().map(ToString::to_string),
},
subdirectory,
}),
}
}
}

View file

@ -24,6 +24,29 @@ impl Git {
self.precise = Some(precise);
self
}
/// Return the [`Url`] of the Git repository.
pub fn url(&self) -> &Url {
&self.url
}
/// Return the reference to the commit to use, which could be a branch, tag or revision.
pub fn reference(&self) -> Option<&str> {
match &self.reference {
GitReference::Branch(rev)
| GitReference::Tag(rev)
| GitReference::BranchOrTag(rev)
| GitReference::Ref(rev)
| GitReference::FullCommit(rev)
| GitReference::ShortCommit(rev) => Some(rev),
GitReference::DefaultBranch => None,
}
}
/// Return the precise commit, if known.
pub fn precise(&self) -> Option<git2::Oid> {
self.precise
}
}
impl TryFrom<Url> for Git {

View file

@ -1,8 +1,10 @@
use anyhow::{Context, Error, Result};
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use puffin_distribution::source::Source;
use puffin_distribution::CachedDistribution;
use puffin_interpreter::Virtualenv;
use pypi_types::DirectUrl;
pub struct Installer<'a> {
venv: &'a Virtualenv,
@ -47,7 +49,7 @@ impl<'a> Installer<'a> {
install_wheel_rs::linker::install_wheel(
&location,
wheel.path(),
None,
direct_url(wheel)?.as_ref(),
self.link_mode,
)
.with_context(|| format!("Failed to install: {wheel}"))?;
@ -62,6 +64,17 @@ impl<'a> Installer<'a> {
}
}
/// Return the [`DirectUrl`] for a wheel, if applicable.
///
/// TODO(charlie): This shouldn't be in `puffin-installer`.
fn direct_url(wheel: &CachedDistribution) -> Result<Option<DirectUrl>> {
let CachedDistribution::Url(_, url, _) = wheel else {
return Ok(None);
};
let source = Source::try_from(url)?;
DirectUrl::try_from(source).map(Some)
}
pub trait Reporter: Send + Sync {
/// Callback to invoke when a dependency is resolved.
fn on_install_progress(&self, wheel: &CachedDistribution);

View file

@ -4,8 +4,10 @@ use anyhow::{Context, Result};
use tracing::debug;
use pep508_rs::{Requirement, VersionOrUrl};
use puffin_distribution::source::Source;
use puffin_distribution::{CachedDistribution, InstalledDistribution};
use puffin_interpreter::Virtualenv;
use pypi_types::DirectUrl;
use crate::url_index::UrlIndex;
use crate::{RegistryIndex, SitePackages};
@ -47,14 +49,35 @@ impl InstallPlan {
for requirement in requirements {
// Filter out already-installed packages.
// TODO(charlie): Detect packages installed via URL. Right now, like pip, we _always_
// attempt to reinstall a package if it was installed via URL. This is often very
// fast, since the wheel is cached, but it should still be avoidable.
if let Some(distribution) = site_packages.remove(&requirement.name) {
if requirement.is_satisfied_by(distribution.version()) {
debug!("Requirement already satisfied: {distribution}",);
continue;
// We need to map here from the requirement to its DirectUrl, then see if that DirectUrl
// is anywhere in `site_packages`.
match requirement.version_or_url.as_ref() {
// If the requirement comes from a registry, check by name.
None | Some(VersionOrUrl::VersionSpecifier(_)) => {
if requirement.is_satisfied_by(distribution.version()) {
debug!("Requirement already satisfied: {distribution}");
continue;
}
}
// If the requirement comes from a direct URL, check by URL.
Some(VersionOrUrl::Url(url)) => {
if let Ok(Some(direct_url)) = distribution.direct_url() {
if let Ok(source) = Source::try_from(url) {
if let Ok(target) = DirectUrl::try_from(source) {
// TODO(charlie): These don't need to be strictly equal. We only care
// about a subset of the fields.
if target == direct_url {
debug!("Requirement already satisfied: {distribution}");
continue;
}
}
}
}
}
}
extraneous.push(distribution);
}

View file

@ -146,6 +146,11 @@ impl<'a, T: BuildContext> SourceDistributionFetcher<'a, T> {
return Ok(None);
};
// If the commit already contains a complete SHA, short-circuit.
if git.precise().is_some() {
return Ok(None);
}
// Fetch the precise SHA of the Git reference (which could be a branch, a tag, a partial
// commit, etc.).
let dir = self.0.cache().join(GIT_CACHE);

View file

@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
/// Metadata for a distribution that was installed via a direct URL.
///
/// See: <https://packaging.python.org/en/latest/specifications/direct-url-data-structure/>
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DirectUrl {
/// The direct URL is a path to an archive. For example:
@ -31,7 +31,7 @@ pub enum DirectUrl {
},
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct ArchiveInfo {
#[serde(skip_serializing_if = "Option::is_none")]
@ -40,16 +40,17 @@ pub struct ArchiveInfo {
pub hashes: Option<HashMap<String, String>>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub struct VcsInfo {
pub vcs: VcsKind,
pub commit_id: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub commit_id: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub requested_revision: Option<String>,
}
#[derive(Debug, Copy, Clone, Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VcsKind {
Git,

View file

@ -1,4 +1,4 @@
pub use direct_url::DirectUrl;
pub use direct_url::{ArchiveInfo, DirectUrl, VcsInfo, VcsKind};
pub use metadata::{Error, Metadata21};
pub use simple_json::{File, SimpleJson};