Write fully-precise Git SHAs to pip-compile output (#299)

This PR adds a mechanism by which we can ensure that we _always_ try to
refresh Git dependencies when resolving; further, we now write the fully
resolved SHA to the "lockfile". However, nothing in the code _assumes_
we do this, so the installer will remain agnostic to this behavior.

The specific approach taken here is minimally invasive. Specifically,
when we try to fetch a source distribution, we check if it's a Git
dependency; if it is, we fetch, and return the exact SHA, which we then
map back to a new URL. In the resolver, we keep track of URL
"redirects", and then we use the redirect (1) for the actual source
distribution building, and (2) when writing back out to the lockfile. As
such, none of the types outside of the resolver change at all, since
we're just mapping `RemoteDistribution` to `RemoteDistribution`, but
swapping out the internal URLs.

There are some inefficiencies here since, e.g., we do the Git fetch,
send back the "precise" URL, then a moment later, do a Git checkout of
that URL (which will be _mostly_ a no-op -- since we have a full SHA, we
don't have to fetch anything, but we _do_ check back on disk to see if
the SHA is still checked out). A more efficient approach would be to
return the path to the checked-out revision when we do this conversion
to a "precise" URL, since we'd then only interact with the Git repo
exactly once. But this runs the risk that the checked-out SHA changes
between the time we make the "precise" URL and the time we build the
source distribution.

Closes #286.
This commit is contained in:
Charlie Marsh 2023-11-03 09:26:57 -07:00 committed by GitHub
parent addcfe533a
commit fa1bbbbe08
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 254 additions and 124 deletions

View file

@ -9,47 +9,38 @@ use tracing::debug;
use puffin_cache::{digest, CanonicalUrl};
use crate::git::{GitReference, GitRemote};
use crate::git::GitRemote;
use crate::{FetchStrategy, Git};
/// A remote Git source that can be checked out locally.
pub struct GitSource {
/// The git remote which we're going to fetch from.
remote: GitRemote,
/// The Git reference from the manifest file.
manifest_reference: GitReference,
/// The revision which a git source is locked to.
/// This is expected to be set after the Git repository is fetched.
locked_rev: Option<git2::Oid>,
/// The identifier of this source for Cargo's Git cache directory.
/// See [`ident`] for more.
ident: String,
git: Git,
/// The HTTP client to use for fetching.
client: Client,
/// The fetch strategy to use when cloning.
strategy: FetchStrategy,
/// The path to the Git source database.
git: PathBuf,
cache: PathBuf,
}
impl GitSource {
pub fn new(reference: Git, git: PathBuf) -> Self {
pub fn new(git: Git, cache: impl Into<PathBuf>) -> Self {
Self {
remote: GitRemote::new(&reference.url),
manifest_reference: reference.reference,
locked_rev: reference.precise,
ident: digest(&CanonicalUrl::new(&reference.url)),
git,
client: Client::new(),
strategy: FetchStrategy::Libgit2,
git,
cache: cache.into(),
}
}
pub fn fetch(self) -> Result<PathBuf> {
pub fn fetch(self) -> Result<Fetch> {
// The path to the repo, within the Git database.
let db_path = self.git.join("db").join(&self.ident);
let ident = digest(&CanonicalUrl::new(&self.git.url));
let db_path = self.cache.join("db").join(&ident);
let (db, actual_rev) = match (self.locked_rev, self.remote.db_at(&db_path).ok()) {
let remote = GitRemote::new(&self.git.url);
let (db, actual_rev) = match (self.git.precise, remote.db_at(&db_path).ok()) {
// If we have a locked revision, and we have a preexisting database
// which has that revision, then no update needs to happen.
(Some(rev), Some(db)) if db.contains(rev) => (db, rev),
@ -59,12 +50,12 @@ impl GitSource {
// situation that we have a locked revision but the database
// doesn't have it.
(locked_rev, db) => {
debug!("Updating Git source: `{:?}`", self.remote);
debug!("Updating Git source: `{:?}`", remote);
self.remote.checkout(
remote.checkout(
&db_path,
db,
&self.manifest_reference,
&self.git.reference,
locked_rev,
self.strategy,
&self.client,
@ -80,12 +71,34 @@ impl GitSource {
// filesystem. This will use hard links and such to ideally make the
// checkout operation here pretty fast.
let checkout_path = self
.git
.cache
.join("checkouts")
.join(&self.ident)
.join(&ident)
.join(short_id.as_str());
db.copy_to(actual_rev, &checkout_path, self.strategy, &self.client)?;
Ok(checkout_path)
Ok(Fetch {
git: self.git.with_precise(actual_rev),
path: checkout_path,
})
}
}
/// The outcome of a successful [`GitSource::fetch`]: the [`Git`] reference that was fetched,
/// paired with the on-disk location of its checkout.
///
/// Both fields are private; callers extract the part they need through the `From<Fetch>`
/// conversions into [`Git`] or [`PathBuf`].
pub struct Fetch {
/// The [`Git`] reference that was fetched. `GitSource::fetch` builds this by calling
/// `with_precise` on the source's `Git` with the revision that was actually checked out.
git: Git,
/// The path to the checked out repository: the directory that `GitSource::fetch` copied
/// the revision into, located under the cache's `checkouts` directory.
path: PathBuf,
}
impl From<Fetch> for Git {
fn from(fetch: Fetch) -> Self {
fetch.git
}
}
impl From<Fetch> for PathBuf {
fn from(fetch: Fetch) -> Self {
fetch.path
}
}