From 62c474d88054a7e0032765147e6a434505e80d14 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 2 Nov 2023 08:14:55 -0700 Subject: [PATCH] Add support for Git dependencies (#283) ## Summary This PR adds support for Git dependencies, like: ``` flask @ git+https://github.com/pallets/flask.git ``` Right now, they're only supported in the resolver (and not the installer), since the installer doesn't yet support source distributions at all. The general approach here is based on Cargo's Git implementation. Specifically, I adapted Cargo's [`git`](https://github.com/rust-lang/cargo/blob/23eb492cf920ce051abfc56bbaf838514dc8365c/src/cargo/sources/git/mod.rs) module to perform the cloning, which is based on `libgit2`. As compared to Cargo's implementation, I made the following changes: - Removed any unnecessary code. - Fixed any Clippy errors for our stricter ruleset. - Removed the dependency on `curl`, in favor of `reqwest` which we use elsewhere. - Removed the ability to use `gix`. Cargo allows the use of `gix` as an experimental flag, but it only supports a small subset of the operations. When Cargo fully adopts `gix`, we should plan to do the same. - Removed Cargo's host key checking. We need to re-add this! I'll do it shortly. - Removed Cargo's progress bars. We should re-add this too, but we use `indicatif` and Cargo had their own thing. There are a few follow-ups to consider: - Adding support in the installer. - When we lock, we should write out the Git URL that includes the exact SHA. This lets us cache in perpetuity and avoids dependencies changing without re-locking. - When we resolve, we should _always_ try to refresh Git dependencies. (Right now, we skip if the wheel was already built.) I'll work on the latter two in follow-up PRs. Closes #202. 
--- Cargo.lock | 168 ++ Cargo.toml | 2 + crates/puffin-build/src/lib.rs | 13 +- crates/puffin-cli/tests/pip_compile.rs | 40 + ...compile__compile_git_https_dependency.snap | 36 + crates/puffin-git/Cargo.toml | 30 + crates/puffin-git/src/git.rs | 1365 +++++++++++++++++ crates/puffin-git/src/lib.rs | 73 + crates/puffin-git/src/source.rs | 91 ++ crates/puffin-git/src/util/errors.rs | 45 + crates/puffin-git/src/util/mod.rs | 17 + crates/puffin-git/src/util/retry.rs | 187 +++ crates/puffin-resolver/Cargo.toml | 1 + crates/puffin-resolver/src/resolver.rs | 48 +- .../src/source_distribution.rs | 78 +- 15 files changed, 2162 insertions(+), 32 deletions(-) create mode 100644 crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap create mode 100644 crates/puffin-git/Cargo.toml create mode 100644 crates/puffin-git/src/git.rs create mode 100644 crates/puffin-git/src/lib.rs create mode 100644 crates/puffin-git/src/source.rs create mode 100644 crates/puffin-git/src/util/errors.rs create mode 100644 crates/puffin-git/src/util/mod.rs create mode 100644 crates/puffin-git/src/util/retry.rs diff --git a/Cargo.lock b/Cargo.lock index 2451ddf19..2f2c120e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -362,12 +362,35 @@ dependencies = [ "serde", ] +[[package]] +name = "cargo-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77042b5b585f701f1cfb4b6b12ebc02b9b0cefbc8dcce235906b6bf376d4245d" +dependencies = [ + "anyhow", + "core-foundation", + "filetime", + "hex", + "jobserver", + "libc", + "miow", + "same-file", + "sha2", + "shell-escape", + "tempfile", + "tracing", + "walkdir", + "windows-sys 0.48.0", +] + [[package]] name = "cc" version = "1.0.83" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" dependencies = [ + "jobserver", "libc", ] @@ -956,6 +979,27 @@ version = "0.28.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6fb8d784f27acf97159b40fc4db5ecd8aa23b9ad5ef69cdd136d3bc80665f0c0" +[[package]] +name = "git2" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf97ba92db08df386e10c8ede66a2a0369bd277090afd8710e19e38de9ec0cd" +dependencies = [ + "bitflags 2.4.1", + "libc", + "libgit2-sys", + "log", + "openssl-probe", + "openssl-sys", + "url", +] + +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "globset" version = "0.4.13" @@ -1412,6 +1456,15 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.64" @@ -1433,6 +1486,46 @@ version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" +[[package]] +name = "libgit2-sys" +version = "0.16.1+1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2a2bb3680b094add03bb3732ec520ece34da31a8cd2d633d1389d0f0fb60d0c" +dependencies = [ + "cc", + "libc", + "libssh2-sys", + "libz-sys", + "openssl-sys", + "pkg-config", +] + +[[package]] +name = "libssh2-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dc8a030b787e2119a731f1951d6a773e2280c660f8ec4b0f5e1505a386e71ee" +dependencies = [ + "cc", + "libc", + "libz-sys", + "openssl-sys", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "libz-sys" +version = 
"1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "line-wrap" version = "0.1.1" @@ -1582,6 +1675,15 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "miow" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "359f76430b20a79f9e20e115b3428614e654f04fab314482fc0fda0ebd3c6044" +dependencies = [ + "windows-sys 0.48.0", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1638,6 +1740,34 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "openssl-src" +version = "300.1.6+3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439fac53e092cd7442a3660c85dde4643ab3b5bd39040912388dcdabf6b88085" +dependencies = [ + "cc", +] + +[[package]] +name = "openssl-sys" +version = "0.9.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db4d56a4c0478783083cfafcc42493dd4a981d41669da64b4572a2a089b51b1d" +dependencies = [ + "cc", + "libc", + "openssl-src", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -1778,6 +1908,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkg-config" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" + [[package]] name 
= "plain" version = "0.2.3" @@ -2110,6 +2246,25 @@ dependencies = [ "url", ] +[[package]] +name = "puffin-git" +version = "0.0.1" +dependencies = [ + "anyhow", + "cargo-util", + "git2", + "glob", + "hex", + "once_cell", + "puffin-cache", + "rand", + "reqwest", + "serde", + "tokio", + "tracing", + "url", +] + [[package]] name = "puffin-installer" version = "0.0.1" @@ -2218,6 +2373,7 @@ dependencies = [ "pubgrub", "puffin-client", "puffin-distribution", + "puffin-git", "puffin-interpreter", "puffin-normalize", "puffin-package", @@ -2841,6 +2997,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shell-escape" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45bb67a18fa91266cc7807181f62f9178a6873bfad7dc788c42e6430db40184f" + [[package]] name = "similar" version = "2.3.0" @@ -3505,6 +3667,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 326dbbd11..943d3b24f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ fs-err = { version = "2.9.0" } fs2 = { version = "0.4.3" } futures = { version = "0.3.28" } fxhash = { version = "0.2.1" } +glob = { version = "0.3.1" } goblin = { version = "0.7.1" } hex = { version = "0.4.3" } http-cache-reqwest = { version = "0.11.3" } @@ -42,6 +43,7 @@ petgraph = { version = "0.6.4" } platform-info = { version = "2.0.2" } plist = { version = "1.5.0" } pyproject-toml = { version = "0.7.0" } +rand = { version = "0.8.5" } rayon = { version = "1.8.0" } reflink-copy = { version = "0.1.10" } regex = { version = "1.9.6" } diff --git a/crates/puffin-build/src/lib.rs 
b/crates/puffin-build/src/lib.rs index 57022cd7e..fe0954cc7 100644 --- a/crates/puffin-build/src/lib.rs +++ b/crates/puffin-build/src/lib.rs @@ -117,7 +117,8 @@ pub struct SourceDistributionBuilder { } impl SourceDistributionBuilder { - /// Extract the source distribution and create a venv with the required packages + /// Create a virtual environment in which to build a source distribution, extracting the + /// contents from an archive if necessary. pub async fn setup( sdist: &Path, interpreter_info: &InterpreterInfo, @@ -126,9 +127,13 @@ impl SourceDistributionBuilder { let temp_dir = tempdir()?; // TODO(konstin): Parse and verify filenames - debug!("Unpacking for build {}", sdist.display()); - let extracted = temp_dir.path().join("extracted"); - let source_tree = extract_archive(sdist, &extracted)?; + let source_tree = if fs::metadata(sdist)?.is_dir() { + sdist.to_path_buf() + } else { + debug!("Unpacking for build: {}", sdist.display()); + let extracted = temp_dir.path().join("extracted"); + extract_archive(sdist, &extracted)? + }; // Check if we have a PEP 517 build, a legacy setup.py, or an edge case let build_system = if source_tree.join("pyproject.toml").is_file() { diff --git a/crates/puffin-cli/tests/pip_compile.rs b/crates/puffin-cli/tests/pip_compile.rs index 5417e11c3..85e10bfaf 100644 --- a/crates/puffin-cli/tests/pip_compile.rs +++ b/crates/puffin-cli/tests/pip_compile.rs @@ -627,6 +627,46 @@ fn compile_sdist_url_dependency() -> Result<()> { Ok(()) } +/// Resolve a specific Flask source distribution via a Git HTTPS dependency. 
+#[test] +fn compile_git_https_dependency() -> Result<()> { + let temp_dir = assert_fs::TempDir::new()?; + let cache_dir = assert_fs::TempDir::new()?; + let venv = temp_dir.child(".venv"); + + Command::new(get_cargo_bin(BIN_NAME)) + .arg("venv") + .arg(venv.as_os_str()) + .arg("--cache-dir") + .arg(cache_dir.path()) + .current_dir(&temp_dir) + .assert() + .success(); + venv.assert(predicates::path::is_dir()); + + let requirements_in = temp_dir.child("requirements.in"); + requirements_in.touch()?; + requirements_in.write_str("flask @ git+https://github.com/pallets/flask.git")?; + + insta::with_settings!({ + filters => vec![ + (r"(\d|\.)+(ms|s)", "[TIME]"), + (r"# .* pip-compile", "# [BIN_PATH] pip-compile"), + (r"--cache-dir .*", "--cache-dir [CACHE_DIR]"), + ] + }, { + assert_cmd_snapshot!(Command::new(get_cargo_bin(BIN_NAME)) + .arg("pip-compile") + .arg("requirements.in") + .arg("--cache-dir") + .arg(cache_dir.path()) + .env("VIRTUAL_ENV", venv.as_os_str()) + .current_dir(&temp_dir)); + }); + + Ok(()) +} + /// Request Flask, but include a URL dependency for Werkzeug, which should avoid adding a /// duplicate dependency from `PyPI`. 
#[test] diff --git a/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap new file mode 100644 index 000000000..393b62e77 --- /dev/null +++ b/crates/puffin-cli/tests/snapshots/pip_compile__compile_git_https_dependency.snap @@ -0,0 +1,36 @@ +--- +source: crates/puffin-cli/tests/pip_compile.rs +info: + program: puffin + args: + - pip-compile + - requirements.in + - "--cache-dir" + - /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpbvYz3u + env: + VIRTUAL_ENV: /var/folders/nt/6gf2v7_s3k13zq_t3944rwz40000gn/T/.tmpHYkK5F/.venv +--- +success: true +exit_code: 0 +----- stdout ----- +# This file was autogenerated by Puffin v0.0.1 via the following command: +# [BIN_PATH] pip-compile requirements.in --cache-dir [CACHE_DIR] +blinker==1.7.0 + # via flask +click==8.1.7 + # via flask +flask @ git+https://github.com/pallets/flask.git +itsdangerous==2.1.2 + # via flask +jinja2==3.1.2 + # via flask +markupsafe==2.1.3 + # via + # jinja2 + # werkzeug +werkzeug==3.0.1 + # via flask + +----- stderr ----- +Resolved 7 packages in [TIME] + diff --git a/crates/puffin-git/Cargo.toml b/crates/puffin-git/Cargo.toml new file mode 100644 index 000000000..9e4cf4c72 --- /dev/null +++ b/crates/puffin-git/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "puffin-git" +version = "0.0.1" +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +license = { workspace = true } + +[dependencies] +puffin-cache = { path = "../puffin-cache" } + +anyhow = { workspace = true } +cargo-util = { version = "0.2.6" } +git2 = { version = "0.18.1" } +glob = { workspace = true } +hex = { workspace = true } +once_cell = { workspace = true } +rand = { workspace = true } +serde = { workspace = true } +tracing = { workspace = true } +url = { workspace = true } 
+reqwest = { workspace = true, features = ["blocking"] } +tokio.workspace = true + +[features] +vendored-libgit2 = ["git2/vendored-libgit2"] +vendored-openssl = ["git2/vendored-openssl"] diff --git a/crates/puffin-git/src/git.rs b/crates/puffin-git/src/git.rs new file mode 100644 index 000000000..ebc43b3d3 --- /dev/null +++ b/crates/puffin-git/src/git.rs @@ -0,0 +1,1365 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::borrow::Cow; +use std::path::{Path, PathBuf}; +use std::process::Command; +use std::{env, str}; + +use anyhow::{anyhow, Context as _, Result}; +use cargo_util::{paths, ProcessBuilder}; +use git2::{self, ErrorClass, ObjectType}; +use reqwest::Client; +use reqwest::StatusCode; +use tracing::{debug, info, warn}; +use url::Url; + +use crate::util::retry; +use crate::{FetchStrategy, GitReference}; + +/// A file indicates that if present, `git reset` has been done and a repo +/// checkout is ready to go. See [`GitCheckout::reset`] for why we need this. +const CHECKOUT_READY_LOCK: &str = ".ok"; + +/// A short abbreviated OID. +/// +/// Exists for avoiding extra allocations in [`GitDatabase::to_short_id`]. +pub(crate) struct GitShortID(git2::Buf); + +impl GitShortID { + /// Views the short ID as a `str`. + pub(crate) fn as_str(&self) -> &str { + self.0.as_str().unwrap() + } +} + +/// A remote repository. It gets cloned into a local [`GitDatabase`]. +#[derive(PartialEq, Clone, Debug)] +pub(crate) struct GitRemote { + /// URL to a remote repository. + url: Url, +} + +/// A local clone of a remote repository's database. Multiple [`GitCheckout`]s +/// can be cloned from a single [`GitDatabase`]. +pub(crate) struct GitDatabase { + /// The remote repository where this database is fetched from. + remote: GitRemote, + /// Path to the root of the underlying Git repository on the local filesystem. 
+ path: PathBuf, + /// Underlying Git repository instance for this database. + repo: git2::Repository, +} + +/// A local checkout of a particular revision from a [`GitDatabase`]. +pub(crate) struct GitCheckout<'a> { + /// The git database where this checkout is cloned from. + database: &'a GitDatabase, + /// Path to the root of the underlying Git repository on the local filesystem. + path: PathBuf, + /// The git revision this checkout is for. + revision: git2::Oid, + /// Underlying Git repository instance for this checkout. + repo: git2::Repository, +} + +impl GitRemote { + /// Creates an instance for a remote repository URL. + pub(crate) fn new(url: &Url) -> GitRemote { + GitRemote { url: url.clone() } + } + + /// Gets the remote repository URL. + pub(crate) fn url(&self) -> &Url { + &self.url + } + + /// Fetches and checkouts to a reference or a revision from this remote + /// into a local path. + /// + /// This ensures that it gets the up-to-date commit when a named reference + /// is given (tag, branch, refs/*). Thus, network connection is involved. + /// + /// When `locked_rev` is provided, it takes precedence over `reference`. + /// + /// If we have a previous instance of [`GitDatabase`] then fetch into that + /// if we can. If that can successfully load our revision then we've + /// populated the database with the latest version of `reference`, so + /// return that database and the rev we resolve to. 
+ pub(crate) fn checkout( + &self, + into: &Path, + db: Option, + reference: &GitReference, + locked_rev: Option, + strategy: FetchStrategy, + client: &Client, + ) -> Result<(GitDatabase, git2::Oid)> { + let locked_ref = locked_rev.map(|oid| GitReference::Rev(oid.to_string())); + let reference = locked_ref.as_ref().unwrap_or(reference); + if let Some(mut db) = db { + fetch(&mut db.repo, self.url.as_str(), reference, strategy, client) + .with_context(|| format!("failed to fetch into: {}", into.display()))?; + + let resolved_commit_hash = match locked_rev { + Some(rev) => db.contains(rev).then_some(rev), + None => reference.resolve(&db.repo).ok(), + }; + if let Some(rev) = resolved_commit_hash { + return Ok((db, rev)); + } + } + + // Otherwise start from scratch to handle corrupt git repositories. + // After our fetch (which is interpreted as a clone now) we do the same + // resolution to figure out what we cloned. + if into.exists() { + paths::remove_dir_all(into)?; + } + paths::create_dir_all(into)?; + let mut repo = init(into, true)?; + fetch(&mut repo, self.url.as_str(), reference, strategy, client) + .with_context(|| format!("failed to clone into: {}", into.display()))?; + let rev = match locked_rev { + Some(rev) => rev, + None => reference.resolve(&repo)?, + }; + + Ok(( + GitDatabase { + remote: self.clone(), + path: into.to_path_buf(), + repo, + }, + rev, + )) + } + + /// Creates a [`GitDatabase`] of this remote at `db_path`. + pub(crate) fn db_at(&self, db_path: &Path) -> Result { + let repo = git2::Repository::open(db_path)?; + Ok(GitDatabase { + remote: self.clone(), + path: db_path.to_path_buf(), + repo, + }) + } +} + +impl GitDatabase { + /// Checkouts to a revision at `destination` from this database. + pub(crate) fn copy_to( + &self, + rev: git2::Oid, + destination: &Path, + strategy: FetchStrategy, + client: &Client, + ) -> Result> { + // If the existing checkout exists, and it is fresh, use it. 
+ // A non-fresh checkout can happen if the checkout operation was + // interrupted. In that case, the checkout gets deleted and a new + // clone is created. + let checkout = match git2::Repository::open(destination) + .ok() + .map(|repo| GitCheckout::new(self, rev, repo)) + .filter(GitCheckout::is_fresh) + { + Some(co) => co, + None => GitCheckout::clone_into(destination, self, rev)?, + }; + checkout.update_submodules(strategy, client)?; + Ok(checkout) + } + + /// Get a short OID for a `revision`, usually 7 chars or more if ambiguous. + pub(crate) fn to_short_id(&self, revision: git2::Oid) -> Result { + let obj = self.repo.find_object(revision, None)?; + Ok(GitShortID(obj.short_id()?)) + } + + /// Checks if the database contains the object of this `oid`. + pub(crate) fn contains(&self, oid: git2::Oid) -> bool { + self.repo.revparse_single(&oid.to_string()).is_ok() + } +} + +impl GitReference { + /// Resolves self to an object ID with objects the `repo` currently has. + pub(crate) fn resolve(&self, repo: &git2::Repository) -> Result { + let id = match self { + // Note that we resolve the named tag here in sync with where it's + // fetched into via `fetch` below. + GitReference::Tag(s) => (|| -> Result { + let refname = format!("refs/remotes/origin/tags/{s}"); + let id = repo.refname_to_id(&refname)?; + let obj = repo.find_object(id, None)?; + let obj = obj.peel(ObjectType::Commit)?; + Ok(obj.id()) + })() + .with_context(|| format!("failed to find tag `{s}`"))?, + + // Resolve the remote name since that's all we're configuring in + // `fetch` below. + GitReference::Branch(s) => { + let name = format!("origin/{s}"); + let b = repo + .find_branch(&name, git2::BranchType::Remote) + .with_context(|| format!("failed to find branch `{s}`"))?; + b.get() + .target() + .ok_or_else(|| anyhow::format_err!("branch `{s}` did not have a target"))? 
+ } + + // We'll be using the HEAD commit + GitReference::DefaultBranch => { + let head_id = repo.refname_to_id("refs/remotes/origin/HEAD")?; + let head = repo.find_object(head_id, None)?; + head.peel(ObjectType::Commit)?.id() + } + + GitReference::Rev(s) => { + let obj = repo.revparse_single(s)?; + match obj.as_tag() { + Some(tag) => tag.target_id(), + None => obj.id(), + } + } + }; + Ok(id) + } +} + +impl<'a> GitCheckout<'a> { + /// Creates an instance of [`GitCheckout`]. This doesn't imply the checkout + /// is done. Use [`GitCheckout::is_fresh`] to check. + /// + /// * The `database` is where this checkout is from. + /// * The `repo` will be the checked out Git repository. + fn new( + database: &'a GitDatabase, + revision: git2::Oid, + repo: git2::Repository, + ) -> GitCheckout<'a> { + let path = repo.workdir().unwrap_or_else(|| repo.path()); + GitCheckout { + path: path.to_path_buf(), + database, + revision, + repo, + } + } + + /// Gets the remote repository URL. + fn remote_url(&self) -> &Url { + self.database.remote.url() + } + + /// Clone a repo for a `revision` into a local path from a `database`. + /// This is a filesystem-to-filesystem clone. + fn clone_into( + into: &Path, + database: &'a GitDatabase, + revision: git2::Oid, + ) -> Result<GitCheckout<'a>> { + let dirname = into.parent().unwrap(); + paths::create_dir_all(dirname)?; + if into.exists() { + paths::remove_dir_all(into)?; + } + + // we're doing a local filesystem-to-filesystem clone so there should + // be no need to respect global configuration options, so pass in + // an empty instance of `git2::Config` below. + let git_config = git2::Config::new()?; + + // Clone the repository, but make sure we use the "local" option in + // libgit2 which will attempt to use hardlinks to set up the database. + // This should speed up the clone operation quite a bit if it works.
+ // + // Note that we still use the same fetch options because while we don't + // need authentication information we may want progress bars and such. + let url = Url::from_file_path(&database.path) + .map_err(|()| anyhow::format_err!("Invalid path URL: {}", database.path.display()))?; + let mut repo = None; + with_fetch_options(&git_config, url.as_str(), &mut |fopts| { + let mut checkout = git2::build::CheckoutBuilder::new(); + checkout.dry_run(); // we'll do this below during a `reset` + + let r = git2::build::RepoBuilder::new() + // use hard links and/or copy the database, we're doing a + // filesystem clone so this'll speed things up quite a bit. + .clone_local(git2::build::CloneLocal::Local) + .with_checkout(checkout) + .fetch_options(fopts) + .clone(url.as_str(), into)?; + // `git2` doesn't seem to handle shallow repos correctly when doing + // a local clone. Fortunately all that's needed is the copy of the + // one file that defines the shallow boundary, the commits which + // have their parents omitted as part of the shallow clone. + // + // TODO(git2): remove this when git2 supports shallow clone correctly + if database.repo.is_shallow() { + std::fs::copy( + database.repo.path().join("shallow"), + r.path().join("shallow"), + )?; + } + repo = Some(r); + Ok(()) + })?; + let repo = repo.unwrap(); + + let checkout = GitCheckout::new(database, revision, repo); + checkout.reset()?; + Ok(checkout) + } + + /// Checks if the `HEAD` of this checkout points to the expected revision. + fn is_fresh(&self) -> bool { + match self.repo.revparse_single("HEAD") { + Ok(ref head) if head.id() == self.revision => { + // See comments in reset() for why we check this + self.path.join(CHECKOUT_READY_LOCK).exists() + } + _ => false, + } + } + + /// Similar to [`reset()`]. This roughly performs `git reset --hard` to the + /// revision of this checkout, with additional interrupt protection by a + /// dummy file [`CHECKOUT_READY_LOCK`]. 
+ /// + /// If we're interrupted while performing a `git reset` (e.g., we die + /// because of a signal) Cargo needs to be sure to try to check out this + /// repo again on the next go-round. + /// + /// To enable this we have a dummy file in our checkout, [`.cargo-ok`], + /// which if present means that the repo has been successfully reset and is + /// ready to go. Hence if we start to do a reset, we make sure this file + /// *doesn't* exist, and then once we're done we create the file. + /// + /// [`.cargo-ok`]: CHECKOUT_READY_LOCK + fn reset(&self) -> Result<()> { + let ok_file = self.path.join(CHECKOUT_READY_LOCK); + let _ = paths::remove_file(&ok_file); + info!("reset {} to {}", self.repo.path().display(), self.revision); + + // Ensure libgit2 won't mess with newlines when we vendor. + if let Ok(mut git_config) = self.repo.config() { + git_config.set_bool("core.autocrlf", false)?; + } + + let object = self.repo.find_object(self.revision, None)?; + reset(&self.repo, &object)?; + paths::create(ok_file)?; + Ok(()) + } + + /// Like `git submodule update --recursive` but for this git checkout. + /// + /// This function respects `submodule..update = none`[^1] git config. + /// Submodules set to `none` won't be fetched. + /// + /// [^1]: + fn update_submodules(&self, strategy: FetchStrategy, client: &Client) -> Result<()> { + /// Like `Cow`, but without a requirement on `Clone`. + enum Repo<'a> { + Borrowed(&'a git2::Repository), + Owned(git2::Repository), + } + + impl std::ops::Deref for Repo<'_> { + type Target = git2::Repository; + + fn deref(&self) -> &Self::Target { + match self { + Repo::Borrowed(repo) => repo, + Repo::Owned(repo) => repo, + } + } + } + + debug!( + "Update submodules for: {}", + self.repo.workdir().unwrap().display() + ); + + // Initialize a stack with the root repository. 
+ let mut stack = vec![( + Repo::Borrowed(&self.repo), + Cow::Borrowed(self.remote_url().as_str()), + )]; + + while let Some((repo, parent_remote_url)) = stack.pop() { + for mut child in repo.submodules()? { + child.init(false)?; + + let child_url_str = child.url().ok_or_else(|| { + anyhow::format_err!("non-utf8 url for submodule {:?}?", child.path()) + })?; + + // Skip the submodule if the config says not to update it. + if child.update_strategy() == git2::SubmoduleUpdate::None { + debug!( + "Skipping git submodule `{}` due to update strategy in .gitmodules", + child_url_str + ); + continue; + } + + let child_remote_url = + absolute_submodule_url(&parent_remote_url, child_url_str)?.to_string(); + + // A submodule which is listed in .gitmodules but not actually + // checked out will not have a head id, so we should ignore it. + let Some(head) = child.head_id() else { + continue; + }; + + // If the submodule hasn't been checked out yet, we need to + // clone it. If it has been checked out and the head is the same + // as the submodule's head, then we can skip an update and keep + // recursing. + let head_and_repo = child.open().and_then(|repo| { + let target = repo.head()?.target(); + Ok((target, repo)) + }); + let mut repo = if let Ok((head, repo)) = head_and_repo { + if child.head_id() == head { + stack.push((Repo::Owned(repo), Cow::Owned(child_remote_url))); + continue; + } + repo + } else { + let path = repo.workdir().unwrap().join(child.path()); + let _ = paths::remove_dir_all(&path); + init(&path, false)? 
+ }; + + // Fetch data from origin and reset to the head commit + debug!("Updating Git submodule: {}", child_remote_url); + let reference = GitReference::Rev(head.to_string()); + fetch(&mut repo, &child_remote_url, &reference, strategy, client).with_context( + || { + format!( + "failed to fetch submodule `{}` from {}", + child.name().unwrap_or(""), + child_remote_url + ) + }, + )?; + + let obj = repo.find_object(head, None)?; + reset(&repo, &obj)?; + drop(obj); + + // Push the current submodule onto the stack. + stack.push((Repo::Owned(repo), Cow::Owned(child_remote_url))); + } + } + + Ok(()) + } +} + +/// Constructs an absolute URL for a child submodule URL with its parent base URL. +/// +/// Git only assumes a submodule URL is a relative path if it starts with `./` +/// or `../` [^1]. To fetch the correct repo, we need to construct an absolute +/// submodule URL. +/// +/// At this moment it comes with some limitations: +/// +/// * GitHub doesn't accept non-normalized URLs with relative paths. +/// (`ssh://git@github.com/rust-lang/cargo.git/relative/..` is invalid) +/// * `url` crate cannot parse SCP-like URLs. +/// (`git@github.com:rust-lang/cargo.git` is not a valid WHATWG URL) +/// +/// To overcome these, this patch always tries [`Url::parse`] first to normalize +/// the path. If it couldn't, append the relative path as the last resort and +/// pray the remote git service supports non-normalized URLs. +/// +/// See also rust-lang/cargo#12404 and rust-lang/cargo#12295. 
+/// +/// [^1]: +fn absolute_submodule_url<'s>(base_url: &str, submodule_url: &'s str) -> Result> { + let absolute_url = if ["./", "../"].iter().any(|p| submodule_url.starts_with(p)) { + if let Ok(mut base_url) = Url::parse(base_url) { + let path = base_url.path(); + if !path.ends_with('/') { + base_url.set_path(&format!("{path}/")); + } + let absolute_url = base_url.join(submodule_url).with_context(|| { + format!( + "Failed to parse relative child submodule URL `{submodule_url}` using parent base URL `{base_url}`" + ) + })?; + Cow::from(absolute_url.to_string()) + } else { + let mut absolute_url = base_url.to_string(); + if !absolute_url.ends_with('/') { + absolute_url.push('/'); + } + absolute_url.push_str(submodule_url); + Cow::from(absolute_url) + } + } else { + Cow::from(submodule_url) + }; + + Ok(absolute_url) +} + +/// Prepare the authentication callbacks for cloning a git repository. +/// +/// The main purpose of this function is to construct the "authentication +/// callback" which is used to clone a repository. This callback will attempt to +/// find the right authentication on the system (without user input) and will +/// guide libgit2 in doing so. +/// +/// The callback is provided `allowed` types of credentials, and we try to do as +/// much as possible based on that: +/// +/// * Prioritize SSH keys from the local ssh agent as they're likely the most +/// reliable. The username here is prioritized from the credential +/// callback, then from whatever is configured in git itself, and finally +/// we fall back to the generic user of `git`. +/// +/// * If a username/password is allowed, then we fallback to git2-rs's +/// implementation of the credential helper. This is what is configured +/// with `credential.helper` in git, and is the interface for the macOS +/// keychain, for example. +/// +/// * After the above two have failed, we just kinda grapple attempting to +/// return *something*. 
+/// +/// If any form of authentication fails, libgit2 will repeatedly ask us for +/// credentials until we give it a reason to not do so. To ensure we don't +/// just sit here looping forever we keep track of authentications we've +/// attempted and we don't try the same ones again. +fn with_authentication(url: &str, cfg: &git2::Config, mut f: F) -> Result +where + F: FnMut(&mut git2::Credentials<'_>) -> Result, +{ + let mut cred_helper = git2::CredentialHelper::new(url); + cred_helper.config(cfg); + + let mut ssh_username_requested = false; + let mut cred_helper_bad = None; + let mut ssh_agent_attempts = Vec::new(); + let mut any_attempts = false; + let mut tried_sshkey = false; + let mut url_attempt = None; + + let orig_url = url; + let mut res = f(&mut |url, username, allowed| { + any_attempts = true; + if url != orig_url { + url_attempt = Some(url.to_string()); + } + // libgit2's "USERNAME" authentication actually means that it's just + // asking us for a username to keep going. This is currently only really + // used for SSH authentication and isn't really an authentication type. + // The logic currently looks like: + // + // let user = ...; + // if (user.is_null()) + // user = callback(USERNAME, null, ...); + // + // callback(SSH_KEY, user, ...) + // + // So if we're being called here then we know that (a) we're using ssh + // authentication and (b) no username was specified in the URL that + // we're trying to clone. We need to guess an appropriate username here, + // but that may involve a few attempts. Unfortunately we can't switch + // usernames during one authentication session with libgit2, so to + // handle this we bail out of this authentication session after setting + // the flag `ssh_username_requested`, and then we handle this below. 
+ if allowed.contains(git2::CredentialType::USERNAME) { + debug_assert!(username.is_none()); + ssh_username_requested = true; + return Err(git2::Error::from_str("gonna try usernames later")); + } + + // An "SSH_KEY" authentication indicates that we need some sort of SSH + // authentication. This can currently either come from the ssh-agent + // process or from a raw in-memory SSH key. Cargo only supports using + // ssh-agent currently. + // + // If we get called with this then the only way that should be possible + // is if a username is specified in the URL itself (e.g., `username` is + // Some), hence the unwrap() here. We try custom usernames down below. + if allowed.contains(git2::CredentialType::SSH_KEY) && !tried_sshkey { + // If ssh-agent authentication fails, libgit2 will keep + // calling this callback asking for other authentication + // methods to try. Make sure we only try ssh-agent once, + // to avoid looping forever. + tried_sshkey = true; + let username = username.unwrap(); + debug_assert!(!ssh_username_requested); + ssh_agent_attempts.push(username.to_string()); + return git2::Cred::ssh_key_from_agent(username); + } + + // Sometimes libgit2 will ask for a username/password in plaintext. This + // is where Cargo would have an interactive prompt if we supported it, + // but we currently don't! Right now the only way we support fetching a + // plaintext password is through the `credential.helper` support, so + // fetch that here. + // + // If ssh-agent authentication fails, libgit2 will keep calling this + // callback asking for other authentication methods to try. Check + // cred_helper_bad to make sure we only try the git credential helper + // once, to avoid looping forever. + if allowed.contains(git2::CredentialType::USER_PASS_PLAINTEXT) && cred_helper_bad.is_none() + { + let r = git2::Cred::credential_helper(cfg, url, username); + cred_helper_bad = Some(r.is_err()); + return r; + } + + // I'm... 
not sure what the DEFAULT kind of authentication is, but seems + // easy to support? + if allowed.contains(git2::CredentialType::DEFAULT) { + return git2::Cred::default(); + } + + // Whelp, we tried our best + Err(git2::Error::from_str("no authentication methods succeeded")) + }); + + // Ok, so if it looks like we're going to be doing ssh authentication, we + // want to try a few different usernames as one wasn't specified in the URL + // for us to use. In order, we'll try: + // + // * A credential helper's username for this URL, if available. + // * This account's username. + // * "git" + // + // We have to restart the authentication session each time (due to + // constraints in libssh2 I guess? maybe this is inherent to ssh?), so we + // call our callback, `f`, in a loop here. + if ssh_username_requested { + debug_assert!(res.is_err()); + let mut attempts = vec![String::from("git")]; + if let Ok(s) = env::var("USER").or_else(|_| env::var("USERNAME")) { + attempts.push(s); + } + if let Some(ref s) = cred_helper.username { + attempts.push(s.clone()); + } + + while let Some(s) = attempts.pop() { + // We should get `USERNAME` first, where we just return our attempt, + // and then after that we should get `SSH_KEY`. If the first attempt + // fails we'll get called again, but we don't have another option so + // we bail out. + let mut attempts = 0; + res = f(&mut |_url, username, allowed| { + if allowed.contains(git2::CredentialType::USERNAME) { + return git2::Cred::username(&s); + } + if allowed.contains(git2::CredentialType::SSH_KEY) { + debug_assert_eq!(Some(&s[..]), username); + attempts += 1; + if attempts == 1 { + ssh_agent_attempts.push(s.to_string()); + return git2::Cred::ssh_key_from_agent(&s); + } + } + Err(git2::Error::from_str("no authentication methods succeeded")) + }); + + // If we made two attempts then that means: + // + // 1. A username was requested, we returned `s`. + // 2. An ssh key was requested, we returned to look up `s` in the + // ssh agent. 
+ // 3. For whatever reason that lookup failed, so we were asked again + // for another mode of authentication. + // + // Essentially, if `attempts == 2` then in theory the only error was + // that this username failed to authenticate (e.g., no other network + // errors happened). Otherwise something else is funny so we bail + // out. + if attempts != 2 { + break; + } + } + } + let mut err = match res { + Ok(e) => return Ok(e), + Err(e) => e, + }; + + // In the case of an authentication failure (where we tried something) then + // we try to give a more helpful error message about precisely what we + // tried. + if any_attempts { + let mut msg = "failed to authenticate when downloading repository".to_string(); + + if let Some(attempt) = &url_attempt { + if url != attempt { + msg.push_str(": "); + msg.push_str(attempt); + } + } + msg.push('\n'); + if !ssh_agent_attempts.is_empty() { + let names = ssh_agent_attempts + .iter() + .map(|agent| format!("`{agent}`")) + .collect::>() + .join(", "); + msg.push_str(&format!( + "\n* attempted ssh-agent authentication, but \ + no usernames succeeded: {names}" + )); + } + if let Some(failed_cred_helper) = cred_helper_bad { + if failed_cred_helper { + msg.push_str( + "\n* attempted to find username/password via \ + git's `credential.helper` support, but failed", + ); + } else { + msg.push_str( + "\n* attempted to find username/password via \ + `credential.helper`, but maybe the found \ + credentials were incorrect", + ); + } + } + msg.push_str("\n\n"); + msg.push_str("if the git CLI succeeds then `net.git-fetch-with-cli` may help here\n"); + msg.push_str("https://doc.rust-lang.org/cargo/reference/config.html#netgit-fetch-with-cli"); + err = err.context(msg); + + // Otherwise if we didn't even get to the authentication phase them we may + // have failed to set up a connection, in these cases hint on the + // `net.git-fetch-with-cli` configuration option. 
+ } else if let Some(e) = err.downcast_ref::() { + match e.class() { + ErrorClass::Net + | ErrorClass::Ssl + | ErrorClass::Submodule + | ErrorClass::FetchHead + | ErrorClass::Ssh + | ErrorClass::Http => { + let mut msg = "network failure seems to have happened\n".to_string(); + msg.push_str( + "if a proxy or similar is necessary `net.git-fetch-with-cli` may help here\n", + ); + msg.push_str( + "https://doc.rust-lang.org/cargo/reference/config.html#netgit-fetch-with-cli", + ); + err = err.context(msg); + } + ErrorClass::Callback => { + // This unwraps the git2 error. We're using the callback error + // specifically to convey errors from Rust land through the C + // callback interface. We don't need the `; class=Callback + // (26)` that gets tacked on to the git2 error message. + err = anyhow::format_err!("{}", e.message()); + } + _ => {} + } + } + + Err(err) +} + +/// `git reset --hard` to the given `obj` for the `repo`. +/// +/// The `obj` is a commit-ish to which the head should be moved. +fn reset(repo: &git2::Repository, obj: &git2::Object<'_>) -> Result<()> { + // let mut pb = Progress::new("Checkout", config); + let mut opts = git2::build::CheckoutBuilder::new(); + // opts.progress(|_, cur, max| { + // drop(pb.tick(cur, max, "")); + // }); + debug!("doing reset"); + repo.reset(obj, git2::ResetType::Hard, Some(&mut opts))?; + debug!("reset done"); + Ok(()) +} + +/// Prepares the callbacks for fetching a git repository. +/// +/// The main purpose of this function is to construct everything before a fetch. +/// This will attempt to setup a progress bar, the authentication for git, +/// ssh known hosts check, and the network retry mechanism. +/// +/// The callback is provided a fetch options, which can be used by the actual +/// git fetch. 
+pub(crate) fn with_fetch_options(
+    git_config: &git2::Config,
+    url: &str,
+    cb: &mut dyn FnMut(git2::FetchOptions<'_>) -> Result<()>,
+) -> Result<()> {
+    retry::with_retry(|| {
+        with_authentication(url, git_config, |f| {
+            // TODO(charlie): Restore progress reporting.
+            let mut rcb = git2::RemoteCallbacks::new();
+            rcb.credentials(f);
+
+            // Create a local anonymous remote in the repository to fetch the url.
+            let mut opts = git2::FetchOptions::new();
+            opts.remote_callbacks(rcb);
+            cb(opts)
+        })?;
+        Ok(())
+    })
+}
+
+/// Attempts to fetch the given git `reference` for a Git repository.
+///
+/// This is the main entry for git clone/fetch. It does the following:
+///
+/// * Turns [`GitReference`] into refspecs accordingly.
+/// * Dispatches `git fetch` using libgit2 or git CLI.
+///
+/// The `remote_url` argument is the git remote URL where we want to fetch from.
+pub(crate) fn fetch(
+    repo: &mut git2::Repository,
+    remote_url: &str,
+    reference: &GitReference,
+    strategy: FetchStrategy,
+    client: &Client,
+) -> Result<()> {
+    let oid_to_fetch = match github_fast_path(repo, remote_url, reference, client) {
+        Ok(FastPathRev::UpToDate) => return Ok(()),
+        Ok(FastPathRev::NeedsFetch(rev)) => Some(rev),
+        Ok(FastPathRev::Indeterminate) => None,
+        Err(e) => {
+            debug!("failed to check github {:?}", e);
+            None
+        }
+    };
+
+    maybe_gc_repo(repo)?;
+
+    clean_repo_temp_files(repo);
+
+    // Translate the reference desired here into an actual list of refspecs
+    // which need to get fetched. Additionally record if we're fetching tags.
+    let mut refspecs = Vec::new();
+    let mut tags = false;
+    // The `+` symbol on the refspec means to allow a forced (fast-forward)
+    // update which is needed if there is ever a force push that requires a
+    // fast-forward.
+    match reference {
+        // For branches and tags we can fetch simply one reference and copy it
+        // locally, no need to fetch other branches/tags.
+ GitReference::Branch(branch) => { + refspecs.push(format!("+refs/heads/{branch}:refs/remotes/origin/{branch}")); + } + + GitReference::Tag(tag) => { + refspecs.push(format!("+refs/tags/{tag}:refs/remotes/origin/tags/{tag}")); + } + + GitReference::DefaultBranch => { + refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD")); + } + + GitReference::Rev(rev) => { + if rev.starts_with("refs/") { + refspecs.push(format!("+{rev}:{rev}")); + } else if let Some(oid_to_fetch) = oid_to_fetch { + refspecs.push(format!("+{oid_to_fetch}:refs/commit/{oid_to_fetch}")); + } else if rev.parse::().is_ok() { + // There is a specific commit to fetch and we will do so in shallow-mode only + // to not disturb the previous logic. + // Note that with typical settings for shallowing, we will just fetch a single `rev` + // as single commit. + // The reason we write to `refs/remotes/origin/HEAD` is that it's of special significance + // when during `GitReference::resolve()`, but otherwise it shouldn't matter. + refspecs.push(format!("+{rev}:refs/remotes/origin/HEAD")); + } else { + // We don't know what the rev will point to. To handle this + // situation we fetch all branches and tags, and then we pray + // it's somewhere in there. + refspecs.push(String::from("+refs/heads/*:refs/remotes/origin/*")); + refspecs.push(String::from("+HEAD:refs/remotes/origin/HEAD")); + tags = true; + } + } + } + + debug!("Performing a Git fetch for: {remote_url}"); + match strategy { + FetchStrategy::Cli => fetch_with_cli(repo, remote_url, &refspecs, tags), + FetchStrategy::Libgit2 => { + let git_config = git2::Config::open_default()?; + with_fetch_options(&git_config, remote_url, &mut |mut opts| { + if tags { + opts.download_tags(git2::AutotagOption::All); + } + // The `fetch` operation here may fail spuriously due to a corrupt + // repository. It could also fail, however, for a whole slew of other + // reasons (aka network related reasons). 
We want Cargo to automatically + // recover from corrupt repositories, but we don't want Cargo to stomp + // over other legitimate errors. + // + // Consequently we save off the error of the `fetch` operation and if it + // looks like a "corrupt repo" error then we blow away the repo and try + // again. If it looks like any other kind of error, or if we've already + // blown away the repository, then we want to return the error as-is. + let mut repo_reinitialized = false; + loop { + debug!("initiating fetch of {refspecs:?} from {remote_url}"); + let res = + repo.remote_anonymous(remote_url)? + .fetch(&refspecs, Some(&mut opts), None); + let err = match res { + Ok(()) => break, + Err(e) => e, + }; + debug!("fetch failed: {}", err); + + if !repo_reinitialized + && matches!(err.class(), ErrorClass::Reference | ErrorClass::Odb) + { + repo_reinitialized = true; + debug!( + "looks like this is a corrupt repository, reinitializing \ + and trying again" + ); + if reinitialize(repo).is_ok() { + continue; + } + } + + return Err(err.into()); + } + Ok(()) + }) + } + } +} + +/// Attempts to use `git` CLI installed on the system to fetch a repository, +/// when the config value [`net.git-fetch-with-cli`][1] is set. +/// +/// Unfortunately `libgit2` is notably lacking in the realm of authentication +/// when compared to the `git` command line. As a result, allow an escape +/// hatch for users that would prefer to use `git`-the-CLI for fetching +/// repositories instead of `libgit2`-the-library. This should make more +/// flavors of authentication possible while also still giving us all the +/// speed and portability of using `libgit2`. 
+///
+/// [1]: https://doc.rust-lang.org/nightly/cargo/reference/config.html#netgit-fetch-with-cli
+fn fetch_with_cli(
+    repo: &mut git2::Repository,
+    url: &str,
+    refspecs: &[String],
+    tags: bool,
+) -> Result<()> {
+    let mut cmd = ProcessBuilder::new("git");
+    cmd.arg("fetch");
+    if tags {
+        cmd.arg("--tags");
+    }
+    cmd.arg("--force") // handle force pushes
+        .arg("--update-head-ok") // see discussion in #2078
+        .arg(url)
+        .args(refspecs)
+        // If cargo is run by git (for example, the `exec` command in `git
+        // rebase`), the GIT_DIR is set by git and will point to the wrong
+        // location (this takes precedence over the cwd). Make sure this is
+        // unset so git will look at cwd for the repo.
+        .env_remove("GIT_DIR")
+        // The rest of these may not be necessary, but I'm including them
+        // just to be extra paranoid and avoid any issues.
+        .env_remove("GIT_WORK_TREE")
+        .env_remove("GIT_INDEX_FILE")
+        .env_remove("GIT_OBJECT_DIRECTORY")
+        .env_remove("GIT_ALTERNATE_OBJECT_DIRECTORIES")
+        .cwd(repo.path());
+    cmd.exec()?;
+    Ok(())
+}
+
+/// Attempts to `git gc` a repository.
+///
+/// Cargo has a bunch of long-lived git repositories in its global cache and
+/// some, like the index, are updated very frequently. Right now each update
+/// creates a new "pack file" inside the git database, and over time this can
+/// cause bad performance and bad current behavior in libgit2.
+///
+/// One pathological use case today is where libgit2 opens hundreds of file
+/// descriptors, getting us dangerously close to blowing out the OS limits of
+/// how many fds we can have open. This is detailed in [#4403].
+///
+/// To try to combat this problem we attempt a `git gc` here. Note, though, that
+/// we may not even have `git` installed on the system! As a result we
+/// opportunistically try a `git gc` when the pack directory looks too big, and
+/// failing that we just blow away the repository and start over.
+/// +/// In theory this shouldn't be too expensive compared to the network request +/// we're about to issue. +/// +/// [#4403]: https://github.com/rust-lang/cargo/issues/4403 +fn maybe_gc_repo(repo: &mut git2::Repository) -> Result<()> { + // Here we arbitrarily declare that if you have more than 100 files in your + // `pack` folder that we need to do a gc. + let entries = if let Ok(e) = repo.path().join("objects/pack").read_dir() { + e.count() + } else { + debug!("skipping gc as pack dir appears gone"); + return Ok(()); + }; + let max = env::var("__CARGO_PACKFILE_LIMIT") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(100); + if entries < max { + debug!("skipping gc as there's only {} pack files", entries); + return Ok(()); + } + + // First up, try a literal `git gc` by shelling out to git. This is pretty + // likely to fail though as we may not have `git` installed. Note that + // libgit2 doesn't currently implement the gc operation, so there's no + // equivalent there. + match Command::new("git") + .arg("gc") + .current_dir(repo.path()) + .output() + { + Ok(out) => { + debug!( + "git-gc status: {}\n\nstdout ---\n{}\nstderr ---\n{}", + out.status, + String::from_utf8_lossy(&out.stdout), + String::from_utf8_lossy(&out.stderr) + ); + if out.status.success() { + let new = git2::Repository::open(repo.path())?; + *repo = new; + return Ok(()); + } + } + Err(e) => debug!("git-gc failed to spawn: {}", e), + } + + // Alright all else failed, let's start over. + reinitialize(repo) +} + +/// Removes temporary files left from previous activity. +/// +/// If libgit2 is interrupted while indexing pack files, it will leave behind +/// some temporary files that it doesn't clean up. These can be quite large in +/// size, so this tries to clean things up. +/// +/// This intentionally ignores errors. This is only an opportunistic cleaning, +/// and we don't really care if there are issues (there's unlikely anything +/// that can be done). 
+/// +/// The git CLI has similar behavior (its temp files look like +/// `objects/pack/tmp_pack_9kUSA8`). Those files are normally deleted via `git +/// prune` which is run by `git gc`. However, it doesn't know about libgit2's +/// filenames, so they never get cleaned up. +fn clean_repo_temp_files(repo: &git2::Repository) { + let path = repo.path().join("objects/pack/pack_git2_*"); + let Some(pattern) = path.to_str() else { + warn!("cannot convert {path:?} to a string"); + return; + }; + let Ok(paths) = glob::glob(pattern) else { + return; + }; + for path in paths.flatten() { + match paths::remove_file(&path) { + Ok(()) => debug!("removed stale temp git file {path:?}"), + Err(e) => { + warn!("failed to remove {path:?} while cleaning temp files: {e}"); + } + } + } +} + +/// Reinitializes a given Git repository. This is useful when a Git repository +/// seems corrupted and we want to start over. +fn reinitialize(repo: &mut git2::Repository) -> Result<()> { + // Here we want to drop the current repository object pointed to by `repo`, + // so we initialize temporary repository in a sub-folder, blow away the + // existing git folder, and then recreate the git repo. Finally we blow away + // the `tmp` folder we allocated. + let path = repo.path().to_path_buf(); + debug!("reinitializing git repo at {:?}", path); + let tmp = path.join("tmp"); + let bare = !repo.path().ends_with(".git"); + *repo = init(&tmp, false)?; + for entry in path.read_dir()? { + let entry = entry?; + if entry.file_name().to_str() == Some("tmp") { + continue; + } + let path = entry.path(); + drop(paths::remove_file(&path).or_else(|_| paths::remove_dir_all(&path))); + } + *repo = init(&path, bare)?; + paths::remove_dir_all(&tmp)?; + Ok(()) +} + +/// Initializes a Git repository at `path`. 
+fn init(path: &Path, bare: bool) -> Result {
+    let mut opts = git2::RepositoryInitOptions::new();
+    // Skip anything related to templates, they just cause all sorts of issues as
+    // we really don't want to use them yet they insist on being used. See #6240
+    // for an example issue that comes up.
+    opts.external_template(false);
+    opts.bare(bare);
+    Ok(git2::Repository::init_opts(path, &opts)?)
+}
+
+/// The result of GitHub fast path check. See [`github_fast_path`] for more.
+enum FastPathRev {
+    /// The local rev (determined by `reference.resolve(repo)`) is already up to
+    /// date with what this rev resolves to on GitHub's server.
+    UpToDate,
+    /// The following SHA must be fetched in order for the local rev to become
+    /// up to date.
+    NeedsFetch(git2::Oid),
+    /// Don't know whether local rev is up to date. We'll fetch _all_ branches
+    /// and tags from the server and see what happens.
+    Indeterminate,
+}
+
+/// Attempts GitHub's special fast path for testing if we've already got an
+/// up-to-date copy of the repository.
+///
+/// Updating the index is done pretty regularly so we want it to be as fast as
+/// possible. For registries hosted on GitHub (like the crates.io index) there's
+/// a fast path available to use[^1] to tell us that there's no updates to be
+/// made.
+///
+/// Note that this function should never cause an actual failure because it's
+/// just a fast path. As a result, a caller should ignore `Err` returned from
+/// this function and move forward on the normal path.
+/// +/// [^1]: +fn github_fast_path( + repo: &mut git2::Repository, + url: &str, + reference: &GitReference, + client: &Client, +) -> Result { + let url = Url::parse(url)?; + if !is_github(&url) { + return Ok(FastPathRev::Indeterminate); + } + + let local_object = reference.resolve(repo).ok(); + let github_branch_name = match reference { + GitReference::Branch(branch) => branch, + GitReference::Tag(tag) => tag, + GitReference::DefaultBranch => "HEAD", + GitReference::Rev(rev) => { + if rev.starts_with("refs/") { + rev + } else if looks_like_commit_hash(rev) { + // `revparse_single` (used by `resolve`) is the only way to turn + // short hash -> long hash, but it also parses other things, + // like branch and tag names, which might coincidentally be + // valid hex. + // + // We only return early if `rev` is a prefix of the object found + // by `revparse_single`. Don't bother talking to GitHub in that + // case, since commit hashes are permanent. If a commit with the + // requested hash is already present in the local clone, its + // contents must be the same as what is on the server for that + // hash. + // + // If `rev` is not found locally by `revparse_single`, we'll + // need GitHub to resolve it and get a hash. If `rev` is found + // but is not a short hash of the found object, it's probably a + // branch and we also need to get a hash from GitHub, in case + // the branch has moved. 
+ if let Some(local_object) = local_object { + if is_short_hash_of(rev, local_object) { + return Ok(FastPathRev::UpToDate); + } + } + rev + } else { + debug!("can't use github fast path with `rev = \"{}\"`", rev); + return Ok(FastPathRev::Indeterminate); + } + } + }; + + // This expects GitHub urls in the form `github.com/user/repo` and nothing + // else + let mut pieces = url + .path_segments() + .ok_or_else(|| anyhow!("no path segments on url"))?; + let username = pieces + .next() + .ok_or_else(|| anyhow!("couldn't find username"))?; + let repository = pieces + .next() + .ok_or_else(|| anyhow!("couldn't find repository name"))?; + if pieces.next().is_some() { + anyhow::bail!("too many segments on URL"); + } + + // Trim off the `.git` from the repository, if present, since that's + // optional for GitHub and won't work when we try to use the API as well. + let repository = repository.strip_suffix(".git").unwrap_or(repository); + + let url = format!( + "https://api.github.com/repos/{username}/{repository}/commits/{github_branch_name}" + ); + + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?; + + runtime.block_on(async move { + debug!("Attempting GitHub fast path for: {url}"); + let mut request = client.get(&url); + request = request.header("Accept", "application/vnd.github.3.sha"); + request = request.header("User-Agent", "puffin"); + if let Some(local_object) = local_object { + request = request.header("If-None-Match", local_object.to_string()); + } + + let response = request.send().await?; + response.error_for_status_ref()?; + let response_code = response.status(); + if response_code == StatusCode::NOT_MODIFIED { + Ok(FastPathRev::UpToDate) + } else if response_code == StatusCode::OK { + let oid_to_fetch = response.text().await?.parse::()?; + Ok(FastPathRev::NeedsFetch(oid_to_fetch)) + } else { + // Usually response_code == 404 if the repository does not exist, and + // response_code == 422 if exists but GitHub is unable to 
resolve the + // requested rev. + Ok(FastPathRev::Indeterminate) + } + }) +} + +/// Whether a `url` is one from GitHub. +fn is_github(url: &Url) -> bool { + url.host_str() == Some("github.com") +} + +/// Whether a `rev` looks like a commit hash (ASCII hex digits). +fn looks_like_commit_hash(rev: &str) -> bool { + rev.len() >= 7 && rev.chars().all(|ch| ch.is_ascii_hexdigit()) +} + +/// Whether `rev` is a shorter hash of `oid`. +fn is_short_hash_of(rev: &str, oid: git2::Oid) -> bool { + let long_hash = oid.to_string(); + match long_hash.get(..rev.len()) { + Some(truncated_long_hash) => truncated_long_hash.eq_ignore_ascii_case(rev), + None => false, + } +} + +#[cfg(test)] +mod tests { + use super::absolute_submodule_url; + + #[test] + fn test_absolute_submodule_url() { + let cases = [ + ( + "ssh://git@gitub.com/rust-lang/cargo", + "git@github.com:rust-lang/cargo.git", + "git@github.com:rust-lang/cargo.git", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "./", + "ssh://git@gitub.com/rust-lang/cargo/", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../", + "ssh://git@gitub.com/rust-lang/", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "./foo", + "ssh://git@gitub.com/rust-lang/cargo/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo/", + "./foo", + "ssh://git@gitub.com/rust-lang/cargo/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo/", + "../foo", + "ssh://git@gitub.com/rust-lang/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../foo", + "ssh://git@gitub.com/rust-lang/foo", + ), + ( + "ssh://git@gitub.com/rust-lang/cargo", + "../foo/bar/../baz", + "ssh://git@gitub.com/rust-lang/foo/baz", + ), + ( + "git@github.com:rust-lang/cargo.git", + "ssh://git@gitub.com/rust-lang/cargo", + "ssh://git@gitub.com/rust-lang/cargo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "./", + "git@github.com:rust-lang/cargo.git/./", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../", + "git@github.com:rust-lang/cargo.git/../", + ), + ( + 
"git@github.com:rust-lang/cargo.git", + "./foo", + "git@github.com:rust-lang/cargo.git/./foo", + ), + ( + "git@github.com:rust-lang/cargo.git/", + "./foo", + "git@github.com:rust-lang/cargo.git/./foo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../foo", + "git@github.com:rust-lang/cargo.git/../foo", + ), + ( + "git@github.com:rust-lang/cargo.git/", + "../foo", + "git@github.com:rust-lang/cargo.git/../foo", + ), + ( + "git@github.com:rust-lang/cargo.git", + "../foo/bar/../baz", + "git@github.com:rust-lang/cargo.git/../foo/bar/../baz", + ), + ]; + + for (base_url, submodule_url, expected) in cases { + let url = absolute_submodule_url(base_url, submodule_url).unwrap(); + assert_eq!( + expected, url, + "base `{base_url}`; submodule `{submodule_url}`" + ); + } + } +} diff --git a/crates/puffin-git/src/lib.rs b/crates/puffin-git/src/lib.rs new file mode 100644 index 000000000..a07592396 --- /dev/null +++ b/crates/puffin-git/src/lib.rs @@ -0,0 +1,73 @@ +use url::Url; + +pub use self::source::GitSource; + +mod git; +mod source; +mod util; + +/// A reference to a Git repository. +#[derive(Debug, Clone)] +pub struct Git { + /// The URL of the Git repository, with any query parameters and fragments removed. + url: Url, + /// The reference to the commit to use, which could be a branch, tag or revision. + reference: GitReference, + /// The precise commit to use, if known. + precise: Option, +} + +impl TryFrom for Git { + type Error = anyhow::Error; + + /// Initialize a [`Git`] source from a URL. + fn try_from(mut url: Url) -> Result { + let mut reference = GitReference::DefaultBranch; + for (k, v) in url.query_pairs() { + match &k[..] { + // Map older 'ref' to branch. 
+ "branch" | "ref" => reference = GitReference::Branch(v.into_owned()), + "rev" => reference = GitReference::Rev(v.into_owned()), + "tag" => reference = GitReference::Tag(v.into_owned()), + _ => {} + } + } + let precise = url.fragment().map(git2::Oid::from_str).transpose()?; + url.set_fragment(None); + url.set_query(None); + + Ok(Self { + url, + reference, + precise, + }) + } +} + +impl std::fmt::Display for Git { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.url) + } +} + +/// Information to find a specific commit in a Git repository. +#[derive(Debug, Clone)] +pub enum GitReference { + /// From a tag. + Tag(String), + /// From a branch. + Branch(String), + /// From a specific revision. Can be a commit hash (either short or full), + /// or a named reference like `refs/pull/493/head`. + Rev(String), + /// The default branch of the repository, the reference named `HEAD`. + DefaultBranch, +} + +#[derive(Debug, Clone, Copy)] +pub enum FetchStrategy { + /// Fetch Git repositories using libgit2. + Libgit2, + /// Fetch Git repositories using the `git` CLI. + Cli, +} diff --git a/crates/puffin-git/src/source.rs b/crates/puffin-git/src/source.rs new file mode 100644 index 000000000..4f8ae3328 --- /dev/null +++ b/crates/puffin-git/src/source.rs @@ -0,0 +1,91 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::path::PathBuf; + +use anyhow::Result; +use reqwest::Client; +use tracing::debug; + +use puffin_cache::{digest, CanonicalUrl}; + +use crate::git::GitRemote; +use crate::{FetchStrategy, Git, GitReference}; + +/// A remote Git source that can be checked out locally. +pub struct GitSource { + /// The git remote which we're going to fetch from. + remote: GitRemote, + /// The Git reference from the manifest file. + manifest_reference: GitReference, + /// The revision which a git source is locked to. 
+ /// This is expected to be set after the Git repository is fetched. + locked_rev: Option, + /// The identifier of this source for Cargo's Git cache directory. + /// See [`ident`] for more. + ident: String, + /// The HTTP client to use for fetching. + client: Client, + /// The fetch strategy to use when cloning. + strategy: FetchStrategy, + /// The path to the Git source database. + git: PathBuf, +} + +impl GitSource { + pub fn new(reference: Git, git: PathBuf) -> Self { + Self { + remote: GitRemote::new(&reference.url), + manifest_reference: reference.reference, + locked_rev: reference.precise, + ident: digest(&CanonicalUrl::new(&reference.url)), + client: Client::new(), + strategy: FetchStrategy::Libgit2, + git, + } + } + + pub fn fetch(self) -> Result { + // The path to the repo, within the Git database. + let db_path = self.git.join("db").join(&self.ident); + + let (db, actual_rev) = match (self.locked_rev, self.remote.db_at(&db_path).ok()) { + // If we have a locked revision, and we have a preexisting database + // which has that revision, then no update needs to happen. + (Some(rev), Some(db)) if db.contains(rev) => (db, rev), + + // ... otherwise we use this state to update the git database. Note + // that we still check for being offline here, for example in the + // situation that we have a locked revision but the database + // doesn't have it. + (locked_rev, db) => { + debug!("Updating Git source: `{:?}`", self.remote); + + self.remote.checkout( + &db_path, + db, + &self.manifest_reference, + locked_rev, + self.strategy, + &self.client, + )? + } + }; + + // Don’t use the full hash, in order to contribute less to reaching the + // path length limit on Windows. + let short_id = db.to_short_id(actual_rev)?; + + // Check out `actual_rev` from the database to a scoped location on the + // filesystem. This will use hard links and such to ideally make the + // checkout operation here pretty fast. 
+ let checkout_path = self + .git + .join("checkouts") + .join(&self.ident) + .join(short_id.as_str()); + db.copy_to(actual_rev, &checkout_path, self.strategy, &self.client)?; + + Ok(checkout_path) + } +} diff --git a/crates/puffin-git/src/util/errors.rs b/crates/puffin-git/src/util/errors.rs new file mode 100644 index 000000000..337461cff --- /dev/null +++ b/crates/puffin-git/src/util/errors.rs @@ -0,0 +1,45 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::fmt::{self, Write}; + +use super::truncate_with_ellipsis; + +#[derive(Debug)] +pub(crate) struct HttpNotSuccessful { + pub(crate) code: u32, + pub(crate) url: String, + pub(crate) ip: Option<String>, + pub(crate) body: Vec<u8>, +} + +impl HttpNotSuccessful { + fn render(&self) -> String { + let mut result = String::new(); + let body = std::str::from_utf8(&self.body).map_or_else( + |_| format!("[{} non-utf8 bytes]", self.body.len()), + |s| truncate_with_ellipsis(s, 512), + ); + + write!( + result, + "failed to get successful HTTP response from `{}`", + self.url + ) + .unwrap(); + if let Some(ip) = &self.ip { + write!(result, " ({ip})").unwrap(); + } + writeln!(result, ", got {}", self.code).unwrap(); + write!(result, "body:\n{body}").unwrap(); + result + } +} + +impl fmt::Display for HttpNotSuccessful { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.render()) + } +} + +impl std::error::Error for HttpNotSuccessful {} diff --git a/crates/puffin-git/src/util/mod.rs b/crates/puffin-git/src/util/mod.rs new file mode 100644 index 000000000..68795f593 --- /dev/null +++ b/crates/puffin-git/src/util/mod.rs @@ -0,0 +1,17 @@ +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! 
Source: +pub(crate) mod errors; +pub(crate) mod retry; + +pub(crate) fn truncate_with_ellipsis(s: &str, max_width: usize) -> String { + // We should truncate at grapheme-boundary and compute character-widths, + // yet the dependencies on unicode-segmentation and unicode-width are + // not worth it. + let mut chars = s.chars(); + let mut prefix = (&mut chars).take(max_width - 1).collect::<String>(); + if chars.next().is_some() { + prefix.push('…'); + } + prefix +} diff --git a/crates/puffin-git/src/util/retry.rs b/crates/puffin-git/src/util/retry.rs new file mode 100644 index 000000000..9135fcc71 --- /dev/null +++ b/crates/puffin-git/src/util/retry.rs @@ -0,0 +1,187 @@ +//! Utilities for retrying a network operation. +//! +//! Some network errors are considered "spurious", meaning it is not a real +//! error (such as a 404 not found) and is likely a transient error (like a +//! bad network connection) that we can hope will resolve itself shortly. The +//! [`Retry`] type offers a way to repeatedly perform some kind of network +//! operation with a delay if it detects one of these possibly transient +//! errors. +//! +//! This supports errors from [`git2`], [`reqwest`], and [`HttpNotSuccessful`] +//! 5xx HTTP errors. +//! +//! The number of retries can be configured by the user via the `net.retry` +//! config option. This indicates the number of times to retry the operation +//! (default 3 times for a total of 4 attempts). +//! +//! There are hard-coded constants that indicate how long to sleep between +//! retries. The constants are tuned to balance a few factors, such as the +//! responsiveness to the user (we don't want cargo to hang for too long +//! retrying things), and accommodating things like Cloudfront's default +//! negative TTL of 10 seconds (if Cloudfront gets a 5xx error for whatever +//! reason it won't try to fetch again for 10 seconds). +//! +//! The timeout also implements a primitive form of random jitter. This is so +//! 
that if multiple requests fail at the same time that they don't all flood +//! the server at the same time when they are retried. This jitter still has +//! some clumping behavior, but should be good enough. +//! +//! [`Retry`] is the core type for implementing retry logic. The +//! [`Retry::try`] method can be called with a callback, and it will +//! indicate if it needs to be called again sometime in the future if there +//! was a possibly transient error. The caller is responsible for sleeping the +//! appropriate amount of time and then calling [`Retry::try`] again. +//! +//! [`with_retry`] is a convenience function that will create a [`Retry`] and +//! handle repeatedly running a callback until it succeeds, or it runs out of +//! retries. +//! +//! Some interesting resources about retries: +//! - +//! - +//! - + +//! Git support is derived from Cargo's implementation. +//! Cargo is dual-licensed under either Apache 2.0 or MIT, at the user's choice. +//! Source: +use std::cmp::min; +use std::time::Duration; + +use anyhow::{Error, Result}; +use rand::Rng; +use tracing::warn; + +use crate::util::errors::HttpNotSuccessful; + +/// State for managing retrying a network operation. +pub(crate) struct Retry { + /// The number of failed attempts that have been done so far. + /// + /// Starts at 0, and increases by one each time an attempt fails. + retries: u64, + /// The maximum number of times the operation should be retried. + /// + /// 0 means it should never retry. + max_retries: u64, +} + +/// The result of attempting some operation via [`Retry::try`]. +pub(crate) enum RetryResult<T> { + /// The operation was successful. + /// + /// The wrapped value is the return value of the callback function. + Success(T), + /// The operation was an error, and it should not be tried again. + Err(Error), + /// The operation failed, and should be tried again in the future. + /// + /// The wrapped value is the number of milliseconds to wait before trying + /// again. 
The caller is responsible for waiting this long and then + /// calling [`Retry::try`] again. + Retry(u64), +} + +/// Maximum amount of time a single retry can be delayed (milliseconds). +const MAX_RETRY_SLEEP_MS: u64 = 10 * 1000; +/// The minimum initial amount of time a retry will be delayed (milliseconds). +/// +/// The actual amount of time will be a random value above this. +const INITIAL_RETRY_SLEEP_BASE_MS: u64 = 500; +/// The maximum amount of additional time the initial retry will take (milliseconds). +/// +/// The initial delay will be [`INITIAL_RETRY_SLEEP_BASE_MS`] plus a random range +/// from 0 to this value. +const INITIAL_RETRY_JITTER_MS: u64 = 1000; + +impl Retry { + pub(crate) fn new() -> Retry { + Retry { + retries: 0, + max_retries: 3, + } + } + + /// Calls the given callback, and returns a [`RetryResult`] which + /// indicates whether or not this needs to be called again at some point + /// in the future to retry the operation if it failed. + pub(crate) fn r#try<T>(&mut self, f: impl FnOnce() -> Result<T>) -> RetryResult<T> { + match f() { + Err(ref err) if maybe_spurious(err) && self.retries < self.max_retries => { + let err_msg = err.downcast_ref::<HttpNotSuccessful>().map_or_else( + || err.root_cause().to_string(), + HttpNotSuccessful::to_string, + ); + warn!( + "Spurious network error ({} tries remaining): {err_msg}", + self.max_retries - self.retries, + ); + self.retries += 1; + RetryResult::Retry(self.next_sleep_ms()) + } + Err(e) => RetryResult::Err(e), + Ok(r) => RetryResult::Success(r), + } + } + + /// Gets the next sleep duration in milliseconds. 
+ fn next_sleep_ms(&self) -> u64 { + if self.retries == 1 { + let mut rng = rand::thread_rng(); + INITIAL_RETRY_SLEEP_BASE_MS + rng.gen_range(0..INITIAL_RETRY_JITTER_MS) + } else { + min( + ((self.retries - 1) * 3) * 1000 + INITIAL_RETRY_SLEEP_BASE_MS, + MAX_RETRY_SLEEP_MS, + ) + } + } +} + +fn maybe_spurious(err: &Error) -> bool { + if let Some(git_err) = err.downcast_ref::<git2::Error>() { + match git_err.class() { + git2::ErrorClass::Net + | git2::ErrorClass::Os + | git2::ErrorClass::Zlib + | git2::ErrorClass::Http => return git_err.code() != git2::ErrorCode::Certificate, + _ => (), + } + } + if let Some(reqwest_err) = err.downcast_ref::<reqwest::Error>() { + if reqwest_err.is_timeout() + || reqwest_err.is_connect() + || reqwest_err + .status() + .map_or(false, |status| status.is_server_error()) + { + return true; + } + } + if let Some(not_200) = err.downcast_ref::<HttpNotSuccessful>() { + if 500 <= not_200.code && not_200.code < 600 { + return true; + } + } + + false +} + +/// Wrapper method for network call retry logic. +/// +/// Retry counts provided by Config object `net.retry`. Config shell outputs +/// a warning on per retry. +/// +/// Closure must return a `Result`. 
+pub(crate) fn with_retry<F, T>(mut callback: F) -> Result<T> +where + F: FnMut() -> Result<T>, +{ + let mut retry = Retry::new(); + loop { + match retry.r#try(&mut callback) { + RetryResult::Success(r) => return Ok(r), + RetryResult::Err(e) => return Err(e), + RetryResult::Retry(sleep) => std::thread::sleep(Duration::from_millis(sleep)), + } + } +} diff --git a/crates/puffin-resolver/Cargo.toml b/crates/puffin-resolver/Cargo.toml index ef1455686..c42ded9fe 100644 --- a/crates/puffin-resolver/Cargo.toml +++ b/crates/puffin-resolver/Cargo.toml @@ -21,6 +21,7 @@ puffin-distribution = { path = "../puffin-distribution" } puffin-normalize = { path = "../puffin-normalize" } puffin-package = { path = "../puffin-package" } puffin-traits = { path = "../puffin-traits" } +puffin-git = { path = "../puffin-git" } distribution-filename = { path = "../distribution-filename" } anyhow = { workspace = true } diff --git a/crates/puffin-resolver/src/resolver.rs b/crates/puffin-resolver/src/resolver.rs index 60e3fa93d..cbf688426 100644 --- a/crates/puffin-resolver/src/resolver.rs +++ b/crates/puffin-resolver/src/resolver.rs @@ -687,14 +687,20 @@ impl<'a, Context: BuildContext + Sync> Resolver<'a, Context> { let build_tree = SourceDistributionBuildTree::new(self.build_context); let distribution = RemoteDistributionRef::from_url(&package_name, &url); let metadata = match build_tree.find_dist_info(&distribution, self.tags) { - Ok(Some(metadata)) => metadata, - Ok(None) => build_tree - .download_and_build_sdist(&distribution, self.client) - .await - .map_err(|err| ResolveError::UrlDistribution { - url: url.clone(), - err, - })?, + Ok(Some(metadata)) => { + debug!("Found source distribution metadata in cache: {url}"); + metadata + } + Ok(None) => { + debug!("Downloading source distribution from: {url}"); + build_tree + .download_and_build_sdist(&distribution, self.client) + .await + .map_err(|err| ResolveError::UrlDistribution { + url: url.clone(), + err, + })? 
+ } Err(err) => { error!( "Failed to read source distribution {distribution} from cache: {err}", @@ -715,18 +721,22 @@ impl<'a, Context: BuildContext + Sync> Resolver<'a, Context> { let build_tree = SourceDistributionBuildTree::new(self.build_context); let distribution = RemoteDistributionRef::from_url(&package_name, &url); let metadata = match build_tree.find_dist_info(&distribution, self.tags) { - Ok(Some(metadata)) => metadata, - Ok(None) => build_tree - .download_wheel(&distribution, self.client) - .await - .map_err(|err| ResolveError::UrlDistribution { - url: url.clone(), - err, - })?, + Ok(Some(metadata)) => { + debug!("Found wheel metadata in cache: {url}"); + metadata + } + Ok(None) => { + debug!("Downloading wheel from: {url}"); + build_tree + .download_wheel(&distribution, self.client) + .await + .map_err(|err| ResolveError::UrlDistribution { + url: url.clone(), + err, + })? + } Err(err) => { - error!( - "Failed to read built distribution {distribution} from cache: {err}", - ); + error!("Failed to read wheel {distribution} from cache: {err}",); build_tree .download_wheel(&distribution, self.client) .await diff --git a/crates/puffin-resolver/src/source_distribution.rs b/crates/puffin-resolver/src/source_distribution.rs index adf295535..6d3de9b07 100644 --- a/crates/puffin-resolver/src/source_distribution.rs +++ b/crates/puffin-resolver/src/source_distribution.rs @@ -1,17 +1,20 @@ +use std::borrow::Cow; use std::path::PathBuf; use std::str::FromStr; -use anyhow::Result; +use anyhow::{Error, Result}; use fs_err::tokio as fs; use tempfile::tempdir; use tokio_util::compat::FuturesAsyncReadCompatExt; use tracing::debug; +use url::Url; use zip::ZipArchive; use distribution_filename::WheelFilename; use platform_tags::Tags; use puffin_client::RegistryClient; use puffin_distribution::RemoteDistributionRef; +use puffin_git::{Git, GitSource}; use puffin_package::pypi_types::Metadata21; use puffin_traits::BuildContext; @@ -19,6 +22,8 @@ const BUILT_WHEELS_CACHE: &str 
= "built-wheels-v0"; const REMOTE_WHEELS_CACHE: &str = "remote-wheels-v0"; +const GIT_CACHE: &str = "git-v0"; + /// Stores wheels built from source distributions. We need to keep those separate from the regular /// wheel cache since a wheel with the same name may be uploaded after we made our build and in that /// case the hashes would clash. @@ -49,16 +54,36 @@ impl<'a, T: BuildContext> SourceDistributionBuildTree<'a, T> { client: &RegistryClient, ) -> Result<Metadata21> { debug!("Building: {distribution}"); - let url = distribution.url()?; - let reader = client.stream_external(&url).await?; - let mut reader = tokio::io::BufReader::new(reader.compat()); + let temp_dir = tempdir()?; - // Download the source distribution. - let sdist_filename = distribution.filename()?; - let sdist_file = temp_dir.path().join(sdist_filename.as_ref()); - let mut writer = tokio::fs::File::create(&sdist_file).await?; - tokio::io::copy(&mut reader, &mut writer).await?; + let source = DistributionSource::try_from(distribution)?; + let sdist_file = match source { + DistributionSource::Url(url) => { + debug!("Fetching source distribution from: {url}"); + + let reader = client.stream_external(&url).await?; + let mut reader = tokio::io::BufReader::new(reader.compat()); + + // Download the source distribution. + let sdist_filename = distribution.filename()?; + let sdist_file = temp_dir.path().join(sdist_filename.as_ref()); + let mut writer = tokio::fs::File::create(&sdist_file).await?; + tokio::io::copy(&mut reader, &mut writer).await?; + + sdist_file + } + DistributionSource::Git(git) => { + debug!("Fetching source distribution from: {git}"); + + let git_dir = self.0.cache().map_or_else( + || temp_dir.path().join(GIT_CACHE), + |cache| cache.join(GIT_CACHE), + ); + let source = GitSource::new(git, git_dir); + tokio::task::spawn_blocking(move || source.fetch()).await?? + } + }; // Create a directory for the wheel. 
let wheel_dir = self.0.cache().map_or_else( @@ -166,3 +191,38 @@ fn read_dist_info(wheel: &CachedWheel) -> Result<Metadata21> { )?; Ok(Metadata21::parse(dist_info.as_bytes())?) } + +/// The host source for a distribution. +#[derive(Debug)] +enum DistributionSource<'a> { + /// The distribution is available at a remote URL. This could be a dedicated URL, or a URL + /// served by a registry, like PyPI. + Url(Cow<'a, Url>), + /// The distribution is available in a remote Git repository. + Git(Git), +} + +impl<'a> TryFrom<&'a RemoteDistributionRef<'_>> for DistributionSource<'a> { + type Error = Error; + + fn try_from(value: &'a RemoteDistributionRef<'_>) -> Result<Self, Self::Error> { + match value { + // If a distribution is hosted on a registry, it must be available at a URL. + RemoteDistributionRef::Registry(_, _, file) => { + let url = Url::parse(&file.url)?; + Ok(Self::Url(Cow::Owned(url))) + } + // If a distribution is specified via a direct URL, it could be a URL to a hosted file, + // or a URL to a Git repository. + RemoteDistributionRef::Url(_, url) => { + if let Some(url) = url.as_str().strip_prefix("git+") { + let url = Url::parse(url)?; + let git = Git::try_from(url)?; + Ok(Self::Git(git)) + } else { + Ok(Self::Url(Cow::Borrowed(url))) + } + } + } + } +}