Keep track of retries in ManagedPythonDownload::fetch_with_retry

If/when we see https://github.com/astral-sh/uv/issues/14171 again, this
should clarify whether our retry logic was skipped (i.e. a transient
error wasn't correctly identified as transient), or whether we exhausted
our retries. Previously, if you ran a local example fileserver as in
https://github.com/astral-sh/uv/issues/14171#issuecomment-3014580701 and
then tried to install Python from it, you'd get:

```
$ export UV_TEST_NO_CLI_PROGRESS=1
$ uv python install 3.8.20 --mirror http://localhost:8000 2>&1 | cat
error: Failed to install cpython-3.8.20-linux-x86_64-gnu
  Caused by: Failed to extract archive: cpython-3.8.20-20241002-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz
  Caused by: failed to unpack `/home/jacko/.local/share/uv/python/.temp/.tmpS4sHHZ/python/lib/libpython3.8.so.1.0`
  Caused by: failed to unpack `python/lib/libpython3.8.so.1.0` into `/home/jacko/.local/share/uv/python/.temp/.tmpS4sHHZ/python/lib/libpython3.8.so.1.0`
  Caused by: error decoding response body
  Caused by: request or response body error
  Caused by: error reading a body from connection
  Caused by: Connection reset by peer (os error 104)
```

With this change you get:

```
error: Failed to install cpython-3.8.20-linux-x86_64-gnu
  Caused by: Request failed after 3 retries
  Caused by: Failed to extract archive: cpython-3.8.20-20241002-x86_64-unknown-linux-gnu-install_only_stripped.tar.gz
  Caused by: failed to unpack `/home/jacko/.local/share/uv/python/.temp/.tmp4Ia24w/python/lib/libpython3.8.so.1.0`
  Caused by: failed to unpack `python/lib/libpython3.8.so.1.0` into `/home/jacko/.local/share/uv/python/.temp/.tmp4Ia24w/python/lib/libpython3.8.so.1.0`
  Caused by: error decoding response body
  Caused by: request or response body error
  Caused by: error reading a body from connection
  Caused by: Connection reset by peer (os error 104)
```

At the same time, I'm updating the way we handle the retry count so that
nested retry loops can't exceed the intended number of attempts, as I
mentioned at
https://github.com/astral-sh/uv/issues/14069#issuecomment-3020634281.
It's not clear to me whether we actually want this part of the change,
so I'd appreciate feedback here.
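
For concreteness, here is a standalone sketch of the accounting problem (toy
code, not from the uv codebase; all names are made up). An outer loop that
counts one retry per error it sees can multiply with an inner retry layer,
while summing the attempts the inner layer reports keeps the total within the
intended budget:

```rust
// Toy model of nested retries. `inner_call` stands in for a layer that retries
// internally (like the reqwest-retry middleware) and reports how many retries
// it made before giving up.
fn inner_call(inner_retries: u32, requests_made: &mut u32) -> Result<(), u32> {
    *requests_made += 1 + inner_retries; // every retry is another real request
    Err(inner_retries) // always fail for the purposes of this illustration
}

fn main() {
    let outer_retry_budget = 3; // "3 retries" is meant to allow 4 attempts total
    let inner_retries = 3;

    // Old accounting: the outer loop counts one retry per error it sees,
    // ignoring how many attempts the inner layer already made.
    let (mut requests, mut n_past_retries) = (0u32, 0u32);
    while n_past_retries <= outer_retry_budget {
        let _ = inner_call(inner_retries, &mut requests);
        n_past_retries += 1;
    }
    println!("old accounting: {requests} requests"); // 16 = (3 + 1) * (3 + 1)

    // New accounting: charge the budget for every attempt the inner layer
    // reports, so nested loops can't multiply the total.
    let (mut requests, mut total_attempts) = (0u32, 0u32);
    while total_attempts <= outer_retry_budget {
        if let Err(retries) = inner_call(inner_retries, &mut requests) {
            total_attempts += retries + 1; // 3 retries = 4 attempts
        }
    }
    println!("new accounting: {requests} requests"); // 4: budget spent in one pass
}
```

In the real code below, the inner layer is the `reqwest-retry` middleware and
the reported count comes from the new `Error::attempts()` helper.
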
Jack O'Connor 2025-06-30 13:37:51 -07:00
parent c078683217
commit 85358fe9c6


```diff
@@ -12,7 +12,7 @@ use futures::TryStreamExt;
 use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use owo_colors::OwoColorize;
-use reqwest_retry::RetryPolicy;
+use reqwest_retry::{RetryError, RetryPolicy};
 use serde::Deserialize;
 use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWriteExt, BufWriter, ReadBuf};
```

```diff
@@ -111,6 +111,33 @@
     },
 }
 
+impl Error {
+    // Return the number of attempts that were made to complete this request before this error was
+    // returned. Note that e.g. 3 retries equates to 4 attempts.
+    //
+    // It's easier to do arithmetic with "attempts" instead of "retries", because if you have
+    // nested retry loops you can just add up all the attempts directly, while adding up the
+    // retries requires +1/-1 adjustments.
+    fn attempts(&self) -> u32 {
+        // Unfortunately different variants of `Error` track retry counts in different ways. We
+        // could consider unifying the variants we handle here in `Error::from_reqwest_middleware`
+        // instead, but both approaches will be fragile as new variants get added over time.
+        if let Error::NetworkErrorWithRetries { retries, .. } = self {
+            return retries + 1;
+        }
+        // TODO(jack): let-chains are stable as of Rust 1.88. We should use them here as soon as
+        // our rust-version is high enough.
+        if let Error::NetworkMiddlewareError(_, anyhow_error) = self {
+            if let Some(RetryError::WithRetries { retries, .. }) =
+                anyhow_error.downcast_ref::<RetryError>()
+            {
+                return retries + 1;
+            }
+        }
+        1
+    }
+}
+
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 pub struct ManagedPythonDownload {
     key: PythonInstallationKey,
```

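The TODO above refers to let-chains. Once the crate's minimum Rust version
reaches 1.88 (and the 2024 edition), the nested `if let` could collapse to
something like the following fragment (a sketch only, not compiled against the
project's toolchain):

```rust
// Equivalent to the nested `if let` above, written as a let-chain (Rust 1.88+,
// edition 2024).
if let Error::NetworkMiddlewareError(_, anyhow_error) = self
    && let Some(RetryError::WithRetries { retries, .. }) =
        anyhow_error.downcast_ref::<RetryError>()
{
    return retries + 1;
}
```
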
```diff
@@ -695,7 +722,8 @@
         pypy_install_mirror: Option<&str>,
         reporter: Option<&dyn Reporter>,
     ) -> Result<DownloadResult, Error> {
-        let mut n_past_retries = 0;
+        let mut total_attempts = 0;
+        let mut retried_here = false;
         let start_time = SystemTime::now();
         let retry_policy = client.retry_policy();
         loop {
```

```diff
@@ -710,25 +738,41 @@
                     reporter,
                 )
                 .await;
-            if result
-                .as_ref()
-                .err()
-                .is_some_and(|err| is_extended_transient_error(err))
-            {
-                let retry_decision = retry_policy.should_retry(start_time, n_past_retries);
-                if let reqwest_retry::RetryDecision::Retry { execute_after } = retry_decision {
-                    debug!(
-                        "Transient failure while handling response for {}; retrying...",
-                        self.key()
-                    );
-                    let duration = execute_after
-                        .duration_since(SystemTime::now())
-                        .unwrap_or_else(|_| Duration::default());
-                    tokio::time::sleep(duration).await;
-                    n_past_retries += 1;
-                    continue;
-                }
-            }
+            let result = match result {
+                Ok(download_result) => Ok(download_result),
+                Err(err) => {
+                    // Inner retry loops (e.g. `reqwest-retry` middleware) might make more than one
+                    // attempt per error we see here.
+                    total_attempts += err.attempts();
+                    // We currently interpret e.g. "3 retries" to mean we should make 4 attempts.
+                    let n_past_retries = total_attempts - 1;
+                    if is_extended_transient_error(&err) {
+                        let retry_decision = retry_policy.should_retry(start_time, n_past_retries);
+                        if let reqwest_retry::RetryDecision::Retry { execute_after } =
+                            retry_decision
+                        {
+                            debug!(
+                                "Transient failure while handling response for {}; retrying...",
+                                self.key()
+                            );
+                            let duration = execute_after
+                                .duration_since(SystemTime::now())
+                                .unwrap_or_else(|_| Duration::default());
+                            tokio::time::sleep(duration).await;
+                            retried_here = true;
+                            continue; // Retry.
+                        }
+                    }
+                    if retried_here {
+                        Err(Error::NetworkErrorWithRetries {
+                            err: Box::new(err),
+                            retries: n_past_retries,
+                        })
+                    } else {
+                        Err(err)
+                    }
+                }
+            };
             return result;
         }
     }
```

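For context, the `Error::NetworkErrorWithRetries` variant that the retried path
wraps errors in is defined elsewhere in the file and is not shown in this diff.
Judging from the construction site above and the `Request failed after 3
retries` line in the example output, its definition is presumably a
thiserror-style variant along these lines (a guess for orientation, not copied
from the source):

```rust
// Assumed shape, inferred from `Error::NetworkErrorWithRetries { err, retries }`
// above and the "Request failed after N retries" message in the example output.
#[error("Request failed after {retries} retries")]
NetworkErrorWithRetries {
    #[source]
    err: Box<Error>,
    retries: u32,
},
```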