From e16140a8493d2ebd3b67c6f68dbb03c537f073e5 Mon Sep 17 00:00:00 2001 From: Jonathon Belotti Date: Sat, 9 Mar 2024 19:49:29 -0500 Subject: [PATCH] Address #2220 (slow download perf against PyPi mirror) (#2319) ## Summary Addressing the extremely slow performance detailed in https://github.com/astral-sh/uv/issues/2220. There are two changes to increase download performance: 1. setting `accept-encoding: identity`, in the spirit of https://github.com/pypa/pip/pull/1688 2. increasing buffer from 8KiB to 128KiB. ### 1. accept-encoding: identity I think this related `pip` PR has a good explanation of what's going on: https://github.com/pypa/pip/pull/1688 ``` # We use Accept-Encoding: identity here because requests # defaults to accepting compressed responses. This breaks in # a variety of ways depending on how the server is configured. # - Some servers will notice that the file isn't a compressible # file and will leave the file alone and with an empty # Content-Encoding # - Some servers will notice that the file is already # compressed and will leave the file alone and will add a # Content-Encoding: gzip header # - Some servers won't notice anything at all and will take # a file that's already been compressed and compress it again # and set the Content-Encoding: gzip header ``` The `files.pythonhosted.org` server is the 1st kind. Example debug log I added in `uv` when installing against PyPI: image (there is no `content-encoding` header in this response, the `whl` hasn't been compressed, and there is a content-length header) Our internal mirror is the third case. It does seem sensible that our mirror should be modified to act like the 1st kind. But `uv` should handle all three cases like `pip` does. ### 2. buffer increase In https://github.com/astral-sh/uv/issues/2220 I observed that `pip`'s downloading was causing up-to 128KiB flushes in our mirror. After fix 1, `uv` was still only causing up-to 8KiB flushes, and was slower to download than `pip`. Increasing this buffer from the default 8KiB led to a download performance improvement against our mirror and the expected observed 128KiB flushes. ## Test Plan Ran benchmarking as instructed by @charliermarsh image No performance improvement or regression. --- .../uv-distribution/src/distribution_database.rs | 14 +++++++++++++- crates/uv-extract/src/stream.rs | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/crates/uv-distribution/src/distribution_database.rs b/crates/uv-distribution/src/distribution_database.rs index e6fd95907..5200fa337 100644 --- a/crates/uv-distribution/src/distribution_database.rs +++ b/crates/uv-distribution/src/distribution_database.rs @@ -180,7 +180,19 @@ impl<'a, Context: BuildContext + Send + Sync> DistributionDatabase<'a, Context> .instrument(info_span!("download", wheel = %wheel)) }; - let req = self.client.cached_client().uncached().get(url).build()?; + let req = self + .client + .cached_client() + .uncached() + .get(url) + .header( + // `reqwest` defaults to accepting compressed responses. + // Specify identity encoding to get consistent .whl downloading + // behavior from servers. ref: https://github.com/pypa/pip/pull/1688 + "accept-encoding", + reqwest::header::HeaderValue::from_static("identity"), + ) + .build()?; let cache_control = match self.client.connectivity() { Connectivity::Online => CacheControl::from( self.cache diff --git a/crates/uv-extract/src/stream.rs b/crates/uv-extract/src/stream.rs index 8d8dfb11a..3754064a7 100644 --- a/crates/uv-extract/src/stream.rs +++ b/crates/uv-extract/src/stream.rs @@ -18,7 +18,7 @@ pub async fn unzip( target: impl AsRef, ) -> Result<(), Error> { let target = target.as_ref(); - let mut reader = reader.compat(); + let mut reader = futures::io::BufReader::with_capacity(128 * 1024, reader.compat()); let mut zip = async_zip::base::read::stream::ZipFileReader::new(&mut reader); let mut directories = FxHashSet::default();