Avoid re-creating directories during unzip (#1154)

## Summary

We already have this optimization in `wheel.rs`, in the installer; applying it
to unzip as well makes a huge difference for zips with many small files:

```
Benchmarking file_reader/Django-5.0.1-py3-none-any.whl: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 74.2s, or reduce sample count to 10.
file_reader/Django-5.0.1-py3-none-any.whl
                        time:   [751.63 ms 757.78 ms 764.27 ms]
                        change: [-1.0290% +0.0841% +1.2289%] (p = 0.88 > 0.05)
                        No change in performance detected.
Found 4 outliers among 100 measurements (4.00%)
  4 (4.00%) high mild

Benchmarking buffered_reader/Django-5.0.1-py3-none-any.whl: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 53.4s, or reduce sample count to 10.
buffered_reader/Django-5.0.1-py3-none-any.whl
                        time:   [529.86 ms 536.44 ms 543.35 ms]
                        change: [+0.0293% +1.5543% +3.1426%] (p = 0.05 > 0.05)
                        No change in performance detected.
Found 3 outliers among 100 measurements (3.00%)
  3 (3.00%) high mild
```

That's almost 30% faster...
This commit is contained in:
Charlie Marsh 2024-01-27 21:07:54 -08:00 committed by GitHub
parent 888a9e6f53
commit 6f2c235d21
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 13 additions and 4 deletions

1
Cargo.lock generated
View file

@@ -2559,6 +2559,7 @@ dependencies = [
"fs-err",
"futures",
"rayon",
"rustc-hash",
"tar",
"thiserror",
"tokio",

View file

@@ -13,13 +13,14 @@ license = { workspace = true }
workspace = true
[dependencies]
futures = { workspace = true }
tokio-util = { workspace = true, features = ["compat"] }
async_zip = { workspace = true, features = ["tokio"] }
flate2 = { workspace = true }
fs-err = { workspace = true, features = ["tokio"] }
futures = { workspace = true }
rayon = { workspace = true }
rustc-hash = { workspace = true }
tar = { workspace = true }
thiserror = { workspace = true }
tokio = { workspace = true, features = ["io-util"] }
tokio-util = { workspace = true, features = ["compat"] }
zip = { workspace = true }

View file

@@ -1,7 +1,9 @@
use std::fs::OpenOptions;
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use rayon::prelude::*;
use rustc_hash::FxHashSet;
use tokio_util::compat::{FuturesAsyncReadCompatExt, TokioAsyncReadCompatExt};
use zip::result::ZipError;
use zip::ZipArchive;
@@ -104,6 +106,7 @@ pub fn unzip_archive<R: Send + std::io::Read + std::io::Seek + HasLength>(
) -> Result<(), Error> {
// Unzip in parallel.
let archive = ZipArchive::new(CloneableSeekableReader::new(reader))?;
let directories = Mutex::new(FxHashSet::default());
(0..archive.len())
.par_bridge()
.map(|file_number| {
@@ -118,11 +121,15 @@ pub fn unzip_archive<R: Send + std::io::Read + std::io::Seek + HasLength>(
// Create necessary parent directories.
let path = target.join(enclosed_name);
if file.is_dir() {
fs_err::create_dir_all(path)?;
fs_err::create_dir_all(&path)?;
return Ok(());
}
if let Some(parent) = path.parent() {
fs_err::create_dir_all(parent)?;
let mut directories = directories.lock().unwrap();
if directories.insert(parent.to_path_buf()) {
fs_err::create_dir_all(parent)?;
}
}
// Create the file, with the correct permissions (on Unix).