deno/resolvers/npm_cache/tarball_extract.rs
Nathan Whitaker 780b741555
perf(npm): load npm resolution snapshot directly from lockfile (#28647)
Fixes #27264. Fixes https://github.com/denoland/deno/issues/28161.

Currently the new lockfile version is gated behind an unstable flag
(`--unstable-lockfile-v5`) until the next minor release, where it will
become the default.

The main motivation here is that it improves startup performance when
using the global cache or `--node-modules-dir=auto`.

In a create-next-app project, running an empty file:
```
❯ hyperfine --warmup 25 -N --setup "rm -f deno.lock" "deno run --node-modules-dir=auto -A empty.js" "deno-this-pr run --node-modules-dir=auto -A empty.js" "deno-this-pr run --node-modules-dir=auto --unstable-lockfile-v5 empty.js" "deno run --node-modules-dir=manual -A empty.js" "deno-this-pr run --node-modules-dir=manual -A empty.js"
Benchmark 1: deno run --node-modules-dir=auto -A empty.js
  Time (mean ± σ):     247.6 ms ±   1.7 ms    [User: 228.7 ms, System: 19.0 ms]
  Range (min … max):   245.5 ms … 251.5 ms    12 runs

Benchmark 2: deno-this-pr run --node-modules-dir=auto -A empty.js
  Time (mean ± σ):     169.8 ms ±   1.0 ms    [User: 152.9 ms, System: 17.9 ms]
  Range (min … max):   168.9 ms … 172.5 ms    17 runs

Benchmark 3: deno-this-pr run --node-modules-dir=auto --unstable-lockfile-v5 empty.js
  Time (mean ± σ):      16.2 ms ±   0.7 ms    [User: 12.3 ms, System: 5.7 ms]
  Range (min … max):    15.2 ms …  19.2 ms    185 runs

Benchmark 4: deno run --node-modules-dir=manual -A empty.js
  Time (mean ± σ):      16.2 ms ±   0.8 ms    [User: 11.6 ms, System: 5.5 ms]
  Range (min … max):    14.9 ms …  19.7 ms    187 runs

Benchmark 5: deno-this-pr run --node-modules-dir=manual -A empty.js
  Time (mean ± σ):      16.0 ms ±   0.9 ms    [User: 12.0 ms, System: 5.5 ms]
  Range (min … max):    14.8 ms …  22.3 ms    190 runs

  Warning: Statistical outliers were detected. Consider re-running this benchmark on a quiet system without any interferences from other programs. It might help to use the '--warmup' or '--prepare' options.

Summary
  deno-this-pr run --node-modules-dir=manual -A empty.js ran
    1.01 ± 0.08 times faster than deno run --node-modules-dir=manual -A empty.js
    1.01 ± 0.07 times faster than deno-this-pr run --node-modules-dir=auto --unstable-lockfile-v5 empty.js
   10.64 ± 0.60 times faster than deno-this-pr run --node-modules-dir=auto -A empty.js
   15.51 ± 0.88 times faster than deno run --node-modules-dir=auto -A empty.js
```

When using the new lockfile version, this leads to a 15.5x faster
startup time compared to the current deno version.

Install times benefit as well, though to a lesser degree.

`deno install` on a create-next-app project, with everything cached
(just setting up node_modules from scratch):

```
❯ hyperfine --warmup 5 -N --prepare "rm -rf node_modules" --setup "rm -rf deno.lock" "deno i" "deno-this-pr i" "deno-this-pr i --unstable-lockfile-v5"
Benchmark 1: deno i
  Time (mean ± σ):     464.4 ms ±   8.8 ms    [User: 227.7 ms, System: 217.3 ms]
  Range (min … max):   452.6 ms … 478.3 ms    10 runs

Benchmark 2: deno-this-pr i
  Time (mean ± σ):     368.8 ms ±  22.0 ms    [User: 150.8 ms, System: 198.1 ms]
  Range (min … max):   344.8 ms … 397.6 ms    10 runs

Benchmark 3: deno-this-pr i --unstable-lockfile-v5
  Time (mean ± σ):     211.9 ms ±  17.1 ms    [User: 7.1 ms, System: 177.2 ms]
  Range (min … max):   191.3 ms … 233.4 ms    10 runs

Summary
  deno-this-pr i --unstable-lockfile-v5 ran
    1.74 ± 0.17 times faster than deno-this-pr i
    2.19 ± 0.18 times faster than deno i
```

With lockfile v5, a 2.19x faster install time compared to the current
deno.
2025-04-08 02:06:17 +00:00

392 lines
12 KiB
Rust

// Copyright 2018-2025 the Deno authors. MIT license.
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs;
use std::io::ErrorKind;
use std::path::Path;
use std::path::PathBuf;
use base64::prelude::BASE64_STANDARD;
use base64::Engine;
use deno_npm::registry::NpmPackageVersionDistInfo;
use deno_npm::registry::NpmPackageVersionDistInfoIntegrity;
use deno_semver::package::PackageNv;
use flate2::read::GzDecoder;
use tar::Archive;
use tar::EntryType;
#[derive(Debug, Copy, Clone)]
pub enum TarballExtractionMode {
/// Overwrites the destination directory without deleting any files.
Overwrite,
/// Creates and writes to a sibling temporary directory. When done, moves
/// it to the final destination.
///
/// This is more robust than `Overwrite` as it better handles multiple
/// processes writing to the directory at the same time.
SiblingTempDir,
}
#[derive(Debug, thiserror::Error, deno_error::JsError)]
pub enum VerifyAndExtractTarballError {
#[class(inherit)]
#[error(transparent)]
TarballIntegrity(#[from] TarballIntegrityError),
#[class(inherit)]
#[error(transparent)]
ExtractTarball(#[from] ExtractTarballError),
#[class(inherit)]
#[error("Failed moving extracted tarball to final destination")]
MoveFailed(std::io::Error),
}
pub fn verify_and_extract_tarball(
package_nv: &PackageNv,
data: &[u8],
dist_info: &NpmPackageVersionDistInfo,
output_folder: &Path,
extraction_mode: TarballExtractionMode,
) -> Result<(), VerifyAndExtractTarballError> {
verify_tarball_integrity(package_nv, data, &dist_info.integrity())?;
match extraction_mode {
TarballExtractionMode::Overwrite => {
extract_tarball(data, output_folder).map_err(Into::into)
}
TarballExtractionMode::SiblingTempDir => {
let temp_dir = get_atomic_dir_path(output_folder);
extract_tarball(data, &temp_dir)?;
rename_with_retries(&temp_dir, output_folder)
.map_err(VerifyAndExtractTarballError::MoveFailed)
}
}
}
fn rename_with_retries(
temp_dir: &Path,
output_folder: &Path,
) -> Result<(), std::io::Error> {
fn already_exists(err: &std::io::Error, output_folder: &Path) -> bool {
// Windows will do an "Access is denied" error
err.kind() == ErrorKind::AlreadyExists || output_folder.exists()
}
let mut count = 0;
// renaming might be flaky if a lot of processes are trying
// to do this, so retry a few times
loop {
match fs::rename(temp_dir, output_folder) {
Ok(_) => return Ok(()),
Err(err) if already_exists(&err, output_folder) => {
// another process copied here, just cleanup
let _ = fs::remove_dir_all(temp_dir);
return Ok(());
}
Err(err) => {
count += 1;
if count > 5 {
// too many retries, cleanup and return the error
let _ = fs::remove_dir_all(temp_dir);
return Err(err);
}
// wait a bit before retrying... this should be very rare or only
// in error cases, so ok to sleep a bit
let sleep_ms = std::cmp::min(100, 20 * count);
std::thread::sleep(std::time::Duration::from_millis(sleep_ms));
}
}
}
}
#[derive(Debug, thiserror::Error, deno_error::JsError)]
#[class(generic)]
pub enum TarballIntegrityError {
#[error("Not implemented hash function for {package}: {hash_kind}")]
NotImplementedHashFunction {
package: Box<PackageNv>,
hash_kind: String,
},
#[error("Not implemented integrity kind for {package}: {integrity}")]
NotImplementedIntegrityKind {
package: Box<PackageNv>,
integrity: String,
},
#[error("Tarball checksum did not match what was provided by npm registry for {package}.\n\nExpected: {expected}\nActual: {actual}")]
MismatchedChecksum {
package: Box<PackageNv>,
expected: String,
actual: String,
},
}
fn verify_tarball_integrity(
package: &PackageNv,
data: &[u8],
npm_integrity: &NpmPackageVersionDistInfoIntegrity,
) -> Result<(), TarballIntegrityError> {
use ring::digest::Context;
let (tarball_checksum, expected_checksum) = match npm_integrity {
NpmPackageVersionDistInfoIntegrity::Integrity {
algorithm,
base64_hash,
} => {
let algo = match *algorithm {
"sha512" => &ring::digest::SHA512,
"sha1" => &ring::digest::SHA1_FOR_LEGACY_USE_ONLY,
hash_kind => {
return Err(TarballIntegrityError::NotImplementedHashFunction {
package: Box::new(package.clone()),
hash_kind: hash_kind.to_string(),
});
}
};
let mut hash_ctx = Context::new(algo);
hash_ctx.update(data);
let digest = hash_ctx.finish();
let tarball_checksum = BASE64_STANDARD.encode(digest.as_ref());
(tarball_checksum, base64_hash)
}
NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(hex) => {
let mut hash_ctx = Context::new(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY);
hash_ctx.update(data);
let digest = hash_ctx.finish();
let tarball_checksum = faster_hex::hex_string(digest.as_ref());
(tarball_checksum, hex)
}
NpmPackageVersionDistInfoIntegrity::UnknownIntegrity(integrity) => {
return Err(TarballIntegrityError::NotImplementedIntegrityKind {
package: Box::new(package.clone()),
integrity: integrity.to_string(),
});
}
NpmPackageVersionDistInfoIntegrity::None => {
return Ok(());
}
};
if tarball_checksum != *expected_checksum {
return Err(TarballIntegrityError::MismatchedChecksum {
package: Box::new(package.clone()),
expected: expected_checksum.to_string(),
actual: tarball_checksum,
});
}
Ok(())
}
#[derive(Debug, thiserror::Error, deno_error::JsError)]
pub enum ExtractTarballError {
#[class(inherit)]
#[error(transparent)]
Io(#[from] std::io::Error),
#[class(generic)]
#[error(
"Extracted directory '{0}' of npm tarball was not in output directory."
)]
NotInOutputDirectory(PathBuf),
}
fn extract_tarball(
data: &[u8],
output_folder: &Path,
) -> Result<(), ExtractTarballError> {
fs::create_dir_all(output_folder)?;
let output_folder = fs::canonicalize(output_folder)?;
let tar = GzDecoder::new(data);
let mut archive = Archive::new(tar);
archive.set_overwrite(true);
archive.set_preserve_permissions(true);
let mut created_dirs = HashSet::new();
for entry in archive.entries()? {
let mut entry = entry?;
let path = entry.path()?;
let entry_type = entry.header().entry_type();
// Some package tarballs contain "pax_global_header", these entries
// should be skipped.
if entry_type == EntryType::XGlobalHeader {
continue;
}
// skip the first component which will be either "package" or the name of the package
let relative_path = path.components().skip(1).collect::<PathBuf>();
let absolute_path = output_folder.join(relative_path);
let dir_path = if entry_type == EntryType::Directory {
absolute_path.as_path()
} else {
absolute_path.parent().unwrap()
};
if created_dirs.insert(dir_path.to_path_buf()) {
fs::create_dir_all(dir_path)?;
let canonicalized_dir = fs::canonicalize(dir_path)?;
if !canonicalized_dir.starts_with(&output_folder) {
return Err(ExtractTarballError::NotInOutputDirectory(
canonicalized_dir.to_path_buf(),
));
}
}
let entry_type = entry.header().entry_type();
match entry_type {
EntryType::Regular => {
entry.unpack(&absolute_path)?;
}
EntryType::Symlink | EntryType::Link => {
// At the moment, npm doesn't seem to support uploading hardlinks or
// symlinks to the npm registry. If ever adding symlink or hardlink
// support, we will need to validate that the hardlink and symlink
// target are within the package directory.
log::warn!(
"Ignoring npm tarball entry type {:?} for '{}'",
entry_type,
absolute_path.display()
)
}
_ => {
// ignore
}
}
}
Ok(())
}
fn get_atomic_dir_path(file_path: &Path) -> PathBuf {
let rand = gen_rand_path_component();
let new_file_name = format!(
".{}_{}",
file_path
.file_name()
.map(|f| f.to_string_lossy())
.unwrap_or(Cow::Borrowed("")),
rand
);
file_path.with_file_name(new_file_name)
}
fn gen_rand_path_component() -> String {
use std::fmt::Write;
(0..4).fold(String::with_capacity(8), |mut output, _| {
write!(&mut output, "{:02x}", rand::random::<u8>()).unwrap();
output
})
}
#[cfg(test)]
mod test {
use deno_semver::Version;
use tempfile::TempDir;
use super::*;
#[test]
pub fn test_verify_tarball() {
let package = PackageNv {
name: "package".into(),
version: Version::parse_from_npm("1.0.0").unwrap(),
};
let actual_checksum =
"z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXcg/SpIdNs6c5H0NE8XYXysP+DGNKHfuwvY7kxvUdBeoGlODJ6+SfaPg==";
assert_eq!(
verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::UnknownIntegrity("test")
)
.unwrap_err()
.to_string(),
"Not implemented integrity kind for package@1.0.0: test",
);
assert_eq!(
verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::Integrity {
algorithm: "notimplemented",
base64_hash: "test"
}
)
.unwrap_err()
.to_string(),
"Not implemented hash function for package@1.0.0: notimplemented",
);
assert_eq!(
verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::Integrity {
algorithm: "sha1",
base64_hash: "test"
}
)
.unwrap_err()
.to_string(),
concat!(
"Tarball checksum did not match what was provided by npm ",
"registry for package@1.0.0.\n\nExpected: test\nActual: 2jmj7l5rSw0yVb/vlWAYkK/YBwk=",
),
);
assert_eq!(
verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::Integrity {
algorithm: "sha512",
base64_hash: "test"
}
)
.unwrap_err()
.to_string(),
format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_checksum}"),
);
assert!(verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::Integrity {
algorithm: "sha512",
base64_hash: actual_checksum,
},
)
.is_ok());
let actual_hex = "da39a3ee5e6b4b0d3255bfef95601890afd80709";
assert_eq!(
verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::LegacySha1Hex("test"),
)
.unwrap_err()
.to_string(),
format!("Tarball checksum did not match what was provided by npm registry for package@1.0.0.\n\nExpected: test\nActual: {actual_hex}"),
);
assert!(verify_tarball_integrity(
&package,
&Vec::new(),
&NpmPackageVersionDistInfoIntegrity::LegacySha1Hex(actual_hex),
)
.is_ok());
}
#[test]
fn rename_with_retries_succeeds_exists() {
let temp_dir = TempDir::new().unwrap();
let folder_1 = temp_dir.path().join("folder_1");
let folder_2 = temp_dir.path().join("folder_2");
std::fs::create_dir_all(&folder_1).unwrap();
std::fs::write(folder_1.join("a.txt"), "test").unwrap();
std::fs::create_dir_all(&folder_2).unwrap();
// this will not end up in the output as rename_with_retries assumes
// the folders ending up at the destination are the same
std::fs::write(folder_2.join("b.txt"), "test2").unwrap();
let dest_folder = temp_dir.path().join("dest_folder");
rename_with_retries(folder_1.as_path(), &dest_folder).unwrap();
rename_with_retries(folder_2.as_path(), &dest_folder).unwrap();
assert!(dest_folder.join("a.txt").exists());
assert!(!dest_folder.join("b.txt").exists());
}
}