Enforce UTF‑8-encoded license files during uv build (#16699)

I noticed this when working on
https://github.com/astral-sh/uv/pull/16697.

[PEP 639](https://peps.python.org/pep-0639/#add-license-files-key)
expects tools to ship license texts as UTF‑8, but previously `uv build`
would quietly include any binary blob listed under
`project.license-files`.

I have no clue what is going on with `rustfmt` for this file, but it
seems that when I add the check, it wants to reformat a bunch of
surrounding stuff.

The relevant part to look at is:

```rust
for license_file in &license_files {
    let file_path = root.join(license_file);
    let bytes = fs_err::read(&file_path)?;
    if str::from_utf8(&bytes).is_err() {
        return Err(ValidationError::LicenseFileNotUtf8(license_file.clone()).into());
    }
}
```

where we validate all collected license files before proceeding.

---------

Co-authored-by: konstin <konstin@mailbox.org>
This commit is contained in:
liam 2025-11-13 07:49:59 -05:00 committed by GitHub
parent c167146f8c
commit e28dc62358
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 160 additions and 95 deletions

View file

@ -3,7 +3,7 @@ use std::ffi::OsStr;
use std::fmt::Display; use std::fmt::Display;
use std::fmt::Write; use std::fmt::Write;
use std::path::{Path, PathBuf}; use std::path::{Path, PathBuf};
use std::str::FromStr; use std::str::{self, FromStr};
use itertools::Itertools; use itertools::Itertools;
use serde::{Deserialize, Deserializer}; use serde::{Deserialize, Deserializer};
@ -60,6 +60,8 @@ pub enum ValidationError {
ReservedGuiScripts, ReservedGuiScripts,
#[error("`project.license` is not a valid SPDX expression: {0}")] #[error("`project.license` is not a valid SPDX expression: {0}")]
InvalidSpdx(String, #[source] spdx::error::ParseError), InvalidSpdx(String, #[source] spdx::error::ParseError),
#[error("License file `{}` must be UTF-8 encoded", _0)]
LicenseFileNotUtf8(String),
} }
/// Check if the build backend is matching the currently running uv version. /// Check if the build backend is matching the currently running uv version.
@ -339,99 +341,7 @@ impl PyProjectToml {
"2.3" "2.3"
}; };
// TODO(konsti): Issue a warning on old license metadata once PEP 639 is universal. let (license, license_expression, license_files) = self.license_metadata(root)?;
let (license, license_expression, license_files) =
if let Some(license_globs) = &self.project.license_files {
let license_expression = match &self.project.license {
None => None,
Some(License::Spdx(license_expression)) => Some(license_expression.clone()),
Some(License::Text { .. } | License::File { .. }) => {
return Err(ValidationError::MixedLicenseGenerations.into());
}
};
let mut license_files = Vec::new();
let mut license_globs_parsed = Vec::new();
for license_glob in license_globs {
let pep639_glob =
PortableGlobParser::Pep639
.parse(license_glob)
.map_err(|err| Error::PortableGlob {
field: license_glob.to_owned(),
source: err,
})?;
license_globs_parsed.push(pep639_glob);
}
let license_globs =
GlobDirFilter::from_globs(&license_globs_parsed).map_err(|err| {
Error::GlobSetTooLarge {
field: "tool.uv.build-backend.source-include".to_string(),
source: err,
}
})?;
for entry in WalkDir::new(root)
.sort_by_file_name()
.into_iter()
.filter_entry(|entry| {
license_globs.match_directory(
entry
.path()
.strip_prefix(root)
.expect("walkdir starts with root"),
)
})
{
let entry = entry.map_err(|err| Error::WalkDir {
root: root.to_path_buf(),
err,
})?;
let relative = entry
.path()
.strip_prefix(root)
.expect("walkdir starts with root");
if !license_globs.match_path(relative) {
trace!("Not a license files match: {}", relative.user_display());
continue;
}
if !entry.file_type().is_file() {
trace!(
"Not a file in license files match: {}",
relative.user_display()
);
continue;
}
error_on_venv(entry.file_name(), entry.path())?;
debug!("License files match: {}", relative.user_display());
license_files.push(relative.portable_display().to_string());
}
// The glob order may be unstable
license_files.sort();
(None, license_expression, license_files)
} else {
match &self.project.license {
None => (None, None, Vec::new()),
Some(License::Spdx(license_expression)) => {
(None, Some(license_expression.clone()), Vec::new())
}
Some(License::Text { text }) => (Some(text.clone()), None, Vec::new()),
Some(License::File { file }) => {
let text = fs_err::read_to_string(root.join(file))?;
(Some(text), None, Vec::new())
}
}
};
// Check that the license expression is a valid SPDX identifier.
if let Some(license_expression) = &license_expression {
if let Err(err) = spdx::Expression::parse(license_expression) {
return Err(ValidationError::InvalidSpdx(license_expression.clone(), err).into());
}
}
// TODO(konsti): https://peps.python.org/pep-0753/#label-normalization (Draft) // TODO(konsti): https://peps.python.org/pep-0753/#label-normalization (Draft)
let project_urls = self let project_urls = self
@ -518,6 +428,118 @@ impl PyProjectToml {
}) })
} }
/// Parse and validate the old (PEP 621) and new (PEP 639) license files.
#[allow(clippy::type_complexity)]
fn license_metadata(
&self,
root: &Path,
) -> Result<(Option<String>, Option<String>, Vec<String>), Error> {
// TODO(konsti): Issue a warning on old license metadata once PEP 639 is universal.
let (license, license_expression, license_files) = if let Some(license_globs) =
&self.project.license_files
{
let license_expression = match &self.project.license {
None => None,
Some(License::Spdx(license_expression)) => Some(license_expression.clone()),
Some(License::Text { .. } | License::File { .. }) => {
return Err(ValidationError::MixedLicenseGenerations.into());
}
};
let mut license_files = Vec::new();
let mut license_globs_parsed = Vec::new();
for license_glob in license_globs {
let pep639_glob =
PortableGlobParser::Pep639
.parse(license_glob)
.map_err(|err| Error::PortableGlob {
field: license_glob.to_owned(),
source: err,
})?;
license_globs_parsed.push(pep639_glob);
}
let license_globs =
GlobDirFilter::from_globs(&license_globs_parsed).map_err(|err| {
Error::GlobSetTooLarge {
field: "tool.uv.build-backend.source-include".to_string(),
source: err,
}
})?;
for entry in WalkDir::new(root)
.sort_by_file_name()
.into_iter()
.filter_entry(|entry| {
license_globs.match_directory(
entry
.path()
.strip_prefix(root)
.expect("walkdir starts with root"),
)
})
{
let entry = entry.map_err(|err| Error::WalkDir {
root: root.to_path_buf(),
err,
})?;
let relative = entry
.path()
.strip_prefix(root)
.expect("walkdir starts with root");
if !license_globs.match_path(relative) {
trace!("Not a license files match: {}", relative.user_display());
continue;
}
if !entry.file_type().is_file() {
trace!(
"Not a file in license files match: {}",
relative.user_display()
);
continue;
}
error_on_venv(entry.file_name(), entry.path())?;
debug!("License files match: {}", relative.user_display());
license_files.push(relative.portable_display().to_string());
}
for license_file in &license_files {
let file_path = root.join(license_file);
let bytes = fs_err::read(&file_path)?;
if str::from_utf8(&bytes).is_err() {
return Err(ValidationError::LicenseFileNotUtf8(license_file.clone()).into());
}
}
// The glob order may be unstable
license_files.sort();
(None, license_expression, license_files)
} else {
match &self.project.license {
None => (None, None, Vec::new()),
Some(License::Spdx(license_expression)) => {
(None, Some(license_expression.clone()), Vec::new())
}
Some(License::Text { text }) => (Some(text.clone()), None, Vec::new()),
Some(License::File { file }) => {
let text = fs_err::read_to_string(root.join(file))?;
(Some(text), None, Vec::new())
}
}
};
// Check that the license expression is a valid SPDX identifier.
if let Some(license_expression) = &license_expression {
if let Err(err) = spdx::Expression::parse(license_expression) {
return Err(ValidationError::InvalidSpdx(license_expression.clone(), err).into());
}
}
Ok((license, license_expression, license_files))
}
/// Validate and convert the entrypoints in `pyproject.toml`, including console and GUI scripts, /// Validate and convert the entrypoints in `pyproject.toml`, including console and GUI scripts,
/// to an `entry_points.txt`. /// to an `entry_points.txt`.
/// ///

View file

@ -1,7 +1,7 @@
use crate::common::{TestContext, uv_snapshot, venv_bin_path}; use crate::common::{TestContext, uv_snapshot, venv_bin_path};
use anyhow::Result; use anyhow::Result;
use assert_cmd::assert::OutputAssertExt; use assert_cmd::assert::OutputAssertExt;
use assert_fs::fixture::{FileTouch, FileWriteStr, PathChild, PathCreateDir}; use assert_fs::fixture::{FileTouch, FileWriteBin, FileWriteStr, PathChild, PathCreateDir};
use flate2::bufread::GzDecoder; use flate2::bufread::GzDecoder;
use fs_err::File; use fs_err::File;
use indoc::{formatdoc, indoc}; use indoc::{formatdoc, indoc};
@ -760,6 +760,49 @@ fn complex_namespace_packages() -> Result<()> {
Ok(()) Ok(())
} }
#[test]
fn license_file_must_be_utf8() -> Result<()> {
let context = TestContext::new("3.12");
let project = context.temp_dir.child("license-utf8");
context
.init()
.arg("--lib")
.arg(project.path())
.assert()
.success();
project.child("pyproject.toml").write_str(indoc! {r#"
[project]
name = "license-utf8"
version = "1.0.0"
license-files = ["LICENSE.bin"]
[build-system]
requires = ["uv_build>=0.7,<10000"]
build-backend = "uv_build"
"#
})?;
project.child("LICENSE.bin").write_binary(&[0xff])?;
uv_snapshot!(context
.build_backend()
.arg("build-wheel")
.arg(context.temp_dir.path())
.current_dir(project.path()), @r###"
success: false
exit_code: 2
----- stdout -----
----- stderr -----
error: Invalid pyproject.toml
Caused by: License file `LICENSE.bin` must be UTF-8 encoded
"###);
Ok(())
}
/// Test that a symlinked file (here: license) gets included. /// Test that a symlinked file (here: license) gets included.
#[test] #[test]
#[cfg(unix)] #[cfg(unix)]