Support PEP 723 scripts in uv add and uv remove (#5995)

## Summary

Resolves https://github.com/astral-sh/uv/issues/4667

## Test Plan

`cargo test`
2025-09-22 18:22:28 +00:00 · 2024-08-11 03:40:59 +02:00 · 2024-08-11 03:40:59 +02:00 · 2d53e35e39
commit 2d53e35e39
parent 9b8c07bf18
12 changed files with 1215 additions and 289 deletions
--- a/crates/uv-scripts/src/lib.rs
+++ b/crates/uv-scripts/src/lib.rs
@ -1,9 +1,11 @@
 use std::collections::BTreeMap;
 use std::io;
 use std::path::{Path, PathBuf};
+use std::str::FromStr;
 use std::sync::LazyLock;

 use memchr::memmem::Finder;
+use pep440_rs::VersionSpecifiers;
 use serde::Deserialize;
 use thiserror::Error;

@ -17,8 +19,14 @@ static FINDER: LazyLock<Finder> = LazyLock::new(|| Finder::new(b"# /// script"))
 /// A PEP 723 script, including its [`Pep723Metadata`].
 #[derive(Debug)]
 pub struct Pep723Script {
+    /// The path to the Python script.
    pub path: PathBuf,
+    /// The parsed [`Pep723Metadata`] table from the script.
    pub metadata: Pep723Metadata,
+    /// The content of the script after the metadata table.
+    pub raw: String,
+    /// The content of the script before the metadata table.
+    pub prelude: String,
 }

 impl Pep723Script {
@ -26,12 +34,76 @@ impl Pep723Script {
    ///
    /// See: <https://peps.python.org/pep-0723/>
    pub async fn read(file: impl AsRef<Path>) -> Result<Option<Self>, Pep723Error> {
-        let metadata = Pep723Metadata::read(&file).await?;
-        Ok(metadata.map(|metadata| Self {
+        let contents = match fs_err::tokio::read(&file).await {
+            Ok(contents) => contents,
+            Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
+            Err(err) => return Err(err.into()),
+        };
+
+        // Extract the `script` tag.
+        let Some(script_tag) = ScriptTag::parse(&contents)? else {
+            return Ok(None);
+        };
+
+        // Parse the metadata.
+        let metadata = Pep723Metadata::from_str(&script_tag.metadata)?;
+
+        Ok(Some(Self {
            path: file.as_ref().to_path_buf(),
            metadata,
+            raw: script_tag.script,
+            prelude: script_tag.prelude,
        }))
    }
+
+    /// Reads a Python script and generates a default PEP 723 metadata table.
+    ///
+    /// See: <https://peps.python.org/pep-0723/>
+    pub async fn create(
+        file: impl AsRef<Path>,
+        requires_python: &VersionSpecifiers,
+    ) -> Result<Self, Pep723Error> {
+        let contents = match fs_err::tokio::read(&file).await {
+            Ok(contents) => contents,
+            Err(err) => return Err(err.into()),
+        };
+
+        // Build the default metadata table.
+        let default_metadata = indoc::formatdoc! {r#"
+            requires-python = "{requires_python}"
+            dependencies = []
+            "#,
+            requires_python = requires_python,
+        };
+
+        let (prelude, raw) = extract_shebang(&contents)?;
+
+        // Parse the metadata.
+        let metadata = Pep723Metadata::from_str(&default_metadata)?;
+
+        Ok(Self {
+            path: file.as_ref().to_path_buf(),
+            prelude: prelude.unwrap_or_default(),
+            metadata,
+            raw,
+        })
+    }
+
+    /// Replace the existing metadata in the file with new metadata and write the updated content.
+    pub async fn write(&self, metadata: &str) -> Result<(), Pep723Error> {
+        let content = format!(
+            "{}{}{}",
+            if self.prelude.is_empty() {
+                String::new()
+            } else {
+                format!("{}\n", self.prelude)
+            },
+            serialize_metadata(metadata),
+            self.raw
+        );
+
+        Ok(fs_err::tokio::write(&self.path, content).await?)
+    }
 }

 /// PEP 723 metadata as parsed from a `script` comment block.
@ -41,30 +113,23 @@ impl Pep723Script {
 #[serde(rename_all = "kebab-case")]
 pub struct Pep723Metadata {
    pub dependencies: Option<Vec<pep508_rs::Requirement<VerbatimParsedUrl>>>,
-    pub requires_python: Option<pep440_rs::VersionSpecifiers>,
+    pub requires_python: Option<VersionSpecifiers>,
    pub tool: Option<Tool>,
+    /// The raw unserialized document.
+    #[serde(skip)]
+    pub raw: String,
 }

-impl Pep723Metadata {
-    /// Read the PEP 723 `script` metadata from a Python file, if it exists.
-    ///
-    /// See: <https://peps.python.org/pep-0723/>
-    pub async fn read(file: impl AsRef<Path>) -> Result<Option<Self>, Pep723Error> {
-        let contents = match fs_err::tokio::read(file).await {
-            Ok(contents) => contents,
-            Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
-            Err(err) => return Err(err.into()),
-        };
+impl FromStr for Pep723Metadata {
+    type Err = Pep723Error;

-        // Extract the `script` tag.
-        let Some(contents) = extract_script_tag(&contents)? else {
-            return Ok(None);
-        };
-
-        // Parse the metadata.
-        let metadata = toml::from_str(&contents)?;
-
-        Ok(Some(metadata))
+    /// Parse `Pep723Metadata` from a raw TOML string.
+    fn from_str(raw: &str) -> Result<Self, Self::Err> {
+        let metadata = toml::from_str(raw)?;
+        Ok(Pep723Metadata {
+            raw: raw.to_string(),
+            ..metadata
+        })
    }
 }

@ -94,120 +159,193 @@ pub enum Pep723Error {
    Toml(#[from] toml::de::Error),
 }

-/// Read the PEP 723 `script` metadata from a Python file, if it exists.
-///
-/// See: <https://peps.python.org/pep-0723/>
-pub async fn read_pep723_metadata(
-    file: impl AsRef<Path>,
-) -> Result<Option<Pep723Metadata>, Pep723Error> {
-    let contents = match fs_err::tokio::read(file).await {
-        Ok(contents) => contents,
-        Err(err) if err.kind() == io::ErrorKind::NotFound => return Ok(None),
-        Err(err) => return Err(err.into()),
-    };
-
-    // Extract the `script` tag.
-    let Some(contents) = extract_script_tag(&contents)? else {
-        return Ok(None);
-    };
-
-    // Parse the metadata.
-    let metadata = toml::from_str(&contents)?;
-
-    Ok(Some(metadata))
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct ScriptTag {
+    /// The content of the script before the metadata block.
+    prelude: String,
+    /// The metadata block.
+    metadata: String,
+    /// The content of the script after the metadata block.
+    script: String,
 }

-/// Given the contents of a Python file, extract the `script` metadata block, with leading comment
-/// hashes removed.
-///
-/// See: <https://peps.python.org/pep-0723/>
-fn extract_script_tag(contents: &[u8]) -> Result<Option<String>, Pep723Error> {
-    // Identify the opening pragma.
-    let Some(index) = FINDER.find(contents) else {
-        return Ok(None);
-    };
+impl ScriptTag {
+    /// Given the contents of a Python file, extract the `script` metadata block with leading
+    /// comment hashes removed, any preceding shebang or content (prelude), and the remaining Python
+    /// script.
+    ///
+    /// Given the following input string representing the contents of a Python script:
+    ///
+    /// ```python
+    /// #!/usr/bin/env python3
+    /// # /// script
+    /// # requires-python = '>=3.11'
+    /// # dependencies = [
+    /// #   'requests<3',
+    /// #   'rich',
+    /// # ]
+    /// # ///
+    ///
+    /// import requests
+    ///
+    /// print("Hello, World!")
+    /// ```
+    ///
+    /// This function would return:
+    ///
+    /// - Prelude: `#!/usr/bin/env python3\n`
+    /// - Metadata: `requires-python = '>=3.11'\ndependencies = [\n  'requests<3',\n  'rich',\n]\n`
+    /// - Script: `\nimport requests\n\nprint("Hello, World!")\n`
+    ///
+    /// See: <https://peps.python.org/pep-0723/>
+    fn parse(contents: &[u8]) -> Result<Option<Self>, Pep723Error> {
+        // Identify the opening pragma.
+        let Some(index) = FINDER.find(contents) else {
+            return Ok(None);
+        };

-    // The opening pragma must be the first line, or immediately preceded by a newline.
-    if !(index == 0 || matches!(contents[index - 1], b'\r' | b'\n')) {
-        return Ok(None);
+        // The opening pragma must be the first line, or immediately preceded by a newline.
+        if !(index == 0 || matches!(contents[index - 1], b'\r' | b'\n')) {
+            return Ok(None);
+        }
+
+        // Extract the preceding content.
+        let prelude = std::str::from_utf8(&contents[..index])?;
+
+        // Decode as UTF-8.
+        let contents = &contents[index..];
+        let contents = std::str::from_utf8(contents)?;
+
+        let mut lines = contents.lines();
+
+        // Ensure that the first line is exactly `# /// script`.
+        if !lines.next().is_some_and(|line| line == "# /// script") {
+            return Ok(None);
+        }
+
+        // > Every line between these two lines (# /// TYPE and # ///) MUST be a comment starting
+        // > with #. If there are characters after the # then the first character MUST be a space. The
+        // > embedded content is formed by taking away the first two characters of each line if the
+        // > second character is a space, otherwise just the first character (which means the line
+        // > consists of only a single #).
+        let mut toml = vec![];
+
+        // Extract the content that follows the metadata block.
+        let mut python_script = vec![];
+
+        while let Some(line) = lines.next() {
+            // Remove the leading `#`.
+            let Some(line) = line.strip_prefix('#') else {
+                python_script.push(line);
+                python_script.extend(lines);
+                break;
+            };
+
+            // If the line is empty, continue.
+            if line.is_empty() {
+                toml.push("");
+                continue;
+            }
+
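+            // Otherwise, the line _must_ start with ` `.
+            //
+            // NOTE(review): in the `else` branch below, `line` is the shadowed value with the
+            // leading `#` already stripped, so that `#` is dropped from the script content.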
+            let Some(line) = line.strip_prefix(' ') else {
+                python_script.push(line);
+                python_script.extend(lines);
+                break;
+            };
+
+            toml.push(line);
+        }
+
+        // Find the closing `# ///`. The precedence is such that we need to identify the _last_ such
+        // line.
+        //
+        // For example, given:
+        // ```python
+        // # /// script
+        // #
+        // # ///
+        // #
+        // # ///
+        // ```
+        //
+        // The latter `///` is the closing pragma
+        let Some(index) = toml.iter().rev().position(|line| *line == "///") else {
+            return Ok(None);
+        };
+        let index = toml.len() - index;
+
+        // Discard any lines after the closing `# ///`.
+        //
+        // For example, given:
+        // ```python
+        // # /// script
+        // #
+        // # ///
+        // #
+        // #
+        // ```
+        //
+        // We need to discard the last two lines.
+        toml.truncate(index - 1);
+
+        // Join the lines into a single string.
+        let prelude = prelude.to_string();
+        let metadata = toml.join("\n") + "\n";
+        let script = python_script.join("\n") + "\n";
+
+        Ok(Some(Self {
+            prelude,
+            metadata,
+            script,
+        }))
    }
+}

-    // Decode as UTF-8.
-    let contents = &contents[index..];
+/// Extracts the shebang line from the given file contents and returns it along with the remaining
+/// content.
+fn extract_shebang(contents: &[u8]) -> Result<(Option<String>, String), Pep723Error> {
    let contents = std::str::from_utf8(contents)?;

    let mut lines = contents.lines();

-    // Ensure that the first line is exactly `# /// script`.
-    if !lines.next().is_some_and(|line| line == "# /// script") {
-        return Ok(None);
-    }
-
-    // > Every line between these two lines (# /// TYPE and # ///) MUST be a comment starting
-    // > with #. If there are characters after the # then the first character MUST be a space. The
-    // > embedded content is formed by taking away the first two characters of each line if the
-    // > second character is a space, otherwise just the first character (which means the line
-    // > consists of only a single #).
-    let mut toml = vec![];
-    for line in lines {
-        // Remove the leading `#`.
-        let Some(line) = line.strip_prefix('#') else {
-            break;
-        };
-
-        // If the line is empty, continue.
-        if line.is_empty() {
-            toml.push("");
-            continue;
+    // Check the first line for a shebang
+    if let Some(first_line) = lines.next() {
+        if first_line.starts_with("#!") {
+            let shebang = first_line.to_string();
+            let remaining_content: String = lines.collect::<Vec<&str>>().join("\n");
+            return Ok((Some(shebang), remaining_content));
        }
-
-        // Otherwise, the line _must_ start with ` `.
-        let Some(line) = line.strip_prefix(' ') else {
-            break;
-        };
-        toml.push(line);
    }

-    // Find the closing `# ///`. The precedence is such that we need to identify the _last_ such
-    // line.
-    //
-    // For example, given:
-    // ```python
-    // # /// script
-    // #
-    // # ///
-    // #
-    // # ///
-    // ```
-    //
-    // The latter `///` is the closing pragma
-    let Some(index) = toml.iter().rev().position(|line| *line == "///") else {
-        return Ok(None);
-    };
-    let index = toml.len() - index;
+    Ok((None, contents.to_string()))
+}

-    // Discard any lines after the closing `# ///`.
-    //
-    // For example, given:
-    // ```python
-    // # /// script
-    // #
-    // # ///
-    // #
-    // #
-    // ```
-    //
-    // We need to discard the last two lines.
-    toml.truncate(index - 1);
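+/// Formats the provided metadata as a PEP 723 `script` block: each non-empty line is prefixed
+/// with `# `, and the result is wrapped in `# /// script` and `# ///` markers.
+///
+/// NOTE(review): empty lines are emitted as bare newlines rather than `#`, so metadata that
+/// contains a blank line will not round-trip through `ScriptTag::parse`, which stops collecting
+/// the block at the first line that does not start with `#`.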
+fn serialize_metadata(metadata: &str) -> String {
+    let mut output = String::with_capacity(metadata.len() + 2);

-    // Join the lines into a single string.
-    let toml = toml.join("\n") + "\n";
+    output.push_str("# /// script\n");

-    Ok(Some(toml))
+    for line in metadata.lines() {
+        if line.is_empty() {
+            output.push('\n');
+        } else {
+            output.push_str("# ");
+            output.push_str(line);
+            output.push('\n');
+        }
+    }
+
+    output.push_str("# ///\n");
+
+    output
 }

 #[cfg(test)]
 mod tests {
+    use crate::{serialize_metadata, ScriptTag};
+
    #[test]
    fn missing_space() {
        let contents = indoc::indoc! {r"
@ -216,10 +354,7 @@ mod tests {
            # ///
        "};

-        assert_eq!(
-            super::extract_script_tag(contents.as_bytes()).unwrap(),
-            None
-        );
+        assert_eq!(ScriptTag::parse(contents.as_bytes()).unwrap(), None);
    }

    #[test]
@ -233,10 +368,7 @@ mod tests {
            # ]
        "};

-        assert_eq!(
-            super::extract_script_tag(contents.as_bytes()).unwrap(),
-            None
-        );
+        assert_eq!(ScriptTag::parse(contents.as_bytes()).unwrap(), None);
    }

    #[test]
@ -253,10 +385,7 @@ mod tests {
            #
        "};

-        assert_eq!(
-            super::extract_script_tag(contents.as_bytes()).unwrap(),
-            None
-        );
+        assert_eq!(ScriptTag::parse(contents.as_bytes()).unwrap(), None);
    }

    #[test]
@ -269,9 +398,15 @@ mod tests {
            #   'rich',
            # ]
            # ///
+
+            import requests
+            from rich.pretty import pprint
+
+            resp = requests.get('https://peps.python.org/api/peps.json')
+            data = resp.json()
        "};

-        let expected = indoc::indoc! {r"
+        let expected_metadata = indoc::indoc! {r"
            requires-python = '>=3.11'
            dependencies = [
              'requests<3',
@ -279,13 +414,64 @@ mod tests {
            ]
        "};

-        let actual = super::extract_script_tag(contents.as_bytes())
-            .unwrap()
-            .unwrap();
+        let expected_data = indoc::indoc! {r"

-        assert_eq!(actual, expected);
+            import requests
+            from rich.pretty import pprint
+
+            resp = requests.get('https://peps.python.org/api/peps.json')
+            data = resp.json()
+        "};
+
+        let actual = ScriptTag::parse(contents.as_bytes()).unwrap().unwrap();
+
+        assert_eq!(actual.prelude, String::new());
+        assert_eq!(actual.metadata, expected_metadata);
+        assert_eq!(actual.script, expected_data);
    }

+    #[test]
+    fn simple_with_shebang() {
+        let contents = indoc::indoc! {r"
+            #!/usr/bin/env python3
+            # /// script
+            # requires-python = '>=3.11'
+            # dependencies = [
+            #   'requests<3',
+            #   'rich',
+            # ]
+            # ///
+
+            import requests
+            from rich.pretty import pprint
+
+            resp = requests.get('https://peps.python.org/api/peps.json')
+            data = resp.json()
+        "};
+
+        let expected_metadata = indoc::indoc! {r"
+            requires-python = '>=3.11'
+            dependencies = [
+              'requests<3',
+              'rich',
+            ]
+        "};
+
+        let expected_data = indoc::indoc! {r"
+
+            import requests
+            from rich.pretty import pprint
+
+            resp = requests.get('https://peps.python.org/api/peps.json')
+            data = resp.json()
+        "};
+
+        let actual = ScriptTag::parse(contents.as_bytes()).unwrap().unwrap();
+
+        assert_eq!(actual.prelude, "#!/usr/bin/env python3\n".to_string());
+        assert_eq!(actual.metadata, expected_metadata);
+        assert_eq!(actual.script, expected_data);
+    }
    #[test]
    fn embedded_comment() {
        let contents = indoc::indoc! {r"
@ -310,9 +496,10 @@ mod tests {
            '''
        "};

-        let actual = super::extract_script_tag(contents.as_bytes())
+        let actual = ScriptTag::parse(contents.as_bytes())
            .unwrap()
-            .unwrap();
+            .unwrap()
+            .metadata;

        assert_eq!(actual, expected);
    }
@ -339,10 +526,44 @@ mod tests {
            ]
        "};

-        let actual = super::extract_script_tag(contents.as_bytes())
+        let actual = ScriptTag::parse(contents.as_bytes())
            .unwrap()
-            .unwrap();
+            .unwrap()
+            .metadata;

        assert_eq!(actual, expected);
    }
+
+    #[test]
+    fn test_serialize_metadata_formatting() {
+        let metadata = indoc::indoc! {r"
+            requires-python = '>=3.11'
+            dependencies = [
+              'requests<3',
+              'rich',
+            ]
+        "};
+
+        let expected_output = indoc::indoc! {r"
+            # /// script
+            # requires-python = '>=3.11'
+            # dependencies = [
+            #   'requests<3',
+            #   'rich',
+            # ]
+            # ///
+        "};
+
+        let result = serialize_metadata(metadata);
+        assert_eq!(result, expected_output);
+    }
+
+    #[test]
+    fn test_serialize_metadata_empty() {
+        let metadata = "";
+        let expected_output = "# /// script\n# ///\n";
+
+        let result = serialize_metadata(metadata);
+        assert_eq!(result, expected_output);
+    }
 }