From aaa00976aeab36dcba56d43eed6d8ab295eca1eb Mon Sep 17 00:00:00 2001 From: Zanie Blue Date: Thu, 4 Jan 2024 09:19:00 -0600 Subject: [PATCH] Generate deterministic ids when formatting notebooks (#9359) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When formatting notebooks, we populate the `id` field for cells that do not have one. Previously, we generated a UUID v4 which resulted in non-deterministic formatting. Here, we generate the UUID from a seeded random number generator instead of using true randomness. For example, here are the first five ids it would generate: ``` 7fb27b94-1602-401d-9154-2211134fc71a acae54e3-7e7d-407b-bb7b-55eff062a284 9a63283c-baf0-4dbc-ab1f-6479b197f3a8 8dd0d809-2fe7-4a7c-9628-1538738b07e2 72eea511-9410-473a-a328-ad9291626812 ``` We also add a check that an id is not present in another cell to prevent accidental introduction of duplicate ids. The specification is lax, and we could just use incrementing integers e.g. `0`, `1`, ... but I have a minor preference for retaining the UUID format. Some discussion [here](https://github.com/astral-sh/ruff/pull/9359#discussion_r1439607121) — I'm happy to go either way though. Discovered via #9293 --- Cargo.lock | 1 + crates/ruff_notebook/Cargo.toml | 1 + crates/ruff_notebook/src/notebook.rs | 32 +++++++++++++++++++++++++--- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9e31c4bdf3..ac47fb91e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2259,6 +2259,7 @@ dependencies = [ "insta", "itertools 0.12.0", "once_cell", + "rand", "ruff_diagnostics", "ruff_source_file", "ruff_text_size", diff --git a/crates/ruff_notebook/Cargo.toml b/crates/ruff_notebook/Cargo.toml index 517a233455..524cfb4eb9 100644 --- a/crates/ruff_notebook/Cargo.toml +++ b/crates/ruff_notebook/Cargo.toml @@ -25,6 +25,7 @@ serde_json = { workspace = true } serde_with = { workspace = true, default-features = false, features = ["macros"] } thiserror = { workspace = true } uuid = { workspace = true } +rand = { workspace = true } [dev-dependencies] insta = { workspace = true } diff --git a/crates/ruff_notebook/src/notebook.rs b/crates/ruff_notebook/src/notebook.rs index ddc558ba21..94d5a312c6 100644 --- a/crates/ruff_notebook/src/notebook.rs +++ b/crates/ruff_notebook/src/notebook.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::collections::HashSet; use std::fs::File; use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write}; use std::path::Path; @@ -6,10 +7,10 @@ use std::{io, iter}; use itertools::Itertools; use once_cell::sync::OnceCell; +use rand::{Rng, SeedableRng}; use serde::Serialize; use serde_json::error::Category; use thiserror::Error; -use uuid::Uuid; use ruff_diagnostics::{SourceMap, SourceMarker}; use ruff_source_file::{NewlineWithTrailingNewline, OneIndexed, UniversalNewlineIterator}; @@ -145,7 +146,23 @@ impl Notebook { // Add cell ids to 4.5+ notebooks if they are missing // https://github.com/astral-sh/ruff/issues/6834 // https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#required-field + // https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#questions if raw_notebook.nbformat == 4 && raw_notebook.nbformat_minor >= 5 { + // We use a insecure random number generator to generate deterministic uuids + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let mut existing_ids = HashSet::new(); + + for cell in &raw_notebook.cells { + let id = match cell { + Cell::Code(cell) => &cell.id, + Cell::Markdown(cell) => &cell.id, + Cell::Raw(cell) => &cell.id, + }; + if let Some(id) = id { + existing_ids.insert(id.clone()); + } + } + for cell in &mut raw_notebook.cells { let id = match cell { Cell::Code(cell) => &mut cell.id, @@ -153,8 +170,17 @@ impl Notebook { Cell::Raw(cell) => &mut cell.id, }; if id.is_none() { - // https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#questions - *id = Some(Uuid::new_v4().to_string()); + loop { + let new_id = uuid::Builder::from_random_bytes(rng.gen()) + .into_uuid() + .as_simple() + .to_string(); + + if existing_ids.insert(new_id.clone()) { + *id = Some(new_id); + break; + } + } } } }