Generate deterministic ids when formatting notebooks (#9359)

When formatting notebooks, we populate the `id` field for cells that do
not have one. Previously, we generated a UUID v4, which resulted in
non-deterministic formatting. Here, we generate the UUID from a seeded
random number generator instead of using true randomness. For example,
here are the first five ids it would generate:

```
7fb27b94-1602-401d-9154-2211134fc71a
acae54e3-7e7d-407b-bb7b-55eff062a284
9a63283c-baf0-4dbc-ab1f-6479b197f3a8
8dd0d809-2fe7-4a7c-9628-1538738b07e2
72eea511-9410-473a-a328-ad9291626812
```

We also add a check that an id is not present in another cell to prevent
accidental introduction of duplicate ids.

The specification is lax, and we could just use incrementing integers,
e.g. `0`, `1`, ..., but I have a minor preference for retaining the UUID
format. There is some discussion
[here](https://github.com/astral-sh/ruff/pull/9359#discussion_r1439607121)
— I'm happy to go either way though.

Discovered via #9293
This commit is contained in:
Zanie Blue 2024-01-04 09:19:00 -06:00 committed by GitHub
parent 328262bfac
commit aaa00976ae
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 3 deletions

View file

@ -1,4 +1,5 @@
use std::cmp::Ordering;
use std::collections::HashSet;
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
use std::path::Path;
@ -6,10 +7,10 @@ use std::{io, iter};
use itertools::Itertools;
use once_cell::sync::OnceCell;
use rand::{Rng, SeedableRng};
use serde::Serialize;
use serde_json::error::Category;
use thiserror::Error;
use uuid::Uuid;
use ruff_diagnostics::{SourceMap, SourceMarker};
use ruff_source_file::{NewlineWithTrailingNewline, OneIndexed, UniversalNewlineIterator};
@ -145,7 +146,23 @@ impl Notebook {
// Add cell ids to 4.5+ notebooks if they are missing
// https://github.com/astral-sh/ruff/issues/6834
// https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#required-field
// https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#questions
if raw_notebook.nbformat == 4 && raw_notebook.nbformat_minor >= 5 {
// We use an insecure random number generator (seeded with a fixed value) to generate deterministic UUIDs
let mut rng = rand::rngs::StdRng::seed_from_u64(0);
let mut existing_ids = HashSet::new();
for cell in &raw_notebook.cells {
let id = match cell {
Cell::Code(cell) => &cell.id,
Cell::Markdown(cell) => &cell.id,
Cell::Raw(cell) => &cell.id,
};
if let Some(id) = id {
existing_ids.insert(id.clone());
}
}
for cell in &mut raw_notebook.cells {
let id = match cell {
Cell::Code(cell) => &mut cell.id,
@ -153,8 +170,17 @@ impl Notebook {
Cell::Raw(cell) => &mut cell.id,
};
if id.is_none() {
// https://github.com/jupyter/enhancement-proposals/blob/master/62-cell-id/cell-id.md#questions
*id = Some(Uuid::new_v4().to_string());
loop {
let new_id = uuid::Builder::from_random_bytes(rng.gen())
.into_uuid()
.as_simple()
.to_string();
if existing_ids.insert(new_id.clone()) {
*id = Some(new_id);
break;
}
}
}
}
}