mirror of
https://github.com/astral-sh/uv.git
synced 2025-09-13 14:06:22 +00:00
add conflict markers to the lock file (#9370)
This PR adds a notion of "conflict markers" to the lock file as an attempt to address #9289. The idea is to encode a new kind of boolean expression indicating how to choose dependencies based on which extras are activated. As an example of what conflict markers look like, consider one of the cases brought up in #9289, where `anyio` had unconditional dependencies on two different versions of `idna`. Now, those are gated by markers, like this: ```toml [[package]] name = "anyio" version = "4.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "idna", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-7-project-foo'" }, { name = "idna", version = "3.6", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-7-project-bar' or extra != 'extra-7-project-foo'" }, { name = "sniffio" }, ] ``` The odd extra values like `extra-7-project-foo` are an encoding of not just the conflicting extra (`foo`) but also the package it's declared for (`project`). We need both bits of information because different packages may have the same extra name, even if they are completely unrelated. The `extra-` part is a prefix to distinguish it from groups (which, in this case, would be encoded as `group-7-project-foo` if `foo` were a dependency group). And the `7` part indicates the length of the package name which makes it possible to parse out the package and extra name from this encoding. (We don't actually utilize that property, but it seems like good sense to do it in case we do need to extract information from these markers.) While this preserves PEP 508 compatibility at a surface level, it does require utilizing this encoding scheme in order to evaluate them when they're present (which only occurs when conflicting extras/groups are declared). My sense is that the most complex part of this change is not just adding conflict markers, but their simplification. 
I tried to address this in the code comments and commit messages. Reviewers should look at this commit-by-commit. Fixes #9289, Fixes #9546, Fixes #9640, Fixes #9622, Fixes #9498, Fixes #9701, Fixes #9734
This commit is contained in:
parent
6fb0d797ed
commit
edf875e306
18 changed files with 5164 additions and 129 deletions
|
@ -1,9 +1,13 @@
|
|||
use petgraph::graph::NodeIndex;
|
||||
use petgraph::graph::{EdgeIndex, NodeIndex};
|
||||
use petgraph::visit::EdgeRef;
|
||||
use petgraph::{Direction, Graph};
|
||||
use rustc_hash::{FxBuildHasher, FxHashMap};
|
||||
use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
|
||||
use std::collections::hash_map::Entry;
|
||||
|
||||
use uv_normalize::{ExtraName, GroupName, PackageName};
|
||||
use uv_pypi_types::{ConflictItem, Conflicts};
|
||||
|
||||
use crate::resolution::ResolutionGraphNode;
|
||||
use crate::universal_marker::UniversalMarker;
|
||||
|
||||
/// Determine the markers under which a package is reachable in the dependency tree.
|
||||
|
@ -79,3 +83,184 @@ pub(crate) fn marker_reachability<T>(
|
|||
|
||||
reachability
|
||||
}
|
||||
|
||||
/// Traverse the given dependency graph and propagate activated markers.
///
/// For example, given an edge like `foo[x1] -> bar`, then it is known that
/// `x1` is activated. This in turn can be used to simplify any downstream
/// conflict markers with `extra == "x1"` in them (by replacing `extra == "x1"`
/// with `true`).
///
/// This mutates the `UniversalMarker` on each edge of `graph` in place; it is
/// a no-op when no conflicts have been declared.
pub(crate) fn simplify_conflict_markers(
    conflicts: &Conflicts,
    graph: &mut Graph<ResolutionGraphNode, UniversalMarker>,
) {
    /// An inference about whether a conflicting item is always included or
    /// excluded.
    ///
    /// We collect these for each node in the graph after determining which
    /// extras/groups are activated for each node. Once we know what's
    /// activated, we can infer what must also be *inactivated* based on what's
    /// conflicting with it. So for example, if we have a conflict marker like
    /// `extra == 'foo' and extra != 'bar'`, and `foo` and `bar` have been
    /// declared as conflicting, and we are in a part of the graph where we
    /// know `foo` must be activated, then it follows that `extra != 'bar'`
    /// must always be true. Because if it were false, it would imply both
    /// `foo` and `bar` were activated simultaneously, which uv guarantees
    /// won't happen.
    ///
    /// We then use these inferences to simplify the conflict markers.
    #[derive(Clone, Debug, Eq, Hash, PartialEq)]
    struct Inference {
        // The extra/group (qualified by its package) this inference is about.
        item: ConflictItem,
        // Whether `item` is known to be activated (`true`) or known to be
        // inactivated (`false`) on every path reaching the node.
        included: bool,
    }

    // Do nothing if there are no declared conflicts. Without any declared
    // conflicts, we know we have no conflict markers and thus nothing to
    // simplify by determining which extras are activated at different points
    // in the dependency graph.
    if conflicts.is_empty() {
        return;
    }

    // The set of activated extras and groups for each node. The ROOT nodes
    // don't have any extras/groups activated.
    //
    // Each node maps to a *list* of sets: one set per distinct combination of
    // activated items observed along some path from a root to that node. A
    // simplification is only sound if it holds for every such combination.
    let mut activated: FxHashMap<NodeIndex, Vec<FxHashSet<ConflictItem>>> = FxHashMap::default();

    // Collect the root nodes.
    //
    // Besides the actual virtual root node, virtual dev dependencies packages are also root
    // nodes since the edges don't cover dev dependencies.
    let mut queue: Vec<_> = graph
        .node_indices()
        .filter(|node_index| {
            // A root node is any node with no incoming edges.
            graph
                .edges_directed(*node_index, Direction::Incoming)
                .next()
                .is_none()
        })
        .collect();

    // Propagate activation sets through the graph to a fixed point: a node is
    // (re)visited whenever it is seen for the first time or its set of
    // activation combinations grows.
    let mut seen: FxHashSet<NodeIndex> = FxHashSet::default();
    while let Some(parent_index) = queue.pop() {
        // If this node is a package extra (e.g. `foo[x1]`), then that extra is
        // activated on every path passing through it.
        if let Some((package, extra)) = graph[parent_index].package_extra_names() {
            for set in activated
                .entry(parent_index)
                .or_insert_with(|| vec![FxHashSet::default()])
            {
                set.insert(ConflictItem::from((package.clone(), extra.clone())));
            }
        }
        // Likewise for a dependency group node.
        if let Some((package, group)) = graph[parent_index].package_group_names() {
            for set in activated
                .entry(parent_index)
                .or_insert_with(|| vec![FxHashSet::default()])
            {
                set.insert(ConflictItem::from((package.clone(), group.clone())));
            }
        }
        // Push each of this node's activation combinations down to its
        // children. Cloned so we can mutate `activated` while iterating edges.
        let sets = activated.get(&parent_index).cloned().unwrap_or_default();
        for child_edge in graph.edges_directed(parent_index, Direction::Outgoing) {
            let mut change = false;
            for set in sets.clone() {
                let existing = activated.entry(child_edge.target()).or_default();
                // This is doing a linear scan for testing membership, which
                // is non-ideal. But it's not actually clear that there's a
                // strictly better alternative without a real workload being
                // slow because of this. Namely, we are checking whether the
                // _set_ being inserted is equivalent to an existing set. So
                // instead of, say, `Vec<FxHashSet<ConflictItem>>`, we could
                // have `BTreeSet<BTreeSet<ConflictItem>>`. But this in turn
                // makes mutating the elements in each set (done above) more
                // difficult and likely require more allocations.
                //
                // So if this does result in a perf slowdown on some real
                // work-load, I think the first step would be to re-examine
                // whether we're doing more work than we need to be doing. If
                // we aren't, then we might want a more purpose-built data
                // structure for this.
                if !existing.contains(&set) {
                    existing.push(set);
                    change = true;
                }
            }
            // Re-queue the child if it's new or if its activation
            // combinations changed, so updates keep flowing downstream.
            if seen.insert(child_edge.target()) || change {
                queue.push(child_edge.target());
            }
        }
    }

    // Turn each node's activation combinations into inference sets: every
    // activated item yields an `included: true` inference, and every item that
    // conflicts with an activated item yields an `included: false` inference
    // (since declared conflicts are mutually exclusive).
    let mut inferences: FxHashMap<NodeIndex, Vec<FxHashSet<Inference>>> = FxHashMap::default();
    for (node_id, sets) in activated {
        let mut new_sets = vec![];
        for set in sets {
            let mut new_set = FxHashSet::default();
            for item in set {
                for conflict_set in conflicts.iter() {
                    // Only conflict sets that mention this item tell us
                    // anything about what must be inactivated.
                    if !conflict_set.contains(item.package(), item.as_ref().conflict()) {
                        continue;
                    }
                    for conflict_item in conflict_set.iter() {
                        if conflict_item == &item {
                            continue;
                        }
                        // `item` is activated, so anything declared as
                        // conflicting with it must be excluded.
                        new_set.insert(Inference {
                            item: conflict_item.clone(),
                            included: false,
                        });
                    }
                }
                new_set.insert(Inference {
                    item,
                    included: true,
                });
            }
            new_sets.push(new_set);
        }
        inferences.insert(node_id, new_sets);
    }

    // Finally, apply the inferences: for each edge whose conflict marker is
    // satisfied under *every* inference set of its source node, bake the
    // inferences into the marker (which simplifies it).
    for edge_index in (0..graph.edge_count()).map(EdgeIndex::new) {
        // Edge indices are stable here since no edges are added or removed
        // in this loop, so `edge_endpoints` cannot fail.
        let (from_index, _) = graph.edge_endpoints(edge_index).unwrap();
        let Some(inference_sets) = inferences.get(&from_index) else {
            continue;
        };
        // If not all possible paths (represented by our inferences)
        // satisfy the conflict marker on this edge, then we can't make any
        // simplifications. Namely, because it follows that our inferences
        // aren't always true. Some of them may sometimes be false.
        let all_paths_satisfied = inference_sets.iter().all(|set| {
            // Gather the activated (package, extra) pairs from this
            // inference set; `extra()?` drops group items.
            let extras = set
                .iter()
                .filter_map(|inf| {
                    if !inf.included {
                        return None;
                    }
                    Some((inf.item.package().clone(), inf.item.extra()?.clone()))
                })
                .collect::<Vec<(PackageName, ExtraName)>>();
            // And likewise the activated (package, group) pairs.
            let groups = set
                .iter()
                .filter_map(|inf| {
                    if !inf.included {
                        return None;
                    }
                    Some((inf.item.package().clone(), inf.item.group()?.clone()))
                })
                .collect::<Vec<(PackageName, GroupName)>>();
            graph[edge_index].conflict().evaluate(&extras, &groups)
        });
        if !all_paths_satisfied {
            continue;
        }
        // Safe to simplify: assume each inferred item's inclusion/exclusion
        // in the edge's conflict marker.
        for set in inference_sets {
            for inf in set {
                if inf.included {
                    graph[edge_index].assume_conflict_item(&inf.item);
                } else {
                    graph[edge_index].assume_not_conflict_item(&inf.item);
                }
            }
        }
    }
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue