gh-119105: Differ.compare is too slow [for degenerate cases] (#119492)

``_fancy_replace()`` is no longer recursive. and a single call does a worst-case linear number of ratio() computations instead of quadratic. This renders toothless a universe of pathological cases. Some inputs may produce different output, but that's rare, and I didn't find a case where the final diff appeared to be of materially worse quality. To the contrary, by refusing to even consider synching on lines "far apart", there was more easy-to-digest locality in the output.
2025-07-07 19:35:27 +00:00 · 2024-05-24 22:08:21 -05:00 · 2024-05-24 22:08:21 -05:00 · de19694cfb
commit de19694cfb
parent 84be5244de
2 changed files with 46 additions and 71 deletions
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@ -908,79 +908,52 @@ class Differ:
        + abcdefGhijkl
        ?    ^  ^  ^
        """
-        from operator import ge, gt
-        # Don't synch up unless the lines have a similarity score of at
-        # least cutoff; best_ratio tracks the best score seen so far.
-        # Keep track of all index pairs achieving the best ratio and
-        # deal with them here. Previously only the smallest pair was
-        # handled here, and if there are many pairs with the best ratio,
-        # recursion could grow very deep, and runtime cubic. See:
+        # Don't synch up unless the lines have a similarity score above
+        # cutoff. Previously only the smallest pair was handled here,
+        # and if there are many pairs with the best ratio, recursion
+        # could grow very deep, and runtime cubic. See:
        # https://github.com/python/cpython/issues/119105
-        best_ratio, cutoff = 0.74, 0.75
+        #
+        # Later, more pathological cases prompted removing recursion
+        # entirely.
+        cutoff = 0.74999
        cruncher = SequenceMatcher(self.charjunk)
-        eqi, eqj = None, None   # 1st indices of equal lines (if any)
-        # List of index pairs achieving best_ratio. Strictly increasing
-        # in both index positions.
-        max_pairs = []
-        maxi = -1 # `i` index of last pair in max_pairs
-
-        # search for the pair that matches best without being identical
-        # (identical lines must be junk lines, & we don't want to synch
-        # up on junk -- unless we have to)
        crqr = cruncher.real_quick_ratio
        cqr = cruncher.quick_ratio
        cr = cruncher.ratio
-        for j in range(blo, bhi):
-            bj = b[j]
-            cruncher.set_seq2(bj)
-            # Find new best, if possible. Else search for the smallest i
-            # (if any) > maxi that equals the best ratio
-            search_equal = True
-            for i in range(alo, ahi):
-                ai = a[i]
-                if ai == bj:
-                    if eqi is None:
-                        eqi, eqj = i, j
-                    continue
-                cruncher.set_seq1(ai)
-                # computing similarity is expensive, so use the quick
-                # upper bounds first -- have seen this speed up messy
-                # compares by a factor of 3.
-                cmp = ge if search_equal and i > maxi else gt
-                if (cmp(crqr(), best_ratio)
-                      and cmp(cqr(), best_ratio)
-                      and cmp((ratio := cr()), best_ratio)):
-                    if ratio > best_ratio:
-                        best_ratio = ratio
-                        max_pairs.clear()
-                    else:
-                        assert best_ratio == ratio and search_equal
-                        assert i > maxi
-                    max_pairs.append((i, j))
-                    maxi = i
-                    search_equal = False
-        if best_ratio < cutoff:
-            assert not max_pairs
-            # no non-identical "pretty close" pair
-            if eqi is None:
-                # no identical pair either -- treat it as a straight replace
-                yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
-                return
-            # no close pair, but an identical pair -- synch up on that
-            max_pairs = [(eqi, eqj)]
-        else:
-            # there's a close pair, so forget the identical pair (if any)
-            assert max_pairs
-            eqi = None

-        last_i, last_j = alo, blo
-        for this_i, this_j in max_pairs:
-            # pump out diffs from before the synch point
-            yield from self._fancy_helper(a, last_i, this_i,
-                                          b, last_j, this_j)
+        WINDOW = 10
+        best_i = best_j = None
+        dump_i, dump_j = alo, blo # smallest indices not yet resolved
+        for j in range(blo, bhi):
+            cruncher.set_seq2(b[j])
+            # Search the corresponding i's within WINDOW for rhe highest
+            # ratio greater than `cutoff`.
+            aequiv = alo + (j - blo)
+            arange = range(max(aequiv - WINDOW, dump_i),
+                           min(aequiv + WINDOW + 1, ahi))
+            if not arange: # likely exit if `a` is shorter than `b`
+                break
+            best_ratio = cutoff
+            for i in arange:
+                cruncher.set_seq1(a[i])
+                # Ordering by cheapest to most expensive ratio is very
+                # valuable, most often getting out early.
+                if (crqr() > best_ratio
+                      and cqr() > best_ratio
+                      and cr() > best_ratio):
+                    best_i, best_j, best_ratio = i, j, cr()
+
+            if best_i is None:
+                # found nothing to synch on yet - move to next j
+                continue
+
+            # pump out straight replace from before this synch pair
+            yield from self._fancy_helper(a, dump_i, best_i,
+                                          b, dump_j, best_j)
            # do intraline marking on the synch pair
-            aelt, belt = a[this_i], b[this_j]
-            if eqi is None:
+            aelt, belt = a[best_i], b[best_j]
+            if aelt != belt:
                # pump out a '-', '?', '+', '?' quad for the synched lines
                atags = btags = ""
                cruncher.set_seqs(aelt, belt)
@ -1002,17 +975,18 @@ class Differ:
            else:
                # the synch pair is identical
                yield '  ' + aelt
-            last_i, last_j = this_i + 1, this_j + 1
+            dump_i, dump_j = best_i + 1, best_j + 1
+            best_i = best_j = None

-        # pump out diffs from after the last synch point
-        yield from self._fancy_helper(a, last_i, ahi,
-                                      b, last_j, bhi)
+        # pump out straight replace from after the last synch pair
+        yield from self._fancy_helper(a, dump_i, ahi,
+                                      b, dump_j, bhi)

    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
-                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
+                g = self._plain_replace(a, alo, ahi, b, blo, bhi)
            else:
                g = self._dump('-', a, alo, ahi)
        elif blo < bhi: