gh-119105: Differ.compare is too slow [for degenerate cases] (#119492)

``_fancy_replace()`` is no longer recursive. and a single call does a worst-case linear number of ratio() computations instead of quadratic. This renders toothless a universe of pathological cases. Some inputs may produce different output, but that's rare, and I didn't find a case where the final diff appeared to be of materially worse quality. To the contrary, by refusing to even consider synching on lines "far apart", there was more easy-to-digest locality in the output.
This commit is contained in:
Tim Peters 2024-05-24 22:08:21 -05:00 committed by GitHub
parent 84be5244de
commit de19694cfb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 46 additions and 71 deletions

View file

@ -908,79 +908,52 @@ class Differ:
+ abcdefGhijkl
? ^ ^ ^
"""
from operator import ge, gt
# Don't synch up unless the lines have a similarity score of at
# least cutoff; best_ratio tracks the best score seen so far.
# Keep track of all index pairs achieving the best ratio and
# deal with them here. Previously only the smallest pair was
# handled here, and if there are many pairs with the best ratio,
# recursion could grow very deep, and runtime cubic. See:
# Don't synch up unless the lines have a similarity score above
# cutoff. Previously only the smallest pair was handled here,
# and if there are many pairs with the best ratio, recursion
# could grow very deep, and runtime cubic. See:
# https://github.com/python/cpython/issues/119105
best_ratio, cutoff = 0.74, 0.75
#
# Later, more pathological cases prompted removing recursion
# entirely.
cutoff = 0.74999
cruncher = SequenceMatcher(self.charjunk)
eqi, eqj = None, None # 1st indices of equal lines (if any)
# List of index pairs achieving best_ratio. Strictly increasing
# in both index positions.
max_pairs = []
maxi = -1 # `i` index of last pair in max_pairs
# search for the pair that matches best without being identical
# (identical lines must be junk lines, & we don't want to synch
# up on junk -- unless we have to)
crqr = cruncher.real_quick_ratio
cqr = cruncher.quick_ratio
cr = cruncher.ratio
for j in range(blo, bhi):
bj = b[j]
cruncher.set_seq2(bj)
# Find new best, if possible. Else search for the smallest i
# (if any) > maxi that equals the best ratio
search_equal = True
for i in range(alo, ahi):
ai = a[i]
if ai == bj:
if eqi is None:
eqi, eqj = i, j
continue
cruncher.set_seq1(ai)
# computing similarity is expensive, so use the quick
# upper bounds first -- have seen this speed up messy
# compares by a factor of 3.
cmp = ge if search_equal and i > maxi else gt
if (cmp(crqr(), best_ratio)
and cmp(cqr(), best_ratio)
and cmp((ratio := cr()), best_ratio)):
if ratio > best_ratio:
best_ratio = ratio
max_pairs.clear()
else:
assert best_ratio == ratio and search_equal
assert i > maxi
max_pairs.append((i, j))
maxi = i
search_equal = False
if best_ratio < cutoff:
assert not max_pairs
# no non-identical "pretty close" pair
if eqi is None:
# no identical pair either -- treat it as a straight replace
yield from self._plain_replace(a, alo, ahi, b, blo, bhi)
return
# no close pair, but an identical pair -- synch up on that
max_pairs = [(eqi, eqj)]
else:
# there's a close pair, so forget the identical pair (if any)
assert max_pairs
eqi = None
last_i, last_j = alo, blo
for this_i, this_j in max_pairs:
# pump out diffs from before the synch point
yield from self._fancy_helper(a, last_i, this_i,
b, last_j, this_j)
WINDOW = 10
best_i = best_j = None
dump_i, dump_j = alo, blo # smallest indices not yet resolved
for j in range(blo, bhi):
cruncher.set_seq2(b[j])
# Search the corresponding i's within WINDOW for rhe highest
# ratio greater than `cutoff`.
aequiv = alo + (j - blo)
arange = range(max(aequiv - WINDOW, dump_i),
min(aequiv + WINDOW + 1, ahi))
if not arange: # likely exit if `a` is shorter than `b`
break
best_ratio = cutoff
for i in arange:
cruncher.set_seq1(a[i])
# Ordering by cheapest to most expensive ratio is very
# valuable, most often getting out early.
if (crqr() > best_ratio
and cqr() > best_ratio
and cr() > best_ratio):
best_i, best_j, best_ratio = i, j, cr()
if best_i is None:
# found nothing to synch on yet - move to next j
continue
# pump out straight replace from before this synch pair
yield from self._fancy_helper(a, dump_i, best_i,
b, dump_j, best_j)
# do intraline marking on the synch pair
aelt, belt = a[this_i], b[this_j]
if eqi is None:
aelt, belt = a[best_i], b[best_j]
if aelt != belt:
# pump out a '-', '?', '+', '?' quad for the synched lines
atags = btags = ""
cruncher.set_seqs(aelt, belt)
@ -1002,17 +975,18 @@ class Differ:
else:
# the synch pair is identical
yield ' ' + aelt
last_i, last_j = this_i + 1, this_j + 1
dump_i, dump_j = best_i + 1, best_j + 1
best_i = best_j = None
# pump out diffs from after the last synch point
yield from self._fancy_helper(a, last_i, ahi,
b, last_j, bhi)
# pump out straight replace from after the last synch pair
yield from self._fancy_helper(a, dump_i, ahi,
b, dump_j, bhi)
def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
g = []
if alo < ahi:
if blo < bhi:
g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
g = self._plain_replace(a, alo, ahi, b, blo, bhi)
else:
g = self._dump('-', a, alo, ahi)
elif blo < bhi: