Moved SequenceMatcher from ndiff into new std library module difflib.py.

Guido told me to do this <wink>. Greatly expanded docstrings, and fleshed out with examples. New std test. Added new get_close_matches() function for ESR. Needs docs, but LaTeXification of the module docstring is all it needs. \CVS: ----------------------------------------------------------------------
2025-10-10 00:43:41 +00:00 · 2001-02-10 08:00:53 +00:00 · 2001-02-10 08:00:53 +00:00 · 9ae2148ada
commit 9ae2148ada
parent 6db54c69a4
4 changed files with 1065 additions and 292 deletions
--- a/Tools/scripts/ndiff.py
+++ b/Tools/scripts/ndiff.py
@ -97,6 +97,8 @@ __version__ = 1, 5, 0
 # is sent to stdout.  Or you can call main(args), passing what would
 # have been in sys.argv[1:] had the cmd-line form been used.

+from difflib import SequenceMatcher
+
 import string
 TRACE = 0

@ -111,298 +113,6 @@ def IS_CHARACTER_JUNK(ch, ws=" \t"):

 del re

-class SequenceMatcher:
-    def __init__(self, isjunk=None, a='', b=''):
-        # Members:
-        # a
-        #      first sequence
-        # b
-        #      second sequence; differences are computed as "what do
-        #      we need to do to 'a' to change it into 'b'?"
-        # b2j
-        #      for x in b, b2j[x] is a list of the indices (into b)
-        #      at which x appears; junk elements do not appear
-        # b2jhas
-        #      b2j.has_key
-        # fullbcount
-        #      for x in b, fullbcount[x] == the number of times x
-        #      appears in b; only materialized if really needed (used
-        #      only for computing quick_ratio())
-        # matching_blocks
-        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
-        #      ascending & non-overlapping in i and in j; terminated by
-        #      a dummy (len(a), len(b), 0) sentinel
-        # opcodes
-        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
-        #      one of
-        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
-        #          'delete'    a[i1:i2] should be deleted
-        #          'insert'    b[j1:j2] should be inserted
-        #          'equal'     a[i1:i2] == b[j1:j2]
-        # isjunk
-        #      a user-supplied function taking a sequence element and
-        #      returning true iff the element is "junk" -- this has
-        #      subtle but helpful effects on the algorithm, which I'll
-        #      get around to writing up someday <0.9 wink>.
-        #      DON'T USE!  Only __chain_b uses this.  Use isbjunk.
-        # isbjunk
-        #      for x in b, isbjunk(x) == isjunk(x) but much faster;
-        #      it's really the has_key method of a hidden dict.
-        #      DOES NOT WORK for x in a!
-
-        self.isjunk = isjunk
-        self.a = self.b = None
-        self.set_seqs(a, b)
-
-    def set_seqs(self, a, b):
-        self.set_seq1(a)
-        self.set_seq2(b)
-
-    def set_seq1(self, a):
-        if a is self.a:
-            return
-        self.a = a
-        self.matching_blocks = self.opcodes = None
-
-    def set_seq2(self, b):
-        if b is self.b:
-            return
-        self.b = b
-        self.matching_blocks = self.opcodes = None
-        self.fullbcount = None
-        self.__chain_b()
-
-    # For each element x in b, set b2j[x] to a list of the indices in
-    # b where x appears; the indices are in increasing order; note that
-    # the number of times x appears in b is len(b2j[x]) ...
-    # when self.isjunk is defined, junk elements don't show up in this
-    # map at all, which stops the central find_longest_match method
-    # from starting any matching block at a junk element ...
-    # also creates the fast isbjunk function ...
-    # note that this is only called when b changes; so for cross-product
-    # kinds of matches, it's best to call set_seq2 once, then set_seq1
-    # repeatedly
-
-    def __chain_b(self):
-        # Because isjunk is a user-defined (not C) function, and we test
-        # for junk a LOT, it's important to minimize the number of calls.
-        # Before the tricks described here, __chain_b was by far the most
-        # time-consuming routine in the whole module!  If anyone sees
-        # Jim Roskind, thank him again for profile.py -- I never would
-        # have guessed that.
-        # The first trick is to build b2j ignoring the possibility
-        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
-        # out the junk later is much cheaper than building b2j "right"
-        # from the start.
-        b = self.b
-        self.b2j = b2j = {}
-        self.b2jhas = b2jhas = b2j.has_key
-        for i in xrange(len(b)):
-            elt = b[i]
-            if b2jhas(elt):
-                b2j[elt].append(i)
-            else:
-                b2j[elt] = [i]
-
-        # Now b2j.keys() contains elements uniquely, and especially when
-        # the sequence is a string, that's usually a good deal smaller
-        # than len(string).  The difference is the number of isjunk calls
-        # saved.
-        isjunk, junkdict = self.isjunk, {}
-        if isjunk:
-            for elt in b2j.keys():
-                if isjunk(elt):
-                    junkdict[elt] = 1   # value irrelevant; it's a set
-                    del b2j[elt]
-
-        # Now for x in b, isjunk(x) == junkdict.has_key(x), but the
-        # latter is much faster.  Note too that while there may be a
-        # lot of junk in the sequence, the number of *unique* junk
-        # elements is probably small.  So the memory burden of keeping
-        # this dict alive is likely trivial compared to the size of b2j.
-        self.isbjunk = junkdict.has_key
-
-    def find_longest_match(self, alo, ahi, blo, bhi):
-        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
-
-        If isjunk is not defined:
-
-        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
-            alo <= i <= i+k <= ahi
-            blo <= j <= j+k <= bhi
-        and for all (i',j',k') meeting those conditions,
-            k >= k'
-            i <= i'
-            and if i == i', j <= j'
-        In other words, of all maximal matching blocks, return one
-        that starts earliest in a, and of all those maximal matching
-        blocks that start earliest in a, return the one that starts
-        earliest in b.
-
-        If isjunk is defined, first the longest matching block is
-        determined as above, but with the additional restriction that
-        no junk element appears in the block.  Then that block is
-        extended as far as possible by matching (only) junk elements on
-        both sides.  So the resulting block never matches on junk except
-        as identical junk happens to be adjacent to an "interesting"
-        match.
-
-        If no blocks match, return (alo, blo, 0).
-        """
-
-        # CAUTION:  stripping common prefix or suffix would be incorrect.
-        # E.g.,
-        #    ab
-        #    acab
-        # Longest matching block is "ab", but if common prefix is
-        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
-        # strip, so ends up claiming that ab is changed to acab by
-        # inserting "ca" in the middle.  That's minimal but unintuitive:
-        # "it's obvious" that someone inserted "ac" at the front.
-        # Windiff ends up at the same place as diff, but by pairing up
-        # the unique 'b's and then matching the first two 'a's.
-
-        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk
-        besti, bestj, bestsize = alo, blo, 0
-        # find longest junk-free match
-        # during an iteration of the loop, j2len[j] = length of longest
-        # junk-free match ending with a[i-1] and b[j]
-        j2len = {}
-        nothing = []
-        for i in xrange(alo, ahi):
-            # look at all instances of a[i] in b; note that because
-            # b2j has no junk keys, the loop is skipped if a[i] is junk
-            j2lenget = j2len.get
-            newj2len = {}
-            for j in b2j.get(a[i], nothing):
-                # a[i] matches b[j]
-                if j < blo:
-                    continue
-                if j >= bhi:
-                    break
-                k = newj2len[j] = j2lenget(j-1, 0) + 1
-                if k > bestsize:
-                    besti, bestj, bestsize = i-k+1, j-k+1, k
-            j2len = newj2len
-
-        # Now that we have a wholly interesting match (albeit possibly
-        # empty!), we may as well suck up the matching junk on each
-        # side of it too.  Can't think of a good reason not to, and it
-        # saves post-processing the (possibly considerable) expense of
-        # figuring out what to do with it.  In the case of an empty
-        # interesting match, this is clearly the right thing to do,
-        # because no other kind of match is possible in the regions.
-        while besti > alo and bestj > blo and \
-              isbjunk(b[bestj-1]) and \
-              a[besti-1] == b[bestj-1]:
-            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
-        while besti+bestsize < ahi and bestj+bestsize < bhi and \
-              isbjunk(b[bestj+bestsize]) and \
-              a[besti+bestsize] == b[bestj+bestsize]:
-            bestsize = bestsize + 1
-
-        if TRACE:
-            print "get_matching_blocks", alo, ahi, blo, bhi
-            print "    returns", besti, bestj, bestsize
-        return besti, bestj, bestsize
-
-    def get_matching_blocks(self):
-        if self.matching_blocks is not None:
-            return self.matching_blocks
-        self.matching_blocks = []
-        la, lb = len(self.a), len(self.b)
-        self.__helper(0, la, 0, lb, self.matching_blocks)
-        self.matching_blocks.append( (la, lb, 0) )
-        if TRACE:
-            print '*** matching blocks', self.matching_blocks
-        return self.matching_blocks
-
-    # builds list of matching blocks covering a[alo:ahi] and
-    # b[blo:bhi], appending them in increasing order to answer
-
-    def __helper(self, alo, ahi, blo, bhi, answer):
-        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
-        # a[alo:i] vs b[blo:j] unknown
-        # a[i:i+k] same as b[j:j+k]
-        # a[i+k:ahi] vs b[j+k:bhi] unknown
-        if k:
-            if alo < i and blo < j:
-                self.__helper(alo, i, blo, j, answer)
-            answer.append(x)
-            if i+k < ahi and j+k < bhi:
-                self.__helper(i+k, ahi, j+k, bhi, answer)
-
-    def ratio(self):
-        """Return a measure of the sequences' similarity (float in [0,1]).
-
-        Where T is the total number of elements in both sequences, and
-        M is the number of matches, this is 2*M / T.
-        Note that this is 1 if the sequences are identical, and 0 if
-        they have nothing in common.
-        """
-
-        matches = reduce(lambda sum, triple: sum + triple[-1],
-                         self.get_matching_blocks(), 0)
-        return 2.0 * matches / (len(self.a) + len(self.b))
-
-    def quick_ratio(self):
-        """Return an upper bound on ratio() relatively quickly."""
-        # viewing a and b as multisets, set matches to the cardinality
-        # of their intersection; this counts the number of matches
-        # without regard to order, so is clearly an upper bound
-        if self.fullbcount is None:
-            self.fullbcount = fullbcount = {}
-            for elt in self.b:
-                fullbcount[elt] = fullbcount.get(elt, 0) + 1
-        fullbcount = self.fullbcount
-        # avail[x] is the number of times x appears in 'b' less the
-        # number of times we've seen it in 'a' so far ... kinda
-        avail = {}
-        availhas, matches = avail.has_key, 0
-        for elt in self.a:
-            if availhas(elt):
-                numb = avail[elt]
-            else:
-                numb = fullbcount.get(elt, 0)
-            avail[elt] = numb - 1
-            if numb > 0:
-                matches = matches + 1
-        return 2.0 * matches / (len(self.a) + len(self.b))
-
-    def real_quick_ratio(self):
-        """Return an upper bound on ratio() very quickly"""
-        la, lb = len(self.a), len(self.b)
-        # can't have more matches than the number of elements in the
-        # shorter sequence
-        return 2.0 * min(la, lb) / (la + lb)
-
-    def get_opcodes(self):
-        if self.opcodes is not None:
-            return self.opcodes
-        i = j = 0
-        self.opcodes = answer = []
-        for ai, bj, size in self.get_matching_blocks():
-            # invariant:  we've pumped out correct diffs to change
-            # a[:i] into b[:j], and the next matching block is
-            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
-            # out a diff to change a[i:ai] into b[j:bj], pump out
-            # the matching block, and move (i,j) beyond the match
-            tag = ''
-            if i < ai and j < bj:
-                tag = 'replace'
-            elif i < ai:
-                tag = 'delete'
-            elif j < bj:
-                tag = 'insert'
-            if tag:
-                answer.append( (tag, i, ai, j, bj) )
-            i, j = ai+size, bj+size
-            # the list of matching blocks is terminated by a
-            # sentinel with size 0
-            if size:
-                answer.append( ('equal', ai, i, bj, j) )
-        return answer
-
 # meant for dumping lines
 def dump(tag, x, lo, hi):
    for i in xrange(lo, hi):