Make difflib.ndiff() and difflib.Differ.compare() generators.

This restores the 2.1 ability of Tools/scripts/ndiff.py to start producing output before the entire comparison is complete.
2025-10-07 07:31:46 +00:00 · 2001-09-22 21:30:22 +00:00 · 2001-09-22 21:30:22 +00:00 · 8a9c284437
commit 8a9c284437
parent 380bad1b4e
4 changed files with 84 additions and 70 deletions
--- a/Doc/lib/libdifflib.tex
+++ b/Doc/lib/libdifflib.tex
@ -85,7 +85,7 @@
 \begin{funcdesc}{ndiff}{a, b\optional{, linejunk\optional{,
                 charjunk}}}
  Compare \var{a} and \var{b} (lists of strings); return a
-  \class{Differ}-style delta.
+  \class{Differ}-style delta (a generator generating the delta lines).
  Optional keyword parameters \var{linejunk} and \var{charjunk} are
  for filter functions (or \code{None}):
@ -132,6 +132,7 @@
 \begin{verbatim}
 >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
 ...              'ore\ntree\nemu\n'.splitlines(1))
 >>> diff = list(diff) # materialize the generated delta into a list
 >>> print ''.join(restore(diff, 1)),
 one
 two
@ -459,13 +460,14 @@ The \class{Differ} class has this constructor:
 method:
 \begin{methoddesc}{compare}{a, b}
-  Compare two sequences of lines; return the resulting delta (list).
+  Compare two sequences of lines, and generate the delta (a sequence
  of lines).
  Each sequence must contain individual single-line strings ending
  with newlines. Such sequences can be obtained from the
-  \method{readlines()} method of file-like objects. The list returned
+  \method{readlines()} method of file-like objects.  The delta generated
-  is also made up of newline-terminated strings, and ready to be used
+  also consists of newline-terminated strings, ready to be printed as-is
-  with the \method{writelines()} method of a file-like object.
+  via the \method{writelines()} method of a file-like object.
 \end{methoddesc}
@ -506,7 +508,7 @@ functions to filter out line and character ``junk.''  See the
 Finally, we compare the two:
 \begin{verbatim}
->>> result = d.compare(text1, text2)
+>>> result = list(d.compare(text1, text2))
 \end{verbatim}
 \code{result} is a list of strings, so let's pretty-print it:
--- a/Lib/difflib.py
+++ b/Lib/difflib.py
@ -1,5 +1,7 @@
 #! /usr/bin/env python
 from __future__ import generators
 """
 Module difflib -- helpers for computing deltas between objects.
@ -22,8 +24,6 @@ Class Differ:
 __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
           'Differ']
 TRACE = 0
 class SequenceMatcher:
    """
@ -406,9 +406,6 @@ class SequenceMatcher:
              a[besti+bestsize] == b[bestj+bestsize]:
            bestsize = bestsize + 1
        if TRACE:
            print "get_matching_blocks", alo, ahi, blo, bhi
            print "    returns", besti, bestj, bestsize
        return besti, bestj, bestsize
    def get_matching_blocks(self):
@ -432,8 +429,6 @@ class SequenceMatcher:
        la, lb = len(self.a), len(self.b)
        self.__helper(0, la, 0, lb, self.matching_blocks)
        self.matching_blocks.append( (la, lb, 0) )
        if TRACE:
            print '*** matching blocks', self.matching_blocks
        return self.matching_blocks
    # builds list of matching blocks covering a[alo:ahi] and
@ -694,7 +689,7 @@ class Differ:
    Finally, we compare the two:
-    >>> result = d.compare(text1, text2)
+    >>> result = list(d.compare(text1, text2))
    'result' is a list of strings, so let's pretty-print it:
@ -731,7 +726,7 @@ class Differ:
        Construct a text differencer, with optional filters.
    compare(a, b)
-        Compare two sequences of lines; return the resulting delta (list).
+        Compare two sequences of lines; generate the resulting delta.
    """
    def __init__(self, linejunk=None, charjunk=None):
@ -753,16 +748,15 @@ class Differ:
        self.linejunk = linejunk
        self.charjunk = charjunk
        self.results = []
    def compare(self, a, b):
        r"""
-        Compare two sequences of lines; return the resulting delta (list).
+        Compare two sequences of lines; generate the resulting delta.
        Each sequence must contain individual single-line strings ending with
        newlines. Such sequences can be obtained from the `readlines()` method
-        of file-like objects. The list returned is also made up of
+        of file-like objects.  The delta generated also consists of newline-
-        newline-terminated strings, ready to be used with the `writelines()`
+        terminated strings, ready to be printed as-is via the writelines()
        method of a file-like object.
        Example:
@ -783,34 +777,38 @@ class Differ:
        cruncher = SequenceMatcher(self.linejunk, a, b)
        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
            if tag == 'replace':
-                self._fancy_replace(a, alo, ahi, b, blo, bhi)
+                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            elif tag == 'delete':
-                self._dump('-', a, alo, ahi)
+                g = self._dump('-', a, alo, ahi)
            elif tag == 'insert':
-                self._dump('+', b, blo, bhi)
+                g = self._dump('+', b, blo, bhi)
            elif tag == 'equal':
-                self._dump(' ', a, alo, ahi)
+                g = self._dump(' ', a, alo, ahi)
            else:
                raise ValueError, 'unknown tag ' + `tag`
-        results = self.results
+
-        self.results = []
+            for line in g:
-        return results
+                yield line
    def _dump(self, tag, x, lo, hi):
-        """Store comparison results for a same-tagged range."""
+        """Generate comparison results for a same-tagged range."""
        for i in xrange(lo, hi):
-            self.results.append('%s %s' % (tag, x[i]))
+            yield '%s %s' % (tag, x[i])
    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
        assert alo < ahi and blo < bhi
        # dump the shorter block first -- reduces the burden on short-term
        # memory if the blocks are of very different sizes
        if bhi - blo < ahi - alo:
-            self._dump('+', b, blo, bhi)
+            first  = self._dump('+', b, blo, bhi)
-            self._dump('-', a, alo, ahi)
+            second = self._dump('-', a, alo, ahi)
        else:
-            self._dump('-', a, alo, ahi)
+            first  = self._dump('-', a, alo, ahi)
-            self._dump('+', b, blo, bhi)
+            second = self._dump('+', b, blo, bhi)
        for g in first, second:
            for line in g:
                yield line
    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
        r"""
@ -830,12 +828,6 @@ class Differ:
        ?    ^  ^  ^
        """
        if TRACE:
            self.results.append('*** _fancy_replace %s %s %s %s\n'
                                % (alo, ahi, blo, bhi))
            self._dump('>', a, alo, ahi)
            self._dump('<', b, blo, bhi)
        # don't synch up unless the lines have a similarity score of at
        # least cutoff; best_ratio tracks the best score seen so far
        best_ratio, cutoff = 0.74, 0.75
@ -869,7 +861,8 @@ class Differ:
            # no non-identical "pretty close" pair
            if eqi is None:
                # no identical pair either -- treat it as a straight replace
-                self._plain_replace(a, alo, ahi, b, blo, bhi)
+                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):
                    yield line
                return
            # no close pair, but an identical pair -- synch up on that
            best_i, best_j, best_ratio = eqi, eqj, 1.0
@ -879,14 +872,10 @@ class Differ:
        # a[best_i] very similar to b[best_j]; eqi is None iff they're not
        # identical
        if TRACE:
            self.results.append('*** best_ratio %s %s %s %s\n'
                                % (best_ratio, best_i, best_j))
            self._dump('>', a, best_i, best_i+1)
            self._dump('<', b, best_j, best_j+1)
        # pump out diffs from before the synch point
-        self._fancy_helper(a, alo, best_i, b, blo, best_j)
+        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):
            yield line
        # do intraline marking on the synch pair
        aelt, belt = a[best_i], b[best_j]
@ -908,22 +897,28 @@ class Differ:
                    btags += ' ' * lb
                else:
                    raise ValueError, 'unknown tag ' + `tag`
-            self._qformat(aelt, belt, atags, btags)
+            for line in self._qformat(aelt, belt, atags, btags):
                yield line
        else:
            # the synch pair is identical
-            self.results.append('  ' + aelt)
+            yield '  ' + aelt
        # pump out diffs from after the synch point
-        self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi)
+        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):
            yield line
    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
        g = []
        if alo < ahi:
            if blo < bhi:
-                self._fancy_replace(a, alo, ahi, b, blo, bhi)
+                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
            else:
-                self._dump('-', a, alo, ahi)
+                g = self._dump('-', a, alo, ahi)
        elif blo < bhi:
-            self._dump('+', b, blo, bhi)
+            g = self._dump('+', b, blo, bhi)
        for line in g:
            yield line
    def _qformat(self, aline, bline, atags, btags):
        r"""
@ -949,13 +944,13 @@ class Differ:
        atags = atags[common:].rstrip()
        btags = btags[common:].rstrip()
-        self.results.append("- " + aline)
+        yield "- " + aline
        if atags:
-            self.results.append("? %s%s\n" % ("\t" * common, atags))
+             yield "? %s%s\n" % ("\t" * common, atags)
-        self.results.append("+ " + bline)
+        yield "+ " + bline
        if btags:
-            self.results.append("? %s%s\n" % ("\t" * common, btags))
+            yield "? %s%s\n" % ("\t" * common, btags)
 # With respect to junk, an earlier version of ndiff simply refused to
 # *start* a match with a junk element.  The result was cases like this:
@ -1050,7 +1045,7 @@ def ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):
 def restore(delta, which):
    r"""
-    Return one of the two sequences that generated a delta.
+    Generate one of the two sequences that generated a delta.
    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
    lines originating from file 1 or 2 (parameter `which`), stripping off line
@ -1060,6 +1055,7 @@ def restore(delta, which):
    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),
    ...              'ore\ntree\nemu\n'.splitlines(1))
    >>> diff = list(diff)
    >>> print ''.join(restore(diff, 1)),
    one
    two
@ -1075,11 +1071,9 @@ def restore(delta, which):
        raise ValueError, ('unknown delta choice (must be 1 or 2): %r'
                           % which)
    prefixes = ("  ", tag)
-    results = []
    for line in delta:
        if line[:2] in prefixes:
-            results.append(line[2:])
+            yield line[2:]
-    return results
 def _test():
    import doctest, difflib
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -30,7 +30,7 @@ Core
 - In 2.2a3, __new__ would only see sequential arguments passed to the
  type in a constructor call; __init__ would see both sequential and
-  positional arguments.  This made no sense whatsoever any more, so
+  keyword arguments.  This made no sense whatsoever any more, so
  now both __new__ and __init__ see all arguments.
 - In 2.2a3, hash() applied to an instance of a subclass of str or unicode
@ -54,6 +54,10 @@ Core
 Library
 - difflib.ndiff() and difflib.Differ.compare() are generators now.  This
  restores the ability of Tools/scripts/ndiff.py to start producing output
  before the entire comparison is complete.
 - StringIO.StringIO instances and cStringIO.StringIO instances support
  iteration just like file objects (i.e. their .readline() method is
  called for each iteration until it returns an empty string).
@ -124,10 +128,25 @@ New platforms
 Tests
 - The "classic" standard tests, which work by comparing stdout to
  an expected-output file under Lib/test/output/, no longer stop at
  the first mismatch.  Instead the test is run to completion, and a
  variant of ndiff-style comparison is used to report all differences.
  This is much easier to understand than the previous style of reporting.
 - The unittest-based standard tests now use regrtest's test_main()
  convention, instead of running as a side-effect of merely being
  imported.  This allows these tests to be run in more natural and
  flexible ways as unittests, outside the regrtest framework.
 - regrtest.py is much better integrated with unittest and doctest now,
  especially in regard to reporting errors.
 Windows
 - Large file support now also works for files > 4GB, on filesystems
-  that support it (NTFS under Windows 2000).
+  that support it (NTFS under Windows 2000).  See "What's New in
  Python 2.2a3" for more detail.
 What's New in Python 2.2a3?
--- a/Tools/scripts/ndiff.py
+++ b/Tools/scripts/ndiff.py
@ -73,9 +73,8 @@ def fcompare(f1name, f2name):
    a = f1.readlines(); f1.close()
    b = f2.readlines(); f2.close()
-
+    for line in difflib.ndiff(a, b):
-    diff = difflib.ndiff(a, b)
+        print line,
-    sys.stdout.writelines(diff)
    return 1