mirror of
https://github.com/python/cpython.git
synced 2025-07-29 06:05:00 +00:00
#2986 Add autojunk parameter to SequenceMatcher to optionally disable 'popular == junk' heuristic.
This commit is contained in:
parent
6c2e0224ff
commit
d2d2ae91c5
4 changed files with 96 additions and 39 deletions
|
@ -37,6 +37,16 @@ diffs. For comparing directories and files, see also, the :mod:`filecmp` module.
|
||||||
complicated way on how many elements the sequences have in common; best case
|
complicated way on how many elements the sequences have in common; best case
|
||||||
time is linear.
|
time is linear.
|
||||||
|
|
||||||
|
**Automatic junk heuristic:** :class:`SequenceMatcher` supports a heuristic that
|
||||||
|
automatically treats certain sequence items as junk. The heuristic counts how many
|
||||||
|
times each individual item appears in the sequence. If an item's duplicates (after
|
||||||
|
the first one) account for more than 1% of the sequence and the sequence is at least
|
||||||
|
200 items long, this item is marked as "popular" and is treated as junk for
|
||||||
|
the purpose of sequence matching. This heuristic can be turned off by setting
|
||||||
|
the ``autojunk`` argument to ``False`` when creating the :class:`SequenceMatcher`.
|
||||||
|
|
||||||
|
.. versionadded:: 2.7
|
||||||
|
The *autojunk* parameter.
|
||||||
|
|
||||||
.. class:: Differ
|
.. class:: Differ
|
||||||
|
|
||||||
|
@ -334,7 +344,7 @@ SequenceMatcher Objects
|
||||||
The :class:`SequenceMatcher` class has this constructor:
|
The :class:`SequenceMatcher` class has this constructor:
|
||||||
|
|
||||||
|
|
||||||
.. class:: SequenceMatcher([isjunk[, a[, b]]])
|
.. class:: SequenceMatcher([isjunk[, a[, b[, autojunk=True]]]])
|
||||||
|
|
||||||
Optional argument *isjunk* must be ``None`` (the default) or a one-argument
|
Optional argument *isjunk* must be ``None`` (the default) or a one-argument
|
||||||
function that takes a sequence element and returns true if and only if the
|
function that takes a sequence element and returns true if and only if the
|
||||||
|
@ -350,6 +360,9 @@ The :class:`SequenceMatcher` class has this constructor:
|
||||||
The optional arguments *a* and *b* are sequences to be compared; both default to
|
The optional arguments *a* and *b* are sequences to be compared; both default to
|
||||||
empty strings. The elements of both sequences must be :term:`hashable`.
|
empty strings. The elements of both sequences must be :term:`hashable`.
|
||||||
|
|
||||||
|
The optional argument *autojunk* can be used to disable the automatic junk
|
||||||
|
heuristic.
|
||||||
|
|
||||||
:class:`SequenceMatcher` objects have the following methods:
|
:class:`SequenceMatcher` objects have the following methods:
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -151,7 +151,7 @@ class SequenceMatcher:
|
||||||
Return an upper bound on ratio() very quickly.
|
Return an upper bound on ratio() very quickly.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, isjunk=None, a='', b=''):
|
def __init__(self, isjunk=None, a='', b='', autojunk=True):
|
||||||
"""Construct a SequenceMatcher.
|
"""Construct a SequenceMatcher.
|
||||||
|
|
||||||
Optional arg isjunk is None (the default), or a one-argument
|
Optional arg isjunk is None (the default), or a one-argument
|
||||||
|
@ -169,6 +169,10 @@ class SequenceMatcher:
|
||||||
Optional arg b is the second of two sequences to be compared. By
|
Optional arg b is the second of two sequences to be compared. By
|
||||||
default, an empty string. The elements of b must be hashable. See
|
default, an empty string. The elements of b must be hashable. See
|
||||||
also .set_seqs() and .set_seq2().
|
also .set_seqs() and .set_seq2().
|
||||||
|
|
||||||
|
Optional arg autojunk should be set to False to disable the
|
||||||
|
"automatic junk heuristic" that treats popular elements as junk
|
||||||
|
(see module documentation for more information).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Members:
|
# Members:
|
||||||
|
@ -207,11 +211,13 @@ class SequenceMatcher:
|
||||||
# DOES NOT WORK for x in a!
|
# DOES NOT WORK for x in a!
|
||||||
# isbpopular
|
# isbpopular
|
||||||
# for x in b, isbpopular(x) is true iff b is reasonably long
|
# for x in b, isbpopular(x) is true iff b is reasonably long
|
||||||
# (at least 200 elements) and x accounts for more than 1% of
|
# (at least 200 elements) and x accounts for more than 1 + 1% of
|
||||||
# its elements. DOES NOT WORK for x in a!
|
# its elements (when autojunk is enabled).
|
||||||
|
# DOES NOT WORK for x in a!
|
||||||
|
|
||||||
self.isjunk = isjunk
|
self.isjunk = isjunk
|
||||||
self.a = self.b = None
|
self.a = self.b = None
|
||||||
|
self.autojunk = autojunk
|
||||||
self.set_seqs(a, b)
|
self.set_seqs(a, b)
|
||||||
|
|
||||||
def set_seqs(self, a, b):
|
def set_seqs(self, a, b):
|
||||||
|
@ -288,7 +294,7 @@ class SequenceMatcher:
|
||||||
# from starting any matching block at a junk element ...
|
# from starting any matching block at a junk element ...
|
||||||
# also creates the fast isbjunk function ...
|
# also creates the fast isbjunk function ...
|
||||||
# b2j also does not contain entries for "popular" elements, meaning
|
# b2j also does not contain entries for "popular" elements, meaning
|
||||||
# elements that account for more than 1% of the total elements, and
|
# elements that account for more than 1 + 1% of the total elements, and
|
||||||
# when the sequence is reasonably large (>= 200 elements); this can
|
# when the sequence is reasonably large (>= 200 elements); this can
|
||||||
# be viewed as an adaptive notion of semi-junk, and yields an enormous
|
# be viewed as an adaptive notion of semi-junk, and yields an enormous
|
||||||
# speedup when, e.g., comparing program files with hundreds of
|
# speedup when, e.g., comparing program files with hundreds of
|
||||||
|
@ -309,44 +315,37 @@ class SequenceMatcher:
|
||||||
# out the junk later is much cheaper than building b2j "right"
|
# out the junk later is much cheaper than building b2j "right"
|
||||||
# from the start.
|
# from the start.
|
||||||
b = self.b
|
b = self.b
|
||||||
n = len(b)
|
|
||||||
self.b2j = b2j = {}
|
self.b2j = b2j = {}
|
||||||
populardict = {}
|
|
||||||
for i, elt in enumerate(b):
|
for i, elt in enumerate(b):
|
||||||
if elt in b2j:
|
indices = b2j.setdefault(elt, [])
|
||||||
indices = b2j[elt]
|
indices.append(i)
|
||||||
if n >= 200 and len(indices) * 100 > n:
|
|
||||||
populardict[elt] = 1
|
|
||||||
del indices[:]
|
|
||||||
else:
|
|
||||||
indices.append(i)
|
|
||||||
else:
|
|
||||||
b2j[elt] = [i]
|
|
||||||
|
|
||||||
# Purge leftover indices for popular elements.
|
# Purge junk elements
|
||||||
for elt in populardict:
|
junk = set()
|
||||||
del b2j[elt]
|
|
||||||
|
|
||||||
# Now b2j.keys() contains elements uniquely, and especially when
|
|
||||||
# the sequence is a string, that's usually a good deal smaller
|
|
||||||
# than len(string). The difference is the number of isjunk calls
|
|
||||||
# saved.
|
|
||||||
isjunk = self.isjunk
|
isjunk = self.isjunk
|
||||||
junkdict = {}
|
|
||||||
if isjunk:
|
if isjunk:
|
||||||
for d in populardict, b2j:
|
for elt in list(b2j.keys()): # using list() since b2j is modified
|
||||||
for elt in d.keys():
|
if isjunk(elt):
|
||||||
if isjunk(elt):
|
junk.add(elt)
|
||||||
junkdict[elt] = 1
|
del b2j[elt]
|
||||||
del d[elt]
|
|
||||||
|
|
||||||
# Now for x in b, isjunk(x) == x in junkdict, but the
|
# Purge popular elements that are not junk
|
||||||
# latter is much faster. Note too that while there may be a
|
popular = set()
|
||||||
# lot of junk in the sequence, the number of *unique* junk
|
n = len(b)
|
||||||
# elements is probably small. So the memory burden of keeping
|
if self.autojunk and n >= 200:
|
||||||
# this dict alive is likely trivial compared to the size of b2j.
|
ntest = n // 100 + 1
|
||||||
self.isbjunk = junkdict.__contains__
|
for elt, idxs in list(b2j.items()):
|
||||||
self.isbpopular = populardict.__contains__
|
if len(idxs) > ntest:
|
||||||
|
popular.add(elt)
|
||||||
|
del b2j[elt]
|
||||||
|
|
||||||
|
# Now for x in b, isjunk(x) == x in junk, but the latter is much faster.
|
||||||
|
# Since the number of *unique* junk elements is probably small, the
|
||||||
|
# memory burden of keeping this set alive is likely trivial compared to
|
||||||
|
# the size of b2j.
|
||||||
|
self.isbjunk = junk.__contains__
|
||||||
|
self.isbpopular = popular.__contains__
|
||||||
|
|
||||||
def find_longest_match(self, alo, ahi, blo, bhi):
|
def find_longest_match(self, alo, ahi, blo, bhi):
|
||||||
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
|
"""Find longest matching block in a[alo:ahi] and b[blo:bhi].
|
||||||
|
|
|
@ -4,8 +4,47 @@ import unittest
|
||||||
import doctest
|
import doctest
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
class TestSFbugs(unittest.TestCase):
|
|
||||||
|
|
||||||
|
class TestWithAscii(unittest.TestCase):
|
||||||
|
def test_one_insert(self):
|
||||||
|
sm = difflib.SequenceMatcher(None, 'b' * 100, 'a' + 'b' * 100)
|
||||||
|
self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
|
||||||
|
self.assertEqual(list(sm.get_opcodes()),
|
||||||
|
[ ('insert', 0, 0, 0, 1),
|
||||||
|
('equal', 0, 100, 1, 101)])
|
||||||
|
sm = difflib.SequenceMatcher(None, 'b' * 100, 'b' * 50 + 'a' + 'b' * 50)
|
||||||
|
self.assertAlmostEqual(sm.ratio(), 0.995, places=3)
|
||||||
|
self.assertEqual(list(sm.get_opcodes()),
|
||||||
|
[ ('equal', 0, 50, 0, 50),
|
||||||
|
('insert', 50, 50, 50, 51),
|
||||||
|
('equal', 50, 100, 51, 101)])
|
||||||
|
|
||||||
|
def test_one_delete(self):
|
||||||
|
sm = difflib.SequenceMatcher(None, 'a' * 40 + 'c' + 'b' * 40, 'a' * 40 + 'b' * 40)
|
||||||
|
self.assertAlmostEqual(sm.ratio(), 0.994, places=3)
|
||||||
|
self.assertEqual(list(sm.get_opcodes()),
|
||||||
|
[ ('equal', 0, 40, 0, 40),
|
||||||
|
('delete', 40, 41, 40, 40),
|
||||||
|
('equal', 41, 81, 40, 80)])
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutojunk(unittest.TestCase):
|
||||||
|
"""Tests for the autojunk parameter added in 2.7"""
|
||||||
|
def test_one_insert_homogenous_sequence(self):
|
||||||
|
# By default autojunk=True and the heuristic kicks in for a sequence
|
||||||
|
# of length 200+
|
||||||
|
seq1 = 'b' * 200
|
||||||
|
seq2 = 'a' + 'b' * 200
|
||||||
|
|
||||||
|
sm = difflib.SequenceMatcher(None, seq1, seq2)
|
||||||
|
self.assertAlmostEqual(sm.ratio(), 0, places=3)
|
||||||
|
|
||||||
|
# Now turn the heuristic off
|
||||||
|
sm = difflib.SequenceMatcher(None, seq1, seq2, autojunk=False)
|
||||||
|
self.assertAlmostEqual(sm.ratio(), 0.9975, places=3)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSFbugs(unittest.TestCase):
|
||||||
def test_ratio_for_null_seqn(self):
|
def test_ratio_for_null_seqn(self):
|
||||||
# Check clearing of SF bug 763023
|
# Check clearing of SF bug 763023
|
||||||
s = difflib.SequenceMatcher(None, [], [])
|
s = difflib.SequenceMatcher(None, [], [])
|
||||||
|
@ -184,7 +223,9 @@ class TestOutputFormat(unittest.TestCase):
|
||||||
def test_main():
|
def test_main():
|
||||||
difflib.HtmlDiff._default_prefix = 0
|
difflib.HtmlDiff._default_prefix = 0
|
||||||
Doctests = doctest.DocTestSuite(difflib)
|
Doctests = doctest.DocTestSuite(difflib)
|
||||||
run_unittest(TestSFpatches, TestSFbugs, TestOutputFormat, Doctests)
|
run_unittest(
|
||||||
|
TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
|
||||||
|
TestOutputFormat, Doctests)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
test_main()
|
test_main()
|
||||||
|
|
|
@ -213,6 +213,10 @@ Library
|
||||||
- Issue #808164: Fixed socket.close to avoid references to globals, to
|
- Issue #808164: Fixed socket.close to avoid references to globals, to
|
||||||
avoid issues when socket.close is called from a __del__ method.
|
avoid issues when socket.close is called from a __del__ method.
|
||||||
|
|
||||||
|
- Issue #2986: difflib.SequenceMatcher gets a new parameter, autojunk, which
|
||||||
|
can be set to False to turn off the previously undocumented 'popularity'
|
||||||
|
heuristic. Patch by Terry Reedy and Eli Bendersky.
|
||||||
|
|
||||||
- Issue #8797: urllib2 does a retry for Basic Authentication failure instead of
|
- Issue #8797: urllib2 does a retry for Basic Authentication failure instead of
|
||||||
falling into recursion.
|
falling into recursion.
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue