bpo-41972: Use the two-way algorithm for string searching (GH-22904)

Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed.

Co-authored-by: Tim Peters <tim.peters@gmail.com>
This commit is contained in:
Dennis Sweeney 2021-02-28 13:20:50 -05:00 committed by GitHub
parent 2183d06bc8
commit 73a85c4e1d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 941 additions and 20 deletions

View file

@ -5,6 +5,7 @@ Common tests shared by test_unicode, test_userstring and test_bytes.
import unittest, string, sys, struct
from test import support
from collections import UserList
import random
class Sequence:
def __init__(self, seq='wxyz'): self.seq = seq
@ -317,6 +318,44 @@ class BaseTest:
else:
self.checkraises(TypeError, 'hello', 'rindex', 42)
def test_find_periodic_pattern(self):
"""Cover the special path for periodic patterns."""
def reference_find(p, s):
for i in range(len(s)):
if s.startswith(p, i):
return i
return -1
rr = random.randrange
choices = random.choices
for _ in range(1000):
p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20)
p = p0[:len(p0) - rr(10)] # pop off some characters
left = ''.join(choices('abcdef', k=rr(2000)))
right = ''.join(choices('abcdef', k=rr(2000)))
text = left + p + right
with self.subTest(p=p, text=text):
self.checkequal(reference_find(p, text),
text, 'find', p)
def test_find_shift_table_overflow(self):
"""When the table of 8-bit shifts overflows."""
N = 2**8 + 100
# first check the periodic case
# here, the shift for 'b' is N + 1.
pattern1 = 'a' * N + 'b' + 'a' * N
text1 = 'babbaa' * N + pattern1
self.checkequal(len(text1)-len(pattern1),
text1, 'find', pattern1)
# now check the non-periodic case
# here, the shift for 'd' is 3*(N+1)+1
pattern2 = 'ddd' + 'abc' * N + "eee"
text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N
self.checkequal(len(text2) - N*len("de") - len(pattern2),
text2, 'find', pattern2)
def test_lower(self):
self.checkequal('hello', 'HeLLo', 'lower')
self.checkequal('hello', 'hello', 'lower')