mirror of
https://github.com/python/cpython.git
synced 2025-08-22 17:55:18 +00:00
bpo-41972: Use the two-way algorithm for string searching (GH-22904)
Implement an enhanced variant of Crochemore and Perrin's Two-Way string searching algorithm, which reduces worst-case time from quadratic (the product of the string and pattern lengths) to linear. This applies to forward searches (like``find``, ``index``, ``replace``); the algorithm for reverse searches (like ``rfind``) is not changed. Co-authored-by: Tim Peters <tim.peters@gmail.com>
This commit is contained in:
parent
2183d06bc8
commit
73a85c4e1d
4 changed files with 941 additions and 20 deletions
|
@ -5,6 +5,7 @@ Common tests shared by test_unicode, test_userstring and test_bytes.
|
|||
import unittest, string, sys, struct
|
||||
from test import support
|
||||
from collections import UserList
|
||||
import random
|
||||
|
||||
class Sequence:
|
||||
def __init__(self, seq='wxyz'): self.seq = seq
|
||||
|
@ -317,6 +318,44 @@ class BaseTest:
|
|||
else:
|
||||
self.checkraises(TypeError, 'hello', 'rindex', 42)
|
||||
|
||||
def test_find_periodic_pattern(self):
|
||||
"""Cover the special path for periodic patterns."""
|
||||
def reference_find(p, s):
|
||||
for i in range(len(s)):
|
||||
if s.startswith(p, i):
|
||||
return i
|
||||
return -1
|
||||
|
||||
rr = random.randrange
|
||||
choices = random.choices
|
||||
for _ in range(1000):
|
||||
p0 = ''.join(choices('abcde', k=rr(10))) * rr(10, 20)
|
||||
p = p0[:len(p0) - rr(10)] # pop off some characters
|
||||
left = ''.join(choices('abcdef', k=rr(2000)))
|
||||
right = ''.join(choices('abcdef', k=rr(2000)))
|
||||
text = left + p + right
|
||||
with self.subTest(p=p, text=text):
|
||||
self.checkequal(reference_find(p, text),
|
||||
text, 'find', p)
|
||||
|
||||
def test_find_shift_table_overflow(self):
|
||||
"""When the table of 8-bit shifts overflows."""
|
||||
N = 2**8 + 100
|
||||
|
||||
# first check the periodic case
|
||||
# here, the shift for 'b' is N + 1.
|
||||
pattern1 = 'a' * N + 'b' + 'a' * N
|
||||
text1 = 'babbaa' * N + pattern1
|
||||
self.checkequal(len(text1)-len(pattern1),
|
||||
text1, 'find', pattern1)
|
||||
|
||||
# now check the non-periodic case
|
||||
# here, the shift for 'd' is 3*(N+1)+1
|
||||
pattern2 = 'ddd' + 'abc' * N + "eee"
|
||||
text2 = pattern2[:-1] + "ddeede" * 2 * N + pattern2 + "de" * N
|
||||
self.checkequal(len(text2) - N*len("de") - len(pattern2),
|
||||
text2, 'find', pattern2)
|
||||
|
||||
def test_lower(self):
|
||||
self.checkequal('hello', 'HeLLo', 'lower')
|
||||
self.checkequal('hello', 'hello', 'lower')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue