gh-118750: Asymptotically faster int(string) (#118751)

Asymptotically faster (O(n log n)) str->int for very large strings, leveraging the faster multiplication scheme in the C-coded `_decimal` when available. This is used instead of the current Karatsuba-limited method starting at 2 million digits.

Lots of opportunity remains for fine-tuning. Good targets include changing BYTELIM, and possibly changing the internal output base (from 256 to a higher number of bytes).

Doing this was substantial work, and many of the new lines are actually comments giving correctness proofs. The obvious approaches sticking to integers were too slow to be useful, so this is doing variable-precision decimal floating-point arithmetic. Much faster, but worst-possible rounding errors have to be wholly accounted for, using as little precision as possible.

Special thanks to Serhiy Storchaka for asking many good questions in his code reviews!

Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
Co-authored-by: sstandre <43125375+sstandre@users.noreply.github.com>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Nice Zombies <nineteendo19d0@gmail.com>
This commit is contained in:
Tim Peters 2024-05-18 19:19:57 -05:00 committed by GitHub
parent caf6064a1b
commit ecd8664f11
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 479 additions and 33 deletions

View file

@ -919,5 +919,84 @@ class PyLongModuleTests(unittest.TestCase):
self.assertEqual(n, int(sn))
bits <<= 1
@support.requires_resource('cpu')
def test_pylong_roundtrip_huge(self):
    # Build one 10-million-digit value two ways -- arithmetically and as
    # text -- then check that int() and str() each map one onto the other.
    reps = 1_000_000  # blocks of "1234567890": 10 million digits in all
    radix = 10**10
    # (radix**reps - 1) // (radix - 1) is the sum of radix**i for
    # i < reps, i.e. a 1 in the low digit of every 10-digit block.
    # Scaling it by 1234567890 stamps that pattern into each block.
    ones_per_block = (radix**reps - 1) // (radix - 1)
    value = 1234567890 * ones_per_block
    text = "1234567890" * reps
    self.assertEqual(value, int(text))
    self.assertEqual(text, str(value))
@support.requires_resource('cpu')
@unittest.skipUnless(_pylong, "_pylong module required")
def test_whitebox_dec_str_to_int_inner_failsafe(self):
    # The function under test is believed to always carry enough GUARD
    # digits that at most one correction step is needed, but it also has
    # a "failsafe" path in case that belief is wrong. No known input
    # reaches that path with the default settings, so this test forces
    # it: a contrived all-nines input combined with GUARD reduced to 1.
    digits = "9" * 2000156
    expected = 10**len(digits) - 1
    # _pylong._spread is module-level state; snapshot it so the test
    # starts from an empty mapping and leaves no trace afterwards.
    saved_spread = _pylong._spread.copy()
    _pylong._spread.clear()
    try:
        got = _pylong._dec_str_to_int_inner(digits, GUARD=1)
        self.assertEqual(expected, got)
        # Key 999 showing up in _spread is the evidence that the
        # failsafe block actually ran.
        self.assertIn(999, _pylong._spread)
    finally:
        _pylong._spread.clear()
        _pylong._spread.update(saved_spread)
@unittest.skipUnless(_pylong, "_pylong module required")
def test_whitebox_dec_str_to_int_inner_monster(self):
    # Whitebox check that _dec_str_to_int_inner() raises ValueError when
    # handed a string whose reported length is enormous.
    #
    # I don't think anyone has enough RAM to build a string long enough
    # for this function to complain. So lie about the string length.

    class LyingStr(str):
        # A str subclass whose __len__ reports a huge length regardless
        # of the actual contents.
        def __len__(self):
            return int((1 << 47) / _pylong._LOG_10_BASE_256)

    liar = LyingStr("42")
    # We have to pass the liar directly to the complaining function. If we
    # just try `int(liar)`, earlier layers will replace it with plain old
    # "42".
    # Embedding `len(liar)` into the f-string failed on the WASI testbot
    # (don't know what that is):
    #     OverflowError: cannot fit 'int' into an index-sized integer
    # So a random stab at worming around that: call __len__() directly
    # instead of going through len().
    claimed_len = liar.__len__()
    with self.assertRaisesRegex(
            ValueError,
            f"^cannot convert string of len {claimed_len} to int$"):
        _pylong._dec_str_to_int_inner(liar)
@unittest.skipUnless(_pylong, "_pylong module required")
def test_pylong_compute_powers(self):
    # Basic sanity tests. See end of _pylong.py for manual heavy tests.

    def expected_exponents(width, limit, need_hi):
        # Model a consumer that keeps halving `width` into lo = w >> 1
        # and hi = w - lo until a piece is <= limit, and collect the
        # exponent (hi or lo, per need_hi) needed at each distinct
        # split. Only set membership matters, so an explicit stack
        # stands in for recursion.
        visited = set()
        wanted = set()
        pending = [width]
        while pending:
            cur = pending.pop()
            if cur <= limit or cur in visited:
                continue
            visited.add(cur)
            lo = cur >> 1
            hi = cur - lo
            wanted.add(hi if need_hi else lo)
            pending.append(lo)
            pending.append(hi)
        return wanted

    def check(w, base, limit, need_hi):
        # compute_powers() must return exactly the needed exponents,
        # each mapped to the matching power of `base`.
        need = expected_exponents(w, limit, need_hi)
        powers = _pylong.compute_powers(w, base, limit, need_hi=need_hi)
        self.assertEqual(powers.keys(), need)
        for exponent, power in powers.items():
            self.assertEqual(power, base ** exponent)

    for base in (2, 5):
        for need_hi in (False, True):
            for limit in (1, 11):
                for w in range(250, 550):
                    check(w, base, limit, need_hi)
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()