gh-134635: add zlib.{adler32,crc32}_combine to combine checksums (#134650)

This commit is contained in:
Bénédikt Tran 2025-05-27 10:48:34 +02:00 committed by GitHub
parent 8704d6b391
commit 737b4ba020
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 356 additions and 1 deletions

View file

@ -44,6 +44,20 @@ The available exception and functions in this module are:
.. versionchanged:: 3.0 .. versionchanged:: 3.0
The result is always unsigned. The result is always unsigned.
.. function:: adler32_combine(adler1, adler2, len2, /)
Combine two Adler-32 checksums into one.
Given the Adler-32 checksum *adler1* of a sequence ``A`` and the
Adler-32 checksum *adler2* of a sequence ``B`` of length *len2*,
return the Adler-32 checksum of ``A`` and ``B`` concatenated.
This function is typically useful to combine Adler-32 checksums
that were concurrently computed. To compute checksums sequentially, use
:func:`adler32` with the running checksum as the ``value`` argument.
.. versionadded:: next
.. function:: compress(data, /, level=-1, wbits=MAX_WBITS) .. function:: compress(data, /, level=-1, wbits=MAX_WBITS)
Compresses the bytes in *data*, returning a bytes object containing compressed data. Compresses the bytes in *data*, returning a bytes object containing compressed data.
@ -136,6 +150,20 @@ The available exception and functions in this module are:
.. versionchanged:: 3.0 .. versionchanged:: 3.0
The result is always unsigned. The result is always unsigned.
.. function:: crc32_combine(crc1, crc2, len2, /)
Combine two CRC-32 checksums into one.
Given the CRC-32 checksum *crc1* of a sequence ``A`` and the
CRC-32 checksum *crc2* of a sequence ``B`` of length *len2*,
return the CRC-32 checksum of ``A`` and ``B`` concatenated.
This function is typically useful to combine CRC-32 checksums
that were concurrently computed. To compute checksums sequentially, use
:func:`crc32` with the running checksum as the ``value`` argument.
.. versionadded:: next
.. function:: decompress(data, /, wbits=MAX_WBITS, bufsize=DEF_BUF_SIZE) .. function:: decompress(data, /, wbits=MAX_WBITS, bufsize=DEF_BUF_SIZE)
Decompresses the bytes in *data*, returning a bytes object containing the Decompresses the bytes in *data*, returning a bytes object containing the

View file

@ -97,6 +97,16 @@ ssl
(Contributed by Will Childs-Klein in :gh:`133624`.) (Contributed by Will Childs-Klein in :gh:`133624`.)
zlib
----
* Allow combining two Adler-32 checksums via :func:`~zlib.adler32_combine`.
(Contributed by Callum Attryde and Bénédikt Tran in :gh:`134635`.)
* Allow combining two CRC-32 checksums via :func:`~zlib.crc32_combine`.
(Contributed by Bénédikt Tran in :gh:`134635`.)
.. Add improved modules above alphabetically, not here at the end. .. Add improved modules above alphabetically, not here at the end.
Optimizations Optimizations

View file

@ -119,6 +119,114 @@ class ChecksumTestCase(unittest.TestCase):
self.assertEqual(binascii.crc32(b'spam'), zlib.crc32(b'spam')) self.assertEqual(binascii.crc32(b'spam'), zlib.crc32(b'spam'))
class ChecksumCombineMixin:
    """Checksum-agnostic tests for the ``*_combine`` functions.

    A concrete test case supplies ``default_iv``, ``_checksum()`` and
    ``combine()``; the ``test_*`` methods defined here then exercise
    that particular checksum algorithm.
    """

    # Number of random trials performed by each randomized test.
    N = 1000
    # Initial value used when the caller does not ask for a specific one.
    default_iv: int

    def parse_iv(self, iv):
        """Resolve *iv* to a concrete initial value.

        ``None`` selects ``default_iv``, ``-1`` requests a random value
        drawn from ``[1, 0x80000000]`` and any other value is returned
        unchanged.
        """
        if iv is None:
            return self.default_iv
        return random.randint(1, 0x80000000) if iv == -1 else iv

    def checksum(self, data, init=None):
        """Return the checksum of *data* starting from *init*.

        *init* is first resolved through ``parse_iv()``.
        """
        return self._checksum(data, self.parse_iv(init))

    def _checksum(self, data, init):
        """Algorithm hook: checksum *data* from the initial value *init*."""
        raise NotImplementedError

    def combine(self, a, b, blen):
        """Algorithm hook: combine checksums *a* and *b*.

        *blen* is the length of the data that produced *b*.
        """
        raise NotImplementedError

    def get_random_data(self, data_len, *, iv=None):
        """Return ``(data, iv, checksum)`` for *data_len* random bytes.

        The returned *iv* is the resolved initial value that was used to
        compute *checksum*.
        """
        payload = random.randbytes(data_len)
        start = self.parse_iv(iv)
        return payload, start, self.checksum(payload, start)

    def test_combine_empty(self):
        # Checksumming with a custom IV must match combining the bare IV
        # with the default-IV checksum of the same data.
        for _trial in range(self.N):
            data, iv, expected = self.get_random_data(32, iv=-1)
            combined = self.combine(iv, self.checksum(data), len(data))
            self.assertEqual(combined, expected)

    def test_combine_no_iv(self):
        # Combining the checksums of two chunks must match the checksum
        # of their concatenation.
        for _trial in range(self.N):
            head, _, head_chk = self.get_random_data(32)
            tail, _, tail_chk = self.get_random_data(64)
            combined = self.combine(head_chk, tail_chk, len(tail))
            self.assertEqual(combined, self.checksum(head + tail))

    def test_combine_no_iv_invalid_length(self):
        head, _, head_chk = self.get_random_data(32)
        tail, _, tail_chk = self.get_random_data(64)
        expected = self.checksum(head + tail)
        # Any length other than len(tail) must yield a different result.
        for wrong_len in (1, len(head), 48, len(tail) + 1, 191):
            self.assertNotEqual(
                self.combine(head_chk, tail_chk, wrong_len), expected
            )

        # The length has to be an integer.
        with self.assertRaises(TypeError):
            self.combine(0, 0, "len")

    def test_combine_with_iv(self):
        for _trial in range(self.N):
            head, head_iv, head_chk_iv = self.get_random_data(32, iv=-1)
            head_chk_plain = self.checksum(head)
            tail, tail_iv, tail_chk_iv = self.get_random_data(64, iv=-1)
            tail_chk_plain = self.checksum(tail)

            # An IV behaves like the checksum of an (unknown) prefix, so
            # c = COMBINE(CHK(a, iv_a), CHK(b, iv_b)) can be rewritten as:
            #
            #   c = CHK(CHK(b'', iv_a) + CHK(a) + CHK(b'', iv_b) + CHK(b))
            #     = COMBINE(
            #           COMBINE(CHK(b'', iv_a), CHK(a)),
            #           COMBINE(CHK(b'', iv_b), CHK(b)),
            #       )
            #     = COMBINE(COMBINE(iv_a, CHK(a)), COMBINE(iv_b, CHK(b)))
            left = self.combine(head_iv, head_chk_plain, len(head))
            right = self.combine(tail_iv, tail_chk_plain, len(tail))
            expected = self.combine(left, right, len(tail))
            actual = self.combine(head_chk_iv, tail_chk_iv, len(tail))
            self.assertEqual(actual, expected)
class CRC32CombineTestCase(ChecksumCombineMixin, unittest.TestCase):
    """Run the shared combine tests against zlib's CRC-32 functions."""

    # Initial value used by the shared tests when none is requested.
    default_iv = 0

    def _checksum(self, data, init):
        """Return ``zlib.crc32(data, init)``."""
        return zlib.crc32(data, init)

    def combine(self, a, b, blen):
        """Return ``zlib.crc32_combine(a, b, blen)``."""
        return zlib.crc32_combine(a, b, blen)
class Adler32CombineTestCase(ChecksumCombineMixin, unittest.TestCase):
    """Run the shared combine tests against zlib's Adler-32 functions."""

    # Initial value used by the shared tests when none is requested.
    default_iv = 1

    def _checksum(self, data, init):
        """Return ``zlib.adler32(data, init)``."""
        return zlib.adler32(data, init)

    def combine(self, a, b, blen):
        """Return ``zlib.adler32_combine(a, b, blen)``."""
        return zlib.adler32_combine(a, b, blen)
# Issue #10276 - check that inputs >=4 GiB are handled correctly. # Issue #10276 - check that inputs >=4 GiB are handled correctly.
class ChecksumBigBufferTestCase(unittest.TestCase): class ChecksumBigBufferTestCase(unittest.TestCase):

View file

@ -0,0 +1,3 @@
:mod:`zlib`: Allow combining Adler-32 and CRC-32 checksums via
:func:`~zlib.adler32_combine` and :func:`~zlib.crc32_combine`. Patch by
Callum Attryde and Bénédikt Tran.

View file

@ -1044,6 +1044,65 @@ exit:
return return_value; return return_value;
} }
/* Docstring for adler32_combine(); it is attached to the function through
 * the ZLIB_ADLER32_COMBINE_METHODDEF entry defined right after it.
 * NOTE(review): this file ends with an Argument Clinic checksum footer, so
 * confirm that hand-written comments survive regeneration. */
PyDoc_STRVAR(zlib_adler32_combine__doc__,
"adler32_combine($module, adler1, adler2, len2, /)\n"
"--\n"
"\n"
"Combine two Adler-32 checksums into one.\n"
"\n"
" adler1\n"
" Adler-32 checksum for sequence A\n"
" adler2\n"
" Adler-32 checksum for sequence B\n"
" len2\n"
" Length of sequence B\n"
"\n"
"Given the Adler-32 checksum \'adler1\' of a sequence A and the\n"
"Adler-32 checksum \'adler2\' of a sequence B of length \'len2\',\n"
"return the Adler-32 checksum of A and B concatenated.");
/* Method-table entry exposing zlib_adler32_combine() under the name
 * "adler32_combine" with the METH_FASTCALL calling convention. */
#define ZLIB_ADLER32_COMBINE_METHODDEF \
{"adler32_combine", _PyCFunction_CAST(zlib_adler32_combine), METH_FASTCALL, zlib_adler32_combine__doc__},
/* Prototype of the implementation that the wrapper below delegates to once
 * the arguments have been converted. */
static unsigned int
zlib_adler32_combine_impl(PyObject *module, unsigned int adler1,
unsigned int adler2, PyObject *len2);
/* METH_FASTCALL entry point for adler32_combine(adler1, adler2, len2, /).
 *
 * Converts the three positional arguments, calls
 * zlib_adler32_combine_impl() and boxes its result as a Python int.
 * Returns NULL with an exception set on failure. */
static PyObject *
zlib_adler32_combine(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
{
PyObject *return_value = NULL;
unsigned int adler1;
unsigned int adler2;
PyObject *len2;
unsigned int _return_value;
/* Exactly three positional arguments are accepted. */
if (!_PyArg_CheckPositional("adler32_combine", nargs, 3, 3)) {
goto exit;
}
/* (unsigned int)-1 is also a legitimate checksum, so PyErr_Occurred()
 * is what tells a conversion failure apart from that value. */
adler1 = (unsigned int)PyLong_AsUnsignedLongMask(args[0]);
if (adler1 == (unsigned int)-1 && PyErr_Occurred()) {
goto exit;
}
adler2 = (unsigned int)PyLong_AsUnsignedLongMask(args[1]);
if (adler2 == (unsigned int)-1 && PyErr_Occurred()) {
goto exit;
}
/* The length is only type-checked here and handed over as an object;
 * the implementation performs the conversion to a C integer. */
if (!PyLong_Check(args[2])) {
_PyArg_BadArgument("adler32_combine", "argument 3", "int", args[2]);
goto exit;
}
len2 = args[2];
_return_value = zlib_adler32_combine_impl(module, adler1, adler2, len2);
/* The implementation reports failure as (unsigned int)-1 plus a pending
 * exception; without an exception the value is a real result. */
if ((_return_value == (unsigned int)-1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyLong_FromUnsignedLong((unsigned long)_return_value);
exit:
return return_value;
}
PyDoc_STRVAR(zlib_crc32__doc__, PyDoc_STRVAR(zlib_crc32__doc__,
"crc32($module, data, value=0, /)\n" "crc32($module, data, value=0, /)\n"
"--\n" "--\n"
@ -1098,6 +1157,65 @@ exit:
return return_value; return return_value;
} }
/* Docstring for crc32_combine(); it is attached to the function through
 * the ZLIB_CRC32_COMBINE_METHODDEF entry defined right after it.
 * NOTE(review): this file ends with an Argument Clinic checksum footer, so
 * confirm that hand-written comments survive regeneration. */
PyDoc_STRVAR(zlib_crc32_combine__doc__,
"crc32_combine($module, crc1, crc2, len2, /)\n"
"--\n"
"\n"
"Combine two CRC-32 checksums into one.\n"
"\n"
" crc1\n"
" CRC-32 checksum for sequence A\n"
" crc2\n"
" CRC-32 checksum for sequence B\n"
" len2\n"
" Length of sequence B\n"
"\n"
"Given the CRC-32 checksum \'crc1\' of a sequence A and the\n"
"CRC-32 checksum \'crc2\' of a sequence B of length \'len2\',\n"
"return the CRC-32 checksum of A and B concatenated.");
/* Method-table entry exposing zlib_crc32_combine() under the name
 * "crc32_combine" with the METH_FASTCALL calling convention. */
#define ZLIB_CRC32_COMBINE_METHODDEF \
{"crc32_combine", _PyCFunction_CAST(zlib_crc32_combine), METH_FASTCALL, zlib_crc32_combine__doc__},
/* Prototype of the implementation that the wrapper below delegates to once
 * the arguments have been converted. */
static unsigned int
zlib_crc32_combine_impl(PyObject *module, unsigned int crc1,
unsigned int crc2, PyObject *len2);
/* METH_FASTCALL entry point for crc32_combine(crc1, crc2, len2, /).
 *
 * Converts the three positional arguments, calls zlib_crc32_combine_impl()
 * and boxes its result as a Python int.  Returns NULL with an exception
 * set on failure. */
static PyObject *
zlib_crc32_combine(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
{
PyObject *return_value = NULL;
unsigned int crc1;
unsigned int crc2;
PyObject *len2;
unsigned int _return_value;
/* Exactly three positional arguments are accepted. */
if (!_PyArg_CheckPositional("crc32_combine", nargs, 3, 3)) {
goto exit;
}
/* (unsigned int)-1 is also a legitimate checksum, so PyErr_Occurred()
 * is what tells a conversion failure apart from that value. */
crc1 = (unsigned int)PyLong_AsUnsignedLongMask(args[0]);
if (crc1 == (unsigned int)-1 && PyErr_Occurred()) {
goto exit;
}
crc2 = (unsigned int)PyLong_AsUnsignedLongMask(args[1]);
if (crc2 == (unsigned int)-1 && PyErr_Occurred()) {
goto exit;
}
/* The length is only type-checked here and handed over as an object;
 * the implementation performs the conversion to a C integer. */
if (!PyLong_Check(args[2])) {
_PyArg_BadArgument("crc32_combine", "argument 3", "int", args[2]);
goto exit;
}
len2 = args[2];
_return_value = zlib_crc32_combine_impl(module, crc1, crc2, len2);
/* The implementation reports failure as (unsigned int)-1 plus a pending
 * exception; without an exception the value is a real result. */
if ((_return_value == (unsigned int)-1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyLong_FromUnsignedLong((unsigned long)_return_value);
exit:
return return_value;
}
#ifndef ZLIB_COMPRESS_COPY_METHODDEF #ifndef ZLIB_COMPRESS_COPY_METHODDEF
#define ZLIB_COMPRESS_COPY_METHODDEF #define ZLIB_COMPRESS_COPY_METHODDEF
#endif /* !defined(ZLIB_COMPRESS_COPY_METHODDEF) */ #endif /* !defined(ZLIB_COMPRESS_COPY_METHODDEF) */
@ -1121,4 +1239,4 @@ exit:
#ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #ifndef ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF #define ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF
#endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */ #endif /* !defined(ZLIB_DECOMPRESS___DEEPCOPY___METHODDEF) */
/*[clinic end generated code: output=33938c7613a8c1c7 input=a9049054013a1b77]*/ /*[clinic end generated code: output=3f7692eb3b5d5a0c input=a9049054013a1b77]*/

View file

@ -17,6 +17,16 @@
#error "At least zlib version 1.2.2.1 is required" #error "At least zlib version 1.2.2.1 is required"
#endif #endif
/* Select the PyLong_As* converter whose C result type has the same size as
 * off_t.  The *_combine functions use it to turn the Python int length into
 * the offset type passed to zlib.  The first matching branch wins, and the
 * build fails if off_t matches none of the candidate types. */
#if (SIZEOF_OFF_T == SIZEOF_SIZE_T)
# define convert_to_z_off_t PyLong_AsSsize_t
#elif (SIZEOF_OFF_T == SIZEOF_LONG_LONG)
# define convert_to_z_off_t PyLong_AsLongLong
#elif (SIZEOF_OFF_T == SIZEOF_LONG)
# define convert_to_z_off_t PyLong_AsLong
#else
# error off_t does not match either size_t, long, or long long!
#endif
// Blocks output buffer wrappers // Blocks output buffer wrappers
#include "pycore_blocks_output_buffer.h" #include "pycore_blocks_output_buffer.h"
@ -1876,6 +1886,44 @@ zlib_adler32_impl(PyObject *module, Py_buffer *data, unsigned int value)
return PyLong_FromUnsignedLong(value & 0xffffffffU); return PyLong_FromUnsignedLong(value & 0xffffffffU);
} }
/*[clinic input]
zlib.adler32_combine -> unsigned_int
adler1: unsigned_int(bitwise=True)
Adler-32 checksum for sequence A
adler2: unsigned_int(bitwise=True)
Adler-32 checksum for sequence B
len2: object(subclass_of='&PyLong_Type')
Length of sequence B
/
Combine two Adler-32 checksums into one.
Given the Adler-32 checksum 'adler1' of a sequence A and the
Adler-32 checksum 'adler2' of a sequence B of length 'len2',
return the Adler-32 checksum of A and B concatenated.
[clinic start generated code]*/
static unsigned int
zlib_adler32_combine_impl(PyObject *module, unsigned int adler1,
                          unsigned int adler2, PyObject *len2)
/*[clinic end generated code: output=61842cefb16afb1b input=51bb045c95130c6f]*/
{
    /* Combine the Adler-32 checksum 'adler1' of a sequence A with the
     * checksum 'adler2' of a sequence B whose length is 'len2'.
     *
     * Returns the checksum of A + B.  On failure an exception is set and
     * (unsigned int)-1 is returned; because that is also a valid checksum,
     * the caller has to consult PyErr_Occurred() to detect the error.
     *
     * Raises whatever the integer conversion raises when 'len2' does not
     * fit the offset type, and ValueError when 'len2' is negative.
     */
#if defined(Z_WANT64)
    z_off64_t len = convert_to_z_off_t(len2);
#else
    z_off_t len = convert_to_z_off_t(len2);
#endif
    if (len == -1 && PyErr_Occurred()) {
        /* The conversion itself failed; propagate its exception. */
        return (unsigned int)-1;
    }
    if (len < 0) {
        /* A negative length is meaningless; reject it here rather than
         * relying on how the linked zlib happens to treat it. */
        PyErr_SetString(PyExc_ValueError, "len2 must be non-negative");
        return (unsigned int)-1;
    }
    return adler32_combine(adler1, adler2, len);
}
/*[clinic input] /*[clinic input]
zlib.crc32 -> unsigned_int zlib.crc32 -> unsigned_int
@ -1923,13 +1971,50 @@ zlib_crc32_impl(PyObject *module, Py_buffer *data, unsigned int value)
return value; return value;
} }
/*[clinic input]
zlib.crc32_combine -> unsigned_int
crc1: unsigned_int(bitwise=True)
CRC-32 checksum for sequence A
crc2: unsigned_int(bitwise=True)
CRC-32 checksum for sequence B
len2: object(subclass_of='&PyLong_Type')
Length of sequence B
/
Combine two CRC-32 checksums into one.
Given the CRC-32 checksum 'crc1' of a sequence A and the
CRC-32 checksum 'crc2' of a sequence B of length 'len2',
return the CRC-32 checksum of A and B concatenated.
[clinic start generated code]*/
static unsigned int
zlib_crc32_combine_impl(PyObject *module, unsigned int crc1,
                        unsigned int crc2, PyObject *len2)
/*[clinic end generated code: output=c4def907c602e6eb input=9c8a065d9040dc66]*/
{
    /* Combine the CRC-32 checksum 'crc1' of a sequence A with the checksum
     * 'crc2' of a sequence B whose length is 'len2'.
     *
     * Returns the checksum of A + B.  On failure an exception is set and
     * (unsigned int)-1 is returned; because that is also a valid checksum,
     * the caller has to consult PyErr_Occurred() to detect the error.
     *
     * Raises whatever the integer conversion raises when 'len2' does not
     * fit the offset type, and ValueError when 'len2' is negative.
     */
#if defined(Z_WANT64)
    z_off64_t len = convert_to_z_off_t(len2);
#else
    z_off_t len = convert_to_z_off_t(len2);
#endif
    if (len == -1 && PyErr_Occurred()) {
        /* The conversion itself failed; propagate its exception. */
        return (unsigned int)-1;
    }
    if (len < 0) {
        /* A negative length is meaningless; reject it here rather than
         * relying on how the linked zlib happens to treat it. */
        PyErr_SetString(PyExc_ValueError, "len2 must be non-negative");
        return (unsigned int)-1;
    }
    return crc32_combine(crc1, crc2, len);
}
static PyMethodDef zlib_methods[] = static PyMethodDef zlib_methods[] =
{ {
ZLIB_ADLER32_METHODDEF ZLIB_ADLER32_METHODDEF
ZLIB_ADLER32_COMBINE_METHODDEF
ZLIB_COMPRESS_METHODDEF ZLIB_COMPRESS_METHODDEF
ZLIB_COMPRESSOBJ_METHODDEF ZLIB_COMPRESSOBJ_METHODDEF
ZLIB_CRC32_METHODDEF ZLIB_CRC32_METHODDEF
ZLIB_CRC32_COMBINE_METHODDEF
ZLIB_DECOMPRESS_METHODDEF ZLIB_DECOMPRESS_METHODDEF
ZLIB_DECOMPRESSOBJ_METHODDEF ZLIB_DECOMPRESSOBJ_METHODDEF
{NULL, NULL} {NULL, NULL}
@ -1981,14 +2066,17 @@ static PyType_Spec ZlibDecompressor_type_spec = {
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE), .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
.slots = ZlibDecompressor_type_slots, .slots = ZlibDecompressor_type_slots,
}; };
PyDoc_STRVAR(zlib_module_documentation, PyDoc_STRVAR(zlib_module_documentation,
"The functions in this module allow compression and decompression using the\n" "The functions in this module allow compression and decompression using the\n"
"zlib library, which is based on GNU zip.\n" "zlib library, which is based on GNU zip.\n"
"\n" "\n"
"adler32(string[, start]) -- Compute an Adler-32 checksum.\n" "adler32(string[, start]) -- Compute an Adler-32 checksum.\n"
"adler32_combine(adler1, adler2, len2, /) -- Combine two Adler-32 checksums.\n"
"compress(data[, level]) -- Compress data, with compression level 0-9 or -1.\n" "compress(data[, level]) -- Compress data, with compression level 0-9 or -1.\n"
"compressobj([level[, ...]]) -- Return a compressor object.\n" "compressobj([level[, ...]]) -- Return a compressor object.\n"
"crc32(string[, start]) -- Compute a CRC-32 checksum.\n" "crc32(string[, start]) -- Compute a CRC-32 checksum.\n"
"crc32_combine(crc1, crc2, len2, /) -- Combine two CRC-32 checksums.\n"
"decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n" "decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n"
"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n" "decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n"
"\n" "\n"