bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)

This commit is contained in:
Christian Heimes 2022-03-22 11:37:00 +02:00 committed by GitHub
parent 3751b6b030
commit 4f97d64c83
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 145 additions and 1 deletions

View file

@ -228,6 +228,49 @@ by the SHAKE algorithm.
exchange the value safely in email or other non-binary environments.
File hashing
------------
The hashlib module provides a helper function for efficient hashing of
a file or file-like object.
.. function:: file_digest(fileobj, digest, /)
Return a digest object that has been updated with the contents of the file object.
*fileobj* must be a file-like object opened for reading in binary mode.
It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
instances, SocketIO objects from :meth:`socket.socket.makefile`, and
similar. The function may bypass Python's I/O and use the file descriptor
from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
in an unknown state after this function returns or raises. It is up to
the caller to close *fileobj*.
*digest* must either be a hash algorithm name as a *str*, a hash
constructor, or a callable that returns a hash object.
Example:
>>> import io, hashlib, hmac
>>> with open(hashlib.__file__, "rb") as f:
... digest = hashlib.file_digest(f, "sha256")
...
>>> digest.hexdigest() # doctest: +ELLIPSIS
'...'
>>> buf = io.BytesIO(b"somedata")
>>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
>>> digest = hashlib.file_digest(buf, lambda: mac1)
>>> digest is mac1
True
>>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
>>> mac1.digest() == mac2.digest()
True
.. versionadded:: 3.11
Key derivation
--------------

View file

@ -65,7 +65,7 @@ algorithms_guaranteed = set(__always_supported)
algorithms_available = set(__always_supported)
__all__ = __always_supported + ('new', 'algorithms_guaranteed',
'algorithms_available', 'pbkdf2_hmac')
'algorithms_available', 'pbkdf2_hmac', 'file_digest')
__builtin_constructor_cache = {}
@ -254,6 +254,52 @@ except ImportError:
pass
def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file-like object. Returns a digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must either be a hash algorithm name as a *str*, a hash
    constructor, or a callable that returns a hash object.

    *_bufsize* is a private knob: the size in bytes of the reusable read
    buffer used for objects that are consumed through readinto().

    Raises ValueError if *fileobj* does not look like a readable binary
    file (no readinto()/readable(), or readable() is false), and
    BlockingIOError if readinto() reports that a non-blocking read would
    block (i.e. it returns None).
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # Only binary files implement readinto().
    if not (
        hasattr(fileobj, "readinto")
        and hasattr(fileobj, "readable")
        and fileobj.readable()
    ):
        raise ValueError(
            f"'{fileobj!r}' is not a file-like object in binary reading mode."
        )

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # Non-blocking stream with no data available. Without this
            # check view[:None] would feed the whole scratch buffer into
            # the hash and yield a silently wrong digest.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj
for __func_name in __always_supported:
# try them all, some may not work due to the OpenSSL
# version not supporting that algorithm.

View file

@ -10,6 +10,7 @@ import array
from binascii import unhexlify
import hashlib
import importlib
import io
import itertools
import os
import sys
@ -20,6 +21,7 @@ import warnings
from test import support
from test.support import _4G, bigmemtest
from test.support.import_helper import import_fresh_module
from test.support import os_helper
from test.support import threading_helper
from test.support import warnings_helper
from http.client import HTTPException
@ -371,6 +373,31 @@ class HashLibTestCase(unittest.TestCase):
if not shake:
self.assertEqual(len(digest), m.digest_size)
if not shake and kwargs.get("key") is None:
# skip shake and blake2 extended parameter tests
self.check_file_digest(name, data, hexdigest)
def check_file_digest(self, name, data, hexdigest):
    """Check hashlib.file_digest() for algorithm *name* against *hexdigest*.

    *data* is hashed both from an in-memory io.BytesIO and from a file on
    disk, once with the algorithm name and once with each constructor
    listed in self.constructors_to_test[name].
    """
    expected = hexdigest.lower()
    # The algorithm name first, followed by every constructor under test.
    candidates = [name, *self.constructors_to_test[name]]

    with open(os_helper.TESTFN, "wb") as out:
        out.write(data)

    try:
        for candidate in candidates:
            # In-memory source.
            stream = io.BytesIO(data)
            stream.seek(0)
            from_memory = hashlib.file_digest(stream, candidate)
            self.assertEqual(from_memory.hexdigest(), expected)

            # On-disk source.
            with open(os_helper.TESTFN, "rb") as src:
                from_disk = hashlib.file_digest(src, candidate)
            self.assertEqual(from_disk.hexdigest(), expected)
    finally:
        # Always remove the scratch file, even when an assertion fails.
        os.unlink(os_helper.TESTFN)
def check_no_unicode(self, algorithm_name):
# Unicode objects are not allowed as input.
constructors = self.constructors_to_test[algorithm_name]
@ -1117,6 +1144,33 @@ class KDFTests(unittest.TestCase):
self.assertNotIn("blake2b512", hashlib.algorithms_available)
self.assertNotIn("sha3-512", hashlib.algorithms_available)
def test_file_digest(self):
    """Exercise hashlib.file_digest() on a real file and on invalid inputs."""
    block = b'a' * 65536
    reference = hashlib.sha256()
    self.addCleanup(os.unlink, os_helper.TESTFN)

    # Write ten blocks to disk while feeding the same bytes to the
    # reference hash object.
    with open(os_helper.TESTFN, "wb") as out:
        for _ in range(10):
            reference.update(block)
            out.write(block)

    with open(os_helper.TESTFN, "rb") as src:
        computed = hashlib.file_digest(src, hashlib.sha256)

    self.assertEqual(reference.hexdigest(), computed.hexdigest())
    self.assertEqual(reference.name, computed.name)
    self.assertIs(type(reference), type(computed))

    # Not a file object at all.
    with self.assertRaises(ValueError):
        hashlib.file_digest(None, "sha256")

    # File opened in text mode.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "r") as src:
        hashlib.file_digest(src, "sha256")

    # Binary file opened for writing only.
    with self.assertRaises(ValueError), open(os_helper.TESTFN, "wb") as src:
        hashlib.file_digest(src, "sha256")
if __name__ == "__main__":
unittest.main()

View file

@ -0,0 +1 @@
Add :func:`hashlib.file_digest` helper for efficient hashing of a file object.