mirror of
https://github.com/python/cpython.git
synced 2025-11-25 12:44:13 +00:00
bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)
This commit is contained in:
parent
3751b6b030
commit
4f97d64c83
4 changed files with 145 additions and 1 deletions
|
|
@ -228,6 +228,49 @@ by the SHAKE algorithm.
|
|||
exchange the value safely in email or other non-binary environments.
|
||||
|
||||
|
||||
File hashing
|
||||
------------
|
||||
|
||||
The hashlib module provides a helper function for efficient hashing of
|
||||
a file or file-like object.
|
||||
|
||||
.. function:: file_digest(fileobj, digest, /)
|
||||
|
||||
Return a digest object that has been updated with the contents of the file object.
|
||||
|
||||
*fileobj* must be a file-like object opened for reading in binary mode.
|
||||
It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
|
||||
instances, SocketIO objects from :meth:`socket.socket.makefile`, and
|
||||
similar. The function may bypass Python's I/O and use the file descriptor
|
||||
from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
|
||||
in an unknown state after this function returns or raises. It is up to
|
||||
the caller to close *fileobj*.
|
||||
|
||||
*digest* must either be a hash algorithm name as a *str*, a hash
|
||||
constructor, or a callable that returns a hash object.
|
||||
|
||||
Example:
|
||||
|
||||
>>> import io, hashlib, hmac
|
||||
>>> with open(hashlib.__file__, "rb") as f:
|
||||
... digest = hashlib.file_digest(f, "sha256")
|
||||
...
|
||||
>>> digest.hexdigest() # doctest: +ELLIPSIS
|
||||
'...'
|
||||
|
||||
>>> buf = io.BytesIO(b"somedata")
|
||||
>>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
|
||||
>>> digest = hashlib.file_digest(buf, lambda: mac1)
|
||||
|
||||
>>> digest is mac1
|
||||
True
|
||||
>>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
|
||||
>>> mac1.digest() == mac2.digest()
|
||||
True
|
||||
|
||||
.. versionadded:: 3.11
|
||||
|
||||
|
||||
Key derivation
|
||||
--------------
|
||||
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ algorithms_guaranteed = set(__always_supported)
|
|||
algorithms_available = set(__always_supported)
|
||||
|
||||
__all__ = __always_supported + ('new', 'algorithms_guaranteed',
|
||||
'algorithms_available', 'pbkdf2_hmac')
|
||||
'algorithms_available', 'pbkdf2_hmac', 'file_digest')
|
||||
|
||||
|
||||
__builtin_constructor_cache = {}
|
||||
|
|
@ -254,6 +254,52 @@ except ImportError:
|
|||
pass
|
||||
|
||||
|
||||
def file_digest(fileobj, digest, /, *, _bufsize=2**18):
    """Hash the contents of a file-like object. Returns a digest object.

    *fileobj* must be a file-like object opened for reading in binary mode.
    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
    The function may bypass Python's I/O and use the file descriptor *fileno*
    directly.

    *digest* must either be a hash algorithm name as a *str*, a hash
    constructor, or a callable that returns a hash object.

    Raises ValueError if *fileobj* does not look like a readable binary
    file object, and BlockingIOError if *fileobj* is in non-blocking mode
    and a read would block.
    """
    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
    # hashing with hardware acceleration.
    if isinstance(digest, str):
        digestobj = new(digest)
    else:
        digestobj = digest()

    if hasattr(fileobj, "getbuffer"):
        # io.BytesIO object, use zero-copy buffer
        digestobj.update(fileobj.getbuffer())
        return digestobj

    # Only binary files implement readinto().
    if not (
        hasattr(fileobj, "readinto")
        and hasattr(fileobj, "readable")
        and fileobj.readable()
    ):
        raise ValueError(
            f"'{fileobj!r}' is not a file-like object in binary reading mode."
        )

    # binary file, socket.SocketIO object
    # Note: socket I/O uses different syscalls than file I/O.
    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
    view = memoryview(buf)
    while True:
        size = fileobj.readinto(buf)
        if size is None:
            # Non-blocking stream with no data ready.  Slicing with None
            # would hash the whole (stale) buffer, so fail loudly instead
            # of returning a digest of garbage.
            raise BlockingIOError("I/O operation would block.")
        if size == 0:
            break  # EOF
        digestobj.update(view[:size])

    return digestobj
|
||||
|
||||
|
||||
for __func_name in __always_supported:
|
||||
# try them all, some may not work due to the OpenSSL
|
||||
# version not supporting that algorithm.
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ import array
|
|||
from binascii import unhexlify
|
||||
import hashlib
|
||||
import importlib
|
||||
import io
|
||||
import itertools
|
||||
import os
|
||||
import sys
|
||||
|
|
@ -20,6 +21,7 @@ import warnings
|
|||
from test import support
|
||||
from test.support import _4G, bigmemtest
|
||||
from test.support.import_helper import import_fresh_module
|
||||
from test.support import os_helper
|
||||
from test.support import threading_helper
|
||||
from test.support import warnings_helper
|
||||
from http.client import HTTPException
|
||||
|
|
@ -371,6 +373,31 @@ class HashLibTestCase(unittest.TestCase):
|
|||
if not shake:
|
||||
self.assertEqual(len(digest), m.digest_size)
|
||||
|
||||
if not shake and kwargs.get("key") is None:
|
||||
# skip shake and blake2 extended parameter tests
|
||||
self.check_file_digest(name, data, hexdigest)
|
||||
|
||||
def check_file_digest(self, name, data, hexdigest):
    """Check hashlib.file_digest() output for *data* against *hexdigest*.

    The digest is selected both by *name* and by every entry of
    ``self.constructors_to_test[name]``.  Each selection is run against
    an in-memory io.BytesIO and against a file written to
    ``os_helper.TESTFN``, which is removed afterwards.
    """
    expected = hexdigest.lower()
    candidates = [name, *self.constructors_to_test[name]]

    with open(os_helper.TESTFN, "wb") as out:
        out.write(data)

    try:
        for candidate in candidates:
            # In-memory stream.
            stream = io.BytesIO(data)
            stream.seek(0)
            from_memory = hashlib.file_digest(stream, candidate)
            self.assertEqual(from_memory.hexdigest(), expected)

            # Binary file on disk.
            with open(os_helper.TESTFN, "rb") as src:
                from_disk = hashlib.file_digest(src, candidate)
            self.assertEqual(from_disk.hexdigest(), expected)
    finally:
        os.unlink(os_helper.TESTFN)
|
||||
|
||||
def check_no_unicode(self, algorithm_name):
|
||||
# Unicode objects are not allowed as input.
|
||||
constructors = self.constructors_to_test[algorithm_name]
|
||||
|
|
@ -1117,6 +1144,33 @@ class KDFTests(unittest.TestCase):
|
|||
self.assertNotIn("blake2b512", hashlib.algorithms_available)
|
||||
self.assertNotIn("sha3-512", hashlib.algorithms_available)
|
||||
|
||||
def test_file_digest(self):
    """file_digest() agrees with incremental hashing and rejects bad input."""
    chunk = b'a' * 65536
    reference = hashlib.sha256()
    self.addCleanup(os.unlink, os_helper.TESTFN)
    with open(os_helper.TESTFN, "wb") as out:
        for _ in range(10):
            reference.update(chunk)
            out.write(chunk)

    with open(os_helper.TESTFN, "rb") as src:
        result = hashlib.file_digest(src, hashlib.sha256)

    self.assertEqual(reference.hexdigest(), result.hexdigest())
    self.assertEqual(reference.name, result.name)
    self.assertIs(type(reference), type(result))

    # Not a file object at all.
    with self.assertRaises(ValueError):
        hashlib.file_digest(None, "sha256")

    # Text-mode reader first, then a write-only binary file.
    for mode in ("r", "wb"):
        with self.assertRaises(ValueError):
            with open(os_helper.TESTFN, mode) as f:
                hashlib.file_digest(f, "sha256")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
Add the :func:`hashlib.file_digest` helper for efficient hashing of file objects.
|
||||
Loading…
Add table
Add a link
Reference in a new issue