Mirror of https://github.com/python/cpython.git, synced 2025-10-07 15:42:02 +00:00
gh-95534: Improve gzip reading speed by 10% (#97664)
Change summary:

+ There is now a `gzip.READ_BUFFER_SIZE` constant of 128 KiB. Other programs that read in 128 KiB chunks include pigz and cat, so this seems to be best practice among well-optimized programs; it is also faster than 8 KiB chunks.
+ A `zlib._ZlibDecompressor` was added. This is `_bz2.BZ2Decompressor` ported to zlib. Since the `zlib.Decompress` object is better for in-memory decompression, `_ZlibDecompressor` is kept private: it only makes sense for file decompression, which the gzip library already implements, so there is no need to bother users with it.
+ `ZlibDecompressor` uses the older CPython arrange_output_buffer functions, as those are faster and more appropriate for this use case.
+ `GzipFile.read` has been optimized. There is no longer an `unconsumed_tail` member to write back to the padded file; the `ZlibDecompressor` handles this itself with an internal buffer. `_add_read_data` has been inlined, as it was just two calls.

EDIT: While adding improvements anyway, I added one more one-liner optimization to the `python -m gzip` application: it previously read chunks of `io.DEFAULT_BUFFER_SIZE` and now uses `READ_BUFFER_SIZE` chunks.
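A note for readers unfamiliar with the decompressor interface being ported: the sketch below shows the consumption pattern using the public `bz2.BZ2Decompressor`, which is the model for `zlib._ZlibDecompressor`. The helper name, chunk size, and output cap are illustrative, not part of this change.

```python
import bz2

def stream_decompress(fp, chunk_size=128 * 1024, out_limit=64 * 1024):
    # The decompressor buffers unconsumed compressed input internally,
    # so the caller reads from the file only when needs_input says the
    # internal buffer is exhausted.
    d = bz2.BZ2Decompressor()
    while not d.eof:
        if d.needs_input:
            data = fp.read(chunk_size)
            if not data:
                raise EOFError("input ended before end-of-stream marker")
            chunk = d.decompress(data, out_limit)
        else:
            # Output from previously fed input is still pending; feed nothing.
            chunk = d.decompress(b"", out_limit)
        if chunk:
            yield chunk
```

With the old `zlib.decompressobj` interface, the caller had to carry `unconsumed_tail` between calls itself; the `_GzipReader` hunks below replace exactly that bookkeeping with this pattern.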
parent bb38b39b33
commit eae7dad402

5 changed files with 850 additions and 80 deletions
Lib/gzip.py (24 changes)
@@ -21,6 +21,8 @@ _COMPRESS_LEVEL_FAST = 1
 _COMPRESS_LEVEL_TRADEOFF = 6
 _COMPRESS_LEVEL_BEST = 9
 
+READ_BUFFER_SIZE = 128 * 1024
+
 
 def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
          encoding=None, errors=None, newline=None):
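The 128 KiB figure is an empirical choice. A rough, illustrative way to see the effect of input chunk size (absolute numbers depend on machine, data, and compression level):

```python
import time
import zlib

# ~16 MiB of moderately compressible data, compressed once up front.
data = zlib.compress(bytes(range(256)) * (64 * 1024), 6)

for chunk_size in (8 * 1024, 128 * 1024):
    d = zlib.decompressobj()
    start = time.perf_counter()
    for i in range(0, len(data), chunk_size):
        d.decompress(data[i:i + chunk_size])
    d.flush()
    print(f"{chunk_size // 1024:>4} KiB chunks: "
          f"{time.perf_counter() - start:.3f}s")
```

This only measures per-call overhead in zlib itself; when reading a real file, larger chunks additionally mean fewer read() calls on the underlying file object.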
@@ -446,7 +448,7 @@ def _read_gzip_header(fp):
 
 class _GzipReader(_compression.DecompressReader):
     def __init__(self, fp):
-        super().__init__(_PaddedFile(fp), zlib.decompressobj,
+        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                          wbits=-zlib.MAX_WBITS)
         # Set flag indicating start of a new member
         self._new_member = True
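This one-line swap works because `_compression.DecompressReader` treats its second argument as a factory: it stores the callable and keyword arguments, then builds a fresh decompressor per stream member, roughly `self._decompressor = self._decomp_factory(**self._decomp_args)`. Both factories accept the same keyword, as the snippet below illustrates (assumes CPython 3.12+, where the private class exists):

```python
import zlib

# Old factory call: returns a zlib.Decompress object.
d_old = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
# New factory call: a private, BZ2Decompressor-style object.
d_new = zlib._ZlibDecompressor(wbits=-zlib.MAX_WBITS)
```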
@@ -494,12 +496,13 @@ class _GzipReader(_compression.DecompressReader):
                 self._new_member = False
 
             # Read a chunk of data from the file
-            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)
+            if self._decompressor.needs_input:
+                buf = self._fp.read(READ_BUFFER_SIZE)
+                uncompress = self._decompressor.decompress(buf, size)
+            else:
+                uncompress = self._decompressor.decompress(b"", size)
 
-            uncompress = self._decompressor.decompress(buf, size)
-            if self._decompressor.unconsumed_tail != b"":
-                self._fp.prepend(self._decompressor.unconsumed_tail)
-            elif self._decompressor.unused_data != b"":
+            if self._decompressor.unused_data != b"":
                 # Prepend the already read bytes to the fileobj so they can
                 # be seen by _read_eof() and _read_gzip_header()
                 self._fp.prepend(self._decompressor.unused_data)
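The `unused_data` push-back matters because a gzip file may contain several members back to back: once one deflate stream ends, any over-read bytes belong to that member's trailer and the next member's header, so they are prepended to the file object for `_read_eof()` and `_read_gzip_header()` to consume. A quick demonstration that concatenated members round-trip:

```python
import gzip

# Two gzip members in one buffer: the reader finishes the first deflate
# stream, pushes the over-read trailer and next header back, then starts
# the second member.
two_members = gzip.compress(b"first") + gzip.compress(b"second")
assert gzip.decompress(two_members) == b"firstsecond"
```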
@@ -510,14 +513,11 @@ class _GzipReader(_compression.DecompressReader):
                 raise EOFError("Compressed file ended before the "
                                "end-of-stream marker was reached")
 
-            self._add_read_data( uncompress )
+            self._crc = zlib.crc32(uncompress, self._crc)
+            self._stream_size += len(uncompress)
             self._pos += len(uncompress)
             return uncompress
 
-    def _add_read_data(self, data):
-        self._crc = zlib.crc32(data, self._crc)
-        self._stream_size = self._stream_size + len(data)
-
     def _read_eof(self):
         # We've read to the end of the file
         # We check that the computed CRC and size of the
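The inlined body relies on `zlib.crc32` supporting incremental updates: feeding the running value back as the second argument yields the same checksum as one call over the whole data, which `_read_eof()` later compares against the stored trailer value. For example:

```python
import zlib

crc = 0
for part in (b"spam", b"eggs"):
    crc = zlib.crc32(part, crc)  # running CRC, updated chunk by chunk
assert crc == zlib.crc32(b"spameggs")  # equals the one-shot CRC
```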
@@ -647,7 +647,7 @@ def main():
             f = builtins.open(arg, "rb")
             g = open(arg + ".gz", "wb")
             while True:
-                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
+                chunk = f.read(READ_BUFFER_SIZE)
                 if not chunk:
                     break
                 g.write(chunk)
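This hunk is in the command-line entry point, so `python -m gzip` now also moves data in 128 KiB chunks. Illustrative usage (file name hypothetical):

```
$ python -m gzip data.txt        # compress: writes data.txt.gz
$ python -m gzip -d data.txt.gz  # decompress: writes data.txt
```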