mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00

Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> Co-authored-by: Gregory P. Smith <greg@krypto.org> Co-authored-by: Tomas R. <tomas.roun8@gmail.com> Co-authored-by: Rogdham <contact@rogdham.net>
234 lines
8.3 KiB
Python
234 lines
8.3 KiB
Python
"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
|
|
|
|
# Public names of this module, grouped by the module each one comes from.
__all__ = (
    # compression.zstd
    "COMPRESSION_LEVEL_DEFAULT",
    "compress",
    "CompressionParameter",
    "decompress",
    "DecompressionParameter",
    "finalize_dict",
    "get_frame_info",
    "Strategy",
    "train_dict",

    # compression.zstd._zstdfile
    "open",
    "ZstdFile",

    # _zstd
    "get_frame_size",
    "zstd_version",
    "zstd_version_info",
    "ZstdCompressor",
    "ZstdDecompressor",
    "ZstdDict",
    "ZstdError",
)
|
|
|
|
import _zstd
|
|
import enum
|
|
from _zstd import *
|
|
from compression.zstd._zstdfile import ZstdFile, open, _nbytes
|
|
|
|
# Read from the first element of ``_zstd._compressionLevel_values``.
COMPRESSION_LEVEL_DEFAULT = _zstd._compressionLevel_values[0]
"""The default compression level for Zstandard, currently '3'."""
|
|
|
|
|
|
class FrameInfo:
    """Read-only description of a Zstandard frame.

    Attributes:
        decompressed_size: value supplied at construction; as returned by
            get_frame_info() it is the decompressed size in bytes, or None
            when that size is unknown.
        dictionary_id: value supplied at construction; as returned by
            get_frame_info() it is the dictionary ID recorded in the frame
            header (0 when none was recorded).

    Assigning to any attribute after construction raises AttributeError.
    """

    __slots__ = 'decompressed_size', 'dictionary_id'

    def __init__(self, decompressed_size, dictionary_id):
        # __setattr__ below rejects every assignment, so the slots have to
        # be filled through the parent class's implementation.
        store = super().__setattr__
        store('decompressed_size', decompressed_size)
        store('dictionary_id', dictionary_id)

    def __repr__(self):
        size = self.decompressed_size
        dict_id = self.dictionary_id
        return (f'FrameInfo(decompressed_size={size}, '
                f'dictionary_id={dict_id})')

    def __setattr__(self, name, _):
        # Instances are immutable: refuse all attribute assignment.
        message = f"can't set attribute {name!r}"
        raise AttributeError(message)
|
|
|
|
|
|
def get_frame_info(frame_buffer):
    """Return a FrameInfo describing the Zstandard frame in *frame_buffer*.

    *frame_buffer* is a bytes-like object that starts at the beginning of a
    frame and contains at least the whole frame header (6 to 18 bytes).

    The result exposes two attributes:

    'decompressed_size' -- size in bytes of the frame's data once
    decompressed, or None when that size is unknown.

    'dictionary_id' -- a non-negative int below 2**32. The special value 0
    means the dictionary ID was not recorded in the frame header: the frame
    may or may not need a dictionary to be decoded, and the ID of such a
    dictionary is not specified.
    """
    # The extension function returns the positional arguments for FrameInfo.
    fields = _zstd._get_frame_info(frame_buffer)
    return FrameInfo(*fields)
|
|
|
|
|
|
def train_dict(samples, dict_size):
    """Train a Zstandard dictionary on *samples* and return it as a ZstdDict.

    *samples* is an iterable of samples, each one a bytes-like object
    representing a file.

    *dict_size* is the maximum size of the dictionary, in bytes.

    Raises TypeError when *dict_size* is not an int, and ValueError when the
    samples hold no data at all.
    """
    if not isinstance(dict_size, int):
        type_name = type(dict_size).__qualname__
        raise TypeError(f'dict_size must be an int object, not {type_name!r}.')

    # Materialise the iterable once: it is walked twice below.
    sample_seq = tuple(samples)
    joined = b''.join(sample_seq)
    # The per-sample sizes accompany the single concatenated buffer.
    sizes = tuple([_nbytes(item) for item in sample_seq])
    if not joined:
        raise ValueError("samples contained no data; can't train dictionary.")
    return ZstdDict(_zstd._train_dict(joined, sizes, dict_size))
|
|
|
|
|
|
def finalize_dict(zstd_dict, /, samples, dict_size, level):
    """Return a ZstdDict representing a finalized Zstandard dictionary.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize *zstd_dict* by adding headers and statistics according to the
    Zstandard dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict.

    *samples* is an iterable of samples, where a sample is a bytes-like object
    representing a file.
    *dict_size* is the dictionary's maximum size, in bytes.
    *level* is the expected compression level. The statistics for each
    compression level differ, so tuning the dictionary to the compression level
    can provide improvements.

    Raises TypeError when *zstd_dict* is not a ZstdDict or when *dict_size*
    or *level* is not an int, and ValueError when the samples hold no data.
    """

    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Materialise the iterable once: it is walked twice below.
    samples = tuple(samples)
    chunks = b''.join(samples)
    chunk_sizes = tuple(_nbytes(sample) for sample in samples)
    if not chunks:
        # Adjacent string literals are concatenated verbatim, so the first
        # part must end with a space to keep "the dictionary" as two words.
        raise ValueError("The samples are empty content, can't finalize the "
                         "dictionary.")
    dict_content = _zstd._finalize_dict(zstd_dict.dict_content,
                                        chunks, chunk_sizes,
                                        dict_size, level)
    return ZstdDict(dict_content)
|
|
|
|
def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* with Zstandard in one shot and return the bytes.

    *level* is an int giving the compression level; when omitted it defaults
    to COMPRESSION_LEVEL_DEFAULT ('3').
    *options* is a dict object holding advanced compression parameters. See
    CompressionParameter for more on options.
    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.

    Use a ZstdCompressor directly for incremental compression.
    """
    # A fresh compressor is used per call and flushed with FLUSH_FRAME.
    return ZstdCompressor(
        level=level, options=options, zstd_dict=zstd_dict,
    ).compress(data, mode=ZstdCompressor.FLUSH_FRAME)
|
|
|
|
def decompress(data, zstd_dict=None, options=None):
    """Decompress *data*, which holds one or more Zstandard frames.

    The decompressed output of every frame is concatenated and returned.

    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.
    *options* is a dict object holding advanced decompression parameters.
    See DecompressionParameter for more on options.

    Raises ZstdError when the input ends before a frame is complete.

    Use a ZstdDecompressor directly for incremental decompression.
    """
    pieces = []
    remaining = data
    more_frames = True
    # The loop body always runs at least once, even for empty input.
    while more_frames:
        # Each frame gets its own decompressor object.
        decoder = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        pieces.append(decoder.decompress(remaining))
        if not decoder.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        # Whatever follows the finished frame is fed to the next pass.
        remaining = decoder.unused_data
        more_frames = bool(remaining)
    return b"".join(pieces)
|
|
|
|
|
|
class CompressionParameter(enum.IntEnum):
    """Advanced compression parameters.

    Every member takes its value from the matching ``_ZSTD_c_*`` constant of
    the _zstd extension module.
    """

    compression_level = _zstd._ZSTD_c_compressionLevel
    window_log = _zstd._ZSTD_c_windowLog
    hash_log = _zstd._ZSTD_c_hashLog
    chain_log = _zstd._ZSTD_c_chainLog
    search_log = _zstd._ZSTD_c_searchLog
    min_match = _zstd._ZSTD_c_minMatch
    target_length = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    enable_long_distance_matching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldm_hash_log = _zstd._ZSTD_c_ldmHashLog
    ldm_min_match = _zstd._ZSTD_c_ldmMinMatch
    ldm_bucket_size_log = _zstd._ZSTD_c_ldmBucketSizeLog
    ldm_hash_rate_log = _zstd._ZSTD_c_ldmHashRateLog

    content_size_flag = _zstd._ZSTD_c_contentSizeFlag
    checksum_flag = _zstd._ZSTD_c_checksumFlag
    dict_id_flag = _zstd._ZSTD_c_dictIDFlag

    nb_workers = _zstd._ZSTD_c_nbWorkers
    job_size = _zstd._ZSTD_c_jobSize
    overlap_log = _zstd._ZSTD_c_overlapLog

    def bounds(self):
        """Return the (lower, upper) int bounds of this compression parameter.

        Both ends of the range are inclusive.
        """
        limits = _zstd._get_param_bounds(self.value, is_compress=True)
        return limits
|
|
|
|
|
|
class DecompressionParameter(enum.IntEnum):
    """Advanced decompression parameters.

    The member takes its value from the matching ``_ZSTD_d_*`` constant of
    the _zstd extension module.
    """

    window_log_max = _zstd._ZSTD_d_windowLogMax

    def bounds(self):
        """Return the (lower, upper) int bounds of this decompression parameter.

        Both ends of the range are inclusive.
        """
        limits = _zstd._get_param_bounds(self.value, is_compress=False)
        return limits
|
|
|
|
|
|
class Strategy(enum.IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note that new strategies might be added in the future.
    Only the order (from fast to strong) is guaranteed,
    the numeric value might change.
    """

    # Each member takes its value from the matching constant of the _zstd
    # extension module.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2
|
|
|
|
|
|
# Check validity of the CompressionParameter & DecompressionParameter types
# by handing both enum classes to the _zstd extension module at import time.
_zstd._set_parameter_types(CompressionParameter, DecompressionParameter)
|