# Mirror of https://github.com/python/cpython.git
# (synced 2025-07-10 04:45:36 +00:00; 242 lines, 8.6 KiB, Python)
"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
|
|
|
|
# Names exported by ``from compression.zstd import *``, grouped by the
# module that provides each of them.
__all__ = (
    # compression.zstd -- defined in this module
    'COMPRESSION_LEVEL_DEFAULT',
    'compress',
    'CompressionParameter',
    'decompress',
    'DecompressionParameter',
    'finalize_dict',
    'get_frame_info',
    'Strategy',
    'train_dict',

    # compression.zstd._zstdfile -- re-exported from the file wrapper module
    'open',
    'ZstdFile',

    # _zstd -- re-exported from (or, for zstd_version_info, derived from)
    # the C extension module
    'get_frame_size',
    'zstd_version',
    'zstd_version_info',
    'ZstdCompressor',
    'ZstdDecompressor',
    'ZstdDict',
    'ZstdError',
)
|
|
|
|
import _zstd
|
|
import enum
|
|
from _zstd import (ZstdCompressor, ZstdDecompressor, ZstdDict, ZstdError,
|
|
get_frame_size, zstd_version)
|
|
from compression.zstd._zstdfile import ZstdFile, open, _nbytes
|
|
|
|
# _zstd.zstd_version_number packs the library version into one integer as
# (MAJOR * 100 * 100 + MINOR * 100 + RELEASE); split it back into its fields.
zstd_version_info = (
    _zstd.zstd_version_number // (100 * 100),
    _zstd.zstd_version_number // 100 % 100,
    _zstd.zstd_version_number % 100,
)
"""Version number of the runtime zstd library as a tuple of integers."""

COMPRESSION_LEVEL_DEFAULT = _zstd.ZSTD_CLEVEL_DEFAULT
"""The default compression level for Zstandard, currently '3'."""
|
|
|
|
|
|
class FrameInfo:
    """Information about a Zstandard frame.

    Instances are read-only records with exactly two attributes,
    *decompressed_size* and *dictionary_id*.  Any attempt to rebind or
    delete an attribute raises AttributeError.
    """

    __slots__ = 'decompressed_size', 'dictionary_id'

    def __init__(self, decompressed_size, dictionary_id):
        # __setattr__ is overridden below to reject every assignment, so the
        # slots have to be filled through object.__setattr__ instead.
        super().__setattr__('decompressed_size', decompressed_size)
        super().__setattr__('dictionary_id', dictionary_id)

    def __repr__(self):
        return (f'FrameInfo(decompressed_size={self.decompressed_size}, '
                f'dictionary_id={self.dictionary_id})')

    def __setattr__(self, name, _):
        raise AttributeError(f"can't set attribute {name!r}")

    def __delattr__(self, name):
        # Without this, ``del info.dictionary_id`` would empty the slot and
        # leave behind an instance whose repr() raises AttributeError.
        raise AttributeError(f"can't delete attribute {name!r}")
|
|
|
|
|
|
def get_frame_info(frame_buffer):
    """Return a FrameInfo describing the Zstandard frame in *frame_buffer*.

    *frame_buffer* is a bytes-like object.  It must begin at the start of a
    frame and contain at least the complete frame header (6 to 18 bytes).

    The returned FrameInfo object has two attributes:

    'decompressed_size' is the size in bytes of the frame's data once
    decompressed, or None when the decompressed size is unknown.

    'dictionary_id' is an int in the range (0, 2**32).  The special value 0
    means that the dictionary ID was not recorded in the frame header: the
    frame may or may not need a dictionary to be decoded, and the ID of such
    a dictionary is not specified.
    """
    # The C helper yields the positional arguments of FrameInfo, in order.
    header_fields = _zstd.get_frame_info(frame_buffer)
    return FrameInfo(*header_fields)
|
|
|
|
|
|
def train_dict(samples, dict_size):
    """Train a Zstandard dictionary on *samples* and return it as a ZstdDict.

    *samples* is an iterable of samples, each sample being a bytes-like
    object that represents a file.

    *dict_size* is the maximum size of the dictionary, in bytes.

    Raises TypeError if *dict_size* is not an int, and ValueError if the
    samples contain no data at all.
    """
    if not isinstance(dict_size, int):
        type_name = type(dict_size).__qualname__
        raise TypeError(
            f'dict_size must be an int object, not {type_name!r}.'
        )

    # Materialise the iterable up front: it is traversed twice, once for the
    # concatenated payload and once for the per-sample sizes.
    sample_seq = tuple(samples)
    payload = b''.join(sample_seq)
    sizes = tuple(_nbytes(item) for item in sample_seq)
    if not payload:
        raise ValueError("samples contained no data; can't train dictionary.")
    return ZstdDict(_zstd.train_dict(payload, sizes, dict_size))
|
|
|
|
|
|
def finalize_dict(zstd_dict, /, samples, dict_size, level):
    """Finalize *zstd_dict* against *samples* and return the new ZstdDict.

    Starting from custom content used as the basis of a dictionary, plus a
    set of samples, add the headers and statistics required by the Zstandard
    dictionary format.

    This lets you compose effective dictionary content by hand, use it as
    the basis dictionary, and then finalize it with some samples.  The basis
    dictionary may be a "raw content" dictionary.  See *is_raw* in ZstdDict.

    *samples* is an iterable of samples, each sample being a bytes-like
    object that represents a file.
    *dict_size* is the maximum size of the dictionary, in bytes.
    *level* is the expected compression level.  The statistics differ for
    each compression level, so tuning the dictionary to the level it will be
    used with can provide improvements.

    Raises TypeError when an argument has the wrong type, and ValueError
    when the samples contain no data at all.
    """
    # Validate argument types before consuming the samples iterable.
    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Materialise the iterable up front: it is traversed twice, once for the
    # concatenated payload and once for the per-sample sizes.
    sample_seq = tuple(samples)
    payload = b''.join(sample_seq)
    sizes = tuple(_nbytes(item) for item in sample_seq)
    if not payload:
        raise ValueError("The samples are empty content, can't finalize the "
                         "dictionary.")
    finalized = _zstd.finalize_dict(
        zstd_dict.dict_content, payload, sizes, dict_size, level,
    )
    return ZstdDict(finalized)
|
|
|
|
|
|
def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* in one shot and return the Zstandard result as bytes.

    *level* is an int giving the compression level to use; it defaults to
    COMPRESSION_LEVEL_DEFAULT ('3').
    *options* is a dict object that contains advanced compression
    parameters.  See CompressionParameter for more on options.
    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary.
    See the function train_dict for how to train a ZstdDict on sample data.

    For incremental compression, use a ZstdCompressor instead.
    """
    # FLUSH_FRAME is passed so that a single call both consumes *data* and
    # flushes the frame.
    return ZstdCompressor(
        level=level, options=options, zstd_dict=zstd_dict,
    ).compress(data, mode=ZstdCompressor.FLUSH_FRAME)
|
|
|
|
|
|
def decompress(data, zstd_dict=None, options=None):
    """Decompress *data*, which holds one or more Zstandard frames.

    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary.
    See the function train_dict for how to train a ZstdDict on sample data.
    *options* is a dict object that contains advanced decompression
    parameters.  See DecompressionParameter for more on options.

    Raises ZstdError if the input ends before a frame is complete.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    pieces = []
    pending = data
    while True:
        # A fresh decompressor is created for every frame; whatever it leaves
        # in unused_data is fed to the next one.
        frame_decoder = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        pieces.append(frame_decoder.decompress(pending))
        if not frame_decoder.eof:
            raise ZstdError('Compressed data ended before the '
                            'end-of-stream marker was reached')
        pending = frame_decoder.unused_data
        if not pending:
            return b''.join(pieces)
|
|
|
|
|
|
class CompressionParameter(enum.IntEnum):
    """Advanced compression parameters.

    Each member takes its value from the _zstd constant it is assigned
    from (ZSTD_c_*).  See the *options* argument of compress().
    """

    compression_level = _zstd.ZSTD_c_compressionLevel
    window_log = _zstd.ZSTD_c_windowLog
    hash_log = _zstd.ZSTD_c_hashLog
    chain_log = _zstd.ZSTD_c_chainLog
    search_log = _zstd.ZSTD_c_searchLog
    min_match = _zstd.ZSTD_c_minMatch
    target_length = _zstd.ZSTD_c_targetLength
    strategy = _zstd.ZSTD_c_strategy

    # Long-distance matching ("ldm") parameters.
    enable_long_distance_matching = _zstd.ZSTD_c_enableLongDistanceMatching
    ldm_hash_log = _zstd.ZSTD_c_ldmHashLog
    ldm_min_match = _zstd.ZSTD_c_ldmMinMatch
    ldm_bucket_size_log = _zstd.ZSTD_c_ldmBucketSizeLog
    ldm_hash_rate_log = _zstd.ZSTD_c_ldmHashRateLog

    # Flag parameters (presumably controlling what is recorded in the
    # frame -- confirm against the zstd manual).
    content_size_flag = _zstd.ZSTD_c_contentSizeFlag
    checksum_flag = _zstd.ZSTD_c_checksumFlag
    dict_id_flag = _zstd.ZSTD_c_dictIDFlag

    # Worker/job parameters (presumably for multi-threaded compression --
    # confirm against the zstd manual).
    nb_workers = _zstd.ZSTD_c_nbWorkers
    job_size = _zstd.ZSTD_c_jobSize
    overlap_log = _zstd.ZSTD_c_overlapLog

    def bounds(self):
        """Return the (lower, upper) int bounds of a compression parameter.

        Both the lower and upper bounds are inclusive.
        """
        param = self.value
        return _zstd.get_param_bounds(param, is_compress=True)
|
|
|
|
|
|
class DecompressionParameter(enum.IntEnum):
    """Advanced decompression parameters.

    Each member takes its value from the _zstd constant it is assigned
    from (ZSTD_d_*).  See the *options* argument of decompress().
    """

    window_log_max = _zstd.ZSTD_d_windowLogMax

    def bounds(self):
        """Return the (lower, upper) int bounds of a decompression parameter.

        Both the lower and upper bounds are inclusive.
        """
        param = self.value
        return _zstd.get_param_bounds(param, is_compress=False)
|
|
|
|
|
|
class Strategy(enum.IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note that new strategies might be added in the future.
    Only the order (from fast to strong) is guaranteed,
    the numeric value might change.
    """

    # Each member takes its value from the _zstd constant of the same name
    # (ZSTD_<member>), so compare members with one another rather than
    # against hard-coded integers.
    fast = _zstd.ZSTD_fast
    dfast = _zstd.ZSTD_dfast
    greedy = _zstd.ZSTD_greedy
    lazy = _zstd.ZSTD_lazy
    lazy2 = _zstd.ZSTD_lazy2
    btlazy2 = _zstd.ZSTD_btlazy2
    btopt = _zstd.ZSTD_btopt
    btultra = _zstd.ZSTD_btultra
    btultra2 = _zstd.ZSTD_btultra2
|
|
|
|
|
|
# Hand both parameter enum classes to the C extension so it can check the
# validity of the CompressionParameter & DecompressionParameter types.
# This runs at import time and must stay below the two class definitions.
_zstd.set_parameter_types(CompressionParameter, DecompressionParameter)
|