# cpython/Lib/compression/zstd/__init__.py

"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""

# Public names of this package, i.e. what `from compression.zstd import *`
# binds.  Entries are grouped by the module that actually defines each name.
# NOTE(review): FrameInfo (the return type of get_frame_info) is defined in
# this module but is not listed here -- confirm the omission is intentional.
__all__ = (
    # compression.zstd
    'COMPRESSION_LEVEL_DEFAULT',
    'compress',
    'CompressionParameter',
    'decompress',
    'DecompressionParameter',
    'finalize_dict',
    'get_frame_info',
    'Strategy',
    'train_dict',

    # compression.zstd._zstdfile
    'open',
    'ZstdFile',

    # _zstd
    'get_frame_size',
    'zstd_version',
    'zstd_version_info',
    'ZstdCompressor',
    'ZstdDecompressor',
    'ZstdDict',
    'ZstdError',
)

import _zstd
import enum
from _zstd import (ZstdCompressor, ZstdDecompressor, ZstdDict, ZstdError,
                   get_frame_size, zstd_version)
from compression.zstd._zstdfile import ZstdFile, open, _nbytes

# zstd_version_number is (MAJOR * 100 * 100 + MINOR * 100 + RELEASE)
zstd_version_info = (*divmod(_zstd.zstd_version_number // 100, 100),
                     _zstd.zstd_version_number % 100)
"""Version number of the runtime zstd library as a tuple of integers."""

COMPRESSION_LEVEL_DEFAULT = _zstd.ZSTD_CLEVEL_DEFAULT
"""The default compression level for Zstandard, currently '3'."""


class FrameInfo:
    """Information about a Zstandard frame.

    Instances are read-only records: both attributes are stored once by the
    constructor, and any later attempt to rebind or delete an attribute
    raises AttributeError.

    Attributes:
        decompressed_size: the size in bytes of the frame's data once
            decompressed, or None when that size is unknown.
        dictionary_id: the dictionary ID as an int; 0 means no ID was
            recorded in the frame header.
    """

    __slots__ = 'decompressed_size', 'dictionary_id'

    def __init__(self, decompressed_size, dictionary_id):
        # Bypass this class's own __setattr__, which refuses every
        # assignment, by going straight to object.__setattr__.
        super().__setattr__('decompressed_size', decompressed_size)
        super().__setattr__('dictionary_id', dictionary_id)

    def __repr__(self):
        return (f'FrameInfo(decompressed_size={self.decompressed_size}, '
                f'dictionary_id={self.dictionary_id})')

    def __setattr__(self, name, _):
        """Reject attribute assignment; instances are read-only."""
        raise AttributeError(f"can't set attribute {name!r}")

    def __delattr__(self, name):
        """Reject attribute deletion.

        Without this, ``del info.dictionary_id`` would succeed on a slotted
        instance and leave behind an object whose repr() fails.
        """
        raise AttributeError(f"can't delete attribute {name!r}")


def get_frame_info(frame_buffer):
    """Read Zstandard frame information out of a frame header.

    *frame_buffer* is a bytes-like object. It must begin at the start of a
    frame and contain at least the whole frame header (6 to 18 bytes).

    Returns a FrameInfo object with two attributes:

    'decompressed_size' is the size in bytes of the frame's data once
    decompressed, or None when that size is unknown.

    'dictionary_id' is an int satisfying 0 <= dictionary_id < 2**32. The
    special value 0 means that the dictionary ID was not recorded in the
    frame header: the frame may or may not need a dictionary to be decoded,
    and the ID of such a dictionary is not specified.
    """
    # The extension function yields the fields in FrameInfo's positional
    # order, so they are splatted straight into the constructor.
    header_fields = _zstd.get_frame_info(frame_buffer)
    return FrameInfo(*header_fields)


def train_dict(samples, dict_size):
    """Return a ZstdDict representing a trained Zstandard dictionary.

    *samples* is an iterable of samples, where a sample is a bytes-like
    object representing a file.

    *dict_size* is the dictionary's maximum size, in bytes.
    """
    if not isinstance(dict_size, int):
        ds_cls = type(dict_size).__qualname__
        raise TypeError(f'dict_size must be an int object, not {ds_cls!r}.')

    samples = tuple(samples)
    chunks = b''.join(samples)
    chunk_sizes = tuple(_nbytes(sample) for sample in samples)
    if not chunks:
        raise ValueError("samples contained no data; can't train dictionary.")
    dict_content = _zstd.train_dict(chunks, chunk_sizes, dict_size)
    return ZstdDict(dict_content)


def finalize_dict(zstd_dict, /, samples, dict_size, level):
    """Return a ZstdDict representing a finalized Zstandard dictionary.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize *zstd_dict* by adding headers and statistics according to the
    Zstandard dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict.

    *samples* is an iterable of samples, where a sample is a bytes-like object
    representing a file.
    *dict_size* is the dictionary's maximum size, in bytes.
    *level* is the expected compression level. The statistics for each
    compression level differ, so tuning the dictionary to the compression level
    can provide improvements.
    """

    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    samples = tuple(samples)
    chunks = b''.join(samples)
    chunk_sizes = tuple(_nbytes(sample) for sample in samples)
    if not chunks:
        raise ValueError("The samples are empty content, can't finalize the "
                         "dictionary.")
    dict_content = _zstd.finalize_dict(zstd_dict.dict_content, chunks,
                                       chunk_sizes, dict_size, level)
    return ZstdDict(dict_content)


def compress(data, level=None, options=None, zstd_dict=None):
    """Return Zstandard compressed *data* as bytes.

    *level* is an int specifying the compression level to use, defaulting to
    COMPRESSION_LEVEL_DEFAULT ('3').
    *options* is a dict object that contains advanced compression
    parameters. See CompressionParameter for more on options.
    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.

    For incremental compression, use a ZstdCompressor instead.
    """
    comp = ZstdCompressor(level=level, options=options, zstd_dict=zstd_dict)
    return comp.compress(data, mode=ZstdCompressor.FLUSH_FRAME)


def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more frames of Zstandard compressed *data*.

    *zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
    the function train_dict for how to train a ZstdDict on sample data.
    *options* is a dict object that contains advanced compression
    parameters. See DecompressionParameter for more on options.

    For incremental decompression, use a ZstdDecompressor instead.
    """
    results = []
    while True:
        decomp = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        results.append(decomp.decompress(data))
        if not decomp.eof:
            raise ZstdError('Compressed data ended before the '
                            'end-of-stream marker was reached')
        data = decomp.unused_data
        if not data:
            break
    return b''.join(results)


class CompressionParameter(enum.IntEnum):
    """Compression parameters."""

    compression_level = _zstd.ZSTD_c_compressionLevel
    window_log = _zstd.ZSTD_c_windowLog
    hash_log = _zstd.ZSTD_c_hashLog
    chain_log = _zstd.ZSTD_c_chainLog
    search_log = _zstd.ZSTD_c_searchLog
    min_match = _zstd.ZSTD_c_minMatch
    target_length = _zstd.ZSTD_c_targetLength
    strategy = _zstd.ZSTD_c_strategy

    enable_long_distance_matching = _zstd.ZSTD_c_enableLongDistanceMatching
    ldm_hash_log = _zstd.ZSTD_c_ldmHashLog
    ldm_min_match = _zstd.ZSTD_c_ldmMinMatch
    ldm_bucket_size_log = _zstd.ZSTD_c_ldmBucketSizeLog
    ldm_hash_rate_log = _zstd.ZSTD_c_ldmHashRateLog

    content_size_flag = _zstd.ZSTD_c_contentSizeFlag
    checksum_flag = _zstd.ZSTD_c_checksumFlag
    dict_id_flag = _zstd.ZSTD_c_dictIDFlag

    nb_workers = _zstd.ZSTD_c_nbWorkers
    job_size = _zstd.ZSTD_c_jobSize
    overlap_log = _zstd.ZSTD_c_overlapLog

    def bounds(self):
        """Return the (lower, upper) int bounds of a compression parameter.

        Both the lower and upper bounds are inclusive.
        """
        return _zstd.get_param_bounds(self.value, is_compress=True)


class DecompressionParameter(enum.IntEnum):
    """Decompression parameters."""

    window_log_max = _zstd.ZSTD_d_windowLogMax

    def bounds(self):
        """Return the (lower, upper) int bounds of a decompression parameter.

        Both the lower and upper bounds are inclusive.
        """
        return _zstd.get_param_bounds(self.value, is_compress=False)


class Strategy(enum.IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note that new strategies might be added in the future.
    Only the order (from fast to strong) is guaranteed,
    the numeric value might change.
    """

    # Each member takes its value from the same-named ZSTD_* constant of the
    # _zstd extension module.  Because only the relative order is promised,
    # compare members with each other (or use them by name) rather than
    # relying on a specific integer.
    fast = _zstd.ZSTD_fast
    dfast = _zstd.ZSTD_dfast
    greedy = _zstd.ZSTD_greedy
    lazy = _zstd.ZSTD_lazy
    lazy2 = _zstd.ZSTD_lazy2
    btlazy2 = _zstd.ZSTD_btlazy2
    btopt = _zstd.ZSTD_btopt
    btultra = _zstd.ZSTD_btultra
    btultra2 = _zstd.ZSTD_btultra2


# Hand the two parameter enum classes defined above to the C extension.
# This runs once, at import time, after both classes exist.
# NOTE(review): presumably _zstd keeps these classes so it can check the
# keys of *options* dicts against them -- confirm against the _zstd sources.
_zstd.set_parameter_types(CompressionParameter, DecompressionParameter)