gh-132983: Add the compression.zstd package and tests (#133365)

Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
Co-authored-by: Tomas R. <tomas.roun8@gmail.com>
Co-authored-by: Rogdham <contact@rogdham.net>
This commit is contained in:
Emma Smith 2025-05-05 17:38:08 -07:00 committed by GitHub
parent 793402e217
commit c273f59fb3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 3358 additions and 100 deletions

View file

@ -0,0 +1,234 @@
"""Python bindings to the Zstandard (zstd) compression library (RFC-8878)."""
__all__ = (
# compression.zstd
"COMPRESSION_LEVEL_DEFAULT",
"compress",
"CompressionParameter",
"decompress",
"DecompressionParameter",
"finalize_dict",
"get_frame_info",
"Strategy",
"train_dict",
# compression.zstd._zstdfile
"open",
"ZstdFile",
# _zstd
"get_frame_size",
"zstd_version",
"zstd_version_info",
"ZstdCompressor",
"ZstdDecompressor",
"ZstdDict",
"ZstdError",
)
import _zstd
import enum
from _zstd import *
from compression.zstd._zstdfile import ZstdFile, open, _nbytes
COMPRESSION_LEVEL_DEFAULT = _zstd._compressionLevel_values[0]
"""The default compression level for Zstandard, currently '3'."""
class FrameInfo:
"""Information about a Zstandard frame."""
__slots__ = 'decompressed_size', 'dictionary_id'
def __init__(self, decompressed_size, dictionary_id):
super().__setattr__('decompressed_size', decompressed_size)
super().__setattr__('dictionary_id', dictionary_id)
def __repr__(self):
return (f'FrameInfo(decompressed_size={self.decompressed_size}, '
f'dictionary_id={self.dictionary_id})')
def __setattr__(self, name, _):
raise AttributeError(f"can't set attribute {name!r}")
def get_frame_info(frame_buffer):
"""Get Zstandard frame information from a frame header.
*frame_buffer* is a bytes-like object. It should start from the beginning
of a frame, and needs to include at least the frame header (6 to 18 bytes).
The returned FrameInfo object has two attributes.
'decompressed_size' is the size in bytes of the data in the frame when
decompressed, or None when the decompressed size is unknown.
'dictionary_id' is an int in the range (0, 2**32). The special value 0
means that the dictionary ID was not recorded in the frame header,
the frame may or may not need a dictionary to be decoded,
and the ID of such a dictionary is not specified.
"""
return FrameInfo(*_zstd._get_frame_info(frame_buffer))
def train_dict(samples, dict_size):
"""Return a ZstdDict representing a trained Zstandard dictionary.
*samples* is an iterable of samples, where a sample is a bytes-like
object representing a file.
*dict_size* is the dictionary's maximum size, in bytes.
"""
if not isinstance(dict_size, int):
ds_cls = type(dict_size).__qualname__
raise TypeError(f'dict_size must be an int object, not {ds_cls!r}.')
samples = tuple(samples)
chunks = b''.join(samples)
chunk_sizes = tuple(_nbytes(sample) for sample in samples)
if not chunks:
raise ValueError("samples contained no data; can't train dictionary.")
dict_content = _zstd._train_dict(chunks, chunk_sizes, dict_size)
return ZstdDict(dict_content)
def finalize_dict(zstd_dict, /, samples, dict_size, level):
"""Return a ZstdDict representing a finalized Zstandard dictionary.
Given a custom content as a basis for dictionary, and a set of samples,
finalize *zstd_dict* by adding headers and statistics according to the
Zstandard dictionary format.
You may compose an effective dictionary content by hand, which is used as
basis dictionary, and use some samples to finalize a dictionary. The basis
dictionary may be a "raw content" dictionary. See *is_raw* in ZstdDict.
*samples* is an iterable of samples, where a sample is a bytes-like object
representing a file.
*dict_size* is the dictionary's maximum size, in bytes.
*level* is the expected compression level. The statistics for each
compression level differ, so tuning the dictionary to the compression level
can provide improvements.
"""
if not isinstance(zstd_dict, ZstdDict):
raise TypeError('zstd_dict argument should be a ZstdDict object.')
if not isinstance(dict_size, int):
raise TypeError('dict_size argument should be an int object.')
if not isinstance(level, int):
raise TypeError('level argument should be an int object.')
samples = tuple(samples)
chunks = b''.join(samples)
chunk_sizes = tuple(_nbytes(sample) for sample in samples)
if not chunks:
raise ValueError("The samples are empty content, can't finalize the"
"dictionary.")
dict_content = _zstd._finalize_dict(zstd_dict.dict_content,
chunks, chunk_sizes,
dict_size, level)
return ZstdDict(dict_content)
def compress(data, level=None, options=None, zstd_dict=None):
"""Return Zstandard compressed *data* as bytes.
*level* is an int specifying the compression level to use, defaulting to
COMPRESSION_LEVEL_DEFAULT ('3').
*options* is a dict object that contains advanced compression
parameters. See CompressionParameter for more on options.
*zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
the function train_dict for how to train a ZstdDict on sample data.
For incremental compression, use a ZstdCompressor instead.
"""
comp = ZstdCompressor(level=level, options=options, zstd_dict=zstd_dict)
return comp.compress(data, mode=ZstdCompressor.FLUSH_FRAME)
def decompress(data, zstd_dict=None, options=None):
"""Decompress one or more frames of Zstandard compressed *data*.
*zstd_dict* is a ZstdDict object, a pre-trained Zstandard dictionary. See
the function train_dict for how to train a ZstdDict on sample data.
*options* is a dict object that contains advanced compression
parameters. See DecompressionParameter for more on options.
For incremental decompression, use a ZstdDecompressor instead.
"""
results = []
while True:
decomp = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
results.append(decomp.decompress(data))
if not decomp.eof:
raise ZstdError("Compressed data ended before the "
"end-of-stream marker was reached")
data = decomp.unused_data
if not data:
break
return b"".join(results)
class CompressionParameter(enum.IntEnum):
"""Compression parameters."""
compression_level = _zstd._ZSTD_c_compressionLevel
window_log = _zstd._ZSTD_c_windowLog
hash_log = _zstd._ZSTD_c_hashLog
chain_log = _zstd._ZSTD_c_chainLog
search_log = _zstd._ZSTD_c_searchLog
min_match = _zstd._ZSTD_c_minMatch
target_length = _zstd._ZSTD_c_targetLength
strategy = _zstd._ZSTD_c_strategy
enable_long_distance_matching = _zstd._ZSTD_c_enableLongDistanceMatching
ldm_hash_log = _zstd._ZSTD_c_ldmHashLog
ldm_min_match = _zstd._ZSTD_c_ldmMinMatch
ldm_bucket_size_log = _zstd._ZSTD_c_ldmBucketSizeLog
ldm_hash_rate_log = _zstd._ZSTD_c_ldmHashRateLog
content_size_flag = _zstd._ZSTD_c_contentSizeFlag
checksum_flag = _zstd._ZSTD_c_checksumFlag
dict_id_flag = _zstd._ZSTD_c_dictIDFlag
nb_workers = _zstd._ZSTD_c_nbWorkers
job_size = _zstd._ZSTD_c_jobSize
overlap_log = _zstd._ZSTD_c_overlapLog
def bounds(self):
"""Return the (lower, upper) int bounds of a compression parameter.
Both the lower and upper bounds are inclusive.
"""
return _zstd._get_param_bounds(self.value, is_compress=True)
class DecompressionParameter(enum.IntEnum):
"""Decompression parameters."""
window_log_max = _zstd._ZSTD_d_windowLogMax
def bounds(self):
"""Return the (lower, upper) int bounds of a decompression parameter.
Both the lower and upper bounds are inclusive.
"""
return _zstd._get_param_bounds(self.value, is_compress=False)
class Strategy(enum.IntEnum):
"""Compression strategies, listed from fastest to strongest.
Note that new strategies might be added in the future.
Only the order (from fast to strong) is guaranteed,
the numeric value might change.
"""
fast = _zstd._ZSTD_fast
dfast = _zstd._ZSTD_dfast
greedy = _zstd._ZSTD_greedy
lazy = _zstd._ZSTD_lazy
lazy2 = _zstd._ZSTD_lazy2
btlazy2 = _zstd._ZSTD_btlazy2
btopt = _zstd._ZSTD_btopt
btultra = _zstd._ZSTD_btultra
btultra2 = _zstd._ZSTD_btultra2
# Check validity of the CompressionParameter & DecompressionParameter types
_zstd._set_parameter_types(CompressionParameter, DecompressionParameter)

View file

@ -0,0 +1,349 @@
import io
from os import PathLike
from _zstd import (ZstdCompressor, ZstdDecompressor, _ZSTD_DStreamSizes,
ZstdError)
from compression._common import _streams
__all__ = ("ZstdFile", "open")
_ZSTD_DStreamOutSize = _ZSTD_DStreamSizes[1]
_MODE_CLOSED = 0
_MODE_READ = 1
_MODE_WRITE = 2
def _nbytes(dat, /):
if isinstance(dat, (bytes, bytearray)):
return len(dat)
with memoryview(dat) as mv:
return mv.nbytes
class ZstdFile(_streams.BaseStream):
"""A file-like object providing transparent Zstandard (de)compression.
A ZstdFile can act as a wrapper for an existing file object, or refer
directly to a named file on disk.
ZstdFile provides a *binary* file interface. Data is read and returned as
bytes, and may only be written to objects that support the Buffer Protocol.
"""
FLUSH_BLOCK = ZstdCompressor.FLUSH_BLOCK
FLUSH_FRAME = ZstdCompressor.FLUSH_FRAME
def __init__(self, file, /, mode="r", *,
level=None, options=None, zstd_dict=None):
"""Open a Zstandard compressed file in binary mode.
*file* can be either an file-like object, or a file name to open.
*mode* can be "r" for reading (default), "w" for (over)writing, "x" for
creating exclusively, or "a" for appending. These can equivalently be
given as "rb", "wb", "xb" and "ab" respectively.
*level* is an optional int specifying the compression level to use,
or COMPRESSION_LEVEL_DEFAULT if not given.
*options* is an optional dict for advanced compression parameters.
See CompressionParameter and DecompressionParameter for the possible
options.
*zstd_dict* is an optional ZstdDict object, a pre-trained Zstandard
dictionary. See train_dict() to train ZstdDict on sample data.
"""
self._fp = None
self._close_fp = False
self._mode = _MODE_CLOSED
self._buffer = None
if not isinstance(mode, str):
raise ValueError("mode must be a str")
if options is not None and not isinstance(options, dict):
raise TypeError("options must be a dict or None")
mode = mode.removesuffix("b") # handle rb, wb, xb, ab
if mode == "r":
if level is not None:
raise TypeError("level is illegal in read mode")
self._mode = _MODE_READ
elif mode in {"w", "a", "x"}:
if level is not None and not isinstance(level, int):
raise TypeError("level must be int or None")
self._mode = _MODE_WRITE
self._compressor = ZstdCompressor(level=level, options=options,
zstd_dict=zstd_dict)
self._pos = 0
else:
raise ValueError(f"Invalid mode: {mode!r}")
if isinstance(file, (str, bytes, PathLike)):
self._fp = io.open(file, f'{mode}b')
self._close_fp = True
elif ((mode == 'r' and hasattr(file, "read"))
or (mode != 'r' and hasattr(file, "write"))):
self._fp = file
else:
raise TypeError("file must be a file-like object "
"or a str, bytes, or PathLike object")
if self._mode == _MODE_READ:
raw = _streams.DecompressReader(
self._fp,
ZstdDecompressor,
trailing_error=ZstdError,
zstd_dict=zstd_dict,
options=options,
)
self._buffer = io.BufferedReader(raw)
def close(self):
"""Flush and close the file.
May be called multiple times. Once the file has been closed,
any other operation on it will raise ValueError.
"""
if self._fp is None:
return
try:
if self._mode == _MODE_READ:
if getattr(self, '_buffer', None):
self._buffer.close()
self._buffer = None
elif self._mode == _MODE_WRITE:
self.flush(self.FLUSH_FRAME)
self._compressor = None
finally:
self._mode = _MODE_CLOSED
try:
if self._close_fp:
self._fp.close()
finally:
self._fp = None
self._close_fp = False
def write(self, data, /):
"""Write a bytes-like object *data* to the file.
Returns the number of uncompressed bytes written, which is
always the length of data in bytes. Note that due to buffering,
the file on disk may not reflect the data written until .flush()
or .close() is called.
"""
self._check_can_write()
length = _nbytes(data)
compressed = self._compressor.compress(data)
self._fp.write(compressed)
self._pos += length
return length
def flush(self, mode=FLUSH_BLOCK):
"""Flush remaining data to the underlying stream.
The mode argument can be FLUSH_BLOCK or FLUSH_FRAME. Abuse of this
method will reduce compression ratio, use it only when necessary.
If the program is interrupted afterwards, all data can be recovered.
To ensure saving to disk, also need to use os.fsync(fd).
This method does nothing in reading mode.
"""
if self._mode == _MODE_READ:
return
self._check_not_closed()
if mode not in {self.FLUSH_BLOCK, self.FLUSH_FRAME}:
raise ValueError("Invalid mode argument, expected either "
"ZstdFile.FLUSH_FRAME or "
"ZstdFile.FLUSH_BLOCK")
if self._compressor.last_mode == mode:
return
# Flush zstd block/frame, and write.
data = self._compressor.flush(mode)
self._fp.write(data)
if hasattr(self._fp, "flush"):
self._fp.flush()
def read(self, size=-1):
"""Read up to size uncompressed bytes from the file.
If size is negative or omitted, read until EOF is reached.
Returns b"" if the file is already at EOF.
"""
if size is None:
size = -1
self._check_can_read()
return self._buffer.read(size)
def read1(self, size=-1):
"""Read up to size uncompressed bytes, while trying to avoid
making multiple reads from the underlying stream. Reads up to a
buffer's worth of data if size is negative.
Returns b"" if the file is at EOF.
"""
self._check_can_read()
if size < 0:
# Note this should *not* be io.DEFAULT_BUFFER_SIZE.
# ZSTD_DStreamOutSize is the minimum amount to read guaranteeing
# a full block is read.
size = _ZSTD_DStreamOutSize
return self._buffer.read1(size)
def readinto(self, b):
"""Read bytes into b.
Returns the number of bytes read (0 for EOF).
"""
self._check_can_read()
return self._buffer.readinto(b)
def readinto1(self, b):
"""Read bytes into b, while trying to avoid making multiple reads
from the underlying stream.
Returns the number of bytes read (0 for EOF).
"""
self._check_can_read()
return self._buffer.readinto1(b)
def readline(self, size=-1):
"""Read a line of uncompressed bytes from the file.
The terminating newline (if present) is retained. If size is
non-negative, no more than size bytes will be read (in which
case the line may be incomplete). Returns b'' if already at EOF.
"""
self._check_can_read()
return self._buffer.readline(size)
def seek(self, offset, whence=io.SEEK_SET):
"""Change the file position.
The new position is specified by offset, relative to the
position indicated by whence. Possible values for whence are:
0: start of stream (default): offset must not be negative
1: current stream position
2: end of stream; offset must not be positive
Returns the new file position.
Note that seeking is emulated, so depending on the arguments,
this operation may be extremely slow.
"""
self._check_can_read()
# BufferedReader.seek() checks seekable
return self._buffer.seek(offset, whence)
def peek(self, size=-1):
"""Return buffered data without advancing the file position.
Always returns at least one byte of data, unless at EOF.
The exact number of bytes returned is unspecified.
"""
# Relies on the undocumented fact that BufferedReader.peek() always
# returns at least one byte (except at EOF)
self._check_can_read()
return self._buffer.peek(size)
def __next__(self):
if ret := self._buffer.readline():
return ret
raise StopIteration
def tell(self):
"""Return the current file position."""
self._check_not_closed()
if self._mode == _MODE_READ:
return self._buffer.tell()
elif self._mode == _MODE_WRITE:
return self._pos
def fileno(self):
"""Return the file descriptor for the underlying file."""
self._check_not_closed()
return self._fp.fileno()
@property
def name(self):
self._check_not_closed()
return self._fp.name
@property
def mode(self):
return 'wb' if self._mode == _MODE_WRITE else 'rb'
@property
def closed(self):
"""True if this file is closed."""
return self._mode == _MODE_CLOSED
def seekable(self):
"""Return whether the file supports seeking."""
return self.readable() and self._buffer.seekable()
def readable(self):
"""Return whether the file was opened for reading."""
self._check_not_closed()
return self._mode == _MODE_READ
def writable(self):
"""Return whether the file was opened for writing."""
self._check_not_closed()
return self._mode == _MODE_WRITE
def open(file, /, mode="rb", *, level=None, options=None, zstd_dict=None,
encoding=None, errors=None, newline=None):
"""Open a Zstandard compressed file in binary or text mode.
file can be either a file name (given as a str, bytes, or PathLike object),
in which case the named file is opened, or it can be an existing file object
to read from or write to.
The mode parameter can be "r", "rb" (default), "w", "wb", "x", "xb", "a",
"ab" for binary mode, or "rt", "wt", "xt", "at" for text mode.
The level, options, and zstd_dict parameters specify the settings the same
as ZstdFile.
When using read mode (decompression), the options parameter is a dict
representing advanced decompression options. The level parameter is not
supported in this case. When using write mode (compression), only one of
level, an int representing the compression level, or options, a dict
representing advanced compression options, may be passed. In both modes,
zstd_dict is a ZstdDict instance containing a trained Zstandard dictionary.
For binary mode, this function is equivalent to the ZstdFile constructor:
ZstdFile(filename, mode, ...). In this case, the encoding, errors and
newline parameters must not be provided.
For text mode, an ZstdFile object is created, and wrapped in an
io.TextIOWrapper instance with the specified encoding, error handling
behavior, and line ending(s).
"""
text_mode = "t" in mode
mode = mode.replace("t", "")
if text_mode:
if "b" in mode:
raise ValueError(f"Invalid mode: {mode!r}")
else:
if encoding is not None:
raise ValueError("Argument 'encoding' not supported in binary mode")
if errors is not None:
raise ValueError("Argument 'errors' not supported in binary mode")
if newline is not None:
raise ValueError("Argument 'newline' not supported in binary mode")
binary_file = ZstdFile(file, mode, level=level, options=options,
zstd_dict=zstd_dict)
if text_mode:
return io.TextIOWrapper(binary_file, encoding, errors, newline)
else:
return binary_file