mirror of
https://github.com/python/cpython.git
synced 2025-12-10 11:00:14 +00:00
bpo-28080: Add support for the fallback encoding in ZIP files (GH-32007)
* Add the metadata_encoding parameter in the zipfile.ZipFile constructor. * Add the --metadata-encoding option in the zipfile CLI. Co-authored-by: Stephen J. Turnbull <stephen@xemacs.org>
This commit is contained in:
parent
c6cd3cc93c
commit
a25a985535
5 changed files with 211 additions and 11 deletions
|
|
@ -139,7 +139,8 @@ ZipFile Objects
|
||||||
|
|
||||||
|
|
||||||
.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \
|
.. class:: ZipFile(file, mode='r', compression=ZIP_STORED, allowZip64=True, \
|
||||||
compresslevel=None, *, strict_timestamps=True)
|
compresslevel=None, *, strict_timestamps=True,
|
||||||
|
metadata_encoding=None)
|
||||||
|
|
||||||
Open a ZIP file, where *file* can be a path to a file (a string), a
|
Open a ZIP file, where *file* can be a path to a file (a string), a
|
||||||
file-like object or a :term:`path-like object`.
|
file-like object or a :term:`path-like object`.
|
||||||
|
|
@ -183,6 +184,10 @@ ZipFile Objects
|
||||||
Similar behavior occurs with files newer than 2107-12-31,
|
Similar behavior occurs with files newer than 2107-12-31,
|
||||||
the timestamp is also set to the limit.
|
the timestamp is also set to the limit.
|
||||||
|
|
||||||
|
When mode is ``'r'``, *metadata_encoding* may be set to the name of a codec,
|
||||||
|
which will be used to decode metadata such as the names of members and ZIP
|
||||||
|
comments.
|
||||||
|
|
||||||
If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then
|
If the file is created with mode ``'w'``, ``'x'`` or ``'a'`` and then
|
||||||
:meth:`closed <close>` without adding any files to the archive, the appropriate
|
:meth:`closed <close>` without adding any files to the archive, the appropriate
|
||||||
ZIP structures for an empty archive will be written to the file.
|
ZIP structures for an empty archive will be written to the file.
|
||||||
|
|
@ -194,6 +199,19 @@ ZipFile Objects
|
||||||
with ZipFile('spam.zip', 'w') as myzip:
|
with ZipFile('spam.zip', 'w') as myzip:
|
||||||
myzip.write('eggs.txt')
|
myzip.write('eggs.txt')
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
*metadata_encoding* is an instance-wide setting for the ZipFile.
|
||||||
|
It is not currently possible to set this on a per-member basis.
|
||||||
|
|
||||||
|
This attribute is a workaround for legacy implementations which produce
|
||||||
|
archives with names in the current locale encoding or code page (mostly
|
||||||
|
on Windows). According to the .ZIP standard, the encoding of metadata
|
||||||
|
may be specified to be either IBM code page (default) or UTF-8 by a flag
|
||||||
|
in the archive header.
|
||||||
|
That flag takes precedence over *metadata_encoding*, which is
|
||||||
|
a Python-specific extension.
|
||||||
|
|
||||||
.. versionadded:: 3.2
|
.. versionadded:: 3.2
|
||||||
Added the ability to use :class:`ZipFile` as a context manager.
|
Added the ability to use :class:`ZipFile` as a context manager.
|
||||||
|
|
||||||
|
|
@ -220,6 +238,10 @@ ZipFile Objects
|
||||||
.. versionadded:: 3.8
|
.. versionadded:: 3.8
|
||||||
The *strict_timestamps* keyword-only argument
|
The *strict_timestamps* keyword-only argument
|
||||||
|
|
||||||
|
.. versionchanged:: 3.11
|
||||||
|
Added support for specifying member name encoding for reading
|
||||||
|
metadata in the zipfile's directory and file headers.
|
||||||
|
|
||||||
|
|
||||||
.. method:: ZipFile.close()
|
.. method:: ZipFile.close()
|
||||||
|
|
||||||
|
|
@ -395,6 +417,15 @@ ZipFile Objects
|
||||||
given.
|
given.
|
||||||
The archive must be open with mode ``'w'``, ``'x'`` or ``'a'``.
|
The archive must be open with mode ``'w'``, ``'x'`` or ``'a'``.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
The ZIP file standard historically did not specify a metadata encoding,
|
||||||
|
but strongly recommended CP437 (the original IBM PC encoding) for
|
||||||
|
interoperability. Recent versions allow use of UTF-8 (only). In this
|
||||||
|
module, UTF-8 will automatically be used to write the member names if
|
||||||
|
they contain any non-ASCII characters. It is not possible to write
|
||||||
|
member names in any encoding other than ASCII or UTF-8.
|
||||||
|
|
||||||
.. note::
|
.. note::
|
||||||
|
|
||||||
Archive names should be relative to the archive root, that is, they should not
|
Archive names should be relative to the archive root, that is, they should not
|
||||||
|
|
@ -868,6 +899,14 @@ Command-line options
|
||||||
|
|
||||||
Test whether the zipfile is valid or not.
|
Test whether the zipfile is valid or not.
|
||||||
|
|
||||||
|
.. cmdoption:: --metadata-encoding <encoding>
|
||||||
|
|
||||||
|
Specify encoding of member names for :option:`-l`, :option:`-e` and
|
||||||
|
:option:`-t`.
|
||||||
|
|
||||||
|
.. versionadded:: 3.11
|
||||||
|
|
||||||
|
|
||||||
Decompression pitfalls
|
Decompression pitfalls
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -432,6 +432,12 @@ venv
|
||||||
Third party code that also creates new virtual environments should do the same.
|
Third party code that also creates new virtual environments should do the same.
|
||||||
(Contributed by Miro Hrončok in :issue:`45413`.)
|
(Contributed by Miro Hrončok in :issue:`45413`.)
|
||||||
|
|
||||||
|
zipfile
|
||||||
|
-------
|
||||||
|
|
||||||
|
* Added support for specifying member name encoding for reading
|
||||||
|
metadata in the zipfile's directory and file headers.
|
||||||
|
(Contributed by Stephen J. Turnbull and Serhiy Storchaka in :issue:`28080`.)
|
||||||
|
|
||||||
fcntl
|
fcntl
|
||||||
-----
|
-----
|
||||||
|
|
|
||||||
|
|
@ -21,8 +21,10 @@ from tempfile import TemporaryFile
|
||||||
from random import randint, random, randbytes
|
from random import randint, random, randbytes
|
||||||
|
|
||||||
from test.support import script_helper
|
from test.support import script_helper
|
||||||
from test.support import (findfile, requires_zlib, requires_bz2,
|
from test.support import (
|
||||||
requires_lzma, captured_stdout, requires_subprocess)
|
findfile, requires_zlib, requires_bz2, requires_lzma,
|
||||||
|
captured_stdout, captured_stderr, requires_subprocess
|
||||||
|
)
|
||||||
from test.support.os_helper import (
|
from test.support.os_helper import (
|
||||||
TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count
|
TESTFN, unlink, rmtree, temp_dir, temp_cwd, fd_count
|
||||||
)
|
)
|
||||||
|
|
@ -3210,5 +3212,139 @@ class TestPath(unittest.TestCase):
|
||||||
assert isinstance(file, cls)
|
assert isinstance(file, cls)
|
||||||
|
|
||||||
|
|
||||||
|
class EncodedMetadataTests(unittest.TestCase):
|
||||||
|
file_names = ['\u4e00', '\u4e8c', '\u4e09'] # Han 'one', 'two', 'three'
|
||||||
|
file_content = [
|
||||||
|
"This is pure ASCII.\n".encode('ascii'),
|
||||||
|
# This is modern Japanese. (UTF-8)
|
||||||
|
"\u3053\u308c\u306f\u73fe\u4ee3\u7684\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('utf-8'),
|
||||||
|
# This is obsolete Japanese. (Shift JIS)
|
||||||
|
"\u3053\u308c\u306f\u53e4\u3044\u65e5\u672c\u8a9e\u3067\u3059\u3002\n".encode('shift_jis'),
|
||||||
|
]
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.addCleanup(unlink, TESTFN)
|
||||||
|
# Create .zip of 3 members with Han names encoded in Shift JIS.
|
||||||
|
# Each name is 1 Han character encoding to 2 bytes in Shift JIS.
|
||||||
|
# The ASCII names are arbitrary as long as they are length 2 and
|
||||||
|
# not otherwise contained in the zip file.
|
||||||
|
# Data elements are encoded bytes (ascii, utf-8, shift_jis).
|
||||||
|
placeholders = ["n1", "n2"] + self.file_names[2:]
|
||||||
|
with zipfile.ZipFile(TESTFN, mode="w") as tf:
|
||||||
|
for temp, content in zip(placeholders, self.file_content):
|
||||||
|
tf.writestr(temp, content, zipfile.ZIP_STORED)
|
||||||
|
# Hack in the Shift JIS names with flag bit 11 (UTF-8) unset.
|
||||||
|
with open(TESTFN, "rb") as tf:
|
||||||
|
data = tf.read()
|
||||||
|
for name, temp in zip(self.file_names, placeholders[:2]):
|
||||||
|
data = data.replace(temp.encode('ascii'),
|
||||||
|
name.encode('shift_jis'))
|
||||||
|
with open(TESTFN, "wb") as tf:
|
||||||
|
tf.write(data)
|
||||||
|
|
||||||
|
def _test_read(self, zipfp, expected_names, expected_content):
|
||||||
|
# Check the namelist
|
||||||
|
names = zipfp.namelist()
|
||||||
|
self.assertEqual(sorted(names), sorted(expected_names))
|
||||||
|
|
||||||
|
# Check infolist
|
||||||
|
infos = zipfp.infolist()
|
||||||
|
names = [zi.filename for zi in infos]
|
||||||
|
self.assertEqual(sorted(names), sorted(expected_names))
|
||||||
|
|
||||||
|
# check getinfo
|
||||||
|
for name, content in zip(expected_names, expected_content):
|
||||||
|
info = zipfp.getinfo(name)
|
||||||
|
self.assertEqual(info.filename, name)
|
||||||
|
self.assertEqual(info.file_size, len(content))
|
||||||
|
self.assertEqual(zipfp.read(name), content)
|
||||||
|
|
||||||
|
def test_read_with_metadata_encoding(self):
|
||||||
|
# Read the ZIP archive with correct metadata_encoding
|
||||||
|
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
|
||||||
|
self._test_read(zipfp, self.file_names, self.file_content)
|
||||||
|
|
||||||
|
def test_read_without_metadata_encoding(self):
|
||||||
|
# Read the ZIP archive without metadata_encoding
|
||||||
|
expected_names = [name.encode('shift_jis').decode('cp437')
|
||||||
|
for name in self.file_names[:2]] + self.file_names[2:]
|
||||||
|
with zipfile.ZipFile(TESTFN, "r") as zipfp:
|
||||||
|
self._test_read(zipfp, expected_names, self.file_content)
|
||||||
|
|
||||||
|
def test_read_with_incorrect_metadata_encoding(self):
|
||||||
|
# Read the ZIP archive with incorrect metadata_encoding
|
||||||
|
expected_names = [name.encode('shift_jis').decode('koi8-u')
|
||||||
|
for name in self.file_names[:2]] + self.file_names[2:]
|
||||||
|
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='koi8-u') as zipfp:
|
||||||
|
self._test_read(zipfp, expected_names, self.file_content)
|
||||||
|
|
||||||
|
def test_read_with_unsuitable_metadata_encoding(self):
|
||||||
|
# Read the ZIP archive with metadata_encoding unsuitable for
|
||||||
|
# decoding metadata
|
||||||
|
with self.assertRaises(UnicodeDecodeError):
|
||||||
|
zipfile.ZipFile(TESTFN, "r", metadata_encoding='ascii')
|
||||||
|
with self.assertRaises(UnicodeDecodeError):
|
||||||
|
zipfile.ZipFile(TESTFN, "r", metadata_encoding='utf-8')
|
||||||
|
|
||||||
|
def test_read_after_append(self):
|
||||||
|
newname = '\u56db' # Han 'four'
|
||||||
|
expected_names = [name.encode('shift_jis').decode('cp437')
|
||||||
|
for name in self.file_names[:2]] + self.file_names[2:]
|
||||||
|
expected_names.append(newname)
|
||||||
|
expected_content = (*self.file_content, b"newcontent")
|
||||||
|
|
||||||
|
with zipfile.ZipFile(TESTFN, "a") as zipfp:
|
||||||
|
zipfp.writestr(newname, "newcontent")
|
||||||
|
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
|
||||||
|
|
||||||
|
with zipfile.ZipFile(TESTFN, "r") as zipfp:
|
||||||
|
self._test_read(zipfp, expected_names, expected_content)
|
||||||
|
|
||||||
|
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
|
||||||
|
self.assertEqual(sorted(zipfp.namelist()), sorted(expected_names))
|
||||||
|
for i, (name, content) in enumerate(zip(expected_names, expected_content)):
|
||||||
|
info = zipfp.getinfo(name)
|
||||||
|
self.assertEqual(info.filename, name)
|
||||||
|
self.assertEqual(info.file_size, len(content))
|
||||||
|
if i < 2:
|
||||||
|
with self.assertRaises(zipfile.BadZipFile):
|
||||||
|
zipfp.read(name)
|
||||||
|
else:
|
||||||
|
self.assertEqual(zipfp.read(name), content)
|
||||||
|
|
||||||
|
def test_write_with_metadata_encoding(self):
|
||||||
|
ZF = zipfile.ZipFile
|
||||||
|
for mode in ("w", "x", "a"):
|
||||||
|
with self.assertRaisesRegex(ValueError,
|
||||||
|
"^metadata_encoding is only"):
|
||||||
|
ZF("nonesuch.zip", mode, metadata_encoding="shift_jis")
|
||||||
|
|
||||||
|
def test_cli_with_metadata_encoding(self):
|
||||||
|
errmsg = "Non-conforming encodings not supported with -c."
|
||||||
|
args = ["--metadata-encoding=shift_jis", "-c", "nonesuch", "nonesuch"]
|
||||||
|
with captured_stdout() as stdout:
|
||||||
|
with captured_stderr() as stderr:
|
||||||
|
self.assertRaises(SystemExit, zipfile.main, args)
|
||||||
|
self.assertEqual(stdout.getvalue(), "")
|
||||||
|
self.assertIn(errmsg, stderr.getvalue())
|
||||||
|
|
||||||
|
with captured_stdout() as stdout:
|
||||||
|
zipfile.main(["--metadata-encoding=shift_jis", "-t", TESTFN])
|
||||||
|
listing = stdout.getvalue()
|
||||||
|
|
||||||
|
with captured_stdout() as stdout:
|
||||||
|
zipfile.main(["--metadata-encoding=shift_jis", "-l", TESTFN])
|
||||||
|
listing = stdout.getvalue()
|
||||||
|
for name in self.file_names:
|
||||||
|
self.assertIn(name, listing)
|
||||||
|
|
||||||
|
os.mkdir(TESTFN2)
|
||||||
|
self.addCleanup(rmtree, TESTFN2)
|
||||||
|
zipfile.main(["--metadata-encoding=shift_jis", "-e", TESTFN, TESTFN2])
|
||||||
|
listing = os.listdir(TESTFN2)
|
||||||
|
for name in self.file_names:
|
||||||
|
self.assertIn(name, listing)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
|
|
@ -480,7 +480,7 @@ class ZipInfo (object):
|
||||||
|
|
||||||
def _encodeFilenameFlags(self):
|
def _encodeFilenameFlags(self):
|
||||||
try:
|
try:
|
||||||
return self.filename.encode('ascii'), self.flag_bits
|
return self.filename.encode('ascii'), self.flag_bits & ~_MASK_UTF_FILENAME
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
|
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
|
||||||
|
|
||||||
|
|
@ -1240,7 +1240,7 @@ class ZipFile:
|
||||||
_windows_illegal_name_trans_table = None
|
_windows_illegal_name_trans_table = None
|
||||||
|
|
||||||
def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
|
def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
|
||||||
compresslevel=None, *, strict_timestamps=True):
|
compresslevel=None, *, strict_timestamps=True, metadata_encoding=None):
|
||||||
"""Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
|
"""Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
|
||||||
or append 'a'."""
|
or append 'a'."""
|
||||||
if mode not in ('r', 'w', 'x', 'a'):
|
if mode not in ('r', 'w', 'x', 'a'):
|
||||||
|
|
@ -1259,6 +1259,12 @@ class ZipFile:
|
||||||
self.pwd = None
|
self.pwd = None
|
||||||
self._comment = b''
|
self._comment = b''
|
||||||
self._strict_timestamps = strict_timestamps
|
self._strict_timestamps = strict_timestamps
|
||||||
|
self.metadata_encoding = metadata_encoding
|
||||||
|
|
||||||
|
# Check that we don't try to write with nonconforming codecs
|
||||||
|
if self.metadata_encoding and mode != 'r':
|
||||||
|
raise ValueError(
|
||||||
|
"metadata_encoding is only supported for reading files")
|
||||||
|
|
||||||
# Check if we were passed a file-like object
|
# Check if we were passed a file-like object
|
||||||
if isinstance(file, os.PathLike):
|
if isinstance(file, os.PathLike):
|
||||||
|
|
@ -1389,13 +1395,13 @@ class ZipFile:
|
||||||
if self.debug > 2:
|
if self.debug > 2:
|
||||||
print(centdir)
|
print(centdir)
|
||||||
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
|
filename = fp.read(centdir[_CD_FILENAME_LENGTH])
|
||||||
flags = centdir[5]
|
flags = centdir[_CD_FLAG_BITS]
|
||||||
if flags & _MASK_UTF_FILENAME:
|
if flags & _MASK_UTF_FILENAME:
|
||||||
# UTF-8 file names extension
|
# UTF-8 file names extension
|
||||||
filename = filename.decode('utf-8')
|
filename = filename.decode('utf-8')
|
||||||
else:
|
else:
|
||||||
# Historical ZIP filename encoding
|
# Historical ZIP filename encoding
|
||||||
filename = filename.decode('cp437')
|
filename = filename.decode(self.metadata_encoding or 'cp437')
|
||||||
# Create ZipInfo instance to store file information
|
# Create ZipInfo instance to store file information
|
||||||
x = ZipInfo(filename)
|
x = ZipInfo(filename)
|
||||||
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
|
x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
|
||||||
|
|
@ -1572,7 +1578,7 @@ class ZipFile:
|
||||||
# UTF-8 filename
|
# UTF-8 filename
|
||||||
fname_str = fname.decode("utf-8")
|
fname_str = fname.decode("utf-8")
|
||||||
else:
|
else:
|
||||||
fname_str = fname.decode("cp437")
|
fname_str = fname.decode(self.metadata_encoding or "cp437")
|
||||||
|
|
||||||
if fname_str != zinfo.orig_filename:
|
if fname_str != zinfo.orig_filename:
|
||||||
raise BadZipFile(
|
raise BadZipFile(
|
||||||
|
|
@ -2461,11 +2467,15 @@ def main(args=None):
|
||||||
help='Create zipfile from sources')
|
help='Create zipfile from sources')
|
||||||
group.add_argument('-t', '--test', metavar='<zipfile>',
|
group.add_argument('-t', '--test', metavar='<zipfile>',
|
||||||
help='Test if a zipfile is valid')
|
help='Test if a zipfile is valid')
|
||||||
|
parser.add_argument('--metadata-encoding', metavar='<encoding>',
|
||||||
|
help='Specify encoding of member names for -l, -e and -t')
|
||||||
args = parser.parse_args(args)
|
args = parser.parse_args(args)
|
||||||
|
|
||||||
|
encoding = args.metadata_encoding
|
||||||
|
|
||||||
if args.test is not None:
|
if args.test is not None:
|
||||||
src = args.test
|
src = args.test
|
||||||
with ZipFile(src, 'r') as zf:
|
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
|
||||||
badfile = zf.testzip()
|
badfile = zf.testzip()
|
||||||
if badfile:
|
if badfile:
|
||||||
print("The following enclosed file is corrupted: {!r}".format(badfile))
|
print("The following enclosed file is corrupted: {!r}".format(badfile))
|
||||||
|
|
@ -2473,15 +2483,20 @@ def main(args=None):
|
||||||
|
|
||||||
elif args.list is not None:
|
elif args.list is not None:
|
||||||
src = args.list
|
src = args.list
|
||||||
with ZipFile(src, 'r') as zf:
|
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
|
||||||
zf.printdir()
|
zf.printdir()
|
||||||
|
|
||||||
elif args.extract is not None:
|
elif args.extract is not None:
|
||||||
src, curdir = args.extract
|
src, curdir = args.extract
|
||||||
with ZipFile(src, 'r') as zf:
|
with ZipFile(src, 'r', metadata_encoding=encoding) as zf:
|
||||||
zf.extractall(curdir)
|
zf.extractall(curdir)
|
||||||
|
|
||||||
elif args.create is not None:
|
elif args.create is not None:
|
||||||
|
if encoding:
|
||||||
|
print("Non-conforming encodings not supported with -c.",
|
||||||
|
file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
zip_name = args.create.pop(0)
|
zip_name = args.create.pop(0)
|
||||||
files = args.create
|
files = args.create
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
Add the *metadata_encoding* parameter in the :class:`zipfile.ZipFile`
|
||||||
|
constructor and the ``--metadata-encoding`` option in the :mod:`zipfile`
|
||||||
|
CLI to allow reading zipfiles using non-standard codecs to encode the
|
||||||
|
filenames within the archive.
|
||||||
Loading…
Add table
Add a link
Reference in a new issue