gh-102120: [TarFile] Add an iter function that doesn't cache (GH-102128)

This commit is contained in:
Robert O'Shea 2023-05-23 21:44:40 +01:00 committed by GitHub
parent 097b7830cd
commit 50fce89d12
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 7 deletions

View file

@ -318,7 +318,7 @@ be finalized; only the internally used file object will be closed. See the
.. versionadded:: 3.2 .. versionadded:: 3.2
Added support for the context management protocol. Added support for the context management protocol.
.. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1) .. class:: TarFile(name=None, mode='r', fileobj=None, format=DEFAULT_FORMAT, tarinfo=TarInfo, dereference=False, ignore_zeros=False, encoding=ENCODING, errors='surrogateescape', pax_headers=None, debug=0, errorlevel=1, stream=False)
All following arguments are optional and can be accessed as instance attributes All following arguments are optional and can be accessed as instance attributes
as well. as well.
@ -369,6 +369,9 @@ be finalized; only the internally used file object will be closed. See the
The *pax_headers* argument is an optional dictionary of strings which The *pax_headers* argument is an optional dictionary of strings which
will be added as a pax global header if *format* is :const:`PAX_FORMAT`. will be added as a pax global header if *format* is :const:`PAX_FORMAT`.
If *stream* is set to :const:`True`, information about the files in the
archive is not cached while reading, which saves memory.
.. versionchanged:: 3.2 .. versionchanged:: 3.2
Use ``'surrogateescape'`` as the default for the *errors* argument. Use ``'surrogateescape'`` as the default for the *errors* argument.
@ -378,6 +381,8 @@ be finalized; only the internally used file object will be closed. See the
.. versionchanged:: 3.6 .. versionchanged:: 3.6
The *name* parameter accepts a :term:`path-like object`. The *name* parameter accepts a :term:`path-like object`.
.. versionchanged:: 3.13
Add the *stream* parameter.
.. classmethod:: TarFile.open(...) .. classmethod:: TarFile.open(...)

View file

@ -1633,7 +1633,7 @@ class TarFile(object):
def __init__(self, name=None, mode="r", fileobj=None, format=None, def __init__(self, name=None, mode="r", fileobj=None, format=None,
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
errors="surrogateescape", pax_headers=None, debug=None, errors="surrogateescape", pax_headers=None, debug=None,
errorlevel=None, copybufsize=None): errorlevel=None, copybufsize=None, stream=False):
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
read from an existing archive, 'a' to append data to an existing read from an existing archive, 'a' to append data to an existing
file or 'w' to create a new file overwriting an existing one. `mode' file or 'w' to create a new file overwriting an existing one. `mode'
@ -1665,6 +1665,8 @@ class TarFile(object):
self.name = os.path.abspath(name) if name else None self.name = os.path.abspath(name) if name else None
self.fileobj = fileobj self.fileobj = fileobj
self.stream = stream
# Init attributes. # Init attributes.
if format is not None: if format is not None:
self.format = format self.format = format
@ -2631,6 +2633,8 @@ class TarFile(object):
break break
if tarinfo is not None: if tarinfo is not None:
# In stream mode the tarinfo is deliberately not cached.
if not self.stream:
self.members.append(tarinfo) self.members.append(tarinfo)
else: else:
self._loaded = True self._loaded = True
@ -2682,8 +2686,9 @@ class TarFile(object):
def _load(self): def _load(self):
"""Read through the entire archive file and look for readable """Read through the entire archive file and look for readable
members. members. This should not run if the file is set to stream.
""" """
if not self.stream:
while self.next() is not None: while self.next() is not None:
pass pass
self._loaded = True self._loaded = True

View file

@ -100,6 +100,14 @@ class ReadTest(TarTest):
def tearDown(self): def tearDown(self):
self.tar.close() self.tar.close()
class StreamModeTest(ReadTest):
    """ReadTest variant whose archive is opened with ``stream=True``.

    Only the way the tarfile is opened differs from the base class.
    """

    def setUp(self):
        # Same mode and encoding as used by the original call, plus
        # the stream flag.
        open_options = {
            "mode": self.mode,
            "encoding": "iso8859-1",
            "stream": True,
        }
        self.tar = tarfile.open(self.tarname, **open_options)
class UstarReadTest(ReadTest, unittest.TestCase): class UstarReadTest(ReadTest, unittest.TestCase):
@ -852,6 +860,21 @@ class Bz2StreamReadTest(Bz2Test, StreamReadTest):
class LzmaStreamReadTest(LzmaTest, StreamReadTest): class LzmaStreamReadTest(LzmaTest, StreamReadTest):
pass pass
class TarStreamModeReadTest(StreamModeTest, unittest.TestCase):
    """Check that stream mode leaves the ``members`` list empty."""

    def test_stream_mode_no_cache(self):
        # Walk the whole archive, discarding every entry.
        for _entry in self.tar:
            continue
        # Nothing may have been recorded in the members list.
        self.assertEqual(self.tar.members, [])
class GzipStreamModeReadTest(GzipTest, TarStreamModeReadTest):
    """Combine GzipTest with the stream-mode read test."""
class Bz2StreamModeReadTest(Bz2Test, TarStreamModeReadTest):
    """Combine Bz2Test with the stream-mode read test."""
class LzmaStreamModeReadTest(LzmaTest, TarStreamModeReadTest):
    """Combine LzmaTest with the stream-mode read test."""
class DetectReadTest(TarTest, unittest.TestCase): class DetectReadTest(TarTest, unittest.TestCase):
def _testfunc_file(self, name, mode): def _testfunc_file(self, name, mode):

View file

@ -0,0 +1,2 @@
Added a stream mode to ``tarfile`` that allows reading archives
without caching information about their members.