gh-133890: Handle UnicodeEncodeError in tarfile (GH-134147)

UnicodeEncodeError is now handled the same way as OSError during
TarFile member extraction.
This commit is contained in:
Serhiy Storchaka 2025-05-18 22:21:06 +03:00 committed by GitHub
parent 5cbc8c632e
commit 9983c7d441
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 49 additions and 6 deletions

View file

@ -3490,11 +3490,12 @@ class ArchiveMaker:
with t.open() as tar:
... # `tar` is now a TarFile with 'filename' in it!
"""
def __init__(self):
def __init__(self, **kwargs):
self.bio = io.BytesIO()
self.tar_kwargs = dict(kwargs)
def __enter__(self):
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
return self
def __exit__(self, *exc):
@ -4073,7 +4074,10 @@ class TestExtractionFilters(unittest.TestCase):
# that in the test archive.)
with tarfile.TarFile.open(tarname) as tar:
for tarinfo in tar.getmembers():
filtered = tarfile.tar_filter(tarinfo, '')
try:
filtered = tarfile.tar_filter(tarinfo, '')
except UnicodeEncodeError:
continue
self.assertIs(filtered.name, tarinfo.name)
self.assertIs(filtered.type, tarinfo.type)
@ -4084,11 +4088,48 @@ class TestExtractionFilters(unittest.TestCase):
for tarinfo in tar.getmembers():
try:
filtered = tarfile.data_filter(tarinfo, '')
except tarfile.FilterError:
except (tarfile.FilterError, UnicodeEncodeError):
continue
self.assertIs(filtered.name, tarinfo.name)
self.assertIs(filtered.type, tarinfo.type)
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
def test_filter_unencodable(self):
# Sanity check using a valid path.
tarinfo = tarfile.TarInfo(os_helper.TESTFN)
filtered = tarfile.tar_filter(tarinfo, '')
self.assertIs(filtered.name, tarinfo.name)
filtered = tarfile.data_filter(tarinfo, '')
self.assertIs(filtered.name, tarinfo.name)
tarinfo = tarfile.TarInfo('test\x00')
self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
tarinfo = tarfile.TarInfo('\ud800')
self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
def test_extract_unencodable(self):
# Create a member with name \xed\xa0\x80 which is UTF-8 encoded
# lone surrogate \ud800.
with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
arc.add('\udced\udca0\udc80')
with os_helper.temp_cwd() as tmp:
tar = arc.open(encoding='utf-8', errors='surrogatepass',
errorlevel=1)
self.assertEqual(tar.getnames(), ['\ud800'])
with self.assertRaises(UnicodeEncodeError):
tar.extractall()
self.assertEqual(os.listdir(), [])
tar = arc.open(encoding='utf-8', errors='surrogatepass',
errorlevel=0, debug=1)
with support.captured_stderr() as stderr:
tar.extractall()
self.assertEqual(os.listdir(), [])
self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
def test_change_default_filter_on_instance(self):
tar = tarfile.TarFile(tarname, 'r')
def strict_filter(tarinfo, path):