gh-88233: zipfile: refactor _strip_extra (#102084)

* Refactor zipfile._strip_extra to use higher-level abstractions for extras instead of a state-heavy loop.

* Add blurb

* Remove _strip_extra and use _Extra.strip directly.

* Use memoryview to avoid unnecessary copies while splitting Extras.
This commit is contained in:
Jason R. Coombs 2023-09-25 19:46:58 -04:00 committed by GitHub
parent 25bb266fc8
commit e9791ba351
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 61 additions and 45 deletions

View file

@ -188,28 +188,42 @@ _CD64_OFFSET_START_CENTDIR = 9
# 0x08074b50 packs little-endian to the bytes b'PK\x07\x08'.
# NOTE(review): presumably the data descriptor record signature -- confirm
# against the ZIP specification.
_DD_SIGNATURE = 0x08074b50
# Header that precedes each extra field's payload: two little-endian
# unsigned 16-bit integers, unpacked below as (field ID, payload length).
_EXTRA_FIELD_STRUCT = struct.Struct('<HH')
def _strip_extra(extra, xids):
    """Return *extra* with every field whose ID is in *xids* removed.

    *extra* is a run of records, each a 4-byte (ID, length) header followed
    by ``length`` payload bytes.  Records are walked in order; anything left
    over that is too short to hold a header is kept untouched.  When no
    record matches, the very same *extra* object is handed back.
    """
    total = len(extra)
    kept = []
    removed_any = False
    keep_from = 0  # start of the pending run of bytes to preserve
    pos = 0
    while total - pos >= 4:
        payload_start = pos + 4
        xid, xlen = _EXTRA_FIELD_STRUCT.unpack(extra[pos:payload_start])
        next_pos = payload_start + xlen
        if xid in xids:
            removed_any = True
            # Flush whatever was accumulated before this record, then
            # resume keeping bytes only after it.
            if keep_from != pos:
                kept.append(extra[keep_from:pos])
            keep_from = next_pos
        pos = next_pos

    if not removed_any:
        return extra

    if keep_from != total:
        kept.append(extra[keep_from:])
    return b''.join(kept)
class _Extra(bytes):
FIELD_STRUCT = struct.Struct('<HH')
def __new__(cls, val, id=None):
return super().__new__(cls, val)
def __init__(self, val, id=None):
self.id = id
@classmethod
def read_one(cls, raw):
try:
xid, xlen = cls.FIELD_STRUCT.unpack(raw[:4])
except struct.error:
xid = None
xlen = 0
return cls(raw[:4+xlen], xid), raw[4+xlen:]
@classmethod
def split(cls, data):
# use memoryview for zero-copy slices
rest = memoryview(data)
while rest:
extra, rest = _Extra.read_one(rest)
yield extra
@classmethod
def strip(cls, data, xids):
"""Remove Extra fields with specified IDs."""
return b''.join(
ex
for ex in cls.split(data)
if ex.id not in xids
)
def _check_zipfile(fp):
try:
@ -1963,7 +1977,7 @@ class ZipFile:
min_version = 0
if extra:
# Append a ZIP64 field to the extra's
extra_data = _strip_extra(extra_data, (1,))
extra_data = _Extra.strip(extra_data, (1,))
extra_data = struct.pack(
'<HH' + 'Q'*len(extra),
1, 8*len(extra), *extra) + extra_data