mirror of
https://github.com/python/cpython.git
synced 2025-08-30 21:48:47 +00:00
#4661: add bytes parsing and generation to email (email version bump to 5.1.0)
The work on this is not 100% complete, but everything needed for real-world testing of the code is present. The only remaining major to-do item is to (hopefully!) improve the handling of non-ASCII bytes in headers that are converted to unicode, by encoding them per RFC 2047 rather than replacing them with '?' characters.
This commit is contained in:
parent
59fdd6736b
commit
96fd54eaec
11 changed files with 708 additions and 85 deletions
|
@ -24,8 +24,26 @@ SEMISPACE = '; '
|
|||
# existence of which force quoting of the parameter value.
|
||||
tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')
|
||||
|
||||
# How to figure out if we are processing strings that come from a byte
|
||||
# source with undecodable characters.
|
||||
_has_surrogates = re.compile(
|
||||
'([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
|
||||
|
||||
|
||||
# Helper functions
|
||||
def _sanitize_surrogates(value):
|
||||
# If the value contains surrogates, re-decode and replace the original
|
||||
# non-ascii bytes with '?'s. Used to sanitize header values before letting
|
||||
# them escape as strings.
|
||||
if not isinstance(value, str):
|
||||
# Header object
|
||||
return value
|
||||
if _has_surrogates(value):
|
||||
original_bytes = value.encode('ascii', 'surrogateescape')
|
||||
return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?')
|
||||
else:
|
||||
return value
|
||||
|
||||
def _splitparam(param):
|
||||
# Split header parameters. BAW: this may be too simple. It isn't
|
||||
# strictly RFC 2045 (section 5.1) compliant, but it catches most headers
|
||||
|
@ -184,44 +202,72 @@ class Message:
|
|||
If the message is a multipart and the decode flag is True, then None
|
||||
is returned.
|
||||
"""
|
||||
if i is None:
|
||||
payload = self._payload
|
||||
elif not isinstance(self._payload, list):
|
||||
# Here is the logic table for this code, based on the email5.0.0 code:
|
||||
# i decode is_multipart result
|
||||
# ------ ------ ------------ ------------------------------
|
||||
# None True True None
|
||||
# i True True None
|
||||
# None False True _payload (a list)
|
||||
# i False True _payload element i (a Message)
|
||||
# i False False error (not a list)
|
||||
# i True False error (not a list)
|
||||
# None False False _payload
|
||||
# None True False _payload decoded (bytes)
|
||||
# Note that Barry planned to factor out the 'decode' case, but that
|
||||
# isn't so easy now that we handle the 8 bit data, which needs to be
|
||||
# converted in both the decode and non-decode path.
|
||||
if self.is_multipart():
|
||||
if decode:
|
||||
return None
|
||||
if i is None:
|
||||
return self._payload
|
||||
else:
|
||||
return self._payload[i]
|
||||
# For backward compatibility, Use isinstance and this error message
|
||||
# instead of the more logical is_multipart test.
|
||||
if i is not None and not isinstance(self._payload, list):
|
||||
raise TypeError('Expected list, got %s' % type(self._payload))
|
||||
else:
|
||||
payload = self._payload[i]
|
||||
payload = self._payload
|
||||
cte = self.get('content-transfer-encoding', '').lower()
|
||||
# payload can be bytes here, (I wonder if that is actually a bug?)
|
||||
if isinstance(payload, str):
|
||||
if _has_surrogates(payload):
|
||||
bpayload = payload.encode('ascii', 'surrogateescape')
|
||||
if not decode:
|
||||
try:
|
||||
payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
|
||||
except LookupError:
|
||||
payload = bpayload.decode('ascii', 'replace')
|
||||
elif decode:
|
||||
try:
|
||||
bpayload = payload.encode('ascii')
|
||||
except UnicodeError:
|
||||
# This won't happen for RFC compliant messages (messages
|
||||
# containing only ASCII codepoints in the unicode input).
|
||||
# If it does happen, turn the string into bytes in a way
|
||||
# guaranteed not to fail.
|
||||
bpayload = payload.encode('raw-unicode-escape')
|
||||
if not decode:
|
||||
return payload
|
||||
# Decoded payloads always return bytes. XXX split this part out into
|
||||
# a new method called .get_decoded_payload().
|
||||
if self.is_multipart():
|
||||
return None
|
||||
cte = self.get('content-transfer-encoding', '').lower()
|
||||
if cte == 'quoted-printable':
|
||||
if isinstance(payload, str):
|
||||
payload = payload.encode('ascii')
|
||||
return utils._qdecode(payload)
|
||||
return utils._qdecode(bpayload)
|
||||
elif cte == 'base64':
|
||||
try:
|
||||
if isinstance(payload, str):
|
||||
payload = payload.encode('ascii')
|
||||
return base64.b64decode(payload)
|
||||
return base64.b64decode(bpayload)
|
||||
except binascii.Error:
|
||||
# Incorrect padding
|
||||
pass
|
||||
return bpayload
|
||||
elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
|
||||
in_file = BytesIO(payload.encode('ascii'))
|
||||
in_file = BytesIO(bpayload)
|
||||
out_file = BytesIO()
|
||||
try:
|
||||
uu.decode(in_file, out_file, quiet=True)
|
||||
return out_file.getvalue()
|
||||
except uu.Error:
|
||||
# Some decoding problem
|
||||
pass
|
||||
# Is there a better way to do this? We can't use the bytes
|
||||
# constructor.
|
||||
return bpayload
|
||||
if isinstance(payload, str):
|
||||
return payload.encode('raw-unicode-escape')
|
||||
return bpayload
|
||||
return payload
|
||||
|
||||
def set_payload(self, payload, charset=None):
|
||||
|
@ -340,7 +386,7 @@ class Message:
|
|||
Any fields deleted and re-inserted are always appended to the header
|
||||
list.
|
||||
"""
|
||||
return [v for k, v in self._headers]
|
||||
return [_sanitize_surrogates(v) for k, v in self._headers]
|
||||
|
||||
def items(self):
|
||||
"""Get all the message's header fields and values.
|
||||
|
@ -350,7 +396,7 @@ class Message:
|
|||
Any fields deleted and re-inserted are always appended to the header
|
||||
list.
|
||||
"""
|
||||
return self._headers[:]
|
||||
return [(k, _sanitize_surrogates(v)) for k, v in self._headers]
|
||||
|
||||
def get(self, name, failobj=None):
|
||||
"""Get a header value.
|
||||
|
@ -361,7 +407,7 @@ class Message:
|
|||
name = name.lower()
|
||||
for k, v in self._headers:
|
||||
if k.lower() == name:
|
||||
return v
|
||||
return _sanitize_surrogates(v)
|
||||
return failobj
|
||||
|
||||
#
|
||||
|
@ -381,7 +427,7 @@ class Message:
|
|||
name = name.lower()
|
||||
for k, v in self._headers:
|
||||
if k.lower() == name:
|
||||
values.append(v)
|
||||
values.append(_sanitize_surrogates(v))
|
||||
if not values:
|
||||
return failobj
|
||||
return values
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue