#4661: add bytes parsing and generation to email (email version bump to 5.1.0)

The work on this is not 100% complete, but everything needed for real-world testing of the code is present. The only remaining major to-do item is to (hopefully!) improve the handling of non-ASCII bytes in headers that are converted to unicode, by RFC 2047-encoding them rather than replacing them with '?'s.
2025-08-30 21:48:47 +00:00 · 2010-10-08 15:55:28 +00:00 · 2010-10-08 15:55:28 +00:00 · 96fd54eaec
commit 96fd54eaec
parent 59fdd6736b
11 changed files with 708 additions and 85 deletions
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@ -24,8 +24,26 @@ SEMISPACE = '; '
 # existence of which force quoting of the parameter value.
 tspecials = re.compile(r'[ \(\)<>@,;:\\"/\[\]\?=]')

+# Detects strings that come from a byte source with undecodable bytes: a
+# low surrogate neither preceded by a high one nor followed by a low one.
+_has_surrogates = re.compile(
+    '([^\ud800-\udbff]|\A)[\udc00-\udfff]([^\udc00-\udfff]|\Z)').search
+

 # Helper functions
+def _sanitize_surrogates(value):
+    # Return value with each surrogate-escaped (originally non-ASCII) byte
+    # replaced by '?'.  Used to sanitize header values before letting them
+    # escape as strings.  Values without surrogates are returned unchanged.
+    if not isinstance(value, str):
+        # Not a str (presumably a Header object): pass it through untouched.
+        return value
+    if _has_surrogates(value):
+        original_bytes = value.encode('ascii', 'surrogateescape')
+        return original_bytes.decode('ascii', 'replace').replace('\ufffd', '?')
+    else:
+        return value
+
 def _splitparam(param):
    # Split header parameters.  BAW: this may be too simple.  It isn't
    # strictly RFC 2045 (section 5.1) compliant, but it catches most headers
@ -184,44 +202,72 @@ class Message:
        If the message is a multipart and the decode flag is True, then None
        is returned.
        """
-        if i is None:
-            payload = self._payload
-        elif not isinstance(self._payload, list):
+        # Here is the logic table for this code, based on the email5.0.0 code:
+        #   i     decode  is_multipart  result
+        # ------  ------  ------------  ------------------------------
+        #  None   True    True          None
+        #   i     True    True          None
+        #  None   False   True          _payload (a list)
+        #   i     False   True          _payload element i (a Message)
+        #   i     False   False         error (not a list)
+        #   i     True    False         error (not a list)
+        #  None   False   False         _payload
+        #  None   True    False         _payload decoded (bytes)
+        # Note that Barry planned to factor out the 'decode' case, but that
+        # isn't so easy now that we handle the 8 bit data, which needs to be
+        # converted in both the decode and non-decode path.
+        if self.is_multipart():
+            if decode:
+                return None
+            if i is None:
+                return self._payload
+            else:
+                return self._payload[i]
+        # For backward compatibility, Use isinstance and this error message
+        # instead of the more logical is_multipart test.
+        if i is not None and not isinstance(self._payload, list):
            raise TypeError('Expected list, got %s' % type(self._payload))
-        else:
-            payload = self._payload[i]
+        payload = self._payload
+        cte = self.get('content-transfer-encoding', '').lower()
+        # payload can be bytes here, (I wonder if that is actually a bug?)
+        if isinstance(payload, str):
+            if _has_surrogates(payload):
+                bpayload = payload.encode('ascii', 'surrogateescape')
+                if not decode:
+                    try:
+                        payload = bpayload.decode(self.get_param('charset', 'ascii'), 'replace')
+                    except LookupError:
+                        payload = bpayload.decode('ascii', 'replace')
+            elif decode:
+                try:
+                    bpayload = payload.encode('ascii')
+                except UnicodeError:
+                    # This won't happen for RFC compliant messages (messages
+                    # containing only ASCII codepoints in the unicode input).
+                    # If it does happen, turn the string into bytes in a way
+                    # guaranteed not to fail.
+                    bpayload = payload.encode('raw-unicode-escape')
        if not decode:
            return payload
-        # Decoded payloads always return bytes.  XXX split this part out into
-        # a new method called .get_decoded_payload().
-        if self.is_multipart():
-            return None
-        cte = self.get('content-transfer-encoding', '').lower()
        if cte == 'quoted-printable':
-            if isinstance(payload, str):
-                payload = payload.encode('ascii')
-            return utils._qdecode(payload)
+            return utils._qdecode(bpayload)
        elif cte == 'base64':
            try:
-                if isinstance(payload, str):
-                    payload = payload.encode('ascii')
-                return base64.b64decode(payload)
+                return base64.b64decode(bpayload)
            except binascii.Error:
                # Incorrect padding
-                pass
+                return bpayload
        elif cte in ('x-uuencode', 'uuencode', 'uue', 'x-uue'):
-            in_file = BytesIO(payload.encode('ascii'))
+            in_file = BytesIO(bpayload)
            out_file = BytesIO()
            try:
                uu.decode(in_file, out_file, quiet=True)
                return out_file.getvalue()
            except uu.Error:
                # Some decoding problem
-                pass
-        # Is there a better way to do this?  We can't use the bytes
-        # constructor.
+                return bpayload
        if isinstance(payload, str):
-            return payload.encode('raw-unicode-escape')
+            return bpayload
        return payload

    def set_payload(self, payload, charset=None):
@ -340,7 +386,7 @@ class Message:
        Any fields deleted and re-inserted are always appended to the header
        list.
        """
-        return [v for k, v in self._headers]
+        return [_sanitize_surrogates(v) for k, v in self._headers]

    def items(self):
        """Get all the message's header fields and values.
@ -350,7 +396,7 @@ class Message:
        Any fields deleted and re-inserted are always appended to the header
        list.
        """
-        return self._headers[:]
+        return [(k, _sanitize_surrogates(v)) for k, v in self._headers]

    def get(self, name, failobj=None):
        """Get a header value.
@ -361,7 +407,7 @@ class Message:
        name = name.lower()
        for k, v in self._headers:
            if k.lower() == name:
-                return v
+                return _sanitize_surrogates(v)
        return failobj

    #
@ -381,7 +427,7 @@ class Message:
        name = name.lower()
        for k, v in self._headers:
            if k.lower() == name:
-                values.append(v)
+                values.append(_sanitize_surrogates(v))
        if not values:
            return failobj
        return values