#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s.

This applies only when generating strings from non-RFC-compliant binary input; it makes the existing recoding behavior more consistent (i.e., no data is lost when recoding anymore).
2025-10-17 12:18:23 +00:00 · 2011-01-07 23:25:30 +00:00 · 2011-01-07 23:25:30 +00:00 · 9253214fd9
commit 9253214fd9
parent 6f0022d84a
9 changed files with 109 additions and 62 deletions
--- a/Lib/email/charset.py
+++ b/Lib/email/charset.py
@@ -28,6 +28,7 @@ SHORTEST    = 3 # the shorter of QP and base64, but only for headers
 RFC2047_CHROME_LEN = 7

 DEFAULT_CHARSET = 'us-ascii'
+UNKNOWN8BIT = 'unknown-8bit'
 EMPTYSTRING = ''


@@ -152,6 +153,16 @@ def add_codec(charset, codecname):
    CODEC_MAP[charset] = codecname


+
+# Convenience function for encoding strings, taking into account that they
+# might be unknown-8bit (i.e., contain surrogate-escaped bytes).
+def _encode(string, codec):
+    if codec == UNKNOWN8BIT:
+        return string.encode('ascii', 'surrogateescape')
+    else:
+        return string.encode(codec)
+
+

 class Charset:
    """Map character sets to their email properties.
@@ -282,8 +293,7 @@ class Charset:
        :return: The encoded string, with RFC 2047 chrome.
        """
        codec = self.output_codec or 'us-ascii'
-        charset = self.get_output_charset()
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
@@ -309,7 +319,7 @@ class Charset:
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
-        header_bytes = string.encode(codec)
+        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        encoder = partial(encoder_module.header_encode, charset=str(self))
        # Calculate the number of characters that the RFC 2047 chrome will
@@ -333,7 +343,7 @@ class Charset:
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
-            length = encoder_module.header_length(this_line.encode(charset))
+            length = encoder_module.header_length(_encode(this_line, charset))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
@@ -343,12 +353,12 @@ class Charset:
                else:
                    separator = (' ' if lines else '')
                    joined_line = EMPTYSTRING.join(current_line)
-                    header_bytes = joined_line.encode(codec)
+                    header_bytes = _encode(joined_line, codec)
                    lines.append(encoder(header_bytes))
                current_line = [character]
                maxlen = next(maxlengths) - extra
        joined_line = EMPTYSTRING.join(current_line)
-        header_bytes = joined_line.encode(codec)
+        header_bytes = _encode(joined_line, codec)
        lines.append(encoder(header_bytes))
        return lines