#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s.

This applies only when generating strings from non-RFC-compliant binary
input; it makes the existing recoding behavior more consistent (i.e.,
no data is lost when recoding now).
This commit is contained in:
R. David Murray 2011-01-07 23:25:30 +00:00
parent 6f0022d84a
commit 9253214fd9
9 changed files with 109 additions and 62 deletions

View file

@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
RFC2047_CHROME_LEN = 7
DEFAULT_CHARSET = 'us-ascii'
UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
@ -152,6 +153,16 @@ def add_codec(charset, codecname):
CODEC_MAP[charset] = codecname
# Helper that turns a header string into bytes.  A string whose codec is
# unknown-8bit may contain surrogate-escaped bytes, so it is encoded as
# ASCII with the 'surrogateescape' error handler; any other codec is
# passed straight through to str.encode().
def _encode(string, codec):
    if codec != UNKNOWN8BIT:
        return string.encode(codec)
    return string.encode('ascii', 'surrogateescape')
class Charset:
"""Map character sets to their email properties.
@ -282,8 +293,7 @@ class Charset:
:return: The encoded string, with RFC 2047 chrome.
"""
codec = self.output_codec or 'us-ascii'
charset = self.get_output_charset()
header_bytes = string.encode(codec)
header_bytes = _encode(string, codec)
# 7bit/8bit encodings return the string unchanged (modulo conversions)
encoder_module = self._get_encoder(header_bytes)
if encoder_module is None:
@ -309,7 +319,7 @@ class Charset:
"""
# See which encoding we should use.
codec = self.output_codec or 'us-ascii'
header_bytes = string.encode(codec)
header_bytes = _encode(string, codec)
encoder_module = self._get_encoder(header_bytes)
encoder = partial(encoder_module.header_encode, charset=str(self))
# Calculate the number of characters that the RFC 2047 chrome will
@ -333,7 +343,7 @@ class Charset:
for character in string:
current_line.append(character)
this_line = EMPTYSTRING.join(current_line)
length = encoder_module.header_length(this_line.encode(charset))
length = encoder_module.header_length(_encode(this_line, charset))
if length > maxlen:
# This last character doesn't fit so pop it off.
current_line.pop()
@ -343,12 +353,12 @@ class Charset:
else:
separator = (' ' if lines else '')
joined_line = EMPTYSTRING.join(current_line)
header_bytes = joined_line.encode(codec)
header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
current_line = [character]
maxlen = next(maxlengths) - extra
joined_line = EMPTYSTRING.join(current_line)
header_bytes = joined_line.encode(codec)
header_bytes = _encode(joined_line, codec)
lines.append(encoder(header_bytes))
return lines