mirror of
https://github.com/python/cpython.git
synced 2025-08-31 05:58:33 +00:00
#10686: recode non-ASCII headers to 'unknown-8bit' instead of ?s.
This applies only when generating strings from non-RFC compliant binary input; it makes the existing recoding behavior more consistent (ie: now no data is lost when recoding).
This commit is contained in:
parent
6f0022d84a
commit
9253214fd9
9 changed files with 109 additions and 62 deletions
|
@ -28,6 +28,7 @@ SHORTEST = 3 # the shorter of QP and base64, but only for headers
|
|||
RFC2047_CHROME_LEN = 7
|
||||
|
||||
DEFAULT_CHARSET = 'us-ascii'
|
||||
UNKNOWN8BIT = 'unknown-8bit'
|
||||
EMPTYSTRING = ''
|
||||
|
||||
|
||||
|
@ -152,6 +153,16 @@ def add_codec(charset, codecname):
|
|||
CODEC_MAP[charset] = codecname
|
||||
|
||||
|
||||
|
||||
# Convenience function for encoding strings, taking into account
|
||||
# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
|
||||
def _encode(string, codec):
|
||||
if codec == UNKNOWN8BIT:
|
||||
return string.encode('ascii', 'surrogateescape')
|
||||
else:
|
||||
return string.encode(codec)
|
||||
|
||||
|
||||
|
||||
class Charset:
|
||||
"""Map character sets to their email properties.
|
||||
|
@ -282,8 +293,7 @@ class Charset:
|
|||
:return: The encoded string, with RFC 2047 chrome.
|
||||
"""
|
||||
codec = self.output_codec or 'us-ascii'
|
||||
charset = self.get_output_charset()
|
||||
header_bytes = string.encode(codec)
|
||||
header_bytes = _encode(string, codec)
|
||||
# 7bit/8bit encodings return the string unchanged (modulo conversions)
|
||||
encoder_module = self._get_encoder(header_bytes)
|
||||
if encoder_module is None:
|
||||
|
@ -309,7 +319,7 @@ class Charset:
|
|||
"""
|
||||
# See which encoding we should use.
|
||||
codec = self.output_codec or 'us-ascii'
|
||||
header_bytes = string.encode(codec)
|
||||
header_bytes = _encode(string, codec)
|
||||
encoder_module = self._get_encoder(header_bytes)
|
||||
encoder = partial(encoder_module.header_encode, charset=str(self))
|
||||
# Calculate the number of characters that the RFC 2047 chrome will
|
||||
|
@ -333,7 +343,7 @@ class Charset:
|
|||
for character in string:
|
||||
current_line.append(character)
|
||||
this_line = EMPTYSTRING.join(current_line)
|
||||
length = encoder_module.header_length(this_line.encode(charset))
|
||||
length = encoder_module.header_length(_encode(this_line, charset))
|
||||
if length > maxlen:
|
||||
# This last character doesn't fit so pop it off.
|
||||
current_line.pop()
|
||||
|
@ -343,12 +353,12 @@ class Charset:
|
|||
else:
|
||||
separator = (' ' if lines else '')
|
||||
joined_line = EMPTYSTRING.join(current_line)
|
||||
header_bytes = joined_line.encode(codec)
|
||||
header_bytes = _encode(joined_line, codec)
|
||||
lines.append(encoder(header_bytes))
|
||||
current_line = [character]
|
||||
maxlen = next(maxlengths) - extra
|
||||
joined_line = EMPTYSTRING.join(current_line)
|
||||
header_bytes = joined_line.encode(codec)
|
||||
header_bytes = _encode(joined_line, codec)
|
||||
lines.append(encoder(header_bytes))
|
||||
return lines
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue