mirror of
https://github.com/python/cpython.git
synced 2025-08-10 03:49:18 +00:00
[3.12] gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730) (GH-113907)
It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part.
(cherry picked from commit e9d5b6ea2d
)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
082998c3af
commit
84677ff19c
3 changed files with 48 additions and 0 deletions
|
@ -2766,6 +2766,7 @@ def _refold_parse_tree(parse_tree, *, policy):
|
||||||
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
|
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
|
||||||
lines = ['']
|
lines = ['']
|
||||||
last_ew = None
|
last_ew = None
|
||||||
|
last_charset = None
|
||||||
wrap_as_ew_blocked = 0
|
wrap_as_ew_blocked = 0
|
||||||
want_encoding = False
|
want_encoding = False
|
||||||
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
|
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
|
||||||
|
@ -2820,8 +2821,14 @@ def _refold_parse_tree(parse_tree, *, policy):
|
||||||
else:
|
else:
|
||||||
# It's a terminal, wrap it as an encoded word, possibly
|
# It's a terminal, wrap it as an encoded word, possibly
|
||||||
# combining it with previously encoded words if allowed.
|
# combining it with previously encoded words if allowed.
|
||||||
|
if (last_ew is not None and
|
||||||
|
charset != last_charset and
|
||||||
|
(last_charset == 'unknown-8bit' or
|
||||||
|
last_charset == 'utf-8' and charset != 'us-ascii')):
|
||||||
|
last_ew = None
|
||||||
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
|
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
|
||||||
part.ew_combine_allowed, charset)
|
part.ew_combine_allowed, charset)
|
||||||
|
last_charset = charset
|
||||||
want_encoding = False
|
want_encoding = False
|
||||||
continue
|
continue
|
||||||
if len(tstr) <= maxlen - len(lines[-1]):
|
if len(tstr) <= maxlen - len(lines[-1]):
|
||||||
|
|
|
@ -2915,6 +2915,45 @@ class TestFolding(TestEmailBase):
|
||||||
"mich. And that's\n"
|
"mich. And that's\n"
|
||||||
" all I'm sayin.\n")
|
" all I'm sayin.\n")
|
||||||
|
|
||||||
|
def test_unicode_after_unknown_not_combined(self):
|
||||||
|
self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=\xa4"),
|
||||||
|
"=?unknown-8bit?q?=A4?==?utf-8?q?=C2=A4?=\n")
|
||||||
|
prefix = "0123456789 "*5
|
||||||
|
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=\xa4"),
|
||||||
|
prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?=C2=A4?=\n")
|
||||||
|
|
||||||
|
def test_ascii_after_unknown_not_combined(self):
|
||||||
|
self._test(parser.get_unstructured("=?unknown-8bit?q?=A4?=abc"),
|
||||||
|
"=?unknown-8bit?q?=A4?=abc\n")
|
||||||
|
prefix = "0123456789 "*5
|
||||||
|
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=A4?=abc"),
|
||||||
|
prefix + "=?unknown-8bit?q?=A4?=\n =?utf-8?q?abc?=\n")
|
||||||
|
|
||||||
|
def test_unknown_after_unicode_not_combined(self):
|
||||||
|
self._test(parser.get_unstructured("\xa4"
|
||||||
|
"=?unknown-8bit?q?=A4?="),
|
||||||
|
"=?utf-8?q?=C2=A4?==?unknown-8bit?q?=A4?=\n")
|
||||||
|
prefix = "0123456789 "*5
|
||||||
|
self._test(parser.get_unstructured(prefix + "\xa4=?unknown-8bit?q?=A4?="),
|
||||||
|
prefix + "=?utf-8?q?=C2=A4?=\n =?unknown-8bit?q?=A4?=\n")
|
||||||
|
|
||||||
|
def test_unknown_after_ascii_not_combined(self):
|
||||||
|
self._test(parser.get_unstructured("abc"
|
||||||
|
"=?unknown-8bit?q?=A4?="),
|
||||||
|
"abc=?unknown-8bit?q?=A4?=\n")
|
||||||
|
prefix = "0123456789 "*5
|
||||||
|
self._test(parser.get_unstructured(prefix + "abcd=?unknown-8bit?q?=A4?="),
|
||||||
|
prefix + "abcd\n =?unknown-8bit?q?=A4?=\n")
|
||||||
|
|
||||||
|
def test_unknown_after_unknown(self):
|
||||||
|
self._test(parser.get_unstructured("=?unknown-8bit?q?=C2?="
|
||||||
|
"=?unknown-8bit?q?=A4?="),
|
||||||
|
"=?unknown-8bit?q?=C2=A4?=\n")
|
||||||
|
prefix = "0123456789 "*5
|
||||||
|
self._test(parser.get_unstructured(prefix + "=?unknown-8bit?q?=C2?="
|
||||||
|
"=?unknown-8bit?q?=A4?="),
|
||||||
|
prefix + "=?unknown-8bit?q?=C2?=\n =?unknown-8bit?q?=A4?=\n")
|
||||||
|
|
||||||
# XXX Need test of an encoded word so long that it needs to be wrapped
|
# XXX Need test of an encoded word so long that it needs to be wrapped
|
||||||
|
|
||||||
def test_simple_address(self):
|
def test_simple_address(self):
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
Fix :exc:`UnicodeEncodeError` in :mod:`email` when re-folding lines that
|
||||||
|
contain an unknown-8bit encoded part followed by a non-unknown-8bit encoded part.
|
Loading…
Add table
Add a link
Reference in a new issue