gh-113594: Fix UnicodeEncodeError in TokenList.fold() (GH-113730)

It occurred when trying to re-encode an unknown-8bit part combined with a non-unknown-8bit part.
This commit is contained in:
Serhiy Storchaka 2024-01-10 14:54:36 +02:00 committed by GitHub
parent 568d220993
commit e9d5b6ea2d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 48 additions and 0 deletions

View file

@ -2766,6 +2766,7 @@ def _refold_parse_tree(parse_tree, *, policy):
encoding = 'utf-8' if policy.utf8 else 'us-ascii'
lines = ['']
last_ew = None
last_charset = None
wrap_as_ew_blocked = 0
want_encoding = False
end_ew_not_allowed = Terminal('', 'wrap_as_ew_blocked')
@ -2820,8 +2821,14 @@ def _refold_parse_tree(parse_tree, *, policy):
else:
# It's a terminal, wrap it as an encoded word, possibly
# combining it with previously encoded words if allowed.
if (last_ew is not None and
charset != last_charset and
(last_charset == 'unknown-8bit' or
last_charset == 'utf-8' and charset != 'us-ascii')):
last_ew = None
last_ew = _fold_as_ew(tstr, lines, maxlen, last_ew,
part.ew_combine_allowed, charset)
last_charset = charset
want_encoding = False
continue
if len(tstr) <= maxlen - len(lines[-1]):