mirror of
https://github.com/python/cpython.git
synced 2025-08-31 05:58:33 +00:00
#18044: Fix parsing of encoded words of the form =?utf8?q?=XX...?=
The problem was that I was only checking for decimal digits after the third '?', not for *hex* digits :(. This changeset also fixes a couple of comment typos, deletes an unused function relating to encoded word parsing, and removes an invalid 'if' test from the folding function that was revealed by the tests written to validate this issue.
This commit is contained in:
parent
3641a74e1c
commit
65171b28e7
5 changed files with 62 additions and 40 deletions
|
@ -69,6 +69,7 @@ XXX: provide complete list of token types.
|
|||
|
||||
import re
|
||||
import urllib # For urllib.parse.unquote
|
||||
from string import hexdigits
|
||||
from collections import namedtuple, OrderedDict
|
||||
from email import _encoded_words as _ew
|
||||
from email import errors
|
||||
|
@ -392,10 +393,6 @@ class UnstructuredTokenList(TokenList):
|
|||
token_type = 'unstructured'
|
||||
|
||||
def _fold(self, folded):
|
||||
if any(x.token_type=='encoded-word' for x in self):
|
||||
return self._fold_encoded(folded)
|
||||
# Here we can have either a pure ASCII string that may or may not
|
||||
# have surrogateescape encoded bytes, or a unicode string.
|
||||
last_ew = None
|
||||
for part in self.parts:
|
||||
tstr = str(part)
|
||||
|
@ -1389,35 +1386,6 @@ def _get_ptext_to_endchars(value, endchars):
|
|||
pos = pos + 1
|
||||
return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp
|
||||
|
||||
def _decode_ew_run(value):
    """Decode the run of RFC 2047 encoded words at the start of value.

    _decode_ew_run(value) -> (text, value, defects)

    Tokens are taken from value one at a time, split on whitespace.  As long
    as a token starts with '=?' and ends with '?=' it is decoded and its text
    is appended to the result; the whitespace separating two consecutive
    encoded words is dropped.  Scanning stops at the first token that does not
    look like an encoded word.  The return value is a triple of the joined
    decoded text, the unconsumed remainder of value (with the whitespace that
    preceded it restored), and the list of defects collected while decoding.

    The caller is expected to pass a value without leading whitespace.

    """
    decoded_chunks = []
    all_defects = []
    # Whitespace seen after the most recent encoded word.  It is only emitted
    # if what follows turns out not to be another encoded word.
    pending_ws = ''
    while value:
        try:
            tok, ws, value = _wsp_splitter(value, 1)
        except ValueError:
            # No whitespace left: the whole remainder is the final token.
            tok, ws, value = value, '', ''
        if not tok.startswith('=?') or not tok.endswith('?='):
            # End of the run; hand back everything not consumed, unchanged.
            remainder = pending_ws + tok + ws + value
            return ''.join(decoded_chunks), remainder, all_defects
        text, _charset, _lang, tok_defects = _ew.decode(tok)
        decoded_chunks.append(text)
        all_defects.extend(tok_defects)
        pending_ws = ws
    # Input exhausted while still inside the run; only trailing whitespace
    # (if any) remains.
    return ''.join(decoded_chunks), pending_ws, all_defects
|
||||
|
||||
def get_fws(value):
|
||||
"""FWS = 1*WSP
|
||||
|
||||
|
@ -1443,7 +1411,8 @@ def get_encoded_word(value):
|
|||
raise errors.HeaderParseError(
|
||||
"expected encoded word but found {}".format(value))
|
||||
remstr = ''.join(remainder)
|
||||
if remstr[:2].isdigit():
|
||||
if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
|
||||
# The ? after the CTE was followed by an encoded word escape (=XX).
|
||||
rest, *remainder = remstr.split('?=', 1)
|
||||
tok = tok + '?=' + rest
|
||||
if len(tok.split()) > 1:
|
||||
|
@ -1491,8 +1460,8 @@ def get_unstructured(value):
|
|||
|
||||
"""
|
||||
# XXX: but what about bare CR and LF? They might signal the start or
|
||||
# end of an encoded word. YAGNI for now, since out current parsers
|
||||
# will never send us strings with bard CR or LF.
|
||||
# end of an encoded word. YAGNI for now, since our current parsers
|
||||
# will never send us strings with bare CR or LF.
|
||||
|
||||
unstructured = UnstructuredTokenList()
|
||||
while value:
|
||||
|
@ -1504,6 +1473,8 @@ def get_unstructured(value):
|
|||
try:
|
||||
token, value = get_encoded_word(value)
|
||||
except errors.HeaderParseError:
|
||||
# XXX: Need to figure out how to register defects when
|
||||
# appropriate here.
|
||||
pass
|
||||
else:
|
||||
have_ws = True
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue