mirror of
https://github.com/python/cpython.git
synced 2025-09-27 02:39:58 +00:00
Merge: #18431: Decode encoded words in atoms in new email parser.
This commit is contained in:
commit
1f9d24a18d
4 changed files with 73 additions and 3 deletions
|
@ -1624,6 +1624,7 @@ def get_quoted_string(value):
|
||||||
def get_atom(value):
|
def get_atom(value):
|
||||||
"""atom = [CFWS] 1*atext [CFWS]
|
"""atom = [CFWS] 1*atext [CFWS]
|
||||||
|
|
||||||
|
An atom could be an rfc2047 encoded word.
|
||||||
"""
|
"""
|
||||||
atom = Atom()
|
atom = Atom()
|
||||||
if value and value[0] in CFWS_LEADER:
|
if value and value[0] in CFWS_LEADER:
|
||||||
|
@ -1632,7 +1633,15 @@ def get_atom(value):
|
||||||
if value and value[0] in ATOM_ENDS:
|
if value and value[0] in ATOM_ENDS:
|
||||||
raise errors.HeaderParseError(
|
raise errors.HeaderParseError(
|
||||||
"expected atom but found '{}'".format(value))
|
"expected atom but found '{}'".format(value))
|
||||||
token, value = get_atext(value)
|
if value.startswith('=?'):
|
||||||
|
try:
|
||||||
|
token, value = get_encoded_word(value)
|
||||||
|
except errors.HeaderParseError:
|
||||||
|
# XXX: need to figure out how to register defects when
|
||||||
|
# appropriate here.
|
||||||
|
token, value = get_atext(value)
|
||||||
|
else:
|
||||||
|
token, value = get_atext(value)
|
||||||
atom.append(token)
|
atom.append(token)
|
||||||
if value and value[0] in CFWS_LEADER:
|
if value and value[0] in CFWS_LEADER:
|
||||||
token, value = get_cfws(value)
|
token, value = get_cfws(value)
|
||||||
|
@ -1661,12 +1670,22 @@ def get_dot_atom_text(value):
|
||||||
def get_dot_atom(value):
|
def get_dot_atom(value):
|
||||||
""" dot-atom = [CFWS] dot-atom-text [CFWS]
|
""" dot-atom = [CFWS] dot-atom-text [CFWS]
|
||||||
|
|
||||||
|
Any place we can have a dot atom, we could instead have an rfc2047 encoded
|
||||||
|
word.
|
||||||
"""
|
"""
|
||||||
dot_atom = DotAtom()
|
dot_atom = DotAtom()
|
||||||
if value[0] in CFWS_LEADER:
|
if value[0] in CFWS_LEADER:
|
||||||
token, value = get_cfws(value)
|
token, value = get_cfws(value)
|
||||||
dot_atom.append(token)
|
dot_atom.append(token)
|
||||||
token, value = get_dot_atom_text(value)
|
if value.startswith('=?'):
|
||||||
|
try:
|
||||||
|
token, value = get_encoded_word(value)
|
||||||
|
except errors.HeaderParseError:
|
||||||
|
# XXX: need to figure out how to register defects when
|
||||||
|
# appropriate here.
|
||||||
|
token, value = get_dot_atom_text(value)
|
||||||
|
else:
|
||||||
|
token, value = get_dot_atom_text(value)
|
||||||
dot_atom.append(token)
|
dot_atom.append(token)
|
||||||
if value and value[0] in CFWS_LEADER:
|
if value and value[0] in CFWS_LEADER:
|
||||||
token, value = get_cfws(value)
|
token, value = get_cfws(value)
|
||||||
|
|
|
@ -808,9 +808,13 @@ class TestParser(TestParserMixin, TestEmailBase):
|
||||||
self.assertEqual(atom[2].comments, ['bar'])
|
self.assertEqual(atom[2].comments, ['bar'])
|
||||||
|
|
||||||
def test_get_atom_atom_ends_at_noncfws(self):
|
def test_get_atom_atom_ends_at_noncfws(self):
|
||||||
atom = self._test_get_x(parser.get_atom,
|
self._test_get_x(parser.get_atom,
|
||||||
'bob fred', 'bob ', 'bob ', [], 'fred')
|
'bob fred', 'bob ', 'bob ', [], 'fred')
|
||||||
|
|
||||||
|
def test_get_atom_rfc2047_atom(self):
|
||||||
|
self._test_get_x(parser.get_atom,
|
||||||
|
'=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
|
||||||
|
|
||||||
# get_dot_atom_text
|
# get_dot_atom_text
|
||||||
|
|
||||||
def test_get_dot_atom_text(self):
|
def test_get_dot_atom_text(self):
|
||||||
|
@ -885,6 +889,10 @@ class TestParser(TestParserMixin, TestEmailBase):
|
||||||
with self.assertRaises(errors.HeaderParseError):
|
with self.assertRaises(errors.HeaderParseError):
|
||||||
parser.get_dot_atom(' (foo) bar.bang. foo')
|
parser.get_dot_atom(' (foo) bar.bang. foo')
|
||||||
|
|
||||||
|
def test_get_dot_atom_rfc2047_atom(self):
|
||||||
|
self._test_get_x(parser.get_dot_atom,
|
||||||
|
'=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
|
||||||
|
|
||||||
# get_word (if this were black box we'd repeat all the qs/atom tests)
|
# get_word (if this were black box we'd repeat all the qs/atom tests)
|
||||||
|
|
||||||
def test_get_word_atom_yields_atom(self):
|
def test_get_word_atom_yields_atom(self):
|
||||||
|
@ -2156,6 +2164,22 @@ class TestParser(TestParserMixin, TestEmailBase):
|
||||||
self.assertEqual(address[0].token_type,
|
self.assertEqual(address[0].token_type,
|
||||||
'mailbox')
|
'mailbox')
|
||||||
|
|
||||||
|
def test_get_address_rfc2047_display_name(self):
|
||||||
|
address = self._test_get_x(parser.get_address,
|
||||||
|
'=?utf-8?q?=C3=89ric?= <foo@example.com>',
|
||||||
|
'Éric <foo@example.com>',
|
||||||
|
'Éric <foo@example.com>',
|
||||||
|
[],
|
||||||
|
'')
|
||||||
|
self.assertEqual(address.token_type, 'address')
|
||||||
|
self.assertEqual(len(address.mailboxes), 1)
|
||||||
|
self.assertEqual(address.mailboxes,
|
||||||
|
address.all_mailboxes)
|
||||||
|
self.assertEqual(address.mailboxes[0].display_name,
|
||||||
|
'Éric')
|
||||||
|
self.assertEqual(address[0].token_type,
|
||||||
|
'mailbox')
|
||||||
|
|
||||||
def test_get_address_empty_group(self):
|
def test_get_address_empty_group(self):
|
||||||
address = self._test_get_x(parser.get_address,
|
address = self._test_get_x(parser.get_address,
|
||||||
'Monty Python:;',
|
'Monty Python:;',
|
||||||
|
|
|
@ -158,6 +158,10 @@ class TestUnstructuredHeader(TestHeaderBase):
|
||||||
'=?utf-8?q?=C3=89ric?=',
|
'=?utf-8?q?=C3=89ric?=',
|
||||||
'Éric'),
|
'Éric'),
|
||||||
|
|
||||||
|
'rfc2047_quopri_with_regular_text': (
|
||||||
|
'The =?utf-8?q?=C3=89ric=2C?= Himself',
|
||||||
|
'The Éric, Himself'),
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
|
||||||
'example.com',
|
'example.com',
|
||||||
None),
|
None),
|
||||||
|
|
||||||
|
'rfc2047_atom_is_decoded':
|
||||||
|
('=?utf-8?q?=C3=89ric?= <foo@example.com>',
|
||||||
|
[],
|
||||||
|
'Éric <foo@example.com>',
|
||||||
|
'Éric',
|
||||||
|
'foo@example.com',
|
||||||
|
'foo',
|
||||||
|
'example.com',
|
||||||
|
None),
|
||||||
|
|
||||||
|
'rfc2047_atom_in_phrase_is_decoded':
|
||||||
|
('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
|
||||||
|
[],
|
||||||
|
'"The Éric, Himself" <foo@example.com>',
|
||||||
|
'The Éric, Himself',
|
||||||
|
'foo@example.com',
|
||||||
|
'foo',
|
||||||
|
'example.com',
|
||||||
|
None),
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# XXX: Need many more examples, and in particular some with names in
|
# XXX: Need many more examples, and in particular some with names in
|
||||||
|
|
|
@ -154,6 +154,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #18431: The new email header parser now decodes RFC 2047 encoded words
|
||||||
|
in structured headers.
|
||||||
|
|
||||||
- Issue #18044: The new email header parser was mis-parsing encoded words where
|
- Issue #18044: The new email header parser was mis-parsing encoded words where
|
||||||
an encoded character immediately followed the '?' that follows the CTE
|
an encoded character immediately followed the '?' that follows the CTE
|
||||||
character, resulting in a decoding failure. They are now decoded correctly.
|
character, resulting in a decoding failure. They are now decoded correctly.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue