Merge: #18431: Decode encoded words in atoms in new email parser.

2025-09-27 02:39:58 +00:00 · 2013-07-12 16:01:10 -04:00 · 2013-07-12 16:01:10 -04:00 · 1f9d24a18d
commit 1f9d24a18d
parent ae95b4f7a5 923512f327
4 changed files with 73 additions and 3 deletions
--- a/Lib/email/_header_value_parser.py
+++ b/Lib/email/_header_value_parser.py
@ -1624,6 +1624,7 @@ def get_quoted_string(value):
 def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]
    An atom could be an rfc2047 encoded word.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
@ -1632,7 +1633,15 @@ def get_atom(value):
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
-    token, value = get_atext(value)
+    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_atext(value)
    else:
        token, value = get_atext(value)
    atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
@ -1661,12 +1670,22 @@ def get_dot_atom_text(value):
 def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]
    Any place we can have a dot atom, we could instead have an rfc2047 encoded
    word.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
-    token, value = get_dot_atom_text(value)
+    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_dot_atom_text(value)
    else:
        token, value = get_dot_atom_text(value)
    dot_atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
--- a/Lib/test/test_email/test__header_value_parser.py
+++ b/Lib/test/test_email/test__header_value_parser.py
@ -808,9 +808,13 @@ class TestParser(TestParserMixin, TestEmailBase):
        self.assertEqual(atom[2].comments, ['bar'])
    def test_get_atom_atom_ends_at_noncfws(self):
-        atom = self._test_get_x(parser.get_atom,
+        self._test_get_x(parser.get_atom,
            'bob  fred', 'bob  ', 'bob ', [], 'fred')
    def test_get_atom_rfc2047_atom(self):
        # The whole input is a single RFC 2047 encoded word.  get_atom is
        # expected to decode it rather than return the raw '=?...?=' text, so
        # the q-encoded '=20bob' should come back as ' bob'.
        # NOTE(review): the positional arguments are presumably (parser
        # function, input, expected str(), expected value, expected defects,
        # expected remainder) -- confirm against TestParserMixin._test_get_x.
        self._test_get_x(parser.get_atom,
            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
    # get_dot_atom_text
    def test_get_dot_atom_text(self):
@ -885,6 +889,10 @@ class TestParser(TestParserMixin, TestEmailBase):
        with self.assertRaises(errors.HeaderParseError):
            parser.get_dot_atom(' (foo) bar.bang. foo')
    def test_get_dot_atom_rfc2047_atom(self):
        # Same input as the get_atom encoded-word test, but fed to
        # get_dot_atom: an RFC 2047 encoded word where a dot-atom is allowed
        # should be decoded, so the q-encoded '=20bob' should yield ' bob'.
        # NOTE(review): the positional arguments are presumably (parser
        # function, input, expected str(), expected value, expected defects,
        # expected remainder) -- confirm against TestParserMixin._test_get_x.
        self._test_get_x(parser.get_dot_atom,
            '=?utf-8?q?=20bob?=', ' bob', ' bob', [], '')
    # get_word (if this were black box we'd repeat all the qs/atom tests)
    def test_get_word_atom_yields_atom(self):
@ -2156,6 +2164,22 @@ class TestParser(TestParserMixin, TestEmailBase):
        self.assertEqual(address[0].token_type,
                         'mailbox')
    def test_get_address_rfc2047_display_name(self):
        # An address whose display name is an RFC 2047 encoded word
        # ('=C3=89' is the UTF-8 encoding of 'É'): the parsed address should
        # expose the decoded name 'Éric', not the raw encoded text.
        # NOTE(review): the positional arguments to _test_get_x are presumably
        # (parser function, input, expected str(), expected value, expected
        # defects, expected remainder) -- confirm against TestParserMixin.
        address = self._test_get_x(parser.get_address,
            '=?utf-8?q?=C3=89ric?= <foo@example.com>',
            'Éric <foo@example.com>',
            'Éric <foo@example.com>',
            [],
            '')
        self.assertEqual(address.token_type, 'address')
        # Exactly one mailbox, and it appears in both mailbox views.
        self.assertEqual(len(address.mailboxes), 1)
        self.assertEqual(address.mailboxes,
                         address.all_mailboxes)
        # The display name must be the decoded form.
        self.assertEqual(address.mailboxes[0].display_name,
                         'Éric')
        self.assertEqual(address[0].token_type,
                         'mailbox')
    def test_get_address_empty_group(self):
        address = self._test_get_x(parser.get_address,
            'Monty Python:;',
--- a/Lib/test/test_email/test_headerregistry.py
+++ b/Lib/test/test_email/test_headerregistry.py
@ -158,6 +158,10 @@ class TestUnstructuredHeader(TestHeaderBase):
            '=?utf-8?q?=C3=89ric?=',
            'Éric'),
        'rfc2047_quopri_with_regular_text': (
            'The =?utf-8?q?=C3=89ric=2C?= Himself',
            'The Éric, Himself'),
    }
@ -1119,6 +1123,26 @@ class TestAddressHeader(TestHeaderBase):
             'example.com',
             None),
        'rfc2047_atom_is_decoded':
            ('=?utf-8?q?=C3=89ric?= <foo@example.com>',
            [],
            'Éric <foo@example.com>',
            'Éric',
            'foo@example.com',
            'foo',
            'example.com',
            None),
        'rfc2047_atom_in_phrase_is_decoded':
            ('The =?utf-8?q?=C3=89ric=2C?= Himself <foo@example.com>',
            [],
            '"The Éric, Himself" <foo@example.com>',
            'The Éric, Himself',
            'foo@example.com',
            'foo',
            'example.com',
            None),
        }
        # XXX: Need many more examples, and in particular some with names in
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -154,6 +154,9 @@ Core and Builtins
 Library
 -------
 - Issue #18431: The new email header parser now decodes RFC2047 encoded words
  in structured headers.
 - Issue #18044: The new email header parser was mis-parsing encoded words where
  an encoded character immediately followed the '?' that follows the CTE
  character, resulting in a decoding failure.  They are now decoded correctly.