Sync'ing with standalone email package 2.0.1. This adds support for non-us-ascii character sets in headers and bodies. Some API changes (with DeprecationWarnings for the old APIs). Better RFC-compliant implementations of base64 and quoted-printable. Updated test cases. Documentation updates to follow (after I finish writing them ;).
2025-09-26 10:19:53 +00:00 · 2002-04-10 21:01:31 +00:00 · 2002-04-10 21:01:31 +00:00 · 409a4c08b5
commit 409a4c08b5
parent 68e69338ae
20 changed files with 2209 additions and 143 deletions
--- a/Lib/email/Header.py
+++ b/Lib/email/Header.py
@ -0,0 +1,210 @@
+# Copyright (C) 2002 Python Software Foundation
+# Author: che@debian.org (Ben Gertzfield)
+
+"""Header encoding and decoding functionality."""
+
+import re
+import email.quopriMIME
+import email.base64MIME
+from email.Charset import Charset
+
+# Line-folding separators.  Header.encode_chunks() joins its output lines
+# with NLSPACE; CRLFSPACE and CRLF are not referenced by the code in this
+# module.
+CRLFSPACE = '\r\n '
+CRLF = '\r\n'
+NLSPACE = '\n '
+
+# Default maximum header line length; Header() uses it when the caller does
+# not pass maxlinelen, and guess_maxlinelen() derives its result from it.
+MAXLINELEN = 76
+
+# NOTE(review): ENCODE and DECODE are not used anywhere in this module;
+# presumably mode flags kept for external callers -- confirm before removing.
+ENCODE = 1
+DECODE = 2
+
+# Match encoded-word strings in the form =?charset?q?Hello_World?=
+# The pattern has three groups (charset, encoding, encoded), so ecre.split()
+# returns the surrounding plain text interleaved with those three fields for
+# every match; decode_header() relies on that layout.
+ecre = re.compile(r'''
+  =\?                   # literal =?
+  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
+  \?                    # literal ?
+  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
+  \?                    # literal ?
+  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
+  \?=                   # literal ?=
+  ''', re.VERBOSE | re.IGNORECASE)
+
+
+
+# Helpers
+# Alias for quopriMIME's line-appending helper; Header.encode_chunks() calls
+# it as _max_append(lines, string, maxlen, ' ').
+_max_append = email.quopriMIME._max_append
+
+
+
+def decode_header(header):
+    """Decode a message header value without converting charset.
+
+    header is coerced with str() and then scanned, one physical line at a
+    time, for encoded-words of the form =?charset?q?text?= or
+    =?charset?b?text?=.
+
+    Returns a list of (decoded_string, charset) pairs containing each of the
+    decoded parts of the header.  Charset is None for non-encoded parts of the
+    header, otherwise a lower-case string containing the name of the character
+    set specified in the encoded string.
+
+    Consecutive encoded parts that name the same charset are merged into one
+    pair.  Unencoded text found on a line that also holds an encoded-word is
+    stripped of surrounding whitespace and, when the preceding pair is also
+    unencoded, appended to that pair separated by a single space.
+    """
+    # If no encoding, just return the header
+    header = str(header)
+    if not ecre.search(header):
+        return [(header, None)]
+
+    decoded = []
+    for line in header.splitlines():
+        # This line might not have an encoding in it
+        if not ecre.search(line):
+            decoded.append((line, None))
+            continue
+
+        # ecre has three groups, so split() yields
+        # [text, charset, encoding, encoded, text, charset, ...].
+        parts = ecre.split(line)
+        while parts:
+            unenc = parts.pop(0).strip()
+            if unenc:
+                # Should we continue a long line?
+                if decoded and decoded[-1][1] is None:
+                    # Extend the previous unencoded run with *this* text.
+                    # The folding whitespace was stripped above, so put a
+                    # single space back between the two pieces.
+                    decoded[-1] = (decoded[-1][0] + ' ' + unenc, None)
+                else:
+                    decoded.append((unenc, None))
+            if parts:
+                charset, encoding = [s.lower() for s in parts[0:2]]
+                encoded = parts[2]
+                if encoding == 'q':
+                    dec = email.quopriMIME.header_decode(encoded)
+                elif encoding == 'b':
+                    dec = email.base64MIME.decode(encoded)
+                else:
+                    # ecre only admits q or b, so this is purely defensive.
+                    dec = encoded
+
+                if decoded and decoded[-1][1] == charset:
+                    decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
+                else:
+                    decoded.append((dec, charset))
+            # Drop the charset/encoding/encoded triple just handled (a no-op
+            # when the list is already empty).
+            del parts[0:3]
+    return decoded
+
+
+
+class Header:
+    """A MIME header value assembled from (string, Charset) chunks.
+
+    Chunks are collected with append() and rendered into an encoded,
+    line-wrapped header string by encode(), which str() also calls.
+    """
+
+    def __init__(self, s, charset=None, maxlinelen=MAXLINELEN,
+                 header_name=None):
+        """Create a MIME-compliant header that can contain many languages.
+
+        Specify the initial header value in s.  Specify its character set as a
+        Charset object in the charset argument.  If none, a default Charset
+        instance will be used.
+
+        You can later append to the header with append(s, charset) below;
+        charset does not have to be the same as the one initially specified
+        here.  In fact, it's optional, and if not given, defaults to the
+        charset specified in the constructor.
+
+        The maximum line length can either be specified by maxlinelen, or you
+        can pass in the name of the header field (e.g. "Subject") to let this
+        class guess the best line length to use to prevent wrapping.  The
+        default maxlinelen is 76.  When header_name is given it takes
+        precedence over maxlinelen (see guess_maxlinelen()).
+        """
+        if charset is None:
+            charset = Charset()
+        self._charset = charset
+        # BAW: I believe `chunks' and `maxlinelen' should be non-public.
+        self._chunks = []
+        self.append(s, charset)
+        self._maxlinelen = maxlinelen
+        if header_name is not None:
+            self.guess_maxlinelen(header_name)
+
+    def __str__(self):
+        """A synonym for self.encode()."""
+        return self.encode()
+
+    def guess_maxlinelen(self, s=None):
+        """Guess the maximum length to make each header line.
+
+        Given a header name (e.g. "Subject"), set this header's maximum line
+        length to MAXLINELEN minus the length of the name minus 2 (presumably
+        the ": " that follows the name on the first line).  If s is not
+        given, leave the current maximum line length unchanged.
+
+        Returns the maximum line length now in effect.
+        """
+        # BAW: is this semantic necessary?
+        if s is not None:
+            self._maxlinelen = MAXLINELEN - len(s) - 2
+        return self._maxlinelen
+
+    def append(self, s, charset=None):
+        """Append string s with Charset charset to the MIME header.
+
+        charset defaults to the one given in the class constructor.
+        """
+        if charset is None:
+            charset = self._charset
+        self._chunks.append((s, charset))
+
+    def _split(self, s, charset):
+        # Split one chunk into a list of (string, charset) pieces whose
+        # encoded length, as reported by charset.encoded_header_len(), is
+        # below self._maxlinelen.  BAW: this appears to be a private
+        # convenience method.
+        splittable = charset.to_splittable(s)
+        encoded = charset.from_splittable(splittable)
+
+        if charset.encoded_header_len(encoded) < self._maxlinelen:
+            return [(encoded, charset)]
+        # A piece with fewer than two units cannot be halved: halfway would
+        # be 0 and the second half would equal the input, recursing forever.
+        # Emit it as is, even though it exceeds the limit.
+        if len(splittable) < 2:
+            return [(encoded, charset)]
+        # Divide and conquer.  halfway relies on integer (floor) division.
+        halfway = len(splittable) // 2
+        first = charset.from_splittable(splittable[:halfway], 0)
+        last = charset.from_splittable(splittable[halfway:], 0)
+        return self._split(first, charset) + self._split(last, charset)
+
+    def encode(self):
+        """Encode a message header, possibly converting charset and encoding.
+
+        There are many issues involved in converting a given string for use in
+        an email header.  Only certain character sets are readable in most
+        email clients, and as header strings can only contain a subset of
+        7-bit ASCII, care must be taken to properly convert and encode (with
+        Base64 or quoted-printable) header strings.  In addition, there is a
+        75-character length limit on any given encoded header field, so
+        line-wrapping must be performed, even with double-byte character sets.
+
+        Every stored chunk is passed through _split() with its own Charset
+        and the resulting pieces are encoded and joined into the header
+        string that is returned.
+
+        The stored chunks are left untouched, so encode() and str() may be
+        called repeatedly, and append() may be called in between, without
+        re-processing text that has already been converted.
+
+        Errors raised by the Charset object while converting are not caught
+        here.
+        """
+        newchunks = []
+        for s, charset in self._chunks:
+            newchunks += self._split(s, charset)
+        return self._encode_chunks(newchunks)
+
+    def encode_chunks(self):
+        """MIME-encode the stored chunks without splitting them first.
+
+        Each stored (string, charset) pair is encoded with
+        charset.header_encode(string, 0); a pair whose charset is None is
+        emitted without encoding.  The pieces are laid out by _max_append
+        against the maximum line length, using a single space as separator,
+        and the resulting lines are joined with a newline followed by a
+        space, giving output of the form:
+
+        =?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=
+         =?charset2?b?SvxyZ2VuIEL2aW5n?=
+
+        Unlike encode(), no length-based splitting is done here.
+        """
+        return self._encode_chunks(self._chunks)
+
+    def _encode_chunks(self, chunks):
+        # Encode the given list of (string, charset) pairs into one header
+        # string; shared by encode() and encode_chunks() so that neither has
+        # to modify self._chunks.
+        lines = []
+        for header, charset in chunks:
+            if charset is None:
+                _max_append(lines, header, self._maxlinelen, ' ')
+            else:
+                _max_append(lines, charset.header_encode(header, 0),
+                            self._maxlinelen, ' ')
+        return NLSPACE.join(lines)