mirror of
https://github.com/python/cpython.git
synced 2025-07-19 01:05:26 +00:00
With help from Martin v. Loewis, clarification is added for the
semantics of header chunks using byte and Unicode strings. Specifically, append(): When the given string is a byte string, charset (whether specified explicitly in the argument list or implicitly via the constructor default) is the encoding of the byte string, and a UnicodeError will be raised if the string cannot be decoded with that charset. If s is a Unicode string, then charset is a hint specifying the character set of the characters in the string. In this case, when producing an RFC 2822 compliant header using RFC 2047 rules, the Unicode string will be encoded using the following charsets in order: us-ascii, the charset hint, utf-8. __init__(): Use the global USASCII Charset instance when the charset argument is None. Also, clarification in the docstring. Also, use True/False where appropriate.
This commit is contained in:
parent
884554dfe5
commit
174aa49a88
1 changed files with 60 additions and 28 deletions
|
@ -1,9 +1,11 @@
|
||||||
# Copyright (C) 2002 Python Software Foundation
|
# Copyright (C) 2002 Python Software Foundation
|
||||||
# Author: che@debian.org (Ben Gertzfield)
|
# Author: che@debian.org (Ben Gertzfield), barry@zope.com (Barry Warsaw)
|
||||||
|
|
||||||
"""Header encoding and decoding functionality."""
|
"""Header encoding and decoding functionality."""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
from types import StringType, UnicodeType
|
||||||
|
|
||||||
import email.quopriMIME
|
import email.quopriMIME
|
||||||
import email.base64MIME
|
import email.base64MIME
|
||||||
from email.Charset import Charset
|
from email.Charset import Charset
|
||||||
|
@ -14,6 +16,12 @@ except SyntaxError:
|
||||||
# Python 2.1 spells integer division differently
|
# Python 2.1 spells integer division differently
|
||||||
from email._compat21 import _floordiv
|
from email._compat21 import _floordiv
|
||||||
|
|
||||||
|
try:
|
||||||
|
True, False
|
||||||
|
except NameError:
|
||||||
|
True = 1
|
||||||
|
False = 0
|
||||||
|
|
||||||
CRLFSPACE = '\r\n '
|
CRLFSPACE = '\r\n '
|
||||||
CRLF = '\r\n'
|
CRLF = '\r\n'
|
||||||
NL = '\n'
|
NL = '\n'
|
||||||
|
@ -25,6 +33,9 @@ MAXLINELEN = 76
|
||||||
ENCODE = 1
|
ENCODE = 1
|
||||||
DECODE = 2
|
DECODE = 2
|
||||||
|
|
||||||
|
USASCII = Charset('us-ascii')
|
||||||
|
UTF8 = Charset('utf-8')
|
||||||
|
|
||||||
# Match encoded-word strings in the form =?charset?q?Hello_World?=
|
# Match encoded-word strings in the form =?charset?q?Hello_World?=
|
||||||
ecre = re.compile(r'''
|
ecre = re.compile(r'''
|
||||||
=\? # literal =?
|
=\? # literal =?
|
||||||
|
@ -117,21 +128,19 @@ def make_header(decoded_seq, maxlinelen=None, header_name=None,
|
||||||
class Header:
|
class Header:
|
||||||
def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
|
def __init__(self, s=None, charset=None, maxlinelen=None, header_name=None,
|
||||||
continuation_ws=' '):
|
continuation_ws=' '):
|
||||||
"""Create a MIME-compliant header that can contain many languages.
|
"""Create a MIME-compliant header that can contain many character sets.
|
||||||
|
|
||||||
Specify the initial header value in s. If None, the initial header
|
Optional s is the initial header value. If None, the initial header
|
||||||
value is not set.
|
value is not set. You can later append to the header with .append()
|
||||||
|
method calls. s may be a byte string or a Unicode string, but see the
|
||||||
|
.append() documentation for semantics.
|
||||||
|
|
||||||
Specify both s's character set, and the default character set by
|
Optional charset serves two purposes: it has the same meaning as the
|
||||||
setting the charset argument to a Charset object (not a character set
|
charset argument to the .append() method. It also sets the default
|
||||||
name string!). If None, a us-ascii Charset is used as both s's
|
character set for all subsequent .append() calls that omit the charset
|
||||||
initial charset and as the default character set for subsequent
|
argument. If charset is not provided in the constructor, the us-ascii
|
||||||
.append() calls.
|
charset is used both as s's initial charset and as the default for
|
||||||
|
subsequent .append() calls.
|
||||||
You can later append to the header with append(s, charset) below;
|
|
||||||
charset does not have to be the same as the one initially specified
|
|
||||||
here. In fact, it's optional, and if not given, defaults to the
|
|
||||||
charset specified in the constructor.
|
|
||||||
|
|
||||||
The maximum line length can be specified explicitly via maxlinelen. For
|
The maximum line length can be specified explicitly via maxlinelen. For
|
||||||
splitting the first line to a shorter value (to account for the field
|
splitting the first line to a shorter value (to account for the field
|
||||||
|
@ -143,7 +152,7 @@ class Header:
|
||||||
lines.
|
lines.
|
||||||
"""
|
"""
|
||||||
if charset is None:
|
if charset is None:
|
||||||
charset = Charset()
|
charset = USASCII
|
||||||
self._charset = charset
|
self._charset = charset
|
||||||
self._continuation_ws = continuation_ws
|
self._continuation_ws = continuation_ws
|
||||||
cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
|
cws_expanded_len = len(continuation_ws.replace('\t', SPACE8))
|
||||||
|
@ -186,20 +195,43 @@ class Header:
|
||||||
return not self == other
|
return not self == other
|
||||||
|
|
||||||
def append(self, s, charset=None):
|
def append(self, s, charset=None):
|
||||||
"""Append string s with Charset charset to the MIME header.
|
"""Append a string to the MIME header.
|
||||||
|
|
||||||
If charset is given, it should be a Charset instance, or the name of a
|
Optional charset, if given, should be a Charset instance or the name
|
||||||
character set (which will be converted to a Charset instance). A
|
of a character set (which will be converted to a Charset instance). A
|
||||||
value of None (the default) means charset is the one given in the
|
value of None (the default) means that the charset given in the
|
||||||
class constructor.
|
constructor is used.
|
||||||
|
|
||||||
|
s may be a byte string or a Unicode string. If it is a byte string
|
||||||
|
(i.e. isinstance(s, StringType) is true), then charset is the encoding
|
||||||
|
of that byte string, and a UnicodeError will be raised if the string
|
||||||
|
cannot be decoded with that charset. If s is a Unicode string, then
|
||||||
|
charset is a hint specifying the character set of the characters in
|
||||||
|
the string. In this case, when producing an RFC 2822 compliant header
|
||||||
|
using RFC 2047 rules, the Unicode string will be encoded using the
|
||||||
|
following charsets in order: us-ascii, the charset hint, utf-8.
|
||||||
"""
|
"""
|
||||||
if charset is None:
|
if charset is None:
|
||||||
charset = self._charset
|
charset = self._charset
|
||||||
elif not isinstance(charset, Charset):
|
elif not isinstance(charset, Charset):
|
||||||
charset = Charset(charset)
|
charset = Charset(charset)
|
||||||
|
# Normalize and check the string
|
||||||
|
if isinstance(s, StringType):
|
||||||
|
# Possibly raise UnicodeError if it can't be decoded
|
||||||
|
unicode(s, charset.get_output_charset())
|
||||||
|
elif isinstance(s, UnicodeType):
|
||||||
|
# Convert Unicode to byte string for later concatenation
|
||||||
|
for charset in USASCII, charset, UTF8:
|
||||||
|
try:
|
||||||
|
s = s.encode(charset.get_output_charset())
|
||||||
|
break
|
||||||
|
except UnicodeError:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
assert False, 'Could not encode to utf-8'
|
||||||
self._chunks.append((s, charset))
|
self._chunks.append((s, charset))
|
||||||
|
|
||||||
def _split(self, s, charset, firstline=0):
|
def _split(self, s, charset, firstline=False):
|
||||||
# Split up a header safely for use with encode_chunks. BAW: this
|
# Split up a header safely for use with encode_chunks. BAW: this
|
||||||
# appears to be a private convenience method.
|
# appears to be a private convenience method.
|
||||||
splittable = charset.to_splittable(s)
|
splittable = charset.to_splittable(s)
|
||||||
|
@ -227,13 +259,13 @@ class Header:
|
||||||
# We can split on _maxlinelen boundaries because we know that the
|
# We can split on _maxlinelen boundaries because we know that the
|
||||||
# encoding won't change the size of the string
|
# encoding won't change the size of the string
|
||||||
splitpnt = self._maxlinelen
|
splitpnt = self._maxlinelen
|
||||||
first = charset.from_splittable(splittable[:splitpnt], 0)
|
first = charset.from_splittable(splittable[:splitpnt], False)
|
||||||
last = charset.from_splittable(splittable[splitpnt:], 0)
|
last = charset.from_splittable(splittable[splitpnt:], False)
|
||||||
else:
|
else:
|
||||||
# Divide and conquer.
|
# Divide and conquer.
|
||||||
halfway = _floordiv(len(splittable), 2)
|
halfway = _floordiv(len(splittable), 2)
|
||||||
first = charset.from_splittable(splittable[:halfway], 0)
|
first = charset.from_splittable(splittable[:halfway], False)
|
||||||
last = charset.from_splittable(splittable[halfway:], 0)
|
last = charset.from_splittable(splittable[halfway:], False)
|
||||||
# Do the split
|
# Do the split
|
||||||
return self._split(first, charset, firstline) + \
|
return self._split(first, charset, firstline) + \
|
||||||
self._split(last, charset)
|
self._split(last, charset)
|
||||||
|
@ -248,7 +280,7 @@ class Header:
|
||||||
line = lines.pop(0)
|
line = lines.pop(0)
|
||||||
if firstline:
|
if firstline:
|
||||||
maxlinelen = self._firstlinelen
|
maxlinelen = self._firstlinelen
|
||||||
firstline = 0
|
firstline = False
|
||||||
else:
|
else:
|
||||||
#line = line.lstrip()
|
#line = line.lstrip()
|
||||||
maxlinelen = self._maxlinelen
|
maxlinelen = self._maxlinelen
|
||||||
|
@ -338,7 +370,7 @@ class Header:
|
||||||
# There's no encoding for this chunk's charsets
|
# There's no encoding for this chunk's charsets
|
||||||
_max_append(chunks, header, self._maxlinelen)
|
_max_append(chunks, header, self._maxlinelen)
|
||||||
else:
|
else:
|
||||||
_max_append(chunks, charset.header_encode(header, 0),
|
_max_append(chunks, charset.header_encode(header),
|
||||||
self._maxlinelen, ' ')
|
self._maxlinelen, ' ')
|
||||||
joiner = NL + self._continuation_ws
|
joiner = NL + self._continuation_ws
|
||||||
return joiner.join(chunks)
|
return joiner.join(chunks)
|
||||||
|
@ -363,6 +395,6 @@ class Header:
|
||||||
"""
|
"""
|
||||||
newchunks = []
|
newchunks = []
|
||||||
for s, charset in self._chunks:
|
for s, charset in self._chunks:
|
||||||
newchunks += self._split(s, charset, 1)
|
newchunks += self._split(s, charset, True)
|
||||||
self._chunks = newchunks
|
self._chunks = newchunks
|
||||||
return self._encode_chunks()
|
return self._encode_chunks()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue