mirror of
https://github.com/python/cpython.git
synced 2025-07-24 03:35:53 +00:00
Marc-Andre Lemburg <mal@lemburg.com>:
API documentation for Unicode support from C.
This commit is contained in:
parent
8b3ce9e099
commit
a4cd2611f4
1 changed files with 712 additions and 0 deletions
712
Doc/api/api.tex
712
Doc/api/api.tex
|
@ -1899,6 +1899,718 @@ interned string object with the same value.
|
|||
\end{cfuncdesc}
|
||||
|
||||
|
||||
\subsection{Unicode Objects \label{unicodeObjects}}
|
||||
\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
|
||||
|
||||
%--- Unicode Type -------------------------------------------------------
|
||||
|
||||
These are the basic Unicode object types used for the Unicode
|
||||
implementation in Python:
|
||||
|
||||
\begin{ctypedesc}{Py_UNICODE}
|
||||
This type represents a 16-bit unsigned storage type which is used by
|
||||
Python internally as basis for holding Unicode ordinals. On platforms
|
||||
where \ctype{wchar_t} is available and also has 16 bits,
|
||||
\ctype{Py_UNICODE} is a typedef alias for \ctype{wchar_t} to enhance
|
||||
native platform compatibility. On all other platforms,
|
||||
\ctype{Py_UNICODE} is a typedef alias for \ctype{unsigned short}.
|
||||
\end{ctypedesc}
|
||||
|
||||
\begin{ctypedesc}{PyUnicodeObject}
|
||||
This subtype of \ctype{PyObject} represents a Python Unicode object.
|
||||
\end{ctypedesc}
|
||||
|
||||
\begin{cvardesc}{PyTypeObject}{PyUnicode_Type}
|
||||
This instance of \ctype{PyTypeObject} represents the Python Unicode type.
|
||||
\end{cvardesc}
|
||||
|
||||
%--- These are really C macros... is there a macrodesc TeX macro ?
|
||||
|
||||
The following APIs are really C macros and can be used to do fast
|
||||
checks and to access internal read-only data of Unicode objects:
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Check}{PyObject *o}
|
||||
Returns true if the object \var{o} is a Unicode object.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_GET_SIZE}{PyObject *o}
|
||||
Returns the size of the object. \var{o} has to be a
|
||||
\ctype{PyUnicodeObject} (not checked).
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_GET_DATA_SIZE}{PyObject *o}
|
||||
Returns the size of the object's internal buffer in bytes. \var{o} has to be
|
||||
a \ctype{PyUnicodeObject} (not checked).
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE*}{PyUnicode_AS_UNICODE}{PyObject *o}
|
||||
Returns a pointer to the internal \ctype{Py_UNICODE} buffer of the object. \var{o}
|
||||
has to be a \ctype{PyUnicodeObject} (not checked).
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{const char*}{PyUnicode_AS_DATA}{PyObject *o}
|
||||
Returns a \ctype{const char *} pointer to the internal buffer of the object.
|
||||
\var{o} has to be a \ctype{PyUnicodeObject} (not checked).
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Unicode character properties ---------------------------------------
|
||||
|
||||
Unicode provides many different character properties. The most often
|
||||
needed ones are available through these macros which are mapped to C
|
||||
functions depending on the Python configuration.
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISSPACE}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a whitespace character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISLOWER}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a lowercase character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISUPPER}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is an uppercase character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISTITLE}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a titlecase character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISLINEBREAK}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a linebreak character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISDECIMAL}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a decimal character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISDIGIT}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a digit character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_ISNUMERIC}{Py_UNICODE ch}
|
||||
Returns 1/0 depending on whether \var{ch} is a numeric character.
|
||||
\end{cfuncdesc}
|
||||
|
||||
These APIs can be used for fast direct character conversions:
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to lower case.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOUPPER}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to upper case.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOTITLE}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to title case.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_TODECIMAL}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to a decimal positive integer.
|
||||
Returns -1 in case this is not possible. Does not raise exceptions.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{Py_UNICODE_TODIGIT}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to a single digit integer.
|
||||
Returns -1 in case this is not possible. Does not raise exceptions.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{double}{Py_UNICODE_TONUMERIC}{Py_UNICODE ch}
|
||||
Returns the character \var{ch} converted to a (positive) double.
|
||||
Returns -1.0 in case this is not possible. Does not raise exceptions.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Plain Py_UNICODE ---------------------------------------------------
|
||||
|
||||
To create Unicode objects and access their basic sequence properties,
|
||||
use these APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromUnicode}{const Py_UNICODE *u,
|
||||
int size}
|
||||
|
||||
Create a Unicode Object from the \ctype{Py_UNICODE} buffer \var{u} of the
|
||||
given size. \var{u} may be \NULL{} which causes the contents to be
|
||||
undefined. It is the user's responsibility to fill in the needed data.
|
||||
The buffer is copied into the new object.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{Py_UNICODE *}{PyUnicode_AsUnicode}{PyObject *unicode}
|
||||
Return a read-only pointer to the Unicode object's internal
|
||||
\ctype{Py_UNICODE} buffer.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_GetSize}{PyObject *unicode}
|
||||
Return the length of the Unicode object.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromObject}{PyObject *obj}
|
||||
|
||||
Coerce \var{obj} to a Unicode object and return a reference with
|
||||
incremented refcount.
|
||||
|
||||
Coercion is done in the following way:
|
||||
\begin{enumerate}
|
||||
\item Unicode objects are passed back as-is with incremented
|
||||
refcount.
|
||||
|
||||
\item String and other char buffer compatible objects are decoded
|
||||
under the assumptions that they contain UTF-8 data. Decoding
|
||||
is done in ``strict'' mode.
|
||||
|
||||
\item All other objects raise an exception.
|
||||
\end{enumerate}
|
||||
The API returns \NULL{} in case of an error. The caller is responsible
|
||||
for decref'ing the returned objects.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- wchar_t support for platforms which support it ---------------------
|
||||
|
||||
If the platform supports \ctype{wchar_t} and provides a header file
|
||||
\file{wchar.h}, Python can interface directly to this type using the
|
||||
following functions. Support is optimized if Python's own
|
||||
\ctype{Py_UNICODE} type is identical to the system's \ctype{wchar_t}.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromWideChar}{const wchar_t *w,
|
||||
int size}
|
||||
Create a Unicode Object from the \ctype{wchar_t} buffer \var{w} of the
|
||||
given size. Returns \NULL{} on failure.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_AsWideChar}{PyUnicodeObject *unicode,
|
||||
wchar_t *w,
|
||||
int size}
|
||||
|
||||
Copies the Unicode Object contents into the \ctype{wchar_t} buffer
|
||||
\var{w}. At most \var{size} \ctype{wchar_t} characters are copied.
|
||||
Returns the number of \ctype{wchar_t} characters copied or -1 in case
|
||||
of an error.
|
||||
\end{cfuncdesc}
|
||||
|
||||
|
||||
\subsubsection{Builtin Codecs \label{builtinCodecs}}
|
||||
|
||||
Python provides a set of builtin codecs which are written in C
|
||||
for speed. All of these codecs are directly usable via the
|
||||
following functions.
|
||||
|
||||
Many of the following APIs take two arguments \var{encoding} and
|
||||
\var{errors}. These parameters have the same semantics
|
||||
as the ones of the builtin \function{unicode()} Unicode object constructor.
|
||||
|
||||
Setting \var{encoding} to \NULL{} causes the default encoding to be used which
|
||||
is UTF-8.
|
||||
|
||||
Error handling is set by \var{errors} which may also be set to \NULL{} meaning
|
||||
to use the default handling defined for the codec. Default error
|
||||
handling for all builtin codecs is ``strict'' (\exception{ValueError} is raised).
|
||||
|
||||
The codecs all use a similar interface. Only deviations from the
|
||||
following generic ones are documented for simplicity.
|
||||
|
||||
% --- Generic Codecs -----------------------------------------------------
|
||||
|
||||
These are the generic codec APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Decode}{const char *s,
|
||||
int size,
|
||||
const char *encoding,
|
||||
const char *errors}
|
||||
|
||||
Create a Unicode object by decoding \var{size} bytes of the encoded
|
||||
string \var{s}. \var{encoding} and \var{errors} have the same meaning
|
||||
as the parameters of the same name in the \function{unicode()} builtin
|
||||
function. The codec to be used is looked up using the Python codec
|
||||
registry. Returns \NULL{} in case an exception was raised by the
|
||||
codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Encode}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *encoding,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size and returns a
|
||||
Python string object. \var{encoding} and \var{errors} have the same
|
||||
meaning as the parameters of the same name in the Unicode .encode()
|
||||
method. The codec to be used is looked up using the Python codec
|
||||
registry. Returns \NULL{} in case an exception was raised by the
|
||||
codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsEncodedString}{PyObject *unicode,
|
||||
const char *encoding,
|
||||
const char *errors}
|
||||
|
||||
Encodes a Unicode object and returns the result as Python string
|
||||
object. \var{encoding} and \var{errors} have the same meaning as the
|
||||
parameters of the same name in the Unicode .encode() method. The codec
|
||||
to be used is looked up using the Python codec registry. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- UTF-8 Codecs -------------------------------------------------------
|
||||
|
||||
These are the UTF-8 codec APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF8}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the UTF-8
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF8}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using UTF-8
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF8String}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using UTF-8 and returns the result as Python
|
||||
string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- UTF-16 Codecs ------------------------------------------------------
|
||||
|
||||
These are the UTF-16 codec APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF16}{const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *byteorder}
|
||||
|
||||
Decodes \var{size} bytes from a UTF-16 encoded buffer string and
|
||||
returns the corresponding Unicode object.
|
||||
|
||||
\var{errors} (if non-\NULL{}) defines the error handling. It defaults
|
||||
to ``strict''.
|
||||
|
||||
If \var{byteorder} is non-\NULL{}, the decoder starts decoding using
|
||||
the given byte order:
|
||||
|
||||
\begin{verbatim}
|
||||
*byteorder == -1: little endian
|
||||
*byteorder == 0: native order
|
||||
*byteorder == 1: big endian
|
||||
\end{verbatim}
|
||||
|
||||
and then switches according to all byte order marks (BOM) it finds in
|
||||
the input data. BOM marks are not copied into the resulting Unicode
|
||||
string. After completion, \var{*byteorder} is set to the current byte
|
||||
order at the end of input data.
|
||||
|
||||
If \var{byteorder} is \NULL{}, the codec starts in native order mode.
|
||||
|
||||
Returns \NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF16}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int byteorder}
|
||||
|
||||
Returns a Python string object holding the UTF-16 encoded value of the
|
||||
Unicode data in \var{s}.
|
||||
|
||||
If \var{byteorder} is not 0, output is written according to the
|
||||
following byte order:
|
||||
|
||||
\begin{verbatim}
|
||||
byteorder == -1: little endian
|
||||
byteorder == 0: native byte order (writes a BOM mark)
|
||||
byteorder == 1: big endian
|
||||
\end{verbatim}
|
||||
|
||||
If \var{byteorder} is 0, the output string will always start with the
|
||||
Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
|
||||
prepended.
|
||||
|
||||
Note that \ctype{Py_UNICODE} data is being interpreted as UTF-16
|
||||
reduced to UCS-2. This trick makes it possible to add full UTF-16
|
||||
capabilities at a later point without compromising the APIs.
|
||||
|
||||
Returns \NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUTF16String}{PyObject *unicode}
|
||||
|
||||
Returns a Python string using the UTF-16 encoding in native byte
|
||||
order. The string always starts with a BOM mark. Error handling is
|
||||
``strict''. Returns \NULL{} in case an exception was raised by the
|
||||
codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Unicode-Escape Codecs ----------------------------------------------
|
||||
|
||||
These are the ``Unicode Escape'' codec APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUnicodeEscape}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the Unicode-Escape
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUnicodeEscape}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using Unicode-Escape
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsUnicodeEscapeString}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using Unicode-Escape and returns the result
|
||||
as Python string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Raw-Unicode-Escape Codecs ------------------------------------------
|
||||
|
||||
These are the ``Raw Unicode Escape'' codec APIs:
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeRawUnicodeEscape}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the Raw-Unicode-Escape
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeRawUnicodeEscape}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using Raw-Unicode-Escape
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsRawUnicodeEscapeString}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using Raw-Unicode-Escape and returns the result
|
||||
as Python string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Latin-1 Codecs -----------------------------------------------------
|
||||
|
||||
These are the Latin-1 codec APIs:
|
||||
|
||||
Latin-1 corresponds to the first 256 Unicode ordinals and only these
|
||||
are accepted by the codecs during encoding.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeLatin1}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the Latin-1
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeLatin1}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using Latin-1
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsLatin1String}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using Latin-1 and returns the result as
|
||||
Python string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- ASCII Codecs -------------------------------------------------------
|
||||
|
||||
These are the ASCII codec APIs:
|
||||
|
||||
Only 7-bit ASCII data is accepted. All other codes generate errors.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeASCII}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the ASCII
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeASCII}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using ASCII
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsASCIIString}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using ASCII and returns the result as Python
|
||||
string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Character Map Codecs -----------------------------------------------
|
||||
|
||||
These are the mapping codec APIs:
|
||||
|
||||
This codec is special in that it can be used to implement many
|
||||
different codecs (and this is in fact what was done to obtain most of
|
||||
the standard codecs included in the \module{encodings} package). The
|
||||
codec uses mappings to encode and decode characters.
|
||||
|
||||
Decoding mappings must map single string characters to single Unicode
|
||||
characters, integers (which are then interpreted as Unicode ordinals)
|
||||
or None (meaning ``undefined mapping'' and causing an error).
|
||||
|
||||
Encoding mappings must map single Unicode characters to single string
|
||||
characters, integers (which are then interpreted as Latin-1 ordinals)
|
||||
or None (meaning ``undefined mapping'' and causing an error).
|
||||
|
||||
The mapping objects provided must only support the \method{__getitem__()} mapping
|
||||
interface.
|
||||
|
||||
If a character lookup fails with a LookupError, the character is
|
||||
copied as-is meaning that its ordinal value will be interpreted as
|
||||
Unicode or Latin-1 ordinal, respectively. Because of this, mappings only need
|
||||
to contain those mappings which map characters to different code
|
||||
points.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeCharmap}{const char *s,
|
||||
int size,
|
||||
PyObject *mapping,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the encoded
|
||||
string \var{s} using the given \var{mapping} object. Returns \NULL{}
|
||||
in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeCharmap}{const Py_UNICODE *s,
|
||||
int size,
|
||||
PyObject *mapping,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using the
|
||||
given \var{mapping} object and returns a Python string object.
|
||||
Returns \NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsCharmapString}{PyObject *unicode,
|
||||
PyObject *mapping}
|
||||
|
||||
Encodes a Unicode object using the given \var{mapping} object and
|
||||
returns the result as Python string object. Error handling is
|
||||
``strict''. Returns \NULL{} in case an exception was raised by the
|
||||
codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
The following codec API is special in that it maps Unicode to Unicode.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_TranslateCharmap}{const Py_UNICODE *s,
|
||||
int size,
|
||||
PyObject *table,
|
||||
const char *errors}
|
||||
|
||||
Translates a \ctype{Py_UNICODE} buffer of the given length by applying
|
||||
a character mapping \var{table} to it and returns the resulting
|
||||
Unicode object.
|
||||
|
||||
The mapping \var{table} must map Unicode ordinal integers to Unicode
|
||||
ordinal integers or None (causing deletion of the character).
|
||||
|
||||
Mapping tables must only provide the \method{__getitem__()} interface,
|
||||
e.g. dictionaries or sequences. Unmapped character ordinals (ones
|
||||
which cause a LookupError) are left untouched and are copied as-is.
|
||||
|
||||
Returns \NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- MBCS codecs for Windows --------------------------------------------
|
||||
|
||||
These are the MBCS codec APIs. They are currently only available
|
||||
on Windows and use the Win32 MBCS converters to implement the
|
||||
conversions.
|
||||
|
||||
Note that MBCS (or DBCS) is a class of encodings, not just one. The
|
||||
target encoding is defined by the user settings on the machine running
|
||||
the codec.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCS}{const char *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Creates a Unicode object by decoding \var{size} bytes of the MBCS
|
||||
encoded string \var{s}. Returns \NULL{} in case an exception was
|
||||
raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
||||
Encodes the \ctype{Py_UNICODE} buffer of the given size using MBCS
|
||||
and returns a Python string object. Returns \NULL{} in case an
|
||||
exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_AsMBCSString}{PyObject *unicode}
|
||||
|
||||
Encodes a Unicode object using MBCS and returns the result as Python
|
||||
string object. Error handling is ``strict''. Returns
|
||||
\NULL{} in case an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
% --- Methods & Slots ----------------------------------------------------
|
||||
|
||||
\subsubsection{Methods and Slot Functions \label{unicodeMethodsAndSlots}}
|
||||
|
||||
The following APIs are capable of handling Unicode objects and strings
|
||||
on input (we refer to them as strings in the descriptions) and return
|
||||
Unicode objects or integers as appropriate.
|
||||
|
||||
They all return \NULL{} or -1 in case an exception occurs.
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Concat}{PyObject *left,
|
||||
PyObject *right}
|
||||
|
||||
Concatenate two strings giving a new Unicode string.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Split}{PyObject *s,
|
||||
PyObject *sep,
|
||||
int maxsplit}
|
||||
|
||||
Split a string giving a list of Unicode strings.
|
||||
|
||||
If \var{sep} is \NULL{}, splitting will be done at all whitespace
|
||||
substrings. Otherwise, splits occur at the given separator.
|
||||
|
||||
At most \var{maxsplit} splits will be done. If negative, no limit is set.
|
||||
|
||||
Separators are not included in the resulting list.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Splitlines}{PyObject *s,
|
||||
int maxsplit}
|
||||
|
||||
Split a string at line breaks, giving a list of Unicode strings.
|
||||
|
||||
CRLF is considered to be one line break. Line breaks are not
|
||||
included in the resulting list.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Translate}{PyObject *str,
|
||||
PyObject *table,
|
||||
const char *errors}
|
||||
|
||||
Translate a string by applying a character mapping table to it and
|
||||
return the resulting Unicode object.
|
||||
|
||||
The mapping table must map Unicode ordinal integers to Unicode ordinal
|
||||
integers or None (causing deletion of the character).
|
||||
|
||||
Mapping tables must only provide the \method{__getitem__()} interface,
|
||||
e.g. dictionaries or sequences. Unmapped character ordinals (ones
|
||||
which cause a LookupError) are left untouched and are copied as-is.
|
||||
|
||||
\var{errors} has the usual meaning for codecs. It may be \NULL{}
|
||||
which indicates to use the default error handling.
|
||||
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Join}{PyObject *separator,
|
||||
PyObject *seq}
|
||||
|
||||
Join a sequence of strings using the given separator and return
|
||||
the resulting Unicode string.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Tailmatch}{PyObject *str,
|
||||
PyObject *substr,
|
||||
int start,
|
||||
int end,
|
||||
int direction}
|
||||
|
||||
Return 1 if \var{substr} matches \var{str}[\var{start}:\var{end}] at
|
||||
the given tail end (\var{direction} == -1 means to do a prefix match,
|
||||
\var{direction} == 1 a suffix match), 0 otherwise.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Find}{PyObject *str,
|
||||
PyObject *substr,
|
||||
int start,
|
||||
int end,
|
||||
int direction}
|
||||
|
||||
Return the first position of \var{substr} in
|
||||
\var{str}[\var{start}:\var{end}] using the given \var{direction}
|
||||
(\var{direction} == 1 means to do a forward search,
|
||||
\var{direction} == -1 a backward search), or -1 if no match is found.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Count}{PyObject *str,
|
||||
PyObject *substr,
|
||||
int start,
|
||||
int end}
|
||||
|
||||
Count the number of occurrences of \var{substr} in
|
||||
\var{str}[\var{start}:\var{end}].
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Replace}{PyObject *str,
|
||||
PyObject *substr,
|
||||
PyObject *replstr,
|
||||
int maxcount}
|
||||
|
||||
Replace at most \var{maxcount} occurrences of \var{substr} in
|
||||
\var{str} with \var{replstr} and return the resulting Unicode object.
|
||||
\var{maxcount} == -1 means: replace all occurrences.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Compare}{PyObject *left,
|
||||
PyObject *right}
|
||||
|
||||
Compare two strings and return -1, 0, 1 for less than, equal,
|
||||
greater than, respectively.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_Format}{PyObject *format,
|
||||
PyObject *args}
|
||||
Returns a new string object from \var{format} and \var{args}. Analogous
|
||||
to \code{\var{format} \% \var{args}}. The \var{args} argument must be
|
||||
a tuple.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{int}{PyUnicode_Contains}{PyObject *container,
|
||||
PyObject *element}
|
||||
|
||||
Checks whether \var{element} is contained in \var{container} and
|
||||
returns 1/0 accordingly.
|
||||
|
||||
\var{element} has to coerce to a one-element Unicode string. -1 is
|
||||
returned in case of an error.
|
||||
\end{cfuncdesc}
|
||||
|
||||
|
||||
\subsection{Buffer Objects \label{bufferObjects}}
|
||||
\sectionauthor{Greg Stein}{gstein@lyra.org}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue