Implement PEP 393.

This commit is contained in:
Martin v. Löwis 2011-09-28 07:41:54 +02:00
parent 48d49497c5
commit d63a3b8beb
102 changed files with 8153 additions and 5431 deletions

View file

@ -64,16 +64,15 @@ Copyright (c) Corporation for National Research Initiatives.
/* Python 3.x requires unicode */
#define Py_USING_UNICODE
/* FIXME: MvL's new implementation assumes that Py_UNICODE_SIZE is
properly set, but the default rules below doesn't set it. I'll
sort this out some other day -- fredrik@pythonware.com */
#ifndef Py_UNICODE_SIZE
#error Must define Py_UNICODE_SIZE
#ifndef SIZEOF_WCHAR_T
#error Must define SIZEOF_WCHAR_T
#endif
/* Setting Py_UNICODE_WIDE enables UCS-4 storage. Otherwise, Unicode
strings are stored as UCS-2 (with limited support for UTF-16) */
#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
Otherwise, Unicode strings are stored as UCS-2 (with limited support
for UTF-16) */
#if Py_UNICODE_SIZE >= 4
#define Py_UNICODE_WIDE
@ -84,19 +83,14 @@ Copyright (c) Corporation for National Research Initiatives.
/* #define HAVE_WCHAR_H */
/* #define HAVE_USABLE_WCHAR_T */
/* Defaults for various platforms */
#ifndef PY_UNICODE_TYPE
/* Windows has a usable wchar_t type (unless we're using UCS-4) */
# if defined(MS_WIN32) && Py_UNICODE_SIZE == 2
# define HAVE_USABLE_WCHAR_T
# define PY_UNICODE_TYPE wchar_t
# endif
# if defined(Py_UNICODE_WIDE)
# define PY_UNICODE_TYPE Py_UCS4
# endif
/* Py_UNICODE was the native Unicode storage format (code unit) used by
Python and represents a single Unicode element in the Unicode type.
With PEP 393, Py_UNICODE is deprected and replaced with a
typedef to wchar_t. */
#ifndef Py_LIMITED_API
#define PY_UNICODE_TYPE wchar_t
typedef wchar_t Py_UNICODE;
#endif
/* If the compiler provides a wchar_t type we try to support it
@ -109,7 +103,7 @@ Copyright (c) Corporation for National Research Initiatives.
# endif
#endif
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
#if defined(MS_WINDOWS)
# define HAVE_MBCS
#endif
@ -121,201 +115,19 @@ Copyright (c) Corporation for National Research Initiatives.
# include <wchar.h>
#endif
/*
* Use this typedef when you need to represent a UTF-16 surrogate pair
* as single unsigned integer.
*/
/* Py_UCS4 and Py_UCS2 are typdefs for the respecitve
unicode representations. */
#if SIZEOF_INT >= 4
typedef unsigned int Py_UCS4;
#elif SIZEOF_LONG >= 4
typedef unsigned long Py_UCS4;
#endif
/* Py_UNICODE is the native Unicode storage format (code unit) used by
Python and represents a single Unicode element in the Unicode
type. */
#ifndef Py_LIMITED_API
typedef PY_UNICODE_TYPE Py_UNICODE;
#endif
/* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
/* Unicode API names are mangled to assure that UCS-2 and UCS-4 builds
produce different external names and thus cause import errors in
case Python interpreters and extensions with mixed compiled in
Unicode width assumptions are combined. */
#ifndef Py_UNICODE_WIDE
# define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS2_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS2_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS2_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS2_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS2_AsWideChar
# define PyUnicode_AsWideCharString PyUnicodeUCS2_AsWideCharString
# define PyUnicode_ClearFreeList PyUnicodeUCS2_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS2_Compare
# define PyUnicode_CompareWithASCIIString PyUnicodeUCS2_CompareWithASCIIString
# define PyUnicode_Concat PyUnicodeUCS2_Concat
# define PyUnicode_Append PyUnicodeUCS2_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS2_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS2_Contains
# define PyUnicode_Count PyUnicodeUCS2_Count
# define PyUnicode_Decode PyUnicodeUCS2_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS2_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS2_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS2_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS2_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS2_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS2_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS2_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS2_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS2_Find
# define PyUnicode_Format PyUnicodeUCS2_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS2_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS2_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS2_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS2_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS2_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS2_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS2_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS2_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS2_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS2_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS2_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS2_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_RPartition PyUnicodeUCS2_RPartition
# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_RichCompare PyUnicodeUCS2_RichCompare
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS2_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS2_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS2_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS2_Fini
# define _PyUnicode_Init _PyUnicodeUCS2_Init
# define PyUnicode_strdup PyUnicodeUCS2_strdup
#else
# define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString
# define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString
# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject
# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode
# define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject
# define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString
# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode
# define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String
# define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString
# define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String
# define PyUnicode_AsUTF16String PyUnicodeUCS4_AsUTF16String
# define PyUnicode_AsUTF8String PyUnicodeUCS4_AsUTF8String
# define PyUnicode_AsUnicode PyUnicodeUCS4_AsUnicode
# define PyUnicode_AsUnicodeEscapeString PyUnicodeUCS4_AsUnicodeEscapeString
# define PyUnicode_AsWideChar PyUnicodeUCS4_AsWideChar
# define PyUnicode_AsWideCharString PyUnicodeUCS4_AsWideCharString
# define PyUnicode_ClearFreeList PyUnicodeUCS4_ClearFreelist
# define PyUnicode_Compare PyUnicodeUCS4_Compare
# define PyUnicode_CompareWithASCIIString PyUnicodeUCS4_CompareWithASCIIString
# define PyUnicode_Concat PyUnicodeUCS4_Concat
# define PyUnicode_Append PyUnicodeUCS4_Append
# define PyUnicode_AppendAndDel PyUnicodeUCS4_AppendAndDel
# define PyUnicode_Contains PyUnicodeUCS4_Contains
# define PyUnicode_Count PyUnicodeUCS4_Count
# define PyUnicode_Decode PyUnicodeUCS4_Decode
# define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
# define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
# define PyUnicode_DecodeFSDefault PyUnicodeUCS4_DecodeFSDefault
# define PyUnicode_DecodeFSDefaultAndSize PyUnicodeUCS4_DecodeFSDefaultAndSize
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
# define PyUnicode_DecodeUTF32 PyUnicodeUCS4_DecodeUTF32
# define PyUnicode_DecodeUTF32Stateful PyUnicodeUCS4_DecodeUTF32Stateful
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
# define PyUnicode_Encode PyUnicodeUCS4_Encode
# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
# define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
# define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
# define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
# define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
# define PyUnicode_EncodeUTF32 PyUnicodeUCS4_EncodeUTF32
# define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
# define PyUnicode_EncodeUTF8 PyUnicodeUCS4_EncodeUTF8
# define PyUnicode_EncodeUnicodeEscape PyUnicodeUCS4_EncodeUnicodeEscape
# define PyUnicode_Find PyUnicodeUCS4_Find
# define PyUnicode_Format PyUnicodeUCS4_Format
# define PyUnicode_FromEncodedObject PyUnicodeUCS4_FromEncodedObject
# define PyUnicode_FromFormat PyUnicodeUCS4_FromFormat
# define PyUnicode_FromFormatV PyUnicodeUCS4_FromFormatV
# define PyUnicode_FromObject PyUnicodeUCS4_FromObject
# define PyUnicode_FromOrdinal PyUnicodeUCS4_FromOrdinal
# define PyUnicode_FromString PyUnicodeUCS4_FromString
# define PyUnicode_FromStringAndSize PyUnicodeUCS4_FromStringAndSize
# define PyUnicode_FromUnicode PyUnicodeUCS4_FromUnicode
# define PyUnicode_FromWideChar PyUnicodeUCS4_FromWideChar
# define PyUnicode_FSConverter PyUnicodeUCS4_FSConverter
# define PyUnicode_FSDecoder PyUnicodeUCS4_FSDecoder
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_IsIdentifier PyUnicodeUCS4_IsIdentifier
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_RPartition PyUnicodeUCS4_RPartition
# define PyUnicode_RSplit PyUnicodeUCS4_RSplit
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_RichCompare PyUnicodeUCS4_RichCompare
# define PyUnicode_Split PyUnicodeUCS4_Split
# define PyUnicode_Splitlines PyUnicodeUCS4_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS4_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS4_Translate
# define PyUnicode_TranslateCharmap PyUnicodeUCS4_TranslateCharmap
# define _PyUnicode_AsDefaultEncodedString _PyUnicodeUCS4_AsDefaultEncodedString
# define _PyUnicode_Fini _PyUnicodeUCS4_Fini
# define _PyUnicode_Init _PyUnicodeUCS4_Init
# define PyUnicode_strdup PyUnicodeUCS4_strdup
#error "Could not find a proper typedef for Py_UCS4"
#endif
typedef unsigned short Py_UCS2;
typedef unsigned char Py_UCS1;
/* --- Internal Unicode Operations ---------------------------------------- */
/* Since splitting on whitespace is an important use case, and
@ -354,7 +166,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
Py_UNICODE_ISDIGIT(ch) || \
Py_UNICODE_ISNUMERIC(ch))
#define Py_UNICODE_COPY(target, source, length) \
#define Py_UNICODE_COPY(target, source, length) \
Py_MEMCPY((target), (source), (length)*sizeof(Py_UNICODE))
#define Py_UNICODE_FILL(target, value, length) \
@ -375,9 +187,10 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
valid, and the substring must not be empty. */
#define Py_UNICODE_MATCH(string, offset, substring) \
((*((string)->str + (offset)) == *((substring)->str)) && \
((*((string)->str + (offset) + (substring)->length-1) == *((substring)->str + (substring)->length-1))) && \
!memcmp((string)->str + (offset), (substring)->str, (substring)->length*sizeof(Py_UNICODE)))
((*((string)->wstr + (offset)) == *((substring)->wstr)) && \
((*((string)->wstr + (offset) + (substring)->wstr_length-1) == *((substring)->wstr + (substring)->wstr_length-1))) && \
!memcmp((string)->wstr + (offset), (substring)->wstr, (substring)->wstr_length*sizeof(Py_UNICODE)))
#endif /* Py_LIMITED_API */
#ifdef __cplusplus
@ -387,41 +200,303 @@ extern "C" {
/* --- Unicode Type ------------------------------------------------------- */
#ifndef Py_LIMITED_API
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
structure. state.ascii and state.compact are set, and the data
immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct {
PyObject_HEAD
Py_ssize_t length; /* Length of raw Unicode data in buffer */
Py_UNICODE *str; /* Raw Unicode buffer */
Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */
int state; /* != 0 if interned. In this case the two
* references from the dictionary to this object
* are *not* counted in ob_refcnt. */
PyObject *defenc; /* (Default) Encoded version as Python
string, or NULL; this is used for
implementing the buffer protocol */
struct {
/*
SSTATE_NOT_INTERNED (0)
SSTATE_INTERNED_MORTAL (1)
SSTATE_INTERNED_IMMORTAL (2)
If interned != SSTATE_NOT_INTERNED, the two references from the
dictionary to this object are *not* counted in ob_refcnt.
*/
unsigned int interned:2;
/* Character size:
PyUnicode_WCHAR_KIND (0): wchar_t*
PyUnicode_1BYTE_KIND (1): Py_UCS1*
PyUnicode_2BYTE_KIND (2): Py_UCS2*
PyUnicode_4BYTE_KIND (3): Py_UCS4*
*/
unsigned int kind:2;
/* Compact is with respect to the allocation scheme. Compact unicode
objects only require one memory block while non-compact objects use
one block for the PyUnicodeObject struct and another for its data
buffer. */
unsigned int compact:1;
/* Compact objects which are ASCII-only also have the state.compact
flag set, and use the PyASCIIObject struct. */
unsigned int ascii:1;
/* The ready flag indicates whether the object layout is initialized
completely. This means that this is either a compact object, or
the data pointer is filled out. The bit is redundant, and helps
to minimize the test in PyUnicode_IS_READY(). */
unsigned int ready:1;
} state;
wchar_t *wstr; /* wchar_t representation (null-terminated) */
} PyASCIIObject;
/* Non-ASCII strings allocated through PyUnicode_New use the
PyCompactUnicodeOject structure. state.compact is set, and the data
immediately follow the structure. */
typedef struct {
PyASCIIObject _base;
Py_ssize_t utf8_length; /* Number of bytes in utf8, excluding the
* terminating \0. */
char *utf8; /* UTF-8 representation (null-terminated) */
Py_ssize_t wstr_length; /* Number of code points in wstr, possible
* surrogates count as two code points. */
} PyCompactUnicodeObject;
/* Strings allocated through PyUnicode_FromUnicode(NULL, len) use the
PyUnicodeObject structure. The actual string data is initially in the wstr
block, and copied into the data block using PyUnicode_Ready. */
typedef struct {
PyCompactUnicodeObject _base;
union {
void *any;
Py_UCS1 *latin1;
Py_UCS2 *ucs2;
Py_UCS4 *ucs4;
} data; /* Canonical, smallest-form Unicode buffer */
} PyUnicodeObject;
#endif
PyAPI_DATA(PyTypeObject) PyUnicode_Type;
PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define PyUnicode_Check(op) \
PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
#define PyUnicode_CheckExact(op) (Py_TYPE(op) == &PyUnicode_Type)
/* Fast access macros */
#ifndef Py_LIMITED_API
#define PyUnicode_WSTR_LENGTH(op) \
(((PyASCIIObject*)op)->state.ascii ? \
((PyASCIIObject*)op)->length : \
((PyCompactUnicodeObject*)op)->wstr_length)
/* Returns the deprecated Py_UNICODE representation's size in code units
(this includes surrogate pairs as 2 units).
If the Py_UNICODE representation is not available, it will be computed
on request. Use PyUnicode_GET_LENGTH() for the length in code points. */
#define PyUnicode_GET_SIZE(op) \
(assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length))
(assert(PyUnicode_Check(op)), \
(((PyASCIIObject *)(op))->wstr) ? \
PyUnicode_WSTR_LENGTH(op) : \
((void)PyUnicode_AsUnicode((PyObject *)(op)), \
PyUnicode_WSTR_LENGTH(op)))
#define PyUnicode_GET_DATA_SIZE(op) \
(assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE)))
(PyUnicode_GET_SIZE(op) * Py_UNICODE_SIZE)
/* Alias for PyUnicode_AsUnicode(). This will create a wchar_t/Py_UNICODE
representation on demand. Using this macro is very inefficient now,
try to port your code to use the new PyUnicode_*BYTE_DATA() macros or
use PyUnicode_WRITE() and PyUnicode_READ(). */
#define PyUnicode_AS_UNICODE(op) \
(assert(PyUnicode_Check(op)),(((PyUnicodeObject *)(op))->str))
(assert(PyUnicode_Check(op)), \
(((PyASCIIObject *)(op))->wstr) ? (((PyASCIIObject *)(op))->wstr) : \
PyUnicode_AsUnicode((PyObject *)(op)))
#define PyUnicode_AS_DATA(op) \
(assert(PyUnicode_Check(op)),((const char *)((PyUnicodeObject *)(op))->str))
((const char *)(PyUnicode_AS_UNICODE(op)))
/* --- Flexible String Representaion Helper Macros (PEP 393) -------------- */
/* Values for PyUnicodeObject.state: */
/* Interning state. */
#define SSTATE_NOT_INTERNED 0
#define SSTATE_INTERNED_MORTAL 1
#define SSTATE_INTERNED_IMMORTAL 2
#define PyUnicode_IS_COMPACT_ASCII(op) (((PyASCIIObject*)op)->state.ascii)
/* String contains only wstr byte characters. This is only possible
when the string was created with a legacy API and PyUnicode_Ready()
has not been called yet. */
#define PyUnicode_WCHAR_KIND 0
/* Return values of the PyUnicode_KIND() macro: */
#define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2
#define PyUnicode_4BYTE_KIND 3
/* Return the number of bytes the string uses to represent single characters,
this can be 1, 2 or 4. */
#define PyUnicode_CHARACTER_SIZE(op) \
(1 << (PyUnicode_KIND(op) - 1))
/* Return pointers to the canonical representation casted as unsigned char,
Py_UCS2, or Py_UCS4 for direct character access.
No checks are performed, use PyUnicode_CHARACTER_SIZE or
PyUnicode_KIND() before to ensure these will work correctly. */
#define PyUnicode_1BYTE_DATA(op) ((Py_UCS1*)PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) ((Py_UCS2*)PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) ((Py_UCS4*)PyUnicode_DATA(op))
/* Return true if the string is compact or 0 if not.
No type checks or Ready calls are performed. */
#define PyUnicode_IS_COMPACT(op) \
(((PyASCIIObject*)(op))->state.compact)
/* Return one of the PyUnicode_*_KIND values defined above. */
#define PyUnicode_KIND(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
((PyASCIIObject *)(op))->state.kind)
/* Return a void pointer to the raw unicode buffer. */
#define _PyUnicode_COMPACT_DATA(op) \
(PyUnicode_IS_COMPACT_ASCII(op) ? \
((void*)((PyASCIIObject*)(op) + 1)) : \
((void*)((PyCompactUnicodeObject*)(op) + 1)))
#define _PyUnicode_NONCOMPACT_DATA(op) \
(assert(((PyUnicodeObject*)(op))->data.any), \
((((PyUnicodeObject *)(op))->data.any)))
#define PyUnicode_DATA(op) \
(assert(PyUnicode_Check(op)), \
PyUnicode_IS_COMPACT(op) ? _PyUnicode_COMPACT_DATA(op) : \
_PyUnicode_NONCOMPACT_DATA(op))
#define _PyUnicode_UTF8(op) \
(PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)((PyASCIIObject*)(op) + 1)) : \
((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
(PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)(op))->length : \
((PyCompactUnicodeObject*)(op))->utf8_length)
/* Compute (index * char_size) where char_size is 2 ** (kind - 1).
The index is a character index, the result is a size in bytes. */
#define PyUnicode_KIND_SIZE(kind, index) ((index) << ((kind) - 1))
/* In the access macros below, "kind" may be evaluated more than once.
All other macro parameters are evaluated exactly once, so it is safe
to put side effects into them (such as increasing the index). */
/* Write into the canonical representation, this macro does not do any sanity
checks and is intended for usage in loops. The caller should cache the
kind and data pointers optained form other macro calls.
index is the index in the string (starts at 0) and value is the new
code point value which shoule be written to that location. */
#define PyUnicode_WRITE(kind, data, index, value) \
do { \
switch ((kind)) { \
case PyUnicode_1BYTE_KIND: { \
((Py_UCS1 *)(data))[(index)] = (Py_UCS1)(value); \
break; \
} \
case PyUnicode_2BYTE_KIND: { \
((Py_UCS2 *)(data))[(index)] = (Py_UCS2)(value); \
break; \
} \
default: { \
assert((kind) == PyUnicode_4BYTE_KIND); \
((Py_UCS4 *)(data))[(index)] = (Py_UCS4)(value); \
} \
} \
} while (0)
/* Read a code point form the string's canonical representation. No checks
or ready calls are performed. */
#define PyUnicode_READ(kind, data, index) \
((Py_UCS4) \
((kind) == PyUnicode_1BYTE_KIND ? \
((const unsigned char *)(data))[(index)] : \
((kind) == PyUnicode_2BYTE_KIND ? \
((const Py_UCS2 *)(data))[(index)] : \
((const Py_UCS4 *)(data))[(index)] \
) \
))
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
calls PyUnicode_KIND() and might call it twice. For single reads, use
PyUnicode_READ_CHAR, for multiple consecutive reads callers should
cache kind and use PyUnicode_READ instead. */
#define PyUnicode_READ_CHAR(unicode, index) \
((Py_UCS4) \
(PyUnicode_KIND((unicode)) == PyUnicode_1BYTE_KIND ? \
((const unsigned char *)(PyUnicode_DATA((unicode))))[(index)] : \
(PyUnicode_KIND((unicode)) == PyUnicode_2BYTE_KIND ? \
((const Py_UCS2 *)(PyUnicode_DATA((unicode))))[(index)] : \
((const Py_UCS4 *)(PyUnicode_DATA((unicode))))[(index)] \
) \
))
/* Returns the length of the unicode string. The caller has to make sure that
the string has it's canonical representation set before calling
this macro. Call PyUnicode_(FAST_)Ready to ensure that. */
#define PyUnicode_GET_LENGTH(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
((PyASCIIObject *)(op))->length)
/* Fast check to determine whether an object is ready. Equivalent to
PyUnicode_IS_COMPACT(op) || ((PyUnicodeObject*)(op))->data.any) */
#define PyUnicode_IS_READY(op) (((PyASCIIObject*)op)->state.ready)
/* PyUnicode_READY() does less work than PyUnicode_Ready() in the best
case. If the canonical representation is not yet set, it will still call
PyUnicode_Ready().
Returns 0 on success and -1 on errors. */
#define PyUnicode_READY(op) \
(assert(PyUnicode_Check(op)), \
(PyUnicode_IS_READY(op) ? \
0 : _PyUnicode_Ready((PyUnicodeObject *)(op))))
/* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end
are pointers to the source characters which should be of type
"from_type *". to is a pointer of type "to_type *" and points to the
buffer where the result characters are written to. */
#define PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
do { \
const from_type *iter_; to_type *to_; \
for (iter_ = (begin), to_ = (to_type *)(to); \
iter_ < (end); \
++iter_, ++to_) { \
*to_ = (to_type)*iter_; \
} \
} while (0)
/* Return a maximum character value which is suitable for creating another
string based on op. This is always an approximation but more efficient
than interating over the string. */
#define PyUnicode_MAX_CHAR_VALUE(op) \
(assert(PyUnicode_IS_READY(op)), \
(PyUnicode_IS_COMPACT_ASCII(op) ? 0x7f: \
(PyUnicode_KIND(op) == PyUnicode_1BYTE_KIND ? \
(PyUnicode_DATA(op) == (((PyCompactUnicodeObject *)(op))->utf8) ? \
(0x7fU) : (0xffU) \
) : \
(PyUnicode_KIND(op) == PyUnicode_2BYTE_KIND ? \
(0xffffU) : (0x10ffffU) \
))))
#endif
/* --- Constants ---------------------------------------------------------- */
@ -437,6 +512,52 @@ PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
/* --- Plain Py_UNICODE --------------------------------------------------- */
/* With PEP 393, this is the recommended way to allocate a new unicode object.
This function will allocate the object and its buffer in a single memory
block. Objects created using this function are not resizable. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_New(
Py_ssize_t size, /* Number of code points in the new string */
Py_UCS4 maxchar /* maximum code point value in the string */
);
#endif
/* Initializes the canonical string representation from a the deprected
wstr/Py_UNICODE representation. This function is used to convert
unicode objects which were created using the old API to the new flexible
format introduced with PEP 393. The PyUnicode_READY() macro can be
more efficient if the string is already ready. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) _PyUnicode_Ready(
PyUnicodeObject *unicode /* Unicode object */
);
#endif
/* Copy character from one unicode object into another, this function performs
character conversion when nessesary and falls back to memcpy if possible.
Return -1 and raise an exception on error, return 0 on success. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_CopyCharacters(
PyObject *to,
Py_ssize_t to_start,
PyObject *from,
Py_ssize_t from_start,
Py_ssize_t how_many
);
#endif
/* Find the maximum code point and count the number of surrogate pairs so a
correct string length can be computed before converting a string to UCS4.
This function counts single surrogates as a character and not as a pair. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) _PyUnicode_FindMaxCharAndNumSurrogatePairs(
const wchar_t *begin,
const wchar_t *end,
Py_UCS4 *maxchar,
Py_ssize_t *num_surrogates
);
#endif
/* Create a Unicode Object from the Py_UNICODE buffer u of the given
size.
@ -461,13 +582,43 @@ PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
);
/* Similar to PyUnicode_FromUnicode(), but u points to null-terminated
UTF-8 encoded bytes */
UTF-8 encoded bytes. The size is determined with strlen(). */
PyAPI_FUNC(PyObject*) PyUnicode_FromString(
const char *u /* UTF-8 encoded string */
);
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
int kind,
const void *buffer,
Py_ssize_t size);
#endif
PyAPI_FUNC(PyObject*) PyUnicode_Substring(
PyObject *str,
Py_ssize_t start,
Py_ssize_t end);
/* Copy the string into a UCS4 buffer including the null character is copy_null
is set. Return NULL and raise an exception on error. Raise a ValueError if
the buffer is smaller than the string. Return buffer on success.
buflen is the length of the buffer in (Py_UCS4) characters. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
PyObject *unicode,
Py_UCS4* buffer,
Py_ssize_t buflen,
int copy_null);
/* Copy the string into a UCS4 buffer. A new buffer is allocated using
* PyMem_Malloc; if this fails, NULL is returned with a memory error
exception set. */
PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
/* Return a read-only pointer to the Unicode object's internal
Py_UNICODE buffer. */
Py_UNICODE buffer.
If the wchar_t/Py_UNICODE representation is not yet available, this
function will calculate it. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
@ -475,12 +626,47 @@ PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicode(
);
#endif
/* Return a read-only pointer to the Unicode object's internal
Py_UNICODE buffer and save the length at size.
If the wchar_t/Py_UNICODE representation is not yet available, this
function will calculate it. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_UNICODE *) PyUnicode_AsUnicodeAndSize(
PyObject *unicode, /* Unicode object */
Py_ssize_t *size /* location where to save the length */
);
#endif
/* Get the length of the Unicode object. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
PyObject *unicode
);
/* Get the number of Py_UNICODE units in the
string representation. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_GetSize(
PyObject *unicode /* Unicode object */
);
/* Read a character from the string. */
PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
PyObject *unicode,
Py_ssize_t index
);
/* Write a character to the string. The string must have been created through
PyUnicode_New, must not be shared, and must not have been hashed yet. */
PyAPI_FUNC(int) PyUnicode_WriteChar(
PyObject *unicode,
Py_ssize_t index,
Py_UCS4 character
);
#ifndef Py_LIMITED_API
/* Get the maximum ordinal for a Unicode character. */
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
@ -558,8 +744,9 @@ PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
/* Format the object based on the format_spec, as defined in PEP 3101
(Advanced String Formatting). */
PyAPI_FUNC(PyObject *) _PyUnicode_FormatAdvanced(PyObject *obj,
Py_UNICODE *format_spec,
Py_ssize_t format_spec_len);
PyObject *format_spec,
Py_ssize_t start,
Py_ssize_t end);
#endif
PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
@ -572,7 +759,8 @@ PyAPI_FUNC(void) _Py_ReleaseInternedUnicodeStrings(void);
#endif
/* Use only if you know it's a string */
#define PyUnicode_CHECK_INTERNED(op) (((PyUnicodeObject *)(op))->state)
#define PyUnicode_CHECK_INTERNED(op) \
(((PyASCIIObject *)(op))->state.interned)
/* --- wchar_t support for platforms which support it --------------------- */
@ -619,6 +807,8 @@ PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
Py_ssize_t *size /* number of characters of the result */
);
PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
#endif
/* --- Unicode ordinals --------------------------------------------------- */
@ -664,49 +854,42 @@ PyAPI_FUNC(int) PyUnicode_ClearFreeList(void);
/* --- Manage the default encoding ---------------------------------------- */
/* Return a Python string holding the default encoded value of the
Unicode object.
Same as PyUnicode_AsUTF8String() except
the resulting string is cached in the Unicode object for subsequent
usage by this function. The cached version is needed to implement
the character buffer interface and will live (at least) as long as
the Unicode object itself.
The refcount of the string is *not* incremented.
*** Exported for internal use by the interpreter only !!! ***
*/
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject *) _PyUnicode_AsDefaultEncodedString(
PyObject *unicode);
#endif
/* Returns a pointer to the default encoding (UTF-8) of the
Unicode object unicode and the size of the encoded representation
in bytes stored in *size.
In case of an error, no *size is set.
This funcation caches the UTF-8 encoded string in the unicodeobject
and subsequent calls will return the same string. The memory is relased
when the unicodeobject is deallocated.
_PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
support the previous internal function with the same behaviour.
*** This API is for interpreter INTERNAL USE ONLY and will likely
*** be removed or changed in the future.
*** If you need to access the Unicode object as UTF-8 bytes string,
*** please use PyUnicode_AsUTF8String() instead.
*/
#ifndef Py_LIMITED_API
PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
PyAPI_FUNC(char *) PyUnicode_AsUTF8AndSize(
PyObject *unicode,
Py_ssize_t *size);
#define _PyUnicode_AsStringAndSize PyUnicode_AsUTF8AndSize
#endif
/* Returns a pointer to the default encoding (UTF-8) of the
Unicode object unicode.
Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
in the unicodeobject.
_PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
support the previous internal function with the same behaviour.
Use of this API is DEPRECATED since no size information can be
extracted from the returned data.
@ -719,7 +902,8 @@ PyAPI_FUNC(char *) _PyUnicode_AsStringAndSize(
*/
#ifndef Py_LIMITED_API
PyAPI_FUNC(char *) _PyUnicode_AsString(PyObject *unicode);
PyAPI_FUNC(char *) PyUnicode_AsUTF8(PyObject *unicode);
#define _PyUnicode_AsString PyUnicode_AsUTF8
#endif
/* Returns "utf-8". */
@ -846,6 +1030,10 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
);
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
PyObject *unicode,
const char *errors);
PyAPI_FUNC(PyObject*) PyUnicode_EncodeUTF8(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* number of Py_UNICODE chars to encode */
@ -1076,6 +1264,10 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
);
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
PyObject* unicode,
const char* errors);
PyAPI_FUNC(PyObject*) PyUnicode_EncodeLatin1(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
@ -1100,6 +1292,10 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
);
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
PyObject* unicode,
const char* errors);
PyAPI_FUNC(PyObject*) PyUnicode_EncodeASCII(
const Py_UNICODE *data, /* Unicode char buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
@ -1252,6 +1448,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
);
#endif
/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyUnicodeObject
as argument instead of a raw buffer and length. This function additionally
transforms spaces to ASCII because this is what the callers in longobject,
floatobject, and complexobject did anyways. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
PyObject *unicode /* Unicode object */
);
#endif
/* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using
@ -1439,6 +1646,15 @@ PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
int direction /* Find direction: +1 forward, -1 backward */
);
/* Like PyUnicode_Find, but search for single character only. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
PyObject *str,
Py_UCS4 ch,
Py_ssize_t start,
Py_ssize_t end,
int direction
);
/* Count the number of occurrences of substr in str[start:end]. */
PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
@ -1542,13 +1758,15 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGroupingLocale(Py_UNICODE *buff
into the string pointed to by buffer. For the argument descriptions,
see Objects/stringlib/localeutil.h */
#ifndef Py_LIMITED_API
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(Py_UNICODE *buffer,
Py_ssize_t n_buffer,
Py_UNICODE *digits,
Py_ssize_t n_digits,
Py_ssize_t min_width,
const char *grouping,
const char *thousands_sep);
PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
int kind,
void *buffer,
Py_ssize_t n_buffer,
void *digits,
Py_ssize_t n_digits,
Py_ssize_t min_width,
const char *grouping,
const char *thousands_sep);
#endif
/* === Characters Type APIs =============================================== */
@ -1673,6 +1891,43 @@ PyAPI_FUNC(Py_UNICODE*) Py_UNICODE_strrchr(
Py_UNICODE c
);
PyAPI_FUNC(size_t) Py_UCS4_strlen(
const Py_UCS4 *u
);
PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcpy(
Py_UCS4 *s1,
const Py_UCS4 *s2);
PyAPI_FUNC(Py_UCS4*) Py_UCS4_strcat(
Py_UCS4 *s1, const Py_UCS4 *s2);
PyAPI_FUNC(Py_UCS4*) Py_UCS4_strncpy(
Py_UCS4 *s1,
const Py_UCS4 *s2,
size_t n);
PyAPI_FUNC(int) Py_UCS4_strcmp(
const Py_UCS4 *s1,
const Py_UCS4 *s2
);
PyAPI_FUNC(int) Py_UCS4_strncmp(
const Py_UCS4 *s1,
const Py_UCS4 *s2,
size_t n
);
PyAPI_FUNC(Py_UCS4*) Py_UCS4_strchr(
const Py_UCS4 *s,
Py_UCS4 c
);
PyAPI_FUNC(Py_UCS4*) Py_UCS4_strrchr(
const Py_UCS4 *s,
Py_UCS4 c
);
/* Create a copy of a unicode string ending with a nul character. Return NULL
and raise a MemoryError exception on memory allocation failure, otherwise
return a new allocated buffer (use PyMem_Free() to free the buffer). */