mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 07:48:51 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			1458 lines
		
	
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			1458 lines
		
	
	
	
		
			43 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* ------------------------------------------------------------------------
 | |
| 
 | |
|    unicodedata -- Provides access to the Unicode database.
 | |
| 
 | |
|    Data was extracted from the UnicodeData.txt file.
 | |
|    The current version number is reported in the unidata_version constant.
 | |
| 
 | |
|    Written by Marc-Andre Lemburg (mal@lemburg.com).
 | |
|    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 | |
|    Modified by Martin v. Löwis (martin@v.loewis.de)
 | |
| 
 | |
|    Copyright (c) Corporation for National Research Initiatives.
 | |
| 
 | |
|    ------------------------------------------------------------------------ */
 | |
| 
 | |
| #define PY_SSIZE_T_CLEAN
 | |
| 
 | |
| #include "Python.h"
 | |
| #include "ucnhash.h"
 | |
| #include "structmember.h"
 | |
| 
 | |
/* Identifiers for the four normalization form names.  They are referenced
   below as PyId_NFC, PyId_NFD, PyId_NFKC and PyId_NFKD when the `form`
   argument is compared with _PyUnicode_EqualToASCIIId(). */
_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
_Py_IDENTIFIER(NFKD);
 | |
| 
 | |
| /*[clinic input]
 | |
| module unicodedata
 | |
| class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
 | |
| [clinic start generated code]*/
 | |
| /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
 | |
| 
 | |
| /* character properties */
 | |
| 
 | |
/* Per-character property record, as returned by _getrecord_ex(). */
typedef struct {
    /* Index into _PyUnicode_CategoryNames. */
    const unsigned char category;
    /* Canonical combining class value, 0 - 255. */
    const unsigned char combining;
    /* Index into _PyUnicode_BidirectionalNames. */
    const unsigned char bidirectional;
    /* True if the character is mirrored in bidir mode. */
    const unsigned char mirrored;
    /* Index into _PyUnicode_EastAsianWidthNames. */
    const unsigned char east_asian_width;
    /* Packed quick-check bits; see is_normalized(). */
    const unsigned char normalization_quick_check;
} _PyUnicode_DatabaseRecord;
 | |
| 
 | |
/* Difference between the current database and a previous version for one
   code point.

   The order of the fields should be the same as in merge_old_version.

   For the unsigned char fields the accessors in this file treat 0xFF as
   "not changed, use the current database".  A category_changed of 0 is
   treated as "code point unassigned in the previous version". */
typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
 | |
| 
 | |
| /* data file generated by Tools/unicode/makeunicodedata.py */
 | |
| #include "unicodedata_db.h"
 | |
| 
 | |
| static const _PyUnicode_DatabaseRecord*
 | |
| _getrecord_ex(Py_UCS4 code)
 | |
| {
 | |
|     int index;
 | |
|     if (code >= 0x110000)
 | |
|         index = 0;
 | |
|     else {
 | |
|         index = index1[(code>>SHIFT)];
 | |
|         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
 | |
|     }
 | |
| 
 | |
|     return &_PyUnicode_Database_Records[index];
 | |
| }
 | |
| 
 | |
| /* ------------- Previous-version API ------------------------------------- */
 | |
| typedef struct previous_version {
 | |
|     PyObject_HEAD
 | |
|     const char *name;
 | |
|     const change_record* (*getrecord)(Py_UCS4);
 | |
|     Py_UCS4 (*normalization)(Py_UCS4);
 | |
| } PreviousDBVersion;
 | |
| 
 | |
| #include "clinic/unicodedata.c.h"
 | |
| 
 | |
/* Look up the change record of code point `v` in the previous-version
   database object `self`.  `self` must point to a PreviousDBVersion; the
   argument is parenthesized so that an expression argument is cast as a
   whole. */
#define get_old_record(self, v) \
    ((((PreviousDBVersion *)(self))->getrecord)(v))
 | |
| 
 | |
| static PyMemberDef DB_members[] = {
 | |
|         {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
 | |
|         {NULL}
 | |
| };
 | |
| 
 | |
| /* forward declaration */
 | |
| static PyTypeObject UCD_Type;
 | |
| #define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
 | |
| 
 | |
| static PyObject*
 | |
| new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
 | |
|                      Py_UCS4 (*normalization)(Py_UCS4))
 | |
| {
 | |
|         PreviousDBVersion *self;
 | |
|         self = PyObject_New(PreviousDBVersion, &UCD_Type);
 | |
|         if (self == NULL)
 | |
|                 return NULL;
 | |
|         self->name = name;
 | |
|         self->getrecord = getrecord;
 | |
|         self->normalization = normalization;
 | |
|         return (PyObject*)self;
 | |
| }
 | |
| 
 | |
| 
 | |
| /* --- Module API --------------------------------------------------------- */
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.decimal
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     default: object=NULL
 | |
|     /
 | |
| 
 | |
| Converts a Unicode character into its equivalent decimal value.
 | |
| 
 | |
| Returns the decimal value assigned to the character chr as integer.
 | |
| If no such value is defined, default is returned, or, if not given,
 | |
| ValueError is raised.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_decimal_impl(PyObject *self, int chr,
 | |
|                              PyObject *default_value)
 | |
| /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
 | |
| {
 | |
|     int have_old = 0;
 | |
|     long rc;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
| 
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0) {
 | |
|             /* unassigned */
 | |
|             have_old = 1;
 | |
|             rc = -1;
 | |
|         }
 | |
|         else if (old->decimal_changed != 0xFF) {
 | |
|             have_old = 1;
 | |
|             rc = old->decimal_changed;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     if (!have_old)
 | |
|         rc = Py_UNICODE_TODECIMAL(c);
 | |
|     if (rc < 0) {
 | |
|         if (default_value == NULL) {
 | |
|             PyErr_SetString(PyExc_ValueError,
 | |
|                             "not a decimal");
 | |
|             return NULL;
 | |
|         }
 | |
|         else {
 | |
|             Py_INCREF(default_value);
 | |
|             return default_value;
 | |
|         }
 | |
|     }
 | |
|     return PyLong_FromLong(rc);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.digit
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     default: object=NULL
 | |
|     /
 | |
| 
 | |
| Converts a Unicode character into its equivalent digit value.
 | |
| 
 | |
| Returns the digit value assigned to the character chr as integer.
 | |
| If no such value is defined, default is returned, or, if not given,
 | |
| ValueError is raised.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
 | |
| /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
 | |
| {
 | |
|     long rc;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     rc = Py_UNICODE_TODIGIT(c);
 | |
|     if (rc < 0) {
 | |
|         if (default_value == NULL) {
 | |
|             PyErr_SetString(PyExc_ValueError, "not a digit");
 | |
|             return NULL;
 | |
|         }
 | |
|         else {
 | |
|             Py_INCREF(default_value);
 | |
|             return default_value;
 | |
|         }
 | |
|     }
 | |
|     return PyLong_FromLong(rc);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.numeric
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     default: object=NULL
 | |
|     /
 | |
| 
 | |
| Converts a Unicode character into its equivalent numeric value.
 | |
| 
 | |
| Returns the numeric value assigned to the character chr as float.
 | |
| If no such value is defined, default is returned, or, if not given,
 | |
| ValueError is raised.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_numeric_impl(PyObject *self, int chr,
 | |
|                              PyObject *default_value)
 | |
| /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
 | |
| {
 | |
|     int have_old = 0;
 | |
|     double rc;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
| 
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0) {
 | |
|             /* unassigned */
 | |
|             have_old = 1;
 | |
|             rc = -1.0;
 | |
|         }
 | |
|         else if (old->decimal_changed != 0xFF) {
 | |
|             have_old = 1;
 | |
|             rc = old->decimal_changed;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     if (!have_old)
 | |
|         rc = Py_UNICODE_TONUMERIC(c);
 | |
|     if (rc == -1.0) {
 | |
|         if (default_value == NULL) {
 | |
|             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 | |
|             return NULL;
 | |
|         }
 | |
|         else {
 | |
|             Py_INCREF(default_value);
 | |
|             return default_value;
 | |
|         }
 | |
|     }
 | |
|     return PyFloat_FromDouble(rc);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.category
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the general category assigned to the character chr as string.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_category_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
 | |
| {
 | |
|     int index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     index = (int) _getrecord_ex(c)->category;
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed != 0xFF)
 | |
|             index = old->category_changed;
 | |
|     }
 | |
|     return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.bidirectional
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the bidirectional class assigned to the character chr as string.
 | |
| 
 | |
| If no such value is defined, an empty string is returned.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
 | |
| {
 | |
|     int index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     index = (int) _getrecord_ex(c)->bidirectional;
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0)
 | |
|             index = 0; /* unassigned */
 | |
|         else if (old->bidir_changed != 0xFF)
 | |
|             index = old->bidir_changed;
 | |
|     }
 | |
|     return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.combining -> int
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the canonical combining class assigned to the character chr as integer.
 | |
| 
 | |
| Returns 0 if no combining class is defined.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static int
 | |
| unicodedata_UCD_combining_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
 | |
| {
 | |
|     int index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     index = (int) _getrecord_ex(c)->combining;
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0)
 | |
|             index = 0; /* unassigned */
 | |
|     }
 | |
|     return index;
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.mirrored -> int
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the mirrored property assigned to the character chr as integer.
 | |
| 
 | |
| Returns 1 if the character has been identified as a "mirrored"
 | |
| character in bidirectional text, 0 otherwise.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static int
 | |
| unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
 | |
| {
 | |
|     int index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     index = (int) _getrecord_ex(c)->mirrored;
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0)
 | |
|             index = 0; /* unassigned */
 | |
|         else if (old->mirrored_changed != 0xFF)
 | |
|             index = old->mirrored_changed;
 | |
|     }
 | |
|     return index;
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.east_asian_width
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the east asian width assigned to the character chr as string.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
 | |
| {
 | |
|     int index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
|     index = (int) _getrecord_ex(c)->east_asian_width;
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0)
 | |
|             index = 0; /* unassigned */
 | |
|         else if (old->east_asian_width_changed != 0xFF)
 | |
|             index = old->east_asian_width_changed;
 | |
|     }
 | |
|     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.decomposition
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     /
 | |
| 
 | |
| Returns the character decomposition mapping assigned to the character chr as string.
 | |
| 
 | |
| An empty string is returned in case no such mapping is defined.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
 | |
| /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
 | |
| {
 | |
|     char decomp[256];
 | |
|     int code, index, count;
 | |
|     size_t i;
 | |
|     unsigned int prefix_index;
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
| 
 | |
|     code = (int)c;
 | |
| 
 | |
|     if (self && UCD_Check(self)) {
 | |
|         const change_record *old = get_old_record(self, c);
 | |
|         if (old->category_changed == 0)
 | |
|             return PyUnicode_FromString(""); /* unassigned */
 | |
|     }
 | |
| 
 | |
|     if (code < 0 || code >= 0x110000)
 | |
|         index = 0;
 | |
|     else {
 | |
|         index = decomp_index1[(code>>DECOMP_SHIFT)];
 | |
|         index = decomp_index2[(index<<DECOMP_SHIFT)+
 | |
|                              (code&((1<<DECOMP_SHIFT)-1))];
 | |
|     }
 | |
| 
 | |
|     /* high byte is number of hex bytes (usually one or two), low byte
 | |
|        is prefix code (from*/
 | |
|     count = decomp_data[index] >> 8;
 | |
| 
 | |
|     /* XXX: could allocate the PyString up front instead
 | |
|        (strlen(prefix) + 5 * count + 1 bytes) */
 | |
| 
 | |
|     /* Based on how index is calculated above and decomp_data is generated
 | |
|        from Tools/unicode/makeunicodedata.py, it should not be possible
 | |
|        to overflow decomp_prefix. */
 | |
|     prefix_index = decomp_data[index] & 255;
 | |
|     assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
 | |
| 
 | |
|     /* copy prefix */
 | |
|     i = strlen(decomp_prefix[prefix_index]);
 | |
|     memcpy(decomp, decomp_prefix[prefix_index], i);
 | |
| 
 | |
|     while (count-- > 0) {
 | |
|         if (i)
 | |
|             decomp[i++] = ' ';
 | |
|         assert(i < sizeof(decomp));
 | |
|         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 | |
|                       decomp_data[++index]);
 | |
|         i += strlen(decomp + i);
 | |
|     }
 | |
|     return PyUnicode_FromStringAndSize(decomp, i);
 | |
| }
 | |
| 
 | |
| static void
 | |
| get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 | |
| {
 | |
|     if (code >= 0x110000) {
 | |
|         *index = 0;
 | |
|     } else if (self && UCD_Check(self) &&
 | |
|                get_old_record(self, code)->category_changed==0) {
 | |
|         /* unassigned in old version */
 | |
|         *index = 0;
 | |
|     }
 | |
|     else {
 | |
|         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 | |
|         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 | |
|                                (code&((1<<DECOMP_SHIFT)-1))];
 | |
|     }
 | |
| 
 | |
|     /* high byte is number of hex bytes (usually one or two), low byte
 | |
|        is prefix code (from*/
 | |
|     *count = decomp_data[*index] >> 8;
 | |
|     *prefix = decomp_data[*index] & 255;
 | |
| 
 | |
|     (*index)++;
 | |
| }
 | |
| 
 | |
/* Constants for the arithmetic Hangul (de)composition in nfd_nfkd() and
   nfc_nfkc(). */
#define SBase   0xAC00              /* first syllable handled arithmetically */
#define LBase   0x1100              /* first modern leading consonant */
#define VBase   0x1161              /* first modern vowel */
#define TBase   0x11A7              /* one below the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total number of syllables */
 | |
| 
 | |
| static PyObject*
 | |
| nfd_nfkd(PyObject *self, PyObject *input, int k)
 | |
| {
 | |
|     PyObject *result;
 | |
|     Py_UCS4 *output;
 | |
|     Py_ssize_t i, o, osize;
 | |
|     int kind;
 | |
|     void *data;
 | |
|     /* Longest decomposition in Unicode 3.2: U+FDFA */
 | |
|     Py_UCS4 stack[20];
 | |
|     Py_ssize_t space, isize;
 | |
|     int index, prefix, count, stackptr;
 | |
|     unsigned char prev, cur;
 | |
| 
 | |
|     stackptr = 0;
 | |
|     isize = PyUnicode_GET_LENGTH(input);
 | |
|     space = isize;
 | |
|     /* Overallocate at most 10 characters. */
 | |
|     if (space > 10) {
 | |
|         if (space <= PY_SSIZE_T_MAX - 10)
 | |
|             space += 10;
 | |
|     }
 | |
|     else {
 | |
|         space *= 2;
 | |
|     }
 | |
|     osize = space;
 | |
|     output = PyMem_NEW(Py_UCS4, space);
 | |
|     if (!output) {
 | |
|         PyErr_NoMemory();
 | |
|         return NULL;
 | |
|     }
 | |
|     i = o = 0;
 | |
|     kind = PyUnicode_KIND(input);
 | |
|     data = PyUnicode_DATA(input);
 | |
| 
 | |
|     while (i < isize) {
 | |
|         stack[stackptr++] = PyUnicode_READ(kind, data, i++);
 | |
|         while(stackptr) {
 | |
|             Py_UCS4 code = stack[--stackptr];
 | |
|             /* Hangul Decomposition adds three characters in
 | |
|                a single step, so we need at least that much room. */
 | |
|             if (space < 3) {
 | |
|                 Py_UCS4 *new_output;
 | |
|                 osize += 10;
 | |
|                 space += 10;
 | |
|                 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
 | |
|                 if (new_output == NULL) {
 | |
|                     PyMem_Free(output);
 | |
|                     PyErr_NoMemory();
 | |
|                     return NULL;
 | |
|                 }
 | |
|                 output = new_output;
 | |
|             }
 | |
|             /* Hangul Decomposition. */
 | |
|             if (SBase <= code && code < (SBase+SCount)) {
 | |
|                 int SIndex = code - SBase;
 | |
|                 int L = LBase + SIndex / NCount;
 | |
|                 int V = VBase + (SIndex % NCount) / TCount;
 | |
|                 int T = TBase + SIndex % TCount;
 | |
|                 output[o++] = L;
 | |
|                 output[o++] = V;
 | |
|                 space -= 2;
 | |
|                 if (T != TBase) {
 | |
|                     output[o++] = T;
 | |
|                     space --;
 | |
|                 }
 | |
|                 continue;
 | |
|             }
 | |
|             /* normalization changes */
 | |
|             if (self && UCD_Check(self)) {
 | |
|                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
 | |
|                 if (value != 0) {
 | |
|                     stack[stackptr++] = value;
 | |
|                     continue;
 | |
|                 }
 | |
|             }
 | |
| 
 | |
|             /* Other decompositions. */
 | |
|             get_decomp_record(self, code, &index, &prefix, &count);
 | |
| 
 | |
|             /* Copy character if it is not decomposable, or has a
 | |
|                compatibility decomposition, but we do NFD. */
 | |
|             if (!count || (prefix && !k)) {
 | |
|                 output[o++] = code;
 | |
|                 space--;
 | |
|                 continue;
 | |
|             }
 | |
|             /* Copy decomposition onto the stack, in reverse
 | |
|                order.  */
 | |
|             while(count) {
 | |
|                 code = decomp_data[index + (--count)];
 | |
|                 stack[stackptr++] = code;
 | |
|             }
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
 | |
|                                        output, o);
 | |
|     PyMem_Free(output);
 | |
|     if (!result)
 | |
|         return NULL;
 | |
|     /* result is guaranteed to be ready, as it is compact. */
 | |
|     kind = PyUnicode_KIND(result);
 | |
|     data = PyUnicode_DATA(result);
 | |
| 
 | |
|     /* Sort canonically. */
 | |
|     i = 0;
 | |
|     prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
 | |
|     for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
 | |
|         cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
 | |
|         if (prev == 0 || cur == 0 || prev <= cur) {
 | |
|             prev = cur;
 | |
|             continue;
 | |
|         }
 | |
|         /* Non-canonical order. Need to switch *i with previous. */
 | |
|         o = i - 1;
 | |
|         while (1) {
 | |
|             Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
 | |
|             PyUnicode_WRITE(kind, data, o+1,
 | |
|                             PyUnicode_READ(kind, data, o));
 | |
|             PyUnicode_WRITE(kind, data, o, tmp);
 | |
|             o--;
 | |
|             if (o < 0)
 | |
|                 break;
 | |
|             prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
 | |
|             if (prev == 0 || prev <= cur)
 | |
|                 break;
 | |
|         }
 | |
|         prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
 | |
|     }
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| static int
 | |
| find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
 | |
| {
 | |
|     unsigned int index;
 | |
|     for (index = 0; nfc[index].start; index++) {
 | |
|         unsigned int start = nfc[index].start;
 | |
|         if (code < start)
 | |
|             return -1;
 | |
|         if (code <= start + nfc[index].count) {
 | |
|             unsigned int delta = code - start;
 | |
|             return nfc[index].index + delta;
 | |
|         }
 | |
|     }
 | |
|     return -1;
 | |
| }
 | |
| 
 | |
| static PyObject*
 | |
| nfc_nfkc(PyObject *self, PyObject *input, int k)
 | |
| {
 | |
|     PyObject *result;
 | |
|     int kind;
 | |
|     void *data;
 | |
|     Py_UCS4 *output;
 | |
|     Py_ssize_t i, i1, o, len;
 | |
|     int f,l,index,index1,comb;
 | |
|     Py_UCS4 code;
 | |
|     Py_ssize_t skipped[20];
 | |
|     int cskipped = 0;
 | |
| 
 | |
|     result = nfd_nfkd(self, input, k);
 | |
|     if (!result)
 | |
|         return NULL;
 | |
|     /* result will be "ready". */
 | |
|     kind = PyUnicode_KIND(result);
 | |
|     data = PyUnicode_DATA(result);
 | |
|     len = PyUnicode_GET_LENGTH(result);
 | |
| 
 | |
|     /* We allocate a buffer for the output.
 | |
|        If we find that we made no changes, we still return
 | |
|        the NFD result. */
 | |
|     output = PyMem_NEW(Py_UCS4, len);
 | |
|     if (!output) {
 | |
|         PyErr_NoMemory();
 | |
|         Py_DECREF(result);
 | |
|         return 0;
 | |
|     }
 | |
|     i = o = 0;
 | |
| 
 | |
|   again:
 | |
|     while (i < len) {
 | |
|       for (index = 0; index < cskipped; index++) {
 | |
|           if (skipped[index] == i) {
 | |
|               /* *i character is skipped.
 | |
|                  Remove from list. */
 | |
|               skipped[index] = skipped[cskipped-1];
 | |
|               cskipped--;
 | |
|               i++;
 | |
|               goto again; /* continue while */
 | |
|           }
 | |
|       }
 | |
|       /* Hangul Composition. We don't need to check for <LV,T>
 | |
|          pairs, since we always have decomposed data. */
 | |
|       code = PyUnicode_READ(kind, data, i);
 | |
|       if (LBase <= code && code < (LBase+LCount) &&
 | |
|           i + 1 < len &&
 | |
|           VBase <= PyUnicode_READ(kind, data, i+1) &&
 | |
|           PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
 | |
|           /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
 | |
|              and V character is a modern vowel (0x1161 ~ 0x1175). */
 | |
|           int LIndex, VIndex;
 | |
|           LIndex = code - LBase;
 | |
|           VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
 | |
|           code = SBase + (LIndex*VCount+VIndex)*TCount;
 | |
|           i+=2;
 | |
|           if (i < len &&
 | |
|               TBase < PyUnicode_READ(kind, data, i) &&
 | |
|               PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
 | |
|               /* check T character is a modern trailing consonant
 | |
|                  (0x11A8 ~ 0x11C2). */
 | |
|               code += PyUnicode_READ(kind, data, i)-TBase;
 | |
|               i++;
 | |
|           }
 | |
|           output[o++] = code;
 | |
|           continue;
 | |
|       }
 | |
| 
 | |
|       /* code is still input[i] here */
 | |
|       f = find_nfc_index(self, nfc_first, code);
 | |
|       if (f == -1) {
 | |
|           output[o++] = code;
 | |
|           i++;
 | |
|           continue;
 | |
|       }
 | |
|       /* Find next unblocked character. */
 | |
|       i1 = i+1;
 | |
|       comb = 0;
 | |
|       /* output base character for now; might be updated later. */
 | |
|       output[o] = PyUnicode_READ(kind, data, i);
 | |
|       while (i1 < len) {
 | |
|           Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
 | |
|           int comb1 = _getrecord_ex(code1)->combining;
 | |
|           if (comb) {
 | |
|               if (comb1 == 0)
 | |
|                   break;
 | |
|               if (comb >= comb1) {
 | |
|                   /* Character is blocked. */
 | |
|                   i1++;
 | |
|                   continue;
 | |
|               }
 | |
|           }
 | |
|           l = find_nfc_index(self, nfc_last, code1);
 | |
|           /* i1 cannot be combined with i. If i1
 | |
|              is a starter, we don't need to look further.
 | |
|              Otherwise, record the combining class. */
 | |
|           if (l == -1) {
 | |
|             not_combinable:
 | |
|               if (comb1 == 0)
 | |
|                   break;
 | |
|               comb = comb1;
 | |
|               i1++;
 | |
|               continue;
 | |
|           }
 | |
|           index = f*TOTAL_LAST + l;
 | |
|           index1 = comp_index[index >> COMP_SHIFT];
 | |
|           code = comp_data[(index1<<COMP_SHIFT)+
 | |
|                            (index&((1<<COMP_SHIFT)-1))];
 | |
|           if (code == 0)
 | |
|               goto not_combinable;
 | |
| 
 | |
|           /* Replace the original character. */
 | |
|           output[o] = code;
 | |
|           /* Mark the second character unused. */
 | |
|           assert(cskipped < 20);
 | |
|           skipped[cskipped++] = i1;
 | |
|           i1++;
 | |
|           f = find_nfc_index(self, nfc_first, output[o]);
 | |
|           if (f == -1)
 | |
|               break;
 | |
|       }
 | |
|       /* Output character was already written.
 | |
|          Just advance the indices. */
 | |
|       o++; i++;
 | |
|     }
 | |
|     if (o == len) {
 | |
|         /* No changes. Return original string. */
 | |
|         PyMem_Free(output);
 | |
|         return result;
 | |
|     }
 | |
|     Py_DECREF(result);
 | |
|     result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
 | |
|                                        output, o);
 | |
|     PyMem_Free(output);
 | |
|     return result;
 | |
| }
 | |
| 
 | |
/* Verdict of the normalization quick check, see is_normalized(). */
typedef enum {
    YES = 0,
    NO = 1,
    MAYBE = 2
} NormalMode;
 | |
| 
 | |
| /* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
 | |
| static NormalMode
 | |
| is_normalized(PyObject *self, PyObject *input, int nfc, int k)
 | |
| {
 | |
|     Py_ssize_t i, len;
 | |
|     int kind;
 | |
|     void *data;
 | |
|     unsigned char prev_combining = 0, quickcheck_mask;
 | |
| 
 | |
|     /* An older version of the database is requested, quickchecks must be
 | |
|        disabled. */
 | |
|     if (self && UCD_Check(self))
 | |
|         return NO;
 | |
| 
 | |
|     /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
 | |
|        as described in http://unicode.org/reports/tr15/#Annex8. */
 | |
|     quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
 | |
| 
 | |
|     i = 0;
 | |
|     kind = PyUnicode_KIND(input);
 | |
|     data = PyUnicode_DATA(input);
 | |
|     len = PyUnicode_GET_LENGTH(input);
 | |
|     while (i < len) {
 | |
|         Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
 | |
|         const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
 | |
|         unsigned char combining = record->combining;
 | |
|         unsigned char quickcheck = record->normalization_quick_check;
 | |
| 
 | |
|         if (quickcheck & quickcheck_mask)
 | |
|             return MAYBE; /* this string might need normalization */
 | |
|         if (combining && prev_combining > combining)
 | |
|             return NO; /* non-canonical sort order, not normalized */
 | |
|         prev_combining = combining;
 | |
|     }
 | |
|     return YES; /* certainly normalized */
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.is_normalized
 | |
| 
 | |
|     self: self
 | |
|     form: unicode
 | |
|     unistr as input: unicode
 | |
|     /
 | |
| 
 | |
| Return whether the Unicode string unistr is in the normal form 'form'.
 | |
| 
 | |
| Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
 | |
|                                    PyObject *input)
 | |
| /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
 | |
| {
 | |
|     if (PyUnicode_READY(input) == -1) {
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     if (PyUnicode_GET_LENGTH(input) == 0) {
 | |
|         /* special case empty input strings. */
 | |
|         Py_RETURN_TRUE;
 | |
|     }
 | |
| 
 | |
|     PyObject *result;
 | |
|     int nfc = 0;
 | |
|     int k = 0;
 | |
|     NormalMode m;
 | |
| 
 | |
|     PyObject *cmp;
 | |
|     int match = 0;
 | |
| 
 | |
|     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
 | |
|         nfc = 1;
 | |
|     }
 | |
|     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
 | |
|         nfc = 1;
 | |
|         k = 1;
 | |
|     }
 | |
|     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
 | |
|         /* matches default values for `nfc` and `k` */
 | |
|     }
 | |
|     else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
 | |
|         k = 1;
 | |
|     }
 | |
|     else {
 | |
|         PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     m = is_normalized(self, input, nfc, k);
 | |
| 
 | |
|     if (m == MAYBE) {
 | |
|         cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
 | |
|         if (cmp == NULL) {
 | |
|             return NULL;
 | |
|         }
 | |
|         match = PyUnicode_Compare(input, cmp);
 | |
|         Py_DECREF(cmp);
 | |
|         result = (match == 0) ? Py_True : Py_False;
 | |
|     }
 | |
|     else {
 | |
|         result = (m == YES) ? Py_True : Py_False;
 | |
|     }
 | |
| 
 | |
|     Py_INCREF(result);
 | |
|     return result;
 | |
| }
 | |
| 
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.normalize
 | |
| 
 | |
|     self: self
 | |
|     form: unicode
 | |
|     unistr as input: unicode
 | |
|     /
 | |
| 
 | |
| Return the normal form 'form' for the Unicode string unistr.
 | |
| 
 | |
| Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
 | |
|                                PyObject *input)
 | |
| /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
 | |
| {
 | |
|     if (PyUnicode_GET_LENGTH(input) == 0) {
 | |
|         /* Special case empty input strings, since resizing
 | |
|            them  later would cause internal errors. */
 | |
|         Py_INCREF(input);
 | |
|         return input;
 | |
|     }
 | |
| 
 | |
|     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
 | |
|         if (is_normalized(self, input, 1, 0) == YES) {
 | |
|             Py_INCREF(input);
 | |
|             return input;
 | |
|         }
 | |
|         return nfc_nfkc(self, input, 0);
 | |
|     }
 | |
|     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
 | |
|         if (is_normalized(self, input, 1, 1) == YES) {
 | |
|             Py_INCREF(input);
 | |
|             return input;
 | |
|         }
 | |
|         return nfc_nfkc(self, input, 1);
 | |
|     }
 | |
|     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
 | |
|         if (is_normalized(self, input, 0, 0) == YES) {
 | |
|             Py_INCREF(input);
 | |
|             return input;
 | |
|         }
 | |
|         return nfd_nfkd(self, input, 0);
 | |
|     }
 | |
|     if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
 | |
|         if (is_normalized(self, input, 0, 1) == YES) {
 | |
|             Py_INCREF(input);
 | |
|             return input;
 | |
|         }
 | |
|         return nfd_nfkd(self, input, 1);
 | |
|     }
 | |
|     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 | |
|     return NULL;
 | |
| }
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* unicode character name tables */
 | |
| 
 | |
| /* data file generated by Tools/unicode/makeunicodedata.py */
 | |
| #include "unicodename_db.h"
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* database code (cut and pasted from the unidb package) */
 | |
| 
 | |
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased, folded in with multiplier `scale`, and any
   bits that spill into the top byte of the low 32 bits are XOR-ed back
   into the low byte so the running value stays within 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long acc = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long spill;

        acc = (acc * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        spill = acc & 0xff000000;
        if (spill != 0) {
            acc = (acc ^ ((spill >> 24) & 0xff)) & 0x00ffffff;
        }
        pos++;
    }
    return acc;
}
 | |
| 
 | |
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   _getucname() indexes column 0 with L, column 1 with V and column 2
   with T.  NULL marks rows a column does not have; find_syllable() is
   only ever asked to scan the populated rows of a column. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
 | |
| 
 | |
| /* These ranges need to match makeunicodedata.py:cjk_ranges. */
 | |
| static int
 | |
| is_unified_ideograph(Py_UCS4 code)
 | |
| {
 | |
|     return
 | |
|         (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
 | |
|         (0x4E00 <= code && code <= 0x9FEF)   || /* CJK Ideograph */
 | |
|         (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
 | |
|         (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
 | |
|         (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
 | |
|         (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
 | |
|         (0x2CEB0 <= code && code <= 0x2EBEF);   /* CJK Ideograph Extension F */
 | |
| }
 | |
| 
 | |
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.
 *
 * Both ranges are half-open: [start, end).  The argument is parenthesized
 * so that an expression argument (e.g. a conditional) expands correctly;
 * note that it is still evaluated twice, so it must have no side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
 | |
| 
 | |
| static int
 | |
| _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
 | |
|            int with_alias_and_seq)
 | |
| {
 | |
|     /* Find the name associated with the given code point.
 | |
|      * If with_alias_and_seq is 1, check for names in the Private Use Area 15
 | |
|      * that we are using for aliases and named sequences. */
 | |
|     int offset;
 | |
|     int i;
 | |
|     int word;
 | |
|     unsigned char* w;
 | |
| 
 | |
|     if (code >= 0x110000)
 | |
|         return 0;
 | |
| 
 | |
|     /* XXX should we just skip all the code points in the PUAs here? */
 | |
|     if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
 | |
|         return 0;
 | |
| 
 | |
|     if (self && UCD_Check(self)) {
 | |
|         /* in 3.2.0 there are no aliases and named sequences */
 | |
|         const change_record *old;
 | |
|         if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
 | |
|             return 0;
 | |
|         old = get_old_record(self, code);
 | |
|         if (old->category_changed == 0) {
 | |
|             /* unassigned */
 | |
|             return 0;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     if (SBase <= code && code < SBase+SCount) {
 | |
|         /* Hangul syllable. */
 | |
|         int SIndex = code - SBase;
 | |
|         int L = SIndex / NCount;
 | |
|         int V = (SIndex % NCount) / TCount;
 | |
|         int T = SIndex % TCount;
 | |
| 
 | |
|         if (buflen < 27)
 | |
|             /* Worst case: HANGUL SYLLABLE <10chars>. */
 | |
|             return 0;
 | |
|         strcpy(buffer, "HANGUL SYLLABLE ");
 | |
|         buffer += 16;
 | |
|         strcpy(buffer, hangul_syllables[L][0]);
 | |
|         buffer += strlen(hangul_syllables[L][0]);
 | |
|         strcpy(buffer, hangul_syllables[V][1]);
 | |
|         buffer += strlen(hangul_syllables[V][1]);
 | |
|         strcpy(buffer, hangul_syllables[T][2]);
 | |
|         buffer += strlen(hangul_syllables[T][2]);
 | |
|         *buffer = '\0';
 | |
|         return 1;
 | |
|     }
 | |
| 
 | |
|     if (is_unified_ideograph(code)) {
 | |
|         if (buflen < 28)
 | |
|             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 | |
|             return 0;
 | |
|         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 | |
|         return 1;
 | |
|     }
 | |
| 
 | |
|     /* get offset into phrasebook */
 | |
|     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 | |
|     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 | |
|                                (code&((1<<phrasebook_shift)-1))];
 | |
|     if (!offset)
 | |
|         return 0;
 | |
| 
 | |
|     i = 0;
 | |
| 
 | |
|     for (;;) {
 | |
|         /* get word index */
 | |
|         word = phrasebook[offset] - phrasebook_short;
 | |
|         if (word >= 0) {
 | |
|             word = (word << 8) + phrasebook[offset+1];
 | |
|             offset += 2;
 | |
|         } else
 | |
|             word = phrasebook[offset++];
 | |
|         if (i) {
 | |
|             if (i > buflen)
 | |
|                 return 0; /* buffer overflow */
 | |
|             buffer[i++] = ' ';
 | |
|         }
 | |
|         /* copy word string from lexicon.  the last character in the
 | |
|            word has bit 7 set.  the last word in a string ends with
 | |
|            0x80 */
 | |
|         w = lexicon + lexicon_offset[word];
 | |
|         while (*w < 128) {
 | |
|             if (i >= buflen)
 | |
|                 return 0; /* buffer overflow */
 | |
|             buffer[i++] = *w++;
 | |
|         }
 | |
|         if (i >= buflen)
 | |
|             return 0; /* buffer overflow */
 | |
|         buffer[i++] = *w & 127;
 | |
|         if (*w == 128)
 | |
|             break; /* end of word */
 | |
|     }
 | |
| 
 | |
|     return 1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| _cmpname(PyObject *self, int code, const char* name, int namelen)
 | |
| {
 | |
|     /* check if code corresponds to the given name */
 | |
|     int i;
 | |
|     char buffer[NAME_MAXLEN+1];
 | |
|     if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
 | |
|         return 0;
 | |
|     for (i = 0; i < namelen; i++) {
 | |
|         if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
 | |
|             return 0;
 | |
|     }
 | |
|     return buffer[namelen] == '\0';
 | |
| }
 | |
| 
 | |
| static void
 | |
| find_syllable(const char *str, int *len, int *pos, int count, int column)
 | |
| {
 | |
|     int i, len1;
 | |
|     *len = -1;
 | |
|     for (i = 0; i < count; i++) {
 | |
|         const char *s = hangul_syllables[i][column];
 | |
|         len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
 | |
|         if (len1 <= *len)
 | |
|             continue;
 | |
|         if (strncmp(str, s, len1) == 0) {
 | |
|             *len = len1;
 | |
|             *pos = i;
 | |
|         }
 | |
|     }
 | |
|     if (*len == -1) {
 | |
|         *len = 0;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static int
 | |
| _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
 | |
| {
 | |
|     /* check if named sequences are allowed */
 | |
|     if (!with_named_seq && IS_NAMED_SEQ(cp))
 | |
|         return 0;
 | |
|     /* if the code point is in the PUA range that we use for aliases,
 | |
|      * convert it to obtain the right code point */
 | |
|     if (IS_ALIAS(cp))
 | |
|         *code = name_aliases[cp-aliases_start];
 | |
|     else
 | |
|         *code = cp;
 | |
|     return 1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
 | |
|          int with_named_seq)
 | |
| {
 | |
|     /* Return the code point associated with the given name.
 | |
|      * Named aliases are resolved too (unless self != NULL (i.e. we are using
 | |
|      * 3.2.0)).  If with_named_seq is 1, returns the PUA code point that we are
 | |
|      * using for the named sequence, and the caller must then convert it. */
 | |
|     unsigned int h, v;
 | |
|     unsigned int mask = code_size-1;
 | |
|     unsigned int i, incr;
 | |
| 
 | |
|     /* Check for hangul syllables. */
 | |
|     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 | |
|         int len, L = -1, V = -1, T = -1;
 | |
|         const char *pos = name + 16;
 | |
|         find_syllable(pos, &len, &L, LCount, 0);
 | |
|         pos += len;
 | |
|         find_syllable(pos, &len, &V, VCount, 1);
 | |
|         pos += len;
 | |
|         find_syllable(pos, &len, &T, TCount, 2);
 | |
|         pos += len;
 | |
|         if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
 | |
|             *code = SBase + (L*VCount+V)*TCount + T;
 | |
|             return 1;
 | |
|         }
 | |
|         /* Otherwise, it's an illegal syllable name. */
 | |
|         return 0;
 | |
|     }
 | |
| 
 | |
|     /* Check for unified ideographs. */
 | |
|     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 | |
|         /* Four or five hexdigits must follow. */
 | |
|         v = 0;
 | |
|         name += 22;
 | |
|         namelen -= 22;
 | |
|         if (namelen != 4 && namelen != 5)
 | |
|             return 0;
 | |
|         while (namelen--) {
 | |
|             v *= 16;
 | |
|             if (*name >= '0' && *name <= '9')
 | |
|                 v += *name - '0';
 | |
|             else if (*name >= 'A' && *name <= 'F')
 | |
|                 v += *name - 'A' + 10;
 | |
|             else
 | |
|                 return 0;
 | |
|             name++;
 | |
|         }
 | |
|         if (!is_unified_ideograph(v))
 | |
|             return 0;
 | |
|         *code = v;
 | |
|         return 1;
 | |
|     }
 | |
| 
 | |
|     /* the following is the same as python's dictionary lookup, with
 | |
|        only minor changes.  see the makeunicodedata script for more
 | |
|        details */
 | |
| 
 | |
|     h = (unsigned int) _gethash(name, namelen, code_magic);
 | |
|     i = (~h) & mask;
 | |
|     v = code_hash[i];
 | |
|     if (!v)
 | |
|         return 0;
 | |
|     if (_cmpname(self, v, name, namelen))
 | |
|         return _check_alias_and_seq(v, code, with_named_seq);
 | |
|     incr = (h ^ (h >> 3)) & mask;
 | |
|     if (!incr)
 | |
|         incr = mask;
 | |
|     for (;;) {
 | |
|         i = (i + incr) & mask;
 | |
|         v = code_hash[i];
 | |
|         if (!v)
 | |
|             return 0;
 | |
|         if (_cmpname(self, v, name, namelen))
 | |
|             return _check_alias_and_seq(v, code, with_named_seq);
 | |
|         incr = incr << 1;
 | |
|         if (incr > mask)
 | |
|             incr = incr ^ code_poly;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static const _PyUnicode_Name_CAPI hashAPI =
 | |
| {
 | |
|     sizeof(_PyUnicode_Name_CAPI),
 | |
|     _getucname,
 | |
|     _getcode
 | |
| };
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* Python bindings */
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.name
 | |
| 
 | |
|     self: self
 | |
|     chr: int(accept={str})
 | |
|     default: object=NULL
 | |
|     /
 | |
| 
 | |
| Returns the name assigned to the character chr as a string.
 | |
| 
 | |
| If no name is defined, default is returned, or, if not given,
 | |
| ValueError is raised.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
 | |
| /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
 | |
| {
 | |
|     char name[NAME_MAXLEN+1];
 | |
|     Py_UCS4 c = (Py_UCS4)chr;
 | |
| 
 | |
|     if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
 | |
|         if (default_value == NULL) {
 | |
|             PyErr_SetString(PyExc_ValueError, "no such name");
 | |
|             return NULL;
 | |
|         }
 | |
|         else {
 | |
|             Py_INCREF(default_value);
 | |
|             return default_value;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return PyUnicode_FromString(name);
 | |
| }
 | |
| 
 | |
| /*[clinic input]
 | |
| unicodedata.UCD.lookup
 | |
| 
 | |
|     self: self
 | |
|     name: str(accept={str, robuffer}, zeroes=True)
 | |
|     /
 | |
| 
 | |
| Look up character by name.
 | |
| 
 | |
| If a character with the given name is found, return the
 | |
| corresponding character.  If not found, KeyError is raised.
 | |
| [clinic start generated code]*/
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
 | |
|                             Py_ssize_clean_t name_length)
 | |
| /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
 | |
| {
 | |
|     Py_UCS4 code;
 | |
|     unsigned int index;
 | |
|     if (name_length > NAME_MAXLEN) {
 | |
|         PyErr_SetString(PyExc_KeyError, "name too long");
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     if (!_getcode(self, name, (int)name_length, &code, 1)) {
 | |
|         PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
 | |
|         return NULL;
 | |
|     }
 | |
|     /* check if code is in the PUA range that we use for named sequences
 | |
|        and convert it */
 | |
|     if (IS_NAMED_SEQ(code)) {
 | |
|         index = code-named_sequences_start;
 | |
|         return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
 | |
|                                          named_sequences[index].seq,
 | |
|                                          named_sequences[index].seqlen);
 | |
|     }
 | |
|     return PyUnicode_FromOrdinal(code);
 | |
| }
 | |
| 
 | |
| /* XXX Add doc strings. */
 | |
| 
 | |
| static PyMethodDef unicodedata_functions[] = {
 | |
|     UNICODEDATA_UCD_DECIMAL_METHODDEF
 | |
|     UNICODEDATA_UCD_DIGIT_METHODDEF
 | |
|     UNICODEDATA_UCD_NUMERIC_METHODDEF
 | |
|     UNICODEDATA_UCD_CATEGORY_METHODDEF
 | |
|     UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
 | |
|     UNICODEDATA_UCD_COMBINING_METHODDEF
 | |
|     UNICODEDATA_UCD_MIRRORED_METHODDEF
 | |
|     UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
 | |
|     UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
 | |
|     UNICODEDATA_UCD_NAME_METHODDEF
 | |
|     UNICODEDATA_UCD_LOOKUP_METHODDEF
 | |
|     UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
 | |
|     UNICODEDATA_UCD_NORMALIZE_METHODDEF
 | |
|     {NULL, NULL}                /* sentinel */
 | |
| };
 | |
| 
 | |
| static PyTypeObject UCD_Type = {
 | |
|         /* The ob_type field must be initialized in the module init function
 | |
|          * to be portable to Windows without using C++. */
 | |
|         PyVarObject_HEAD_INIT(NULL, 0)
 | |
|         "unicodedata.UCD",              /*tp_name*/
 | |
|         sizeof(PreviousDBVersion),      /*tp_basicsize*/
 | |
|         0,                      /*tp_itemsize*/
 | |
|         /* methods */
 | |
|         (destructor)PyObject_Del, /*tp_dealloc*/
 | |
|         0,                      /*tp_print*/
 | |
|         0,                      /*tp_getattr*/
 | |
|         0,                      /*tp_setattr*/
 | |
|         0,                      /*tp_reserved*/
 | |
|         0,                      /*tp_repr*/
 | |
|         0,                      /*tp_as_number*/
 | |
|         0,                      /*tp_as_sequence*/
 | |
|         0,                      /*tp_as_mapping*/
 | |
|         0,                      /*tp_hash*/
 | |
|         0,                      /*tp_call*/
 | |
|         0,                      /*tp_str*/
 | |
|         PyObject_GenericGetAttr,/*tp_getattro*/
 | |
|         0,                      /*tp_setattro*/
 | |
|         0,                      /*tp_as_buffer*/
 | |
|         Py_TPFLAGS_DEFAULT,     /*tp_flags*/
 | |
|         0,                      /*tp_doc*/
 | |
|         0,                      /*tp_traverse*/
 | |
|         0,                      /*tp_clear*/
 | |
|         0,                      /*tp_richcompare*/
 | |
|         0,                      /*tp_weaklistoffset*/
 | |
|         0,                      /*tp_iter*/
 | |
|         0,                      /*tp_iternext*/
 | |
|         unicodedata_functions,  /*tp_methods*/
 | |
|         DB_members,             /*tp_members*/
 | |
|         0,                      /*tp_getset*/
 | |
|         0,                      /*tp_base*/
 | |
|         0,                      /*tp_dict*/
 | |
|         0,                      /*tp_descr_get*/
 | |
|         0,                      /*tp_descr_set*/
 | |
|         0,                      /*tp_dictoffset*/
 | |
|         0,                      /*tp_init*/
 | |
|         0,                      /*tp_alloc*/
 | |
|         0,                      /*tp_new*/
 | |
|         0,                      /*tp_free*/
 | |
|         0,                      /*tp_is_gc*/
 | |
| };
 | |
| 
 | |
| PyDoc_STRVAR(unicodedata_docstring,
 | |
| "This module provides access to the Unicode Character Database which\n\
 | |
| defines character properties for all Unicode characters. The data in\n\
 | |
| this database is based on the UnicodeData.txt file version\n\
 | |
| " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
 | |
| \n\
 | |
| The module uses the same names and symbols as defined by the\n\
 | |
| UnicodeData File Format " UNIDATA_VERSION ".");
 | |
| 
 | |
| static struct PyModuleDef unicodedatamodule = {
 | |
|         PyModuleDef_HEAD_INIT,
 | |
|         "unicodedata",
 | |
|         unicodedata_docstring,
 | |
|         -1,
 | |
|         unicodedata_functions,
 | |
|         NULL,
 | |
|         NULL,
 | |
|         NULL,
 | |
|         NULL
 | |
| };
 | |
| 
 | |
| PyMODINIT_FUNC
 | |
| PyInit_unicodedata(void)
 | |
| {
 | |
|     PyObject *m, *v;
 | |
| 
 | |
|     Py_TYPE(&UCD_Type) = &PyType_Type;
 | |
| 
 | |
|     m = PyModule_Create(&unicodedatamodule);
 | |
|     if (!m)
 | |
|         return NULL;
 | |
| 
 | |
|     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
 | |
|     Py_INCREF(&UCD_Type);
 | |
|     PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
 | |
| 
 | |
|     /* Previous versions */
 | |
|     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
 | |
|     if (v != NULL)
 | |
|         PyModule_AddObject(m, "ucd_3_2_0", v);
 | |
| 
 | |
|     /* Export C API */
 | |
|     v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
 | |
|     if (v != NULL)
 | |
|         PyModule_AddObject(m, "ucnhash_CAPI", v);
 | |
|     return m;
 | |
| }
 | |
| 
 | |
| /*
 | |
| Local variables:
 | |
| c-basic-offset: 4
 | |
| indent-tabs-mode: nil
 | |
| End:
 | |
| */
 | 
