mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 10:26:02 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			477 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			477 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /* ------------------------------------------------------------------------
 | |
| 
 | |
|    unicodedata -- Provides access to the Unicode 3.0 data base.
 | |
| 
 | |
|    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
 | |
| 
 | |
|    Written by Marc-Andre Lemburg (mal@lemburg.com).
 | |
|    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
 | |
| 
 | |
|    Copyright (c) Corporation for National Research Initiatives.
 | |
| 
 | |
|    ------------------------------------------------------------------------ */
 | |
| 
 | |
| #include "Python.h"
 | |
| #include "ucnhash.h"
 | |
| 
 | |
| /* character properties */
 | |
| 
 | |
/* Per-character property record.  Instances live in the generated
   _PyUnicode_Database_Records table and are located via _getrecord(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
 | |
| 
 | |
| /* data file generated by Tools/unicode/makeunicodedata.py */
 | |
| #include "unicodedata_db.h"
 | |
| 
 | |
| static const _PyUnicode_DatabaseRecord*
 | |
| _getrecord(PyUnicodeObject* v)
 | |
| {
 | |
|     int code;
 | |
|     int index;
 | |
| 
 | |
|     code = (int) *PyUnicode_AS_UNICODE(v);
 | |
| 
 | |
|     if (code < 0 || code >= 65536)
 | |
|         index = 0;
 | |
|     else {
 | |
|         index = index1[(code>>SHIFT)];
 | |
|         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
 | |
|     }
 | |
| 
 | |
|     return &_PyUnicode_Database_Records[index];
 | |
| }
 | |
| 
 | |
| /* --- Module API --------------------------------------------------------- */
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_decimal(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     PyObject *defobj = NULL;
 | |
|     long rc;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
 | |
|         return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
|         return NULL;
 | |
|     }
 | |
|     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
 | |
|     if (rc < 0) {
 | |
| 	if (defobj == NULL) {
 | |
| 	    PyErr_SetString(PyExc_ValueError,
 | |
| 			    "not a decimal");
 | |
|             return NULL;
 | |
| 	}
 | |
| 	else {
 | |
| 	    Py_INCREF(defobj);
 | |
| 	    return defobj;
 | |
| 	}
 | |
|     }
 | |
|     return PyInt_FromLong(rc);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_digit(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     PyObject *defobj = NULL;
 | |
|     long rc;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 | |
|         return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
|         return NULL;
 | |
|     }
 | |
|     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
 | |
|     if (rc < 0) {
 | |
| 	if (defobj == NULL) {
 | |
| 	    PyErr_SetString(PyExc_ValueError, "not a digit");
 | |
|             return NULL;
 | |
| 	}
 | |
| 	else {
 | |
| 	    Py_INCREF(defobj);
 | |
| 	    return defobj;
 | |
| 	}
 | |
|     }
 | |
|     return PyInt_FromLong(rc);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_numeric(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     PyObject *defobj = NULL;
 | |
|     double rc;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 | |
|         return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
|     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
 | |
|     if (rc < 0) {
 | |
| 	if (defobj == NULL) {
 | |
| 	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
 | |
| 	    return NULL;
 | |
| 	}
 | |
| 	else {
 | |
| 	    Py_INCREF(defobj);
 | |
| 	    return defobj;
 | |
| 	}
 | |
|     }
 | |
|     return PyFloat_FromDouble(rc);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_category(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     int index;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!:category",
 | |
| 			  &PyUnicode_Type, &v))
 | |
| 	return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
|     index = (int) _getrecord(v)->category;
 | |
|     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_bidirectional(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     int index;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 | |
| 			  &PyUnicode_Type, &v))
 | |
| 	return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
|     index = (int) _getrecord(v)->bidirectional;
 | |
|     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_combining(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!:combining",
 | |
| 			  &PyUnicode_Type, &v))
 | |
| 	return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
|     return PyInt_FromLong((int) _getrecord(v)->combining);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_mirrored(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!:mirrored",
 | |
| 			  &PyUnicode_Type, &v))
 | |
| 	return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
|     return PyInt_FromLong((int) _getrecord(v)->mirrored);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_decomposition(PyObject *self, PyObject *args)
 | |
| {
 | |
|     PyUnicodeObject *v;
 | |
|     char decomp[256];
 | |
|     int code, index, count, i;
 | |
| 
 | |
|     if (!PyArg_ParseTuple(args, "O!:decomposition",
 | |
| 			  &PyUnicode_Type, &v))
 | |
| 	return NULL;
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
| 
 | |
|     code = (int) *PyUnicode_AS_UNICODE(v);
 | |
| 
 | |
|     if (code < 0 || code >= 65536)
 | |
|         index = 0;
 | |
|     else {
 | |
|         index = decomp_index1[(code>>DECOMP_SHIFT)];
 | |
|         index = decomp_index2[(index<<DECOMP_SHIFT)+
 | |
|                              (code&((1<<DECOMP_SHIFT)-1))];
 | |
|     }
 | |
| 
 | |
|     /* high byte is number of hex bytes (usually one or two), low byte
 | |
|        is prefix code (from*/
 | |
|     count = decomp_data[index] >> 8;
 | |
| 
 | |
|     /* XXX: could allocate the PyString up front instead
 | |
|        (strlen(prefix) + 5 * count + 1 bytes) */
 | |
| 
 | |
|     /* copy prefix */
 | |
|     i = strlen(decomp_prefix[decomp_data[index] & 255]);
 | |
|     memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
 | |
| 
 | |
|     while (count-- > 0) {
 | |
|         if (i)
 | |
|             decomp[i++] = ' ';
 | |
|         assert((size_t)i < sizeof(decomp));
 | |
|         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 | |
|                       decomp_data[++index]);
 | |
|         i += strlen(decomp + i);
 | |
|     }
 | |
|     
 | |
|     decomp[i] = '\0';
 | |
| 
 | |
|     return PyString_FromString(decomp);
 | |
| }
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* unicode character name tables */
 | |
| 
 | |
| /* data file generated by Tools/unicode/makeunicodedata.py */
 | |
| #include "unicodename_db.h"
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* database code (cut and pasted from the unidb package) */
 | |
| 
 | |
/* Compute the case-insensitive hash of the first len bytes of s.

   s     -- bytes to hash (need not be NUL-terminated)
   len   -- number of bytes to hash; values <= 0 yield 0
   scale -- multiplier applied to the running hash at every step

   Each byte is upper-cased before it is mixed in.  Whenever bits 24-31
   of the running value become non-zero they are folded (xor) into the
   low byte and the value is masked back to 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and values representable as
           unsigned char, so convert *before* the call: a plain char may
           be negative for bytes >= 0x80 */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
 | |
| 
 | |
| static int
 | |
| _Py_getname(Py_UCS4 code, char* buffer, int buflen)
 | |
| {
 | |
|     int offset;
 | |
|     int i;
 | |
|     int word;
 | |
|     unsigned char* w;
 | |
| 
 | |
|     if (code >= 65536)
 | |
|         return 0;
 | |
| 
 | |
|     /* get offset into phrasebook */
 | |
|     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 | |
|     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 | |
|                                (code&((1<<phrasebook_shift)-1))];
 | |
|     if (!offset)
 | |
|         return 0;
 | |
| 
 | |
|     i = 0;
 | |
| 
 | |
|     for (;;) {
 | |
|         /* get word index */
 | |
|         word = phrasebook[offset] - phrasebook_short;
 | |
|         if (word >= 0) {
 | |
|             word = (word << 8) + phrasebook[offset+1];
 | |
|             offset += 2;
 | |
|         } else
 | |
|             word = phrasebook[offset++];
 | |
|         if (i) {
 | |
|             if (i > buflen)
 | |
|                 return 0; /* buffer overflow */
 | |
|             buffer[i++] = ' ';
 | |
|         }
 | |
|         /* copy word string from lexicon.  the last character in the
 | |
|            word has bit 7 set.  the last word in a string ends with
 | |
|            0x80 */
 | |
|         w = lexicon + lexicon_offset[word];
 | |
|         while (*w < 128) {
 | |
|             if (i >= buflen)
 | |
|                 return 0; /* buffer overflow */
 | |
|             buffer[i++] = *w++;
 | |
|         }
 | |
|         if (i >= buflen)
 | |
|             return 0; /* buffer overflow */
 | |
|         buffer[i++] = *w & 127;
 | |
|         if (*w == 128)
 | |
|             break; /* end of word */
 | |
|     }
 | |
| 
 | |
|     return 1;
 | |
| }
 | |
| 
 | |
| static int
 | |
| _cmpname(int code, const char* name, int namelen)
 | |
| {
 | |
|     /* check if code corresponds to the given name */
 | |
|     int i;
 | |
|     char buffer[NAME_MAXLEN];
 | |
|     if (!_Py_getname(code, buffer, sizeof(buffer)))
 | |
|         return 0;
 | |
|     for (i = 0; i < namelen; i++) {
 | |
|         if (toupper(name[i]) != buffer[i])
 | |
|             return 0;
 | |
|     }
 | |
|     return buffer[namelen] == '\0';
 | |
| }
 | |
| 
 | |
| static int
 | |
| _getcode(const char* name, int namelen, Py_UCS4* code)
 | |
| {
 | |
|     unsigned int h, v;
 | |
|     unsigned int mask = code_size-1;
 | |
|     unsigned int i, incr;
 | |
| 
 | |
|     /* the following is the same as python's dictionary lookup, with
 | |
|        only minor changes.  see the makeunicodedata script for more
 | |
|        details */
 | |
| 
 | |
|     h = (unsigned int) _gethash(name, namelen, code_magic);
 | |
|     i = (~h) & mask;
 | |
|     v = code_hash[i];
 | |
|     if (!v)
 | |
|         return 0;
 | |
|     if (_cmpname(v, name, namelen)) {
 | |
|         *code = v;
 | |
|         return 1;
 | |
|     }
 | |
|     incr = (h ^ (h >> 3)) & mask;
 | |
|     if (!incr)
 | |
|         incr = mask;
 | |
|     for (;;) {
 | |
|         i = (i + incr) & mask;
 | |
|         v = code_hash[i];
 | |
|         if (!v)
 | |
|             return 0;
 | |
|         if (_cmpname(v, name, namelen)) {
 | |
|             *code = v;
 | |
|             return 1;
 | |
|         }
 | |
|         incr = incr << 1;
 | |
|         if (incr > mask)
 | |
|             incr = incr ^ code_poly;
 | |
|     }
 | |
| }
 | |
| 
 | |
| static const _PyUnicode_Name_CAPI hashAPI = 
 | |
| {
 | |
|     sizeof(_PyUnicode_Name_CAPI),
 | |
|     _Py_getname,
 | |
|     _getcode
 | |
| };
 | |
| 
 | |
| /* -------------------------------------------------------------------- */
 | |
| /* Python bindings */
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_name(PyObject* self, PyObject* args)
 | |
| {
 | |
|     char name[NAME_MAXLEN];
 | |
| 
 | |
|     PyUnicodeObject* v;
 | |
|     PyObject* defobj = NULL;
 | |
|     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 | |
|         return NULL;
 | |
| 
 | |
|     if (PyUnicode_GET_SIZE(v) != 1) {
 | |
| 	PyErr_SetString(PyExc_TypeError,
 | |
| 			"need a single Unicode character as parameter");
 | |
| 	return NULL;
 | |
|     }
 | |
| 
 | |
|     if (!_Py_getname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 | |
|                              name, sizeof(name))) {
 | |
| 	if (defobj == NULL) {
 | |
| 	    PyErr_SetString(PyExc_ValueError, "no such name");
 | |
|             return NULL;
 | |
| 	}
 | |
| 	else {
 | |
| 	    Py_INCREF(defobj);
 | |
| 	    return defobj;
 | |
| 	}
 | |
|     }
 | |
| 
 | |
|     return Py_BuildValue("s", name);
 | |
| }
 | |
| 
 | |
| static PyObject *
 | |
| unicodedata_lookup(PyObject* self, PyObject* args)
 | |
| {
 | |
|     Py_UCS4 code;
 | |
|     Py_UNICODE str[1];
 | |
| 
 | |
|     char* name;
 | |
|     int namelen;
 | |
|     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 | |
|         return NULL;
 | |
| 
 | |
|     if (!_getcode(name, namelen, &code)) {
 | |
|         PyErr_SetString(PyExc_KeyError, "undefined character name");
 | |
|         return NULL;
 | |
|     }
 | |
| 
 | |
|     str[0] = (Py_UNICODE) code;
 | |
|     return PyUnicode_FromUnicode(str, 1);
 | |
| }
 | |
| 
 | |
| /* XXX Add doc strings. */
 | |
| 
 | |
| static PyMethodDef unicodedata_functions[] = {
 | |
|     {"decimal", unicodedata_decimal, METH_VARARGS},
 | |
|     {"digit", unicodedata_digit, METH_VARARGS},
 | |
|     {"numeric", unicodedata_numeric, METH_VARARGS},
 | |
|     {"category", unicodedata_category, METH_VARARGS},
 | |
|     {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
 | |
|     {"combining", unicodedata_combining, METH_VARARGS},
 | |
|     {"mirrored", unicodedata_mirrored, METH_VARARGS},
 | |
|     {"decomposition",unicodedata_decomposition, METH_VARARGS},
 | |
|     {"name", unicodedata_name, METH_VARARGS},
 | |
|     {"lookup", unicodedata_lookup, METH_VARARGS},
 | |
|     {NULL, NULL}		/* sentinel */
 | |
| };
 | |
| 
 | |
| static char *unicodedata_docstring = "unicode character database";
 | |
| 
 | |
| DL_EXPORT(void)
 | |
| initunicodedata(void)
 | |
| {
 | |
|     PyObject *m, *v;
 | |
| 
 | |
|     m = Py_InitModule3(
 | |
|         "unicodedata", unicodedata_functions, unicodedata_docstring);
 | |
|     if (!m)
 | |
|         return;
 | |
| 
 | |
|     /* Export C API */
 | |
|     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
 | |
|     if (v != NULL)
 | |
|         PyModule_AddObject(m, "ucnhash_CAPI", v);
 | |
| }
 | 
