Move ucnhash functionality into unicodedata (after the recent crop of
changes, the files are small enough to do this).
Also adds "name" and "lookup" functions to unicodedata.
2025-11-01 18:51:43 +00:00 · 2001-01-24 07:59:11 +00:00 · 2001-01-24 07:59:11 +00:00 · 06d126803c
commit 06d126803c
parent eda28445c0
4 changed files with 248 additions and 228 deletions
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -12,6 +12,9 @@
   ------------------------------------------------------------------------ */

 #include "Python.h"
+#include "ucnhash.h"
+
+/* character properties */

 typedef struct {
    const unsigned char category;	/* index into
@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
    PyObject *defobj = NULL;
    long rc;

-    if (!PyArg_ParseTuple(args, "O!|O:decimal",
-			  &PyUnicode_Type, &v, &defobj))
+    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
 	PyErr_SetString(PyExc_TypeError,
@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
    PyObject *defobj = NULL;
    long rc;

-    if (!PyArg_ParseTuple(args, "O!|O:digit",
-			  &PyUnicode_Type, &v, &defobj))
+    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
 	PyErr_SetString(PyExc_TypeError,
@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
    rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
-	    PyErr_SetString(PyExc_ValueError,
-			    "not a digit");
+	    PyErr_SetString(PyExc_ValueError, "not a digit");
            return NULL;
 	}
 	else {
@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
    PyObject *defobj = NULL;
    double rc;

-    if (!PyArg_ParseTuple(args, "O!|O:numeric",
-			  &PyUnicode_Type, &v, &defobj))
+    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
 	PyErr_SetString(PyExc_TypeError,
@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
-	    PyErr_SetString(PyExc_ValueError,
-			    "not a numeric character");
+	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
 	    return NULL;
 	}
 	else {
@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
    return PyString_FromString(decomp);
 }

+/* -------------------------------------------------------------------- */
+/* unicode character name tables */
+
+/* data file generated by Tools/unicode/makeunicodedata.py */
+#include "unicodename_db.h"
+
+/* -------------------------------------------------------------------- */
+/* database code (cut and pasted from the unidb package) */
+
+/* Compute the case-insensitive hash of a character name.
+ *
+ * s      - name bytes (need not be NUL-terminated)
+ * len    - number of bytes of s to hash
+ * scale  - multiplier applied to the running hash before each byte is added
+ *
+ * Each byte is upper-cased before it is mixed in, so "a" and "A" hash
+ * alike.  Whenever bits 24-31 of the running value become non-zero they
+ * are XORed into the low byte and the value is masked to 24 bits.
+ */
+static unsigned long
+gethash(const char *s, int len, int scale)
+{
+    int i;
+    unsigned long h = 0;
+    unsigned long ix;
+    for (i = 0; i < len; i++) {
+        /* convert to unsigned char *before* calling toupper(): handing it
+           a negative plain char is undefined behaviour */
+        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
+        ix = h & 0xff000000;
+        if (ix)
+            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
+    }
+    return h;
+}
+
+/* Look up the name of a character and copy it into a caller buffer.
+ *
+ * code    - code point to look up; values of 65536 and above are rejected
+ * buffer  - receives the name
+ * buflen  - size of buffer in bytes
+ *
+ * Returns 1 on success, and 0 if the code point has no phrasebook entry
+ * or the name does not fit in the buffer.  On success the name is
+ * NUL-terminated: the 0x80 byte that ends the last word is stored as 0
+ * after being masked with 127.  On failure the buffer may hold a partial
+ * name.
+ */
+static int
+getname(Py_UCS4 code, char* buffer, int buflen)
+{
+    int offset;
+    int i;
+    int word;
+    unsigned char* w;
+
+    if (code < 0 || code >= 65536)
+        return 0;
+
+    /* get offset into phrasebook (two-level table lookup) */
+    offset = phrasebook_offset1[(code>>phrasebook_shift)];
+    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
+                               (code&((1<<phrasebook_shift)-1))];
+    if (!offset)
+        return 0;
+
+    i = 0;
+
+    for (;;) {
+        /* get word index: one or two phrasebook bytes, depending on
+           whether the first byte reaches phrasebook_short */
+        word = phrasebook[offset] - phrasebook_short;
+        if (word >= 0) {
+            word = (word << 8) + phrasebook[offset+1];
+            offset += 2;
+        } else
+            word = phrasebook[offset++];
+        if (i) {
+            /* separate words with a space; ">=" (not ">") so that the
+               write below can never land on buffer[buflen] */
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = ' ';
+        }
+        /* copy word string from lexicon.  the last character in the
+           word has bit 7 set.  the last word in a string ends with
+           0x80 */
+        w = lexicon + lexicon_offset[word];
+        while (*w < 128) {
+            if (i >= buflen)
+                return 0; /* buffer overflow */
+            buffer[i++] = *w++;
+        }
+        if (i >= buflen)
+            return 0; /* buffer overflow */
+        buffer[i++] = *w & 127;
+        if (*w == 128)
+            break; /* end of name; the byte just stored is the NUL */
+    }
+
+    return 1;
+}
+
+/* Check whether the character `code` is named `name`.
+ *
+ * code     - code point whose name is fetched with getname()
+ * name     - candidate name, compared case-insensitively
+ * namelen  - number of bytes in name
+ *
+ * Returns 1 if the first namelen bytes of name, upper-cased, equal the
+ * complete name of code, and 0 otherwise (including when code has no
+ * name).
+ */
+static int
+cmpname(int code, const char* name, int namelen)
+{
+    int i;
+    char buffer[NAME_MAXLEN];
+
+    /* a candidate that cannot fit in the buffer (terminator included)
+       cannot match; rejecting it here also keeps buffer[namelen] below
+       inside the array */
+    if (namelen < 0 || namelen >= (int) sizeof(buffer))
+        return 0;
+    if (!getname(code, buffer, sizeof(buffer)))
+        return 0;
+    for (i = 0; i < namelen; i++) {
+        /* stop at the end of the decoded name so that bytes after its
+           terminator are never examined */
+        if (buffer[i] == '\0')
+            return 0;
+        /* toupper() must be given an unsigned char value */
+        if (toupper((unsigned char) name[i]) != buffer[i])
+            return 0;
+    }
+    return buffer[namelen] == '\0';
+}
+
+/* Map a character name to its code point.
+ *
+ * name     - character name, matched case-insensitively
+ * namelen  - number of bytes in name
+ * code     - receives the code point on success; untouched otherwise
+ *
+ * Returns 1 if the name was found and 0 if it was not.  A zero entry in
+ * code_hash marks an empty slot and ends the probe sequence.
+ */
+static int
+getcode(const char* name, int namelen, Py_UCS4* code)
+{
+    unsigned int h, v;
+    unsigned int mask = code_size-1;
+    unsigned int i, incr;
+
+    /* the following is the same as python's dictionary lookup, with
+       only minor changes.  see the makeunicodedata script for more
+       details */
+
+    h = (unsigned int) gethash(name, namelen, code_magic);
+    i = (~h) & mask;
+    v = code_hash[i];
+    if (!v)
+        return 0;
+    if (cmpname(v, name, namelen)) {
+        *code = v;
+        return 1;
+    }
+    incr = (h ^ (h >> 3)) & mask;
+    if (!incr)
+        incr = mask;
+    for (;;) {
+        i = (i + incr) & mask;
+        v = code_hash[i];
+        if (!v)
+            /* not found: report it the same way as the first probe.
+               callers test the result for truth, so a non-zero "miss"
+               value would be taken for success with *code unset */
+            return 0;
+        if (cmpname(v, name, namelen)) {
+            *code = v;
+            return 1;
+        }
+        incr = incr << 1;
+        if (incr > mask)
+            incr = incr ^ code_poly;
+    }
+}
+
+/* Table of the name-database entry points: the structure size followed
+   by the getname and getcode functions defined in this file.
+   initunicodedata() publishes its address in the module dictionary as
+   "ucnhash_CAPI".  The structure layout is declared outside this
+   section (presumably in ucnhash.h -- confirm the member order there). */
+static const _PyUnicode_Name_CAPI hashAPI = 
+{
+    sizeof(_PyUnicode_Name_CAPI),
+    getname,
+    getcode
+};
+
+/* -------------------------------------------------------------------- */
+/* Python bindings */
+
+/* unicodedata.name(unichr[, default])
+ *
+ * Returns the name of a single Unicode character as a string.  If the
+ * character has no name, the optional default is returned (as a new
+ * reference); without a default, ValueError is raised.  TypeError is
+ * raised when the argument is not exactly one character long.
+ */
+static PyObject *
+unicodedata_name(PyObject* self, PyObject* args)
+{
+    PyUnicodeObject* ch;
+    PyObject* fallback = NULL;
+    char namebuf[NAME_MAXLEN];
+    Py_UCS4 code;
+
+    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &ch, &fallback))
+        return NULL;
+
+    if (PyUnicode_GET_SIZE(ch) != 1) {
+        PyErr_SetString(PyExc_TypeError,
+                        "need a single Unicode character as parameter");
+        return NULL;
+    }
+
+    /* common case first: the character has a name */
+    code = (Py_UCS4) *PyUnicode_AS_UNICODE(ch);
+    if (getname(code, namebuf, sizeof(namebuf)))
+        return Py_BuildValue("s", namebuf);
+
+    /* unnamed character: hand back the caller's default if there is one */
+    if (fallback != NULL) {
+        Py_INCREF(fallback);
+        return fallback;
+    }
+
+    PyErr_SetString(PyExc_ValueError, "no such name");
+    return NULL;
+}
+
+/* unicodedata.lookup(name)
+ *
+ * Returns the Unicode character with the given name as a one-character
+ * unicode string.  Raises KeyError if no character has that name.
+ */
+static PyObject *
+unicodedata_lookup(PyObject* self, PyObject* args)
+{
+    Py_UCS4 code = 0;
+    Py_UNICODE str[1];
+
+    char* name;
+    int namelen;
+    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
+        return NULL;
+
+    /* accept only an explicit success: getcode() can report a miss with
+       a non-zero value, which a plain truth test would take for a hit
+       and then build a string from an unset code */
+    if (getcode(name, namelen, &code) != 1) {
+        PyErr_SetString(PyExc_KeyError, "undefined character name");
+        return NULL;
+    }
+
+    str[0] = (Py_UNICODE) code;
+    return PyUnicode_FromUnicode(str, 1);
+}
+
 /* XXX Add doc strings. */

+/* Method table for the module: Python-visible name, C implementation and
+   calling-convention flag.  This change spells the flag METH_VARARGS
+   where the old entries used the literal 1, and adds the "name" and
+   "lookup" entries.  The table ends with a NULL sentinel. */
 static PyMethodDef unicodedata_functions[] = {
-    {"decimal",		unicodedata_decimal,			1},
-    {"digit",		unicodedata_digit,			1},
-    {"numeric",		unicodedata_numeric,			1},
-    {"category",	unicodedata_category,			1},
-    {"bidirectional",	unicodedata_bidirectional,		1},
-    {"combining",	unicodedata_combining,			1},
-    {"mirrored",	unicodedata_mirrored,			1},
-    {"decomposition",	unicodedata_decomposition,		1},
+    {"decimal", unicodedata_decimal, METH_VARARGS},
+    {"digit", unicodedata_digit, METH_VARARGS},
+    {"numeric", unicodedata_numeric, METH_VARARGS},
+    {"category", unicodedata_category, METH_VARARGS},
+    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
+    {"combining", unicodedata_combining, METH_VARARGS},
+    {"mirrored", unicodedata_mirrored, METH_VARARGS},
+    {"decomposition",unicodedata_decomposition, METH_VARARGS},
+    {"name", unicodedata_name, METH_VARARGS},
+    {"lookup", unicodedata_lookup, METH_VARARGS},
     {NULL, NULL}		/* sentinel */
  };

+/* Module docstring; passed to Py_InitModule4() in initunicodedata(). */
+static char *unicodedata_docstring = "unicode character database";
+
 DL_EXPORT(void)
 initunicodedata(void)
 {
-    Py_InitModule("unicodedata", unicodedata_functions);
+    PyObject *m, *d, *v;
+
+    m = Py_InitModule4(
+        "unicodedata", unicodedata_functions,
+        unicodedata_docstring, NULL, PYTHON_API_VERSION);
+    if (!m)
+        return;
+
+    d = PyModule_GetDict(m);
+    if (!d)
+        return;
+
+    /* Export C API */
+    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
+    PyDict_SetItemString(d, "ucnhash_CAPI", v);
+    Py_XDECREF(v);
+
 }