mirror of
https://github.com/python/cpython.git
synced 2025-11-01 18:51:43 +00:00
Move uchhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this). Also adds "name" and "lookup" functions to unicodedata.
This commit is contained in:
parent
eda28445c0
commit
06d126803c
4 changed files with 248 additions and 228 deletions
|
|
@ -12,6 +12,9 @@
|
|||
------------------------------------------------------------------------ */
|
||||
|
||||
#include "Python.h"
|
||||
#include "ucnhash.h"
|
||||
|
||||
/* character properties */
|
||||
|
||||
typedef struct {
|
||||
const unsigned char category; /* index into
|
||||
|
|
@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
|
|||
PyObject *defobj = NULL;
|
||||
long rc;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:decimal",
|
||||
&PyUnicode_Type, &v, &defobj))
|
||||
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
|
||||
return NULL;
|
||||
if (PyUnicode_GET_SIZE(v) != 1) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
|
|
@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
|
|||
PyObject *defobj = NULL;
|
||||
long rc;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:digit",
|
||||
&PyUnicode_Type, &v, &defobj))
|
||||
if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
|
||||
return NULL;
|
||||
if (PyUnicode_GET_SIZE(v) != 1) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
|
|
@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
|
|||
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
|
||||
if (rc < 0) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"not a digit");
|
||||
PyErr_SetString(PyExc_ValueError, "not a digit");
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
|
|
@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
|||
PyObject *defobj = NULL;
|
||||
double rc;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:numeric",
|
||||
&PyUnicode_Type, &v, &defobj))
|
||||
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
|
||||
return NULL;
|
||||
if (PyUnicode_GET_SIZE(v) != 1) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
|
|
@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
|||
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
|
||||
if (rc < 0) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"not a numeric character");
|
||||
PyErr_SetString(PyExc_ValueError, "not a numeric character");
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
|
|
@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
|
|||
return PyString_FromString(decomp);
|
||||
}
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
/* unicode character name tables */
|
||||
|
||||
/* data file generated by Tools/unicode/makeunicodedata.py */
|
||||
#include "unicodename_db.h"
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
/* database code (cut and pasted from the unidb package) */
|
||||
|
||||
static unsigned long
|
||||
gethash(const char *s, int len, int scale)
|
||||
{
|
||||
int i;
|
||||
unsigned long h = 0;
|
||||
unsigned long ix;
|
||||
for (i = 0; i < len; i++) {
|
||||
h = (h * scale) + (unsigned char) toupper(s[i]);
|
||||
ix = h & 0xff000000;
|
||||
if (ix)
|
||||
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
static int
|
||||
getname(Py_UCS4 code, char* buffer, int buflen)
|
||||
{
|
||||
int offset;
|
||||
int i;
|
||||
int word;
|
||||
unsigned char* w;
|
||||
|
||||
if (code < 0 || code >= 65536)
|
||||
return 0;
|
||||
|
||||
/* get offset into phrasebook */
|
||||
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
||||
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
||||
(code&((1<<phrasebook_shift)-1))];
|
||||
if (!offset)
|
||||
return 0;
|
||||
|
||||
i = 0;
|
||||
|
||||
for (;;) {
|
||||
/* get word index */
|
||||
word = phrasebook[offset] - phrasebook_short;
|
||||
if (word >= 0) {
|
||||
word = (word << 8) + phrasebook[offset+1];
|
||||
offset += 2;
|
||||
} else
|
||||
word = phrasebook[offset++];
|
||||
if (i) {
|
||||
if (i > buflen)
|
||||
return 0; /* buffer overflow */
|
||||
buffer[i++] = ' ';
|
||||
}
|
||||
/* copy word string from lexicon. the last character in the
|
||||
word has bit 7 set. the last word in a string ends with
|
||||
0x80 */
|
||||
w = lexicon + lexicon_offset[word];
|
||||
while (*w < 128) {
|
||||
if (i >= buflen)
|
||||
return 0; /* buffer overflow */
|
||||
buffer[i++] = *w++;
|
||||
}
|
||||
if (i >= buflen)
|
||||
return 0; /* buffer overflow */
|
||||
buffer[i++] = *w & 127;
|
||||
if (*w == 128)
|
||||
break; /* end of word */
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int
|
||||
cmpname(int code, const char* name, int namelen)
|
||||
{
|
||||
/* check if code corresponds to the given name */
|
||||
int i;
|
||||
char buffer[NAME_MAXLEN];
|
||||
if (!getname(code, buffer, sizeof(buffer)))
|
||||
return 0;
|
||||
for (i = 0; i < namelen; i++) {
|
||||
if (toupper(name[i]) != buffer[i])
|
||||
return 0;
|
||||
}
|
||||
return buffer[namelen] == '\0';
|
||||
}
|
||||
|
||||
static int
|
||||
getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
{
|
||||
unsigned int h, v;
|
||||
unsigned int mask = code_size-1;
|
||||
unsigned int i, incr;
|
||||
|
||||
/* the following is the same as python's dictionary lookup, with
|
||||
only minor changes. see the makeunicodedata script for more
|
||||
details */
|
||||
|
||||
h = (unsigned int) gethash(name, namelen, code_magic);
|
||||
i = (~h) & mask;
|
||||
v = code_hash[i];
|
||||
if (!v)
|
||||
return 0;
|
||||
if (cmpname(v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
incr = (h ^ (h >> 3)) & mask;
|
||||
if (!incr)
|
||||
incr = mask;
|
||||
for (;;) {
|
||||
i = (i + incr) & mask;
|
||||
v = code_hash[i];
|
||||
if (!v)
|
||||
return -1;
|
||||
if (cmpname(v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
incr = incr << 1;
|
||||
if (incr > mask)
|
||||
incr = incr ^ code_poly;
|
||||
}
|
||||
}
|
||||
|
||||
static const _PyUnicode_Name_CAPI hashAPI =
|
||||
{
|
||||
sizeof(_PyUnicode_Name_CAPI),
|
||||
getname,
|
||||
getcode
|
||||
};
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
/* Python bindings */
|
||||
|
||||
static PyObject *
|
||||
unicodedata_name(PyObject* self, PyObject* args)
|
||||
{
|
||||
char name[NAME_MAXLEN];
|
||||
|
||||
PyUnicodeObject* v;
|
||||
PyObject* defobj = NULL;
|
||||
if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
|
||||
return NULL;
|
||||
|
||||
if (PyUnicode_GET_SIZE(v) != 1) {
|
||||
PyErr_SetString(PyExc_TypeError,
|
||||
"need a single Unicode character as parameter");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!getname((Py_UCS4) *PyUnicode_AS_UNICODE(v), name, sizeof(name))) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError, "no such name");
|
||||
return NULL;
|
||||
}
|
||||
else {
|
||||
Py_INCREF(defobj);
|
||||
return defobj;
|
||||
}
|
||||
}
|
||||
|
||||
return Py_BuildValue("s", name);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
unicodedata_lookup(PyObject* self, PyObject* args)
|
||||
{
|
||||
Py_UCS4 code;
|
||||
Py_UNICODE str[1];
|
||||
|
||||
char* name;
|
||||
int namelen;
|
||||
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
|
||||
return NULL;
|
||||
|
||||
if (!getcode(name, namelen, &code)) {
|
||||
PyErr_SetString(PyExc_KeyError, "undefined character name");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
str[0] = (Py_UNICODE) code;
|
||||
return PyUnicode_FromUnicode(str, 1);
|
||||
}
|
||||
|
||||
/* XXX Add doc strings. */
|
||||
|
||||
static PyMethodDef unicodedata_functions[] = {
|
||||
{"decimal", unicodedata_decimal, 1},
|
||||
{"digit", unicodedata_digit, 1},
|
||||
{"numeric", unicodedata_numeric, 1},
|
||||
{"category", unicodedata_category, 1},
|
||||
{"bidirectional", unicodedata_bidirectional, 1},
|
||||
{"combining", unicodedata_combining, 1},
|
||||
{"mirrored", unicodedata_mirrored, 1},
|
||||
{"decomposition", unicodedata_decomposition, 1},
|
||||
{"decimal", unicodedata_decimal, METH_VARARGS},
|
||||
{"digit", unicodedata_digit, METH_VARARGS},
|
||||
{"numeric", unicodedata_numeric, METH_VARARGS},
|
||||
{"category", unicodedata_category, METH_VARARGS},
|
||||
{"bidirectional", unicodedata_bidirectional, METH_VARARGS},
|
||||
{"combining", unicodedata_combining, METH_VARARGS},
|
||||
{"mirrored", unicodedata_mirrored, METH_VARARGS},
|
||||
{"decomposition",unicodedata_decomposition, METH_VARARGS},
|
||||
{"name", unicodedata_name, METH_VARARGS},
|
||||
{"lookup", unicodedata_lookup, METH_VARARGS},
|
||||
{NULL, NULL} /* sentinel */
|
||||
};
|
||||
|
||||
static char *unicodedata_docstring = "unicode character database";
|
||||
|
||||
DL_EXPORT(void)
|
||||
initunicodedata(void)
|
||||
{
|
||||
Py_InitModule("unicodedata", unicodedata_functions);
|
||||
PyObject *m, *d, *v;
|
||||
|
||||
m = Py_InitModule4(
|
||||
"unicodedata", unicodedata_functions,
|
||||
unicodedata_docstring, NULL, PYTHON_API_VERSION);
|
||||
if (!m)
|
||||
return;
|
||||
|
||||
d = PyModule_GetDict(m);
|
||||
if (!d)
|
||||
return;
|
||||
|
||||
/* Export C API */
|
||||
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
||||
PyDict_SetItemString(d, "ucnhash_CAPI", v);
|
||||
Py_XDECREF(v);
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue