mirror of
https://github.com/python/cpython.git
synced 2025-07-24 19:54:21 +00:00
Update Unicode database to Unicode 4.1.
This commit is contained in:
parent
e2b4677253
commit
480f1bb67b
12 changed files with 17302 additions and 13365 deletions
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include "Python.h"
|
||||
#include "ucnhash.h"
|
||||
#include "structmember.h"
|
||||
|
||||
/* character properties */
|
||||
|
||||
|
@ -28,6 +29,14 @@ typedef struct {
|
|||
_PyUnicode_EastAsianWidth */
|
||||
} _PyUnicode_DatabaseRecord;
|
||||
|
||||
/* Per-code-point difference between the current database and an older
   Unicode version, as returned by a PreviousDBVersion's getrecord hook.
   As used by the lookups visible in this file:
     - 0xFF in bidir_changed, category_changed or decimal_changed means
       "no change"; the value from the current database is kept.
     - any other value is used in place of the current one (as an index
       into the category / bidirectional name tables, or as the decimal
       value).
     - category_changed == 0 is additionally treated as "code point was
       unassigned in the old version".
   numeric_changed is not read by any code visible in this excerpt.
   NOTE(review): merge_old_version presumably lives in
   Tools/unicode/makeunicodedata.py, which generates the tables - confirm. */
typedef struct change_record {
|
||||
/* sequence of fields should be the same as in merge_old_version */
|
||||
const unsigned char bidir_changed;
|
||||
const unsigned char category_changed;
|
||||
const unsigned char decimal_changed;
|
||||
const int numeric_changed;
|
||||
} change_record;
|
||||
|
||||
/* data file generated by Tools/unicode/makeunicodedata.py */
|
||||
#include "unicodedata_db.h"
|
||||
|
||||
|
@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
|
|||
return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
|
||||
}
|
||||
|
||||
/* ------------- Previous-version API ------------------------------------- */
|
||||
/* Python object that represents an older version of the Unicode database.
   It holds no data of its own, only hooks into generated tables:
     name          - version string; exposed read-only to Python as the
                     "unidata_version" member (see DB_members).  The pointer
                     is stored as given by new_previous_version, not copied.
     getrecord     - returns the change_record for a code point; called
                     through the get_old_record macro.
     normalization - normalization override for a code point; the visible
                     caller (nfd_nfkd) treats a return value of 0 as
                     "no override" and otherwise uses the returned code
                     point instead of the input one. */
typedef struct previous_version {
|
||||
PyObject_HEAD
|
||||
const char *name;
|
||||
const change_record* (*getrecord)(Py_UCS4);
|
||||
Py_UCS4 (*normalization)(Py_UCS4);
|
||||
} PreviousDBVersion;
|
||||
|
||||
/* Fetch the change record for code point v from an old-version database
   object: casts self to PreviousDBVersion* and calls its getrecord hook.
   The caller must have checked that self is non-NULL.  Both parameters are
   parenthesized so that expression arguments (e.g. pointer arithmetic) are
   evaluated before the cast and the call are applied. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)((v)))
|
||||
|
||||
/* Forward declaration: the type object below refers to this method table
   in its tp_methods slot, but the table itself is defined further down in
   the file.
   NOTE(review): a static tentative definition with an incomplete array
   type is not strictly conforming C (C99 6.9.2p3) - confirm that every
   supported compiler accepts it. */
|
||||
static PyMethodDef unicodedata_functions[];
|
||||
|
||||
/* Member table for the old-version database type: exposes the `name`
   field of PreviousDBVersion to Python as the read-only string attribute
   "unidata_version".  The {NULL} entry terminates the table. */
static PyMemberDef DB_members[] = {
|
||||
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
|
||||
{NULL}
|
||||
};
|
||||
|
||||
/* Type object for old-version database instances ("unicodedata.DB").
   Slots are positional, so their order must not change.  The non-zero
   slots are:
     tp_basicsize - sizeof(PreviousDBVersion)
     tp_dealloc   - PyObject_Del (instances own no other resources here)
     tp_getattro  - PyObject_GenericGetAttr, so attribute lookup goes
                    through tp_methods / tp_members
     tp_methods   - unicodedata_functions, the same table used for the
                    module-level functions; when called as methods they
                    receive the instance as `self`
     tp_members   - DB_members (the "unidata_version" attribute)
   tp_new is 0; the only constructor visible in this file is
   new_previous_version().
   NOTE(review): the ob_type assignment required by the comment below is
   not visible in this excerpt - confirm it is done in the module init
   function. */
static PyTypeObject Xxo_Type = {
|
||||
/* The ob_type field must be initialized in the module init function
|
||||
* to be portable to Windows without using C++. */
|
||||
PyObject_HEAD_INIT(NULL)
|
||||
0, /*ob_size*/
|
||||
"unicodedata.DB", /*tp_name*/
|
||||
sizeof(PreviousDBVersion), /*tp_basicsize*/
|
||||
0, /*tp_itemsize*/
|
||||
/* methods */
|
||||
(destructor)PyObject_Del, /*tp_dealloc*/
|
||||
0, /*tp_print*/
|
||||
0, /*tp_getattr*/
|
||||
0, /*tp_setattr*/
|
||||
0, /*tp_compare*/
|
||||
0, /*tp_repr*/
|
||||
0, /*tp_as_number*/
|
||||
0, /*tp_as_sequence*/
|
||||
0, /*tp_as_mapping*/
|
||||
0, /*tp_hash*/
|
||||
0, /*tp_call*/
|
||||
0, /*tp_str*/
|
||||
PyObject_GenericGetAttr,/*tp_getattro*/
|
||||
0, /*tp_setattro*/
|
||||
0, /*tp_as_buffer*/
|
||||
Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
||||
0, /*tp_doc*/
|
||||
0, /*tp_traverse*/
|
||||
0, /*tp_clear*/
|
||||
0, /*tp_richcompare*/
|
||||
0, /*tp_weaklistoffset*/
|
||||
0, /*tp_iter*/
|
||||
0, /*tp_iternext*/
|
||||
unicodedata_functions, /*tp_methods*/
|
||||
DB_members, /*tp_members*/
|
||||
0, /*tp_getset*/
|
||||
0, /*tp_base*/
|
||||
0, /*tp_dict*/
|
||||
0, /*tp_descr_get*/
|
||||
0, /*tp_descr_set*/
|
||||
0, /*tp_dictoffset*/
|
||||
0, /*tp_init*/
|
||||
0, /*tp_alloc*/
|
||||
0, /*tp_new*/
|
||||
0, /*tp_free*/
|
||||
0, /*tp_is_gc*/
|
||||
};
|
||||
|
||||
/* Create an old-version database object.
     name          - version string; the pointer is stored as-is (not
                     copied), so it must outlive the object.  The visible
                     caller passes a string literal.
     getrecord     - hook returning the change_record for a code point.
     normalization - hook returning a normalization override for a code
                     point.
   Returns the new PreviousDBVersion cast to PyObject*, or NULL if
   PyObject_New fails. */
static PyObject*
|
||||
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
|
||||
Py_UCS4 (*normalization)(Py_UCS4))
|
||||
{
|
||||
PreviousDBVersion *self;
|
||||
self = PyObject_New(PreviousDBVersion, &Xxo_Type);
|
||||
if (self == NULL)
|
||||
return NULL;
|
||||
self->name = name;
|
||||
self->getrecord = getrecord;
|
||||
self->normalization = normalization;
|
||||
return (PyObject*)self;
|
||||
}
|
||||
|
||||
/* --- Module API --------------------------------------------------------- */
|
||||
|
||||
PyDoc_STRVAR(unicodedata_decimal__doc__,
|
||||
|
@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
|
|||
{
|
||||
PyUnicodeObject *v;
|
||||
PyObject *defobj = NULL;
|
||||
int have_old = 0;
|
||||
long rc;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
|
||||
|
@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
|
|||
"need a single Unicode character as parameter");
|
||||
return NULL;
|
||||
}
|
||||
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
|
||||
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0) {
|
||||
/* unassigned */
|
||||
have_old = 1;
|
||||
rc = -1;
|
||||
}
|
||||
else if (old->decimal_changed != 0xFF) {
|
||||
have_old = 1;
|
||||
rc = old->decimal_changed;
|
||||
}
|
||||
}
|
||||
|
||||
if (!have_old)
|
||||
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
|
||||
if (rc < 0) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
|
@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
|||
{
|
||||
PyUnicodeObject *v;
|
||||
PyObject *defobj = NULL;
|
||||
int have_old = 0;
|
||||
double rc;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
|
||||
|
@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
|||
"need a single Unicode character as parameter");
|
||||
return NULL;
|
||||
}
|
||||
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
|
||||
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0) {
|
||||
/* unassigned */
|
||||
have_old = 1;
|
||||
rc = -1;
|
||||
}
|
||||
else if (old->decimal_changed != 0xFF) {
|
||||
have_old = 1;
|
||||
rc = old->decimal_changed;
|
||||
}
|
||||
}
|
||||
|
||||
if (!have_old)
|
||||
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
|
||||
if (rc < 0) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError, "not a numeric character");
|
||||
|
@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
|
|||
return NULL;
|
||||
}
|
||||
index = (int) _getrecord(v)->category;
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed != 0xFF)
|
||||
index = old->category_changed;
|
||||
}
|
||||
return PyString_FromString(_PyUnicode_CategoryNames[index]);
|
||||
}
|
||||
|
||||
|
@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
|
|||
return NULL;
|
||||
}
|
||||
index = (int) _getrecord(v)->bidirectional;
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0)
|
||||
index = 0; /* unassigned */
|
||||
else if (old->bidir_changed != 0xFF)
|
||||
index = old->bidir_changed;
|
||||
}
|
||||
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
|
||||
}
|
||||
|
||||
|
@ -219,6 +351,7 @@ static PyObject *
|
|||
unicodedata_combining(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyUnicodeObject *v;
|
||||
int index;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!:combining",
|
||||
&PyUnicode_Type, &v))
|
||||
|
@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
|
|||
"need a single Unicode character as parameter");
|
||||
return NULL;
|
||||
}
|
||||
return PyInt_FromLong((int) _getrecord(v)->combining);
|
||||
index = (int) _getrecord(v)->combining;
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0)
|
||||
index = 0; /* unassigned */
|
||||
}
|
||||
return PyInt_FromLong(index);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_mirrored__doc__,
|
||||
|
@ -242,6 +381,7 @@ static PyObject *
|
|||
unicodedata_mirrored(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyUnicodeObject *v;
|
||||
int index;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O!:mirrored",
|
||||
&PyUnicode_Type, &v))
|
||||
|
@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
|
|||
"need a single Unicode character as parameter");
|
||||
return NULL;
|
||||
}
|
||||
return PyInt_FromLong((int) _getrecord(v)->mirrored);
|
||||
index = (int) _getrecord(v)->mirrored;
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0)
|
||||
index = 0; /* unassigned */
|
||||
}
|
||||
return PyInt_FromLong(index);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
|
||||
|
@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
|
|||
return NULL;
|
||||
}
|
||||
index = (int) _getrecord(v)->east_asian_width;
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0)
|
||||
index = 0; /* unassigned */
|
||||
}
|
||||
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
|
||||
}
|
||||
|
||||
|
@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
|
|||
|
||||
code = (int) *PyUnicode_AS_UNICODE(v);
|
||||
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||
if (old->category_changed == 0)
|
||||
return PyString_FromString(""); /* unassigned */
|
||||
}
|
||||
|
||||
if (code < 0 || code >= 0x110000)
|
||||
index = 0;
|
||||
else {
|
||||
|
@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
|
|||
}
|
||||
|
||||
void
|
||||
get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
|
||||
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
|
||||
{
|
||||
if (code >= 0x110000) {
|
||||
*index = 0;
|
||||
}
|
||||
} else if (self && get_old_record(self, code)->category_changed==0) {
|
||||
/* unassigned in old version */
|
||||
*index = 0;
|
||||
}
|
||||
else {
|
||||
*index = decomp_index1[(code>>DECOMP_SHIFT)];
|
||||
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
|
||||
|
@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
|
|||
#define SCount (LCount*NCount)
|
||||
|
||||
static PyObject*
|
||||
nfd_nfkd(PyObject *input, int k)
|
||||
nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||
{
|
||||
PyObject *result;
|
||||
Py_UNICODE *i, *end, *o;
|
||||
|
@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
|
|||
}
|
||||
continue;
|
||||
}
|
||||
/* Other decompoistions. */
|
||||
get_decomp_record(code, &index, &prefix, &count);
|
||||
/* normalization changes */
|
||||
if (self) {
|
||||
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
|
||||
if (value != 0) {
|
||||
stack[stackptr++] = value;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Other decompositions. */
|
||||
get_decomp_record(self, code, &index, &prefix, &count);
|
||||
|
||||
/* Copy character if it is not decomposable, or has a
|
||||
compatibility decomposition, but we do NFD. */
|
||||
|
@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
|
|||
}
|
||||
|
||||
static int
|
||||
find_nfc_index(struct reindex* nfc, Py_UNICODE code)
|
||||
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
|
||||
{
|
||||
int index;
|
||||
for (index = 0; nfc[index].start; index++) {
|
||||
|
@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
|
|||
}
|
||||
|
||||
static PyObject*
|
||||
nfc_nfkc(PyObject *input, int k)
|
||||
nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||
{
|
||||
PyObject *result;
|
||||
Py_UNICODE *i, *i1, *o, *end;
|
||||
|
@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
|
|||
Py_UNICODE *skipped[20];
|
||||
int cskipped = 0;
|
||||
|
||||
result = nfd_nfkd(input, k);
|
||||
result = nfd_nfkd(self, input, k);
|
||||
if (!result)
|
||||
return NULL;
|
||||
|
||||
|
@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
|
|||
continue;
|
||||
}
|
||||
|
||||
f = find_nfc_index(nfc_first, *i);
|
||||
f = find_nfc_index(self, nfc_first, *i);
|
||||
if (f == -1) {
|
||||
*o++ = *i++;
|
||||
continue;
|
||||
|
@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
|
|||
i1++;
|
||||
continue;
|
||||
}
|
||||
l = find_nfc_index(nfc_last, *i1);
|
||||
l = find_nfc_index(self, nfc_last, *i1);
|
||||
/* *i1 cannot be combined with *i. If *i1
|
||||
is a starter, we don't need to look further.
|
||||
Otherwise, record the combining class. */
|
||||
|
@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
|
|||
/* Mark the second character unused. */
|
||||
skipped[cskipped++] = i1;
|
||||
i1++;
|
||||
f = find_nfc_index(nfc_first, *i);
|
||||
f = find_nfc_index(self, nfc_first, *i);
|
||||
if (f == -1)
|
||||
break;
|
||||
}
|
||||
|
@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
|
|||
}
|
||||
|
||||
if (strcmp(form, "NFC") == 0)
|
||||
return nfc_nfkc(input, 0);
|
||||
return nfc_nfkc(self, input, 0);
|
||||
if (strcmp(form, "NFKC") == 0)
|
||||
return nfc_nfkc(input, 1);
|
||||
return nfc_nfkc(self, input, 1);
|
||||
if (strcmp(form, "NFD") == 0)
|
||||
return nfd_nfkd(input, 0);
|
||||
return nfd_nfkd(self, input, 0);
|
||||
if (strcmp(form, "NFKD") == 0)
|
||||
return nfd_nfkd(input, 1);
|
||||
return nfd_nfkd(self, input, 1);
|
||||
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
||||
return NULL;
|
||||
}
|
||||
|
@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
|
|||
}
|
||||
|
||||
static int
|
||||
_getucname(Py_UCS4 code, char* buffer, int buflen)
|
||||
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
|
||||
{
|
||||
int offset;
|
||||
int i;
|
||||
|
@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
|
|||
if (code >= 0x110000)
|
||||
return 0;
|
||||
|
||||
if (self) {
|
||||
const change_record *old = get_old_record(self, code);
|
||||
if (old->category_changed == 0) {
|
||||
/* unassigned */
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* get offset into phrasebook */
|
||||
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
||||
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
||||
|
@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
|
|||
}
|
||||
|
||||
static int
|
||||
_cmpname(int code, const char* name, int namelen)
|
||||
_cmpname(PyObject *self, int code, const char* name, int namelen)
|
||||
{
|
||||
/* check if code corresponds to the given name */
|
||||
int i;
|
||||
char buffer[NAME_MAXLEN];
|
||||
if (!_getucname(code, buffer, sizeof(buffer)))
|
||||
if (!_getucname(self, code, buffer, sizeof(buffer)))
|
||||
return 0;
|
||||
for (i = 0; i < namelen; i++) {
|
||||
if (toupper(name[i]) != buffer[i])
|
||||
|
@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
|
|||
}
|
||||
|
||||
static int
|
||||
_getcode(const char* name, int namelen, Py_UCS4* code)
|
||||
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
|
||||
{
|
||||
unsigned int h, v;
|
||||
unsigned int mask = code_size-1;
|
||||
|
@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
|||
v = code_hash[i];
|
||||
if (!v)
|
||||
return 0;
|
||||
if (_cmpname(v, name, namelen)) {
|
||||
if (_cmpname(self, v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
|
@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
|||
v = code_hash[i];
|
||||
if (!v)
|
||||
return 0;
|
||||
if (_cmpname(v, name, namelen)) {
|
||||
if (_cmpname(self, v, name, namelen)) {
|
||||
*code = v;
|
||||
return 1;
|
||||
}
|
||||
|
@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
|
||||
name, sizeof(name))) {
|
||||
if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
|
||||
name, sizeof(name))) {
|
||||
if (defobj == NULL) {
|
||||
PyErr_SetString(PyExc_ValueError, "no such name");
|
||||
return NULL;
|
||||
|
@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
|
|||
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
|
||||
return NULL;
|
||||
|
||||
if (!_getcode(name, namelen, &code)) {
|
||||
if (!_getcode(self, name, namelen, &code)) {
|
||||
char fmt[] = "undefined character name '%s'";
|
||||
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
|
||||
sprintf(buf, fmt, name);
|
||||
|
@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
|
|||
{NULL, NULL} /* sentinel */
|
||||
};
|
||||
|
||||
|
||||
|
||||
PyDoc_STRVAR(unicodedata_docstring,
|
||||
"This module provides access to the Unicode Character Database which\n\
|
||||
defines character properties for all Unicode characters. The data in\n\
|
||||
|
@ -1007,6 +1187,11 @@ initunicodedata(void)
|
|||
|
||||
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
|
||||
|
||||
/* Previous versions */
|
||||
v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
|
||||
if (v != NULL)
|
||||
PyModule_AddObject(m, "db_3_2_0", v);
|
||||
|
||||
/* Export C API */
|
||||
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
||||
if (v != NULL)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue