Patch #626485: Support Unicode normalization.

2025-11-03 03:22:27 +00:00 · 2002-11-23 22:08:15 +00:00 · 2002-11-23 22:08:15 +00:00 · 677bde2dd1
commit 677bde2dd1
parent 74a530d42d
6 changed files with 1053 additions and 23 deletions
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -30,13 +30,9 @@ typedef struct {
 #include "unicodedata_db.h"

 static const _PyUnicode_DatabaseRecord*
-_getrecord(PyUnicodeObject* v)
+_getrecord_ex(Py_UCS4 code)
 {
-    int code;
    int index;
-
-    code = (int) *PyUnicode_AS_UNICODE(v);
-
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
@ -47,6 +43,12 @@ _getrecord(PyUnicodeObject* v)
    return &_PyUnicode_Database_Records[index];
 }

+static const _PyUnicode_DatabaseRecord*
+_getrecord(PyUnicodeObject* v)
+{
+    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
+}
+
 /* --- Module API --------------------------------------------------------- */

 static PyObject *
@ -253,6 +255,276 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
    return PyString_FromString(decomp);
 }

+void
+get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+{
+    if (code < 0 || code >= 0x110000) {
+        *index = 0;
+    } 
+    else {
+        *index = decomp_index1[(code>>DECOMP_SHIFT)];
+        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
+                               (code&((1<<DECOMP_SHIFT)-1))];
+    }
+	
+    /* high byte is number of hex bytes (usually one or two), low byte
+       is prefix code (from*/
+    *count = decomp_data[*index] >> 8;
+    *prefix = decomp_data[*index] & 255;
+
+    (*index)++;
+}
+
+#define SBase   0xAC00
+#define LBase   0x1100
+#define VBase   0x1161
+#define TBase   0x11A7
+#define LCount  19
+#define VCount  21
+#define TCount  28
+#define NCount  (VCount*TCount)
+#define SCount  (LCount*NCount)
+
+static PyObject*
+nfd_nfkd(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *end, *o;
+    /* Longest decomposition in Unicode 3.2: U+FDFA */
+    Py_UNICODE stack[20]; 
+    int space, stackptr, isize;
+    int index, prefix, count;
+    unsigned char prev, cur;
+	
+    stackptr = 0;
+    isize = PyUnicode_GET_SIZE(input);
+    /* Overallocate atmost 10 characters. */
+    space = (isize > 10 ? 10 : isize) + isize;
+    result = PyUnicode_FromUnicode(NULL, space);
+    if (!result)
+        return NULL;
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + isize;
+    o = PyUnicode_AS_UNICODE(result);
+
+    while (i < end) {
+        stack[stackptr++] = *i++;
+        while(stackptr) {
+            Py_UNICODE code = stack[--stackptr];
+            if (!space) {
+                space = PyString_GET_SIZE(result) + 10;
+                if (PyUnicode_Resize(&result, space) == -1)
+                    return NULL;
+                o = PyUnicode_AS_UNICODE(result) + space - 10;
+                space = 10;
+            }
+            /* Hangul Decomposition. */
+            if (SBase <= code && code < (SBase+SCount)) {
+                int SIndex = code - SBase;
+                int L = LBase + SIndex / NCount;
+                int V = VBase + (SIndex % NCount) / TCount;
+                int T = TBase + SIndex % TCount;
+                *o++ = L;
+                *o++ = V;
+                space -= 2;
+                if (T != TBase) {
+                    *o++ = T;
+                    space --;
+                }
+                continue;
+            }
+            /* Other decompoistions. */
+            get_decomp_record(code, &index, &prefix, &count);
+
+            /* Copy character if it is not decomposable, or has a
+               compatibility decomposition, but we do NFD. */
+            if (!count || (prefix && !k)) {
+                *o++ = code;
+                space--;
+                continue;
+            }
+            /* Copy decomposition onto the stack, in reverse
+               order.  */
+            while(count) {
+                code = decomp_data[index + (--count)];
+                stack[stackptr++] = code;
+            }
+        }
+    }
+
+    /* Drop overallocation. Cannot fail. */
+    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
+
+    /* Sort canonically. */
+    i = PyUnicode_AS_UNICODE(result);
+    prev = _getrecord_ex(*i)->combining;
+    end = i + PyUnicode_GET_SIZE(result);
+    for (i++; i < end; i++) {
+        cur = _getrecord_ex(*i)->combining;
+        if (prev == 0 || cur == 0 || prev <= cur) {
+            prev = cur;
+            continue;
+        }
+        /* Non-canonical order. Need to switch *i with previous. */
+        o = i - 1;
+        while (1) {
+            Py_UNICODE tmp = o[1];
+            o[1] = o[0];
+            o[0] = tmp;
+            o--;
+            if (o < PyUnicode_AS_UNICODE(result))
+                break;
+            prev = _getrecord_ex(*o)->combining;
+            if (prev == 0 || prev <= cur)
+                break;
+        }
+        prev = _getrecord_ex(*i)->combining;
+    }
+    return result;
+}
+
+static int
+find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+{
+    int index;
+    for (index = 0; nfc[index].start; index++) {
+        int start = nfc[index].start;
+        if (code < start)
+            return -1;
+        if (code <= start + nfc[index].count) {
+            int delta = code - start;
+            return nfc[index].index + delta;
+        }
+    }
+    return -1;
+}
+
+static PyObject*
+nfc_nfkc(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *i1, *o, *end;
+    int f,l,index,index1,comb;
+    Py_UNICODE code;
+    Py_UNICODE *skipped[20];
+    int cskipped = 0;
+
+    result = nfd_nfkd(input, k);
+    if (!result)
+        return NULL;
+
+    /* We are going to modify result in-place.
+       If nfd_nfkd is changed to sometimes return the input,
+       this code needs to be reviewed. */
+    assert(result != input);
+
+    i = PyUnicode_AS_UNICODE(result);
+    end = i + PyUnicode_GET_SIZE(result);
+    o = PyUnicode_AS_UNICODE(result);
+	
+  again:
+    while (i < end) {
+      for (index = 0; index < cskipped; index++) {
+          if (skipped[index] == i) {
+              /* *i character is skipped. 
+                 Remove from list. */
+              skipped[index] = skipped[cskipped-1];
+              cskipped--;
+              i++;
+              goto again; // continue while
+          }
+      }
+      /* Hangul Composition. We don't need to check for <LV,T>
+         pairs, since we always have decomposed data. */
+      if (LBase <= *i && *i < (LBase+LCount) &&
+          i + 1 < end && 
+          VBase <= i[1] && i[1] <= (VBase+VCount)) {
+          int LIndex, VIndex;
+          LIndex = i[0] - LBase;
+          VIndex = i[1] - VBase;
+          code = SBase + (LIndex*VCount+VIndex)*TCount;
+          i+=2;
+          if (i < end &&
+              TBase <= *i && *i <= (TBase+TCount)) {
+              code += *i-TBase;
+              i++;
+          }
+          *o++ = code;
+          continue;
+      }
+
+      f = find_nfc_index(nfc_first, *i);
+      if (f == -1) {
+          *o++ = *i++;
+          continue;
+      }
+      /* Find next unblocked character. */
+      i1 = i+1;
+      comb = 0;
+      while (i1 < end) {
+          int comb1 = _getrecord_ex(*i1)->combining;
+          if (comb1 && comb == comb1) {
+              /* Character is blocked. */
+              i1++;
+              continue;
+          }
+          l = find_nfc_index(nfc_last, *i1);
+          /* *i1 cannot be combined with *i. If *i1
+             is a starter, we don't need to look further.
+             Otherwise, record the combining class. */
+          if (l == -1) {
+            not_combinable:
+              if (comb1 == 0)
+                  break;
+              comb = comb1;
+              i1++;
+              continue;
+          }
+          index = f*TOTAL_LAST + l;
+          index1 = comp_index[index >> COMP_SHIFT];
+          code = comp_data[(index1<<COMP_SHIFT)+
+                           (index&((1<<COMP_SHIFT)-1))];
+          if (code == 0)
+              goto not_combinable;
+			
+          /* Replace the original character. */
+          *i = code;
+          /* Mark the second character unused. */
+          skipped[cskipped++] = i1;
+          i1++;
+          f = find_nfc_index(nfc_first, *i);
+          if (f == -1)
+              break;
+      }
+      *o++ = *i++;
+    }
+    if (o != end)
+        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
+    return result;
+}
+		
+static PyObject*
+unicodedata_normalize(PyObject *self, PyObject *args)
+{
+    char *form;
+    PyObject *input;
+
+    if(!PyArg_ParseTuple(args, "sO!:normalized",
+                         &form, &PyUnicode_Type, &input))
+        return NULL;
+
+    if (strcmp(form, "NFC") == 0)
+        return nfc_nfkc(input, 0);
+    if (strcmp(form, "NFKC") == 0)
+        return nfc_nfkc(input, 1);
+    if (strcmp(form, "NFD") == 0)
+        return nfd_nfkd(input, 0);
+    if (strcmp(form, "NFKD") == 0)
+        return nfd_nfkd(input, 1);
+    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+    return NULL;
+}
+
 /* -------------------------------------------------------------------- */
 /* unicode character name tables */

@ -277,16 +549,6 @@ _gethash(const char *s, int len, int scale)
    return h;
 }

-#define SBase   0xAC00
-#define LBase   0x1100
-#define VBase   0x1161
-#define TBase   0x11A7
-#define LCount  19
-#define VCount  21
-#define TCount  28
-#define NCount  (VCount*TCount)
-#define SCount  (LCount*NCount)
-
 static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
@ -594,6 +856,7 @@ static PyMethodDef unicodedata_functions[] = {
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
+    {"normalize", unicodedata_normalize, METH_VARARGS},
    {NULL, NULL}		/* sentinel */
 };

@ -618,5 +881,6 @@ initunicodedata(void)
 /* 
 Local variables:
 c-basic-offset: 4
+indent-tabs-mode: nil
 End:
 */