Marc-Andre Lemburg <mal@lemburg.com>:

Patch to the standard unicode-escape codec that makes it dynamically load the Unicode name-to-ordinal mapping from the ucnhash module. By Bill Tutt.
2025-09-10 02:36:56 +00:00 · 2000-06-28 16:43:35 +00:00 · 2000-06-28 16:43:35 +00:00 · 0f774e3987
commit 0f774e3987
parent 2dabf69f5c
1 changed files with 121 additions and 0 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -66,6 +66,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
 #include "mymath.h"
 #include "unicodeobject.h"
 #include <ucnhash.h>
 #if defined(HAVE_LIMITS_H)
 #include <limits.h>
@ -1020,6 +1021,28 @@ int unicodeescape_decoding_error(const char **source,
    }
 }
 static _Py_UCNHashAPI *pucnHash = NULL;
 /* Case-insensitive comparison of at most `count` characters of s1 and s2.
  *
  * Characters are folded with tolower() before being compared.  The
  * comparison stops at the first differing pair, after `count` pairs,
  * or once a NUL terminator has been compared equal in both strings,
  * whichever comes first, so neither string is read past its terminator.
  *
  * Returns 0 if the compared ranges are equal (or count is 0), otherwise
  * the difference between the first pair of differing folded characters,
  * taken as unsigned char values.
  */
 static
 int mystrnicmp(const char *s1, const char *s2, size_t count)
 {
     /* Held as int: tolower() returns int, and keeping the folded values
      * unnarrowed makes the result independent of char signedness. */
     int c1, c2;

     if (count == 0)
         return 0;

     do
     {
         /* <ctype.h> functions require an argument representable as
          * unsigned char (or EOF); a negative plain char is undefined. */
         c1 = tolower((unsigned char)*(s1++));
         c2 = tolower((unsigned char)*(s2++));
     }
     while (--count && c1 == c2 && c1 != '\0');

     return c1 - c2;
 }
 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
 					int size,
 					const char *errors)
@ -1123,6 +1146,104 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
            *p++ = x;
            break;
        case 'N':
            /* Ok, we need to deal with Unicode Character Names now,
             * make sure we've imported the hash table data...
             */
            if (pucnHash == NULL)
            {
                PyObject *mod = 0, *v = 0;
                mod = PyImport_ImportModule("ucnhash");
                if (mod == NULL)
                    goto onError;
                v = PyObject_GetAttrString(mod,"ucnhashAPI");
                Py_DECREF(mod);
                if (v == NULL)
                {
                    goto onError;
                }
                pucnHash = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
                if (pucnHash == NULL)
                {
                    goto onError;
                }
            }
            if (*s == '{')
            {
                const char *start = s + 1;
                const char *endBrace = start;
                unsigned int uiValue;
                unsigned long j;
                 /* Scan for the closing brace, stopping early once we
                  * exceed the maximum length of a Unicode character name
                  * or reach the end of the input.
                  */
                while (*endBrace != '}' &&
                       (unsigned int)(endBrace - start) <=
                           pucnHash->cchMax &&
                       endBrace < end)
                {
                    endBrace++;
                }
                if (endBrace != end && *endBrace == '}')
                {
                    j = pucnHash->hash(start, endBrace - start);
                    if (j > pucnHash->cKeys ||
                        mystrnicmp(
                            start,
                            ((_Py_UnicodeCharacterName *) 
                             (pucnHash->getValue(j)))->pszUCN,
                            (int)(endBrace - start)) != 0)
                    {
                        if (unicodeescape_decoding_error(
                                &s, &x, errors,
                                "Invalid Unicode Character Name"))
                        {
                            goto onError;
                        }
                        goto ucnFallthrough;
                    }
                    uiValue = ((_Py_UnicodeCharacterName *)
                               (pucnHash->getValue(j)))->uiValue;
                    if (uiValue < 1<<16)
                    {
                        /* In UCS-2 range, easy solution.. */
                        *p++ = uiValue;
                    }
                    else
                    {
                         /* Oops, it's in UCS-4 space, */
                        /*  compute and append the two surrogates: */
                        /*  translate from 10000..10FFFF to 0..FFFFF */
                        uiValue -= 0x10000;
                        /* high surrogate = top 10 bits added to D800 */
                        *p++ = 0xD800 + (uiValue >> 10);
                        /* low surrogate  = bottom 10 bits added to DC00 */
                        *p++ = 0xDC00 + (uiValue & ~0xFC00);
                    }
                    s = endBrace + 1;
                }
                else
                {
                    if (unicodeescape_decoding_error(
                            &s, &x, errors,
                            "Unicode name missing closing brace"))
                        goto onError;
                    goto ucnFallthrough;
                }
                break;                
            }
            if (unicodeescape_decoding_error(
                    &s, &x, errors,
                    "Missing opening brace for Unicode Character Name escape"))
                goto onError;
 ucnFallthrough:
            /* fall through on purpose */
        default:
            *p++ = '\\';
            *p++ = (unsigned char)s[-1];