Refactored the unicodeobject/ucnhash interface to hide the implementation details inside the ucnhash module.

Also cleaned up the unicode copyright blurb a little; Secret Labs' internal revision history isn't that interesting...
2025-09-27 02:39:58 +00:00 · 2001-01-19 09:45:02 +00:00 · 2001-01-19 09:45:02 +00:00 · 0fdb90cafe
commit 0fdb90cafe
parent a2bf2709b3
4 changed files with 144 additions and 208 deletions
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@ -1,20 +1,29 @@
 /* Unicode name database interface */
-#include "Python.h"
+#ifndef Py_UCNHASH_H
-#include <stdlib.h>
+#define Py_UCNHASH_H
 #ifdef __cplusplus
 extern "C" {
 #endif
-/* --- C API ----------------------------------------------------*/
+/* revised ucnhash CAPI interface (exported through a PyCObject) */
 /* C API for usage by other Python modules */
 /* Table of constants and entry points describing the Unicode character
    name hash.  The \N{...} decoder in unicodeobject.c reaches all four
    members through a pointer to this struct. */
 typedef struct _Py_UCNHashAPI
 {
    /* Bound that the decoder checks a hash() result against before
       passing it to getValue(). */
    unsigned long cKeys;
    /* Longest name length the decoder scans for while looking for the
       closing brace of a \N{...} escape. */
    unsigned long cchMax;
    /* Hash the first cch characters of key; the decoder uses the result
       as the iKey argument of getValue(). */
    unsigned long (*hash)(const char *key, unsigned int cch);
    /* Return the table entry for iKey.  The decoder casts the result to
       _Py_UnicodeCharacterName * before reading it. */
    const void *(*getValue)(unsigned long iKey);
 } _Py_UCNHashAPI;
-typedef struct 
+typedef struct {
 {
    const char *pszUCN;
    Py_UCS4 value;
 } _Py_UnicodeCharacterName;
    /* Size of this struct */
    int size;
    /* Get name for a given character code.  Returns non-zero if
       success, zero if not.  Does not set Python exceptions. */
    int (*getname)(Py_UCS4 code, char* buffer, int buflen);
    /* Get character code for a given name.  Same error handling
       as for getname. */
    int (*getcode)(const char* name, int namelen, Py_UCS4* code);
 } _PyUnicode_Name_CAPI;
 #ifdef __cplusplus
 }
 #endif
 #endif /* !Py_UCNHASH_H */
--- a/Lib/test/test_ucn.py
+++ b/Lib/test/test_ucn.py
@ -50,16 +50,6 @@ print "done."
 # strict error testing:
 print "Testing unicode character name expansion strict error handling....",
 k_cchMaxUnicodeName = 83
 s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
 try:
    unicode(s, 'unicode-escape', 'strict')
 except UnicodeError:
    pass
 else:
    raise AssertionError, "failed to raise an exception when presented " \
                          "with a UCN > k_cchMaxUnicodeName"
 try:
    unicode("\N{blah}", 'unicode-escape', 'strict')
 except UnicodeError:
@ -67,6 +57,14 @@ except UnicodeError:
 else:
    raise AssertionError, "failed to raise an exception when given a bogus character name"
 try:
    unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
 except UnicodeError:
    pass
 else:
    raise AssertionError, "failed to raise an exception when given a very " \
                          "long bogus character name"
 try:
    unicode("\N{SPACE", 'unicode-escape', 'strict')
 except UnicodeError:
--- a/Modules/ucnhash.c
+++ b/Modules/ucnhash.c
@ -1,5 +1,13 @@
 #include "Python.h"
 #include "ucnhash.h"
 /* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
 /* One entry of the character name table: a name paired with the
    character code that ucnhash_getcode() reports for it. */
 typedef struct {
    /* Character name; compared case-insensitively against the name
       being looked up. */
    const char* pszUCN;
    /* Character code stored into *code on a successful lookup. */
    Py_UCS4 value;
 }_Py_UnicodeCharacterName;   
 /*
 * The hash is produced using the algorithm described in
 * "Optimal algorithms for minimal perfect hashing",
@ -14,11 +22,11 @@
 * Generated on: Fri Jul 14 08:00:58 2000
 */
 #define cKeys 10538
 #define k_cHashElements 18836
 #define k_cchMaxKey  83
 #define k_cKeys  10538
 staticforward const unsigned short G[k_cHashElements]; 
 staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];   
@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
    while (--len >= 0)
    {   
        /* (1000003 * x) ^ toupper(*(p++)) 
-         * translated to handle > 32 bit longs 
+         * translated to handle > 32 bit longs */
         */
        x = (0xf4243 * x);
        x = x & 0xFFFFFFFF;
        x = x ^ toupper(*(p++));
@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
 }
-static unsigned long hash(const char *key, unsigned int cch)
+static unsigned long
 hash(const char *key, unsigned int cch)
 {
    return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
 }
-const void *getValue(unsigned long iKey)
+const _Py_UnicodeCharacterName *
 getValue(unsigned long iKey)
 {
-    return &aucn[iKey];
+    return (_Py_UnicodeCharacterName *) &aucn[iKey];
 }
-/* Helper for adding objects to dictionaries. Check for errors with
+static int
-   PyErr_Occurred() */
+mystrnicmp(const char *s1, const char *s2, size_t count)
 static 
 void insobj(PyObject *dict,
     char *name,
     PyObject *v)
 {
-    PyDict_SetItemString(dict, name, v);
+    char c1, c2;
-    Py_XDECREF(v);
+    
    if (count) {
        do {
           c1 = tolower(*(s1++));
           c2 = tolower(*(s2++));
        } while (--count && c1 == c2);
        return c1 - c2;
    }
    return 0;
 }
-static const _Py_UCNHashAPI hashAPI = 
+/* bindings for the new API */
 static int
 ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
 {
-    k_cKeys,
+    return 0;
-    k_cchMaxKey,
+}
-    &hash,
+
-    &getValue,
+static int
 ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
 {
    /* Look up the character code for a Unicode character name.
     *
     * name    - the name to look up; it need not be NUL-terminated
     * namelen - number of characters of name to use
     * code    - receives the character code on success
     *
     * Returns 1 and stores the code in *code if the name is known,
     * 0 otherwise (*code is then left untouched).  Does not set a
     * Python exception.
     */
    const _Py_UnicodeCharacterName *entry;
    unsigned long j;

    /* An empty (or negative-length) name can never be valid.  Without
       this check a zero-length compare below would trivially succeed. */
    if (namelen <= 0)
        return 0;

    j = hash(name, namelen);

    /* The name table holds exactly cKeys entries, so cKeys itself is
       already one past the end: reject it as well. */
    if (j >= cKeys)
        return 0;

    entry = getValue(j);
    if (mystrnicmp(name, entry->pszUCN, namelen) != 0)
        return 0;

    /* mystrnicmp() only looks at the first namelen characters, so make
       sure the table name ends here too; otherwise a mere prefix of a
       real name would be accepted. */
    if (entry->pszUCN[namelen] != '\0')
        return 0;

    *code = entry->value;
    return 1;
 }
 /* The C API table handed out by this module.  initucnhash() wraps its
    address in a PyCObject and stores it in the module dictionary as
    "Unicode_Names_CAPI". */
 static const _PyUnicode_Name_CAPI hashAPI = 
 {
    /* size: size of the struct itself */
    sizeof(_PyUnicode_Name_CAPI),
    /* getname: character code -> name */
    ucnhash_getname,
    /* getcode: name -> character code */
    ucnhash_getcode
 };
 static  
-PyMethodDef Module_methods[] =
+PyMethodDef ucnhash_methods[] =
 {   
    {NULL, NULL},
 };
-static char *Module_docstring = "ucnhash hash function module";
+static char *ucnhash_docstring = "ucnhash hash function module";
 /* Error reporting for module init functions */
 /* Replace the currently pending exception with an ImportError that
    names the module whose initialization failed.

    The pending exception is fetched and its type and value are passed
    through PyObject_Str().  If both results are string objects, the
    ImportError message includes them as "(type:value)"; otherwise a
    generic "initialization of module ... failed" message is used.  All
    references obtained along the way are released before the macro
    ends.

    modname is pasted between string literals, so it must itself be a
    string literal.  The macro expands to a bare { ... } block (not a
    do { } while (0) statement), so take care when using it in front of
    an else branch. */
 #define Py_ReportModuleInitError(modname) {			\
    PyObject *exc_type, *exc_value, *exc_tb;			\
    PyObject *str_type, *str_value;				\
 								\
    /* Fetch error objects and convert them to strings */	\
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);		\
    if (exc_type && exc_value) {				\
 	    str_type = PyObject_Str(exc_type);			\
 	    str_value = PyObject_Str(exc_value);			\
    }								\
    else {							\
 	   str_type = NULL;					\
 	   str_value = NULL;					\
    }								\
    /* Try to format a more informative error message using the	\
       original error */					\
    if (str_type && str_value &&				\
 	    PyString_Check(str_type) && PyString_Check(str_value))	\
 	    PyErr_Format(						\
   		    PyExc_ImportError,				\
 		    "initialization of module "modname" failed "	\
 		    "(%s:%s)",					\
 		PyString_AS_STRING(str_type),			\
 		PyString_AS_STRING(str_value));			\
    else							\
 	    PyErr_SetString(					\
 		    PyExc_ImportError,				\
 		    "initialization of module "modname" failed");	\
    Py_XDECREF(str_type);					\
    Py_XDECREF(str_value);					\
    Py_XDECREF(exc_type);					\
    Py_XDECREF(exc_value);					\
    Py_XDECREF(exc_tb);						\
 }
 /* Create PyMethodObjects and register them in the module's dict */
 DL_EXPORT(void) 
 initucnhash(void)
 {
-    PyObject *module, *moddict;
+    PyObject *m, *d, *v;
-    /* Create module */
+
-    module = Py_InitModule4("ucnhash", /* Module name */
+    m = Py_InitModule4(
-             Module_methods, /* Method list */
+        "ucnhash", /* Module name */
-             Module_docstring, /* Module doc-string */
+        ucnhash_methods, /* Method list */
-             (PyObject *)NULL, /* always pass this as *self */
+        ucnhash_docstring, /* Module doc-string */
-             PYTHON_API_VERSION); /* API Version */
+        (PyObject *)NULL, /* always pass this as *self */
-    if (module == NULL)
+        PYTHON_API_VERSION); /* API Version */
-        goto onError;
+    if (!m)
-    /* Add some constants to the module's dict */
+        return;
-    moddict = PyModule_GetDict(module);
+
-    if (moddict == NULL)
+    d = PyModule_GetDict(m);
-        goto onError;
+    if (!d)
        return;
    /* Export C API */
-    insobj(
+    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
-        moddict,
+    PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
-        "ucnhashAPI",
+    Py_XDECREF(v);
        PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
 onError:
    /* Check for errors and report them */
    if (PyErr_Occurred())
        Py_ReportModuleInitError("ucnhash");
    return;
 }
 static const unsigned short G[] = 
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
 Copyright (c) Corporation for National Research Initiatives.
 --------------------------------------------------------------------
 The original string type implementation is:
- Original header:
+    Copyright (c) 1999 by Secret Labs AB
- --------------------------------------------------------------------
+    Copyright (c) 1999 by Fredrik Lundh
- * Yet another Unicode string type for Python.  This type supports the
+By obtaining, using, and/or copying this software and/or its
- * 16-bit Basic Multilingual Plane (BMP) only.
+associated documentation, you agree that you have read, understood,
- *
+and will comply with the following terms and conditions:
- * Note that this string class supports embedded NULL characters.  End
+
- * of string is given by the length attribute.  However, the internal
+Permission to use, copy, modify, and distribute this software and its
- * representation always stores a trailing NULL to make it easier to
+associated documentation for any purpose and without fee is hereby
- * use unicode strings with standard APIs.
+granted, provided that the above copyright notice appears in all
- *
+copies, and that both that copyright notice and this permission notice
- * History:
+appear in supporting documentation, and that the name of Secret Labs
- * 1999-01-23 fl  Created
+AB or the author not be used in advertising or publicity pertaining to
- * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
+distribution of the software without specific, written prior
- * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
+permission.
- * 1999-03-06 fl  Moved declarations to separate file, etc.
+
- * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
- * 1999-08-10 fl  Some minor tweaks
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
- *
+FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
- * Written by Fredrik Lundh, January 1999.
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- *
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * Copyright (c) 1999 by Secret Labs AB.
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
- * Copyright (c) 1999 by Fredrik Lundh.
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- *
+--------------------------------------------------------------------
- * fredrik@pythonware.com
+
- * http://www.pythonware.com
+*/
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
 * 
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
 * 
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 * 
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 * 
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */
 #include "Python.h"
@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
    }
 }
-static _Py_UCNHashAPI *pucnHash = NULL;
+static _PyUnicode_Name_CAPI *unicode_names = NULL;
 /* Case-insensitive comparison of the first count characters of s1 and
  * s2.
  *
  * Returns 0 if the two ranges are equal ignoring case (or if count is
  * 0); otherwise returns the difference between the first pair of
  * lower-cased characters that differ.
  *
  * Note that the comparison does NOT stop at a NUL character: both
  * buffers must have at least count readable characters, or differ
  * before either one ends.
  */
 static
 int mystrnicmp(const char *s1, const char *s2, size_t count)
 {
    int c1 = 0, c2 = 0;

    while (count-- > 0) {
        /* tolower() is only defined for values representable as
           unsigned char (or EOF); passing a plain char that happens
           to be negative is undefined behaviour. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
        if (c1 != c2)
            break;
    }
    return c1 - c2;
 }
 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
 					int size,
@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
            /* Ok, we need to deal with Unicode Character Names now,
             * make sure we've imported the hash table data...
             */
-            if (pucnHash == NULL) {
+            if (unicode_names == NULL) {
                PyObject *mod = 0, *v = 0;
                mod = PyImport_ImportModule("ucnhash");
                if (mod == NULL)
                    goto onError;
-                v = PyObject_GetAttrString(mod,"ucnhashAPI");
+                v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
                Py_DECREF(mod);
                if (v == NULL)
                    goto onError;
-                pucnHash = PyCObject_AsVoidPtr(v);
+                unicode_names = PyCObject_AsVoidPtr(v);
                Py_DECREF(v);
-                if (pucnHash == NULL)
+                if (unicode_names == NULL)
                    goto onError;
            }
            if (*s == '{') {
                const char *start = s + 1;
                const char *endBrace = start;
                unsigned long j;
-                /* look for either the closing brace, or we
+                /* look for the closing brace */
-                 * exceed the maximum length of the unicode character names
+                while (*endBrace != '}' && endBrace < end)
                 */
                while (*endBrace != '}' &&
                       (unsigned int)(endBrace - start) <=
                           pucnHash->cchMax &&
                       endBrace < end)
                {
                    endBrace++;
                }
                if (endBrace != end && *endBrace == '}') {
-                    j = pucnHash->hash(start, endBrace - start);
+                    if (!unicode_names->getcode(start, endBrace-start, &chr)) {
                    if (j > pucnHash->cKeys ||
                        mystrnicmp(
                            start,
                            ((_Py_UnicodeCharacterName *) 
                             (pucnHash->getValue(j)))->pszUCN,
                            (int)(endBrace - start)) != 0)
                    {
                        if (unicodeescape_decoding_error(
                                &s, &x, errors,
-                                "Invalid Unicode Character Name"))
+                                "Invalid Unicode Character Name")
-                        {
+                            )
                            goto onError;
                        }
                        goto ucnFallthrough;
                    }
                    chr = ((_Py_UnicodeCharacterName *)
                           (pucnHash->getValue(j)))->value;
                    s = endBrace + 1;
                    goto store;
                } else {