Refactored the unicodeobject/ucnhash interface to hide the
implementation details inside the ucnhash module.

also cleaned up the unicode copyright blurb a little; Secret Labs'
internal revision history isn't that interesting...
This commit is contained in:
Fredrik Lundh 2001-01-19 09:45:02 +00:00
parent a2bf2709b3
commit 0fdb90cafe
4 changed files with 144 additions and 208 deletions

View file

@ -1,20 +1,29 @@
/* Unicode name database interface */
#include "Python.h" #ifndef Py_UCNHASH_H
#include <stdlib.h> #define Py_UCNHASH_H
#ifdef __cplusplus
extern "C" {
#endif
/* --- C API ----------------------------------------------------*/ /* revised ucnhash CAPI interface (exported through a PyCObject) */
/* C API for usage by other Python modules.

   This interface exposes the raw hash function and table accessor;
   the caller is responsible for range-checking the hash result and
   for comparing the looked-up name against the requested one. */
typedef struct _Py_UCNHashAPI
{
/* Key count of the name table; callers compare the hash result
   against this before using it as an index. */
unsigned long cKeys;
/* Length of the longest key; callers stop scanning for the end of a
   name once this length is exceeded. */
unsigned long cchMax;
/* Hash the first cch characters of key. */
unsigned long (*hash)(const char *key, unsigned int cch);
/* Return the table entry for index iKey.  Typed as void * here;
   callers cast the result to _Py_UnicodeCharacterName *. */
const void *(*getValue)(unsigned long iKey);
} _Py_UCNHashAPI;
typedef struct typedef struct {
{
const char *pszUCN;
Py_UCS4 value;
} _Py_UnicodeCharacterName;
/* Size of this struct */
int size;
/* Get name for a given character code. Returns non-zero if
success, zero if not. Does not set Python exceptions. */
int (*getname)(Py_UCS4 code, char* buffer, int buflen);
/* Get character code for a given name. Same error handling
as for getname. */
int (*getcode)(const char* name, int namelen, Py_UCS4* code);
} _PyUnicode_Name_CAPI;
#ifdef __cplusplus
}
#endif
#endif /* !Py_UCNHASH_H */

View file

@ -50,16 +50,6 @@ print "done."
# strict error testing: # strict error testing:
print "Testing unicode character name expansion strict error handling....", print "Testing unicode character name expansion strict error handling....",
# A \N{...} name two characters longer than the longest known character
# name must be rejected with UnicodeError in strict mode.
# 83 is the same value as k_cchMaxKey in ucnhash.c.
k_cchMaxUnicodeName = 83
s = "\N{" + "1" * (k_cchMaxUnicodeName + 2) + "}"
try:
unicode(s, 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when presented " \
"with a UCN > k_cchMaxUnicodeName"
try: try:
unicode("\N{blah}", 'unicode-escape', 'strict') unicode("\N{blah}", 'unicode-escape', 'strict')
except UnicodeError: except UnicodeError:
@ -67,6 +57,14 @@ except UnicodeError:
else: else:
raise AssertionError, "failed to raise an exception when given a bogus character name" raise AssertionError, "failed to raise an exception when given a bogus character name"
# A very long bogus name (100000 characters) must still be rejected
# with UnicodeError in strict mode rather than being accepted.
try:
unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when given a very " \
"long bogus character name"
try: try:
unicode("\N{SPACE", 'unicode-escape', 'strict') unicode("\N{SPACE", 'unicode-escape', 'strict')
except UnicodeError: except UnicodeError:

View file

@ -1,5 +1,13 @@
#include "Python.h"
#include "ucnhash.h" #include "ucnhash.h"
/* Modified for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
/* One entry of the character name table (aucn[]): a character name
   paired with its character code. */
typedef struct {
/* Character name; compared case-insensitively against the name
   being looked up. */
const char* pszUCN;
/* Character code returned to the caller when the name matches. */
Py_UCS4 value;
}_Py_UnicodeCharacterName;
/* /*
* The hash is produced using the algorithm described in * The hash is produced using the algorithm described in
* "Optimal algorithms for minimal perfect hashing", * "Optimal algorithms for minimal perfect hashing",
@ -14,11 +22,11 @@
* Generated on: Fri Jul 14 08:00:58 2000 * Generated on: Fri Jul 14 08:00:58 2000
*/ */
#define cKeys 10538
#define k_cHashElements 18836 #define k_cHashElements 18836
#define k_cchMaxKey 83 #define k_cchMaxKey 83
#define k_cKeys 10538 #define k_cKeys 10538
staticforward const unsigned short G[k_cHashElements]; staticforward const unsigned short G[k_cHashElements];
staticforward const _Py_UnicodeCharacterName aucn[k_cKeys]; staticforward const _Py_UnicodeCharacterName aucn[k_cKeys];
@ -34,8 +42,7 @@ static long f1(const char *key, unsigned int cch)
while (--len >= 0) while (--len >= 0)
{ {
/* (1000003 * x) ^ toupper(*(p++)) /* (1000003 * x) ^ toupper(*(p++))
* translated to handle > 32 bit longs * translated to handle > 32 bit longs */
*/
x = (0xf4243 * x); x = (0xf4243 * x);
x = x & 0xFFFFFFFF; x = x & 0xFFFFFFFF;
x = x ^ toupper(*(p++)); x = x ^ toupper(*(p++));
@ -98,110 +105,96 @@ static long f2(const char *key, unsigned int cch)
} }
static unsigned long hash(const char *key, unsigned int cch) static unsigned long
hash(const char *key, unsigned int cch)
{ {
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements; return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) % k_cHashElements;
} }
const void *getValue(unsigned long iKey) const _Py_UnicodeCharacterName *
getValue(unsigned long iKey)
{ {
return &aucn[iKey]; return (_Py_UnicodeCharacterName *) &aucn[iKey];
} }
/* Helper for adding objects to dictionaries. Check for errors with static int
PyErr_Occurred() */ mystrnicmp(const char *s1, const char *s2, size_t count)
static
void insobj(PyObject *dict,
char *name,
PyObject *v)
{ {
PyDict_SetItemString(dict, name, v); char c1, c2;
Py_XDECREF(v);
if (count) {
do {
c1 = tolower(*(s1++));
c2 = tolower(*(s2++));
} while (--count && c1 == c2);
return c1 - c2;
}
return 0;
} }
static const _Py_UCNHashAPI hashAPI = /* bindings for the new API */
static int
ucnhash_getname(Py_UCS4 code, char* buffer, int buflen)
{ {
k_cKeys, return 0;
k_cchMaxKey, }
&hash,
&getValue, static int
ucnhash_getcode(const char* name, int namelen, Py_UCS4* code)
{
    /* Look up the character code for a Unicode character name.

       name/namelen  the name to look up (not NUL-terminated; namelen
                     characters are significant)
       code          receives the character code on success

       Returns 1 and stores the code in *code if the name is known,
       0 otherwise.  Does not set a Python exception. */
    const _Py_UnicodeCharacterName *entry;
    unsigned long j;

    j = hash(name, namelen);

    /* hash() yields a value modulo k_cHashElements, which is larger than
       the table; aucn[] has exactly cKeys entries, so valid indices are
       0 .. cKeys-1 and j == cKeys must be rejected as well. */
    if (j >= cKeys)
        return 0;

    entry = getValue(j);

    /* The comparison only covers namelen characters, so also require the
       table name to end there; otherwise an empty name or a strict
       prefix of the table name would be accepted. */
    if (mystrnicmp(name, entry->pszUCN, namelen) != 0 ||
        entry->pszUCN[namelen] != '\0')
        return 0;

    *code = entry->value;
    return 1;
}
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
ucnhash_getname,
ucnhash_getcode
}; };
static static
PyMethodDef Module_methods[] = PyMethodDef ucnhash_methods[] =
{ {
{NULL, NULL}, {NULL, NULL},
}; };
static char *Module_docstring = "ucnhash hash function module"; static char *ucnhash_docstring = "ucnhash hash function module";
/* Error reporting for module init functions.

   Fetches the pending exception, converts its type and value to
   strings, and raises ImportError in its place.  If both conversions
   produced string objects the message is
   "initialization of module <modname> failed (<type>:<value>)",
   otherwise the plain "initialization of module <modname> failed".
   All fetched and temporary objects are released afterwards.

   modname must be a string literal, since it is pasted between
   adjacent string literals.  The macro expands to a bare { ... }
   block, not a do { ... } while (0) statement. */
#define Py_ReportModuleInitError(modname) { \
PyObject *exc_type, *exc_value, *exc_tb; \
PyObject *str_type, *str_value; \
\
/* Fetch error objects and convert them to strings */ \
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \
if (exc_type && exc_value) { \
str_type = PyObject_Str(exc_type); \
str_value = PyObject_Str(exc_value); \
} \
else { \
str_type = NULL; \
str_value = NULL; \
} \
/* Try to format a more informative error message using the \
original error */ \
if (str_type && str_value && \
PyString_Check(str_type) && PyString_Check(str_value)) \
PyErr_Format( \
PyExc_ImportError, \
"initialization of module "modname" failed " \
"(%s:%s)", \
PyString_AS_STRING(str_type), \
PyString_AS_STRING(str_value)); \
else \
PyErr_SetString( \
PyExc_ImportError, \
"initialization of module "modname" failed"); \
Py_XDECREF(str_type); \
Py_XDECREF(str_value); \
Py_XDECREF(exc_type); \
Py_XDECREF(exc_value); \
Py_XDECREF(exc_tb); \
}
/* Create PyMethodObjects and register them in the module's dict */ /* Create PyMethodObjects and register them in the module's dict */
DL_EXPORT(void) DL_EXPORT(void)
initucnhash(void) initucnhash(void)
{ {
PyObject *module, *moddict; PyObject *m, *d, *v;
/* Create module */
module = Py_InitModule4("ucnhash", /* Module name */ m = Py_InitModule4(
Module_methods, /* Method list */ "ucnhash", /* Module name */
Module_docstring, /* Module doc-string */ ucnhash_methods, /* Method list */
(PyObject *)NULL, /* always pass this as *self */ ucnhash_docstring, /* Module doc-string */
PYTHON_API_VERSION); /* API Version */ (PyObject *)NULL, /* always pass this as *self */
if (module == NULL) PYTHON_API_VERSION); /* API Version */
goto onError; if (!m)
/* Add some constants to the module's dict */ return;
moddict = PyModule_GetDict(module);
if (moddict == NULL) d = PyModule_GetDict(m);
goto onError; if (!d)
return;
/* Export C API */ /* Export C API */
insobj( v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
moddict, PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
"ucnhashAPI", Py_XDECREF(v);
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
onError:
/* Check for errors and report them */
if (PyErr_Occurred())
Py_ReportModuleInitError("ucnhash");
return;
} }
static const unsigned short G[] = static const unsigned short G[] =

View file

@ -6,61 +6,35 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
--------------------------------------------------------------------
The original string type implementation is:
Original header: Copyright (c) 1999 by Secret Labs AB
-------------------------------------------------------------------- Copyright (c) 1999 by Fredrik Lundh
* Yet another Unicode string type for Python. This type supports the By obtaining, using, and/or copying this software and/or its
* 16-bit Basic Multilingual Plane (BMP) only. associated documentation, you agree that you have read, understood,
* and will comply with the following terms and conditions:
* Note that this string class supports embedded NULL characters. End
* of string is given by the length attribute. However, the internal Permission to use, copy, modify, and distribute this software and its
* representation always stores a trailing NULL to make it easier to associated documentation for any purpose and without fee is hereby
* use unicode strings with standard APIs. granted, provided that the above copyright notice appears in all
* copies, and that both that copyright notice and this permission notice
* History: appear in supporting documentation, and that the name of Secret Labs
* 1999-01-23 fl Created AB or the author not be used in advertising or publicity pertaining to
* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support distribution of the software without specific, written prior
* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc. permission.
* 1999-03-06 fl Moved declarations to separate file, etc.
* 1999-06-13 fl Changed join method semantics according to Tim's proposal SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
* 1999-08-10 fl Some minor tweaks THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
* Written by Fredrik Lundh, January 1999. ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* Copyright (c) 1999 by Secret Labs AB. ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* Copyright (c) 1999 by Fredrik Lundh. OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
* --------------------------------------------------------------------
* fredrik@pythonware.com
* http://www.pythonware.com */
*
* --------------------------------------------------------------------
* This Unicode String Type is
*
* Copyright (c) 1999 by Secret Labs AB
* Copyright (c) 1999 by Fredrik Lundh
*
* By obtaining, using, and/or copying this software and/or its
* associated documentation, you agree that you have read, understood,
* and will comply with the following terms and conditions:
*
* Permission to use, copy, modify, and distribute this software and its
* associated documentation for any purpose and without fee is hereby
* granted, provided that the above copyright notice appears in all
* copies, and that both that copyright notice and this permission notice
* appear in supporting documentation, and that the name of Secret Labs
* AB or the author not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission.
*
* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
* -------------------------------------------------------------------- */
#include "Python.h" #include "Python.h"
@ -1129,27 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
} }
} }
static _Py_UCNHashAPI *pucnHash = NULL; static _PyUnicode_Name_CAPI *unicode_names = NULL;
/* Case-insensitive comparison of at most count characters of s1 and s2.

   Returns 0 if the compared characters are equal ignoring case, and a
   non-zero value (difference of the first mismatching lower-cased
   characters) otherwise.  A count of 0 always compares equal.
   Comparison stops at a NUL terminator shared by both strings, so
   neither string is read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for values representable as
           unsigned char (or EOF), so convert before calling it. */
        c1 = tolower((unsigned char) *s1++);
        c2 = tolower((unsigned char) *s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size, int size,
@ -1282,55 +1236,37 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ok, we need to deal with Unicode Character Names now, /* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data... * make sure we've imported the hash table data...
*/ */
if (pucnHash == NULL) { if (unicode_names == NULL) {
PyObject *mod = 0, *v = 0; PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash"); mod = PyImport_ImportModule("ucnhash");
if (mod == NULL) if (mod == NULL)
goto onError; goto onError;
v = PyObject_GetAttrString(mod,"ucnhashAPI"); v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Py_DECREF(mod); Py_DECREF(mod);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
pucnHash = PyCObject_AsVoidPtr(v); unicode_names = PyCObject_AsVoidPtr(v);
Py_DECREF(v); Py_DECREF(v);
if (pucnHash == NULL) if (unicode_names == NULL)
goto onError; goto onError;
} }
if (*s == '{') { if (*s == '{') {
const char *start = s + 1; const char *start = s + 1;
const char *endBrace = start; const char *endBrace = start;
unsigned long j;
/* look for either the closing brace, or we /* look for the closing brace */
* exceed the maximum length of the unicode character names while (*endBrace != '}' && endBrace < end)
*/
while (*endBrace != '}' &&
(unsigned int)(endBrace - start) <=
pucnHash->cchMax &&
endBrace < end)
{
endBrace++; endBrace++;
}
if (endBrace != end && *endBrace == '}') { if (endBrace != end && *endBrace == '}') {
j = pucnHash->hash(start, endBrace - start); if (!unicode_names->getcode(start, endBrace-start, &chr)) {
if (j > pucnHash->cKeys ||
mystrnicmp(
start,
((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->pszUCN,
(int)(endBrace - start)) != 0)
{
if (unicodeescape_decoding_error( if (unicodeescape_decoding_error(
&s, &x, errors, &s, &x, errors,
"Invalid Unicode Character Name")) "Invalid Unicode Character Name")
{ )
goto onError; goto onError;
}
goto ucnFallthrough; goto ucnFallthrough;
} }
chr = ((_Py_UnicodeCharacterName *)
(pucnHash->getValue(j)))->value;
s = endBrace + 1; s = endBrace + 1;
goto store; goto store;
} else { } else {