mirror of
https://github.com/python/cpython.git
synced 2025-08-27 04:05:34 +00:00
- SF #962502: Add two more methods for unicode type; width() and
iswide() for east asian width manipulation. (Inspired by David Goodger, Reviewed by Martin v. Loewis) - Move _PyUnicode_TypeRecord.flags to the end of the struct so that no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
This commit is contained in:
parent
b6568b91fd
commit
974ed7cfa5
11 changed files with 683 additions and 459 deletions
|
@ -850,6 +850,11 @@ functions depending on the Python configuration.
|
||||||
character.
|
character.
|
||||||
\end{cfuncdesc}
|
\end{cfuncdesc}
|
||||||
|
|
||||||
|
\begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
|
||||||
|
Returns 1/0 depending on whether \var{ch} is a wide or full-width
|
||||||
|
character.
|
||||||
|
\end{cfuncdesc}
|
||||||
|
|
||||||
These APIs can be used for fast direct character conversions:
|
These APIs can be used for fast direct character conversions:
|
||||||
|
|
||||||
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
|
\begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
|
||||||
|
@ -908,6 +913,10 @@ use these APIs:
|
||||||
Return the length of the Unicode object.
|
Return the length of the Unicode object.
|
||||||
\end{cfuncdesc}
|
\end{cfuncdesc}
|
||||||
|
|
||||||
|
\begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
|
||||||
|
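Return the fixed-width representation length of the Unicode object, that is, the number of columns it occupies when each wide or full-width character counts as two columns and every other character as one.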
|
||||||
|
\end{cfuncdesc}
|
||||||
|
|
||||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
|
\begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
|
||||||
const char *encoding,
|
const char *encoding,
|
||||||
const char *errors}
|
const char *errors}
|
||||||
|
|
|
@ -642,6 +642,12 @@ Return true if all cased characters in the string are uppercase and
|
||||||
there is at least one cased character, false otherwise.
|
there is at least one cased character, false otherwise.
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
|
\begin{methoddesc}[string]{iswide}{}
|
||||||
|
Return true if all characters in the string are wide or full width and
|
||||||
|
there is at least one wide or full width character, false otherwise.
|
||||||
|
This method is supported by unicode type only.
|
||||||
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}[string]{join}{seq}
|
\begin{methoddesc}[string]{join}{seq}
|
||||||
Return a string which is the concatenation of the strings in the
|
Return a string which is the concatenation of the strings in the
|
||||||
sequence \var{seq}. The separator between elements is the string
|
sequence \var{seq}. The separator between elements is the string
|
||||||
|
@ -774,6 +780,11 @@ character mapping codec using the \refmodule{codecs} module (see
|
||||||
Return a copy of the string converted to uppercase.
|
Return a copy of the string converted to uppercase.
|
||||||
\end{methoddesc}
|
\end{methoddesc}
|
||||||
|
|
||||||
|
\begin{methoddesc}[string]{width}{}
|
||||||
|
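Return the length of the fixed-width representation of the string, counting each wide or full-width character as two columns and every other character as one. This method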
|
||||||
|
is supported by unicode type only.
|
||||||
|
\end{methoddesc}
|
||||||
|
|
||||||
\begin{methoddesc}[string]{zfill}{width}
|
\begin{methoddesc}[string]{zfill}{width}
|
||||||
Return the numeric string left filled with zeros in a string
|
Return the numeric string left filled with zeros in a string
|
||||||
of length \var{width}. The original string is returned if
|
of length \var{width}. The original string is returned if
|
||||||
|
|
|
@ -180,6 +180,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
|
||||||
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
||||||
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
||||||
|
# define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
|
||||||
# define PyUnicode_Join PyUnicodeUCS2_Join
|
# define PyUnicode_Join PyUnicodeUCS2_Join
|
||||||
# define PyUnicode_Replace PyUnicodeUCS2_Replace
|
# define PyUnicode_Replace PyUnicodeUCS2_Replace
|
||||||
# define PyUnicode_Resize PyUnicodeUCS2_Resize
|
# define PyUnicode_Resize PyUnicodeUCS2_Resize
|
||||||
|
@ -199,6 +200,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
# define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
|
||||||
|
# define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
|
||||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
# define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
|
||||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
# define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
|
||||||
|
@ -252,6 +254,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
# define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
|
||||||
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
||||||
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
||||||
|
# define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
|
||||||
# define PyUnicode_Join PyUnicodeUCS4_Join
|
# define PyUnicode_Join PyUnicodeUCS4_Join
|
||||||
# define PyUnicode_Replace PyUnicodeUCS4_Replace
|
# define PyUnicode_Replace PyUnicodeUCS4_Replace
|
||||||
# define PyUnicode_Resize PyUnicodeUCS4_Resize
|
# define PyUnicode_Resize PyUnicodeUCS4_Resize
|
||||||
|
@ -270,6 +273,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
# define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
|
||||||
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
# define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
|
||||||
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
# define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
|
||||||
|
# define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
|
||||||
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
# define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
|
||||||
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
# define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
|
||||||
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
# define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
|
||||||
|
@ -315,6 +319,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
|
|
||||||
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
|
#define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
|
||||||
|
|
||||||
|
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
|
#define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
|
||||||
|
@ -338,6 +344,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
||||||
|
|
||||||
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
|
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
|
||||||
|
|
||||||
|
#define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define Py_UNICODE_ISALNUM(ch) \
|
#define Py_UNICODE_ISALNUM(ch) \
|
||||||
|
@ -430,6 +438,12 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
|
||||||
PyObject *unicode /* Unicode object */
|
PyObject *unicode /* Unicode object */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
/* Get the fixed-width representation length of the Unicode object */
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) PyUnicode_GetWidth(
|
||||||
|
PyObject *unicode /* Unicode object */
|
||||||
|
);
|
||||||
|
|
||||||
/* Get the maximum ordinal for a Unicode character. */
|
/* Get the maximum ordinal for a Unicode character. */
|
||||||
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
|
PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
|
||||||
|
|
||||||
|
@ -1151,6 +1165,10 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
|
||||||
Py_UNICODE ch /* Unicode character */
|
Py_UNICODE ch /* Unicode character */
|
||||||
);
|
);
|
||||||
|
|
||||||
|
PyAPI_FUNC(int) _PyUnicode_IsWide(
|
||||||
|
Py_UNICODE ch /* Unicode character */
|
||||||
|
);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -291,6 +291,26 @@ class UnicodeTest(
|
||||||
|
|
||||||
self.assertRaises(TypeError, u"abc".isnumeric, 42)
|
self.assertRaises(TypeError, u"abc".isnumeric, 42)
|
||||||
|
|
||||||
|
def test_iswide(self):
    # (expected result, string under test); the trailing comments name the
    # East Asian Width class of the single-character samples.
    cases = [
        (False, u''),
        (False, u'\x1f'),                # Neutral
        (False, u'\x20'),                # Narrow
        (True, u'\u2329'),               # Wide
        (False, u'\uff64'),              # Half
        (True, u'\u3000'),               # Full
        (False, u'\u2460'),              # Ambiguous
        (True, u'\ud55c\uae00'),
        (False, u'\ud55c\u2606\uae00'),
    ]
    for expected, text in cases:
        self.checkequalnofix(expected, text, 'iswide')
|
||||||
|
|
||||||
|
def test_wide(self):
    # (string under test, expected width() result).
    expected_widths = [
        (u'', 0),
        (u'abcd', 4),
        (u'\u0187\u01c9', 2),
        (u'\u2460\u2329', 3),
        (u'\u2329\u2460', 3),
        (u'\ud55c\uae00', 4),
        (u'\ud55c\u2606\uae00', 5),
    ]
    for text, columns in expected_widths:
        self.assertEqual(text.width(), columns)
|
||||||
|
|
||||||
def test_contains(self):
|
def test_contains(self):
|
||||||
# Testing Unicode contains method
|
# Testing Unicode contains method
|
||||||
self.assert_('a' in u'abdb')
|
self.assert_('a' in u'abdb')
|
||||||
|
|
|
@ -12,6 +12,9 @@ What's New in Python 2.4 alpha 1?
|
||||||
Core and builtins
|
Core and builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- The unicode type gained two new methods: iswide() and width(). They
|
||||||
|
report East Asian Width information as defined in Unicode TR11.
|
||||||
|
|
||||||
- Improved the tuple hashing algorithm to give fewer collisions in
|
- Improved the tuple hashing algorithm to give fewer collisions in
|
||||||
common cases. Fixes bug #942952.
|
common cases. Fixes bug #942952.
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
|
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
|
||||||
|
|
||||||
#define UNIDATA_VERSION "3.2.0"
|
#define UNIDATA_VERSION "3.2.0"
|
||||||
/* a list of unique database records */
|
/* a list of unique database records */
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
|
/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
|
||||||
|
|
||||||
#define NAME_MAXLEN 256
|
#define NAME_MAXLEN 256
|
||||||
|
|
||||||
|
|
|
@ -19,14 +19,15 @@
|
||||||
#define SPACE_MASK 0x20
|
#define SPACE_MASK 0x20
|
||||||
#define TITLE_MASK 0x40
|
#define TITLE_MASK 0x40
|
||||||
#define UPPER_MASK 0x80
|
#define UPPER_MASK 0x80
|
||||||
|
#define WIDE_MASK 0x100
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const unsigned short flags;
|
|
||||||
const Py_UNICODE upper;
|
const Py_UNICODE upper;
|
||||||
const Py_UNICODE lower;
|
const Py_UNICODE lower;
|
||||||
const Py_UNICODE title;
|
const Py_UNICODE title;
|
||||||
const unsigned char decimal;
|
const unsigned char decimal;
|
||||||
const unsigned char digit;
|
const unsigned char digit;
|
||||||
|
const unsigned short flags;
|
||||||
} _PyUnicode_TypeRecord;
|
} _PyUnicode_TypeRecord;
|
||||||
|
|
||||||
#include "unicodetype_db.h"
|
#include "unicodetype_db.h"
|
||||||
|
@ -322,6 +323,15 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Returns 1 for Unicode characters having Full or Wide width, 0 otherwise */
|
||||||
|
|
||||||
|
int _PyUnicode_IsWide(Py_UNICODE ch)
|
||||||
|
{
|
||||||
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
|
|
||||||
|
return (ctype->flags & WIDE_MASK) != 0;
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef WANT_WCTYPE_FUNCTIONS
|
#ifndef WANT_WCTYPE_FUNCTIONS
|
||||||
|
|
||||||
/* Returns 1 for Unicode characters having the bidirectional type
|
/* Returns 1 for Unicode characters having the bidirectional type
|
||||||
|
|
|
@ -655,6 +655,27 @@ int PyUnicode_GetSize(PyObject *unicode)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return the fixed-width representation length of a Unicode object: each
   character for which Py_UNICODE_ISWIDE() is true counts as two columns,
   every other character as one.  If the argument is not a Unicode object,
   PyErr_BadArgument() is called and -1 is returned. */
int PyUnicode_GetWidth(PyObject *unicode)
{
    const Py_UNICODE *cur, *end;
    int columns = 0;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }

    cur = PyUnicode_AS_UNICODE(unicode);
    end = cur + PyUnicode_GET_SIZE(unicode);
    while (cur < end) {
        columns += Py_UNICODE_ISWIDE(*cur) ? 2 : 1;
        cur++;
    }

    return columns;
}
|
||||||
|
|
||||||
const char *PyUnicode_GetDefaultEncoding(void)
|
const char *PyUnicode_GetDefaultEncoding(void)
|
||||||
{
|
{
|
||||||
return unicode_default_encoding;
|
return unicode_default_encoding;
|
||||||
|
@ -5316,6 +5337,35 @@ unicode_isnumeric(PyUnicodeObject *self)
|
||||||
return PyBool_FromLong(1);
|
return PyBool_FromLong(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(iswide__doc__,
|
||||||
|
"S.iswide() -> bool\n\
|
||||||
|
\n\
|
||||||
|
Return True if all characters in S are wide width\n\
|
||||||
|
and there is at least one character in S, False otherwise.");
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
unicode_iswide(PyUnicodeObject *self)
|
||||||
|
{
|
||||||
|
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
||||||
|
register const Py_UNICODE *e;
|
||||||
|
|
||||||
|
/* Shortcut for single character strings */
|
||||||
|
if (PyUnicode_GET_SIZE(self) == 1 &&
|
||||||
|
Py_UNICODE_ISWIDE(*p))
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
|
||||||
|
/* Special case for empty strings */
|
||||||
|
if (PyString_GET_SIZE(self) == 0)
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
|
||||||
|
e = p + PyUnicode_GET_SIZE(self);
|
||||||
|
for (; p < e; p++) {
|
||||||
|
if (!Py_UNICODE_ISWIDE(*p))
|
||||||
|
Py_RETURN_FALSE;
|
||||||
|
}
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(join__doc__,
|
PyDoc_STRVAR(join__doc__,
|
||||||
"S.join(sequence) -> unicode\n\
|
"S.join(sequence) -> unicode\n\
|
||||||
\n\
|
\n\
|
||||||
|
@ -5335,7 +5385,7 @@ unicode_length(PyUnicodeObject *self)
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(ljust__doc__,
|
PyDoc_STRVAR(ljust__doc__,
|
||||||
"S.ljust(width[, fillchar]) -> unicode\n\
|
"S.ljust(width[, fillchar]) -> unicode\n\
|
||||||
\n\
|
\n\
|
||||||
Return S left justified in a Unicode string of length width. Padding is\n\
|
Return S left justified in a Unicode string of length width. Padding is\n\
|
||||||
done using the specified fill character (default is a space).");
|
done using the specified fill character (default is a space).");
|
||||||
|
@ -5927,6 +5977,21 @@ unicode_upper(PyUnicodeObject *self)
|
||||||
return fixup(self, fixupper);
|
return fixup(self, fixupper);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(width__doc__,
|
||||||
|
"S.width() -> unicode\n\
|
||||||
|
\n\
|
||||||
|
Return a fixed-width representation length of S.");
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
unicode_width(PyObject *self)
|
||||||
|
{
|
||||||
|
int width = PyUnicode_GetWidth(self);
|
||||||
|
if (width == -1)
|
||||||
|
return NULL;
|
||||||
|
else
|
||||||
|
return PyInt_FromLong((long)width);
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(zfill__doc__,
|
PyDoc_STRVAR(zfill__doc__,
|
||||||
"S.zfill(width) -> unicode\n\
|
"S.zfill(width) -> unicode\n\
|
||||||
\n\
|
\n\
|
||||||
|
@ -6090,6 +6155,8 @@ static PyMethodDef unicode_methods[] = {
|
||||||
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
|
{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
|
||||||
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
||||||
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
||||||
|
{"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
|
||||||
|
{"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
|
||||||
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
||||||
#if 0
|
#if 0
|
||||||
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
|
{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -18,6 +18,7 @@
|
||||||
# 2002-10-22 mvl generate NFC tables
|
# 2002-10-22 mvl generate NFC tables
|
||||||
# 2002-11-24 mvl expand all ranges, sort names version-independently
|
# 2002-11-24 mvl expand all ranges, sort names version-independently
|
||||||
# 2002-11-25 mvl add UNIDATA_VERSION
|
# 2002-11-25 mvl add UNIDATA_VERSION
|
||||||
|
# 2004-05-29 perky add east asian width information
|
||||||
#
|
#
|
||||||
# written by Fredrik Lundh (fredrik@pythonware.com)
|
# written by Fredrik Lundh (fredrik@pythonware.com)
|
||||||
#
|
#
|
||||||
|
@ -25,12 +26,13 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
SCRIPT = sys.argv[0]
|
SCRIPT = sys.argv[0]
|
||||||
VERSION = "2.2"
|
VERSION = "2.3"
|
||||||
|
|
||||||
# The Unicode Database
|
# The Unicode Database
|
||||||
UNIDATA_VERSION = "3.2.0"
|
UNIDATA_VERSION = "3.2.0"
|
||||||
UNICODE_DATA = "UnicodeData.txt"
|
UNICODE_DATA = "UnicodeData.txt"
|
||||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
|
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
|
||||||
|
EASTASIAN_WIDTH = "EastAsianWidth.txt"
|
||||||
|
|
||||||
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
||||||
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
||||||
|
@ -50,12 +52,14 @@ LINEBREAK_MASK = 0x10
|
||||||
SPACE_MASK = 0x20
|
SPACE_MASK = 0x20
|
||||||
TITLE_MASK = 0x40
|
TITLE_MASK = 0x40
|
||||||
UPPER_MASK = 0x80
|
UPPER_MASK = 0x80
|
||||||
|
WIDE_MASK = 0x100
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
print "--- Reading", UNICODE_DATA, "..."
|
print "--- Reading", UNICODE_DATA, "..."
|
||||||
|
|
||||||
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
|
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
|
||||||
|
EASTASIAN_WIDTH)
|
||||||
|
|
||||||
print len(filter(None, unicode.table)), "characters"
|
print len(filter(None, unicode.table)), "characters"
|
||||||
|
|
||||||
|
@ -330,8 +334,10 @@ def makeunicodetype(unicode, trace):
|
||||||
if record[7]:
|
if record[7]:
|
||||||
flags |= DIGIT_MASK
|
flags |= DIGIT_MASK
|
||||||
digit = int(record[7])
|
digit = int(record[7])
|
||||||
|
if record[15] in ('W', 'F'): # Wide or Full width
|
||||||
|
flags |= WIDE_MASK
|
||||||
item = (
|
item = (
|
||||||
flags, upper, lower, title, decimal, digit
|
upper, lower, title, decimal, digit, flags
|
||||||
)
|
)
|
||||||
# add entry to index and item tables
|
# add entry to index and item tables
|
||||||
i = cache.get(item)
|
i = cache.get(item)
|
||||||
|
@ -538,7 +544,7 @@ import sys
|
||||||
|
|
||||||
class UnicodeData:
|
class UnicodeData:
|
||||||
|
|
||||||
def __init__(self, filename, exclusions, expand=1):
|
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
||||||
file = open(filename)
|
file = open(filename)
|
||||||
table = [None] * 0x110000
|
table = [None] * 0x110000
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -581,6 +587,25 @@ class UnicodeData:
|
||||||
char = int(s.split()[0],16)
|
char = int(s.split()[0],16)
|
||||||
self.exclusions[char] = 1
|
self.exclusions[char] = 1
|
||||||
|
|
||||||
|
widths = [None] * 0x110000
|
||||||
|
for s in open(eastasianwidth):
|
||||||
|
s = s.strip()
|
||||||
|
if not s:
|
||||||
|
continue
|
||||||
|
if s[0] == '#':
|
||||||
|
continue
|
||||||
|
s = s.split()[0].split(';')
|
||||||
|
if '..' in s[0]:
|
||||||
|
first, last = [int(c, 16) for c in s[0].split('..')]
|
||||||
|
chars = range(first, last+1)
|
||||||
|
else:
|
||||||
|
chars = [int(s[0], 16)]
|
||||||
|
for char in chars:
|
||||||
|
widths[char] = s[1]
|
||||||
|
for i in range(0, 0x110000):
|
||||||
|
if table[i] is not None:
|
||||||
|
table[i].append(widths[i])
|
||||||
|
|
||||||
def uselatin1(self):
|
def uselatin1(self):
|
||||||
# restrict character range to ISO Latin 1
|
# restrict character range to ISO Latin 1
|
||||||
self.chars = range(256)
|
self.chars = range(256)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue