- SF #962502: Add two more methods for unicode type; width() and

iswide() for east asian width manipulation. (Inspired by David Goodger, Reviewed by Martin v. Loewis) - Move _PyUnicode_TypeRecord.flags to the end of the struct so that no padding is added for UCS-4 builds. (Suggested by Martin v. Loewis)
2025-08-27 04:05:34 +00:00 · 2004-06-02 16:49:17 +00:00 · 2004-06-02 16:49:17 +00:00 · 974ed7cfa5
commit 974ed7cfa5
parent b6568b91fd
11 changed files with 683 additions and 459 deletions
--- a/Doc/api/concrete.tex
+++ b/Doc/api/concrete.tex
@ -850,6 +850,11 @@ functions depending on the Python configuration.
  character.
 \end{cfuncdesc}
 \begin{cfuncdesc}{int}{Py_UNICODE_ISWIDE}{Py_UNICODE ch}
  Returns 1/0 depending on whether \var{ch} is a wide or full-width
  character.
 \end{cfuncdesc}
 These APIs can be used for fast direct character conversions:
 \begin{cfuncdesc}{Py_UNICODE}{Py_UNICODE_TOLOWER}{Py_UNICODE ch}
@ -908,6 +913,10 @@ use these APIs:
  Return the length of the Unicode object.
 \end{cfuncdesc}
 \begin{cfuncdesc}{int}{PyUnicode_GetWidth}{PyObject *unicode}
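  Return the length of the fixed-width representation of the Unicode
  object: each wide or full-width character counts as two columns and
  every other character as one.  Returns -1 with an exception set if
  \var{unicode} is not a Unicode object.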
 \end{cfuncdesc}
 \begin{cfuncdesc}{PyObject*}{PyUnicode_FromEncodedObject}{PyObject *obj,
                                                      const char *encoding,
                                                      const char *errors}
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@ -642,6 +642,12 @@ Return true if all cased characters in the string are uppercase and
 there is at least one cased character, false otherwise.
 \end{methoddesc}
 \begin{methoddesc}[string]{iswide}{}
 Return true if all characters in the string are wide or full width and
 there is at least one wide or full width character, false otherwise.
 This method is supported by unicode type only.
 \end{methoddesc}
 \begin{methoddesc}[string]{join}{seq}
 Return a string which is the concatenation of the strings in the
 sequence \var{seq}.  The separator between elements is the string
@ -774,6 +780,11 @@ character mapping codec using the \refmodule{codecs} module (see
 Return a copy of the string converted to uppercase.
 \end{methoddesc}
 \begin{methoddesc}[string]{width}{}
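 Return the length of the fixed-width representation of the string: each
 wide or full width character counts as two columns and every other
 character as one.  This method is supported by the unicode type only.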
 \end{methoddesc}
 \begin{methoddesc}[string]{zfill}{width}
 Return the numeric string left filled with zeros in a string
 of length \var{width}. The original string is returned if
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -180,6 +180,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS2_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
 # define PyUnicode_GetWidth PyUnicodeUCS2_GetWidth
 # define PyUnicode_Join PyUnicodeUCS2_Join
 # define PyUnicode_Replace PyUnicodeUCS2_Replace
 # define PyUnicode_Resize PyUnicodeUCS2_Resize
@ -199,6 +200,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLinebreak _PyUnicodeUCS2_IsLinebreak
 # define _PyUnicode_IsLowercase _PyUnicodeUCS2_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS2_IsNumeric
 # define _PyUnicode_IsWide _PyUnicodeUCS2_IsWide
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS2_IsTitlecase
 # define _PyUnicode_IsUppercase _PyUnicodeUCS2_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS2_IsWhitespace
@ -252,6 +254,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_GetDefaultEncoding PyUnicodeUCS4_GetDefaultEncoding
 # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
 # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
 # define PyUnicode_GetWidth PyUnicodeUCS4_GetWidth
 # define PyUnicode_Join PyUnicodeUCS4_Join
 # define PyUnicode_Replace PyUnicodeUCS4_Replace
 # define PyUnicode_Resize PyUnicodeUCS4_Resize
@ -270,6 +273,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define _PyUnicode_IsLinebreak _PyUnicodeUCS4_IsLinebreak
 # define _PyUnicode_IsLowercase _PyUnicodeUCS4_IsLowercase
 # define _PyUnicode_IsNumeric _PyUnicodeUCS4_IsNumeric
 # define _PyUnicode_IsWide _PyUnicodeUCS4_IsWide
 # define _PyUnicode_IsTitlecase _PyUnicodeUCS4_IsTitlecase
 # define _PyUnicode_IsUppercase _PyUnicodeUCS4_IsUppercase
 # define _PyUnicode_IsWhitespace _PyUnicodeUCS4_IsWhitespace
@ -315,6 +319,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 #define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
 #else
 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
@ -338,6 +344,8 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 #define Py_UNICODE_ISWIDE(ch) _PyUnicode_IsWide(ch)
 #endif
 #define Py_UNICODE_ISALNUM(ch) \
@ -430,6 +438,12 @@ PyAPI_FUNC(int) PyUnicode_GetSize(
    PyObject *unicode	 	/* Unicode object */
    );
 /* Get the fixed-width representation length of the Unicode object */
 PyAPI_FUNC(int) PyUnicode_GetWidth(
    PyObject *unicode	 	/* Unicode object */
    );
 /* Get the maximum ordinal for a Unicode character. */
 PyAPI_FUNC(Py_UNICODE) PyUnicode_GetMax(void);
@ -1151,6 +1165,10 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
    Py_UNICODE ch 	/* Unicode character */
    );
 PyAPI_FUNC(int) _PyUnicode_IsWide(
    Py_UNICODE ch 	/* Unicode character */
    );
 #ifdef __cplusplus
 }
 #endif
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -291,6 +291,26 @@ class UnicodeTest(
        self.assertRaises(TypeError, u"abc".isnumeric, 42)
    def test_iswide(self):
        # Each entry is (expected result, input string); the trailing
        # comment names the East Asian Width class being exercised.
        cases = [
            (False, u''),
            (False, u'\x1f'),    # Neutral
            (False, u'\x20'),    # Narrow
            (True, u'\u2329'),   # Wide
            (False, u'\uff64'),  # Half
            (True, u'\u3000'),   # Full
            (False, u'\u2460'),  # Ambiguous
            (True, u'\ud55c\uae00'),
            (False, u'\ud55c\u2606\uae00'),
        ]
        for expected, text in cases:
            self.checkequalnofix(expected, text, 'iswide')
    def test_wide(self):
        # Each entry is (input string, expected width()).
        expected_widths = [
            (u'', 0),
            (u'abcd', 4),
            (u'\u0187\u01c9', 2),
            (u'\u2460\u2329', 3),
            (u'\u2329\u2460', 3),
            (u'\ud55c\uae00', 4),
            (u'\ud55c\u2606\uae00', 5),
        ]
        for text, columns in expected_widths:
            self.assertEqual(text.width(), columns)
    def test_contains(self):
        # Testing Unicode contains method
        self.assert_('a' in u'abdb')
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,9 @@ What's New in Python 2.4 alpha 1?
 Core and builtins
 -----------------
 - The unicode type got two new methods: iswide() and width().  They
  report the East Asian Width information defined in Unicode TR11.
 - Improved the tuple hashing algorithm to give fewer collisions in
  common cases.  Fixes bug  #942952.
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
@ -1,4 +1,4 @@
-/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
+/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
 #define UNIDATA_VERSION "3.2.0"
 /* a list of unique database records */
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
@ -1,4 +1,4 @@
-/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */
+/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */
 #define NAME_MAXLEN 256
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -19,14 +19,15 @@
 #define SPACE_MASK 0x20
 #define TITLE_MASK 0x40
 #define UPPER_MASK 0x80
 #define WIDE_MASK 0x100
 /* One character type record.  The field order must match the tuples
    emitted by Tools/unicode/makeunicodedata.py
    (upper, lower, title, decimal, digit, flags). */
 typedef struct {
    const Py_UNICODE upper;
    const Py_UNICODE lower;
    const Py_UNICODE title;
    const unsigned char decimal;
    const unsigned char digit;
    /* Bit set of the *_MASK values defined above.  Declared exactly once
       and kept last so that no padding is added for UCS-4 builds. */
    const unsigned short flags;
 } _PyUnicode_TypeRecord;
 #include "unicodetype_db.h"
@ -322,6 +323,15 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
    return 1;
 }
 /* Returns 1 if WIDE_MASK is set in the type record of ch (characters
    having Full or Wide width), 0 otherwise */
 int _PyUnicode_IsWide(Py_UNICODE ch)
 {
    const _PyUnicode_TypeRecord *record = gettyperecord(ch);
    if (record->flags & WIDE_MASK)
 	return 1;
    return 0;
 }
 #ifndef WANT_WCTYPE_FUNCTIONS
 /* Returns 1 for Unicode characters having the bidirectional type
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -655,6 +655,27 @@ int PyUnicode_GetSize(PyObject *unicode)
    return -1;
 }
 /* Return the fixed-width representation length of unicode: characters
    for which Py_UNICODE_ISWIDE is true count as two columns, all other
    characters as one.  If the argument is not a Unicode object,
    PyErr_BadArgument() is called and -1 is returned. */
 int PyUnicode_GetWidth(PyObject *unicode)
 {
    const Py_UNICODE *cur, *end;
    int columns = 0;
    if (!PyUnicode_Check(unicode)) {
 	PyErr_BadArgument();
 	return -1;
    }
    cur = PyUnicode_AS_UNICODE(unicode);
    end = cur + PyUnicode_GET_SIZE(unicode);
    while (cur < end) {
 	columns += Py_UNICODE_ISWIDE(*cur) ? 2 : 1;
 	cur++;
    }
    return columns;
 }
 const char *PyUnicode_GetDefaultEncoding(void)
 {
    return unicode_default_encoding;
@ -5316,6 +5337,35 @@ unicode_isnumeric(PyUnicodeObject *self)
    return PyBool_FromLong(1);
 }
 PyDoc_STRVAR(iswide__doc__,
 "S.iswide() -> bool\n\
 \n\
 Return True if all characters in S are wide width\n\
 and there is at least one character in S, False otherwise.");
 /* Implementation of unicode.iswide(): returns True when S is non-empty
    and Py_UNICODE_ISWIDE holds for every character, False otherwise. */
 static PyObject*
 unicode_iswide(PyUnicodeObject *self)
 {
    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
    register const Py_UNICODE *e;
    /* Shortcut for single character strings */
    if (PyUnicode_GET_SIZE(self) == 1 &&
 	Py_UNICODE_ISWIDE(*p))
 	Py_RETURN_TRUE;
    /* Special case for empty strings.  self is a unicode object, so its
       length is read with PyUnicode_GET_SIZE rather than the str-object
       macro PyString_GET_SIZE. */
    if (PyUnicode_GET_SIZE(self) == 0)
 	Py_RETURN_FALSE;
    e = p + PyUnicode_GET_SIZE(self);
    for (; p < e; p++) {
 	if (!Py_UNICODE_ISWIDE(*p))
 	    Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
 }
 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> unicode\n\
 \n\
@ -5335,7 +5385,7 @@ unicode_length(PyUnicodeObject *self)
 }
 PyDoc_STRVAR(ljust__doc__,
-"S.ljust(width[, fillchar]) -> unicode\n\
+"S.ljust(width[, fillchar]) -> int\n\
 \n\
 Return S left justified in a Unicode string of length width. Padding is\n\
 done using the specified fill character (default is a space).");
@ -5927,6 +5977,21 @@ unicode_upper(PyUnicodeObject *self)
    return fixup(self, fixupper);
 }
 /* Docstring for unicode.width(); the method returns a Python int. */
 PyDoc_STRVAR(width__doc__,
 "S.width() -> int\n\
 \n\
 Return a fixed-width representation length of S.");
 /* Implementation of unicode.width(): wraps PyUnicode_GetWidth() and
    returns its result as a Python int, or NULL when it reported -1. */
 static PyObject*
 unicode_width(PyObject *self)
 {
    const int columns = PyUnicode_GetWidth(self);
    if (columns != -1)
 	return PyInt_FromLong((long)columns);
    return NULL;
 }
 PyDoc_STRVAR(zfill__doc__,
 "S.zfill(width) -> unicode\n\
 \n\
@ -6090,6 +6155,8 @@ static PyMethodDef unicode_methods[] = {
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
    {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
 #if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -18,6 +18,7 @@
 # 2002-10-22 mvl  generate NFC tables
 # 2002-11-24 mvl  expand all ranges, sort names version-independently
 # 2002-11-25 mvl  add UNIDATA_VERSION
 # 2004-05-29 perky add east asian width information
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@ -25,12 +26,13 @@
 import sys
 SCRIPT = sys.argv[0]
-VERSION = "2.2"
+VERSION = "2.3"
 # The Unicode Database
 UNIDATA_VERSION = "3.2.0"
 UNICODE_DATA = "UnicodeData.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
 EASTASIAN_WIDTH = "EastAsianWidth.txt"
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -50,12 +52,14 @@ LINEBREAK_MASK = 0x10
 SPACE_MASK = 0x20
 TITLE_MASK = 0x40
 UPPER_MASK = 0x80
 WIDE_MASK = 0x100
 def maketables(trace=0):
    print "--- Reading", UNICODE_DATA, "..."
-    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)
+    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
                          EASTASIAN_WIDTH)
    print len(filter(None, unicode.table)), "characters"
@ -330,8 +334,10 @@ def makeunicodetype(unicode, trace):
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            if record[15] in ('W', 'F'): # Wide or Full width
                flags |= WIDE_MASK
            item = (
-                flags, upper, lower, title, decimal, digit
+                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
@ -538,7 +544,7 @@ import sys
 class UnicodeData:
-    def __init__(self, filename, exclusions, expand=1):
+    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        file = open(filename)
        table = [None] * 0x110000
        while 1:
@ -581,6 +587,25 @@ class UnicodeData:
            char = int(s.split()[0],16)
            self.exclusions[char] = 1
        # Load the East Asian Width file named by `eastasianwidth`.
        # widths[cp] holds the width class string for code point cp, or
        # None when the file has no entry for it.
        widths = [None] * 0x110000
        for s in open(eastasianwidth):
            s = s.strip()
            # Skip blank lines and '#' comment lines.
            if not s:
                continue
            if s[0] == '#':
                continue
            # Keep only the first whitespace-delimited token and split it
            # on ';' into [code point or range, width class].
            s = s.split()[0].split(';')
            if '..' in s[0]:
                # "first..last": an inclusive range of hex code points.
                first, last = [int(c, 16) for c in s[0].split('..')]
                chars = range(first, last+1)
            else:
                chars = [int(s[0], 16)]
            for char in chars:
                widths[char] = s[1]
        # Append the width class (or None) as one extra trailing field to
        # every record that exists in the table.
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(widths[i])
    def uselatin1(self):
        # restrict character range to ISO Latin 1
        self.chars = range(256)
`@ -1,4 +1,4 @@`
	`/* this file was generated by Tools/unicode/makeunicodedata.py 2.2 */`	`/* this file was generated by Tools/unicode/makeunicodedata.py 2.3 */`

	`#define NAME_MAXLEN 256`	`#define NAME_MAXLEN 256`