mirror of
https://github.com/python/cpython.git
synced 2025-07-07 11:25:30 +00:00

We had the definition of what makes a character "printable" documented in three places, giving two different definitions. The definition in the comment on `_PyUnicode_IsPrintable` was inverted; correct that. With that correction, the two definitions turn out to be equivalent -- but to confirm that, you have to go look up, or happen to know, that those are the only five "Other" categories and only three "Separator" categories in the Unicode character database. That makes it hard for the reader to tell whether they really are the same, or if there's some subtle difference in the intended semantics. Fix that by cutting the C API docs' and the C comment's copies of the subtle details, in favor of referring to the Python-level docs. That ensures it's explicit that these are all meant to agree, and also lets us concentrate improvements to the wording in one place. Speaking of which, borrow some ideas from the C comment, along with other tweaks, to hopefully add a bit more clarity to that one newly-centralized copy in the docs. Also add a thorough test that the implementation agrees with this definition. Author: Greg Price <gnprice@gmail.com> Co-authored-by: Greg Price <gnprice@gmail.com>
287 lines
7.1 KiB
C
287 lines
7.1 KiB
C
/*
|
|
Unicode character type helpers.
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
|
|
|
Copyright (c) Corporation for National Research Initiatives.
|
|
|
|
*/
|
|
|
|
#include "Python.h"
|
|
|
|
/* Bit flags stored in _PyUnicode_TypeRecord.flags.

   Every value fits in the 16-bit `flags` field.  Bits 0x10 and 0x20 are
   not assigned in this file. */

/* Character classification. */
#define ALPHA_MASK          0x0001
#define DECIMAL_MASK        0x0002
#define DIGIT_MASK          0x0004
#define LOWER_MASK          0x0008
#define TITLE_MASK          0x0040
#define UPPER_MASK          0x0080

/* Identifier syntax (XID_Start / XID_Continue). */
#define XID_START_MASK      0x0100
#define XID_CONTINUE_MASK   0x0200

/* repr() / numeric classification. */
#define PRINTABLE_MASK      0x0400
#define NUMERIC_MASK        0x0800

/* Case-mapping support. */
#define CASE_IGNORABLE_MASK 0x1000
#define CASED_MASK          0x2000
#define EXTENDED_CASE_MASK  0x4000
|
|
|
|
/* One entry of the character type database.

   The field order must stay as it is: the records are defined by
   positional initializers elsewhere (presumably in unicodetype_db.h --
   TODO confirm). */
typedef struct {
    /* Case mappings.  Each value is either a delta that is added to the
       character, or -- when EXTENDED_CASE_MASK is set in `flags` -- an
       encoded offset into _PyUnicode_ExtendedCase. */
    const int upper;
    const int lower;
    const int title;

    /* Values reported by _PyUnicode_ToDecimalDigit / _PyUnicode_ToDigit
       when the matching flag is set.
       Note: if more flag space is ever needed, decimal and digit could
       be unified. */
    const unsigned char decimal;
    const unsigned char digit;

    /* Bitwise OR of the *_MASK constants. */
    const unsigned short flags;
} _PyUnicode_TypeRecord;
|
|
|
|
#include "unicodetype_db.h"
|
|
|
|
static const _PyUnicode_TypeRecord *
|
|
gettyperecord(Py_UCS4 code)
|
|
{
|
|
int index;
|
|
|
|
if (code >= 0x110000)
|
|
index = 0;
|
|
else
|
|
{
|
|
index = index1[(code>>SHIFT)];
|
|
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
|
|
}
|
|
|
|
return &_PyUnicode_TypeRecords[index];
|
|
}
|
|
|
|
/* Returns the titlecase Unicode characters corresponding to ch or just
|
|
ch if no titlecase mapping is known. */
|
|
|
|
Py_UCS4 _PyUnicode_ToTitlecase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
|
return _PyUnicode_ExtendedCase[ctype->title & 0xFFFF];
|
|
return ch + ctype->title;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters having the category 'Lt', 0
|
|
otherwise. */
|
|
|
|
int _PyUnicode_IsTitlecase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & TITLE_MASK) != 0;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters having the XID_Start property, 0
|
|
otherwise. */
|
|
|
|
int _PyUnicode_IsXidStart(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & XID_START_MASK) != 0;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters having the XID_Continue property,
|
|
0 otherwise. */
|
|
|
|
int _PyUnicode_IsXidContinue(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & XID_CONTINUE_MASK) != 0;
|
|
}
|
|
|
|
/* Returns the integer decimal (0-9) for Unicode characters having
|
|
this property, -1 otherwise. */
|
|
|
|
int _PyUnicode_ToDecimalDigit(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
|
|
}
|
|
|
|
/* Returns 1 if ch has a decimal value, i.e. _PyUnicode_ToDecimalDigit()
   does not report -1 for it, otherwise 0. */
int _PyUnicode_IsDecimalDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDecimalDigit(ch) >= 0;
}
|
|
|
|
/* Returns the integer digit (0-9) for Unicode characters having
|
|
this property, -1 otherwise. */
|
|
|
|
int _PyUnicode_ToDigit(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
|
|
}
|
|
|
|
/* Returns 1 if ch has a digit value, i.e. _PyUnicode_ToDigit() does not
   report -1 for it, otherwise 0. */
int _PyUnicode_IsDigit(Py_UCS4 ch)
{
    return _PyUnicode_ToDigit(ch) >= 0;
}
|
|
|
|
/* Returns the numeric value as double for Unicode characters having
|
|
this property, -1.0 otherwise. */
|
|
|
|
int _PyUnicode_IsNumeric(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & NUMERIC_MASK) != 0;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters that repr() may use in its output,
|
|
and 0 for characters to be hex-escaped.
|
|
|
|
See documentation of `str.isprintable` for details.
|
|
*/
|
|
int _PyUnicode_IsPrintable(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & PRINTABLE_MASK) != 0;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters having the category 'Ll', 0
|
|
otherwise. */
|
|
|
|
int _PyUnicode_IsLowercase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & LOWER_MASK) != 0;
|
|
}
|
|
|
|
/* Returns 1 for Unicode characters having the category 'Lu', 0
|
|
otherwise. */
|
|
|
|
int _PyUnicode_IsUppercase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & UPPER_MASK) != 0;
|
|
}
|
|
|
|
/* Returns the uppercase Unicode characters corresponding to ch or just
|
|
ch if no uppercase mapping is known. */
|
|
|
|
Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
|
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
|
|
return ch + ctype->upper;
|
|
}
|
|
|
|
/* Returns the lowercase Unicode characters corresponding to ch or just
|
|
ch if no lowercase mapping is known. */
|
|
|
|
Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
if (ctype->flags & EXTENDED_CASE_MASK)
|
|
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
|
|
return ch + ctype->lower;
|
|
}
|
|
|
|
/* Writes the full lowercase mapping of ch into res and returns the number
   of code points written.

   Ordinary records produce exactly one code point, ch plus the `lower`
   delta.  Records flagged EXTENDED_CASE_MASK encode a sequence in
   `lower`: the low 16 bits are the starting index into
   _PyUnicode_ExtendedCase and the bits from 24 upwards are the length.

   res must have room for the longest sequence the tables can produce;
   that bound is not checked here. */
int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->lower;
        return 1;
    }

    base = rec->lower & 0xFFFF;
    count = rec->lower >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
    }
    return count;
}
|
|
|
|
/* Writes the full titlecase mapping of ch into res and returns the number
   of code points written.

   Ordinary records produce exactly one code point, ch plus the `title`
   delta.  Records flagged EXTENDED_CASE_MASK encode a sequence in
   `title`: the low 16 bits are the starting index into
   _PyUnicode_ExtendedCase and the bits from 24 upwards are the length.

   res must have room for the longest sequence the tables can produce;
   that bound is not checked here. */
int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->title;
        return 1;
    }

    base = rec->title & 0xFFFF;
    count = rec->title >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
    }
    return count;
}
|
|
|
|
/* Writes the full uppercase mapping of ch into res and returns the number
   of code points written.

   Ordinary records produce exactly one code point, ch plus the `upper`
   delta.  Records flagged EXTENDED_CASE_MASK encode a sequence in
   `upper`: the low 16 bits are the starting index into
   _PyUnicode_ExtendedCase and the bits from 24 upwards are the length.

   res must have room for the longest sequence the tables can produce;
   that bound is not checked here. */
int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        res[0] = ch + rec->upper;
        return 1;
    }

    base = rec->upper & 0xFFFF;
    count = rec->upper >> 24;
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
    }
    return count;
}
|
|
|
|
/* Writes the full case-folded form of ch into res and returns the number
   of code points written.

   A dedicated folding exists only for records flagged EXTENDED_CASE_MASK
   whose `lower` field has a non-zero length in bits 20-22.  That sequence
   is stored in _PyUnicode_ExtendedCase directly after the lowercase
   sequence: its start is the lowercase index (low 16 bits) plus the
   lowercase length (bits 24 and up).  In every other case the result is
   whatever _PyUnicode_ToLowerFull() produces.

   res must have room for the longest sequence the tables can produce;
   that bound is not checked here. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int base, count, k;

    if (!(rec->flags & EXTENDED_CASE_MASK)) {
        return _PyUnicode_ToLowerFull(ch, res);
    }

    count = (rec->lower >> 20) & 7;
    if (count == 0) {
        /* No separate folding recorded: fall back to lowercasing. */
        return _PyUnicode_ToLowerFull(ch, res);
    }

    base = (rec->lower & 0xFFFF) + (rec->lower >> 24);
    for (k = 0; k < count; k++) {
        res[k] = _PyUnicode_ExtendedCase[base + k];
    }
    return count;
}
|
|
|
|
/* Returns 1 if CASED_MASK is set in ch's type record, otherwise 0. */
int _PyUnicode_IsCased(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASED_MASK) ? 1 : 0;
}
|
|
|
|
/* Returns 1 if CASE_IGNORABLE_MASK is set in ch's type record, otherwise
   0. */
int _PyUnicode_IsCaseIgnorable(Py_UCS4 ch)
{
    return (gettyperecord(ch)->flags & CASE_IGNORABLE_MASK) ? 1 : 0;
}
|
|
|
|
/* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
|
|
'Lo' or 'Lm', 0 otherwise. */
|
|
|
|
int _PyUnicode_IsAlpha(Py_UCS4 ch)
|
|
{
|
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
|
|
|
return (ctype->flags & ALPHA_MASK) != 0;
|
|
}
|
|
|