mirror of
https://github.com/python/cpython.git
synced 2025-08-03 00:23:06 +00:00
Issue #23181: More "codepoint" -> "code point".
This commit is contained in:
commit
82e07b92b3
17 changed files with 28 additions and 28 deletions
|
@ -514,7 +514,7 @@ The RFC does not explicitly forbid JSON strings which contain byte sequences
|
|||
that don't correspond to valid Unicode characters (e.g. unpaired UTF-16
|
||||
surrogates), but it does note that they may cause interoperability problems.
|
||||
By default, this module accepts and outputs (when present in the original
|
||||
:class:`str`) codepoints for such sequences.
|
||||
:class:`str`) code points for such sequences.
|
||||
|
||||
|
||||
Infinite and NaN Number Values
|
||||
|
|
|
@ -124,7 +124,7 @@ class Codec:
|
|||
Python will use the official U+FFFD REPLACEMENT
|
||||
CHARACTER for the builtin Unicode codecs on
|
||||
decoding and '?' on encoding.
|
||||
'surrogateescape' - replace with private codepoints U+DCnn.
|
||||
'surrogateescape' - replace with private code points U+DCnn.
|
||||
'xmlcharrefreplace' - Replace with the appropriate XML
|
||||
character reference (only for encoding).
|
||||
'backslashreplace' - Replace with backslashed escape sequences
|
||||
|
|
|
@ -273,7 +273,7 @@ class Message:
|
|||
bpayload = payload.encode('ascii')
|
||||
except UnicodeError:
|
||||
# This won't happen for RFC compliant messages (messages
|
||||
# containing only ASCII codepoints in the unicode input).
|
||||
# containing only ASCII code points in the unicode input).
|
||||
# If it does happen, turn the string into bytes in a way
|
||||
# guaranteed not to fail.
|
||||
bpayload = payload.encode('raw-unicode-escape')
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
__all__ = ['html5', 'name2codepoint', 'codepoint2name', 'entitydefs']
|
||||
|
||||
|
||||
# maps the HTML entity name to the Unicode codepoint
|
||||
# maps the HTML entity name to the Unicode code point
|
||||
name2codepoint = {
|
||||
'AElig': 0x00c6, # latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
|
||||
'Aacute': 0x00c1, # latin capital letter A with acute, U+00C1 ISOlat1
|
||||
|
@ -2495,7 +2495,7 @@ html5 = {
|
|||
'zwnj;': '\u200c',
|
||||
}
|
||||
|
||||
# maps the Unicode codepoint to the HTML entity name
|
||||
# maps the Unicode code point to the HTML entity name
|
||||
codepoint2name = {}
|
||||
|
||||
# maps the HTML entity name to the character
|
||||
|
|
|
@ -21,7 +21,7 @@ class TestBase:
|
|||
roundtriptest = 1 # set if roundtrip is possible with unicode
|
||||
has_iso10646 = 0 # set if this encoding contains whole iso10646 map
|
||||
xmlcharnametest = None # string to test xmlcharrefreplace
|
||||
unmappedunicode = '\udeee' # a unicode codepoint that is not mapped.
|
||||
unmappedunicode = '\udeee' # a unicode code point that is not mapped.
|
||||
|
||||
def setUp(self):
|
||||
if self.codec is None:
|
||||
|
|
|
@ -48,10 +48,10 @@ class HtmlTests(unittest.TestCase):
|
|||
check(s % num, char)
|
||||
for end in [' ', 'X']:
|
||||
check((s+end) % num, char+end)
|
||||
# check invalid codepoints
|
||||
# check invalid code points
|
||||
for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
|
||||
check_num(cp, '\uFFFD')
|
||||
# check more invalid codepoints
|
||||
# check more invalid code points
|
||||
for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
|
||||
check_num(cp, '')
|
||||
# check invalid numbers
|
||||
|
|
|
@ -80,7 +80,7 @@ class Test_IncrementalEncoder(unittest.TestCase):
|
|||
self.assertEqual(encoder.reset(), None)
|
||||
|
||||
def test_stateful(self):
|
||||
# jisx0213 encoder is stateful for a few codepoints. eg)
|
||||
# jisx0213 encoder is stateful for a few code points, e.g.:
|
||||
# U+00E6 => A9DC
|
||||
# U+00E6 U+0300 => ABC4
|
||||
# U+0300 => ABDC
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# To fully test this module, we would need a copy of the stringprep tables.
|
||||
# Since we don't have them, this test checks only a few codepoints.
|
||||
# Since we don't have them, this test checks only a few code points.
|
||||
|
||||
import unittest
|
||||
from test import support
|
||||
|
|
|
@ -1470,9 +1470,9 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
def test_utf8_decode_invalid_sequences(self):
|
||||
# continuation bytes in a sequence of 2, 3, or 4 bytes
|
||||
continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
|
||||
# start bytes of a 2-byte sequence equivalent to codepoints < 0x7F
|
||||
# start bytes of a 2-byte sequence equivalent to code points <= 0x7F
|
||||
invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
|
||||
# start bytes of a 4-byte sequence equivalent to codepoints > 0x10FFFF
|
||||
# start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
|
||||
invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
|
||||
invalid_start_bytes = (
|
||||
continuation_bytes + invalid_2B_seq_start_bytes +
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
#undef hz
|
||||
#endif
|
||||
|
||||
/* GBK and GB2312 map differently in few codepoints that are listed below:
|
||||
/* GBK and GB2312 map differently in a few code points that are listed below:
|
||||
*
|
||||
* gb2312 gbk
|
||||
* A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT
|
||||
|
|
|
@ -171,7 +171,7 @@ DECODER(big5hkscs)
|
|||
default: return 1;
|
||||
}
|
||||
|
||||
NEXT_IN(2); /* all decoded codepoints are pairs, above. */
|
||||
NEXT_IN(2); /* all decoded code points are pairs, above. */
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -69,7 +69,7 @@ ENCODER(euc_kr)
|
|||
OUTBYTE1(EUCKR_JAMO_FIRSTBYTE);
|
||||
OUTBYTE2(EUCKR_JAMO_FILLER);
|
||||
|
||||
/* All codepoints in CP949 extension are in unicode
|
||||
/* All code points in CP949 extension are in unicode
|
||||
* Hangul Syllable area. */
|
||||
assert(0xac00 <= c && c <= 0xd7a3);
|
||||
c -= 0xac00;
|
||||
|
|
|
@ -12,10 +12,10 @@
|
|||
#include "multibytecodec.h"
|
||||
|
||||
|
||||
/* a unicode "undefined" codepoint */
|
||||
/* a unicode "undefined" code point */
|
||||
#define UNIINV 0xFFFE
|
||||
|
||||
/* internal-use DBCS codepoints which aren't used by any charsets */
|
||||
/* internal-use DBCS code points which aren't used by any charsets */
|
||||
#define NOCHAR 0xFFFF
|
||||
#define MULTIC 0xFFFE
|
||||
#define DBCINV 0xFFFD
|
||||
|
|
|
@ -976,7 +976,7 @@ is_unified_ideograph(Py_UCS4 code)
|
|||
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
|
||||
}
|
||||
|
||||
/* macros used to determine if the given codepoint is in the PUA range that
|
||||
/* macros used to determine if the given code point is in the PUA range that
|
||||
* we are using to store aliases and named sequences */
|
||||
#define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
|
||||
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
|
||||
|
@ -986,7 +986,7 @@ static int
|
|||
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
|
||||
int with_alias_and_seq)
|
||||
{
|
||||
/* Find the name associated with the given codepoint.
|
||||
/* Find the name associated with the given code point.
|
||||
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
|
||||
* that we are using for aliases and named sequences. */
|
||||
int offset;
|
||||
|
@ -997,7 +997,7 @@ _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
|
|||
if (code >= 0x110000)
|
||||
return 0;
|
||||
|
||||
/* XXX should we just skip all the codepoints in the PUAs here? */
|
||||
/* XXX should we just skip all the code points in the PUAs here? */
|
||||
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
|
||||
return 0;
|
||||
|
||||
|
@ -1125,8 +1125,8 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
|
|||
/* check if named sequences are allowed */
|
||||
if (!with_named_seq && IS_NAMED_SEQ(cp))
|
||||
return 0;
|
||||
/* if the codepoint is in the PUA range that we use for aliases,
|
||||
* convert it to obtain the right codepoint */
|
||||
/* if the code point is in the PUA range that we use for aliases,
|
||||
* convert it to obtain the right code point */
|
||||
if (IS_ALIAS(cp))
|
||||
*code = name_aliases[cp-aliases_start];
|
||||
else
|
||||
|
@ -1138,9 +1138,9 @@ static int
|
|||
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
|
||||
int with_named_seq)
|
||||
{
|
||||
/* Return the codepoint associated with the given name.
|
||||
/* Return the code point associated with the given name.
|
||||
* Named aliases are resolved too (unless self != NULL (i.e. we are using
|
||||
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
|
||||
* 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
|
||||
* using for the named sequence, and the caller must then convert it. */
|
||||
unsigned int h, v;
|
||||
unsigned int mask = code_size-1;
|
||||
|
|
|
@ -5013,7 +5013,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
|||
}
|
||||
|
||||
if (Py_UNICODE_IS_SURROGATE(ch)) {
|
||||
errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
|
||||
errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
|
||||
startinpos = ((const char *)q) - starts;
|
||||
endinpos = startinpos + 4;
|
||||
}
|
||||
|
@ -5032,7 +5032,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
|
|||
q += 4;
|
||||
continue;
|
||||
}
|
||||
errmsg = "codepoint not in range(0x110000)";
|
||||
errmsg = "code point not in range(0x110000)";
|
||||
startinpos = ((const char *)q) - starts;
|
||||
endinpos = startinpos + 4;
|
||||
}
|
||||
|
|
|
@ -1379,7 +1379,7 @@ hexversion -- version information encoded as a single integer\n\
|
|||
implementation -- Python implementation information.\n\
|
||||
int_info -- a struct sequence with information about the int implementation.\n\
|
||||
maxsize -- the largest supported length of containers.\n\
|
||||
maxunicode -- the value of the largest Unicode codepoint\n\
|
||||
maxunicode -- the value of the largest Unicode code point\n\
|
||||
platform -- platform identifier\n\
|
||||
prefix -- prefix used to find the Python library\n\
|
||||
thread_info -- a struct sequence with information about the thread implementation.\n\
|
||||
|
|
|
@ -34,7 +34,7 @@ MAX_TABLE_SIZE = 8192
|
|||
# Standard undefined Unicode code point
|
||||
UNI_UNDEFINED = chr(0xFFFE)
|
||||
|
||||
# Placeholder for a missing codepoint
|
||||
# Placeholder for a missing code point
|
||||
MISSING_CODE = -1
|
||||
|
||||
mapRE = re.compile('((?:0x[0-9a-fA-F]+\+?)+)'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue