Merged revisions 79494,79496 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk

........
  r79494 | florent.xicluna | 2010-03-30 10:24:06 +0200 (Tue, 30 Mar 2010) | 2 lines

  #7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to
  Unicode Standard Annex #14.
........
  r79496 | florent.xicluna | 2010-03-30 18:29:03 +0200 (Tue, 30 Mar 2010) | 2 lines

  Highlight the change of behavior related to r79494. Now VT and FF are
  linebreaks.
........
parent 364129ef5a
commit 806d8cf0e8

5 changed files with 52 additions and 12 deletions
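
As a quick illustration of the behavior change in the commit message (a minimal sketch; it assumes an interpreter built with this change, and the sample strings are made up):

# Minimal sketch: VT (0x0B) and FF (0x0C) now act as line boundaries in
# str.splitlines(), alongside LF and CR.
for sep in ('\x0b', '\x0c'):
    assert ('one' + sep + 'two').splitlines() == ['one', 'two']
# Before this change the same strings came back unsplit, e.g. ['one\x0btwo'].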
Lib/test/test_unicodedata.py

@@ -25,7 +25,7 @@ class UnicodeMethodsTest(unittest.TestCase):
 
     def test_method_checksum(self):
         h = hashlib.sha1()
-        for i in range(65536):
+        for i in range(0x10000):
             char = chr(i)
             data = [
                 # Predicates (single char)

@@ -284,6 +284,17 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
         self.assertEqual("\u01c5".title(), "\u01c5")
         self.assertEqual("\u01c6".title(), "\u01c5")
 
+    def test_linebreak_7643(self):
+        for i in range(0x10000):
+            lines = (chr(i) + 'A').splitlines()
+            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
+                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
+                self.assertEqual(len(lines), 2,
+                                 r"\u%.4x should be a linebreak" % i)
+            else:
+                self.assertEqual(len(lines), 1,
+                                 r"\u%.4x should not be a linebreak" % i)
+
 def test_main():
     test.support.run_unittest(
         UnicodeMiscTest,
Misc/NEWS

@@ -293,6 +293,11 @@ C-API
 Library
 -------
 
+- Backwards incompatible change: Unicode codepoints line tabulation (0x0B) and
+  form feed (0x0C) are now considered linebreaks, as specified in Unicode
+  Standard Annex #14.  See issue #7643.
+  http://www.unicode.org/reports/tr14/
+
 - Comparisons using one of <, <=, >, >= between a complex instance and
   a Fractions instance now raise TypeError instead of returning
   True/False.  This makes Fraction <=> complex comparisons consistent with
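
To illustrate the compatibility note above (a hypothetical example; the record format and sample text are made up): code that relied on form feed staying inside a single line can split on explicit newline characters instead of calling splitlines().

# Hypothetical data that uses form feed (U+000C) as an in-line page marker.
record = "page one\x0cpage two\nnext record"

# With this change splitlines() treats the form feed as a line boundary:
print(record.splitlines())   # ['page one', 'page two', 'next record']

# Code that depends on the old behavior can split on '\n' explicitly:
print(record.split('\n'))    # ['page one\x0cpage two', 'next record']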
Objects/unicodeobject.c

@@ -126,9 +126,9 @@ static const char unicode_default_encoding[] = "utf-8";
 /* Fast detection of the most frequent whitespace characters */
 const unsigned char _Py_ascii_whitespace[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
-/*     case 0x0009: * HORIZONTAL TABULATION */
+/*     case 0x0009: * CHARACTER TABULATION */
 /*     case 0x000A: * LINE FEED */
-/*     case 0x000B: * VERTICAL TABULATION */
+/*     case 0x000B: * LINE TABULATION */
 /*     case 0x000C: * FORM FEED */
 /*     case 0x000D: * CARRIAGE RETURN */
     0, 1, 1, 1, 1, 1, 0, 0,

@@ -163,8 +163,10 @@ static PyObject *unicode_encode_call_errorhandler(const char *errors,
 static unsigned char ascii_linebreak[] = {
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x000A, * LINE FEED */
+/*         0x000B, * LINE TABULATION */
+/*         0x000C, * FORM FEED */
 /*         0x000D, * CARRIAGE RETURN */
-    0, 0, 1, 0, 0, 1, 0, 0,
+    0, 0, 1, 1, 1, 1, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0,
 /*         0x001C, * FILE SEPARATOR */
 /*         0x001D, * GROUP SEPARATOR */
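
The ascii_linebreak array above is a per-code-point lookup table, so linebreak tests on low code points cost a single array index. A rough Python analog of the same idea, restricted to the ASCII-range boundaries asserted by the new test (illustrative only, not CPython's implementation):

# Illustrative lookup table mirroring the ASCII-range linebreaks that the
# new test_linebreak_7643 asserts (0x0A-0x0D and 0x1C-0x1E).
ascii_linebreak = bytearray(128)
for cp in (0x0A, 0x0B, 0x0C, 0x0D, 0x1C, 0x1D, 0x1E):
    ascii_linebreak[cp] = 1

def is_ascii_linebreak(ch):
    cp = ord(ch)
    return cp < 128 and ascii_linebreak[cp] == 1

assert is_ascii_linebreak('\x0c') and not is_ascii_linebreak('A')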
Objects/unicodetype_db.h

@@ -694,7 +694,7 @@ static unsigned char index1[] = {
 };
 
 static unsigned char index2[] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 3, 3, 3, 2, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
     6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 5, 5, 5, 5, 5, 5, 16, 16, 16, 16,
     16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,

@@ -3395,13 +3395,16 @@ int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)
 #endif
 }
 
-/* Returns 1 for Unicode characters having the category 'Zl',
- * 'Zp' or type 'B', 0 otherwise.
+/* Returns 1 for Unicode characters having the line break
+ * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
+ * type 'B', 0 otherwise.
  */
 int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)
 {
     switch (ch) {
     case 0x000A:
+    case 0x000B:
+    case 0x000C:
     case 0x000D:
     case 0x001C:
     case 0x001D:
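
The rewritten comment states the rule the generated switch encodes: a code point is a linebreak if its Line_Break class is one of the mandatory classes (BK, CR, LF, NL) or its bidirectional class is B. A small illustrative check of the bidirectional half using the unicodedata module (the Line_Break class itself is not exposed by unicodedata):

import unicodedata

# The ten boundaries asserted by the new test; those reported as 'B' are
# covered by the bidirectional half of the rule, the rest come from the
# mandatory break classes read out of LineBreak.txt.
for cp in (0x0A, 0x0B, 0x0C, 0x0D, 0x1C, 0x1D, 0x1E, 0x85, 0x2028, 0x2029):
    print('U+%04X bidi=%s' % (cp, unicodedata.bidirectional(chr(cp))))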
Tools/unicode/makeunicodedata.py

@@ -38,6 +38,7 @@ EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 UNIHAN = "Unihan%s.txt"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
+LINE_BREAK = "LineBreak%s.txt"
 
 old_versions = ["3.2.0"]
 

@@ -52,6 +53,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
 
 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
 
+MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
+
 # note: should match definitions in Objects/unicodectype.c
 ALPHA_MASK = 0x01
 DECIMAL_MASK = 0x02

@@ -77,7 +80,8 @@ def maketables(trace=0):
                           EASTASIAN_WIDTH % version,
                           UNIHAN % version,
                           DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version)
+                          DERIVEDNORMALIZATION_PROPS % version,
+                          LINE_BREAK % version)
 
     print(len(list(filter(None, unicode.table))), "characters")
 

@@ -378,7 +382,7 @@ def makeunicodetype(unicode, trace):
                 flags |= ALPHA_MASK
             if category == "Ll":
                 flags |= LOWER_MASK
-            if category == "Zl" or bidirectional == "B":
+            if 'Line_Break' in properties or bidirectional == "B":
                 flags |= LINEBREAK_MASK
                 linebreaks.append(char)
             if category == "Zs" or bidirectional in ("WS", "B", "S"):

@@ -537,8 +541,9 @@ def makeunicodetype(unicode, trace):
     print(file=fp)
 
     # Generate code for _PyUnicode_IsLinebreak()
-    print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
-    print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
+    print("/* Returns 1 for Unicode characters having the line break", file=fp)
+    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
+    print(" * type 'B', 0 otherwise.", file=fp)
     print(" */", file=fp)
     print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
     print('{', file=fp)

@@ -826,7 +831,8 @@ class UnicodeData:
     # derived-props] (17)
 
     def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, expand=1):
+                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+                 expand=1):
         self.changed = []
         file = open(filename)
         table = [None] * 0x110000

@@ -912,6 +918,19 @@ class UnicodeData:
                     # apply to unassigned code points; ignore them
                     table[char][-1].add(p)
 
+        if linebreakprops:
+            for s in open(linebreakprops):
+                s = s.partition('#')[0]
+                s = [i.strip() for i in s.split(';')]
+                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                    continue
+                if '..' not in s[0]:
+                    first = last = int(s[0], 16)
+                else:
+                    first, last = [int(c, 16) for c in s[0].split('..')]
+                for char in range(first, last+1):
+                    table[char][-1].add('Line_Break')
+
         if derivednormalizationprops:
             quickchecks = [0] * 0x110000 # default is Yes
             qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
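
For reference, LineBreak.txt entries have the form "codepoint;class # comment", with ranges written as "XXXX..YYYY". A standalone run of the same parsing steps on two made-up sample lines (hypothetical input; only the mandatory-break classes are kept):

# Hypothetical sample lines in the LineBreak.txt format.
MANDATORY_LINE_BREAKS = ["BK", "CR", "LF", "NL"]
samples = [
    "000B..000C;BK # <control-000B>..<control-000C>",
    "0020;SP # SPACE",
]
for s in samples:
    s = s.partition('#')[0]
    s = [i.strip() for i in s.split(';')]
    if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
        continue
    if '..' not in s[0]:
        first = last = int(s[0], 16)
    else:
        first, last = [int(c, 16) for c in s[0].split('..')]
    print('mandatory break:', ['U+%04X' % c for c in range(first, last + 1)])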