#7643: Unicode codepoints VT (0x0B) and FF (0x0C) are linebreaks according to Unicode Standard Annex #14.

This commit is contained in:
Florent Xicluna 2010-03-30 08:24:06 +00:00
parent e6410c536c
commit 22b243809e
5 changed files with 65 additions and 13 deletions

View file

@ -115,9 +115,9 @@ static char unicode_default_encoding[100];
/* Fast detection of the most frequent whitespace characters */
const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* case 0x0009: * HORIZONTAL TABULATION */
/* case 0x0009: * CHARACTER TABULATION */
/* case 0x000A: * LINE FEED */
/* case 0x000B: * VERTICAL TABULATION */
/* case 0x000B: * LINE TABULATION */
/* case 0x000C: * FORM FEED */
/* case 0x000D: * CARRIAGE RETURN */
0, 1, 1, 1, 1, 1, 0, 0,
@ -147,8 +147,10 @@ const unsigned char _Py_ascii_whitespace[] = {
static unsigned char ascii_linebreak[] = {
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x000A, * LINE FEED */
/* 0x000B, * LINE TABULATION */
/* 0x000C, * FORM FEED */
/* 0x000D, * CARRIAGE RETURN */
0, 0, 1, 0, 0, 1, 0, 0,
0, 0, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
/* 0x001C, * FILE SEPARATOR */
/* 0x001D, * GROUP SEPARATOR */