mirror of
https://github.com/python/cpython.git
synced 2025-11-01 18:51:43 +00:00
Implement PEP 3120.
This commit is contained in:
parent
5de17db361
commit
447d33ead6
5 changed files with 69 additions and 6 deletions
|
|
@ -444,6 +444,34 @@ static void fp_ungetc(int c, struct tok_state *tok) {
|
|||
ungetc(c, tok->fp);
|
||||
}
|
||||
|
||||
/* Check whether the bytes at s start a well-formed UTF-8 sequence.
   Return the number of bytes forming the sequence if yes, 0 if not.

   s must point into a NUL-terminated buffer.  Bytes are examined in
   increasing order and the scan stops at the first byte that is not a
   valid continuation byte, so a sequence truncated by the terminating
   NUL never causes a read past that NUL.

   Besides the basic lead/continuation structure, this rejects the
   ill-formed forms excluded by the UTF-8 definition: overlong
   encodings (lead bytes 0xC0/0xC1, E0 80..9F, F0 80..8F), UTF-16
   surrogates (ED A0..BF) and code points above U+10FFFF (F4 90..,
   lead bytes 0xF5 and up).  A NUL byte counts as a single-byte code,
   so the result is 1 for an empty string. */
static int valid_utf8(const unsigned char *s)
{
    /* Allowed range for the first continuation byte; narrowed below
       for the lead bytes whose full range would admit ill-formed
       sequences. */
    unsigned char first_min = 0x80;
    unsigned char first_max = 0xBF;
    int expected;   /* number of continuation bytes after the lead */
    int i;

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xC2) {
        /* 0x80-0xBF: stray continuation byte;
           0xC0-0xC1: overlong encoding of an ASCII character */
        return 0;
    }
    if (*s < 0xE0) {
        expected = 1;
    }
    else if (*s < 0xF0) {
        expected = 2;
        if (*s == 0xE0)
            first_min = 0xA0;   /* below this: overlong */
        else if (*s == 0xED)
            first_max = 0x9F;   /* above this: UTF-16 surrogates */
    }
    else if (*s < 0xF5) {
        expected = 3;
        if (*s == 0xF0)
            first_min = 0x90;   /* below this: overlong */
        else if (*s == 0xF4)
            first_max = 0x8F;   /* above this: beyond U+10FFFF */
    }
    else {
        /* 0xF5-0xFF can never start a UTF-8 sequence */
        return 0;
    }

    /* A NUL here is below first_min, so we return before touching
       any byte after the terminator. */
    if (s[1] < first_min || s[1] > first_max)
        return 0;
    for (i = 2; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] > 0xBF)
            return 0;
    }
    return expected + 1;
}
|
||||
|
||||
/* Read a line of input from TOK. Determine encoding
|
||||
if necessary. */
|
||||
|
||||
|
|
@ -478,12 +506,13 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
|
|||
}
|
||||
}
|
||||
#ifndef PGEN
|
||||
/* The default encoding is ASCII, so make sure we don't have any
|
||||
non-ASCII bytes in it. */
|
||||
/* The default encoding is UTF-8, so make sure we don't have any
|
||||
non-UTF-8 sequences in it. */
|
||||
if (line && !tok->encoding) {
|
||||
unsigned char *c;
|
||||
for (c = (unsigned char *)line; *c; c++)
|
||||
if (*c > 127) {
|
||||
int length;
|
||||
for (c = (unsigned char *)line; *c; c += length)
|
||||
if (!(length = valid_utf8(c))) {
|
||||
badchar = *c;
|
||||
break;
|
||||
}
|
||||
|
|
@ -493,7 +522,7 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
|
|||
/* Need to add 1 to the line number, since this line
|
||||
has not been counted, yet. */
|
||||
sprintf(buf,
|
||||
"Non-ASCII character '\\x%.2x' "
|
||||
"Non-UTF-8 code starting with '\\x%.2x' "
|
||||
"in file %.200s on line %i, "
|
||||
"but no encoding declared; "
|
||||
"see http://www.python.org/peps/pep-0263.html for details",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue