mirror of
https://github.com/python/cpython.git
synced 2025-07-24 11:44:31 +00:00
Issue #17693: CJK encoders now use the new Unicode API (PEP 393)
This commit is contained in:
parent
71557596b2
commit
d949126995
9 changed files with 430 additions and 418 deletions
|
@ -42,16 +42,18 @@
|
|||
|
||||
ENCODER(gb2312)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
TRYMAP_ENC(gbcommon, code, c);
|
||||
|
@ -60,9 +62,9 @@ ENCODER(gb2312)
|
|||
if (code & 0x8000) /* MSB set: GBK */
|
||||
return 1;
|
||||
|
||||
OUT1((code >> 8) | 0x80)
|
||||
OUT2((code & 0xFF) | 0x80)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1((code >> 8) | 0x80)
|
||||
OUTBYTE2((code & 0xFF) | 0x80)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -80,7 +82,7 @@ DECODER(gb2312)
|
|||
}
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 ^ 0x80) {
|
||||
TRYMAP_DEC(gb2312, writer, c ^ 0x80, INBYTE2 ^ 0x80) {
|
||||
NEXT_IN(2);
|
||||
}
|
||||
else return 1;
|
||||
|
@ -96,28 +98,30 @@ DECODER(gb2312)
|
|||
|
||||
ENCODER(gbk)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
|
||||
GBK_ENCODE(c, code)
|
||||
else return 1;
|
||||
|
||||
OUT1((code >> 8) | 0x80)
|
||||
OUTBYTE1((code >> 8) | 0x80)
|
||||
if (code & 0x8000)
|
||||
OUT2((code & 0xFF)) /* MSB set: GBK */
|
||||
OUTBYTE2((code & 0xFF)) /* MSB set: GBK */
|
||||
else
|
||||
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
|
||||
NEXT(1, 2)
|
||||
OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -126,7 +130,7 @@ ENCODER(gbk)
|
|||
DECODER(gbk)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -136,7 +140,7 @@ DECODER(gbk)
|
|||
|
||||
REQUIRE_INBUF(2)
|
||||
|
||||
GBK_DECODE(c, IN2, writer)
|
||||
GBK_DECODE(c, INBYTE2, writer)
|
||||
else return 1;
|
||||
|
||||
NEXT_IN(2);
|
||||
|
@ -152,41 +156,31 @@ DECODER(gbk)
|
|||
|
||||
ENCODER(gb18030)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1(c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1(c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
DECODE_SURROGATE(c)
|
||||
if (c > 0x10FFFF)
|
||||
#if Py_UNICODE_SIZE == 2
|
||||
return 2; /* surrogates pair */
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
else if (c >= 0x10000) {
|
||||
if (c >= 0x10000) {
|
||||
Py_UCS4 tc = c - 0x10000;
|
||||
assert (c <= 0x10FFFF);
|
||||
|
||||
REQUIRE_OUTBUF(4)
|
||||
|
||||
OUT4((unsigned char)(tc % 10) + 0x30)
|
||||
OUTBYTE4((unsigned char)(tc % 10) + 0x30)
|
||||
tc /= 10;
|
||||
OUT3((unsigned char)(tc % 126) + 0x81)
|
||||
OUTBYTE3((unsigned char)(tc % 126) + 0x81)
|
||||
tc /= 126;
|
||||
OUT2((unsigned char)(tc % 10) + 0x30)
|
||||
OUTBYTE2((unsigned char)(tc % 10) + 0x30)
|
||||
tc /= 10;
|
||||
OUT1((unsigned char)(tc + 0x90))
|
||||
OUTBYTE1((unsigned char)(tc + 0x90))
|
||||
|
||||
#if Py_UNICODE_SIZE == 2
|
||||
NEXT(2, 4) /* surrogates pair */
|
||||
#else
|
||||
NEXT(1, 4)
|
||||
#endif
|
||||
NEXT(1, 4);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -209,15 +203,15 @@ ENCODER(gb18030)
|
|||
tc = c - utrrange->first +
|
||||
utrrange->base;
|
||||
|
||||
OUT4((unsigned char)(tc % 10) + 0x30)
|
||||
OUTBYTE4((unsigned char)(tc % 10) + 0x30)
|
||||
tc /= 10;
|
||||
OUT3((unsigned char)(tc % 126) + 0x81)
|
||||
OUTBYTE3((unsigned char)(tc % 126) + 0x81)
|
||||
tc /= 126;
|
||||
OUT2((unsigned char)(tc % 10) + 0x30)
|
||||
OUTBYTE2((unsigned char)(tc % 10) + 0x30)
|
||||
tc /= 10;
|
||||
OUT1((unsigned char)tc + 0x81)
|
||||
OUTBYTE1((unsigned char)tc + 0x81)
|
||||
|
||||
NEXT(1, 4)
|
||||
NEXT(1, 4);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -226,13 +220,13 @@ ENCODER(gb18030)
|
|||
continue;
|
||||
}
|
||||
|
||||
OUT1((code >> 8) | 0x80)
|
||||
OUTBYTE1((code >> 8) | 0x80)
|
||||
if (code & 0x8000)
|
||||
OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
|
||||
OUTBYTE2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
|
||||
else
|
||||
OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
|
||||
OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
|
||||
|
||||
NEXT(1, 2)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -241,7 +235,7 @@ ENCODER(gb18030)
|
|||
DECODER(gb18030)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1, c2;
|
||||
unsigned char c = INBYTE1, c2;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -251,15 +245,15 @@ DECODER(gb18030)
|
|||
|
||||
REQUIRE_INBUF(2)
|
||||
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
|
||||
const struct _gb18030_to_unibmp_ranges *utr;
|
||||
unsigned char c3, c4;
|
||||
Py_UCS4 lseq;
|
||||
|
||||
REQUIRE_INBUF(4)
|
||||
c3 = IN3;
|
||||
c4 = IN4;
|
||||
c3 = INBYTE3;
|
||||
c4 = INBYTE4;
|
||||
if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
|
||||
return 1;
|
||||
c -= 0x81; c2 -= 0x30;
|
||||
|
@ -313,33 +307,34 @@ ENCODER_INIT(hz)
|
|||
ENCODER_RESET(hz)
|
||||
{
|
||||
if (state->i != 0) {
|
||||
WRITE2('~', '}')
|
||||
WRITEBYTE2('~', '}')
|
||||
state->i = 0;
|
||||
NEXT_OUT(2)
|
||||
NEXT_OUT(2);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
ENCODER(hz)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
if (state->i == 0) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
}
|
||||
else {
|
||||
WRITE3('~', '}', (unsigned char)c)
|
||||
NEXT(1, 3)
|
||||
WRITEBYTE3('~', '}', (unsigned char)c)
|
||||
NEXT(1, 3);
|
||||
state->i = 0;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
UCS4INVALID(c)
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
TRYMAP_ENC(gbcommon, code, c);
|
||||
else return 1;
|
||||
|
@ -348,13 +343,13 @@ ENCODER(hz)
|
|||
return 1;
|
||||
|
||||
if (state->i == 0) {
|
||||
WRITE4('~', '{', code >> 8, code & 0xff)
|
||||
NEXT(1, 4)
|
||||
WRITEBYTE4('~', '{', code >> 8, code & 0xff)
|
||||
NEXT(1, 4);
|
||||
state->i = 1;
|
||||
}
|
||||
else {
|
||||
WRITE2(code >> 8, code & 0xff)
|
||||
NEXT(1, 2)
|
||||
WRITEBYTE2(code >> 8, code & 0xff)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -376,10 +371,10 @@ DECODER_RESET(hz)
|
|||
DECODER(hz)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c == '~') {
|
||||
unsigned char c2 = IN2;
|
||||
unsigned char c2 = INBYTE2;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
if (c2 == '~') {
|
||||
|
@ -408,7 +403,7 @@ DECODER(hz)
|
|||
}
|
||||
else { /* GB mode */
|
||||
REQUIRE_INBUF(2)
|
||||
TRYMAP_DEC(gb2312, writer, c, IN2) {
|
||||
TRYMAP_DEC(gb2312, writer, c, INBYTE2) {
|
||||
NEXT_IN(2);
|
||||
}
|
||||
else
|
||||
|
|
|
@ -38,35 +38,39 @@ static const DBCHAR big5hkscs_pairenc_table[4] = {0x8862, 0x8864, 0x88a3, 0x88a5
|
|||
|
||||
ENCODER(big5hkscs)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = **inbuf;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
Py_ssize_t insize;
|
||||
|
||||
if (c < 0x80) {
|
||||
REQUIRE_OUTBUF(1)
|
||||
**outbuf = (unsigned char)c;
|
||||
NEXT(1, 1)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
DECODE_SURROGATE(c)
|
||||
insize = GET_INSIZE(c);
|
||||
|
||||
insize = 1;
|
||||
REQUIRE_OUTBUF(2)
|
||||
|
||||
if (c < 0x10000) {
|
||||
TRYMAP_ENC(big5hkscs_bmp, code, c) {
|
||||
if (code == MULTIC) {
|
||||
if (inleft >= 2 &&
|
||||
Py_UCS4 c2;
|
||||
if (inlen - *inpos >= 2)
|
||||
c2 = INCHAR2;
|
||||
else
|
||||
c2 = 0;
|
||||
|
||||
if (inlen - *inpos >= 2 &&
|
||||
((c & 0xffdf) == 0x00ca) &&
|
||||
(((*inbuf)[1] & 0xfff7) == 0x0304)) {
|
||||
((c2 & 0xfff7) == 0x0304)) {
|
||||
code = big5hkscs_pairenc_table[
|
||||
((c >> 4) |
|
||||
((*inbuf)[1] >> 3)) & 3];
|
||||
(c2 >> 3)) & 3];
|
||||
insize = 2;
|
||||
}
|
||||
else if (inleft < 2 &&
|
||||
else if (inlen - *inpos < 2 &&
|
||||
!(flags & MBENC_FLUSH))
|
||||
return MBERR_TOOFEW;
|
||||
else {
|
||||
|
@ -89,9 +93,9 @@ ENCODER(big5hkscs)
|
|||
else
|
||||
return insize;
|
||||
|
||||
OUT1(code >> 8)
|
||||
OUT2(code & 0xFF)
|
||||
NEXT(insize, 2)
|
||||
OUTBYTE1(code >> 8)
|
||||
OUTBYTE2(code & 0xFF)
|
||||
NEXT(insize, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -102,7 +106,7 @@ ENCODER(big5hkscs)
|
|||
DECODER(big5hkscs)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
Py_UCS4 decoded;
|
||||
|
||||
if (c < 0x80) {
|
||||
|
@ -113,20 +117,20 @@ DECODER(big5hkscs)
|
|||
|
||||
REQUIRE_INBUF(2)
|
||||
|
||||
if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) {
|
||||
TRYMAP_DEC(big5, writer, c, IN2) {
|
||||
if (0xc6 > c || c > 0xc8 || (c < 0xc7 && INBYTE2 < 0xa1)) {
|
||||
TRYMAP_DEC(big5, writer, c, INBYTE2) {
|
||||
NEXT_IN(2);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2)
|
||||
TRYMAP_DEC_CHAR(big5hkscs, decoded, c, INBYTE2)
|
||||
{
|
||||
int s = BH2S(c, IN2);
|
||||
int s = BH2S(c, INBYTE2);
|
||||
const unsigned char *hintbase;
|
||||
|
||||
assert(0x87 <= c && c <= 0xfe);
|
||||
assert(0x40 <= IN2 && IN2 <= 0xfe);
|
||||
assert(0x40 <= INBYTE2 && INBYTE2 <= 0xfe);
|
||||
|
||||
if (BH2S(0x87, 0x40) <= s && s <= BH2S(0xa0, 0xfe)) {
|
||||
hintbase = big5hkscs_phint_0;
|
||||
|
@ -154,7 +158,7 @@ DECODER(big5hkscs)
|
|||
continue;
|
||||
}
|
||||
|
||||
switch ((c << 8) | IN2) {
|
||||
switch ((c << 8) | INBYTE2) {
|
||||
case 0x8862: OUTCHAR2(0x00ca, 0x0304); break;
|
||||
case 0x8864: OUTCHAR2(0x00ca, 0x030c); break;
|
||||
case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break;
|
||||
|
|
|
@ -141,13 +141,13 @@ ENCODER_INIT(iso2022)
|
|||
ENCODER_RESET(iso2022)
|
||||
{
|
||||
if (STATE_GETFLAG(F_SHIFTED)) {
|
||||
WRITE1(SI)
|
||||
NEXT_OUT(1)
|
||||
WRITEBYTE1(SI)
|
||||
NEXT_OUT(1);
|
||||
STATE_CLEARFLAG(F_SHIFTED)
|
||||
}
|
||||
if (STATE_G0 != CHARSET_ASCII) {
|
||||
WRITE3(ESC, '(', 'B')
|
||||
NEXT_OUT(3)
|
||||
WRITEBYTE3(ESC, '(', 'B')
|
||||
NEXT_OUT(3);
|
||||
STATE_SETG0(CHARSET_ASCII)
|
||||
}
|
||||
return 0;
|
||||
|
@ -155,30 +155,29 @@ ENCODER_RESET(iso2022)
|
|||
|
||||
ENCODER(iso2022)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
while (*inpos < inlen) {
|
||||
const struct iso2022_designation *dsg;
|
||||
DBCHAR encoded;
|
||||
Py_UCS4 c = **inbuf;
|
||||
Py_UCS4 c = INCHAR1;
|
||||
Py_ssize_t insize;
|
||||
|
||||
if (c < 0x80) {
|
||||
if (STATE_G0 != CHARSET_ASCII) {
|
||||
WRITE3(ESC, '(', 'B')
|
||||
WRITEBYTE3(ESC, '(', 'B')
|
||||
STATE_SETG0(CHARSET_ASCII)
|
||||
NEXT_OUT(3)
|
||||
NEXT_OUT(3);
|
||||
}
|
||||
if (STATE_GETFLAG(F_SHIFTED)) {
|
||||
WRITE1(SI)
|
||||
WRITEBYTE1(SI)
|
||||
STATE_CLEARFLAG(F_SHIFTED)
|
||||
NEXT_OUT(1)
|
||||
NEXT_OUT(1);
|
||||
}
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
DECODE_SURROGATE(c)
|
||||
insize = GET_INSIZE(c);
|
||||
insize = 1;
|
||||
|
||||
encoded = MAP_UNMAPPABLE;
|
||||
for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) {
|
||||
|
@ -187,24 +186,14 @@ ENCODER(iso2022)
|
|||
if (encoded == MAP_MULTIPLE_AVAIL) {
|
||||
/* this implementation won't work for pair
|
||||
* of non-bmp characters. */
|
||||
if (inleft < 2) {
|
||||
if (inlen - *inpos < 2) {
|
||||
if (!(flags & MBENC_FLUSH))
|
||||
return MBERR_TOOFEW;
|
||||
length = -1;
|
||||
}
|
||||
else
|
||||
length = 2;
|
||||
#if Py_UNICODE_SIZE == 2
|
||||
if (length == 2) {
|
||||
Py_UCS4 u4in[2];
|
||||
u4in[0] = (Py_UCS4)IN1;
|
||||
u4in[1] = (Py_UCS4)IN2;
|
||||
encoded = dsg->encoder(u4in, &length);
|
||||
} else
|
||||
encoded = dsg->encoder(&c, &length);
|
||||
#else
|
||||
encoded = dsg->encoder(&c, &length);
|
||||
#endif
|
||||
if (encoded != MAP_UNMAPPABLE) {
|
||||
insize = length;
|
||||
break;
|
||||
|
@ -221,47 +210,47 @@ ENCODER(iso2022)
|
|||
switch (dsg->plane) {
|
||||
case 0: /* G0 */
|
||||
if (STATE_GETFLAG(F_SHIFTED)) {
|
||||
WRITE1(SI)
|
||||
WRITEBYTE1(SI)
|
||||
STATE_CLEARFLAG(F_SHIFTED)
|
||||
NEXT_OUT(1)
|
||||
NEXT_OUT(1);
|
||||
}
|
||||
if (STATE_G0 != dsg->mark) {
|
||||
if (dsg->width == 1) {
|
||||
WRITE3(ESC, '(', ESCMARK(dsg->mark))
|
||||
WRITEBYTE3(ESC, '(', ESCMARK(dsg->mark))
|
||||
STATE_SETG0(dsg->mark)
|
||||
NEXT_OUT(3)
|
||||
NEXT_OUT(3);
|
||||
}
|
||||
else if (dsg->mark == CHARSET_JISX0208) {
|
||||
WRITE3(ESC, '$', ESCMARK(dsg->mark))
|
||||
WRITEBYTE3(ESC, '$', ESCMARK(dsg->mark))
|
||||
STATE_SETG0(dsg->mark)
|
||||
NEXT_OUT(3)
|
||||
NEXT_OUT(3);
|
||||
}
|
||||
else {
|
||||
WRITE4(ESC, '$', '(',
|
||||
WRITEBYTE4(ESC, '$', '(',
|
||||
ESCMARK(dsg->mark))
|
||||
STATE_SETG0(dsg->mark)
|
||||
NEXT_OUT(4)
|
||||
NEXT_OUT(4);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 1: /* G1 */
|
||||
if (STATE_G1 != dsg->mark) {
|
||||
if (dsg->width == 1) {
|
||||
WRITE3(ESC, ')', ESCMARK(dsg->mark))
|
||||
WRITEBYTE3(ESC, ')', ESCMARK(dsg->mark))
|
||||
STATE_SETG1(dsg->mark)
|
||||
NEXT_OUT(3)
|
||||
NEXT_OUT(3);
|
||||
}
|
||||
else {
|
||||
WRITE4(ESC, '$', ')',
|
||||
WRITEBYTE4(ESC, '$', ')',
|
||||
ESCMARK(dsg->mark))
|
||||
STATE_SETG1(dsg->mark)
|
||||
NEXT_OUT(4)
|
||||
NEXT_OUT(4);
|
||||
}
|
||||
}
|
||||
if (!STATE_GETFLAG(F_SHIFTED)) {
|
||||
WRITE1(SO)
|
||||
WRITEBYTE1(SO)
|
||||
STATE_SETFLAG(F_SHIFTED)
|
||||
NEXT_OUT(1)
|
||||
NEXT_OUT(1);
|
||||
}
|
||||
break;
|
||||
default: /* G2 and G3 is not supported: no encoding in
|
||||
|
@ -270,14 +259,14 @@ ENCODER(iso2022)
|
|||
}
|
||||
|
||||
if (dsg->width == 1) {
|
||||
WRITE1((unsigned char)encoded)
|
||||
NEXT_OUT(1)
|
||||
WRITEBYTE1((unsigned char)encoded)
|
||||
NEXT_OUT(1);
|
||||
}
|
||||
else {
|
||||
WRITE2(encoded >> 8, encoded & 0xff)
|
||||
NEXT_OUT(2)
|
||||
WRITEBYTE2(encoded >> 8, encoded & 0xff)
|
||||
NEXT_OUT(2);
|
||||
}
|
||||
NEXT_IN(insize);
|
||||
NEXT_INCHAR(insize);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -323,26 +312,26 @@ iso2022processesc(const void *config, MultibyteCodec_State *state,
|
|||
|
||||
switch (esclen) {
|
||||
case 3:
|
||||
if (IN2 == '$') {
|
||||
charset = IN3 | CHARSET_DBCS;
|
||||
if (INBYTE2 == '$') {
|
||||
charset = INBYTE3 | CHARSET_DBCS;
|
||||
designation = 0;
|
||||
}
|
||||
else {
|
||||
charset = IN3;
|
||||
if (IN2 == '(') designation = 0;
|
||||
else if (IN2 == ')') designation = 1;
|
||||
else if (CONFIG_ISSET(USE_G2) && IN2 == '.')
|
||||
charset = INBYTE3;
|
||||
if (INBYTE2 == '(') designation = 0;
|
||||
else if (INBYTE2 == ')') designation = 1;
|
||||
else if (CONFIG_ISSET(USE_G2) && INBYTE2 == '.')
|
||||
designation = 2;
|
||||
else return 3;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
if (IN2 != '$')
|
||||
if (INBYTE2 != '$')
|
||||
return 4;
|
||||
|
||||
charset = IN4 | CHARSET_DBCS;
|
||||
if (IN3 == '(') designation = 0;
|
||||
else if (IN3 == ')') designation = 1;
|
||||
charset = INBYTE4 | CHARSET_DBCS;
|
||||
if (INBYTE3 == '(') designation = 0;
|
||||
else if (INBYTE3 == ')') designation = 1;
|
||||
else return 4;
|
||||
break;
|
||||
case 6: /* designation with prefix */
|
||||
|
@ -395,18 +384,18 @@ iso2022processg2(const void *config, MultibyteCodec_State *state,
|
|||
/* not written to use encoder, decoder functions because only few
|
||||
* encodings use G2 designations in CJKCodecs */
|
||||
if (STATE_G2 == CHARSET_ISO8859_1) {
|
||||
if (IN3 < 0x80)
|
||||
OUTCHAR(IN3 + 0x80);
|
||||
if (INBYTE3 < 0x80)
|
||||
OUTCHAR(INBYTE3 + 0x80);
|
||||
else
|
||||
return 3;
|
||||
}
|
||||
else if (STATE_G2 == CHARSET_ISO8859_7) {
|
||||
ISO8859_7_DECODE(IN3 ^ 0x80, writer)
|
||||
ISO8859_7_DECODE(INBYTE3 ^ 0x80, writer)
|
||||
else return 3;
|
||||
}
|
||||
else if (STATE_G2 == CHARSET_ASCII) {
|
||||
if (IN3 & 0x80) return 3;
|
||||
else OUTCHAR(IN3);
|
||||
if (INBYTE3 & 0x80) return 3;
|
||||
else OUTCHAR(INBYTE3);
|
||||
}
|
||||
else
|
||||
return MBERR_INTERNAL;
|
||||
|
@ -421,7 +410,7 @@ DECODER(iso2022)
|
|||
const struct iso2022_designation *dsgcache = NULL;
|
||||
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
Py_ssize_t err;
|
||||
|
||||
if (STATE_GETFLAG(F_ESCTHROUGHOUT)) {
|
||||
|
@ -438,13 +427,13 @@ DECODER(iso2022)
|
|||
switch (c) {
|
||||
case ESC:
|
||||
REQUIRE_INBUF(2)
|
||||
if (IS_ISO2022ESC(IN2)) {
|
||||
if (IS_ISO2022ESC(INBYTE2)) {
|
||||
err = iso2022processesc(config, state,
|
||||
inbuf, &inleft);
|
||||
if (err != 0)
|
||||
return err;
|
||||
}
|
||||
else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */
|
||||
else if (CONFIG_ISSET(USE_G2) && INBYTE2 == 'N') {/* SS2 */
|
||||
REQUIRE_INBUF(3)
|
||||
err = iso2022processg2(config, state,
|
||||
inbuf, &inleft, writer);
|
||||
|
|
|
@ -19,38 +19,39 @@
|
|||
|
||||
ENCODER(cp932)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
unsigned char c1, c2;
|
||||
|
||||
if (c <= 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
else if (c >= 0xff61 && c <= 0xff9f) {
|
||||
WRITE1(c - 0xfec0)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1(c - 0xfec0)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
else if (c >= 0xf8f0 && c <= 0xf8f3) {
|
||||
/* Windows compatibility */
|
||||
REQUIRE_OUTBUF(1)
|
||||
if (c == 0xf8f0)
|
||||
OUT1(0xa0)
|
||||
OUTBYTE1(0xa0)
|
||||
else
|
||||
OUT1(c - 0xfef1 + 0xfd)
|
||||
NEXT(1, 1)
|
||||
OUTBYTE1(c - 0xfef1 + 0xfd)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
UCS4INVALID(c)
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
REQUIRE_OUTBUF(2)
|
||||
|
||||
TRYMAP_ENC(cp932ext, code, c) {
|
||||
OUT1(code >> 8)
|
||||
OUT2(code & 0xff)
|
||||
OUTBYTE1(code >> 8)
|
||||
OUTBYTE2(code & 0xff)
|
||||
}
|
||||
else TRYMAP_ENC(jisxcommon, code, c) {
|
||||
if (code & 0x8000) /* MSB set: JIS X 0212 */
|
||||
|
@ -61,20 +62,20 @@ ENCODER(cp932)
|
|||
c2 = code & 0xff;
|
||||
c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
|
||||
c1 = (c1 - 0x21) >> 1;
|
||||
OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
|
||||
OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
|
||||
OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
}
|
||||
else if (c >= 0xe000 && c < 0xe758) {
|
||||
/* User-defined area */
|
||||
c1 = (Py_UCS4)(c - 0xe000) / 188;
|
||||
c2 = (Py_UCS4)(c - 0xe000) % 188;
|
||||
OUT1(c1 + 0xf0)
|
||||
OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
OUTBYTE1(c1 + 0xf0)
|
||||
OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
}
|
||||
else
|
||||
return 1;
|
||||
|
||||
NEXT(1, 2)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -83,7 +84,7 @@ ENCODER(cp932)
|
|||
DECODER(cp932)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1, c2;
|
||||
unsigned char c = INBYTE1, c2;
|
||||
|
||||
if (c <= 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -106,7 +107,7 @@ DECODER(cp932)
|
|||
}
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
|
||||
TRYMAP_DEC(cp932ext, writer, c, c2);
|
||||
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){
|
||||
|
@ -145,25 +146,24 @@ DECODER(cp932)
|
|||
|
||||
ENCODER(euc_jis_2004)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
Py_ssize_t insize;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1(c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1(c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
DECODE_SURROGATE(c)
|
||||
insize = GET_INSIZE(c);
|
||||
insize = 1;
|
||||
|
||||
if (c <= 0xFFFF) {
|
||||
EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
|
||||
else TRYMAP_ENC(jisx0213_bmp, code, c) {
|
||||
if (code == MULTIC) {
|
||||
if (inleft < 2) {
|
||||
if (inlen - *inpos < 2) {
|
||||
if (flags & MBENC_FLUSH) {
|
||||
code = find_pairencmap(
|
||||
(ucs2_t)c, 0,
|
||||
|
@ -176,8 +176,9 @@ ENCODER(euc_jis_2004)
|
|||
return MBERR_TOOFEW;
|
||||
}
|
||||
else {
|
||||
Py_UCS4 c2 = INCHAR2;
|
||||
code = find_pairencmap(
|
||||
(ucs2_t)c, (*inbuf)[1],
|
||||
(ucs2_t)c, c2,
|
||||
jisx0213_pair_encmap,
|
||||
JISX0213_ENCPAIRS);
|
||||
if (code == DBCINV) {
|
||||
|
@ -195,8 +196,8 @@ ENCODER(euc_jis_2004)
|
|||
else TRYMAP_ENC(jisxcommon, code, c);
|
||||
else if (c >= 0xff61 && c <= 0xff9f) {
|
||||
/* JIS X 0201 half-width katakana */
|
||||
WRITE2(0x8e, c - 0xfec0)
|
||||
NEXT(1, 2)
|
||||
WRITEBYTE2(0x8e, c - 0xfec0)
|
||||
NEXT(1, 2);
|
||||
continue;
|
||||
}
|
||||
else if (c == 0xff3c)
|
||||
|
@ -218,12 +219,12 @@ ENCODER(euc_jis_2004)
|
|||
|
||||
if (code & 0x8000) {
|
||||
/* Codeset 2 */
|
||||
WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
|
||||
NEXT(insize, 3)
|
||||
WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
|
||||
NEXT(insize, 3);
|
||||
} else {
|
||||
/* Codeset 1 */
|
||||
WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
|
||||
NEXT(insize, 2)
|
||||
WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
|
||||
NEXT(insize, 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -233,7 +234,7 @@ ENCODER(euc_jis_2004)
|
|||
DECODER(euc_jis_2004)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
Py_UCS4 code;
|
||||
|
||||
if (c < 0x80) {
|
||||
|
@ -247,7 +248,7 @@ DECODER(euc_jis_2004)
|
|||
unsigned char c2;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
if (c2 >= 0xa1 && c2 <= 0xdf) {
|
||||
OUTCHAR(0xfec0 + c2);
|
||||
NEXT_IN(2);
|
||||
|
@ -259,8 +260,8 @@ DECODER(euc_jis_2004)
|
|||
unsigned char c2, c3;
|
||||
|
||||
REQUIRE_INBUF(3)
|
||||
c2 = IN2 ^ 0x80;
|
||||
c3 = IN3 ^ 0x80;
|
||||
c2 = INBYTE2 ^ 0x80;
|
||||
c3 = INBYTE3 ^ 0x80;
|
||||
|
||||
/* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */
|
||||
EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c2, c3)
|
||||
|
@ -279,7 +280,7 @@ DECODER(euc_jis_2004)
|
|||
|
||||
REQUIRE_INBUF(2)
|
||||
c ^= 0x80;
|
||||
c2 = IN2 ^ 0x80;
|
||||
c2 = INBYTE2 ^ 0x80;
|
||||
|
||||
/* JIS X 0213 Plane 1 */
|
||||
EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c, c2)
|
||||
|
@ -312,35 +313,36 @@ DECODER(euc_jis_2004)
|
|||
|
||||
ENCODER(euc_jp)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
UCS4INVALID(c)
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
TRYMAP_ENC(jisxcommon, code, c);
|
||||
else if (c >= 0xff61 && c <= 0xff9f) {
|
||||
/* JIS X 0201 half-width katakana */
|
||||
WRITE2(0x8e, c - 0xfec0)
|
||||
NEXT(1, 2)
|
||||
WRITEBYTE2(0x8e, c - 0xfec0)
|
||||
NEXT(1, 2);
|
||||
continue;
|
||||
}
|
||||
#ifndef STRICT_BUILD
|
||||
else if (c == 0xff3c) /* FULL-WIDTH REVERSE SOLIDUS */
|
||||
code = 0x2140;
|
||||
else if (c == 0xa5) { /* YEN SIGN */
|
||||
WRITE1(0x5c);
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1(0x5c);
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
} else if (c == 0x203e) { /* OVERLINE */
|
||||
WRITE1(0x7e);
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1(0x7e);
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
@ -349,12 +351,12 @@ ENCODER(euc_jp)
|
|||
|
||||
if (code & 0x8000) {
|
||||
/* JIS X 0212 */
|
||||
WRITE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
|
||||
NEXT(1, 3)
|
||||
WRITEBYTE3(0x8f, code >> 8, (code & 0xFF) | 0x80)
|
||||
NEXT(1, 3);
|
||||
} else {
|
||||
/* JIS X 0208 */
|
||||
WRITE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
|
||||
NEXT(1, 2)
|
||||
WRITEBYTE2((code >> 8) | 0x80, (code & 0xFF) | 0x80)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -364,7 +366,7 @@ ENCODER(euc_jp)
|
|||
DECODER(euc_jp)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -377,7 +379,7 @@ DECODER(euc_jp)
|
|||
unsigned char c2;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
if (c2 >= 0xa1 && c2 <= 0xdf) {
|
||||
OUTCHAR(0xfec0 + c2);
|
||||
NEXT_IN(2);
|
||||
|
@ -389,8 +391,8 @@ DECODER(euc_jp)
|
|||
unsigned char c2, c3;
|
||||
|
||||
REQUIRE_INBUF(3)
|
||||
c2 = IN2;
|
||||
c3 = IN3;
|
||||
c2 = INBYTE2;
|
||||
c3 = INBYTE3;
|
||||
/* JIS X 0212 */
|
||||
TRYMAP_DEC(jisx0212, writer, c2 ^ 0x80, c3 ^ 0x80) {
|
||||
NEXT_IN(3);
|
||||
|
@ -402,7 +404,7 @@ DECODER(euc_jp)
|
|||
unsigned char c2;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
/* JIS X 0208 */
|
||||
#ifndef STRICT_BUILD
|
||||
if (c == 0xa1 && c2 == 0xc0)
|
||||
|
@ -427,8 +429,8 @@ DECODER(euc_jp)
|
|||
|
||||
ENCODER(shift_jis)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
unsigned char c1, c2;
|
||||
|
||||
|
@ -440,14 +442,16 @@ ENCODER(shift_jis)
|
|||
else if (c == 0x203e) code = 0x7e; /* OVERLINE */
|
||||
#endif
|
||||
else JISX0201_K_ENCODE(c, code)
|
||||
else UCS4INVALID(c)
|
||||
else code = NOCHAR;
|
||||
else if (c > 0xFFFF)
|
||||
return 1;
|
||||
else
|
||||
code = NOCHAR;
|
||||
|
||||
if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
|
||||
REQUIRE_OUTBUF(1)
|
||||
|
||||
OUT1((unsigned char)code)
|
||||
NEXT(1, 1)
|
||||
OUTBYTE1((unsigned char)code)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -470,9 +474,9 @@ ENCODER(shift_jis)
|
|||
c2 = code & 0xff;
|
||||
c2 = (((c1 - 0x21) & 1) ? 0x5e : 0) + (c2 - 0x21);
|
||||
c1 = (c1 - 0x21) >> 1;
|
||||
OUT1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
|
||||
OUT2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1(c1 < 0x1f ? c1 + 0x81 : c1 + 0xc1)
|
||||
OUTBYTE2(c2 < 0x3f ? c2 + 0x40 : c2 + 0x41)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -481,7 +485,7 @@ ENCODER(shift_jis)
|
|||
DECODER(shift_jis)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
#ifdef STRICT_BUILD
|
||||
JISX0201_R_DECODE(c, writer)
|
||||
|
@ -493,7 +497,7 @@ DECODER(shift_jis)
|
|||
unsigned char c1, c2;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
|
||||
return 1;
|
||||
|
||||
|
@ -533,30 +537,29 @@ DECODER(shift_jis)
|
|||
|
||||
ENCODER(shift_jis_2004)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code = NOCHAR;
|
||||
int c1, c2;
|
||||
Py_ssize_t insize;
|
||||
|
||||
JISX0201_ENCODE(c, code)
|
||||
else DECODE_SURROGATE(c)
|
||||
|
||||
if (code < 0x80 || (code >= 0xa1 && code <= 0xdf)) {
|
||||
WRITE1((unsigned char)code)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)code)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
insize = GET_INSIZE(c);
|
||||
insize = 1;
|
||||
|
||||
if (code == NOCHAR) {
|
||||
if (c <= 0xffff) {
|
||||
EMULATE_JISX0213_2000_ENCODE_BMP(code, c)
|
||||
else TRYMAP_ENC(jisx0213_bmp, code, c) {
|
||||
if (code == MULTIC) {
|
||||
if (inleft < 2) {
|
||||
if (inlen - *inpos < 2) {
|
||||
if (flags & MBENC_FLUSH) {
|
||||
code = find_pairencmap
|
||||
((ucs2_t)c, 0,
|
||||
|
@ -569,8 +572,9 @@ ENCODER(shift_jis_2004)
|
|||
return MBERR_TOOFEW;
|
||||
}
|
||||
else {
|
||||
Py_UCS4 ch2 = INCHAR2;
|
||||
code = find_pairencmap(
|
||||
(ucs2_t)c, IN2,
|
||||
(ucs2_t)c, ch2,
|
||||
jisx0213_pair_encmap,
|
||||
JISX0213_ENCPAIRS);
|
||||
if (code == DBCINV) {
|
||||
|
@ -615,10 +619,10 @@ ENCODER(shift_jis_2004)
|
|||
|
||||
if (c1 & 1) c2 += 0x5e;
|
||||
c1 >>= 1;
|
||||
OUT1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
|
||||
OUT2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
|
||||
OUTBYTE1(c1 + (c1 < 0x1f ? 0x81 : 0xc1))
|
||||
OUTBYTE2(c2 + (c2 < 0x3f ? 0x40 : 0x41))
|
||||
|
||||
NEXT(insize, 2)
|
||||
NEXT(insize, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -627,7 +631,7 @@ ENCODER(shift_jis_2004)
|
|||
DECODER(shift_jis_2004)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
JISX0201_DECODE(c, writer)
|
||||
else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){
|
||||
|
@ -635,7 +639,7 @@ DECODER(shift_jis_2004)
|
|||
Py_UCS4 code;
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc)
|
||||
return 1;
|
||||
|
||||
|
|
|
@ -33,16 +33,18 @@ static const unsigned char u2cgk_jongseong[28] = {
|
|||
|
||||
ENCODER(euc_kr)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
TRYMAP_ENC(cp949, code, c);
|
||||
|
@ -50,9 +52,9 @@ ENCODER(euc_kr)
|
|||
|
||||
if ((code & 0x8000) == 0) {
|
||||
/* KS X 1001 coded character */
|
||||
OUT1((code >> 8) | 0x80)
|
||||
OUT2((code & 0xFF) | 0x80)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1((code >> 8) | 0x80)
|
||||
OUTBYTE2((code & 0xFF) | 0x80)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
else { /* Mapping is found in CP949 extension,
|
||||
* but we encode it in KS X 1001:1998 Annex 3,
|
||||
|
@ -61,23 +63,23 @@ ENCODER(euc_kr)
|
|||
REQUIRE_OUTBUF(8)
|
||||
|
||||
/* syllable composition precedence */
|
||||
OUT1(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUT2(EUCKR_JAMO_FILLER)
|
||||
OUTBYTE1(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUTBYTE2(EUCKR_JAMO_FILLER)
|
||||
|
||||
/* All codepoints in CP949 extension are in unicode
|
||||
* Hangul Syllable area. */
|
||||
assert(0xac00 <= c && c <= 0xd7a3);
|
||||
c -= 0xac00;
|
||||
|
||||
OUT3(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUT4(u2cgk_choseong[c / 588])
|
||||
NEXT_OUT(4)
|
||||
OUTBYTE3(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUTBYTE4(u2cgk_choseong[c / 588])
|
||||
NEXT_OUT(4);
|
||||
|
||||
OUT1(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUT2(u2cgk_jungseong[(c / 28) % 21])
|
||||
OUT3(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUT4(u2cgk_jongseong[c % 28])
|
||||
NEXT(1, 4)
|
||||
OUTBYTE1(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUTBYTE2(u2cgk_jungseong[(c / 28) % 21])
|
||||
OUTBYTE3(EUCKR_JAMO_FIRSTBYTE)
|
||||
OUTBYTE4(u2cgk_jongseong[c % 28])
|
||||
NEXT(1, 4);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -102,7 +104,7 @@ static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
|
|||
DECODER(euc_kr)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -113,7 +115,7 @@ DECODER(euc_kr)
|
|||
REQUIRE_INBUF(2)
|
||||
|
||||
if (c == EUCKR_JAMO_FIRSTBYTE &&
|
||||
IN2 == EUCKR_JAMO_FILLER) {
|
||||
INBYTE2 == EUCKR_JAMO_FILLER) {
|
||||
/* KS X 1001:1998 Annex 3 make-up sequence */
|
||||
DBCHAR cho, jung, jong;
|
||||
|
||||
|
@ -146,7 +148,7 @@ DECODER(euc_kr)
|
|||
OUTCHAR(0xac00 + cho*588 + jung*28 + jong);
|
||||
NEXT_IN(8);
|
||||
}
|
||||
else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80) {
|
||||
else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80) {
|
||||
NEXT_IN(2);
|
||||
}
|
||||
else
|
||||
|
@ -164,27 +166,29 @@ DECODER(euc_kr)
|
|||
|
||||
ENCODER(cp949)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
TRYMAP_ENC(cp949, code, c);
|
||||
else return 1;
|
||||
|
||||
OUT1((code >> 8) | 0x80)
|
||||
OUTBYTE1((code >> 8) | 0x80)
|
||||
if (code & 0x8000)
|
||||
OUT2(code & 0xFF) /* MSB set: CP949 */
|
||||
OUTBYTE2(code & 0xFF) /* MSB set: CP949 */
|
||||
else
|
||||
OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
|
||||
NEXT(1, 2)
|
||||
OUTBYTE2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -193,7 +197,7 @@ ENCODER(cp949)
|
|||
DECODER(cp949)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -202,8 +206,8 @@ DECODER(cp949)
|
|||
}
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80);
|
||||
else TRYMAP_DEC(cp949ext, writer, c, IN2);
|
||||
TRYMAP_DEC(ksx1001, writer, c ^ 0x80, INBYTE2 ^ 0x80);
|
||||
else TRYMAP_DEC(cp949ext, writer, c, INBYTE2);
|
||||
else return 1;
|
||||
|
||||
NEXT_IN(2);
|
||||
|
@ -246,16 +250,18 @@ static const DBCHAR u2johabjamo[] = {
|
|||
|
||||
ENCODER(johab)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
|
||||
|
@ -281,9 +287,9 @@ ENCODER(johab)
|
|||
t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
|
||||
(c1 - 0x21 + 0x197));
|
||||
t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
|
||||
OUT1(t1 >> 1)
|
||||
OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1(t1 >> 1)
|
||||
OUTBYTE2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
|
||||
NEXT(1, 2);
|
||||
continue;
|
||||
}
|
||||
else
|
||||
|
@ -292,9 +298,9 @@ ENCODER(johab)
|
|||
else
|
||||
return 1;
|
||||
|
||||
OUT1(code >> 8)
|
||||
OUT2(code & 0xff)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1(code >> 8)
|
||||
OUTBYTE2(code & 0xff)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -344,7 +350,7 @@ static const unsigned char johabjamo_jongseong[32] = {
|
|||
DECODER(johab)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1, c2;
|
||||
unsigned char c = INBYTE1, c2;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -353,7 +359,7 @@ DECODER(johab)
|
|||
}
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
c2 = IN2;
|
||||
c2 = INBYTE2;
|
||||
|
||||
if (c < 0xd8) {
|
||||
/* johab hangul */
|
||||
|
|
|
@ -13,26 +13,28 @@
|
|||
|
||||
ENCODER(big5)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = **inbuf;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
REQUIRE_OUTBUF(1)
|
||||
**outbuf = (unsigned char)c;
|
||||
NEXT(1, 1)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
|
||||
TRYMAP_ENC(big5, code, c);
|
||||
else return 1;
|
||||
|
||||
OUT1(code >> 8)
|
||||
OUT2(code & 0xFF)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1(code >> 8)
|
||||
OUTBYTE2(code & 0xFF)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -41,7 +43,7 @@ ENCODER(big5)
|
|||
DECODER(big5)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -50,7 +52,7 @@ DECODER(big5)
|
|||
}
|
||||
|
||||
REQUIRE_INBUF(2)
|
||||
TRYMAP_DEC(big5, writer, c, IN2) {
|
||||
TRYMAP_DEC(big5, writer, c, INBYTE2) {
|
||||
NEXT_IN(2);
|
||||
}
|
||||
else return 1;
|
||||
|
@ -66,25 +68,27 @@ DECODER(big5)
|
|||
|
||||
ENCODER(cp950)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
Py_UCS4 c = IN1;
|
||||
while (*inpos < inlen) {
|
||||
Py_UCS4 c = INCHAR1;
|
||||
DBCHAR code;
|
||||
|
||||
if (c < 0x80) {
|
||||
WRITE1((unsigned char)c)
|
||||
NEXT(1, 1)
|
||||
WRITEBYTE1((unsigned char)c)
|
||||
NEXT(1, 1);
|
||||
continue;
|
||||
}
|
||||
UCS4INVALID(c)
|
||||
|
||||
if (c > 0xFFFF)
|
||||
return 1;
|
||||
|
||||
REQUIRE_OUTBUF(2)
|
||||
TRYMAP_ENC(cp950ext, code, c);
|
||||
else TRYMAP_ENC(big5, code, c);
|
||||
else return 1;
|
||||
|
||||
OUT1(code >> 8)
|
||||
OUT2(code & 0xFF)
|
||||
NEXT(1, 2)
|
||||
OUTBYTE1(code >> 8)
|
||||
OUTBYTE2(code & 0xFF)
|
||||
NEXT(1, 2);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -93,7 +97,7 @@ ENCODER(cp950)
|
|||
DECODER(cp950)
|
||||
{
|
||||
while (inleft > 0) {
|
||||
unsigned char c = IN1;
|
||||
unsigned char c = INBYTE1;
|
||||
|
||||
if (c < 0x80) {
|
||||
OUTCHAR(c);
|
||||
|
@ -103,8 +107,8 @@ DECODER(cp950)
|
|||
|
||||
REQUIRE_INBUF(2)
|
||||
|
||||
TRYMAP_DEC(cp950ext, writer, c, IN2);
|
||||
else TRYMAP_DEC(big5, writer, c, IN2);
|
||||
TRYMAP_DEC(cp950ext, writer, c, INBYTE2);
|
||||
else TRYMAP_DEC(big5, writer, c, INBYTE2);
|
||||
else return 1;
|
||||
|
||||
NEXT_IN(2);
|
||||
|
|
|
@ -72,7 +72,8 @@ static const struct dbcs_map *mapping_list;
|
|||
#define ENCODER(encoding) \
|
||||
static Py_ssize_t encoding##_encode( \
|
||||
MultibyteCodec_State *state, const void *config, \
|
||||
const Py_UNICODE **inbuf, Py_ssize_t inleft, \
|
||||
int kind, void *data, \
|
||||
Py_ssize_t *inpos, Py_ssize_t inlen, \
|
||||
unsigned char **outbuf, Py_ssize_t outleft, int flags)
|
||||
#define ENCODER_RESET(encoding) \
|
||||
static Py_ssize_t encoding##_encode_reset( \
|
||||
|
@ -91,25 +92,25 @@ static const struct dbcs_map *mapping_list;
|
|||
static Py_ssize_t encoding##_decode_reset( \
|
||||
MultibyteCodec_State *state, const void *config)
|
||||
|
||||
#if Py_UNICODE_SIZE == 4
|
||||
#define UCS4INVALID(code) \
|
||||
if ((code) > 0xFFFF) \
|
||||
return 1;
|
||||
#else
|
||||
#define UCS4INVALID(code) \
|
||||
if (0) ;
|
||||
#endif
|
||||
|
||||
#define NEXT_IN(i) \
|
||||
do { \
|
||||
(*inbuf) += (i); \
|
||||
(inleft) -= (i); \
|
||||
} while (0)
|
||||
#define NEXT_INCHAR(i) \
|
||||
do { \
|
||||
(*inpos) += (i); \
|
||||
} while (0)
|
||||
#define NEXT_OUT(o) \
|
||||
(*outbuf) += (o); \
|
||||
(outleft) -= (o);
|
||||
do { \
|
||||
(*outbuf) += (o); \
|
||||
(outleft) -= (o); \
|
||||
} while (0)
|
||||
#define NEXT(i, o) \
|
||||
NEXT_IN(i); NEXT_OUT(o)
|
||||
do { \
|
||||
NEXT_INCHAR(i); \
|
||||
NEXT_OUT(o); \
|
||||
} while (0)
|
||||
|
||||
#define REQUIRE_INBUF(n) \
|
||||
if (inleft < (n)) \
|
||||
|
@ -118,10 +119,13 @@ static const struct dbcs_map *mapping_list;
|
|||
if (outleft < (n)) \
|
||||
return MBERR_TOOSMALL;
|
||||
|
||||
#define IN1 ((*inbuf)[0])
|
||||
#define IN2 ((*inbuf)[1])
|
||||
#define IN3 ((*inbuf)[2])
|
||||
#define IN4 ((*inbuf)[3])
|
||||
#define INBYTE1 ((*inbuf)[0])
|
||||
#define INBYTE2 ((*inbuf)[1])
|
||||
#define INBYTE3 ((*inbuf)[2])
|
||||
#define INBYTE4 ((*inbuf)[3])
|
||||
|
||||
#define INCHAR1 PyUnicode_READ(kind, data, *inpos)
|
||||
#define INCHAR2 PyUnicode_READ(kind, data, *inpos + 1)
|
||||
|
||||
#define OUTCHAR(c) \
|
||||
do { \
|
||||
|
@ -140,24 +144,24 @@ static const struct dbcs_map *mapping_list;
|
|||
writer->pos += 2; \
|
||||
} while (0)
|
||||
|
||||
#define OUT1(c) ((*outbuf)[0]) = (c);
|
||||
#define OUT2(c) ((*outbuf)[1]) = (c);
|
||||
#define OUT3(c) ((*outbuf)[2]) = (c);
|
||||
#define OUT4(c) ((*outbuf)[3]) = (c);
|
||||
#define OUTBYTE1(c) ((*outbuf)[0]) = (c);
|
||||
#define OUTBYTE2(c) ((*outbuf)[1]) = (c);
|
||||
#define OUTBYTE3(c) ((*outbuf)[2]) = (c);
|
||||
#define OUTBYTE4(c) ((*outbuf)[3]) = (c);
|
||||
|
||||
#define WRITE1(c1) \
|
||||
#define WRITEBYTE1(c1) \
|
||||
REQUIRE_OUTBUF(1) \
|
||||
(*outbuf)[0] = (c1);
|
||||
#define WRITE2(c1, c2) \
|
||||
#define WRITEBYTE2(c1, c2) \
|
||||
REQUIRE_OUTBUF(2) \
|
||||
(*outbuf)[0] = (c1); \
|
||||
(*outbuf)[1] = (c2);
|
||||
#define WRITE3(c1, c2, c3) \
|
||||
#define WRITEBYTE3(c1, c2, c3) \
|
||||
REQUIRE_OUTBUF(3) \
|
||||
(*outbuf)[0] = (c1); \
|
||||
(*outbuf)[1] = (c2); \
|
||||
(*outbuf)[2] = (c3);
|
||||
#define WRITE4(c1, c2, c3, c4) \
|
||||
#define WRITEBYTE4(c1, c2, c3, c4) \
|
||||
REQUIRE_OUTBUF(4) \
|
||||
(*outbuf)[0] = (c1); \
|
||||
(*outbuf)[1] = (c2); \
|
||||
|
@ -209,20 +213,6 @@ _TRYMAP_DEC_WRITE(_PyUnicodeWriter *writer, Py_UCS4 c)
|
|||
#define TRYMAP_DEC_MPLANE(charset, writer, plane, c1, c2) \
|
||||
if _TRYMAP_DEC(&charset##_decmap[plane][c1], writer, c2)
|
||||
|
||||
#if Py_UNICODE_SIZE == 2
|
||||
#define DECODE_SURROGATE(c) \
|
||||
if (Py_UNICODE_IS_HIGH_SURROGATE(c)) { \
|
||||
REQUIRE_INBUF(2) \
|
||||
if (Py_UNICODE_IS_LOW_SURROGATE(IN2)) { \
|
||||
c = Py_UNICODE_JOIN_SURROGATES(c, IN2); \
|
||||
} \
|
||||
}
|
||||
#define GET_INSIZE(c) ((c) > 0xffff ? 2 : 1)
|
||||
#else
|
||||
#define DECODE_SURROGATE(c) {;}
|
||||
#define GET_INSIZE(c) 1
|
||||
#endif
|
||||
|
||||
#define BEGIN_MAPPINGS_LIST static const struct dbcs_map _mapping_list[] = {
|
||||
#define MAPPING_ENCONLY(enc) {#enc, (void*)enc##_encmap, NULL},
|
||||
#define MAPPING_DECONLY(enc) {#enc, NULL, (void*)enc##_decmap},
|
||||
|
|
|
@ -10,7 +10,8 @@
|
|||
#include "multibytecodec.h"
|
||||
|
||||
typedef struct {
|
||||
const Py_UNICODE *inbuf, *inbuf_top, *inbuf_end;
|
||||
PyObject *inobj;
|
||||
Py_ssize_t inpos, inlen;
|
||||
unsigned char *outbuf, *outbuf_end;
|
||||
PyObject *excobj, *outobj;
|
||||
} MultibyteEncodeBuffer;
|
||||
|
@ -45,7 +46,7 @@ static char *incrementalkwarglist[] = {"input", "final", NULL};
|
|||
static char *streamkwarglist[] = {"stream", "errors", NULL};
|
||||
|
||||
static PyObject *multibytecodec_encode(MultibyteCodec *,
|
||||
MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t,
|
||||
MultibyteCodec_State *, PyObject *, Py_ssize_t *,
|
||||
PyObject *, int);
|
||||
|
||||
#define MBENC_RESET MBENC_MAX<<1 /* reset after an encoding session */
|
||||
|
@ -224,7 +225,7 @@ multibytecodec_encerror(MultibyteCodec *codec,
|
|||
return 0; /* retry it */
|
||||
case MBERR_TOOFEW:
|
||||
reason = "incomplete multibyte sequence";
|
||||
esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
|
||||
esize = (Py_ssize_t)buf->inpos;
|
||||
break;
|
||||
case MBERR_INTERNAL:
|
||||
PyErr_SetString(PyExc_RuntimeError,
|
||||
|
@ -238,14 +239,24 @@ multibytecodec_encerror(MultibyteCodec *codec,
|
|||
}
|
||||
|
||||
if (errors == ERROR_REPLACE) {
|
||||
const Py_UNICODE replchar = '?', *inbuf = &replchar;
|
||||
PyObject *replchar;
|
||||
Py_ssize_t r;
|
||||
Py_ssize_t inpos;
|
||||
int kind;
|
||||
void *data;
|
||||
|
||||
replchar = PyUnicode_FromOrdinal('?');
|
||||
if (replchar == NULL)
|
||||
goto errorexit;
|
||||
kind = PyUnicode_KIND(replchar);
|
||||
data = PyUnicode_DATA(replchar);
|
||||
|
||||
inpos = 0;
|
||||
for (;;) {
|
||||
Py_ssize_t outleft;
|
||||
Py_ssize_t outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
|
||||
|
||||
outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
|
||||
r = codec->encode(state, codec->config, &inbuf, 1,
|
||||
r = codec->encode(state, codec->config,
|
||||
kind, data, &inpos, 1,
|
||||
&buf->outbuf, outleft, 0);
|
||||
if (r == MBERR_TOOSMALL) {
|
||||
REQUIRE_ENCODEBUFFER(buf, -1);
|
||||
|
@ -255,25 +266,27 @@ multibytecodec_encerror(MultibyteCodec *codec,
|
|||
break;
|
||||
}
|
||||
|
||||
Py_DECREF(replchar);
|
||||
|
||||
if (r != 0) {
|
||||
REQUIRE_ENCODEBUFFER(buf, 1);
|
||||
*buf->outbuf++ = '?';
|
||||
}
|
||||
}
|
||||
if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
|
||||
buf->inbuf += esize;
|
||||
buf->inpos += esize;
|
||||
return 0;
|
||||
}
|
||||
|
||||
start = (Py_ssize_t)(buf->inbuf - buf->inbuf_top);
|
||||
start = (Py_ssize_t)buf->inpos;
|
||||
end = start + esize;
|
||||
|
||||
/* use cached exception object if available */
|
||||
if (buf->excobj == NULL) {
|
||||
buf->excobj = PyUnicodeEncodeError_Create(codec->encoding,
|
||||
buf->inbuf_top,
|
||||
buf->inbuf_end - buf->inbuf_top,
|
||||
start, end, reason);
|
||||
buf->excobj = PyObject_CallFunction(PyExc_UnicodeEncodeError,
|
||||
"sOnns",
|
||||
codec->encoding, buf->inobj,
|
||||
start, end, reason);
|
||||
if (buf->excobj == NULL)
|
||||
goto errorexit;
|
||||
}
|
||||
|
@ -302,10 +315,10 @@ multibytecodec_encerror(MultibyteCodec *codec,
|
|||
}
|
||||
|
||||
if (PyUnicode_Check(tobj)) {
|
||||
const Py_UNICODE *uraw = PyUnicode_AS_UNICODE(tobj);
|
||||
Py_ssize_t inpos;
|
||||
|
||||
retstr = multibytecodec_encode(codec, state, &uraw,
|
||||
PyUnicode_GET_SIZE(tobj), ERROR_STRICT,
|
||||
retstr = multibytecodec_encode(codec, state, tobj,
|
||||
&inpos, ERROR_STRICT,
|
||||
MBENC_FLUSH);
|
||||
if (retstr == NULL)
|
||||
goto errorexit;
|
||||
|
@ -324,15 +337,15 @@ multibytecodec_encerror(MultibyteCodec *codec,
|
|||
|
||||
newpos = PyLong_AsSsize_t(PyTuple_GET_ITEM(retobj, 1));
|
||||
if (newpos < 0 && !PyErr_Occurred())
|
||||
newpos += (Py_ssize_t)(buf->inbuf_end - buf->inbuf_top);
|
||||
if (newpos < 0 || buf->inbuf_top + newpos > buf->inbuf_end) {
|
||||
newpos += (Py_ssize_t)buf->inlen;
|
||||
if (newpos < 0 || newpos > buf->inlen) {
|
||||
PyErr_Clear();
|
||||
PyErr_Format(PyExc_IndexError,
|
||||
"position %zd from error handler out of bounds",
|
||||
newpos);
|
||||
goto errorexit;
|
||||
}
|
||||
buf->inbuf = buf->inbuf_top + newpos;
|
||||
buf->inpos = newpos;
|
||||
|
||||
Py_DECREF(retobj);
|
||||
Py_DECREF(retstr);
|
||||
|
@ -449,19 +462,29 @@ errorexit:
|
|||
static PyObject *
|
||||
multibytecodec_encode(MultibyteCodec *codec,
|
||||
MultibyteCodec_State *state,
|
||||
const Py_UNICODE **data, Py_ssize_t datalen,
|
||||
PyObject *text, Py_ssize_t *inpos_t,
|
||||
PyObject *errors, int flags)
|
||||
{
|
||||
MultibyteEncodeBuffer buf;
|
||||
Py_ssize_t finalsize, r = 0;
|
||||
Py_ssize_t datalen;
|
||||
int kind;
|
||||
void *data;
|
||||
|
||||
if (PyUnicode_READY(text) < 0)
|
||||
return NULL;
|
||||
datalen = PyUnicode_GET_LENGTH(text);
|
||||
|
||||
if (datalen == 0 && !(flags & MBENC_RESET))
|
||||
return PyBytes_FromStringAndSize(NULL, 0);
|
||||
|
||||
buf.excobj = NULL;
|
||||
buf.outobj = NULL;
|
||||
buf.inbuf = buf.inbuf_top = *data;
|
||||
buf.inbuf_end = buf.inbuf_top + datalen;
|
||||
buf.inobj = text; /* borrowed reference */
|
||||
buf.inpos = 0;
|
||||
buf.inlen = datalen;
|
||||
kind = PyUnicode_KIND(buf.inobj);
|
||||
data = PyUnicode_DATA(buf.inobj);
|
||||
|
||||
if (datalen > (PY_SSIZE_T_MAX - 16) / 2) {
|
||||
PyErr_NoMemory();
|
||||
|
@ -474,14 +497,14 @@ multibytecodec_encode(MultibyteCodec *codec,
|
|||
buf.outbuf = (unsigned char *)PyBytes_AS_STRING(buf.outobj);
|
||||
buf.outbuf_end = buf.outbuf + PyBytes_GET_SIZE(buf.outobj);
|
||||
|
||||
while (buf.inbuf < buf.inbuf_end) {
|
||||
Py_ssize_t inleft, outleft;
|
||||
|
||||
while (buf.inpos < buf.inlen) {
|
||||
/* we don't reuse inleft and outleft here.
|
||||
* error callbacks can relocate the cursor anywhere on buffer*/
|
||||
inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf);
|
||||
outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf);
|
||||
r = codec->encode(state, codec->config, &buf.inbuf, inleft,
|
||||
Py_ssize_t outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf);
|
||||
|
||||
r = codec->encode(state, codec->config,
|
||||
kind, data,
|
||||
&buf.inpos, buf.inlen,
|
||||
&buf.outbuf, outleft, flags);
|
||||
if ((r == 0) || (r == MBERR_TOOFEW && !(flags & MBENC_FLUSH)))
|
||||
break;
|
||||
|
@ -512,7 +535,8 @@ multibytecodec_encode(MultibyteCodec *codec,
|
|||
if (_PyBytes_Resize(&buf.outobj, finalsize) == -1)
|
||||
goto errorexit;
|
||||
|
||||
*data = buf.inbuf;
|
||||
if (inpos_t)
|
||||
*inpos_t = buf.inpos;
|
||||
Py_XDECREF(buf.excobj);
|
||||
return buf.outobj;
|
||||
|
||||
|
@ -527,7 +551,6 @@ MultibyteCodec_Encode(MultibyteCodecObject *self,
|
|||
PyObject *args, PyObject *kwargs)
|
||||
{
|
||||
MultibyteCodec_State state;
|
||||
Py_UNICODE *data;
|
||||
PyObject *errorcb, *r, *arg, *ucvt;
|
||||
const char *errors = NULL;
|
||||
Py_ssize_t datalen;
|
||||
|
@ -550,11 +573,11 @@ MultibyteCodec_Encode(MultibyteCodecObject *self,
|
|||
}
|
||||
}
|
||||
|
||||
data = PyUnicode_AsUnicodeAndSize(arg, &datalen);
|
||||
if (data == NULL) {
|
||||
if (PyUnicode_READY(arg) < 0) {
|
||||
Py_XDECREF(ucvt);
|
||||
return NULL;
|
||||
}
|
||||
datalen = PyUnicode_GET_LENGTH(arg);
|
||||
|
||||
errorcb = internal_error_callback(errors);
|
||||
if (errorcb == NULL) {
|
||||
|
@ -566,7 +589,7 @@ MultibyteCodec_Encode(MultibyteCodecObject *self,
|
|||
self->codec->encinit(&state, self->codec->config) != 0)
|
||||
goto errorexit;
|
||||
r = multibytecodec_encode(self->codec, &state,
|
||||
(const Py_UNICODE **)&data, datalen, errorcb,
|
||||
arg, NULL, errorcb,
|
||||
MBENC_FLUSH | MBENC_RESET);
|
||||
if (r == NULL)
|
||||
goto errorexit;
|
||||
|
@ -712,8 +735,9 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx,
|
|||
PyObject *unistr, int final)
|
||||
{
|
||||
PyObject *ucvt, *r = NULL;
|
||||
Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL;
|
||||
Py_ssize_t datalen, origpending;
|
||||
PyObject *inbuf = NULL;
|
||||
Py_ssize_t inpos, datalen;
|
||||
PyObject *origpending = NULL;
|
||||
wchar_t *data;
|
||||
|
||||
if (PyUnicode_Check(unistr))
|
||||
|
@ -733,66 +757,64 @@ encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx,
|
|||
data = PyUnicode_AsUnicodeAndSize(unistr, &datalen);
|
||||
if (data == NULL)
|
||||
goto errorexit;
|
||||
origpending = ctx->pendingsize;
|
||||
|
||||
if (origpending > 0) {
|
||||
if (datalen > PY_SSIZE_T_MAX - ctx->pendingsize) {
|
||||
PyErr_NoMemory();
|
||||
/* inbuf_tmp == NULL */
|
||||
goto errorexit;
|
||||
}
|
||||
inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize);
|
||||
if (ctx->pending) {
|
||||
PyObject *inbuf_tmp;
|
||||
|
||||
Py_INCREF(ctx->pending);
|
||||
origpending = ctx->pending;
|
||||
|
||||
Py_INCREF(ctx->pending);
|
||||
inbuf_tmp = ctx->pending;
|
||||
PyUnicode_Append(&inbuf_tmp, unistr);
|
||||
if (inbuf_tmp == NULL)
|
||||
goto errorexit;
|
||||
memcpy(inbuf_tmp, ctx->pending,
|
||||
Py_UNICODE_SIZE * ctx->pendingsize);
|
||||
memcpy(inbuf_tmp + ctx->pendingsize,
|
||||
PyUnicode_AS_UNICODE(unistr),
|
||||
Py_UNICODE_SIZE * datalen);
|
||||
datalen += ctx->pendingsize;
|
||||
ctx->pendingsize = 0;
|
||||
Py_CLEAR(ctx->pending);
|
||||
inbuf = inbuf_tmp;
|
||||
}
|
||||
else
|
||||
inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
|
||||
else {
|
||||
origpending = NULL;
|
||||
|
||||
inbuf_end = inbuf + datalen;
|
||||
Py_INCREF(unistr);
|
||||
inbuf = unistr;
|
||||
}
|
||||
if (PyUnicode_READY(inbuf) < 0)
|
||||
goto errorexit;
|
||||
inpos = 0;
|
||||
datalen = PyUnicode_GET_LENGTH(inbuf);
|
||||
|
||||
r = multibytecodec_encode(ctx->codec, &ctx->state,
|
||||
(const Py_UNICODE **)&inbuf, datalen,
|
||||
ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0);
|
||||
inbuf, &inpos,
|
||||
ctx->errors, final ? MBENC_FLUSH | MBENC_RESET : 0);
|
||||
if (r == NULL) {
|
||||
/* recover the original pending buffer */
|
||||
if (origpending > 0)
|
||||
memcpy(ctx->pending, inbuf_tmp,
|
||||
Py_UNICODE_SIZE * origpending);
|
||||
ctx->pendingsize = origpending;
|
||||
Py_CLEAR(ctx->pending);
|
||||
ctx->pending = origpending;
|
||||
origpending = NULL;
|
||||
goto errorexit;
|
||||
}
|
||||
|
||||
if (inbuf < inbuf_end) {
|
||||
ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf);
|
||||
if (ctx->pendingsize > MAXENCPENDING) {
|
||||
if (inpos < datalen) {
|
||||
if (datalen - inpos > MAXENCPENDING) {
|
||||
/* normal codecs can't reach here */
|
||||
ctx->pendingsize = 0;
|
||||
PyErr_SetString(PyExc_UnicodeError,
|
||||
"pending buffer overflow");
|
||||
goto errorexit;
|
||||
}
|
||||
memcpy(ctx->pending, inbuf,
|
||||
ctx->pendingsize * Py_UNICODE_SIZE);
|
||||
ctx->pending = PyUnicode_Substring(inbuf, inpos, datalen);
|
||||
if (ctx->pending == NULL) {
|
||||
/* normal codecs can't reach here */
|
||||
goto errorexit;
|
||||
}
|
||||
}
|
||||
|
||||
if (inbuf_tmp != NULL)
|
||||
PyMem_Del(inbuf_tmp);
|
||||
Py_XDECREF(ucvt);
|
||||
return r;
|
||||
|
||||
errorexit:
|
||||
if (inbuf_tmp != NULL)
|
||||
PyMem_Del(inbuf_tmp);
|
||||
Py_XDECREF(r);
|
||||
Py_XDECREF(ucvt);
|
||||
Py_XDECREF(origpending);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -876,7 +898,7 @@ mbiencoder_reset(MultibyteIncrementalEncoderObject *self)
|
|||
if (r != 0)
|
||||
return NULL;
|
||||
}
|
||||
self->pendingsize = 0;
|
||||
Py_CLEAR(self->pending);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
|
@ -912,7 +934,7 @@ mbiencoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||
}
|
||||
|
||||
self->codec = ((MultibyteCodecObject *)codec)->codec;
|
||||
self->pendingsize = 0;
|
||||
self->pending = NULL;
|
||||
self->errors = internal_error_callback(errors);
|
||||
if (self->errors == NULL)
|
||||
goto errorexit;
|
||||
|
@ -1598,18 +1620,16 @@ mbstreamwriter_writelines(MultibyteStreamWriterObject *self, PyObject *lines)
|
|||
static PyObject *
|
||||
mbstreamwriter_reset(MultibyteStreamWriterObject *self)
|
||||
{
|
||||
const Py_UNICODE *pending;
|
||||
PyObject *pwrt;
|
||||
|
||||
pending = self->pending;
|
||||
pwrt = multibytecodec_encode(self->codec, &self->state,
|
||||
&pending, self->pendingsize, self->errors,
|
||||
self->pending, NULL, self->errors,
|
||||
MBENC_FLUSH | MBENC_RESET);
|
||||
/* some pending buffer can be truncated when UnicodeEncodeError is
|
||||
* raised on 'strict' mode. but, 'reset' method is designed to
|
||||
* reset the pending buffer or states so failed string sequence
|
||||
* ought to be missed */
|
||||
self->pendingsize = 0;
|
||||
Py_CLEAR(self->pending);
|
||||
if (pwrt == NULL)
|
||||
return NULL;
|
||||
|
||||
|
@ -1655,7 +1675,7 @@ mbstreamwriter_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||
self->codec = ((MultibyteCodecObject *)codec)->codec;
|
||||
self->stream = stream;
|
||||
Py_INCREF(stream);
|
||||
self->pendingsize = 0;
|
||||
self->pending = NULL;
|
||||
self->errors = internal_error_callback(errors);
|
||||
if (self->errors == NULL)
|
||||
goto errorexit;
|
||||
|
|
|
@ -27,7 +27,8 @@ typedef union {
|
|||
typedef int (*mbcodec_init)(const void *config);
|
||||
typedef Py_ssize_t (*mbencode_func)(MultibyteCodec_State *state,
|
||||
const void *config,
|
||||
const Py_UNICODE **inbuf, Py_ssize_t inleft,
|
||||
int kind, void *data,
|
||||
Py_ssize_t *inpos, Py_ssize_t inlen,
|
||||
unsigned char **outbuf, Py_ssize_t outleft,
|
||||
int flags);
|
||||
typedef int (*mbencodeinit_func)(MultibyteCodec_State *state,
|
||||
|
@ -75,8 +76,7 @@ typedef struct {
|
|||
#define MAXENCPENDING 2
|
||||
#define _MultibyteStatefulEncoder_HEAD \
|
||||
_MultibyteStatefulCodec_HEAD \
|
||||
Py_UNICODE pending[MAXENCPENDING]; \
|
||||
Py_ssize_t pendingsize;
|
||||
PyObject *pending;
|
||||
typedef struct {
|
||||
_MultibyteStatefulEncoder_HEAD
|
||||
} MultibyteStatefulEncoderContext;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue