cpython/Lib/test/test_capi/test_unicode.py

515 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import unittest
import sys
from test import support
from test.support import import_helper
try:
import _testcapi
except ImportError:
_testcapi = None
class CAPITest(unittest.TestCase):
# Test PyUnicode_FromFormat()
def test_from_format(self):
import_helper.import_module('ctypes')
from ctypes import (
c_char_p,
pythonapi, py_object, sizeof,
c_int, c_long, c_longlong, c_ssize_t,
c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
name = "PyUnicode_FromFormat"
_PyUnicode_FromFormat = getattr(pythonapi, name)
_PyUnicode_FromFormat.argtypes = (c_char_p,)
_PyUnicode_FromFormat.restype = py_object
def PyUnicode_FromFormat(format, *args):
cargs = tuple(
py_object(arg) if isinstance(arg, str) else arg
for arg in args)
return _PyUnicode_FromFormat(format, *cargs)
def check_format(expected, format, *args):
text = PyUnicode_FromFormat(format, *args)
self.assertEqual(expected, text)
# ascii format, non-ascii argument
check_format('ascii\x7f=unicode\xe9',
b'ascii\x7f=%U', 'unicode\xe9')
# non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
# raises an error
self.assertRaisesRegex(ValueError,
r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
'string, got a non-ASCII byte: 0xe9$',
PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
# test "%c"
check_format('\uabcd',
b'%c', c_int(0xabcd))
check_format('\U0010ffff',
b'%c', c_int(0x10ffff))
with self.assertRaises(OverflowError):
PyUnicode_FromFormat(b'%c', c_int(0x110000))
# Issue #18183
check_format('\U00010000\U00100000',
b'%c%c', c_int(0x10000), c_int(0x100000))
# test "%"
check_format('%',
b'%%')
check_format('%s',
b'%%s')
check_format('[%]',
b'[%%]')
check_format('%abc',
b'%%%s', b'abc')
# truncated string
check_format('abc',
b'%.3s', b'abcdef')
check_format('abc[\ufffd',
b'%.5s', 'abc[\u20ac]'.encode('utf8'))
check_format("'\\u20acABC'",
b'%A', '\u20acABC')
check_format("'\\u20",
b'%.5A', '\u20acABCDEF')
check_format("'\u20acABC'",
b'%R', '\u20acABC')
check_format("'\u20acA",
b'%.3R', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3S', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3U', '\u20acABCDEF')
check_format('\u20acAB',
b'%.3V', '\u20acABCDEF', None)
check_format('abc[\ufffd',
b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
# following tests comes from #7330
# test width modifier and precision modifier with %S
check_format("repr= abc",
b'repr=%5S', 'abc')
check_format("repr=ab",
b'repr=%.2S', 'abc')
check_format("repr= ab",
b'repr=%5.2S', 'abc')
# test width modifier and precision modifier with %R
check_format("repr= 'abc'",
b'repr=%8R', 'abc')
check_format("repr='ab",
b'repr=%.3R', 'abc')
check_format("repr= 'ab",
b'repr=%5.3R', 'abc')
# test width modifier and precision modifier with %A
check_format("repr= 'abc'",
b'repr=%8A', 'abc')
check_format("repr='ab",
b'repr=%.3A', 'abc')
check_format("repr= 'ab",
b'repr=%5.3A', 'abc')
# test width modifier and precision modifier with %s
check_format("repr= abc",
b'repr=%5s', b'abc')
check_format("repr=ab",
b'repr=%.2s', b'abc')
check_format("repr= ab",
b'repr=%5.2s', b'abc')
# test width modifier and precision modifier with %U
check_format("repr= abc",
b'repr=%5U', 'abc')
check_format("repr=ab",
b'repr=%.2U', 'abc')
check_format("repr= ab",
b'repr=%5.2U', 'abc')
# test width modifier and precision modifier with %V
check_format("repr= abc",
b'repr=%5V', 'abc', b'123')
check_format("repr=ab",
b'repr=%.2V', 'abc', b'123')
check_format("repr= ab",
b'repr=%5.2V', 'abc', b'123')
check_format("repr= 123",
b'repr=%5V', None, b'123')
check_format("repr=12",
b'repr=%.2V', None, b'123')
check_format("repr= 12",
b'repr=%5.2V', None, b'123')
# test integer formats (%i, %d, %u)
check_format('010',
b'%03i', c_int(10))
check_format('0010',
b'%0.4i', c_int(10))
check_format('-123',
b'%i', c_int(-123))
check_format('-123',
b'%li', c_long(-123))
check_format('-123',
b'%lli', c_longlong(-123))
check_format('-123',
b'%zi', c_ssize_t(-123))
check_format('-123',
b'%d', c_int(-123))
check_format('-123',
b'%ld', c_long(-123))
check_format('-123',
b'%lld', c_longlong(-123))
check_format('-123',
b'%zd', c_ssize_t(-123))
check_format('123',
b'%u', c_uint(123))
check_format('123',
b'%lu', c_ulong(123))
check_format('123',
b'%llu', c_ulonglong(123))
check_format('123',
b'%zu', c_size_t(123))
# test long output
min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
max_longlong = -min_longlong - 1
check_format(str(min_longlong),
b'%lld', c_longlong(min_longlong))
check_format(str(max_longlong),
b'%lld', c_longlong(max_longlong))
max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
check_format(str(max_ulonglong),
b'%llu', c_ulonglong(max_ulonglong))
PyUnicode_FromFormat(b'%p', c_void_p(-1))
# test padding (width and/or precision)
check_format('123'.rjust(10, '0'),
b'%010i', c_int(123))
check_format('123'.rjust(100),
b'%100i', c_int(123))
check_format('123'.rjust(100, '0'),
b'%.100i', c_int(123))
check_format('123'.rjust(80, '0').rjust(100),
b'%100.80i', c_int(123))
check_format('123'.rjust(10, '0'),
b'%010u', c_uint(123))
check_format('123'.rjust(100),
b'%100u', c_uint(123))
check_format('123'.rjust(100, '0'),
b'%.100u', c_uint(123))
check_format('123'.rjust(80, '0').rjust(100),
b'%100.80u', c_uint(123))
check_format('123'.rjust(10, '0'),
b'%010x', c_int(0x123))
check_format('123'.rjust(100),
b'%100x', c_int(0x123))
check_format('123'.rjust(100, '0'),
b'%.100x', c_int(0x123))
check_format('123'.rjust(80, '0').rjust(100),
b'%100.80x', c_int(0x123))
# test %A
check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
# test %V
check_format('repr=abc',
b'repr=%V', 'abc', b'xyz')
# test %p
# We cannot test the exact result,
# because it returns a hex representation of a C pointer,
# which is going to be different each time. But, we can test the format.
p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
self.assertIsInstance(p_format1, str)
self.assertRegex(p_format1, p_format_regex)
p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
self.assertIsInstance(p_format2, str)
self.assertRegex(p_format2,
r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
# Extra args are ignored:
p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
self.assertIsInstance(p_format3, str)
self.assertRegex(p_format3, p_format_regex)
# Test string decode from parameter of %s using utf-8.
# b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
# '\u4eba\u6c11'
check_format('repr=\u4eba\u6c11',
b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
#Test replace error handler.
check_format('repr=abc\ufffd',
b'repr=%V', None, b'abc\xff')
# Issue #33817: empty strings
check_format('',
b'')
check_format('',
b'%s', b'')
# check for crashes
for fmt in (b'%', b'%0', b'%01', b'%.', b'%.1',
b'%0%s', b'%1%s', b'%.%s', b'%.1%s', b'%1abc',
b'%l', b'%ll', b'%z', b'%ls', b'%lls', b'%zs'):
with self.subTest(fmt=fmt):
self.assertRaisesRegex(SystemError, 'invalid format string',
PyUnicode_FromFormat, fmt, b'abc')
self.assertRaisesRegex(SystemError, 'invalid format string',
PyUnicode_FromFormat, b'%+i', c_int(10))
# Test PyUnicode_AsWideChar()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_aswidechar(self):
from _testcapi import unicode_aswidechar
import_helper.import_module('ctypes')
from ctypes import c_wchar, sizeof
wchar, size = unicode_aswidechar('abcdef', 2)
self.assertEqual(size, 2)
self.assertEqual(wchar, 'ab')
wchar, size = unicode_aswidechar('abc', 3)
self.assertEqual(size, 3)
self.assertEqual(wchar, 'abc')
wchar, size = unicode_aswidechar('abc', 4)
self.assertEqual(size, 3)
self.assertEqual(wchar, 'abc\0')
wchar, size = unicode_aswidechar('abc', 10)
self.assertEqual(size, 3)
self.assertEqual(wchar, 'abc\0')
wchar, size = unicode_aswidechar('abc\0def', 20)
self.assertEqual(size, 7)
self.assertEqual(wchar, 'abc\0def\0')
nonbmp = chr(0x10ffff)
if sizeof(c_wchar) == 2:
buflen = 3
nchar = 2
else: # sizeof(c_wchar) == 4
buflen = 2
nchar = 1
wchar, size = unicode_aswidechar(nonbmp, buflen)
self.assertEqual(size, nchar)
self.assertEqual(wchar, nonbmp + '\0')
# Test PyUnicode_AsWideCharString()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_aswidecharstring(self):
from _testcapi import unicode_aswidecharstring
import_helper.import_module('ctypes')
from ctypes import c_wchar, sizeof
wchar, size = unicode_aswidecharstring('abc')
self.assertEqual(size, 3)
self.assertEqual(wchar, 'abc\0')
wchar, size = unicode_aswidecharstring('abc\0def')
self.assertEqual(size, 7)
self.assertEqual(wchar, 'abc\0def\0')
nonbmp = chr(0x10ffff)
if sizeof(c_wchar) == 2:
nchar = 2
else: # sizeof(c_wchar) == 4
nchar = 1
wchar, size = unicode_aswidecharstring(nonbmp)
self.assertEqual(size, nchar)
self.assertEqual(wchar, nonbmp + '\0')
# Test PyUnicode_AsUCS4()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_asucs4(self):
from _testcapi import unicode_asucs4
for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
'a\ud800b\udfffc', '\ud834\udd1e']:
l = len(s)
self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
s = '\0'.join([s, s])
self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
# Test PyUnicode_AsUTF8()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_asutf8(self):
from _testcapi import unicode_asutf8
bmp = '\u0100'
bmp2 = '\uffff'
nonbmp = chr(0x10ffff)
self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
# Test PyUnicode_AsUTF8AndSize()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_asutf8andsize(self):
from _testcapi import unicode_asutf8andsize
bmp = '\u0100'
bmp2 = '\uffff'
nonbmp = chr(0x10ffff)
self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
# Test PyUnicode_Count()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_count(self):
from _testcapi import unicode_count
st = 'abcabd'
self.assertEqual(unicode_count(st, 'a', 0, len(st)), 2)
self.assertEqual(unicode_count(st, 'ab', 0, len(st)), 2)
self.assertEqual(unicode_count(st, 'abc', 0, len(st)), 1)
self.assertEqual(unicode_count(st, 'а', 0, len(st)), 0) # cyrillic "a"
# start < end
self.assertEqual(unicode_count(st, 'a', 3, len(st)), 1)
self.assertEqual(unicode_count(st, 'a', 4, len(st)), 0)
self.assertEqual(unicode_count(st, 'a', 0, sys.maxsize), 2)
# start >= end
self.assertEqual(unicode_count(st, 'abc', 0, 0), 0)
self.assertEqual(unicode_count(st, 'a', 3, 2), 0)
self.assertEqual(unicode_count(st, 'a', sys.maxsize, 5), 0)
# negative
self.assertEqual(unicode_count(st, 'ab', -len(st), -1), 2)
self.assertEqual(unicode_count(st, 'a', -len(st), -3), 1)
# wrong args
self.assertRaises(TypeError, unicode_count, 'a', 'a')
self.assertRaises(TypeError, unicode_count, 'a', 'a', 1)
self.assertRaises(TypeError, unicode_count, 1, 'a', 0, 1)
self.assertRaises(TypeError, unicode_count, 'a', 1, 0, 1)
# empty string
self.assertEqual(unicode_count('abc', '', 0, 3), 4)
self.assertEqual(unicode_count('abc', '', 1, 3), 3)
self.assertEqual(unicode_count('', '', 0, 1), 1)
self.assertEqual(unicode_count('', 'a', 0, 1), 0)
# different unicode kinds
for uni in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
for ch in uni:
self.assertEqual(unicode_count(uni, ch, 0, len(uni)), 1)
self.assertEqual(unicode_count(st, ch, 0, len(st)), 0)
# subclasses should still work
class MyStr(str):
pass
self.assertEqual(unicode_count(MyStr('aab'), 'a', 0, 3), 2)
# Test PyUnicode_FindChar()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_findchar(self):
from _testcapi import unicode_findchar
for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
for i, ch in enumerate(str):
self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
str = "!>_<!"
self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
# start < end
self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
# start >= end
self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
# negative
self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
# Test PyUnicode_CopyCharacters()
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_copycharacters(self):
from _testcapi import unicode_copycharacters
strings = [
'abcde', '\xa1\xa2\xa3\xa4\xa5',
'\u4f60\u597d\u4e16\u754c\uff01',
'\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
]
for idx, from_ in enumerate(strings):
# wide -> narrow: exceed maxchar limitation
for to in strings[:idx]:
self.assertRaises(
SystemError,
unicode_copycharacters, to, 0, from_, 0, 5
)
# same kind
for from_start in range(5):
self.assertEqual(
unicode_copycharacters(from_, 0, from_, from_start, 5),
(from_[from_start:from_start+5].ljust(5, '\0'),
5-from_start)
)
for to_start in range(5):
self.assertEqual(
unicode_copycharacters(from_, to_start, from_, to_start, 5),
(from_[to_start:to_start+5].rjust(5, '\0'),
5-to_start)
)
# narrow -> wide
# Tests omitted since this creates invalid strings.
s = strings[0]
self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_pep393_utf8_caching_bug(self):
# Issue #25709: Problem with string concatenation and utf-8 cache
from _testcapi import getargs_s_hash
for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
s = ''
for i in range(5):
# Due to CPython specific optimization the 's' string can be
# resized in-place.
s += chr(k)
# Parsing with the "s#" format code calls indirectly
# PyUnicode_AsUTF8AndSize() which creates the UTF-8
# encoded string cached in the Unicode object.
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
# Check that the second call returns the same result
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
if __name__ == "__main__":
unittest.main()