Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error
cases, increasing coverage in unicodedata.c from 87% to 95%
(when the normalization tests are run). From SF patch #662807.
This commit is contained in:
Walter Dörwald 2003-02-26 14:49:41 +00:00
parent 0ff7a4e7c1
commit 37c4728c64
4 changed files with 313 additions and 242 deletions

View file

@ -1,9 +0,0 @@
test_ucn
Testing General Unicode Character Name, and case insensitivity... done.
Testing name to code mapping.... done.
Testing hangul syllable names.... done.
Testing names of CJK unified ideographs.... done.
Testing code to name mapping for all BMP characters.... done.
Found 50212 characters in the unicode name database
Testing misc. symbols for unicode character name expansion.... done.
Testing unicode character name expansion strict error handling.... done.

View file

@ -1,5 +0,0 @@
test_unicodedata
Testing Unicode Database...
Methods: a37276dc2c158bef6dfd908ad34525c97180fad9
Functions: cfe20a967a450ebc82ca68c3e4eed344164e11af
API: ok

View file

@ -6,144 +6,141 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#"
from test.test_support import verify, verbose
print 'Testing General Unicode Character Name, and case insensitivity...',
import unittest
# General and case insensitivity test:
try:
# put all \N escapes inside exec'd raw strings, to make sure this
# script runs even if the compiler chokes on \N escapes
exec r"""
s = u"\N{LATIN CAPITAL LETTER T}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER R}" \
u"\N{LATIN CAPITAL LETTER E}" \
u"\N{LATIN SMALL LETTER D}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER f}" \
u"\N{LATIN CAPITAL LeTtEr o}" \
u"\N{LATIN SMaLl LETTER x}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER A}" \
u"\N{LATIN SMALL LETTER T}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER T}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SpAcE}" \
u"\N{LATIN SMALL LETTER S}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{LATIN SMALL LETTER P}" \
u"\N{FULL STOP}"
verify(s == u"The rEd fOx ate the sheep.", s)
"""
except UnicodeError, v:
print v
print "done."
from test import test_support
import unicodedata
class UnicodeNamesTest(unittest.TestCase):
print "Testing name to code mapping....",
for char in "SPAM":
name = "LATIN SMALL LETTER %s" % char
def checkletter(self, name, code):
    # Helper that builds the \N{...} escape as text and evaluates it at
    # run time, so this script still compiles even if the compiler
    # chokes on \N escapes.  Checks that the named character equals
    # `code` and returns the evaluated character.
    literal = u'u"\\N{%s}"' % name
    char = eval(literal)
    self.assertEqual(char, code)
    return char
def test_general(self):
# General and case insensitivity test:
chars = [
"LATIN CAPITAL LETTER T",
"LATIN SMALL LETTER H",
"LATIN SMALL LETTER E",
"SPACE",
"LATIN SMALL LETTER R",
"LATIN CAPITAL LETTER E",
"LATIN SMALL LETTER D",
"SPACE",
"LATIN SMALL LETTER f",
"LATIN CAPITAL LeTtEr o",
"LATIN SMaLl LETTER x",
"SPACE",
"LATIN SMALL LETTER A",
"LATIN SMALL LETTER T",
"LATIN SMALL LETTER E",
"SPACE",
"LATIN SMALL LETTER T",
"LATIN SMALL LETTER H",
"LATIN SMALL LETTER E",
"SpAcE",
"LATIN SMALL LETTER S",
"LATIN SMALL LETTER H",
"LATIN small LETTER e",
"LATIN small LETTER e",
"LATIN SMALL LETTER P",
"FULL STOP"
]
string = u"The rEd fOx ate the sheep."
self.assertEqual(
u"".join([self.checkletter(*args) for args in zip(chars, string)]),
string
)
def test_ascii_letters(self):
import unicodedata
for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
name = "LATIN SMALL LETTER %s" % char.upper()
code = unicodedata.lookup(name)
verify(unicodedata.name(code) == name)
print "done."
self.assertEqual(unicodedata.name(code), name)
print "Testing hangul syllable names....",
exec r"""
verify(u"\N{HANGUL SYLLABLE GA}" == u"\uac00")
verify(u"\N{HANGUL SYLLABLE GGWEOSS}" == u"\uafe8")
verify(u"\N{HANGUL SYLLABLE DOLS}" == u"\ub3d0")
verify(u"\N{HANGUL SYLLABLE RYAN}" == u"\ub7b8")
verify(u"\N{HANGUL SYLLABLE MWIK}" == u"\ubba0")
verify(u"\N{HANGUL SYLLABLE BBWAEM}" == u"\ubf88")
verify(u"\N{HANGUL SYLLABLE SSEOL}" == u"\uc370")
verify(u"\N{HANGUL SYLLABLE YI}" == u"\uc758")
verify(u"\N{HANGUL SYLLABLE JJYOSS}" == u"\ucb40")
verify(u"\N{HANGUL SYLLABLE KYEOLS}" == u"\ucf28")
verify(u"\N{HANGUL SYLLABLE PAN}" == u"\ud310")
verify(u"\N{HANGUL SYLLABLE HWEOK}" == u"\ud6f8")
verify(u"\N{HANGUL SYLLABLE HIH}" == u"\ud7a3")
"""
try:
unicodedata.name(u"\ud7a4")
except ValueError:
pass
else:
raise AssertionError, "Found name for U+D7A4"
print "done."
def test_hangul_syllables(self):
self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
print "Testing names of CJK unified ideographs....",
exec r"""
verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400")
verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5")
verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
"""
print "done."
import unicodedata
self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
print "Testing code to name mapping for all BMP characters....",
count = 0
for code in range(0x10000):
try:
def test_cjk_unified_ideographs(self):
self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
def test_bmp_characters(self):
import unicodedata
count = 0
for code in xrange(0x10000):
char = unichr(code)
name = unicodedata.name(char)
except (KeyError, ValueError):
pass
else:
verify(unicodedata.lookup(name) == char)
name = unicodedata.name(char, None)
if name is not None:
self.assertEqual(unicodedata.lookup(name), char)
count += 1
print "done."
print "Found", count, "characters in the unicode name database"
def test_misc_symbols(self):
self.checkletter("PILCROW SIGN", u"\u00b6")
self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
# misc. symbol testing
print "Testing misc. symbols for unicode character name expansion....",
exec r"""
verify(u"\N{PILCROW SIGN}" == u"\u00b6")
verify(u"\N{REPLACEMENT CHARACTER}" == u"\uFFFD")
verify(u"\N{HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK}" == u"\uFF9F")
verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41")
"""
print "done."
def test_errors(self):
import unicodedata
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, u'xx')
self.assertRaises(TypeError, unicodedata.lookup)
self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
# strict error testing:
print "Testing unicode character name expansion strict error handling....",
try:
unicode("\N{blah}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when given a bogus character name"
def test_strict_eror_handling(self):
    # Decoding a malformed \N escape with the 'unicode-escape' codec and
    # 'strict' error handling must raise UnicodeError.
    # NOTE(review): "eror" in the method name looks like a typo for
    # "error"; the name is left unchanged here.
    malformed = (
        "\\N{blah}",                   # bogus character name
        "\\N{%s}" % ("x" * 100000),    # long bogus character name
        "\\N{SPACE",                   # missing closing brace
        "\\NSPACE",                    # missing opening brace
    )
    for escape in malformed:
        self.assertRaises(
            UnicodeError,
            unicode, escape, 'unicode-escape', 'strict'
        )
try:
unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when given a very " \
"long bogus character name"
def test_main():
    """Collect the UnicodeNamesTest cases and run them via test_support."""
    names_suite = unittest.makeSuite(UnicodeNamesTest)
    suite = unittest.TestSuite()
    suite.addTest(names_suite)
    test_support.run_suite(suite)
try:
unicode("\N{SPACE", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception for a missing closing brace."
try:
unicode("\NSPACE", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception for a missing opening brace."
print "done."
if __name__ == "__main__":
test_main()

View file

@ -5,39 +5,45 @@
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#"
from test.test_support import verify, verbose
import unittest, test.test_support
import sha
encoding = 'utf-8'
def test_methods():
### Run tests
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9'
def test_method_checksum(self):
h = sha.sha()
for i in range(65536):
char = unichr(i)
data = [
# Predicates (single char)
char.isalnum() and u'1' or u'0',
char.isalpha() and u'1' or u'0',
char.isdecimal() and u'1' or u'0',
char.isdigit() and u'1' or u'0',
char.islower() and u'1' or u'0',
char.isnumeric() and u'1' or u'0',
char.isspace() and u'1' or u'0',
char.istitle() and u'1' or u'0',
char.isupper() and u'1' or u'0',
u"01"[char.isalnum()],
u"01"[char.isalpha()],
u"01"[char.isdecimal()],
u"01"[char.isdigit()],
u"01"[char.islower()],
u"01"[char.isnumeric()],
u"01"[char.isspace()],
u"01"[char.istitle()],
u"01"[char.isupper()],
# Predicates (multiple chars)
(char + u'abc').isalnum() and u'1' or u'0',
(char + u'abc').isalpha() and u'1' or u'0',
(char + u'123').isdecimal() and u'1' or u'0',
(char + u'123').isdigit() and u'1' or u'0',
(char + u'abc').islower() and u'1' or u'0',
(char + u'123').isnumeric() and u'1' or u'0',
(char + u' \t').isspace() and u'1' or u'0',
(char + u'abc').istitle() and u'1' or u'0',
(char + u'ABC').isupper() and u'1' or u'0',
u"01"[(char + u'abc').isalnum()],
u"01"[(char + u'abc').isalpha()],
u"01"[(char + u'123').isdecimal()],
u"01"[(char + u'123').isdigit()],
u"01"[(char + u'abc').islower()],
u"01"[(char + u'123').isnumeric()],
u"01"[(char + u' \t').isspace()],
u"01"[(char + u'abc').istitle()],
u"01"[(char + u'ABC').isupper()],
# Mappings (single char)
char.lower(),
@ -52,74 +58,156 @@ def test_methods():
]
h.update(u''.join(data).encode(encoding))
return h.hexdigest()
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)
def test_unicodedata():
class UnicodeDatabaseTest(unittest.TestCase):
    """Base test case that exposes the unicodedata module as ``self.db``."""

    def setUp(self):
        # Imported here rather than at module level: in case unicodedata
        # is not available this raises an ImportError, but the other
        # test cases will still be run.
        import unicodedata as database
        self.db = database

    def tearDown(self):
        # Drop the module reference stored by setUp().
        delattr(self, "db")
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af'
def test_function_checksum(self):
data = []
h = sha.sha()
for i in range(65536):
for i in range(0x10000):
char = unichr(i)
data = [
# Properties
str(unicodedata.digit(char, -1)),
str(unicodedata.numeric(char, -1)),
str(unicodedata.decimal(char, -1)),
unicodedata.category(char),
unicodedata.bidirectional(char),
unicodedata.decomposition(char),
str(unicodedata.mirrored(char)),
str(unicodedata.combining(char)),
str(self.db.digit(char, -1)),
str(self.db.numeric(char, -1)),
str(self.db.decimal(char, -1)),
self.db.category(char),
self.db.bidirectional(char),
self.db.decomposition(char),
str(self.db.mirrored(char)),
str(self.db.combining(char)),
]
h.update(''.join(data))
return h.hexdigest()
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)
### Run tests
def test_digit(self):
self.assertEqual(self.db.digit(u'A', None), None)
self.assertEqual(self.db.digit(u'9'), 9)
self.assertEqual(self.db.digit(u'\u215b', None), None)
self.assertEqual(self.db.digit(u'\u2468'), 9)
print 'Testing Unicode Database...'
print 'Methods:',
print test_methods()
self.assertRaises(TypeError, self.db.digit)
self.assertRaises(TypeError, self.db.digit, u'xx')
self.assertRaises(ValueError, self.db.digit, u'x')
# In case unicodedata is not available, this will raise an ImportError,
# but still test the above cases...
import unicodedata
print 'Functions:',
print test_unicodedata()
def test_numeric(self):
self.assertEqual(self.db.numeric(u'A',None), None)
self.assertEqual(self.db.numeric(u'9'), 9)
self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
# Some additional checks of the API:
print 'API:',
self.assertRaises(TypeError, self.db.numeric)
self.assertRaises(TypeError, self.db.numeric, u'xx')
self.assertRaises(ValueError, self.db.numeric, u'x')
verify(unicodedata.digit(u'A',None) is None)
verify(unicodedata.digit(u'9') == 9)
verify(unicodedata.digit(u'\u215b',None) is None)
verify(unicodedata.digit(u'\u2468') == 9)
def test_decimal(self):
self.assertEqual(self.db.decimal(u'A',None), None)
self.assertEqual(self.db.decimal(u'9'), 9)
self.assertEqual(self.db.decimal(u'\u215b', None), None)
self.assertEqual(self.db.decimal(u'\u2468', None), None)
verify(unicodedata.numeric(u'A',None) is None)
verify(unicodedata.numeric(u'9') == 9)
verify(unicodedata.numeric(u'\u215b') == 0.125)
verify(unicodedata.numeric(u'\u2468') == 9.0)
self.assertRaises(TypeError, self.db.decimal)
self.assertRaises(TypeError, self.db.decimal, u'xx')
self.assertRaises(ValueError, self.db.decimal, u'x')
verify(unicodedata.decimal(u'A',None) is None)
verify(unicodedata.decimal(u'9') == 9)
verify(unicodedata.decimal(u'\u215b',None) is None)
verify(unicodedata.decimal(u'\u2468',None) is None)
def test_category(self):
self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
self.assertEqual(self.db.category(u'a'), 'Ll')
self.assertEqual(self.db.category(u'A'), 'Lu')
verify(unicodedata.category(u'\uFFFE') == 'Cn')
verify(unicodedata.category(u'a') == 'Ll')
verify(unicodedata.category(u'A') == 'Lu')
self.assertRaises(TypeError, self.db.category)
self.assertRaises(TypeError, self.db.category, u'xx')
verify(unicodedata.bidirectional(u'\uFFFE') == '')
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')
def test_bidirectional(self):
self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
self.assertEqual(self.db.bidirectional(u' '), 'WS')
self.assertEqual(self.db.bidirectional(u'A'), 'L')
verify(unicodedata.decomposition(u'\uFFFE') == '')
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034')
self.assertRaises(TypeError, self.db.bidirectional)
self.assertRaises(TypeError, self.db.bidirectional, u'xx')
verify(unicodedata.mirrored(u'\uFFFE') == 0)
verify(unicodedata.mirrored(u'a') == 0)
verify(unicodedata.mirrored(u'\u2201') == 1)
def test_decomposition(self):
self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')
verify(unicodedata.combining(u'\uFFFE') == 0)
verify(unicodedata.combining(u'a') == 0)
verify(unicodedata.combining(u'\u20e1') == 230)
self.assertRaises(TypeError, self.db.decomposition)
self.assertRaises(TypeError, self.db.decomposition, u'xx')
print 'ok'
def test_mirrored(self):
self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
self.assertEqual(self.db.mirrored(u'a'), 0)
self.assertEqual(self.db.mirrored(u'\u2201'), 1)
self.assertRaises(TypeError, self.db.mirrored)
self.assertRaises(TypeError, self.db.mirrored, u'xx')
def test_combining(self):
self.assertEqual(self.db.combining(u'\uFFFE'), 0)
self.assertEqual(self.db.combining(u'a'), 0)
self.assertEqual(self.db.combining(u'\u20e1'), 230)
self.assertRaises(TypeError, self.db.combining)
self.assertRaises(TypeError, self.db.combining, u'xx')
def test_normalize(self):
self.assertRaises(TypeError, self.db.normalize)
self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
# The rest can be found in test_normalization.py
# which requires an external file.
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Cross-checks between the numeric property functions."""

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        checked = 0
        for codepoint in xrange(0x10000):
            char = unichr(codepoint)
            decimal = self.db.decimal(char, -1)
            if decimal == -1:
                # -1 is the default passed above: no decimal value
                continue
            self.assertEqual(decimal, self.db.numeric(char))
            checked += 1
        # should have tested at least the ASCII digits
        self.assert_(checked >= 10)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        checked = 0
        for codepoint in xrange(0x10000):
            char = unichr(codepoint)
            digit = self.db.digit(char, -1)
            if digit == -1:
                # -1 is the default passed above: no digit value
                continue
            self.assertEqual(digit, self.db.numeric(char))
            checked += 1
        # should have tested at least the ASCII digits
        self.assert_(checked >= 10)
def test_main():
    """Collect this module's test case classes and run them."""
    suite = unittest.TestSuite()
    case_classes = (
        UnicodeMiscTest,
        UnicodeMethodsTest,
        UnicodeFunctionsTest,
    )
    for case_class in case_classes:
        suite.addTest(unittest.makeSuite(case_class))
    test.test_support.run_suite(suite)
if __name__ == "__main__":
test_main()