Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases,
increasing coverage in unicodedata.c from 87% to 95%
(when the normalization tests are run). From SF patch #662807.
This commit is contained in:
Walter Dörwald 2003-02-26 14:49:41 +00:00
parent 0ff7a4e7c1
commit 37c4728c64
4 changed files with 313 additions and 242 deletions

View file

@ -1,9 +0,0 @@
test_ucn
Testing General Unicode Character Name, and case insensitivity... done.
Testing name to code mapping.... done.
Testing hangul syllable names.... done.
Testing names of CJK unified ideographs.... done.
Testing code to name mapping for all BMP characters.... done.
Found 50212 characters in the unicode name database
Testing misc. symbols for unicode character name expansion.... done.
Testing unicode character name expansion strict error handling.... done.

View file

@ -1,5 +0,0 @@
test_unicodedata
Testing Unicode Database...
Methods: a37276dc2c158bef6dfd908ad34525c97180fad9
Functions: cfe20a967a450ebc82ca68c3e4eed344164e11af
API: ok

View file

@ -6,144 +6,141 @@ Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#" """#"
from test.test_support import verify, verbose
print 'Testing General Unicode Character Name, and case insensitivity...', import unittest
# General and case insensitivity test: from test import test_support
try:
# put all \N escapes inside exec'd raw strings, to make sure this
# script runs even if the compiler chokes on \N escapes
exec r"""
s = u"\N{LATIN CAPITAL LETTER T}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER R}" \
u"\N{LATIN CAPITAL LETTER E}" \
u"\N{LATIN SMALL LETTER D}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER f}" \
u"\N{LATIN CAPITAL LeTtEr o}" \
u"\N{LATIN SMaLl LETTER x}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER A}" \
u"\N{LATIN SMALL LETTER T}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SPACE}" \
u"\N{LATIN SMALL LETTER T}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{SpAcE}" \
u"\N{LATIN SMALL LETTER S}" \
u"\N{LATIN SMALL LETTER H}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{LATIN SMALL LETTER E}" \
u"\N{LATIN SMALL LETTER P}" \
u"\N{FULL STOP}"
verify(s == u"The rEd fOx ate the sheep.", s)
"""
except UnicodeError, v:
print v
print "done."
import unicodedata class UnicodeNamesTest(unittest.TestCase):
print "Testing name to code mapping....", def checkletter(self, name, code):
for char in "SPAM": # Helper that put all \N escapes inside eval'd raw strings,
name = "LATIN SMALL LETTER %s" % char # to make sure this script runs even if the compiler
code = unicodedata.lookup(name) # chokes on \N escapes
verify(unicodedata.name(code) == name) res = eval(ur'u"\N{%s}"' % name)
print "done." self.assertEqual(res, code)
return res
print "Testing hangul syllable names....", def test_general(self):
exec r""" # General and case insensitivity test:
verify(u"\N{HANGUL SYLLABLE GA}" == u"\uac00") chars = [
verify(u"\N{HANGUL SYLLABLE GGWEOSS}" == u"\uafe8") "LATIN CAPITAL LETTER T",
verify(u"\N{HANGUL SYLLABLE DOLS}" == u"\ub3d0") "LATIN SMALL LETTER H",
verify(u"\N{HANGUL SYLLABLE RYAN}" == u"\ub7b8") "LATIN SMALL LETTER E",
verify(u"\N{HANGUL SYLLABLE MWIK}" == u"\ubba0") "SPACE",
verify(u"\N{HANGUL SYLLABLE BBWAEM}" == u"\ubf88") "LATIN SMALL LETTER R",
verify(u"\N{HANGUL SYLLABLE SSEOL}" == u"\uc370") "LATIN CAPITAL LETTER E",
verify(u"\N{HANGUL SYLLABLE YI}" == u"\uc758") "LATIN SMALL LETTER D",
verify(u"\N{HANGUL SYLLABLE JJYOSS}" == u"\ucb40") "SPACE",
verify(u"\N{HANGUL SYLLABLE KYEOLS}" == u"\ucf28") "LATIN SMALL LETTER f",
verify(u"\N{HANGUL SYLLABLE PAN}" == u"\ud310") "LATIN CAPITAL LeTtEr o",
verify(u"\N{HANGUL SYLLABLE HWEOK}" == u"\ud6f8") "LATIN SMaLl LETTER x",
verify(u"\N{HANGUL SYLLABLE HIH}" == u"\ud7a3") "SPACE",
""" "LATIN SMALL LETTER A",
try: "LATIN SMALL LETTER T",
unicodedata.name(u"\ud7a4") "LATIN SMALL LETTER E",
except ValueError: "SPACE",
pass "LATIN SMALL LETTER T",
else: "LATIN SMALL LETTER H",
raise AssertionError, "Found name for U+D7A4" "LATIN SMALL LETTER E",
print "done." "SpAcE",
"LATIN SMALL LETTER S",
"LATIN SMALL LETTER H",
"LATIN small LETTER e",
"LATIN small LETTER e",
"LATIN SMALL LETTER P",
"FULL STOP"
]
string = u"The rEd fOx ate the sheep."
print "Testing names of CJK unified ideographs....", self.assertEqual(
exec r""" u"".join([self.checkletter(*args) for args in zip(chars, string)]),
verify(u"\N{CJK UNIFIED IDEOGRAPH-3400}" == u"\u3400") string
verify(u"\N{CJK UNIFIED IDEOGRAPH-4DB5}" == u"\u4db5") )
verify(u"\N{CJK UNIFIED IDEOGRAPH-4E00}" == u"\u4e00")
verify(u"\N{CJK UNIFIED IDEOGRAPH-9FA5}" == u"\u9fa5")
verify(u"\N{CJK UNIFIED IDEOGRAPH-20000}" == u"\U00020000")
verify(u"\N{CJK UNIFIED IDEOGRAPH-2A6D6}" == u"\U0002a6d6")
"""
print "done."
print "Testing code to name mapping for all BMP characters....", def test_ascii_letters(self):
count = 0 import unicodedata
for code in range(0x10000):
try:
char = unichr(code)
name = unicodedata.name(char)
except (KeyError, ValueError):
pass
else:
verify(unicodedata.lookup(name) == char)
count += 1
print "done."
print "Found", count, "characters in the unicode name database" for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
name = "LATIN SMALL LETTER %s" % char.upper()
code = unicodedata.lookup(name)
self.assertEqual(unicodedata.name(code), name)
# misc. symbol testing def test_hangul_syllables(self):
print "Testing misc. symbols for unicode character name expansion....", self.checkletter("HANGUL SYLLABLE GA", u"\uac00")
exec r""" self.checkletter("HANGUL SYLLABLE GGWEOSS", u"\uafe8")
verify(u"\N{PILCROW SIGN}" == u"\u00b6") self.checkletter("HANGUL SYLLABLE DOLS", u"\ub3d0")
verify(u"\N{REPLACEMENT CHARACTER}" == u"\uFFFD") self.checkletter("HANGUL SYLLABLE RYAN", u"\ub7b8")
verify(u"\N{HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK}" == u"\uFF9F") self.checkletter("HANGUL SYLLABLE MWIK", u"\ubba0")
verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41") self.checkletter("HANGUL SYLLABLE BBWAEM", u"\ubf88")
""" self.checkletter("HANGUL SYLLABLE SSEOL", u"\uc370")
print "done." self.checkletter("HANGUL SYLLABLE YI", u"\uc758")
self.checkletter("HANGUL SYLLABLE JJYOSS", u"\ucb40")
self.checkletter("HANGUL SYLLABLE KYEOLS", u"\ucf28")
self.checkletter("HANGUL SYLLABLE PAN", u"\ud310")
self.checkletter("HANGUL SYLLABLE HWEOK", u"\ud6f8")
self.checkletter("HANGUL SYLLABLE HIH", u"\ud7a3")
# strict error testing: import unicodedata
print "Testing unicode character name expansion strict error handling....", self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
try:
unicode("\N{blah}", 'unicode-escape', 'strict')
except UnicodeError:
pass
else:
raise AssertionError, "failed to raise an exception when given a bogus character name"
try: def test_cjk_unified_ideographs(self):
unicode("\N{" + "x" * 100000 + "}", 'unicode-escape', 'strict') self.checkletter("CJK UNIFIED IDEOGRAPH-3400", u"\u3400")
except UnicodeError: self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", u"\u4db5")
pass self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", u"\u4e00")
else: self.checkletter("CJK UNIFIED IDEOGRAPH-9FA5", u"\u9fa5")
raise AssertionError, "failed to raise an exception when given a very " \ self.checkletter("CJK UNIFIED IDEOGRAPH-20000", u"\U00020000")
"long bogus character name" self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", u"\U0002a6d6")
try: def test_bmp_characters(self):
unicode("\N{SPACE", 'unicode-escape', 'strict') import unicodedata
except UnicodeError: count = 0
pass for code in xrange(0x10000):
else: char = unichr(code)
raise AssertionError, "failed to raise an exception for a missing closing brace." name = unicodedata.name(char, None)
if name is not None:
self.assertEqual(unicodedata.lookup(name), char)
count += 1
try: def test_misc_symbols(self):
unicode("\NSPACE", 'unicode-escape', 'strict') self.checkletter("PILCROW SIGN", u"\u00b6")
except UnicodeError: self.checkletter("REPLACEMENT CHARACTER", u"\uFFFD")
pass self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", u"\uFF9F")
else: self.checkletter("FULLWIDTH LATIN SMALL LETTER A", u"\uFF41")
raise AssertionError, "failed to raise an exception for a missing opening brace."
print "done." def test_errors(self):
import unicodedata
self.assertRaises(TypeError, unicodedata.name)
self.assertRaises(TypeError, unicodedata.name, u'xx')
self.assertRaises(TypeError, unicodedata.lookup)
self.assertRaises(KeyError, unicodedata.lookup, u'unknown')
def test_strict_eror_handling(self):
    # Every malformed \N escape below must make the strict
    # 'unicode-escape' decoder raise UnicodeError.
    malformed = [
        # bogus character name
        "\\N{blah}",
        # long bogus character name
        "\\N{%s}" % ("x" * 100000),
        # missing closing brace
        "\\N{SPACE",
        # missing opening brace
        "\\NSPACE",
    ]
    for escaped in malformed:
        self.assertRaises(
            UnicodeError,
            unicode, escaped, 'unicode-escape', 'strict'
        )
def test_main():
    """Collect the UnicodeNamesTest cases and run them via test_support.run_suite."""
    names_suite = unittest.makeSuite(UnicodeNamesTest)
    test_support.run_suite(unittest.TestSuite([names_suite]))


# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()

View file

@ -5,121 +5,209 @@
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
"""#" """#"
from test.test_support import verify, verbose import unittest, test.test_support
import sha import sha
encoding = 'utf-8' encoding = 'utf-8'
def test_methods():
h = sha.sha()
for i in range(65536):
char = unichr(i)
data = [
# Predicates (single char)
char.isalnum() and u'1' or u'0',
char.isalpha() and u'1' or u'0',
char.isdecimal() and u'1' or u'0',
char.isdigit() and u'1' or u'0',
char.islower() and u'1' or u'0',
char.isnumeric() and u'1' or u'0',
char.isspace() and u'1' or u'0',
char.istitle() and u'1' or u'0',
char.isupper() and u'1' or u'0',
# Predicates (multiple chars)
(char + u'abc').isalnum() and u'1' or u'0',
(char + u'abc').isalpha() and u'1' or u'0',
(char + u'123').isdecimal() and u'1' or u'0',
(char + u'123').isdigit() and u'1' or u'0',
(char + u'abc').islower() and u'1' or u'0',
(char + u'123').isnumeric() and u'1' or u'0',
(char + u' \t').isspace() and u'1' or u'0',
(char + u'abc').istitle() and u'1' or u'0',
(char + u'ABC').isupper() and u'1' or u'0',
# Mappings (single char)
char.lower(),
char.upper(),
char.title(),
# Mappings (multiple chars)
(char + u'abc').lower(),
(char + u'ABC').upper(),
(char + u'abc').title(),
(char + u'ABC').title(),
]
h.update(u''.join(data).encode(encoding))
return h.hexdigest()
def test_unicodedata():
h = sha.sha()
for i in range(65536):
char = unichr(i)
data = [
# Properties
str(unicodedata.digit(char, -1)),
str(unicodedata.numeric(char, -1)),
str(unicodedata.decimal(char, -1)),
unicodedata.category(char),
unicodedata.bidirectional(char),
unicodedata.decomposition(char),
str(unicodedata.mirrored(char)),
str(unicodedata.combining(char)),
]
h.update(''.join(data))
return h.hexdigest()
### Run tests ### Run tests
print 'Testing Unicode Database...' class UnicodeMethodsTest(unittest.TestCase):
print 'Methods:',
print test_methods()
# In case unicodedata is not available, this will raise an ImportError, # update this, if the database changes
# but still test the above cases... expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9'
import unicodedata
print 'Functions:',
print test_unicodedata()
# Some additional checks of the API: def test_method_checksum(self):
print 'API:', h = sha.sha()
for i in range(65536):
char = unichr(i)
data = [
# Predicates (single char)
u"01"[char.isalnum()],
u"01"[char.isalpha()],
u"01"[char.isdecimal()],
u"01"[char.isdigit()],
u"01"[char.islower()],
u"01"[char.isnumeric()],
u"01"[char.isspace()],
u"01"[char.istitle()],
u"01"[char.isupper()],
verify(unicodedata.digit(u'A',None) is None) # Predicates (multiple chars)
verify(unicodedata.digit(u'9') == 9) u"01"[(char + u'abc').isalnum()],
verify(unicodedata.digit(u'\u215b',None) is None) u"01"[(char + u'abc').isalpha()],
verify(unicodedata.digit(u'\u2468') == 9) u"01"[(char + u'123').isdecimal()],
u"01"[(char + u'123').isdigit()],
u"01"[(char + u'abc').islower()],
u"01"[(char + u'123').isnumeric()],
u"01"[(char + u' \t').isspace()],
u"01"[(char + u'abc').istitle()],
u"01"[(char + u'ABC').isupper()],
verify(unicodedata.numeric(u'A',None) is None) # Mappings (single char)
verify(unicodedata.numeric(u'9') == 9) char.lower(),
verify(unicodedata.numeric(u'\u215b') == 0.125) char.upper(),
verify(unicodedata.numeric(u'\u2468') == 9.0) char.title(),
verify(unicodedata.decimal(u'A',None) is None) # Mappings (multiple chars)
verify(unicodedata.decimal(u'9') == 9) (char + u'abc').lower(),
verify(unicodedata.decimal(u'\u215b',None) is None) (char + u'ABC').upper(),
verify(unicodedata.decimal(u'\u2468',None) is None) (char + u'abc').title(),
(char + u'ABC').title(),
verify(unicodedata.category(u'\uFFFE') == 'Cn') ]
verify(unicodedata.category(u'a') == 'Ll') h.update(u''.join(data).encode(encoding))
verify(unicodedata.category(u'A') == 'Lu') result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)
verify(unicodedata.bidirectional(u'\uFFFE') == '') class UnicodeDatabaseTest(unittest.TestCase):
verify(unicodedata.bidirectional(u' ') == 'WS')
verify(unicodedata.bidirectional(u'A') == 'L')
verify(unicodedata.decomposition(u'\uFFFE') == '') def setUp(self):
verify(unicodedata.decomposition(u'\u00bc') == '<fraction> 0031 2044 0034') # In case unicodedata is not available, this will raise an ImportError,
# but the other test cases will still be run
import unicodedata
self.db = unicodedata
verify(unicodedata.mirrored(u'\uFFFE') == 0) def tearDown(self):
verify(unicodedata.mirrored(u'a') == 0) del self.db
verify(unicodedata.mirrored(u'\u2201') == 1)
verify(unicodedata.combining(u'\uFFFE') == 0) class UnicodeFunctionsTest(UnicodeDatabaseTest):
verify(unicodedata.combining(u'a') == 0)
verify(unicodedata.combining(u'\u20e1') == 230)
print 'ok' # update this, if the database changes
expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af'
def test_function_checksum(self):
    # Feed the string form of every unicodedata property of code points
    # 0..0xFFFF into one SHA digest and compare the result with
    # self.expectedchecksum, which must be updated when the database changes.
    h = sha.sha()
    # xrange avoids materialising a 65536-element list just to iterate it.
    for i in xrange(0x10000):
        char = unichr(i)
        data = [
            # Properties (-1 stands in for "no value" so str() always works)
            str(self.db.digit(char, -1)),
            str(self.db.numeric(char, -1)),
            str(self.db.decimal(char, -1)),
            self.db.category(char),
            self.db.bidirectional(char),
            self.db.decomposition(char),
            str(self.db.mirrored(char)),
            str(self.db.combining(char)),
        ]
        h.update(''.join(data))
    result = h.hexdigest()
    self.assertEqual(result, self.expectedchecksum)
def test_digit(self):
self.assertEqual(self.db.digit(u'A', None), None)
self.assertEqual(self.db.digit(u'9'), 9)
self.assertEqual(self.db.digit(u'\u215b', None), None)
self.assertEqual(self.db.digit(u'\u2468'), 9)
self.assertRaises(TypeError, self.db.digit)
self.assertRaises(TypeError, self.db.digit, u'xx')
self.assertRaises(ValueError, self.db.digit, u'x')
def test_numeric(self):
self.assertEqual(self.db.numeric(u'A',None), None)
self.assertEqual(self.db.numeric(u'9'), 9)
self.assertEqual(self.db.numeric(u'\u215b'), 0.125)
self.assertEqual(self.db.numeric(u'\u2468'), 9.0)
self.assertRaises(TypeError, self.db.numeric)
self.assertRaises(TypeError, self.db.numeric, u'xx')
self.assertRaises(ValueError, self.db.numeric, u'x')
def test_decimal(self):
self.assertEqual(self.db.decimal(u'A',None), None)
self.assertEqual(self.db.decimal(u'9'), 9)
self.assertEqual(self.db.decimal(u'\u215b', None), None)
self.assertEqual(self.db.decimal(u'\u2468', None), None)
self.assertRaises(TypeError, self.db.decimal)
self.assertRaises(TypeError, self.db.decimal, u'xx')
self.assertRaises(ValueError, self.db.decimal, u'x')
def test_category(self):
self.assertEqual(self.db.category(u'\uFFFE'), 'Cn')
self.assertEqual(self.db.category(u'a'), 'Ll')
self.assertEqual(self.db.category(u'A'), 'Lu')
self.assertRaises(TypeError, self.db.category)
self.assertRaises(TypeError, self.db.category, u'xx')
def test_bidirectional(self):
self.assertEqual(self.db.bidirectional(u'\uFFFE'), '')
self.assertEqual(self.db.bidirectional(u' '), 'WS')
self.assertEqual(self.db.bidirectional(u'A'), 'L')
self.assertRaises(TypeError, self.db.bidirectional)
self.assertRaises(TypeError, self.db.bidirectional, u'xx')
def test_decomposition(self):
self.assertEqual(self.db.decomposition(u'\uFFFE'),'')
self.assertEqual(self.db.decomposition(u'\u00bc'), '<fraction> 0031 2044 0034')
self.assertRaises(TypeError, self.db.decomposition)
self.assertRaises(TypeError, self.db.decomposition, u'xx')
def test_mirrored(self):
self.assertEqual(self.db.mirrored(u'\uFFFE'), 0)
self.assertEqual(self.db.mirrored(u'a'), 0)
self.assertEqual(self.db.mirrored(u'\u2201'), 1)
self.assertRaises(TypeError, self.db.mirrored)
self.assertRaises(TypeError, self.db.mirrored, u'xx')
def test_combining(self):
self.assertEqual(self.db.combining(u'\uFFFE'), 0)
self.assertEqual(self.db.combining(u'a'), 0)
self.assertEqual(self.db.combining(u'\u20e1'), 230)
self.assertRaises(TypeError, self.db.combining)
self.assertRaises(TypeError, self.db.combining, u'xx')
def test_normalize(self):
self.assertRaises(TypeError, self.db.normalize)
self.assertRaises(ValueError, self.db.normalize, 'unknown', u'xx')
# The rest can be found in test_normalization.py
# which requires an external file.
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Cross-checks between the value lookups of the unicodedata module.

    Uses ``self.db``, which the UnicodeDatabaseTest base class binds to the
    ``unicodedata`` module in its ``setUp``.
    """

    def _check_consistent_with_numeric(self, lookup):
        # Shared driver for the two consistency tests below.
        # lookup -- a self.db function called as lookup(char, default).
        # For each code point in 0..0xFFFF where lookup yields a value,
        # that value must equal self.db.numeric() of the same character.
        count = 0
        for i in xrange(0x10000):
            c = unichr(i)
            # -1 is the "no value" marker passed as the default.
            value = lookup(c, -1)
            if value != -1:
                self.assertEqual(value, self.db.numeric(c))
                count += 1
        self.assert_(count >= 10) # should have tested at least the ASCII digits

    def test_decimal_numeric_consistent(self):
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        self._check_consistent_with_numeric(self.db.decimal)

    def test_digit_numeric_consistent(self):
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        self._check_consistent_with_numeric(self.db.digit)
def test_main():
    """Run the misc, methods and functions test cases via test_support.run_suite."""
    suite = unittest.TestSuite()
    # Order matches the original registration order.
    for case in (UnicodeMiscTest, UnicodeMethodsTest, UnicodeFunctionsTest):
        suite.addTest(unittest.makeSuite(case))
    test.test_support.run_suite(suite)


# Run the suite when this module is executed directly as a script.
if __name__ == "__main__":
    test_main()