mirror of
https://github.com/python/cpython.git
synced 2025-08-25 03:04:55 +00:00
GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)
This commit is contained in:
parent
c1581a928c
commit
9c197bc8bf
4 changed files with 686 additions and 620 deletions
|
@ -77,7 +77,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
|
|||
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
|
||||
"ON", "LRI", "RLI", "FSI", "PDI" ]
|
||||
|
||||
EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
|
||||
# "N" needs to be the first entry, see the comment in makeunicodedata
|
||||
EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]
|
||||
|
||||
MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
|
||||
|
||||
|
@ -135,6 +136,14 @@ def maketables(trace=0):
|
|||
|
||||
def makeunicodedata(unicode, trace):
|
||||
|
||||
# the default value of east_asian_width is "N", for unassigned code points
|
||||
# not mentioned in EastAsianWidth.txt
|
||||
# in addition there are some reserved but unassigned code points in CJK
|
||||
# ranges that are classified as "W". code points in private use areas
|
||||
# have a width of "A". both of these have entries in
|
||||
# EastAsianWidth.txt
|
||||
# see https://unicode.org/reports/tr11/#Unassigned
|
||||
assert EASTASIANWIDTH_NAMES[0] == "N"
|
||||
dummy = (0, 0, 0, 0, 0, 0)
|
||||
table = [dummy]
|
||||
cache = {0: dummy}
|
||||
|
@ -160,12 +169,20 @@ def makeunicodedata(unicode, trace):
|
|||
category, combining, bidirectional, mirrored, eastasianwidth,
|
||||
normalizationquickcheck
|
||||
)
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
if i is None:
|
||||
cache[item] = i = len(table)
|
||||
table.append(item)
|
||||
index[char] = i
|
||||
elif unicode.widths[char] is not None:
|
||||
# an unassigned but reserved character, with a known
|
||||
# east_asian_width
|
||||
eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
|
||||
item = (0, 0, 0, 0, eastasianwidth, 0)
|
||||
else:
|
||||
continue
|
||||
|
||||
# add entry to index and item tables
|
||||
i = cache.get(item)
|
||||
if i is None:
|
||||
cache[item] = i = len(table)
|
||||
table.append(item)
|
||||
index[char] = i
|
||||
|
||||
# 2) decomposition data
|
||||
|
||||
|
@ -1085,6 +1102,7 @@ class UnicodeData:
|
|||
for i in range(0, 0x110000):
|
||||
if table[i] is not None:
|
||||
table[i].east_asian_width = widths[i]
|
||||
self.widths = widths
|
||||
|
||||
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
|
||||
if table[char]:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue