Mirror of https://github.com/python/cpython.git (synced 2025-09-26 10:19:53 +00:00)
bpo-37760: Convert from length-18 lists to a dataclass, in makeunicodedata. (GH-15265)
Now the fields have names! Much easier to keep straight as a reader than the elements of an 18-tuple. Runs about 10-15% slower: from 10.8s to 12.3s, on my laptop. Fortunately that's perfectly fine for this maintenance script.
Parent: 5e9caeec76
Commit: a65678c5c9
2 changed files with 94 additions and 62 deletions
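
For context, the shape of the change: code that used to index positionally into an 18-element list now reads named attributes off a dataclass. A minimal, standalone sketch of the difference (a made-up three-field record, not the commit's full 18-field UcdRecord):

    import dataclasses

    @dataclasses.dataclass
    class Record:  # hypothetical stand-in for the commit's UcdRecord
        codepoint: str
        general_category: str
        bidi_mirrored: str

    # Before: a bare list, so the reader must remember what each index means.
    old_style = ["0028", "Ps", "Y"]
    mirrored = old_style[2] == "Y"

    # After: the same data, self-describing.
    new_style = Record("0028", "Ps", "Y")
    mirrored = new_style.bidi_mirrored == "Y"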
New NEWS entry:

@@ -0,0 +1,6 @@
+The :file:`Tools/unicode/makeunicodedata.py` script, which is used for
+converting information from the Unicode Character Database into generated
+code and data used by the methods of :class:`str` and by the
+:mod:`unicodedata` module, now handles each character's data as a
+``dataclass`` with named attributes, rather than a length-18 list of
+different fields.
Tools/unicode/makeunicodedata.py:

@@ -26,13 +26,14 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
+import dataclasses
 import os
 import sys
 import zipfile
 
 from functools import partial
 from textwrap import dedent
-from typing import Iterator, List, Tuple
+from typing import Iterator, List, Optional, Set, Tuple
 
 SCRIPT = sys.argv[0]
 VERSION = "3.3"
@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = CATEGORY_NAMES.index(record[2])
-            combining = int(record[3])
-            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
-            mirrored = record[9] == "Y"
-            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[17]
+            category = CATEGORY_NAMES.index(record.general_category)
+            combining = int(record.canonical_combining_class)
+            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
+            mirrored = record.bidi_mirrored == "Y"
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
+            normalizationquickcheck = record.quick_check
             item = (
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            if record[5]:
-                decomp = record[5].split()
+            if record.decomposition_type:
+                decomp = record.decomposition_type.split()
                 if len(decomp) > 19:
                     raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                 # prefix
@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
                 # Collect NFC pairs
                 if not prefix and len(decomp) == 3 and \
                    char not in unicode.exclusions and \
-                   unicode.table[decomp[1]][3] == "0":
+                   unicode.table[decomp[1]].canonical_combining_class == "0":
                     p, l, r = decomp
                     comp_first[l] = 1
                     comp_last[r] = 1
@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = record[2]
-            bidirectional = record[4]
-            properties = record[16]
+            category = record.general_category
+            bidirectional = record.bidi_class
+            properties = record.binary_properties
             flags = 0
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK
@@ -434,16 +435,16 @@
                 flags |= CASE_IGNORABLE_MASK
             sc = unicode.special_casing.get(char)
             cf = unicode.case_folding.get(char, [char])
-            if record[12]:
-                upper = int(record[12], 16)
+            if record.simple_uppercase_mapping:
+                upper = int(record.simple_uppercase_mapping, 16)
             else:
                 upper = char
-            if record[13]:
-                lower = int(record[13], 16)
+            if record.simple_lowercase_mapping:
+                lower = int(record.simple_lowercase_mapping, 16)
             else:
                 lower = char
-            if record[14]:
-                title = int(record[14], 16)
+            if record.simple_titlecase_mapping:
+                title = int(record.simple_titlecase_mapping, 16)
             else:
                 title = upper
             if sc is None and cf != [lower]:
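
The three simple case-mapping fields read above hold hex code point strings and are empty when a character maps to itself, which is why each branch falls back to char. A hedged illustration using the from_row helper added later in this diff (the sample line is the UnicodeData.txt entry for U+0061):

    # Sketch only; assumes the UcdRecord/from_row definitions from this commit.
    rec = from_row("0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041".split(";"))
    assert rec.simple_uppercase_mapping == "0041"  # a hex string, not an int
    assert rec.simple_lowercase_mapping == ""      # lowercase maps to itself
    upper = int(rec.simple_uppercase_mapping, 16)
    assert chr(upper) == "A"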
@@ -480,16 +481,16 @@
                 extra_casing.extend(sc[1])
             # decimal digit, integer digit
             decimal = 0
-            if record[6]:
+            if record.decomposition_mapping:
                 flags |= DECIMAL_MASK
-                decimal = int(record[6])
+                decimal = int(record.decomposition_mapping)
             digit = 0
-            if record[7]:
+            if record.numeric_type:
                 flags |= DIGIT_MASK
-                digit = int(record[7])
-            if record[8]:
+                digit = int(record.numeric_type)
+            if record.numeric_value:
                 flags |= NUMERIC_MASK
-                numeric.setdefault(record[8], []).append(char)
+                numeric.setdefault(record.numeric_value, []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
             )
@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 names[char] = name + chr(0)
 
@@ -719,7 +720,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 data.append((name, char))
 
@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
             continue
         # check characters that differ
         if old.table[i] != new.table[i]:
-            for k in range(len(old.table[i])):
-                if old.table[i][k] != new.table[i][k]:
-                    value = old.table[i][k]
+            for k, field in enumerate(dataclasses.fields(UcdRecord)):
+                value = getattr(old.table[i], field.name)
+                new_value = getattr(new.table[i], field.name)
+                if value != new_value:
                     if k == 1 and i in PUA_15:
                         # the name is not set in the old.table, but in the
                         # new.table we are using it for aliases and named seq
                         assert value == ''
                     elif k == 2:
-                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                         category_changes[i] = CATEGORY_NAMES.index(value)
                     elif k == 4:
-                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                         bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                     elif k == 5:
-                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                         # We assume that all normalization changes are in 1:1 mappings
                         assert " " not in value
                         normalization_changes.append((i, value))
                     elif k == 6:
-                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                         # we only support changes where the old value is a single digit
                         assert value in "0123456789"
                         decimal_changes[i] = int(value)
                     elif k == 8:
-                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                         # Since 0 encodes "no change", the old value is better not 0
                         if not value:
                             numeric_changes[i] = -1
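
The rewritten comparison loop above leans on dataclass introspection: dataclasses.fields() yields the fields in definition order, so enumerate() preserves the historical index k that the elif k == ... branches test, while getattr() fetches each value by name. A self-contained sketch of the pattern, using a hypothetical two-field class:

    import dataclasses

    @dataclasses.dataclass
    class Rec:  # illustrative only, not from the script
        name: str
        value: str

    old = Rec("LATIN SMALL LETTER A", "0061")
    new = Rec("LATIN SMALL LETTER A", "0041")

    for k, field in enumerate(dataclasses.fields(Rec)):
        old_value = getattr(old, field.name)
        new_value = getattr(new, field.name)
        if old_value != new_value:
            print(k, field.name, old_value, "->", new_value)
    # prints: 1 value 0061 -> 0041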
@@ -952,6 +949,45 @@ class UcdFile:
                 yield char, rest
 
 
+@dataclasses.dataclass
+class UcdRecord:
+    # 15 fields from UnicodeData.txt.  See:
+    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
+    codepoint: str
+    name: str
+    general_category: str
+    canonical_combining_class: str
+    bidi_class: str
+    decomposition_type: str
+    decomposition_mapping: str
+    numeric_type: str
+    numeric_value: str
+    bidi_mirrored: str
+    unicode_1_name: str  # obsolete
+    iso_comment: str  # obsolete
+    simple_uppercase_mapping: str
+    simple_lowercase_mapping: str
+    simple_titlecase_mapping: str
+
+    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
+    east_asian_width: Optional[str]
+
+    # Binary properties, as a set of those that are true.
+    # Taken from multiple files:
+    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
+    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
+    binary_properties: Set[str]
+
+    # The Quick_Check properties related to normalization:
+    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
+    # We store them as a bitmask.
+    quick_check: int
+
+
+def from_row(row: List[str]) -> UcdRecord:
+    return UcdRecord(*row, None, set(), 0)
+
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
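
A usage sketch for from_row: the 15 semicolon-separated columns of a UnicodeData.txt line fill the first 15 attributes, and the last three attributes start as placeholders (None, an empty set, 0) that later passes overwrite. The sample line is the UnicodeData.txt entry for U+0041:

    # Sketch only; assumes the UcdRecord/from_row definitions above.
    row = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;".split(";")
    assert len(row) == 15

    rec = from_row(row)
    assert rec.name == "LATIN CAPITAL LETTER A"
    assert rec.general_category == "Lu"
    assert rec.simple_lowercase_mapping == "0061"
    assert rec.east_asian_width is None    # patched in from EastAsianWidth.txt
    assert rec.binary_properties == set()  # filled from DerivedCoreProperties.txt etc.
    assert rec.quick_check == 0            # filled from the normalization properties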
@@ -959,18 +995,14 @@ class UcdFile:
 # load a unicode-data file from disk
 
 class UnicodeData:
-    # Record structure:
-    # [ID, name, category, combining, bidi, decomp,  (6)
-    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
-    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
-    #  derived-props] (17)
+    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
 
     def __init__(self, version, cjk_check=True):
         self.changed = []
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
             char = int(s[0], 16)
-            table[char] = s
+            table[char] = from_row(s)
 
         cjk_ranges_found = []
 
@@ -982,19 +1014,17 @@ class UnicodeData:
             #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
             s = table[i]
             if s:
-                if s[1][-6:] == "First>":
-                    s[1] = ""
-                    field = s
-                elif s[1][-5:] == "Last>":
-                    if s[1].startswith("<CJK Ideograph"):
+                if s.name[-6:] == "First>":
+                    s.name = ""
+                    field = dataclasses.astuple(s)[:15]
+                elif s.name[-5:] == "Last>":
+                    if s.name.startswith("<CJK Ideograph"):
                         cjk_ranges_found.append((field[0],
-                                                 s[0]))
-                    s[1] = ""
+                                                 s.codepoint))
+                    s.name = ""
                     field = None
             elif field:
-                f2 = field[:]
-                f2[0] = "%X" % i
-                table[i] = f2
+                table[i] = from_row(('%X' % i,) + field[1:])
 
         if cjk_check and cjk_ranges != cjk_ranges_found:
             raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
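
The First>/Last> branch above expands UnicodeData.txt's range notation: the shared columns of a "<..., First>" record are frozen as a plain tuple via dataclasses.astuple(s)[:15] (the 15 UnicodeData.txt columns), and every code point inside the range then gets its own record with only the codepoint column swapped in. A sketch of that tuple surgery, reusing the commit's UcdRecord/from_row (the sample line is the UnicodeData.txt entry for U+4E00; the rest is illustrative):

    import dataclasses

    first = from_row("4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;".split(";"))
    first.name = ""                          # as the loop does before freezing
    field = dataclasses.astuple(first)[:15]  # the 15 shared columns

    i = 0x4E01                               # a code point inside the range
    rec = from_row(("%X" % i,) + field[1:])  # own codepoint, shared rest
    assert rec.codepoint == "4E01"
    assert rec.general_category == "Lo"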
@@ -1015,7 +1045,7 @@ class UnicodeData:
                 char = int(char, 16)
                 self.aliases.append((name, char))
                 # also store the name in the PUA 1
-                self.table[pua_index][1] = name
+                self.table[pua_index].name = name
                 pua_index += 1
         assert pua_index - NAME_ALIASES_START == len(self.aliases)
 
@@ -1034,7 +1064,7 @@ class UnicodeData:
                        "the NamedSequence struct and in unicodedata_lookup")
                 self.named_sequences.append((name, chars))
                 # also store these in the PUA 1
-                self.table[pua_index][1] = name
+                self.table[pua_index].name = name
                 pua_index += 1
         assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
 
@@ -1049,23 +1079,19 @@ class UnicodeData:
 
         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i].append(widths[i])
+                table[i].east_asian_width = widths[i]
 
-        for i in range(0, 0x110000):
-            if table[i] is not None:
-                table[i].append(set())
-
         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
             if table[char]:
                 # Some properties (e.g. Default_Ignorable_Code_Point)
                 # apply to unassigned code points; ignore them
-                table[char][-1].add(p)
+                table[char].binary_properties.add(p)
 
         for char_range, value in UcdFile(LINE_BREAK, version):
             if value not in MANDATORY_LINE_BREAKS:
                 continue
             for char in expand_range(char_range):
-                table[char][-1].add('Line_Break')
+                table[char].binary_properties.add('Line_Break')
 
         # We only want the quickcheck properties
         # Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1087,7 +1113,7 @@ class UnicodeData:
                 quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i].append(quickchecks[i])
+                table[i].quick_check = quickchecks[i]
 
         with open_data(UNIHAN, version) as file:
             zip = zipfile.ZipFile(file)
@@ -1106,7 +1132,7 @@ class UnicodeData:
                 i = int(code[2:], 16)
                 # Patch the numeric field
                 if table[i] is not None:
-                    table[i][8] = value
+                    table[i].numeric_value = value
 
         sc = self.special_casing = {}
         for data in UcdFile(SPECIAL_CASING, version):