mirror of
https://github.com/python/cpython.git
synced 2025-07-19 09:15:34 +00:00
gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)
Also test that all extra cases are in BMP.
This commit is contained in:
parent
48ec61a89a
commit
f912cc0e41
5 changed files with 212 additions and 57 deletions
95
Tools/scripts/generate_re_casefix.py
Executable file
95
Tools/scripts/generate_re_casefix.py
Executable file
|
@ -0,0 +1,95 @@
|
|||
#! /usr/bin/env python3
|
||||
# This script generates Lib/re/_casefix.py.
|
||||
|
||||
import collections
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
|
||||
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that.

    Leaving an up-to-date file untouched keeps its modification time stable.

    Args:
        file: Path of the file to create or refresh.
        content: Full text the file should contain (written as UTF-8).

    Returns:
        True if the file was (re)written, False if it was already current.
    """
    try:
        with open(file, 'r', encoding='utf-8') as fobj:
            current = fobj.read()
    except (OSError, ValueError):
        # Missing, unreadable or undecodable (UnicodeDecodeError is a
        # ValueError): fall through and write the file from scratch.
        pass
    else:
        if current == content:
            return False
    with open(file, 'w', encoding='utf-8') as fobj:
        fobj.write(content)
    return True
|
||||
|
||||
# Skeleton of the generated module.  The single %s placeholder receives the
# newline-joined entries of the _EXTRA_CASES dict.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""
|
||||
|
||||
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name in the Unicode database are rendered as
    ``U+XXXX`` (at least four uppercase hex digits) instead.
    """
    fallback = r'U+%04X' % i
    return unicodedata.name(chr(i), fallback)
|
||||
|
||||
class hexint(int):
    """An int whose repr is zero-padded hexadecimal, e.g. ``0x0069``.

    Wrapping values in this class makes ``%r`` (including the repr of a
    tuple of them) print hex literals.
    """

    def __repr__(self):
        # '#' adds the 0x prefix; the width of 6 counts that prefix, leaving
        # at least four hex digits.
        return '%#06x' % self
|
||||
|
||||
def alpha(i):
    """Return a printable form of code point *i* for use in comments.

    Alphabetic characters are returned as themselves; anything else is
    returned as its ``ascii()`` escape with the surrounding quotes removed.
    """
    c = chr(i)
    if c.isalpha():
        return c
    # ascii() yields a quoted literal such as "'\\u0345'"; strip the quotes.
    return ascii(c)[1:-1]
|
||||
|
||||
|
||||
def main(outfile='Lib/re/_casefix.py'):
    """Compute the extra case-folding table and write it to *outfile*.

    Scans every code point up to ``sys.maxunicode``, groups characters that
    share an uppercase form, and records for each lowercased code the other
    lowercased codes in its group.  The table is rendered through
    ``re_casefix_template`` and written with ``update_file``.

    Args:
        outfile: Path of the module to generate.

    Raises:
        SystemExit: With status 1 if any group contains a code point above
            U+FFFF; the offending characters are listed on stderr first.
    """
    # Find sets of characters which have the same uppercase.
    by_upper = collections.defaultdict(list)
    for c in map(chr, range(sys.maxunicode + 1)):
        by_upper[c.upper()].append(c)

    # List of codes of lowercased characters which have the same uppercase.
    equivalent_lower_codes = []
    for group in by_upper.values():
        # Only lowercase the members of real groups: singletons are skipped
        # before ord() is applied to their lowercase form.
        if len(group) < 2:
            continue
        lower_codes = {ord(c.lower()) for c in group}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Every code in the table must be in the BMP.  A set keeps each
    # offending code from being reported more than once.
    bad_codes = set()
    for codes in equivalent_lower_codes:
        astral = next((i for i in codes if i > 0xffff), None)
        if astral is None:
            continue
        bad_codes.update(codes)
        try:
            bad_codes.add(ord(chr(astral).upper()))
        except (ValueError, TypeError):
            # The uppercase form is not a single character; nothing to add.
            pass
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for i in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each code to the other members of its group.
    mapping = {code: tuple(other for other in codes if other != code)
               for codes in equivalent_lower_codes
               for code in codes}

    # Two lines per entry: a comment with the character names, then the
    # dict item itself.  Both carry a four-space indent because they are
    # emitted inside the generated dict literal.
    lines = []
    for code, others in sorted(mapping.items()):
        lines.append('    # %s: %s' % (
            uname(code),
            ', '.join(map(uname, others)),
        ))
        lines.append("    %r: %r, # '%s': '%s'" % (
            hexint(code),
            tuple(map(hexint, others)),
            alpha(code),
            ''.join(map(alpha, others)),
        ))

    update_file(outfile, re_casefix_template % '\n'.join(lines))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line arguments are passed straight to main(); the first one,
    # if given, replaces the default output path.
    main(*sys.argv[1:])
|
Loading…
Add table
Add a link
Reference in a new issue