gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)

Also test that all extra cases are in BMP.
2025-10-28 17:13:08 +00:00 · 2022-04-22 21:37:46 +03:00 · 2022-04-22 21:37:46 +03:00 · f912cc0e41
commit f912cc0e41
parent 48ec61a89a
5 changed files with 212 additions and 57 deletions
--- a/Tools/scripts/generate_re_casefix.py
+++ b/Tools/scripts/generate_re_casefix.py
@ -0,0 +1,95 @@
+#! /usr/bin/env python3
+# This script generates Lib/re/_casefix.py.
+
+import collections
+import re
+import sys
+import unicodedata
+
+def update_file(file, content):
+    try:
+        with open(file, 'r', encoding='utf-8') as fobj:
+            if fobj.read() == content:
+                return False
+    except (OSError, ValueError):
+        pass
+    with open(file, 'w', encoding='utf-8') as fobj:
+        fobj.write(content)
+    return True
+
+re_casefix_template = """\
+# Auto-generated by Tools/scripts/generate_re_casefix.py.
+
+# Maps the code of lowercased character to codes of different lowercased
+# characters which have the same uppercase.
+_EXTRA_CASES = {
+%s
+}
+"""
+
+def uname(i):
+    return unicodedata.name(chr(i), r'U+%04X' % i)
+
+class hexint(int):
+    def __repr__(self):
+        return '%#06x' % self
+
+def alpha(i):
+    c = chr(i)
+    return c if c.isalpha() else ascii(c)[1:-1]
+
+
+def main(outfile='Lib/re/_casefix.py'):
+    # Find sets of characters which have the same uppercase.
+    equivalent_chars = collections.defaultdict(str)
+    for c in map(chr, range(sys.maxunicode + 1)):
+        equivalent_chars[c.upper()] += c
+    equivalent_chars = [t for t in equivalent_chars.values() if len(t) > 1]
+
+    # List of codes of lowercased characters which have the same uppercase.
+    equivalent_lower_codes = [sorted(t)
+                              for s in equivalent_chars
+                              for t in [set(ord(c.lower()) for c in s)]
+                              if len(t) > 1]
+
+    bad_codes = []
+    for t in equivalent_lower_codes:
+        for i in t:
+            if i > 0xffff:
+                bad_codes.extend(t)
+                try:
+                    bad_codes.append(ord(chr(i).upper()))
+                except (ValueError, TypeError):
+                    pass
+                break
+    if bad_codes:
+        print('Case-insensitive matching may not work correctly for character:',
+              file=sys.stderr)
+        for i in sorted(bad_codes):
+            print("  '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
+                  file=sys.stderr)
+        sys.exit(1)
+
+    mapping = {i: tuple(j for j in t if i != j)
+               for t in equivalent_lower_codes
+               for i in t}
+
+    items = []
+    for i, t in sorted(mapping.items()):
+        items.append('    # %s: %s' % (
+            uname(i),
+            ', '.join(map(uname, t)),
+        ))
+        items.append("    %r: %r, # '%s': '%s'" % (
+            hexint(i),
+            tuple(map(hexint, t)),
+            alpha(i),
+            ''.join(map(alpha, t)),
+        ))
+
+    update_file(outfile, re_casefix_template % '\n'.join(items))
+
+
+if __name__ == '__main__':
+    import sys
+    main(*sys.argv[1:])