gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)

Also test that all extra cases are in BMP.
This commit is contained in:
Serhiy Storchaka 2022-04-22 21:37:46 +03:00 committed by GitHub
parent 48ec61a89a
commit f912cc0e41
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 212 additions and 57 deletions

View file

@@ -13,6 +13,7 @@
import _sre
from . import _parser
from ._constants import *
from ._casefix import _EXTRA_CASES
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
@@ -27,62 +28,6 @@ _REPEATING_CODES = {
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
# Groups of lowercase code points whose uppercase form is the same.
# Every inner tuple is one such group; each member is named in the
# comment lines directly above it, in the same order as in the tuple.
_equivalences = (
    # U+0069 LATIN SMALL LETTER I
    # U+0131 LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131),
    # U+0073 LATIN SMALL LETTER S
    # U+017F LATIN SMALL LETTER LONG S
    (0x73, 0x17f),
    # U+00B5 MICRO SIGN
    # U+03BC GREEK SMALL LETTER MU
    (0xb5, 0x3bc),
    # U+0345 COMBINING GREEK YPOGEGRAMMENI
    # U+03B9 GREEK SMALL LETTER IOTA
    # U+1FBE GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe),
    # U+0390 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    # U+1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3),
    # U+03B0 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
    # U+1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3),
    # U+03B2 GREEK SMALL LETTER BETA
    # U+03D0 GREEK BETA SYMBOL
    (0x3b2, 0x3d0),
    # U+03B5 GREEK SMALL LETTER EPSILON
    # U+03F5 GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5),
    # U+03B8 GREEK SMALL LETTER THETA
    # U+03D1 GREEK THETA SYMBOL
    (0x3b8, 0x3d1),
    # U+03BA GREEK SMALL LETTER KAPPA
    # U+03F0 GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0),
    # U+03C0 GREEK SMALL LETTER PI
    # U+03D6 GREEK PI SYMBOL
    (0x3c0, 0x3d6),
    # U+03C1 GREEK SMALL LETTER RHO
    # U+03F1 GREEK RHO SYMBOL
    (0x3c1, 0x3f1),
    # U+03C2 GREEK SMALL LETTER FINAL SIGMA
    # U+03C3 GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3),
    # U+03C6 GREEK SMALL LETTER PHI
    # U+03D5 GREEK PHI SYMBOL
    (0x3c6, 0x3d5),
    # U+0432 CYRILLIC SMALL LETTER VE
    # U+1C80 CYRILLIC SMALL LETTER ROUNDED VE
    (0x432, 0x1c80),
    # U+0434 CYRILLIC SMALL LETTER DE
    # U+1C81 CYRILLIC SMALL LETTER LONG-LEGGED DE
    (0x434, 0x1c81),
    # U+043E CYRILLIC SMALL LETTER O
    # U+1C82 CYRILLIC SMALL LETTER NARROW O
    (0x43e, 0x1c82),
    # U+0441 CYRILLIC SMALL LETTER ES
    # U+1C83 CYRILLIC SMALL LETTER WIDE ES
    (0x441, 0x1c83),
    # U+0442 CYRILLIC SMALL LETTER TE
    # U+1C84 CYRILLIC SMALL LETTER TALL TE
    # U+1C85 CYRILLIC SMALL LETTER THREE-LEGGED TE
    (0x442, 0x1c84, 0x1c85),
    # U+044A CYRILLIC SMALL LETTER HARD SIGN
    # U+1C86 CYRILLIC SMALL LETTER TALL HARD SIGN
    (0x44a, 0x1c86),
    # U+0463 CYRILLIC SMALL LETTER YAT
    # U+1C87 CYRILLIC SMALL LETTER TALL YAT
    (0x463, 0x1c87),
    # U+1C88 CYRILLIC SMALL LETTER UNBLENDED UK
    # U+A64B CYRILLIC SMALL LETTER MONOGRAPH UK
    (0x1c88, 0xa64b),
    # U+1E61 LATIN SMALL LETTER S WITH DOT ABOVE
    # U+1E9B LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b),
    # U+FB05 LATIN SMALL LIGATURE LONG S T
    # U+FB06 LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06),
)

# Lookup table derived from _equivalences: each lowercase code point maps
# to a tuple of the *other* members of its group (those with the same
# uppercase), kept in the order they appear in the group.
_ignorecase_fixes = {
    code: tuple(other for other in group if code != other)
    for group in _equivalences
    for code in group
}
class _CompileData:
__slots__ = ('code', 'repeat_count')
def __init__(self):
@@ -111,7 +56,7 @@ def _compile(data, pattern, flags):
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
fixes = _EXTRA_CASES
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower