mirror of
https://github.com/python/cpython.git
synced 2025-08-31 22:18:28 +00:00
gh-91575: Add a script for generating data for case-insensitive matching in re (GH-91660)
Also test that all extra cases are in BMP.
This commit is contained in:
parent
48ec61a89a
commit
f912cc0e41
5 changed files with 212 additions and 57 deletions
|
@ -13,6 +13,7 @@
|
|||
import _sre
|
||||
from . import _parser
|
||||
from ._constants import *
|
||||
from ._casefix import _EXTRA_CASES
|
||||
|
||||
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
|
||||
|
||||
|
@ -27,62 +28,6 @@ _REPEATING_CODES = {
|
|||
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
|
||||
}
|
||||
|
||||
# Sets of lowercase characters which have the same uppercase.
|
||||
_equivalences = (
|
||||
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
|
||||
(0x69, 0x131), # iı
|
||||
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
|
||||
(0x73, 0x17f), # sſ
|
||||
# MICRO SIGN, GREEK SMALL LETTER MU
|
||||
(0xb5, 0x3bc), # µμ
|
||||
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
|
||||
(0x345, 0x3b9, 0x1fbe), # \u0345ιι
|
||||
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
|
||||
(0x390, 0x1fd3), # ΐΐ
|
||||
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
|
||||
(0x3b0, 0x1fe3), # ΰΰ
|
||||
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
|
||||
(0x3b2, 0x3d0), # βϐ
|
||||
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
|
||||
(0x3b5, 0x3f5), # εϵ
|
||||
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
|
||||
(0x3b8, 0x3d1), # θϑ
|
||||
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
|
||||
(0x3ba, 0x3f0), # κϰ
|
||||
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
|
||||
(0x3c0, 0x3d6), # πϖ
|
||||
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
|
||||
(0x3c1, 0x3f1), # ρϱ
|
||||
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
|
||||
(0x3c2, 0x3c3), # ςσ
|
||||
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
|
||||
(0x3c6, 0x3d5), # φϕ
|
||||
# CYRILLIC SMALL LETTER VE, CYRILLIC SMALL LETTER ROUNDED VE
|
||||
(0x432, 0x1c80), # вᲀ
|
||||
# CYRILLIC SMALL LETTER DE, CYRILLIC SMALL LETTER LONG-LEGGED DE
|
||||
(0x434, 0x1c81), # дᲁ
|
||||
# CYRILLIC SMALL LETTER O, CYRILLIC SMALL LETTER NARROW O
|
||||
(0x43e, 0x1c82), # оᲂ
|
||||
# CYRILLIC SMALL LETTER ES, CYRILLIC SMALL LETTER WIDE ES
|
||||
(0x441, 0x1c83), # сᲃ
|
||||
# CYRILLIC SMALL LETTER TE, CYRILLIC SMALL LETTER TALL TE, CYRILLIC SMALL LETTER THREE-LEGGED TE
|
||||
(0x442, 0x1c84, 0x1c85), # тᲄᲅ
|
||||
# CYRILLIC SMALL LETTER HARD SIGN, CYRILLIC SMALL LETTER TALL HARD SIGN
|
||||
(0x44a, 0x1c86), # ъᲆ
|
||||
# CYRILLIC SMALL LETTER YAT, CYRILLIC SMALL LETTER TALL YAT
|
||||
(0x463, 0x1c87), # ѣᲇ
|
||||
# CYRILLIC SMALL LETTER UNBLENDED UK, CYRILLIC SMALL LETTER MONOGRAPH UK
|
||||
(0x1c88, 0xa64b), # ᲈꙋ
|
||||
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
|
||||
(0x1e61, 0x1e9b), # ṡẛ
|
||||
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
|
||||
(0xfb05, 0xfb06), # ﬅﬆ
|
||||
)
|
||||
|
||||
# Maps the lowercase code to lowercase codes which have the same uppercase.
|
||||
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
|
||||
for t in _equivalences for i in t}
|
||||
|
||||
class _CompileData:
|
||||
__slots__ = ('code', 'repeat_count')
|
||||
def __init__(self):
|
||||
|
@ -111,7 +56,7 @@ def _compile(data, pattern, flags):
|
|||
if flags & SRE_FLAG_UNICODE:
|
||||
iscased = _sre.unicode_iscased
|
||||
tolower = _sre.unicode_tolower
|
||||
fixes = _ignorecase_fixes
|
||||
fixes = _EXTRA_CASES
|
||||
else:
|
||||
iscased = _sre.ascii_iscased
|
||||
tolower = _sre.ascii_tolower
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue