mirror of
https://github.com/python/cpython.git
synced 2025-08-01 07:33:08 +00:00
[3.13] gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557) (GH-126689)
* upper-case non-BMP character was ignored
* the ASCII flag was ignored when matching a character range whose
upper bound is beyond the BMP region
(cherry picked from commit 819830f34a)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
fc10908f7d
commit
7db6d4282f
3 changed files with 73 additions and 9 deletions
|
@ -248,11 +248,11 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
if op is LITERAL:
|
if op is LITERAL:
|
||||||
if fixup:
|
if fixup: # IGNORECASE and not LOCALE
|
||||||
lo = fixup(av)
|
av = fixup(av)
|
||||||
charmap[lo] = 1
|
charmap[av] = 1
|
||||||
if fixes and lo in fixes:
|
if fixes and av in fixes:
|
||||||
for k in fixes[lo]:
|
for k in fixes[av]:
|
||||||
charmap[k] = 1
|
charmap[k] = 1
|
||||||
if not hascased and iscased(av):
|
if not hascased and iscased(av):
|
||||||
hascased = True
|
hascased = True
|
||||||
|
@ -260,7 +260,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
charmap[av] = 1
|
charmap[av] = 1
|
||||||
elif op is RANGE:
|
elif op is RANGE:
|
||||||
r = range(av[0], av[1]+1)
|
r = range(av[0], av[1]+1)
|
||||||
if fixup:
|
if fixup: # IGNORECASE and not LOCALE
|
||||||
if fixes:
|
if fixes:
|
||||||
for i in map(fixup, r):
|
for i in map(fixup, r):
|
||||||
charmap[i] = 1
|
charmap[i] = 1
|
||||||
|
@ -287,8 +287,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
# Character set contains non-BMP character codes.
|
# Character set contains non-BMP character codes.
|
||||||
# For range, all BMP characters in the range are already
|
# For range, all BMP characters in the range are already
|
||||||
# proceeded.
|
# proceeded.
|
||||||
if fixup:
|
if fixup: # IGNORECASE and not LOCALE
|
||||||
hascased = True
|
|
||||||
# For now, IN_UNI_IGNORE+LITERAL and
|
# For now, IN_UNI_IGNORE+LITERAL and
|
||||||
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
|
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
|
||||||
# characters, because two characters (at least one of
|
# characters, because two characters (at least one of
|
||||||
|
@ -299,7 +298,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
|
||||||
# Also, both c.lower() and c.lower().upper() are single
|
# Also, both c.lower() and c.lower().upper() are single
|
||||||
# characters for every non-BMP character.
|
# characters for every non-BMP character.
|
||||||
if op is RANGE:
|
if op is RANGE:
|
||||||
op = RANGE_UNI_IGNORE
|
if fixes: # not ASCII
|
||||||
|
op = RANGE_UNI_IGNORE
|
||||||
|
hascased = True
|
||||||
|
else:
|
||||||
|
assert op is LITERAL
|
||||||
|
if not hascased and iscased(av):
|
||||||
|
hascased = True
|
||||||
tail.append((op, av))
|
tail.append((op, av))
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
|
@ -1137,6 +1137,39 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertTrue(re.match(br'[19a]', b'a', re.I))
|
self.assertTrue(re.match(br'[19a]', b'a', re.I))
|
||||||
self.assertTrue(re.match(br'[19a]', b'A', re.I))
|
self.assertTrue(re.match(br'[19a]', b'A', re.I))
|
||||||
self.assertTrue(re.match(br'[19A]', b'a', re.I))
|
self.assertTrue(re.match(br'[19A]', b'a', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))
|
||||||
|
|
||||||
|
self.assertTrue(re.match(br'[19A]', b'A', re.I))
|
||||||
|
self.assertTrue(re.match(br'[19a]', b'a', re.I))
|
||||||
|
self.assertTrue(re.match(br'[19a]', b'A', re.I))
|
||||||
|
self.assertTrue(re.match(br'[19A]', b'a', re.I))
|
||||||
|
self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))
|
||||||
|
|
||||||
# Two different characters have the same lowercase.
|
# Two different characters have the same lowercase.
|
||||||
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
|
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
|
||||||
|
@ -1173,8 +1206,10 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
|
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
|
||||||
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
|
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
|
||||||
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
|
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
|
||||||
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
|
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
|
||||||
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
|
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
|
||||||
|
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
|
||||||
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
|
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
|
||||||
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
|
||||||
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
|
||||||
|
@ -1185,6 +1220,26 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
|
||||||
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
|
||||||
|
|
||||||
|
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
|
||||||
|
self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))
|
||||||
|
|
||||||
|
self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
|
||||||
|
self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))
|
||||||
|
|
||||||
# Two different characters have the same lowercase.
|
# Two different characters have the same lowercase.
|
||||||
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
|
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
|
||||||
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
|
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
Fix bugs in compiling case-insensitive :mod:`regular expressions <re>` with
|
||||||
|
character classes containing non-BMP characters: upper-case non-BMP
|
||||||
|
character was ignored and the ASCII flag was ignored when
|
||||||
|
matching a character range whose upper bound is beyond the BMP region.
|
Loading…
Add table
Add a link
Reference in a new issue