[3.13] gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557) (GH-126689)

* An upper-case non-BMP character in a case-insensitive character class
  was ignored.
* The ASCII flag was ignored when matching a character range whose
  upper bound is beyond the BMP region.
(cherry picked from commit 819830f34a)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2024-11-11 17:54:57 +01:00 committed by GitHub
parent fc10908f7d
commit 7db6d4282f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 73 additions and 9 deletions

View file

@ -1137,6 +1137,39 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))
self.assertTrue(re.match(br'[19A]', b'A', re.I))
self.assertTrue(re.match(br'[19a]', b'a', re.I))
self.assertTrue(re.match(br'[19a]', b'A', re.I))
self.assertTrue(re.match(br'[19A]', b'a', re.I))
self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' (U+212A KELVIN SIGN)
@ -1173,8 +1206,10 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
@ -1185,6 +1220,26 @@ class ReTests(unittest.TestCase):
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))
self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))
# Two different characters have the same lowercase.
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K' (U+212A KELVIN SIGN)
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))