gh-133306: Support \z as a synonym for \Z in regular expressions (GH-133314)

\Z was an error inherited from PCRE 0.95. It was fixed in PCRE 2.0.
In other engines, \Z does not mean “anchor at string end”; it means
“anchor before an optional newline at string end”.

\z means “anchor at string end” in most RE engines.
This commit is contained in:
Serhiy Storchaka 2025-05-03 10:54:33 +03:00 committed by GitHub
parent fe44fc4f43
commit ac56f8cc8d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 31 additions and 10 deletions

View file

@ -738,9 +738,12 @@ given location, they can obviously be matched an infinite number of times.
different: ``\A`` still matches only at the beginning of the string, but ``^`` different: ``\A`` still matches only at the beginning of the string, but ``^``
may match at any location inside the string that follows a newline character. may match at any location inside the string that follows a newline character.
``\Z`` ``\z``
Matches only at the end of the string. Matches only at the end of the string.
``\Z``
The same as ``\z``; kept for compatibility with old Python versions.
``\b`` ``\b``
Word boundary. This is a zero-width assertion that matches only at the Word boundary. This is a zero-width assertion that matches only at the
beginning or end of a word. A word is defined as a sequence of alphanumeric beginning or end of a word. A word is defined as a sequence of alphanumeric

View file

@ -266,7 +266,7 @@ The special characters are:
not a word boundary as outside a set, and numeric escapes not a word boundary as outside a set, and numeric escapes
such as ``\1`` are always octal escapes, not group references. such as ``\1`` are always octal escapes, not group references.
Special sequences which do not match a single character such as ``\A`` Special sequences which do not match a single character such as ``\A``
and ``\Z`` are not allowed. and ``\z`` are not allowed.
.. index:: single: ^ (caret); in regular expressions .. index:: single: ^ (caret); in regular expressions
@ -661,10 +661,16 @@ character ``'$'``.
matches characters which are neither alphanumeric in the current locale matches characters which are neither alphanumeric in the current locale
nor the underscore. nor the underscore.
.. index:: single: \Z; in regular expressions .. index:: single: \z; in regular expressions
single: \Z; in regular expressions
``\z``
Matches only at the end of the string.
.. versionadded:: next
``\Z`` ``\Z``
Matches only at the end of the string. The same as ``\z``. For compatibility with old Python versions.
.. index:: .. index::
single: \a; in regular expressions single: \a; in regular expressions

View file

@ -624,6 +624,11 @@ Other language changes
ASCII :class:`bytes` and :term:`bytes-like objects <bytes-like object>`. ASCII :class:`bytes` and :term:`bytes-like objects <bytes-like object>`.
(Contributed by Daniel Pope in :gh:`129349`.) (Contributed by Daniel Pope in :gh:`129349`.)
* Support ``\z`` as a synonym for ``\Z`` in :mod:`regular expressions <re>`.
``\z`` is interpreted unambiguously in many other regular expression engines,
unlike ``\Z``, which has subtly different behavior in some of them.
(Contributed by Serhiy Storchaka in :gh:`133306`.)
* ``\B`` in :mod:`regular expression <re>` now matches empty input string. * ``\B`` in :mod:`regular expression <re>` now matches empty input string.
Now it is always the opposite of ``\b``. Now it is always the opposite of ``\b``.
(Contributed by Serhiy Storchaka in :gh:`124130`.) (Contributed by Serhiy Storchaka in :gh:`124130`.)

View file

@ -61,7 +61,7 @@ below. If the ordinary character is not on the list, then the
resulting RE will match the second character. resulting RE will match the second character.
\number Matches the contents of the group of the same number. \number Matches the contents of the group of the same number.
\A Matches only at the start of the string. \A Matches only at the start of the string.
\Z Matches only at the end of the string. \z Matches only at the end of the string.
\b Matches the empty string, but only at the start or end of a word. \b Matches the empty string, but only at the start or end of a word.
\B Matches the empty string, but not at the start or end of a word. \B Matches the empty string, but not at the start or end of a word.
\d Matches any decimal digit; equivalent to the set [0-9] in \d Matches any decimal digit; equivalent to the set [0-9] in

View file

@ -49,7 +49,8 @@ CATEGORIES = {
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\Z": (AT, AT_END_STRING), # end of string r"\z": (AT, AT_END_STRING), # end of string
r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
} }
FLAGS = { FLAGS = {

View file

@ -531,7 +531,7 @@ xyzabc
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
(r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'), (r'a[ ]*?\ (\d+).*', 'a 10', SUCCEED, 'found', 'a 10'),
# bug 127259: \Z shouldn't depend on multiline mode # bug 127259: \Z shouldn't depend on multiline mode
(r'(?ms).*?x\s*\Z(.*)','xx\nx\n', SUCCEED, 'g1', ''), (r'(?ms).*?x\s*\z(.*)','xx\nx\n', SUCCEED, 'g1', ''),
# bug 128899: uppercase literals under the ignorecase flag # bug 128899: uppercase literals under the ignorecase flag
(r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'), (r'(?i)M+', 'MMM', SUCCEED, 'found', 'MMM'),
(r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'), (r'(?i)m+', 'MMM', SUCCEED, 'found', 'MMM'),

View file

@ -619,6 +619,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4)) self.assertEqual(re.fullmatch(r"a.*?b", "axxb").span(), (0, 4))
self.assertIsNone(re.fullmatch(r"a+", "ab")) self.assertIsNone(re.fullmatch(r"a+", "ab"))
self.assertIsNone(re.fullmatch(r"abc$", "abc\n")) self.assertIsNone(re.fullmatch(r"abc$", "abc\n"))
self.assertIsNone(re.fullmatch(r"abc\z", "abc\n"))
self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n")) self.assertIsNone(re.fullmatch(r"abc\Z", "abc\n"))
self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n")) self.assertIsNone(re.fullmatch(r"(?m)abc$", "abc\n"))
self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4)) self.assertEqual(re.fullmatch(r"ab(?=c)cd", "abcd").span(), (0, 4))
@ -802,6 +803,8 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(r"\B(b.)\B", self.assertEqual(re.search(r"\B(b.)\B",
"abc bcd bc abxd", re.ASCII).group(1), "bx") "abc bcd bc abxd", re.ASCII).group(1), "bx")
self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc") self.assertEqual(re.search(r"^abc$", "\nabc\n", re.M).group(0), "abc")
self.assertEqual(re.search(r"^\Aabc\z$", "abc", re.M).group(0), "abc")
self.assertIsNone(re.search(r"^\Aabc\z$", "\nabc\n", re.M))
self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc") self.assertEqual(re.search(r"^\Aabc\Z$", "abc", re.M).group(0), "abc")
self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M)) self.assertIsNone(re.search(r"^\Aabc\Z$", "\nabc\n", re.M))
self.assertEqual(re.search(br"\b(b.)\b", self.assertEqual(re.search(br"\b(b.)\b",
@ -813,6 +816,8 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(br"\B(b.)\B", self.assertEqual(re.search(br"\B(b.)\B",
b"abc bcd bc abxd", re.LOCALE).group(1), b"bx") b"abc bcd bc abxd", re.LOCALE).group(1), b"bx")
self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc") self.assertEqual(re.search(br"^abc$", b"\nabc\n", re.M).group(0), b"abc")
self.assertEqual(re.search(br"^\Aabc\z$", b"abc", re.M).group(0), b"abc")
self.assertIsNone(re.search(br"^\Aabc\z$", b"\nabc\n", re.M))
self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc") self.assertEqual(re.search(br"^\Aabc\Z$", b"abc", re.M).group(0), b"abc")
self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M)) self.assertIsNone(re.search(br"^\Aabc\Z$", b"\nabc\n", re.M))
self.assertEqual(re.search(r"\d\D\w\W\s\S", self.assertEqual(re.search(r"\d\D\w\W\s\S",
@ -836,7 +841,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^') self.assertEqual(re.match(r"[\^a]+", 'a^').group(), 'a^')
self.assertIsNone(re.match(r"[\^a]+", 'b')) self.assertIsNone(re.match(r"[\^a]+", 'b'))
re.purge() # for warnings re.purge() # for warnings
for c in 'ceghijklmopqyzCEFGHIJKLMNOPQRTVXY': for c in 'ceghijklmopqyCEFGHIJKLMNOPQRTVXY':
with self.subTest(c): with self.subTest(c):
self.assertRaises(re.PatternError, re.compile, '\\%c' % c) self.assertRaises(re.PatternError, re.compile, '\\%c' % c)
for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ': for c in 'ceghijklmopqyzABCEFGHIJKLMNOPQRTVXYZ':
@ -2608,8 +2613,8 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab']) self.assertEqual(re.findall(r'(?>(?:ab){1,3})', 'ababc'), ['abab'])
def test_bug_gh91616(self): def test_bug_gh91616(self):
self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\Z', "a.txt")) # reproducer self.assertTrue(re.fullmatch(r'(?s:(?>.*?\.).*)\z', "a.txt")) # reproducer
self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\Z', "a.txt")) self.assertTrue(re.fullmatch(r'(?s:(?=(?P<g0>.*?\.))(?P=g0).*)\z', "a.txt"))
def test_bug_gh100061(self): def test_bug_gh100061(self):
# gh-100061 # gh-100061

View file

@ -0,0 +1 @@
Support ``\z`` as a synonym for ``\Z`` in :mod:`regular expressions <re>`.