gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)

Only a sequence of ASCII digits will be accepted as a numerical reference. Group names in bytes patterns and replacement strings will only be allowed to contain ASCII letters, digits and underscores.
2025-10-15 11:22:18 +00:00 · 2022-04-30 13:13:46 +03:00 · 2022-04-30 13:13:46 +03:00 · 19dca04121
commit 19dca04121
parent 6d0d547033
5 changed files with 112 additions and 7 deletions
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@ -417,6 +417,9 @@ The special characters are:
   |                                       | * ``\1``                         |
   +---------------------------------------+----------------------------------+
   .. deprecated:: 3.11
      Group names containing non-ASCII characters in bytes patterns.
 .. index:: single: (?P=; in regular expressions
 ``(?P=name)``
@ -486,6 +489,9 @@ The special characters are:
   will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
   not with ``'<user@host.com'`` nor ``'user@host.com>'``.
   .. deprecated:: 3.11
      Group *id* containing anything except ASCII digits.
 The special sequences consist of ``'\'`` and a character from the list below.
 If the ordinary character is not an ASCII digit or an ASCII letter, then the
@ -995,6 +1001,10 @@ form.
      Empty matches for the pattern are replaced when adjacent to a previous
      non-empty match.
   .. deprecated:: 3.11
      Group *id* containing anything except ASCII digits.
      Group names containing non-ASCII characters in bytes replacement strings.
 .. function:: subn(pattern, repl, string, count=0, flags=0)
--- a/Doc/whatsnew/3.11.rst
+++ b/Doc/whatsnew/3.11.rst
@ -1151,6 +1151,14 @@ Deprecated
  (Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
  :gh:`68966`.)
 * More strict rules will be applied for numerical group references
  and group names in regular expressions in future Python versions.
  Only a sequence of ASCII digits will be accepted as a numerical reference.
  Group names in bytes patterns and replacement strings will only be
  allowed to contain ASCII letters, digits and underscores.
  For now, a deprecation warning is raised for such syntax.
  (Contributed by Serhiy Storchaka in :gh:`91760`.)
 Removed
 =======
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@ -287,8 +287,22 @@ class Tokenizer:
        self.__next()
    def error(self, msg, offset=0):
        """Build and return (not raise) an ``error`` for the current input.

        The reported position is ``self.tell() - offset``.  When
        ``self.istext`` is false, every non-ASCII character of *msg* is
        replaced by a backslash escape before the error object is created.
        """
        if self.istext:
            text = msg
        else:
            # Round-trip through ASCII so the message contains escapes only.
            escaped = msg.encode('ascii', 'backslashreplace')
            text = escaped.decode('ascii')
        position = self.tell() - offset
        return error(text, self.string, position)
    def checkgroupname(self, name, offset, nested):
        """Validate a group name that has just been read.

        *name* is the group name, *offset* is the extra distance (beyond
        ``len(name)``) between the start of the name and the current
        position, and *nested* is added to the warning's ``stacklevel``.

        Raises the object returned by ``self.error()`` when *name* is not a
        valid identifier.  Otherwise, when ``self.istext`` is false and
        *name* contains non-ASCII characters, emits a DeprecationWarning.
        """
        if not name.isidentifier():
            raise self.error("bad character in group name %r" % name,
                             len(name) + offset)
        if self.istext or name.isascii():
            return
        import warnings
        # warn() must be called directly from this frame: the stacklevel
        # arithmetic depends on the exact call depth.
        start = self.tell() - len(name) - offset
        warnings.warn(
            "bad character in group name %a at position %d" % (name, start),
            DeprecationWarning, stacklevel=nested + 7
        )
 def _class_escape(source, escape):
    # handle escape code inside character class
    code = ESCAPES.get(escape)
@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = source.getuntil(">", "group name")
-                        if not name.isidentifier():
+                        source.checkgroupname(name, 1, nested)
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                    elif sourcematch("="):
                        # named backreference
                        name = source.getuntil(")", "group name")
-                        if not name.isidentifier():
+                        source.checkgroupname(name, 1, nested)
                            msg = "bad character in group name %r" % name
                            raise source.error(msg, len(name) + 1)
                        gid = state.groupdict.get(name)
                        if gid is None:
                            msg = "unknown group name %r" % name
@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
                    # conditional backreference group
                    condname = source.getuntil(")", "group name")
                    if condname.isidentifier():
                        source.checkgroupname(condname, 1, nested)
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            msg = "unknown group name %r" % condname
@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
                            state.grouprefpos[condgroup] = (
                                source.tell() - len(condname) - 1
                            )
                        if not (condname.isdecimal() and condname.isascii()):
                            import warnings
                            warnings.warn(
                                "bad character in group name %s at position %d" %
                                (repr(condname) if source.istext else ascii(condname),
                                 source.tell() - len(condname) - 1),
                                DeprecationWarning, stacklevel=nested + 6
                            )
                    state.checklookbehindgroup(condgroup, source)
                    item_yes = _parse(source, state, verbose, nested + 1)
                    if source.match("|"):
@ -1000,11 +1019,11 @@ def parse_template(source, state):
            # group
            c = this[1]
            if c == "g":
                name = ""
                if not s.match("<"):
                    raise s.error("missing <")
                name = s.getuntil(">", "group name")
                if name.isidentifier():
                    s.checkgroupname(name, 1, -1)
                    try:
                        index = groupindex[name]
                    except KeyError:
@ -1020,6 +1039,14 @@ def parse_template(source, state):
                    if index >= MAXGROUPS:
                        raise s.error("invalid group reference %d" % index,
                                      len(name) + 1)
                    if not (name.isdecimal() and name.isascii()):
                        import warnings
                        warnings.warn(
                            "bad character in group name %s at position %d" %
                            (repr(name) if s.istext else ascii(name),
                             s.tell() - len(name) - 1),
                            DeprecationWarning, stacklevel=5
                        )
                addgroup(index, len(name) + 1)
            elif c == "0":
                if s.next in OCTDIGITS:
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -135,6 +135,7 @@ class ReTests(unittest.TestCase):
        self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
        self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
        self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@ -274,6 +275,21 @@ class ReTests(unittest.TestCase):
        self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
        self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
        self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 4") as w:
            re.compile(b'(?P<\xc2\xb5>x)')
        self.assertEqual(w.filename, __file__)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 4"):
            self.checkPatternError(b'(?P=\xc2\xb5)',
                                   r"unknown group name '\xc2\xb5'", 4)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 3"):
            self.checkPatternError(b'(?(\xc2\xb5)y)',
                                   r"unknown group name '\xc2\xb5'", 3)
    def test_symbolic_refs(self):
        self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@ -306,12 +322,35 @@ class ReTests(unittest.TestCase):
            re.sub('(?P<a>x)', r'\g<ab>', 'xx')
        self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
                                "bad character in group name '-1'", 3)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\+1' "
                                   r"at position 3") as w:
            re.sub('(?P<a>x)', r'\g<+1>', 'xx')
        self.assertEqual(w.filename, __file__)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '1_0' "
                                   r"at position 3"):
            re.sub('()'*10, r'\g<1_0>', 'xx')
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name ' 1 ' "
                                   r"at position 3"):
            re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
        self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
                                "bad character in group name '©'", 3)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\\xc2\\xb5' "
                                   r"at position 3") as w:
            with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
                re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
        self.assertEqual(w.filename, __file__)
        self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
                                "bad character in group name '㊀'", 3)
        self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
                                "bad character in group name '¹'", 3)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '१' "
                                   r"at position 3"):
            re.sub('(?P<a>x)', r'\g<१>', 'xx')
    def test_re_subn(self):
        self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@ -577,10 +616,27 @@ class ReTests(unittest.TestCase):
        self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
        self.checkPatternError(r'()(?(-1)a|b)',
                               "bad character in group name '-1'", 5)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '\+1' "
                                   r"at position 5") as w:
            re.compile(r'()(?(+1)a|b)')
        self.assertEqual(w.filename, __file__)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '1_0' "
                                   r"at position 23"):
            re.compile(r'()'*10 + r'(?(1_0)a|b)')
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name ' 1 ' "
                                   r"at position 5"):
            re.compile(r'()(?( 1 )a|b)')
        self.checkPatternError(r'()(?(㊀)a|b)',
                               "bad character in group name '㊀'", 5)
        self.checkPatternError(r'()(?(¹)a|b)',
                               "bad character in group name '¹'", 5)
        with self.assertWarnsRegex(DeprecationWarning,
                                   r"bad character in group name '१' "
                                   r"at position 5"):
            re.compile(r'()(?(१)a|b)')
        self.checkPatternError(r'()(?(1',
                               "missing ), unterminated name", 5)
        self.checkPatternError(r'()(?(1)a',
--- a/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst
+++ b/Misc/NEWS.d/next/Library/2022-04-21-19-46-03.gh-issue-91760.zDtv1E.rst
@ -0,0 +1,4 @@
 More strict rules will be applied for numerical group references and group
 names in regular expressions. For now, a deprecation warning is emitted for
 group references and group names which will be errors in future Python
 versions.