gh-91760: More strict rules for numerical group references and group names in RE (GH-91792)

Only a sequence of ASCII digits is now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits, and underscores.
This commit is contained in:
Serhiy Storchaka 2022-05-08 19:19:29 +03:00 committed by GitHub
parent 7b024e3a3f
commit a84a56d80f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 62 additions and 91 deletions

View file

@ -395,7 +395,8 @@ The special characters are:
``(?P<name>...)`` ``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid accessible via the symbolic group name *name*. Group names must be valid
Python identifiers, and each group name must be defined only once within a Python identifiers, and in bytes patterns they must contain only characters
in the ASCII range. Each group name must be defined only once within a
regular expression. A symbolic group is also a numbered group, just as if regular expression. A symbolic group is also a numbered group, just as if
the group were not named. the group were not named.
@ -417,8 +418,9 @@ The special characters are:
| | * ``\1`` | | | * ``\1`` |
+---------------------------------------+----------------------------------+ +---------------------------------------+----------------------------------+
.. deprecated:: 3.11 .. versionchanged:: 3.12
Group names containing non-ASCII characters in bytes patterns. In bytes patterns group names must contain only characters in
the ASCII range.
.. index:: single: (?P=; in regular expressions .. index:: single: (?P=; in regular expressions
@ -489,8 +491,8 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``. not with ``'<user@host.com'`` nor ``'user@host.com>'``.
.. deprecated:: 3.11 .. versionchanged:: 3.12
Group *id* containing anything except ASCII digits. Group *id* can only contain ASCII digits.
The special sequences consist of ``'\'`` and a character from the list below. The special sequences consist of ``'\'`` and a character from the list below.
@ -1001,9 +1003,10 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous Empty matches for the pattern are replaced when adjacent to a previous
non-empty match. non-empty match.
.. deprecated:: 3.11 .. versionchanged:: 3.12
Group *id* containing anything except ASCII digits. Group *id* can only contain ASCII digits.
Group names containing non-ASCII characters in bytes replacement strings. In bytes replacement strings group names must contain only characters
in the ASCII range.
.. function:: subn(pattern, repl, string, count=0, flags=0) .. function:: subn(pattern, repl, string, count=0, flags=0)

View file

@ -114,3 +114,13 @@ Porting to Python 3.12
This section lists previously described changes and other bugfixes This section lists previously described changes and other bugfixes
that may require changes to your code. that may require changes to your code.
Changes in the Python API
-------------------------
* Stricter rules are now applied for numerical group references and
group names in regular expressions.
Only a sequence of ASCII digits is now accepted as a numerical reference.
The group name in bytes patterns and replacement strings can now only
contain ASCII letters, digits, and underscores.
(Contributed by Serhiy Storchaka in :gh:`91760`.)

View file

@ -291,17 +291,13 @@ class Tokenizer:
msg = msg.encode('ascii', 'backslashreplace').decode('ascii') msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
return error(msg, self.string, self.tell() - offset) return error(msg, self.string, self.tell() - offset)
def checkgroupname(self, name, offset, nested): def checkgroupname(self, name, offset):
if not (self.istext or name.isascii()):
msg = "bad character in group name %a" % name
raise self.error(msg, len(name) + offset)
if not name.isidentifier(): if not name.isidentifier():
msg = "bad character in group name %r" % name msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset) raise self.error(msg, len(name) + offset)
if not (self.istext or name.isascii()):
import warnings
warnings.warn(
"bad character in group name %a at position %d" %
(name, self.tell() - len(name) - offset),
DeprecationWarning, stacklevel=nested + 7
)
def _class_escape(source, escape): def _class_escape(source, escape):
# handle escape code inside character class # handle escape code inside character class
@ -717,11 +713,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = source.getuntil(">", "group name") name = source.getuntil(">", "group name")
source.checkgroupname(name, 1, nested) source.checkgroupname(name, 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")", "group name") name = source.getuntil(")", "group name")
source.checkgroupname(name, 1, nested) source.checkgroupname(name, 1)
gid = state.groupdict.get(name) gid = state.groupdict.get(name)
if gid is None: if gid is None:
msg = "unknown group name %r" % name msg = "unknown group name %r" % name
@ -782,20 +778,14 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(": elif char == "(":
# conditional backreference group # conditional backreference group
condname = source.getuntil(")", "group name") condname = source.getuntil(")", "group name")
if condname.isidentifier(): if not (condname.isdecimal() and condname.isascii()):
source.checkgroupname(condname, 1, nested) source.checkgroupname(condname, 1)
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
msg = "unknown group name %r" % condname msg = "unknown group name %r" % condname
raise source.error(msg, len(condname) + 1) raise source.error(msg, len(condname) + 1)
else: else:
try: condgroup = int(condname)
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
msg = "bad character in group name %r" % condname
raise source.error(msg, len(condname) + 1) from None
if not condgroup: if not condgroup:
raise source.error("bad group number", raise source.error("bad group number",
len(condname) + 1) len(condname) + 1)
@ -1022,20 +1012,14 @@ def parse_template(source, state):
if not s.match("<"): if not s.match("<"):
raise s.error("missing <") raise s.error("missing <")
name = s.getuntil(">", "group name") name = s.getuntil(">", "group name")
if name.isidentifier(): if not (name.isdecimal() and name.isascii()):
s.checkgroupname(name, 1, -1) s.checkgroupname(name, 1)
try: try:
index = groupindex[name] index = groupindex[name]
except KeyError: except KeyError:
raise IndexError("unknown group name %r" % name) from None raise IndexError("unknown group name %r" % name) from None
else: else:
try: index = int(name)
index = int(name)
if index < 0:
raise ValueError
except ValueError:
raise s.error("bad character in group name %r" % name,
len(name) + 1) from None
if index >= MAXGROUPS: if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index, raise s.error("invalid group reference %d" % index,
len(name) + 1) len(name) + 1)

View file

@ -275,21 +275,12 @@ class ReTests(unittest.TestCase):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning, self.checkPatternError(b'(?P<\xc2\xb5>x)',
r"bad character in group name '\\xc2\\xb5' " r"bad character in group name '\xc2\xb5'", 4)
r"at position 4") as w: self.checkPatternError(b'(?P=\xc2\xb5)',
re.compile(b'(?P<\xc2\xb5>x)') r"bad character in group name '\xc2\xb5'", 4)
self.assertEqual(w.filename, __file__) self.checkPatternError(b'(?(\xc2\xb5)y)',
with self.assertWarnsRegex(DeprecationWarning, r"bad character in group name '\xc2\xb5'", 3)
r"bad character in group name '\\xc2\\xb5' "
r"at position 4"):
self.checkPatternError(b'(?P=\xc2\xb5)',
r"unknown group name '\xc2\xb5'", 4)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3"):
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"unknown group name '\xc2\xb5'", 3)
def test_symbolic_refs(self): def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@ -322,35 +313,22 @@ class ReTests(unittest.TestCase):
re.sub('(?P<a>x)', r'\g<ab>', 'xx') re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3) "bad character in group name '-1'", 3)
with self.assertWarnsRegex(DeprecationWarning, self.checkTemplateError('(?P<a>x)', r'\g<+1>', 'xx',
r"bad character in group name '\+1' " "bad character in group name '+1'", 3)
r"at position 3") as w: self.checkTemplateError('()'*10, r'\g<1_0>', 'xx',
re.sub('(?P<a>x)', r'\g<+1>', 'xx') "bad character in group name '1_0'", 3)
self.assertEqual(w.filename, __file__) self.checkTemplateError('(?P<a>x)', r'\g< 1 >', 'xx',
with self.assertWarnsRegex(DeprecationWarning, "bad character in group name ' 1 '", 3)
r"bad character in group name '1_0' "
r"at position 3"):
re.sub('()'*10, r'\g<1_0>', 'xx')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3) "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning, self.checkTemplateError(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx',
r"bad character in group name '\\xc2\\xb5' " r"bad character in group name '\xc2\xb5'", 3)
r"at position 3") as w:
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
self.assertEqual(w.filename, __file__)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3) "bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3) "bad character in group name '¹'", 3)
with self.assertWarnsRegex(DeprecationWarning, self.checkTemplateError('(?P<a>x)', r'\g<१>', 'xx',
r"bad character in group name '१' " "bad character in group name '१'", 3)
r"at position 3"):
re.sub('(?P<a>x)', r'\g<१>', 'xx')
def test_re_subn(self): def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@ -616,27 +594,18 @@ class ReTests(unittest.TestCase):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)', self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5) "bad character in group name '-1'", 5)
with self.assertWarnsRegex(DeprecationWarning, self.checkPatternError(r'()(?(+1)a|b)',
r"bad character in group name '\+1' " "bad character in group name '+1'", 5)
r"at position 5") as w: self.checkPatternError(r'()'*10 + r'(?(1_0)a|b)',
re.compile(r'()(?(+1)a|b)') "bad character in group name '1_0'", 23)
self.assertEqual(w.filename, __file__) self.checkPatternError(r'()(?( 1 )a|b)',
with self.assertWarnsRegex(DeprecationWarning, "bad character in group name ' 1 '", 5)
r"bad character in group name '1_0' "
r"at position 23"):
re.compile(r'()'*10 + r'(?(1_0)a|b)')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 5"):
re.compile(r'()(?( 1 )a|b)')
self.checkPatternError(r'()(?(㊀)a|b)', self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5) "bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)', self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5) "bad character in group name '¹'", 5)
with self.assertWarnsRegex(DeprecationWarning, self.checkPatternError(r'()(?(१)a|b)',
r"bad character in group name '१' " "bad character in group name '१'", 5)
r"at position 5"):
re.compile(r'()(?(१)a|b)')
self.checkPatternError(r'()(?(1', self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5) "missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a', self.checkPatternError(r'()(?(1)a',

View file

@ -0,0 +1,5 @@
Apply stricter rules for numerical group references and group names in
regular expressions. Only a sequence of ASCII digits is now accepted as
a numerical reference. The group name in
bytes patterns and replacement strings can now only contain ASCII letters,
digits, and underscores.