gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)

Only a sequence of ASCII digits will be accepted as a numerical reference.
The group name in bytes patterns and replacement strings will only be able to
contain ASCII letters, digits and underscores.
This commit is contained in:
Serhiy Storchaka 2022-04-30 13:13:46 +03:00 committed by GitHub
parent 6d0d547033
commit 19dca04121
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 112 additions and 7 deletions

View file

@ -417,6 +417,9 @@ The special characters are:
| | * ``\1`` | | | * ``\1`` |
+---------------------------------------+----------------------------------+ +---------------------------------------+----------------------------------+
.. deprecated:: 3.11
Group names containing non-ASCII characters in bytes patterns.
.. index:: single: (?P=; in regular expressions .. index:: single: (?P=; in regular expressions
``(?P=name)`` ``(?P=name)``
@ -486,6 +489,9 @@ The special characters are:
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
not with ``'<user@host.com'`` nor ``'user@host.com>'``. not with ``'<user@host.com'`` nor ``'user@host.com>'``.
.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.
The special sequences consist of ``'\'`` and a character from the list below. The special sequences consist of ``'\'`` and a character from the list below.
If the ordinary character is not an ASCII digit or an ASCII letter, then the If the ordinary character is not an ASCII digit or an ASCII letter, then the
@ -995,6 +1001,10 @@ form.
Empty matches for the pattern are replaced when adjacent to a previous Empty matches for the pattern are replaced when adjacent to a previous
non-empty match. non-empty match.
.. deprecated:: 3.11
Group *id* containing anything except ASCII digits.
Group names containing non-ASCII characters in bytes replacement strings.
.. function:: subn(pattern, repl, string, count=0, flags=0) .. function:: subn(pattern, repl, string, count=0, flags=0)

View file

@ -1151,6 +1151,14 @@ Deprecated
(Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in (Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
:gh:`68966`.) :gh:`68966`.)
* Stricter rules will be applied for numerical group references
and group names in regular expressions in future Python versions.
Only a sequence of ASCII digits will be accepted as a numerical reference.
The group name in bytes patterns and replacement strings will only be able to
contain ASCII letters, digits and underscores.
For now, a deprecation warning is emitted for such syntax.
(Contributed by Serhiy Storchaka in :gh:`91760`.)
Removed Removed
======= =======

View file

@ -287,8 +287,22 @@ class Tokenizer:
self.__next() self.__next()
def error(self, msg, offset=0): def error(self, msg, offset=0):
if not self.istext:
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
return error(msg, self.string, self.tell() - offset) return error(msg, self.string, self.tell() - offset)
def checkgroupname(self, name, offset, nested):
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
if not (self.istext or name.isascii()):
import warnings
warnings.warn(
"bad character in group name %a at position %d" %
(name, self.tell() - len(name) - offset),
DeprecationWarning, stacklevel=nested + 7
)
def _class_escape(source, escape): def _class_escape(source, escape):
# handle escape code inside character class # handle escape code inside character class
code = ESCAPES.get(escape) code = ESCAPES.get(escape)
@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = source.getuntil(">", "group name") name = source.getuntil(">", "group name")
if not name.isidentifier(): source.checkgroupname(name, 1, nested)
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")", "group name") name = source.getuntil(")", "group name")
if not name.isidentifier(): source.checkgroupname(name, 1, nested)
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
gid = state.groupdict.get(name) gid = state.groupdict.get(name)
if gid is None: if gid is None:
msg = "unknown group name %r" % name msg = "unknown group name %r" % name
@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
# conditional backreference group # conditional backreference group
condname = source.getuntil(")", "group name") condname = source.getuntil(")", "group name")
if condname.isidentifier(): if condname.isidentifier():
source.checkgroupname(condname, 1, nested)
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
msg = "unknown group name %r" % condname msg = "unknown group name %r" % condname
@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
state.grouprefpos[condgroup] = ( state.grouprefpos[condgroup] = (
source.tell() - len(condname) - 1 source.tell() - len(condname) - 1
) )
if not (condname.isdecimal() and condname.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(condname) if source.istext else ascii(condname),
source.tell() - len(condname) - 1),
DeprecationWarning, stacklevel=nested + 6
)
state.checklookbehindgroup(condgroup, source) state.checklookbehindgroup(condgroup, source)
item_yes = _parse(source, state, verbose, nested + 1) item_yes = _parse(source, state, verbose, nested + 1)
if source.match("|"): if source.match("|"):
@ -1000,11 +1019,11 @@ def parse_template(source, state):
# group # group
c = this[1] c = this[1]
if c == "g": if c == "g":
name = ""
if not s.match("<"): if not s.match("<"):
raise s.error("missing <") raise s.error("missing <")
name = s.getuntil(">", "group name") name = s.getuntil(">", "group name")
if name.isidentifier(): if name.isidentifier():
s.checkgroupname(name, 1, -1)
try: try:
index = groupindex[name] index = groupindex[name]
except KeyError: except KeyError:
@ -1020,6 +1039,14 @@ def parse_template(source, state):
if index >= MAXGROUPS: if index >= MAXGROUPS:
raise s.error("invalid group reference %d" % index, raise s.error("invalid group reference %d" % index,
len(name) + 1) len(name) + 1)
if not (name.isdecimal() and name.isascii()):
import warnings
warnings.warn(
"bad character in group name %s at position %d" %
(repr(name) if s.istext else ascii(name),
s.tell() - len(name) - 1),
DeprecationWarning, stacklevel=5
)
addgroup(index, len(name) + 1) addgroup(index, len(name) + 1)
elif c == "0": elif c == "0":
if s.next in OCTDIGITS: if s.next in OCTDIGITS:

View file

@ -135,6 +135,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx') self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b') self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
@ -274,6 +275,21 @@ class ReTests(unittest.TestCase):
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4) self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4) self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3) self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4") as w:
re.compile(b'(?P<\xc2\xb5>x)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 4"):
self.checkPatternError(b'(?P=\xc2\xb5)',
r"unknown group name '\xc2\xb5'", 4)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3"):
self.checkPatternError(b'(?(\xc2\xb5)y)',
r"unknown group name '\xc2\xb5'", 3)
def test_symbolic_refs(self): def test_symbolic_refs(self):
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '') self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
@ -306,12 +322,35 @@ class ReTests(unittest.TestCase):
re.sub('(?P<a>x)', r'\g<ab>', 'xx') re.sub('(?P<a>x)', r'\g<ab>', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
"bad character in group name '-1'", 3) "bad character in group name '-1'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 3") as w:
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 3"):
re.sub('()'*10, r'\g<1_0>', 'xx')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
"bad character in group name '©'", 3) "bad character in group name '©'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\\xc2\\xb5' "
r"at position 3") as w:
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
self.assertEqual(w.filename, __file__)
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
"bad character in group name '㊀'", 3) "bad character in group name '㊀'", 3)
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx', self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
"bad character in group name '¹'", 3) "bad character in group name '¹'", 3)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 3"):
re.sub('(?P<a>x)', r'\g<१>', 'xx')
def test_re_subn(self): def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@ -577,10 +616,27 @@ class ReTests(unittest.TestCase):
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10) self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
self.checkPatternError(r'()(?(-1)a|b)', self.checkPatternError(r'()(?(-1)a|b)',
"bad character in group name '-1'", 5) "bad character in group name '-1'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '\+1' "
r"at position 5") as w:
re.compile(r'()(?(+1)a|b)')
self.assertEqual(w.filename, __file__)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '1_0' "
r"at position 23"):
re.compile(r'()'*10 + r'(?(1_0)a|b)')
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name ' 1 ' "
r"at position 5"):
re.compile(r'()(?( 1 )a|b)')
self.checkPatternError(r'()(?(㊀)a|b)', self.checkPatternError(r'()(?(㊀)a|b)',
"bad character in group name '㊀'", 5) "bad character in group name '㊀'", 5)
self.checkPatternError(r'()(?(¹)a|b)', self.checkPatternError(r'()(?(¹)a|b)',
"bad character in group name '¹'", 5) "bad character in group name '¹'", 5)
with self.assertWarnsRegex(DeprecationWarning,
r"bad character in group name '१' "
r"at position 5"):
re.compile(r'()(?(१)a|b)')
self.checkPatternError(r'()(?(1', self.checkPatternError(r'()(?(1',
"missing ), unterminated name", 5) "missing ), unterminated name", 5)
self.checkPatternError(r'()(?(1)a', self.checkPatternError(r'()(?(1)a',

View file

@ -0,0 +1,4 @@
More strict rules will be applied for numerical group references and group
names in regular expressions. For now, a deprecation warning is emitted for
group references and group names which will be errors in future Python
versions.