mirror of
https://github.com/python/cpython.git
synced 2025-10-15 11:22:18 +00:00
gh-91760: Deprecate group names and numbers which will be invalid in future (GH-91794)
Only a sequence of ASCII digits will be accepted as a numerical reference. Group names in bytes patterns and replacement strings will only be allowed to contain ASCII letters, digits and underscores.
This commit is contained in:
parent
6d0d547033
commit
19dca04121
5 changed files with 112 additions and 7 deletions
|
@ -417,6 +417,9 @@ The special characters are:
|
||||||
| | * ``\1`` |
|
| | * ``\1`` |
|
||||||
+---------------------------------------+----------------------------------+
|
+---------------------------------------+----------------------------------+
|
||||||
|
|
||||||
|
.. deprecated:: 3.11
|
||||||
|
Group names containing non-ASCII characters in bytes patterns.
|
||||||
|
|
||||||
.. index:: single: (?P=; in regular expressions
|
.. index:: single: (?P=; in regular expressions
|
||||||
|
|
||||||
``(?P=name)``
|
``(?P=name)``
|
||||||
|
@ -486,6 +489,9 @@ The special characters are:
|
||||||
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
|
will match with ``'<user@host.com>'`` as well as ``'user@host.com'``, but
|
||||||
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
|
not with ``'<user@host.com'`` nor ``'user@host.com>'``.
|
||||||
|
|
||||||
|
.. deprecated:: 3.11
|
||||||
|
Group *id* containing anything except ASCII digits.
|
||||||
|
|
||||||
|
|
||||||
The special sequences consist of ``'\'`` and a character from the list below.
|
The special sequences consist of ``'\'`` and a character from the list below.
|
||||||
If the ordinary character is not an ASCII digit or an ASCII letter, then the
|
If the ordinary character is not an ASCII digit or an ASCII letter, then the
|
||||||
|
@ -995,6 +1001,10 @@ form.
|
||||||
Empty matches for the pattern are replaced when adjacent to a previous
|
Empty matches for the pattern are replaced when adjacent to a previous
|
||||||
non-empty match.
|
non-empty match.
|
||||||
|
|
||||||
|
.. deprecated:: 3.11
|
||||||
|
Group *id* containing anything except ASCII digits.
|
||||||
|
Group names containing non-ASCII characters in bytes replacement strings.
|
||||||
|
|
||||||
|
|
||||||
.. function:: subn(pattern, repl, string, count=0, flags=0)
|
.. function:: subn(pattern, repl, string, count=0, flags=0)
|
||||||
|
|
||||||
|
|
|
@ -1151,6 +1151,14 @@ Deprecated
|
||||||
(Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
|
(Contributed by Brett Cannon in :issue:`47061` and Victor Stinner in
|
||||||
:gh:`68966`.)
|
:gh:`68966`.)
|
||||||
|
|
||||||
|
* More strict rules will be applied for numerical group references
|
||||||
|
and group names in regular expressions in future Python versions.
|
||||||
|
Only a sequence of ASCII digits will be accepted as a numerical reference.
|
||||||
|
The group name in bytes patterns and replacement strings will only be allowed to
|
||||||
|
contain ASCII letters, digits and underscores.
|
||||||
|
For now, a deprecation warning is raised for such syntax.
|
||||||
|
(Contributed by Serhiy Storchaka in :gh:`91760`.)
|
||||||
|
|
||||||
|
|
||||||
Removed
|
Removed
|
||||||
=======
|
=======
|
||||||
|
|
|
@ -287,8 +287,22 @@ class Tokenizer:
|
||||||
self.__next()
|
self.__next()
|
||||||
|
|
||||||
def error(self, msg, offset=0):
|
def error(self, msg, offset=0):
|
||||||
|
if not self.istext:
|
||||||
|
msg = msg.encode('ascii', 'backslashreplace').decode('ascii')
|
||||||
return error(msg, self.string, self.tell() - offset)
|
return error(msg, self.string, self.tell() - offset)
|
||||||
|
|
||||||
|
def checkgroupname(self, name, offset, nested):
|
||||||
|
if not name.isidentifier():
|
||||||
|
msg = "bad character in group name %r" % name
|
||||||
|
raise self.error(msg, len(name) + offset)
|
||||||
|
if not (self.istext or name.isascii()):
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"bad character in group name %a at position %d" %
|
||||||
|
(name, self.tell() - len(name) - offset),
|
||||||
|
DeprecationWarning, stacklevel=nested + 7
|
||||||
|
)
|
||||||
|
|
||||||
def _class_escape(source, escape):
|
def _class_escape(source, escape):
|
||||||
# handle escape code inside character class
|
# handle escape code inside character class
|
||||||
code = ESCAPES.get(escape)
|
code = ESCAPES.get(escape)
|
||||||
|
@ -703,15 +717,11 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
if sourcematch("<"):
|
if sourcematch("<"):
|
||||||
# named group: skip forward to end of name
|
# named group: skip forward to end of name
|
||||||
name = source.getuntil(">", "group name")
|
name = source.getuntil(">", "group name")
|
||||||
if not name.isidentifier():
|
source.checkgroupname(name, 1, nested)
|
||||||
msg = "bad character in group name %r" % name
|
|
||||||
raise source.error(msg, len(name) + 1)
|
|
||||||
elif sourcematch("="):
|
elif sourcematch("="):
|
||||||
# named backreference
|
# named backreference
|
||||||
name = source.getuntil(")", "group name")
|
name = source.getuntil(")", "group name")
|
||||||
if not name.isidentifier():
|
source.checkgroupname(name, 1, nested)
|
||||||
msg = "bad character in group name %r" % name
|
|
||||||
raise source.error(msg, len(name) + 1)
|
|
||||||
gid = state.groupdict.get(name)
|
gid = state.groupdict.get(name)
|
||||||
if gid is None:
|
if gid is None:
|
||||||
msg = "unknown group name %r" % name
|
msg = "unknown group name %r" % name
|
||||||
|
@ -773,6 +783,7 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
# conditional backreference group
|
# conditional backreference group
|
||||||
condname = source.getuntil(")", "group name")
|
condname = source.getuntil(")", "group name")
|
||||||
if condname.isidentifier():
|
if condname.isidentifier():
|
||||||
|
source.checkgroupname(condname, 1, nested)
|
||||||
condgroup = state.groupdict.get(condname)
|
condgroup = state.groupdict.get(condname)
|
||||||
if condgroup is None:
|
if condgroup is None:
|
||||||
msg = "unknown group name %r" % condname
|
msg = "unknown group name %r" % condname
|
||||||
|
@ -795,6 +806,14 @@ def _parse(source, state, verbose, nested, first=False):
|
||||||
state.grouprefpos[condgroup] = (
|
state.grouprefpos[condgroup] = (
|
||||||
source.tell() - len(condname) - 1
|
source.tell() - len(condname) - 1
|
||||||
)
|
)
|
||||||
|
if not (condname.isdecimal() and condname.isascii()):
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"bad character in group name %s at position %d" %
|
||||||
|
(repr(condname) if source.istext else ascii(condname),
|
||||||
|
source.tell() - len(condname) - 1),
|
||||||
|
DeprecationWarning, stacklevel=nested + 6
|
||||||
|
)
|
||||||
state.checklookbehindgroup(condgroup, source)
|
state.checklookbehindgroup(condgroup, source)
|
||||||
item_yes = _parse(source, state, verbose, nested + 1)
|
item_yes = _parse(source, state, verbose, nested + 1)
|
||||||
if source.match("|"):
|
if source.match("|"):
|
||||||
|
@ -1000,11 +1019,11 @@ def parse_template(source, state):
|
||||||
# group
|
# group
|
||||||
c = this[1]
|
c = this[1]
|
||||||
if c == "g":
|
if c == "g":
|
||||||
name = ""
|
|
||||||
if not s.match("<"):
|
if not s.match("<"):
|
||||||
raise s.error("missing <")
|
raise s.error("missing <")
|
||||||
name = s.getuntil(">", "group name")
|
name = s.getuntil(">", "group name")
|
||||||
if name.isidentifier():
|
if name.isidentifier():
|
||||||
|
s.checkgroupname(name, 1, -1)
|
||||||
try:
|
try:
|
||||||
index = groupindex[name]
|
index = groupindex[name]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
@ -1020,6 +1039,14 @@ def parse_template(source, state):
|
||||||
if index >= MAXGROUPS:
|
if index >= MAXGROUPS:
|
||||||
raise s.error("invalid group reference %d" % index,
|
raise s.error("invalid group reference %d" % index,
|
||||||
len(name) + 1)
|
len(name) + 1)
|
||||||
|
if not (name.isdecimal() and name.isascii()):
|
||||||
|
import warnings
|
||||||
|
warnings.warn(
|
||||||
|
"bad character in group name %s at position %d" %
|
||||||
|
(repr(name) if s.istext else ascii(name),
|
||||||
|
s.tell() - len(name) - 1),
|
||||||
|
DeprecationWarning, stacklevel=5
|
||||||
|
)
|
||||||
addgroup(index, len(name) + 1)
|
addgroup(index, len(name) + 1)
|
||||||
elif c == "0":
|
elif c == "0":
|
||||||
if s.next in OCTDIGITS:
|
if s.next in OCTDIGITS:
|
||||||
|
|
|
@ -135,6 +135,7 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
|
self.assertEqual(re.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
|
||||||
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
|
self.assertEqual(re.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
|
||||||
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
|
self.assertEqual(re.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')
|
||||||
|
self.assertEqual(re.sub('()x', r'\g<0>\g<0>', 'xx'), 'xxxx')
|
||||||
|
|
||||||
self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
|
self.assertEqual(re.sub('a', r'\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
|
||||||
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
|
self.assertEqual(re.sub('a', '\t\n\v\r\f\a\b', 'a'), '\t\n\v\r\f\a\b')
|
||||||
|
@ -274,6 +275,21 @@ class ReTests(unittest.TestCase):
|
||||||
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
|
self.checkPatternError('(?P<©>x)', "bad character in group name '©'", 4)
|
||||||
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
|
self.checkPatternError('(?P=©)', "bad character in group name '©'", 4)
|
||||||
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
|
self.checkPatternError('(?(©)y)', "bad character in group name '©'", 3)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\\xc2\\xb5' "
|
||||||
|
r"at position 4") as w:
|
||||||
|
re.compile(b'(?P<\xc2\xb5>x)')
|
||||||
|
self.assertEqual(w.filename, __file__)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\\xc2\\xb5' "
|
||||||
|
r"at position 4"):
|
||||||
|
self.checkPatternError(b'(?P=\xc2\xb5)',
|
||||||
|
r"unknown group name '\xc2\xb5'", 4)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\\xc2\\xb5' "
|
||||||
|
r"at position 3"):
|
||||||
|
self.checkPatternError(b'(?(\xc2\xb5)y)',
|
||||||
|
r"unknown group name '\xc2\xb5'", 3)
|
||||||
|
|
||||||
def test_symbolic_refs(self):
|
def test_symbolic_refs(self):
|
||||||
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
|
self.assertEqual(re.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
|
||||||
|
@ -306,12 +322,35 @@ class ReTests(unittest.TestCase):
|
||||||
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
|
re.sub('(?P<a>x)', r'\g<ab>', 'xx')
|
||||||
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
|
self.checkTemplateError('(?P<a>x)', r'\g<-1>', 'xx',
|
||||||
"bad character in group name '-1'", 3)
|
"bad character in group name '-1'", 3)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\+1' "
|
||||||
|
r"at position 3") as w:
|
||||||
|
re.sub('(?P<a>x)', r'\g<+1>', 'xx')
|
||||||
|
self.assertEqual(w.filename, __file__)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '1_0' "
|
||||||
|
r"at position 3"):
|
||||||
|
re.sub('()'*10, r'\g<1_0>', 'xx')
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name ' 1 ' "
|
||||||
|
r"at position 3"):
|
||||||
|
re.sub('(?P<a>x)', r'\g< 1 >', 'xx')
|
||||||
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
|
self.checkTemplateError('(?P<a>x)', r'\g<©>', 'xx',
|
||||||
"bad character in group name '©'", 3)
|
"bad character in group name '©'", 3)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\\xc2\\xb5' "
|
||||||
|
r"at position 3") as w:
|
||||||
|
with self.assertRaisesRegex(IndexError, "unknown group name '\xc2\xb5'"):
|
||||||
|
re.sub(b'(?P<a>x)', b'\\g<\xc2\xb5>', b'xx')
|
||||||
|
self.assertEqual(w.filename, __file__)
|
||||||
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
|
self.checkTemplateError('(?P<a>x)', r'\g<㊀>', 'xx',
|
||||||
"bad character in group name '㊀'", 3)
|
"bad character in group name '㊀'", 3)
|
||||||
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
|
self.checkTemplateError('(?P<a>x)', r'\g<¹>', 'xx',
|
||||||
"bad character in group name '¹'", 3)
|
"bad character in group name '¹'", 3)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '१' "
|
||||||
|
r"at position 3"):
|
||||||
|
re.sub('(?P<a>x)', r'\g<१>', 'xx')
|
||||||
|
|
||||||
def test_re_subn(self):
|
def test_re_subn(self):
|
||||||
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
|
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
|
||||||
|
@ -577,10 +616,27 @@ class ReTests(unittest.TestCase):
|
||||||
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
|
self.checkPatternError(r'(?P<a>)(?(0)a|b)', 'bad group number', 10)
|
||||||
self.checkPatternError(r'()(?(-1)a|b)',
|
self.checkPatternError(r'()(?(-1)a|b)',
|
||||||
"bad character in group name '-1'", 5)
|
"bad character in group name '-1'", 5)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '\+1' "
|
||||||
|
r"at position 5") as w:
|
||||||
|
re.compile(r'()(?(+1)a|b)')
|
||||||
|
self.assertEqual(w.filename, __file__)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '1_0' "
|
||||||
|
r"at position 23"):
|
||||||
|
re.compile(r'()'*10 + r'(?(1_0)a|b)')
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name ' 1 ' "
|
||||||
|
r"at position 5"):
|
||||||
|
re.compile(r'()(?( 1 )a|b)')
|
||||||
self.checkPatternError(r'()(?(㊀)a|b)',
|
self.checkPatternError(r'()(?(㊀)a|b)',
|
||||||
"bad character in group name '㊀'", 5)
|
"bad character in group name '㊀'", 5)
|
||||||
self.checkPatternError(r'()(?(¹)a|b)',
|
self.checkPatternError(r'()(?(¹)a|b)',
|
||||||
"bad character in group name '¹'", 5)
|
"bad character in group name '¹'", 5)
|
||||||
|
with self.assertWarnsRegex(DeprecationWarning,
|
||||||
|
r"bad character in group name '१' "
|
||||||
|
r"at position 5"):
|
||||||
|
re.compile(r'()(?(१)a|b)')
|
||||||
self.checkPatternError(r'()(?(1',
|
self.checkPatternError(r'()(?(1',
|
||||||
"missing ), unterminated name", 5)
|
"missing ), unterminated name", 5)
|
||||||
self.checkPatternError(r'()(?(1)a',
|
self.checkPatternError(r'()(?(1)a',
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
More strict rules will be applied for numerical group references and group
|
||||||
|
names in regular expressions. For now, a deprecation warning is emitted for
|
||||||
|
group references and group names which will be errors in future Python
|
||||||
|
versions.
|
Loading…
Add table
Add a link
Reference in a new issue