Issue #22437: The number of capturing groups in a regular expression is no longer limited to 100.
This commit is contained in:
Serhiy Storchaka 2014-09-29 22:49:23 +03:00
parent c31e6227f9
commit 9baa5b2de2
8 changed files with 76 additions and 27 deletions

View file

@ -217,6 +217,12 @@ os
* :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes` * :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes`
attribute on Windows (contributed by Ben Hoyt in :issue:`21719`). attribute on Windows (contributed by Ben Hoyt in :issue:`21719`).
re
--
* The number of capturing groups in a regular expression is no longer limited to 100.
(Contributed by Serhiy Storchaka in :issue:`22437`.)
shutil shutil
------ ------

View file

@ -470,12 +470,6 @@ def compile(p, flags=0):
# print code # print code
# XXX: <fl> get rid of this limitation!
if p.pattern.groups > 100:
raise AssertionError(
"sorry, but this version only supports 100 named groups"
)
# map in either direction # map in either direction
groupindex = p.pattern.groupdict groupindex = p.pattern.groupdict
indexgroup = [None] * p.pattern.groups indexgroup = [None] * p.pattern.groups

View file

@ -15,7 +15,7 @@
MAGIC = 20031017 MAGIC = 20031017
from _sre import MAXREPEAT from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error) # SRE standard exception (access as sre.error)
# should this really be here? # should this really be here?

View file

@ -72,6 +72,8 @@ class Pattern:
def opengroup(self, name=None): def opengroup(self, name=None):
gid = self.groups gid = self.groups
self.groups = gid + 1 self.groups = gid + 1
if self.groups > MAXGROUPS:
raise error("groups number is too large")
if name is not None: if name is not None:
ogid = self.groupdict.get(name, None) ogid = self.groupdict.get(name, None)
if ogid is not None: if ogid is not None:
@ -695,8 +697,14 @@ def _parse(source, state):
else: else:
try: try:
condgroup = int(condname) condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError: except ValueError:
raise error("bad character in group name") raise error("bad character in group name")
if not condgroup:
raise error("bad group number")
if condgroup >= MAXGROUPS:
raise error("the group number is too large")
else: else:
# flags # flags
if not source.next in FLAGS: if not source.next in FLAGS:
@ -822,6 +830,8 @@ def parse_template(source, pattern):
index = int(name) index = int(name)
if index < 0: if index < 0:
raise error("negative group number") raise error("negative group number")
if index >= MAXGROUPS:
raise error("the group number is too large")
except ValueError: except ValueError:
if not name.isidentifier(): if not name.isidentifier():
raise error("bad character in group name") raise error("bad character in group name")

View file

@ -193,6 +193,7 @@ class ReTests(unittest.TestCase):
def test_symbolic_groups(self): def test_symbolic_groups(self):
re.compile('(?P<a>x)(?P=a)(?(a)y)') re.compile('(?P<a>x)(?P=a)(?(a)y)')
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
re.compile('(?P<a1>x)\1(?(1)y)')
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
self.assertRaises(re.error, re.compile, '(?Px)') self.assertRaises(re.error, re.compile, '(?Px)')
self.assertRaises(re.error, re.compile, '(?P=)') self.assertRaises(re.error, re.compile, '(?P=)')
@ -212,6 +213,10 @@ class ReTests(unittest.TestCase):
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)') re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
self.assertRaises(re.error, re.compile, '(?P<©>x)') self.assertRaises(re.error, re.compile, '(?P<©>x)')
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z|t)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_symbolic_refs(self): def test_symbolic_refs(self):
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
@ -228,6 +233,9 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx') self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
def test_re_subn(self): def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
@ -404,6 +412,10 @@ class ReTests(unittest.TestCase):
self.assertIsNone(p.match('abd')) self.assertIsNone(p.match('abd'))
self.assertIsNone(p.match('ac')) self.assertIsNone(p.match('ac'))
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_re_groupref(self): def test_re_groupref(self):
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
@ -1070,8 +1082,10 @@ class ReTests(unittest.TestCase):
# a RuntimeError is raised instead of OverflowError. # a RuntimeError is raised instead of OverflowError.
long_overflow = 2**128 long_overflow = 2**128
self.assertRaises(TypeError, re.finditer, "a", {}) self.assertRaises(TypeError, re.finditer, "a", {})
self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) with self.assertRaises(OverflowError):
self.assertRaises(TypeError, _sre.compile, {}, 0, []) _sre.compile("abc", 0, [long_overflow], 0, [], [])
with self.assertRaises(TypeError):
_sre.compile({}, 0, [], 0, [], [])
def test_search_dot_unicode(self): def test_search_dot_unicode(self):
self.assertTrue(re.search("123.*-", '123abc-')) self.assertTrue(re.search("123.*-", '123abc-'))

View file

@ -145,6 +145,9 @@ Core and Builtins
Library Library
------- -------
- Issue #22437: The number of capturing groups in a regular expression is no
longer limited to 100.
- Issue #17442: InteractiveInterpreter now displays the full chained traceback - Issue #17442: InteractiveInterpreter now displays the full chained traceback
in its showtraceback method, to match the built in interactive interpreter. in its showtraceback method, to match the built in interactive interpreter.

View file

@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
memset(state, 0, sizeof(SRE_STATE)); memset(state, 0, sizeof(SRE_STATE));
state->mark = PyMem_New(void *, pattern->groups * 2);
if (!state->mark) {
PyErr_NoMemory();
goto err;
}
state->lastmark = -1; state->lastmark = -1;
state->lastindex = -1; state->lastindex = -1;
@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
return string; return string;
err: err:
PyMem_Del(state->mark);
state->mark = NULL;
if (state->buffer.buf) if (state->buffer.buf)
PyBuffer_Release(&state->buffer); PyBuffer_Release(&state->buffer);
return NULL; return NULL;
@ -421,6 +428,8 @@ state_fini(SRE_STATE* state)
PyBuffer_Release(&state->buffer); PyBuffer_Release(&state->buffer);
Py_XDECREF(state->string); Py_XDECREF(state->string);
data_stack_dealloc(state); data_stack_dealloc(state);
PyMem_Del(state->mark);
state->mark = NULL;
} }
/* calculate offset from start of string */ /* calculate offset from start of string */
@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
PyObject *pattern = NULL; PyObject *pattern = NULL;
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"|Onn$O:match", _keywords, "|Onn$O:match", _keywords,
@ -579,12 +589,14 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
status = sre_match(&state, PatternObject_GetCode(self), 0); status = sre_match(&state, PatternObject_GetCode(self), 0);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) if (PyErr_Occurred()) {
return NULL;
state_fini(&state); state_fini(&state);
return NULL;
}
return (PyObject *)pattern_new_match(self, &state, status); match = pattern_new_match(self, &state, status);
state_fini(&state);
return match;
} }
static PyObject* static PyObject*
@ -592,6 +604,7 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
{ {
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
PyObject *string = NULL, *string2 = NULL; PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0; Py_ssize_t start = 0;
@ -616,12 +629,14 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
status = sre_match(&state, PatternObject_GetCode(self), 1); status = sre_match(&state, PatternObject_GetCode(self), 1);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) if (PyErr_Occurred()) {
return NULL;
state_fini(&state); state_fini(&state);
return NULL;
}
return pattern_new_match(self, &state, status); match = pattern_new_match(self, &state, status);
state_fini(&state);
return match;
} }
static PyObject* static PyObject*
@ -629,6 +644,7 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
{ {
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
PyObject *string = NULL, *string2 = NULL; PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0; Py_ssize_t start = 0;
@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) {
state_fini(&state); state_fini(&state);
if (PyErr_Occurred())
return NULL; return NULL;
}
return pattern_new_match(self, &state, status); match = pattern_new_match(self, &state, status);
state_fini(&state);
return match;
} }
static PyObject* static PyObject*
@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args)
PyObject* groupindex = NULL; PyObject* groupindex = NULL;
PyObject* indexgroup = NULL; PyObject* indexgroup = NULL;
if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags, if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
&PyList_Type, &code, &groups, &PyList_Type, &code, &groups,
&groupindex, &indexgroup)) &groupindex, &indexgroup))
return NULL; return NULL;
@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
static int static int
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{ {
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
code >= end || end[-1] != SRE_OP_SUCCESS)
FAIL; FAIL;
if (groups == 0) /* fix for simplejson */
groups = 100; /* 100 groups should always be safe */
return _validate_inner(code, end-1, groups); return _validate_inner(code, end-1, groups);
} }
@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void)
Py_DECREF(x); Py_DECREF(x);
} }
x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
if (x) {
PyDict_SetItemString(d, "MAXGROUPS", x);
Py_DECREF(x);
}
x = PyUnicode_FromString(copyright); x = PyUnicode_FromString(copyright);
if (x) { if (x) {
PyDict_SetItemString(d, "copyright", x); PyDict_SetItemString(d, "copyright", x);

View file

@ -18,8 +18,10 @@
#define SRE_CODE Py_UCS4 #define SRE_CODE Py_UCS4
#if SIZEOF_SIZE_T > 4 #if SIZEOF_SIZE_T > 4
# define SRE_MAXREPEAT (~(SRE_CODE)0) # define SRE_MAXREPEAT (~(SRE_CODE)0)
# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
#else #else
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
#endif #endif
typedef struct { typedef struct {
@ -52,9 +54,6 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
/* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200
typedef struct SRE_REPEAT_T { typedef struct SRE_REPEAT_T {
Py_ssize_t count; Py_ssize_t count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */ SRE_CODE* pattern; /* points to REPEAT operator arguments */
@ -76,7 +75,7 @@ typedef struct {
/* registers */ /* registers */
Py_ssize_t lastindex; Py_ssize_t lastindex;
Py_ssize_t lastmark; Py_ssize_t lastmark;
void* mark[SRE_MARK_SIZE]; void** mark;
/* dynamically allocated stuff */ /* dynamically allocated stuff */
char* data_stack; char* data_stack;
size_t data_stack_size; size_t data_stack_size;