mirror of
https://github.com/python/cpython.git
synced 2025-08-04 08:59:19 +00:00
bpo-35859: Fix a few long-standing bugs in re engine (GH-12427)
In rare cases, capturing group could get wrong result. Regular expression engines in Perl and Java have similar bugs. The new behavior now matches the behavior of more modern RE engines: in the regex module and in PHP, Ruby and Node.js.
This commit is contained in:
parent
788154919c
commit
356997cccc
5 changed files with 153 additions and 20 deletions
|
@ -718,6 +718,11 @@ Changes in the Python API
|
|||
deprecated since Python 3.6.
|
||||
(Contributed by Serhiy Storchaka in :issue:`47066`.)
|
||||
|
||||
* :mod:`re` module: Fix a few long-standing bugs where, in rare cases,
|
||||
capturing group could get wrong result. So the result may be different than
|
||||
before.
|
||||
(Contributed by Ma Lin in :issue:`35859`.)
|
||||
|
||||
* The *population* parameter of :func:`random.sample` must be a sequence.
|
||||
Automatic conversion of sets to lists is no longer supported. If the sample size
|
||||
is larger than the population size, a :exc:`ValueError` is raised.
|
||||
|
|
|
@ -2033,6 +2033,75 @@ class ReTests(unittest.TestCase):
|
|||
{'tag': 'foo', 'text': None},
|
||||
{'tag': 'foo', 'text': None}])
|
||||
|
||||
def test_MARK_PUSH_macro_bug(self):
|
||||
# issue35859, MARK_PUSH() macro didn't protect MARK-0 if it
|
||||
# was the only available mark.
|
||||
self.assertEqual(re.match(r'(ab|a)*?b', 'ab').groups(), ('a',))
|
||||
self.assertEqual(re.match(r'(ab|a)+?b', 'ab').groups(), ('a',))
|
||||
self.assertEqual(re.match(r'(ab|a){0,2}?b', 'ab').groups(), ('a',))
|
||||
self.assertEqual(re.match(r'(.b|a)*?b', 'ab').groups(), ('a',))
|
||||
|
||||
def test_MIN_UNTIL_mark_bug(self):
|
||||
# Fixed in issue35859, reported in issue9134.
|
||||
# JUMP_MIN_UNTIL_2 should MARK_PUSH() if in a repeat
|
||||
s = 'axxzbcz'
|
||||
p = r'(?:(?:a|bc)*?(xx)??z)*'
|
||||
self.assertEqual(re.match(p, s).groups(), ('xx',))
|
||||
|
||||
# test-case provided by issue9134
|
||||
s = 'xtcxyzxc'
|
||||
p = r'((x|yz)+?(t)??c)*'
|
||||
m = re.match(p, s)
|
||||
self.assertEqual(m.span(), (0, 8))
|
||||
self.assertEqual(m.span(2), (6, 7))
|
||||
self.assertEqual(m.groups(), ('xyzxc', 'x', 't'))
|
||||
|
||||
def test_REPEAT_ONE_mark_bug(self):
|
||||
# issue35859
|
||||
# JUMP_REPEAT_ONE_1 should MARK_PUSH() if in a repeat
|
||||
s = 'aabaab'
|
||||
p = r'(?:[^b]*a(?=(b)|(a))ab)*'
|
||||
m = re.match(p, s)
|
||||
self.assertEqual(m.span(), (0, 6))
|
||||
self.assertEqual(m.span(2), (4, 5))
|
||||
self.assertEqual(m.groups(), (None, 'a'))
|
||||
|
||||
# JUMP_REPEAT_ONE_2 should MARK_PUSH() if in a repeat
|
||||
s = 'abab'
|
||||
p = r'(?:[^b]*(?=(b)|(a))ab)*'
|
||||
m = re.match(p, s)
|
||||
self.assertEqual(m.span(), (0, 4))
|
||||
self.assertEqual(m.span(2), (2, 3))
|
||||
self.assertEqual(m.groups(), (None, 'a'))
|
||||
|
||||
self.assertEqual(re.match(r'(ab?)*?b', 'ab').groups(), ('a',))
|
||||
|
||||
def test_MIN_REPEAT_ONE_mark_bug(self):
|
||||
# issue35859
|
||||
# JUMP_MIN_REPEAT_ONE should MARK_PUSH() if in a repeat
|
||||
s = 'abab'
|
||||
p = r'(?:.*?(?=(a)|(b))b)*'
|
||||
m = re.match(p, s)
|
||||
self.assertEqual(m.span(), (0, 4))
|
||||
self.assertEqual(m.span(2), (3, 4))
|
||||
self.assertEqual(m.groups(), (None, 'b'))
|
||||
|
||||
s = 'axxzaz'
|
||||
p = r'(?:a*?(xx)??z)*'
|
||||
self.assertEqual(re.match(p, s).groups(), ('xx',))
|
||||
|
||||
def test_ASSERT_NOT_mark_bug(self):
|
||||
# Fixed in issue35859, reported in issue725149.
|
||||
# JUMP_ASSERT_NOT should LASTMARK_SAVE()
|
||||
self.assertEqual(re.match(r'(?!(..)c)', 'ab').groups(), (None,))
|
||||
|
||||
# JUMP_ASSERT_NOT should MARK_PUSH() if in a repeat
|
||||
m = re.match(r'((?!(ab)c)(.))*', 'abab')
|
||||
self.assertEqual(m.span(), (0, 4))
|
||||
self.assertEqual(m.span(1), (3, 4))
|
||||
self.assertEqual(m.span(3), (3, 4))
|
||||
self.assertEqual(m.groups(), ('b', None, 'b'))
|
||||
|
||||
def test_bug_40736(self):
|
||||
with self.assertRaisesRegex(TypeError, "got 'int'"):
|
||||
re.search("x*", 5)
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
:mod:`re` module, fix a few bugs about capturing group. In rare cases,
|
||||
capturing group gets an incorrect string. Patch by Ma Lin.
|
|
@ -532,6 +532,14 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
|
|||
} else {
|
||||
i = STATE_OFFSET(state, state->mark[index]);
|
||||
j = STATE_OFFSET(state, state->mark[index+1]);
|
||||
|
||||
/* check wrong span */
|
||||
if (i > j) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"The span of capturing group is wrong,"
|
||||
" please report a bug for the re module.");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return getslice(state->isbytes, state->beginning, string, i, j);
|
||||
|
@ -2477,6 +2485,15 @@ pattern_new_match(_sremodulestate* module_state,
|
|||
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
|
||||
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
|
||||
match->mark[j+3] = ((char*) state->mark[j+1] - base) / n;
|
||||
|
||||
/* check wrong span */
|
||||
if (match->mark[j+2] > match->mark[j+3]) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"The span of capturing group is wrong,"
|
||||
" please report a bug for the re module.");
|
||||
Py_DECREF(match);
|
||||
return NULL;
|
||||
}
|
||||
} else
|
||||
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
|
||||
|
||||
|
|
|
@ -449,20 +449,20 @@ do { \
|
|||
DATA_STACK_LOOKUP_AT(state,t,p,pos)
|
||||
|
||||
#define MARK_PUSH(lastmark) \
|
||||
do if (lastmark > 0) { \
|
||||
do if (lastmark >= 0) { \
|
||||
i = lastmark; /* ctx->lastmark may change if reallocated */ \
|
||||
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
|
||||
} while (0)
|
||||
#define MARK_POP(lastmark) \
|
||||
do if (lastmark > 0) { \
|
||||
do if (lastmark >= 0) { \
|
||||
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
|
||||
} while (0)
|
||||
#define MARK_POP_KEEP(lastmark) \
|
||||
do if (lastmark > 0) { \
|
||||
do if (lastmark >= 0) { \
|
||||
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
|
||||
} while (0)
|
||||
#define MARK_POP_DISCARD(lastmark) \
|
||||
do if (lastmark > 0) { \
|
||||
do if (lastmark >= 0) { \
|
||||
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
|
||||
} while (0)
|
||||
|
||||
|
@ -770,8 +770,7 @@ entrance:
|
|||
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
|
||||
TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
|
||||
LASTMARK_SAVE();
|
||||
ctx->u.rep = state->repeat;
|
||||
if (ctx->u.rep)
|
||||
if (state->repeat)
|
||||
MARK_PUSH(ctx->lastmark);
|
||||
for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
|
||||
if (ctx->pattern[1] == SRE_OP_LITERAL &&
|
||||
|
@ -786,16 +785,16 @@ entrance:
|
|||
state->ptr = ctx->ptr;
|
||||
DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
|
||||
if (ret) {
|
||||
if (ctx->u.rep)
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_SUCCESS;
|
||||
}
|
||||
if (ctx->u.rep)
|
||||
if (state->repeat)
|
||||
MARK_POP_KEEP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
}
|
||||
if (ctx->u.rep)
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_FAILURE;
|
||||
|
||||
|
@ -841,6 +840,8 @@ entrance:
|
|||
}
|
||||
|
||||
LASTMARK_SAVE();
|
||||
if (state->repeat)
|
||||
MARK_PUSH(ctx->lastmark);
|
||||
|
||||
if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
|
||||
/* tail starts with a literal. skip positions where
|
||||
|
@ -858,16 +859,20 @@ entrance:
|
|||
DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
|
||||
ctx->pattern+ctx->pattern[0]);
|
||||
if (ret) {
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_SUCCESS;
|
||||
}
|
||||
|
||||
if (state->repeat)
|
||||
MARK_POP_KEEP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
|
||||
ctx->ptr--;
|
||||
ctx->count--;
|
||||
}
|
||||
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
} else {
|
||||
/* general case */
|
||||
while (ctx->count >= (Py_ssize_t) ctx->pattern[1]) {
|
||||
|
@ -875,13 +880,20 @@ entrance:
|
|||
DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
|
||||
ctx->pattern+ctx->pattern[0]);
|
||||
if (ret) {
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_SUCCESS;
|
||||
}
|
||||
if (state->repeat)
|
||||
MARK_POP_KEEP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
|
||||
ctx->ptr--;
|
||||
ctx->count--;
|
||||
LASTMARK_RESTORE();
|
||||
}
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
}
|
||||
RETURN_FAILURE;
|
||||
|
||||
|
@ -930,15 +942,24 @@ entrance:
|
|||
} else {
|
||||
/* general case */
|
||||
LASTMARK_SAVE();
|
||||
if (state->repeat)
|
||||
MARK_PUSH(ctx->lastmark);
|
||||
|
||||
while ((Py_ssize_t)ctx->pattern[2] == SRE_MAXREPEAT
|
||||
|| ctx->count <= (Py_ssize_t)ctx->pattern[2]) {
|
||||
state->ptr = ctx->ptr;
|
||||
DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
|
||||
ctx->pattern+ctx->pattern[0]);
|
||||
if (ret) {
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_SUCCESS;
|
||||
}
|
||||
if (state->repeat)
|
||||
MARK_POP_KEEP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
|
||||
state->ptr = ctx->ptr;
|
||||
ret = SRE(count)(state, ctx->pattern+3, 1);
|
||||
RETURN_ON_ERROR(ret);
|
||||
|
@ -948,8 +969,9 @@ entrance:
|
|||
assert(ret == 1);
|
||||
ctx->ptr++;
|
||||
ctx->count++;
|
||||
LASTMARK_RESTORE();
|
||||
}
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
}
|
||||
RETURN_FAILURE;
|
||||
|
||||
|
@ -1098,8 +1120,9 @@ entrance:
|
|||
tail matches */
|
||||
state->repeat = ctx->u.rep->prev;
|
||||
DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
|
||||
state->repeat = ctx->u.rep; // restore repeat before return
|
||||
|
||||
RETURN_ON_SUCCESS(ret);
|
||||
state->repeat = ctx->u.rep;
|
||||
state->ptr = ctx->ptr;
|
||||
RETURN_FAILURE;
|
||||
|
||||
|
@ -1132,21 +1155,29 @@ entrance:
|
|||
RETURN_FAILURE;
|
||||
}
|
||||
|
||||
LASTMARK_SAVE();
|
||||
|
||||
/* see if the tail matches */
|
||||
state->repeat = ctx->u.rep->prev;
|
||||
|
||||
LASTMARK_SAVE();
|
||||
if (state->repeat)
|
||||
MARK_PUSH(ctx->lastmark);
|
||||
|
||||
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
|
||||
SRE_REPEAT *repeat_of_tail = state->repeat;
|
||||
state->repeat = ctx->u.rep; // restore repeat before return
|
||||
|
||||
if (ret) {
|
||||
if (repeat_of_tail)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_SUCCESS;
|
||||
}
|
||||
|
||||
state->repeat = ctx->u.rep;
|
||||
state->ptr = ctx->ptr;
|
||||
|
||||
if (repeat_of_tail)
|
||||
MARK_POP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
|
||||
state->ptr = ctx->ptr;
|
||||
|
||||
if ((ctx->count >= (Py_ssize_t) ctx->u.rep->pattern[2]
|
||||
&& ctx->u.rep->pattern[2] != SRE_MAXREPEAT) ||
|
||||
state->ptr == ctx->u.rep->last_ptr)
|
||||
|
@ -1444,11 +1475,20 @@ entrance:
|
|||
ctx->ptr, ctx->pattern[1]));
|
||||
if (ctx->ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)ctx->pattern[1]) {
|
||||
state->ptr = ctx->ptr - ctx->pattern[1];
|
||||
LASTMARK_SAVE();
|
||||
if (state->repeat)
|
||||
MARK_PUSH(ctx->lastmark);
|
||||
|
||||
DO_JUMP0(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
|
||||
if (ret) {
|
||||
if (state->repeat)
|
||||
MARK_POP_DISCARD(ctx->lastmark);
|
||||
RETURN_ON_ERROR(ret);
|
||||
RETURN_FAILURE;
|
||||
}
|
||||
if (state->repeat)
|
||||
MARK_POP(ctx->lastmark);
|
||||
LASTMARK_RESTORE();
|
||||
}
|
||||
ctx->pattern += ctx->pattern[0];
|
||||
break;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue