mirror of
https://github.com/python/cpython.git
synced 2025-08-10 03:49:18 +00:00
[3.12] gh-109747: Improve errors for unsupported look-behind patterns (GH-109859) (GH-110859)
re.error is now raised instead of OverflowError or RuntimeError when the
width of a look-behind pattern is too large.
The maximum look-behind width is increased to 2**32-1 (was 2**31-1).
(cherry picked from commit e2b3d831fd)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
parent
12b9cb80be
commit
a2cc9a4c3a
6 changed files with 46 additions and 13 deletions
|
@ -149,6 +149,8 @@ def _compile(code, pattern, flags):
|
||||||
emit(0) # look ahead
|
emit(0) # look ahead
|
||||||
else:
|
else:
|
||||||
lo, hi = av[1].getwidth()
|
lo, hi = av[1].getwidth()
|
||||||
|
if lo > MAXCODE:
|
||||||
|
raise error("looks too much behind")
|
||||||
if lo != hi:
|
if lo != hi:
|
||||||
raise error("look-behind requires fixed-width pattern")
|
raise error("look-behind requires fixed-width pattern")
|
||||||
emit(lo) # look behind
|
emit(lo) # look behind
|
||||||
|
@ -549,7 +551,7 @@ def _compile_info(code, pattern, flags):
|
||||||
else:
|
else:
|
||||||
emit(MAXCODE)
|
emit(MAXCODE)
|
||||||
prefix = prefix[:MAXCODE]
|
prefix = prefix[:MAXCODE]
|
||||||
emit(min(hi, MAXCODE))
|
emit(hi)
|
||||||
# add literal prefix
|
# add literal prefix
|
||||||
if prefix:
|
if prefix:
|
||||||
emit(len(prefix)) # length
|
emit(len(prefix)) # length
|
||||||
|
|
|
@ -68,6 +68,10 @@ FLAGS = {
|
||||||
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
|
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
|
||||||
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
|
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
|
||||||
|
|
||||||
|
# Maximal value returned by SubPattern.getwidth().
|
||||||
|
# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
|
||||||
|
MAXWIDTH = 1 << 64
|
||||||
|
|
||||||
class State:
|
class State:
|
||||||
# keeps track of state for parsing
|
# keeps track of state for parsing
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -178,7 +182,7 @@ class SubPattern:
|
||||||
lo = hi = 0
|
lo = hi = 0
|
||||||
for op, av in self.data:
|
for op, av in self.data:
|
||||||
if op is BRANCH:
|
if op is BRANCH:
|
||||||
i = MAXREPEAT - 1
|
i = MAXWIDTH
|
||||||
j = 0
|
j = 0
|
||||||
for av in av[1]:
|
for av in av[1]:
|
||||||
l, h = av.getwidth()
|
l, h = av.getwidth()
|
||||||
|
@ -197,7 +201,10 @@ class SubPattern:
|
||||||
elif op in _REPEATCODES:
|
elif op in _REPEATCODES:
|
||||||
i, j = av[2].getwidth()
|
i, j = av[2].getwidth()
|
||||||
lo = lo + i * av[0]
|
lo = lo + i * av[0]
|
||||||
hi = hi + j * av[1]
|
if av[1] == MAXREPEAT and j:
|
||||||
|
hi = MAXWIDTH
|
||||||
|
else:
|
||||||
|
hi = hi + j * av[1]
|
||||||
elif op in _UNITCODES:
|
elif op in _UNITCODES:
|
||||||
lo = lo + 1
|
lo = lo + 1
|
||||||
hi = hi + 1
|
hi = hi + 1
|
||||||
|
@ -217,7 +224,7 @@ class SubPattern:
|
||||||
hi = hi + j
|
hi = hi + j
|
||||||
elif op is SUCCESS:
|
elif op is SUCCESS:
|
||||||
break
|
break
|
||||||
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
|
self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
|
||||||
return self.width
|
return self.width
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
|
|
|
@ -1799,6 +1799,29 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
|
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
|
||||||
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
|
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
|
||||||
|
|
||||||
|
def test_look_behind_overflow(self):
|
||||||
|
string = "x" * 2_500_000
|
||||||
|
p1 = r"(?<=((.{%d}){%d}){%d})"
|
||||||
|
p2 = r"(?<!((.{%d}){%d}){%d})"
|
||||||
|
# Test that the templates are valid and look-behind with width 2**21
|
||||||
|
# (larger than sys.maxunicode) are supported.
|
||||||
|
self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
|
||||||
|
(2**21, 2**21))
|
||||||
|
self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
|
||||||
|
(0, 0))
|
||||||
|
# Test that 2**22 is accepted as a repetition number and look-behind
|
||||||
|
# width.
|
||||||
|
re.compile(p1 % (2**22, 1, 1))
|
||||||
|
re.compile(p1 % (1, 2**22, 1))
|
||||||
|
re.compile(p1 % (1, 1, 2**22))
|
||||||
|
re.compile(p2 % (2**22, 1, 1))
|
||||||
|
re.compile(p2 % (1, 2**22, 1))
|
||||||
|
re.compile(p2 % (1, 1, 2**22))
|
||||||
|
# But 2**66 is too large for look-behind width.
|
||||||
|
errmsg = "looks too much behind"
|
||||||
|
self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
|
||||||
|
self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
|
||||||
|
|
||||||
def test_backref_group_name_in_exception(self):
|
def test_backref_group_name_in_exception(self):
|
||||||
# Issue 17341: Poor error message when compiling invalid regex
|
# Issue 17341: Poor error message when compiling invalid regex
|
||||||
self.checkPatternError('(?P=<foo>)',
|
self.checkPatternError('(?P=<foo>)',
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
Improve errors for unsupported look-behind patterns. Now re.error is raised
|
||||||
|
instead of OverflowError or RuntimeError for too large width of look-behind
|
||||||
|
pattern.
|
|
@ -2024,8 +2024,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
|
||||||
GET_SKIP;
|
GET_SKIP;
|
||||||
GET_ARG; /* 0 for lookahead, width for lookbehind */
|
GET_ARG; /* 0 for lookahead, width for lookbehind */
|
||||||
code--; /* Back up over arg to simplify math below */
|
code--; /* Back up over arg to simplify math below */
|
||||||
if (arg & 0x80000000)
|
|
||||||
FAIL; /* Width too large */
|
|
||||||
/* Stop 1 before the end; we check the SUCCESS below */
|
/* Stop 1 before the end; we check the SUCCESS below */
|
||||||
if (_validate_inner(code+1, code+skip-2, groups))
|
if (_validate_inner(code+1, code+skip-2, groups))
|
||||||
FAIL;
|
FAIL;
|
||||||
|
|
|
@ -589,8 +589,8 @@ entrance:
|
||||||
/* optimization info block */
|
/* optimization info block */
|
||||||
/* <INFO> <1=skip> <2=flags> <3=min> ... */
|
/* <INFO> <1=skip> <2=flags> <3=min> ... */
|
||||||
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
|
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
|
||||||
TRACE(("reject (got %zd chars, need %zd)\n",
|
TRACE(("reject (got %tu chars, need %zu)\n",
|
||||||
end - ptr, (Py_ssize_t) pattern[3]));
|
end - ptr, (size_t) pattern[3]));
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
}
|
}
|
||||||
pattern += pattern[1] + 1;
|
pattern += pattern[1] + 1;
|
||||||
|
@ -1507,7 +1507,7 @@ dispatch:
|
||||||
/* <ASSERT> <skip> <back> <pattern> */
|
/* <ASSERT> <skip> <back> <pattern> */
|
||||||
TRACE(("|%p|%p|ASSERT %d\n", pattern,
|
TRACE(("|%p|%p|ASSERT %d\n", pattern,
|
||||||
ptr, pattern[1]));
|
ptr, pattern[1]));
|
||||||
if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
|
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
|
||||||
RETURN_FAILURE;
|
RETURN_FAILURE;
|
||||||
state->ptr = ptr - pattern[1];
|
state->ptr = ptr - pattern[1];
|
||||||
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
|
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
|
||||||
|
@ -1520,7 +1520,7 @@ dispatch:
|
||||||
/* <ASSERT_NOT> <skip> <back> <pattern> */
|
/* <ASSERT_NOT> <skip> <back> <pattern> */
|
||||||
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
|
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
|
||||||
ptr, pattern[1]));
|
ptr, pattern[1]));
|
||||||
if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
|
if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
|
||||||
state->ptr = ptr - pattern[1];
|
state->ptr = ptr - pattern[1];
|
||||||
LASTMARK_SAVE();
|
LASTMARK_SAVE();
|
||||||
if (state->repeat)
|
if (state->repeat)
|
||||||
|
@ -1655,9 +1655,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
|
|
||||||
flags = pattern[2];
|
flags = pattern[2];
|
||||||
|
|
||||||
if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
|
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
|
||||||
TRACE(("reject (got %u chars, need %u)\n",
|
TRACE(("reject (got %tu chars, need %zu)\n",
|
||||||
(unsigned int)(end - ptr), pattern[3]));
|
end - ptr, (size_t) pattern[3]));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
if (pattern[3] > 1) {
|
if (pattern[3] > 1) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue