bpo-47256: Increasing the depth of backtracking in RE (GH-32411)

Limit the maximum capturing group to 2**30-1 on 64-bit platforms
(it was 2**31-1). No change on 32-bit platforms (2**28-1).

It allows to reduce the size of SRE(match_context):
- On 32 bit platform: 36 bytes, no change.  (msvc2022)
- On 64 bit platform: 72 bytes -> 56 bytes. (msvc2022/gcc9.4)

which leads to increasing the depth of backtracking.
This commit is contained in:
Ma Lin 2022-04-18 21:50:40 +08:00 committed by GitHub
parent 1c2fcebf3c
commit a29f858124
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 44 deletions

View file

@ -0,0 +1,2 @@
:mod:`re` module, limit the maximum capturing group to 1,073,741,823 in
64-bit build, this increases the depth of backtracking.

View file

@ -18,10 +18,10 @@
#define SRE_CODE Py_UCS4 #define SRE_CODE Py_UCS4
#if SIZEOF_SIZE_T > 4 #if SIZEOF_SIZE_T > 4
# define SRE_MAXREPEAT (~(SRE_CODE)0) # define SRE_MAXREPEAT (~(SRE_CODE)0)
# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2) # define SRE_MAXGROUPS ((SRE_CODE)INT32_MAX / 2)
#else #else
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2) # define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_VOID_P / 2)
#endif #endif
typedef struct { typedef struct {
@ -73,12 +73,12 @@ typedef struct {
Py_ssize_t pos, endpos; Py_ssize_t pos, endpos;
int isbytes; int isbytes;
int charsize; /* character size */ int charsize; /* character size */
/* registers */
Py_ssize_t lastindex;
Py_ssize_t lastmark;
const void** mark;
int match_all; int match_all;
int must_advance; int must_advance;
/* marks */
int lastmark;
int lastindex;
const void** mark;
/* dynamically allocated stuff */ /* dynamically allocated stuff */
char* data_stack; char* data_stack;
size_t data_stack_size; size_t data_stack_size;

View file

@ -450,20 +450,23 @@ do { \
#define MARK_PUSH(lastmark) \ #define MARK_PUSH(lastmark) \
do if (lastmark >= 0) { \ do if (lastmark >= 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \ size_t _marks_size = (lastmark+1) * sizeof(void*); \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \ DATA_STACK_PUSH(state, state->mark, _marks_size); \
} while (0) } while (0)
#define MARK_POP(lastmark) \ #define MARK_POP(lastmark) \
do if (lastmark >= 0) { \ do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \ size_t _marks_size = (lastmark+1) * sizeof(void*); \
DATA_STACK_POP(state, state->mark, _marks_size, 1); \
} while (0) } while (0)
#define MARK_POP_KEEP(lastmark) \ #define MARK_POP_KEEP(lastmark) \
do if (lastmark >= 0) { \ do if (lastmark >= 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \ size_t _marks_size = (lastmark+1) * sizeof(void*); \
DATA_STACK_POP(state, state->mark, _marks_size, 0); \
} while (0) } while (0)
#define MARK_POP_DISCARD(lastmark) \ #define MARK_POP_DISCARD(lastmark) \
do if (lastmark >= 0) { \ do if (lastmark >= 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \ size_t _marks_size = (lastmark+1) * sizeof(void*); \
DATA_STACK_POP_DISCARD(state, _marks_size); \
} while (0) } while (0)
#define JUMP_NONE 0 #define JUMP_NONE 0
@ -488,10 +491,10 @@ do { \
ctx->pattern = pattern; \ ctx->pattern = pattern; \
ctx->ptr = ptr; \ ctx->ptr = ptr; \
DATA_ALLOC(SRE(match_context), nextctx); \ DATA_ALLOC(SRE(match_context), nextctx); \
nextctx->last_ctx_pos = ctx_pos; \
nextctx->jump = jumpvalue; \
nextctx->pattern = nextpattern; \ nextctx->pattern = nextpattern; \
nextctx->toplevel = toplevel_; \ nextctx->toplevel = toplevel_; \
nextctx->jump = jumpvalue; \
nextctx->last_ctx_pos = ctx_pos; \
pattern = nextpattern; \ pattern = nextpattern; \
ctx_pos = alloc_pos; \ ctx_pos = alloc_pos; \
ctx = nextctx; \ ctx = nextctx; \
@ -507,18 +510,18 @@ do { \
DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0) DO_JUMPX(jumpvalue, jumplabel, nextpattern, 0)
typedef struct { typedef struct {
Py_ssize_t last_ctx_pos;
Py_ssize_t jump;
const SRE_CHAR* ptr;
const SRE_CODE* pattern;
Py_ssize_t count; Py_ssize_t count;
Py_ssize_t lastmark;
Py_ssize_t lastindex;
union { union {
SRE_CODE chr; SRE_CODE chr;
SRE_REPEAT* rep; SRE_REPEAT* rep;
} u; } u;
int lastmark;
int lastindex;
const SRE_CODE* pattern;
const SRE_CHAR* ptr;
int toplevel; int toplevel;
int jump;
Py_ssize_t last_ctx_pos;
} SRE(match_context); } SRE(match_context);
#define MAYBE_CHECK_SIGNALS \ #define MAYBE_CHECK_SIGNALS \
@ -558,8 +561,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel)
{ {
const SRE_CHAR* end = (const SRE_CHAR *)state->end; const SRE_CHAR* end = (const SRE_CHAR *)state->end;
Py_ssize_t alloc_pos, ctx_pos = -1; Py_ssize_t alloc_pos, ctx_pos = -1;
Py_ssize_t i, ret = 0; Py_ssize_t ret = 0;
Py_ssize_t jump; int jump;
unsigned int sigcount=0; unsigned int sigcount=0;
SRE(match_context)* ctx; SRE(match_context)* ctx;
@ -607,7 +610,8 @@ dispatch:
/* <MARK> <gid> */ /* <MARK> <gid> */
TRACE(("|%p|%p|MARK %d\n", pattern, TRACE(("|%p|%p|MARK %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
i = pattern[0]; {
int i = pattern[0];
if (i & 1) if (i & 1)
state->lastindex = i/2 + 1; state->lastindex = i/2 + 1;
if (i > state->lastmark) { if (i > state->lastmark) {
@ -615,12 +619,13 @@ dispatch:
state->mark array. If it is increased by more than 1, state->mark array. If it is increased by more than 1,
the intervening marks must be set to NULL to signal the intervening marks must be set to NULL to signal
that these marks have not been encountered. */ that these marks have not been encountered. */
Py_ssize_t j = state->lastmark + 1; int j = state->lastmark + 1;
while (j < i) while (j < i)
state->mark[j++] = NULL; state->mark[j++] = NULL;
state->lastmark = i; state->lastmark = i;
} }
state->mark[i] = ptr; state->mark[i] = ptr;
}
pattern++; pattern++;
DISPATCH; DISPATCH;
@ -1373,9 +1378,8 @@ dispatch:
/* match backreference */ /* match backreference */
TRACE(("|%p|%p|GROUPREF %d\n", pattern, TRACE(("|%p|%p|GROUPREF %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
i = pattern[0];
{ {
Py_ssize_t groupref = i+i; int groupref = pattern[0] * 2;
if (groupref >= state->lastmark) { if (groupref >= state->lastmark) {
RETURN_FAILURE; RETURN_FAILURE;
} else { } else {
@ -1398,9 +1402,8 @@ dispatch:
/* match backreference */ /* match backreference */
TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
i = pattern[0];
{ {
Py_ssize_t groupref = i+i; int groupref = pattern[0] * 2;
if (groupref >= state->lastmark) { if (groupref >= state->lastmark) {
RETURN_FAILURE; RETURN_FAILURE;
} else { } else {
@ -1424,9 +1427,8 @@ dispatch:
/* match backreference */ /* match backreference */
TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern, TRACE(("|%p|%p|GROUPREF_UNI_IGNORE %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
i = pattern[0];
{ {
Py_ssize_t groupref = i+i; int groupref = pattern[0] * 2;
if (groupref >= state->lastmark) { if (groupref >= state->lastmark) {
RETURN_FAILURE; RETURN_FAILURE;
} else { } else {
@ -1450,9 +1452,8 @@ dispatch:
/* match backreference */ /* match backreference */
TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern, TRACE(("|%p|%p|GROUPREF_LOC_IGNORE %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
i = pattern[0];
{ {
Py_ssize_t groupref = i+i; int groupref = pattern[0] * 2;
if (groupref >= state->lastmark) { if (groupref >= state->lastmark) {
RETURN_FAILURE; RETURN_FAILURE;
} else { } else {
@ -1476,9 +1477,8 @@ dispatch:
TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern, TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", pattern,
ptr, pattern[0])); ptr, pattern[0]));
/* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */ /* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
i = pattern[0];
{ {
Py_ssize_t groupref = i+i; int groupref = pattern[0] * 2;
if (groupref >= state->lastmark) { if (groupref >= state->lastmark) {
pattern += pattern[1]; pattern += pattern[1];
DISPATCH; DISPATCH;