mirror of
https://github.com/python/cpython.git
synced 2025-09-27 02:39:58 +00:00
Issue #24426: Fast searching optimization in regular expressions now works
for patterns that start with capturing groups. Fast searching optimization can no longer be disabled at compile time.
This commit is contained in:
parent
6ee588f14e
commit
66dc4648fc
4 changed files with 99 additions and 78 deletions
|
@ -409,39 +409,34 @@ def _generate_overlap_table(prefix):
|
||||||
table[i] = idx + 1
|
table[i] = idx + 1
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def _compile_info(code, pattern, flags):
|
def _get_literal_prefix(pattern):
|
||||||
# internal: compile an info block. in the current version,
|
# look for literal prefix
|
||||||
# this contains min/max pattern width, and an optional literal
|
|
||||||
# prefix or a character map
|
|
||||||
lo, hi = pattern.getwidth()
|
|
||||||
if hi > MAXCODE:
|
|
||||||
hi = MAXCODE
|
|
||||||
if lo == 0:
|
|
||||||
code.extend([INFO, 4, 0, lo, hi])
|
|
||||||
return
|
|
||||||
# look for a literal prefix
|
|
||||||
prefix = []
|
prefix = []
|
||||||
prefixappend = prefix.append
|
prefixappend = prefix.append
|
||||||
prefix_skip = 0
|
prefix_skip = None
|
||||||
charset = [] # not used
|
got_all = True
|
||||||
charsetappend = charset.append
|
|
||||||
if not (flags & SRE_FLAG_IGNORECASE):
|
|
||||||
# look for literal prefix
|
|
||||||
for op, av in pattern.data:
|
for op, av in pattern.data:
|
||||||
if op is LITERAL:
|
if op is LITERAL:
|
||||||
if len(prefix) == prefix_skip:
|
|
||||||
prefix_skip = prefix_skip + 1
|
|
||||||
prefixappend(av)
|
prefixappend(av)
|
||||||
elif op is SUBPATTERN and len(av[1]) == 1:
|
elif op is SUBPATTERN:
|
||||||
op, av = av[1][0]
|
prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
|
||||||
if op is LITERAL:
|
if prefix_skip is None:
|
||||||
prefixappend(av)
|
if av[0] is not None:
|
||||||
else:
|
prefix_skip = len(prefix)
|
||||||
|
elif prefix_skip1 is not None:
|
||||||
|
prefix_skip = len(prefix) + prefix_skip1
|
||||||
|
prefix.extend(prefix1)
|
||||||
|
if not got_all:
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
|
got_all = False
|
||||||
break
|
break
|
||||||
# if no prefix, look for charset prefix
|
return prefix, prefix_skip, got_all
|
||||||
if not prefix and pattern.data:
|
|
||||||
|
def _get_charset_prefix(pattern):
|
||||||
|
charset = [] # not used
|
||||||
|
charsetappend = charset.append
|
||||||
|
if pattern.data:
|
||||||
op, av = pattern.data[0]
|
op, av = pattern.data[0]
|
||||||
if op is SUBPATTERN and av[1]:
|
if op is SUBPATTERN and av[1]:
|
||||||
op, av = av[1][0]
|
op, av = av[1][0]
|
||||||
|
@ -475,6 +470,28 @@ def _compile_info(code, pattern, flags):
|
||||||
charset = c
|
charset = c
|
||||||
elif op is IN:
|
elif op is IN:
|
||||||
charset = av
|
charset = av
|
||||||
|
return charset
|
||||||
|
|
||||||
|
def _compile_info(code, pattern, flags):
|
||||||
|
# internal: compile an info block. in the current version,
|
||||||
|
# this contains min/max pattern width, and an optional literal
|
||||||
|
# prefix or a character map
|
||||||
|
lo, hi = pattern.getwidth()
|
||||||
|
if hi > MAXCODE:
|
||||||
|
hi = MAXCODE
|
||||||
|
if lo == 0:
|
||||||
|
code.extend([INFO, 4, 0, lo, hi])
|
||||||
|
return
|
||||||
|
# look for a literal prefix
|
||||||
|
prefix = []
|
||||||
|
prefix_skip = 0
|
||||||
|
charset = [] # not used
|
||||||
|
if not (flags & SRE_FLAG_IGNORECASE):
|
||||||
|
# look for literal prefix
|
||||||
|
prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
|
||||||
|
# if no prefix, look for charset prefix
|
||||||
|
if not prefix:
|
||||||
|
charset = _get_charset_prefix(pattern)
|
||||||
## if prefix:
|
## if prefix:
|
||||||
## print("*** PREFIX", prefix, prefix_skip)
|
## print("*** PREFIX", prefix, prefix_skip)
|
||||||
## if charset:
|
## if charset:
|
||||||
|
@ -487,7 +504,7 @@ def _compile_info(code, pattern, flags):
|
||||||
mask = 0
|
mask = 0
|
||||||
if prefix:
|
if prefix:
|
||||||
mask = SRE_INFO_PREFIX
|
mask = SRE_INFO_PREFIX
|
||||||
if len(prefix) == prefix_skip == len(pattern.data):
|
if prefix_skip is None and got_all:
|
||||||
mask = mask | SRE_INFO_LITERAL
|
mask = mask | SRE_INFO_LITERAL
|
||||||
elif charset:
|
elif charset:
|
||||||
mask = mask | SRE_INFO_CHARSET
|
mask = mask | SRE_INFO_CHARSET
|
||||||
|
@ -502,6 +519,8 @@ def _compile_info(code, pattern, flags):
|
||||||
# add literal prefix
|
# add literal prefix
|
||||||
if prefix:
|
if prefix:
|
||||||
emit(len(prefix)) # length
|
emit(len(prefix)) # length
|
||||||
|
if prefix_skip is None:
|
||||||
|
prefix_skip = len(prefix)
|
||||||
emit(prefix_skip) # skip
|
emit(prefix_skip) # skip
|
||||||
code.extend(prefix)
|
code.extend(prefix)
|
||||||
# generate overlap table
|
# generate overlap table
|
||||||
|
|
|
@ -13,6 +13,10 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #24426: Fast searching optimization in regular expressions now works
|
||||||
|
for patterns that start with capturing groups. Fast searching optimization
|
||||||
|
now can't be disabled at compile time.
|
||||||
|
|
||||||
Documentation
|
Documentation
|
||||||
-------------
|
-------------
|
||||||
|
|
||||||
|
|
|
@ -62,9 +62,6 @@ static char copyright[] =
|
||||||
/* -------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------- */
|
||||||
/* optional features */
|
/* optional features */
|
||||||
|
|
||||||
/* enables fast searching */
|
|
||||||
#define USE_FAST_SEARCH
|
|
||||||
|
|
||||||
/* enables copy/deepcopy handling (work in progress) */
|
/* enables copy/deepcopy handling (work in progress) */
|
||||||
#undef USE_BUILTIN_COPY
|
#undef USE_BUILTIN_COPY
|
||||||
|
|
||||||
|
|
|
@ -1248,7 +1248,32 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
prefix, prefix_len, prefix_skip));
|
prefix, prefix_len, prefix_skip));
|
||||||
TRACE(("charset = %p\n", charset));
|
TRACE(("charset = %p\n", charset));
|
||||||
|
|
||||||
#if defined(USE_FAST_SEARCH)
|
if (prefix_len == 1) {
|
||||||
|
/* pattern starts with a literal character */
|
||||||
|
SRE_CHAR c = (SRE_CHAR) prefix[0];
|
||||||
|
#if SIZEOF_SRE_CHAR < 4
|
||||||
|
if ((SRE_CODE) c != prefix[0])
|
||||||
|
return 0; /* literal can't match: doesn't fit in char width */
|
||||||
|
#endif
|
||||||
|
end = (SRE_CHAR *)state->end;
|
||||||
|
while (ptr < end) {
|
||||||
|
while (*ptr != c) {
|
||||||
|
if (++ptr >= end)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
||||||
|
state->start = ptr;
|
||||||
|
state->ptr = ptr + prefix_skip;
|
||||||
|
if (flags & SRE_INFO_LITERAL)
|
||||||
|
return 1; /* we got all of it */
|
||||||
|
status = SRE(match)(state, pattern + 2*prefix_skip, 0);
|
||||||
|
if (status != 0)
|
||||||
|
return status;
|
||||||
|
++ptr;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
if (prefix_len > 1) {
|
if (prefix_len > 1) {
|
||||||
/* pattern starts with a known prefix. use the overlap
|
/* pattern starts with a known prefix. use the overlap
|
||||||
table to skip forward as fast as we possibly can */
|
table to skip forward as fast as we possibly can */
|
||||||
|
@ -1297,32 +1322,8 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
if (pattern[0] == SRE_OP_LITERAL) {
|
if (charset) {
|
||||||
/* pattern starts with a literal character. this is used
|
|
||||||
for short prefixes, and if fast search is disabled */
|
|
||||||
SRE_CHAR c = (SRE_CHAR) pattern[1];
|
|
||||||
#if SIZEOF_SRE_CHAR < 4
|
|
||||||
if ((SRE_CODE) c != pattern[1])
|
|
||||||
return 0; /* literal can't match: doesn't fit in char width */
|
|
||||||
#endif
|
|
||||||
end = (SRE_CHAR *)state->end;
|
|
||||||
while (ptr < end) {
|
|
||||||
while (*ptr != c) {
|
|
||||||
if (++ptr >= end)
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
TRACE(("|%p|%p|SEARCH LITERAL\n", pattern, ptr));
|
|
||||||
state->start = ptr;
|
|
||||||
state->ptr = ++ptr;
|
|
||||||
if (flags & SRE_INFO_LITERAL)
|
|
||||||
return 1; /* we got all of it */
|
|
||||||
status = SRE(match)(state, pattern + 2, 0);
|
|
||||||
if (status != 0)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else if (charset) {
|
|
||||||
/* pattern starts with a character from a known set */
|
/* pattern starts with a character from a known set */
|
||||||
end = (SRE_CHAR *)state->end;
|
end = (SRE_CHAR *)state->end;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue