Mirror of https://github.com/python/cpython.git, synced 2025-07-12 05:45:15 +00:00
bpo-30285: Optimize case-insensitive matching and searching of regular expressions. (#1482)
parent f93234bb8a
commit 6d336a0279
6 changed files with 215 additions and 70 deletions
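The change teaches the sre compiler to avoid the *_IGNORE opcodes (e.g. IN_IGNORE) whenever a literal or character set contains no cased characters, emitting the plain opcodes instead. A minimal sketch of that idea, assuming a hypothetical is_cased() helper that only approximates the _sre.unicode_iscased() primitive referenced in the diff below:

# Illustrative sketch, not the CPython implementation.
def is_cased(ch: str) -> bool:
    # Treat a character as "cased" if lowercasing or uppercasing changes it.
    return ch.lower() != ch or ch.upper() != ch

def needs_ignorecase(literal: str) -> bool:
    # Mirrors the spirit of the new hascased bookkeeping: only patterns that
    # contain cased characters need case-insensitive opcodes under re.IGNORECASE.
    return any(is_cased(ch) for ch in literal)

print(needs_ignorecase("2017-05-13"))  # False: digits and '-' are uncased
print(needs_ignorecase("bpo"))         # True: ASCII letters are cased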
Lib/sre_compile.py

@@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
     REPEATING_CODES = _REPEATING_CODES
     SUCCESS_CODES = _SUCCESS_CODES
     ASSERT_CODES = _ASSERT_CODES
+    iscased = None
     tolower = None
     fixes = None
     if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
         if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+            iscased = _sre.unicode_iscased
             tolower = _sre.unicode_tolower
             fixes = _ignorecase_fixes
         else:
+            iscased = _sre.ascii_iscased
             tolower = _sre.ascii_tolower
     for op, av in pattern:
         if op in LITERAL_CODES:
@@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
             elif flags & SRE_FLAG_LOCALE:
                 emit(OP_LOC_IGNORE[op])
                 emit(av)
+            elif not iscased(av):
+                emit(op)
+                emit(av)
             else:
                 lo = tolower(av)
                 if fixes and lo in fixes:
@@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
                     emit(OP_IGNORE[op])
                     emit(lo)
         elif op is IN:
-            if not flags & SRE_FLAG_IGNORECASE:
-                emit(op)
-            elif flags & SRE_FLAG_LOCALE:
+            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
+            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                 emit(IN_LOC_IGNORE)
-            else:
+            elif hascased:
                 emit(IN_IGNORE)
+            else:
+                emit(IN)
             skip = _len(code); emit(0)
-            _compile_charset(av, flags, code, tolower, fixes)
+            _compile_charset(charset, flags, code)
             code[skip] = _len(code) - skip
         elif op is ANY:
             if flags & SRE_FLAG_DOTALL:
@@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
         else:
             raise error("internal: unsupported operand type %r" % (op,))

-def _compile_charset(charset, flags, code, fixup=None, fixes=None):
+def _compile_charset(charset, flags, code):
     # compile charset subprogram
     emit = code.append
-    for op, av in _optimize_charset(charset, fixup, fixes):
+    for op, av in charset:
         emit(op)
         if op is NEGATE:
             pass
@@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
             raise error("internal: unsupported set operator %r" % (op,))
     emit(FAILURE)

-def _optimize_charset(charset, fixup, fixes):
+def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
     # internal: optimize character set
     out = []
     tail = []
     charmap = bytearray(256)
+    hascased = False
     for op, av in charset:
         while True:
             try:
@@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
                         if fixes and lo in fixes:
                             for k in fixes[lo]:
                                 charmap[k] = 1
+                        if not hascased and iscased(av):
+                            hascased = True
                     else:
                         charmap[av] = 1
                 elif op is RANGE:
                     r = range(av[0], av[1]+1)
                     if fixup:
-                        r = map(fixup, r)
-                    if fixup and fixes:
-                        for i in r:
-                            charmap[i] = 1
-                            if i in fixes:
-                                for k in fixes[i]:
-                                    charmap[k] = 1
+                        if fixes:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                                if i in fixes:
+                                    for k in fixes[i]:
+                                        charmap[k] = 1
+                        else:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                        if not hascased:
+                            hascased = any(map(iscased, r))
                     else:
                         for i in r:
                             charmap[i] = 1
@@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
                     charmap += b'\0' * 0xff00
                     continue
                 # Character set contains non-BMP character codes.
-                # There are only two ranges of cased non-BMP characters:
-                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
-                # and for both ranges RANGE_IGNORE works.
-                if fixup and op is RANGE:
-                    op = RANGE_IGNORE
+                if fixup:
+                    hascased = True
+                    # There are only two ranges of cased non-BMP characters:
+                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+                    # and for both ranges RANGE_IGNORE works.
+                    if op is RANGE:
+                        op = RANGE_IGNORE
                 tail.append((op, av))
             break

@@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
                 out.append((RANGE, (p, q - 1)))
         out += tail
         # if the case was changed or new representation is more compact
-        if fixup or len(out) < len(charset):
-            return out
+        if hascased or len(out) < len(charset):
+            return out, hascased
         # else original character set is good enough
-        return charset
+        return charset, hascased

     # use bitmap
     if len(charmap) == 256:
         data = _mk_bitmap(charmap)
         out.append((CHARSET, data))
         out += tail
-        return out
+        return out, hascased

     # To represent a big charset, first a bitmap of all characters in the
     # set is constructed. Then, this bitmap is sliced into chunks of 256
@@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
     data[0:0] = [block] + _bytes_to_codes(mapping)
     out.append((BIGCHARSET, data))
     out += tail
-    return out
+    return out, hascased

 _CODEBITS = _sre.CODESIZE * 8
 MAXCODE = (1 << _CODEBITS) - 1
@@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
             table[i] = idx + 1
     return table

-def _get_literal_prefix(pattern):
+def _get_iscased(flags):
+    if not flags & SRE_FLAG_IGNORECASE:
+        return None
+    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+        return _sre.unicode_iscased
+    else:
+        return _sre.ascii_iscased
+
+def _get_literal_prefix(pattern, flags):
     # look for literal prefix
     prefix = []
     prefixappend = prefix.append
     prefix_skip = None
+    iscased = _get_iscased(flags)
     for op, av in pattern.data:
         if op is LITERAL:
+            if iscased and iscased(av):
+                break
             prefixappend(av)
         elif op is SUBPATTERN:
             group, add_flags, del_flags, p = av
-            if add_flags & SRE_FLAG_IGNORECASE:
+            flags1 = (flags | add_flags) & ~del_flags
+            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                 break
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
             if prefix_skip is None:
                 if group is not None:
                     prefix_skip = len(prefix)
@@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
         return prefix, prefix_skip, True
     return prefix, prefix_skip, False

-def _get_charset_prefix(pattern):
-    charset = [] # not used
-    charsetappend = charset.append
-    if pattern.data:
+def _get_charset_prefix(pattern, flags):
+    while True:
+        if not pattern.data:
+            return None
         op, av = pattern.data[0]
-        if op is SUBPATTERN:
-            group, add_flags, del_flags, p = av
-            if p and not (add_flags & SRE_FLAG_IGNORECASE):
-                op, av = p[0]
-                if op is LITERAL:
-                    charsetappend((op, av))
-                elif op is BRANCH:
-                    c = []
-                    cappend = c.append
-                    for p in av[1]:
-                        if not p:
-                            break
-                        op, av = p[0]
-                        if op is LITERAL:
-                            cappend((op, av))
-                        else:
-                            break
-                    else:
-                        charset = c
-        elif op is BRANCH:
-            c = []
-            cappend = c.append
-            for p in av[1]:
-                if not p:
-                    break
-                op, av = p[0]
-                if op is LITERAL:
-                    cappend((op, av))
-                else:
-                    break
+        if op is not SUBPATTERN:
+            break
+        group, add_flags, del_flags, pattern = av
+        flags = (flags | add_flags) & ~del_flags
+        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+            return None
+
+    iscased = _get_iscased(flags)
+    if op is LITERAL:
+        if iscased and iscased(av):
+            return None
+        return [(op, av)]
+    elif op is BRANCH:
+        charset = []
+        charsetappend = charset.append
+        for p in av[1]:
+            if not p:
+                return None
+            op, av = p[0]
+            if op is LITERAL and not (iscased and iscased(av)):
+                charsetappend((op, av))
             else:
-                charset = c
-        elif op is IN:
-            charset = av
-    return charset
+                return None
+        return charset
+    elif op is IN:
+        charset = av
+        if iscased:
+            for op, av in charset:
+                if op is LITERAL:
+                    if iscased(av):
+                        return None
+                elif op is RANGE:
+                    if av[1] > 0xffff:
+                        return None
+                    if any(map(iscased, range(av[0], av[1]+1))):
+                        return None
+        return charset
+    return None

 def _compile_info(code, pattern, flags):
     # internal: compile an info block. in the current version,
@@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
     prefix = []
     prefix_skip = 0
     charset = [] # not used
-    if not (flags & SRE_FLAG_IGNORECASE):
+    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
         # look for literal prefix
-        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
         # if no prefix, look for charset prefix
         if not prefix:
-            charset = _get_charset_prefix(pattern)
+            charset = _get_charset_prefix(pattern, flags)
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
         # generate overlap table
         code.extend(_generate_overlap_table(prefix))
     elif charset:
+        charset, hascased = _optimize_charset(charset)
+        assert not hascased
         _compile_charset(charset, flags, code)
     code[skip] = len(code) - skip

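For users of the re module the optimization is transparent: patterns match exactly as before, only the compiled program for uncased parts of a pattern gets simpler. A small illustrative check using only the public re API (not part of this commit):

import re

# An uncased literal can compile without case-insensitive opcodes even under
# IGNORECASE; the match result is unchanged.
assert re.search(r"(?i)2017-05-13", "released 2017-05-13")

# Cased characters still get full case-insensitive treatment.
assert re.fullmatch(r"(?i)[0-9a-f]+", "DeadBeef")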