bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.
2025-07-12 05:45:15 +00:00 · 2017-05-09 23:37:14 +03:00 · 2017-05-09 23:37:14 +03:00 · 6d336a0279
commit 6d336a0279
parent f93234bb8a
6 changed files with 215 additions and 70 deletions
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@ -69,13 +69,16 @@ def _compile(code, pattern, flags):
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
+    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
+            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
@ -85,6 +88,9 @@ def _compile(code, pattern, flags):
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOC_IGNORE[op])
                emit(av)
+            elif not iscased(av):
+                emit(op)
+                emit(av)
            else:
                lo = tolower(av)
                if fixes and lo in fixes:
@ -101,14 +107,15 @@ def _compile(code, pattern, flags):
                    emit(OP_IGNORE[op])
                    emit(lo)
        elif op is IN:
-            if not flags & SRE_FLAG_IGNORECASE:
-                emit(op)
-            elif flags & SRE_FLAG_LOCALE:
+            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
+            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
-            else:
+            elif hascased:
                emit(IN_IGNORE)
+            else:
+                emit(IN)
            skip = _len(code); emit(0)
-            _compile_charset(av, flags, code, tolower, fixes)
+            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
@ -223,10 +230,10 @@ def _compile(code, pattern, flags):
        else:
            raise error("internal: unsupported operand type %r" % (op,))

-def _compile_charset(charset, flags, code, fixup=None, fixes=None):
+def _compile_charset(charset, flags, code):
    # compile charset subprogram
    emit = code.append
-    for op, av in _optimize_charset(charset, fixup, fixes):
+    for op, av in charset:
        emit(op)
        if op is NEGATE:
            pass
@ -250,11 +257,12 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
            raise error("internal: unsupported set operator %r" % (op,))
    emit(FAILURE)

-def _optimize_charset(charset, fixup, fixes):
+def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set
    out = []
    tail = []
    charmap = bytearray(256)
+    hascased = False
    for op, av in charset:
        while True:
            try:
@ -265,18 +273,24 @@ def _optimize_charset(charset, fixup, fixes):
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
+                        if not hascased and iscased(av):
+                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
-                        r = map(fixup, r)
-                    if fixup and fixes:
-                        for i in r:
-                            charmap[i] = 1
-                            if i in fixes:
-                                for k in fixes[i]:
-                                    charmap[k] = 1
+                        if fixes:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                                if i in fixes:
+                                    for k in fixes[i]:
+                                        charmap[k] = 1
+                        else:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                        if not hascased:
+                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
@ -290,11 +304,13 @@ def _optimize_charset(charset, fixup, fixes):
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
-                # There are only two ranges of cased non-BMP characters:
-                # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
-                # and for both ranges RANGE_IGNORE works.
-                if fixup and op is RANGE:
-                    op = RANGE_IGNORE
+                if fixup:
+                    hascased = True
+                    # There are only two ranges of cased non-BMP characters:
+                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+                    # and for both ranges RANGE_IGNORE works.
+                    if op is RANGE:
+                        op = RANGE_IGNORE
                tail.append((op, av))
            break

@ -322,17 +338,17 @@ def _optimize_charset(charset, fixup, fixes):
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
-        if fixup or len(out) < len(charset):
-            return out
+        if hascased or len(out) < len(charset):
+            return out, hascased
        # else original character set is good enough
-        return charset
+        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
-        return out
+        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
@ -371,7 +387,7 @@ def _optimize_charset(charset, fixup, fixes):
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
-    return out
+    return out, hascased

 _CODEBITS = _sre.CODESIZE * 8
 MAXCODE = (1 << _CODEBITS) - 1
@ -414,19 +430,31 @@ def _generate_overlap_table(prefix):
            table[i] = idx + 1
    return table

-def _get_literal_prefix(pattern):
+def _get_iscased(flags):
+    if not flags & SRE_FLAG_IGNORECASE:
+        return None
+    elif flags & SRE_FLAG_UNICODE and not flags & SRE_FLAG_ASCII:
+        return _sre.unicode_iscased
+    else:
+        return _sre.ascii_iscased
+
+def _get_literal_prefix(pattern, flags):
    # look for literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
+    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
+            if iscased and iscased(av):
+                break
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
-            if add_flags & SRE_FLAG_IGNORECASE:
+            flags1 = (flags | add_flags) & ~del_flags
+            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                break
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
@ -441,46 +469,49 @@ def _get_literal_prefix(pattern):
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False

-def _get_charset_prefix(pattern):
-    charset = [] # not used
-    charsetappend = charset.append
-    if pattern.data:
+def _get_charset_prefix(pattern, flags):
+    while True:
+        if not pattern.data:
+            return None
        op, av = pattern.data[0]
-        if op is SUBPATTERN:
-            group, add_flags, del_flags, p = av
-            if p and not (add_flags & SRE_FLAG_IGNORECASE):
-                op, av = p[0]
-                if op is LITERAL:
-                    charsetappend((op, av))
-                elif op is BRANCH:
-                    c = []
-                    cappend = c.append
-                    for p in av[1]:
-                        if not p:
-                            break
-                        op, av = p[0]
-                        if op is LITERAL:
-                            cappend((op, av))
-                        else:
-                            break
-                    else:
-                        charset = c
-        elif op is BRANCH:
-            c = []
-            cappend = c.append
-            for p in av[1]:
-                if not p:
-                    break
-                op, av = p[0]
-                if op is LITERAL:
-                    cappend((op, av))
-                else:
-                    break
+        if op is not SUBPATTERN:
+            break
+        group, add_flags, del_flags, pattern = av
+        flags = (flags | add_flags) & ~del_flags
+        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+            return None
+
+    iscased = _get_iscased(flags)
+    if op is LITERAL:
+        if iscased and iscased(av):
+            return None
+        return [(op, av)]
+    elif op is BRANCH:
+        charset = []
+        charsetappend = charset.append
+        for p in av[1]:
+            if not p:
+                return None
+            op, av = p[0]
+            if op is LITERAL and not (iscased and iscased(av)):
+                charsetappend((op, av))
            else:
-                charset = c
-        elif op is IN:
-            charset = av
-    return charset
+                return None
+        return charset
+    elif op is IN:
+        charset = av
+        if iscased:
+            for op, av in charset:
+                if op is LITERAL:
+                    if iscased(av):
+                        return None
+                elif op is RANGE:
+                    if av[1] > 0xffff:
+                        return None
+                    if any(map(iscased, range(av[0], av[1]+1))):
+                        return None
+        return charset
+    return None

 def _compile_info(code, pattern, flags):
    # internal: compile an info block.  in the current version,
@ -496,12 +527,12 @@ def _compile_info(code, pattern, flags):
    prefix = []
    prefix_skip = 0
    charset = [] # not used
-    if not (flags & SRE_FLAG_IGNORECASE):
+    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
-        prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
-            charset = _get_charset_prefix(pattern)
+            charset = _get_charset_prefix(pattern, flags)
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@ -536,6 +567,8 @@ def _compile_info(code, pattern, flags):
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
+        charset, hascased = _optimize_charset(charset)
+        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip