Issue #433028: Added support of modifier spans in regular expressions.

2025-10-09 16:34:44 +00:00 · 2016-09-10 00:57:55 +03:00 · 2016-09-10 00:57:55 +03:00 · be9a4e5c85
commit be9a4e5c85
parent ee73a65745
7 changed files with 180 additions and 66 deletions
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@ -237,6 +237,16 @@ The special characters are:
   *cannot* be retrieved after performing a match or referenced later in the
   pattern.
 ``(?imsx-imsx:...)``
   (Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
   optionally followed by ``'-'`` followed by one or more letters from the
   same set.)  The letters set or remove the corresponding flags:
   :const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
   (dot matches all), and :const:`re.X` (verbose), for the part of the
   expression.  (The flags are described in :ref:`contents-of-module-re`.)
   .. versionadded:: 3.6
 ``(?P<name>...)``
   Similar to regular parentheses, but the substring matched by the group is
   accessible via the symbolic group name *name*.  Group names must be valid
--- a/Doc/whatsnew/3.6.rst
+++ b/Doc/whatsnew/3.6.rst
@ -645,6 +645,15 @@ Protocol version 4 already supports this case.  (Contributed by Serhiy
 Storchaka in :issue:`24164`.)
 re
 --
 Added support of modifier spans in regular expressions.  Examples:
 ``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
 ``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
 (Contributed by Serhiy Storchaka in :issue:`433028`.)
 readline
 --------
--- a/Lib/re.py
+++ b/Lib/re.py
@ -352,7 +352,7 @@ class Scanner:
        for phrase, action in lexicon:
            gid = s.opengroup()
            p.append(sre_parse.SubPattern(s, [
-                (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
+                (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
                ]))
            s.closegroup(gid, p[-1])
        p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
    ASSERT_CODES = _ASSERT_CODES
    if (flags & SRE_FLAG_IGNORECASE and
            not (flags & SRE_FLAG_LOCALE) and
-            flags & SRE_FLAG_UNICODE):
+            flags & SRE_FLAG_UNICODE and
            not (flags & SRE_FLAG_ASCII)):
        fixes = _ignorecase_fixes
    else:
        fixes = None
@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
                else:
                    emit(MIN_UNTIL)
        elif op is SUBPATTERN:
-            if av[0]:
+            group, add_flags, del_flags, p = av
            if group:
                emit(MARK)
-                emit((av[0]-1)*2)
+                emit((group-1)*2)
-            # _compile_info(code, av[1], flags)
+            # _compile_info(code, p, (flags | add_flags) & ~del_flags)
-            _compile(code, av[1], flags)
+            _compile(code, p, (flags | add_flags) & ~del_flags)
-            if av[0]:
+            if group:
                emit(MARK)
-                emit((av[0]-1)*2+1)
+                emit((group-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CH_LOCALE[av])
-            elif flags & SRE_FLAG_UNICODE:
+            elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
                emit(CH_UNICODE[av])
            else:
                emit(av)
@ -414,14 +416,16 @@ def _get_literal_prefix(pattern):
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    got_all = True
    for op, av in pattern.data:
        if op is LITERAL:
            prefixappend(av)
        elif op is SUBPATTERN:
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1])
+            group, add_flags, del_flags, p = av
            if add_flags & SRE_FLAG_IGNORECASE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
            if prefix_skip is None:
-                if av[0] is not None:
+                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
@ -429,17 +433,20 @@ def _get_literal_prefix(pattern):
            if not got_all:
                break
        else:
            got_all = False
            break
-    return prefix, prefix_skip, got_all
+    else:
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False
 def _get_charset_prefix(pattern):
    charset = [] # not used
    charsetappend = charset.append
    if pattern.data:
        op, av = pattern.data[0]
-        if op is SUBPATTERN and av[1]:
+        if op is SUBPATTERN:
-            op, av = av[1][0]
+            group, add_flags, del_flags, p = av
            if p and not (add_flags & SRE_FLAG_IGNORECASE):
                op, av = p[0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@ -65,6 +65,12 @@ FLAGS = {
    "u": SRE_FLAG_UNICODE,
 }
 # Flags that can only apply to the whole pattern.  _parse_flags() rejects
 # any of them in a scoped "(?flags-flags:...)" group, both on the "turn on"
 # side and on the "turn off" side.
 GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
                SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
 class Verbose(Exception):
    # Control-flow signal, not a user-facing error: _parse_flags() raises it
    # when a global "(?x)" group turns on SRE_FLAG_VERBOSE while state.flags
    # does not have it yet; parse() catches it and parses the pattern again
    # with the verbose flag set.
    pass
 class Pattern:
    # master pattern object.  keeps track of global attributes
    def __init__(self):
@ -184,7 +190,7 @@ class SubPattern:
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
-                i, j = av[1].getwidth()
+                i, j = av[-1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in _REPEATCODES:
@ -395,7 +401,7 @@ def _escape(source, escape, state):
        pass
    raise source.error("bad escape %s" % escape, len(escape))
-def _parse_sub(source, state, nested=True):
+def _parse_sub(source, state, verbose, nested=True):
    # parse an alternation: a|b|c
    items = []
@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True):
    sourcematch = source.match
    start = source.tell()
    while True:
-        itemsappend(_parse(source, state))
+        itemsappend(_parse(source, state, verbose))
        if not sourcematch("|"):
            break
@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True):
    subpattern.append((BRANCH, (None, items)))
    return subpattern
-def _parse_sub_cond(source, state, condgroup):
+def _parse_sub_cond(source, state, condgroup, verbose):
-    item_yes = _parse(source, state)
+    item_yes = _parse(source, state, verbose)
    if source.match("|"):
-        item_no = _parse(source, state)
+        item_no = _parse(source, state, verbose)
        if source.next == "|":
            raise source.error("conditional backref with more than two branches")
    else:
@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup):
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
-def _parse(source, state):
+def _parse(source, state, verbose):
    # parse a simple pattern
    subpattern = SubPattern(state)
@ -467,7 +473,6 @@ def _parse(source, state):
    sourcematch = source.match
    _len = len
    _ord = ord
    verbose = state.flags & SRE_FLAG_VERBOSE
    while True:
@ -621,6 +626,8 @@ def _parse(source, state):
            group = True
            name = None
            condgroup = None
            add_flags = 0
            del_flags = 0
            if sourcematch("?"):
                # options
                char = sourceget()
@ -682,7 +689,7 @@ def _parse(source, state):
                        lookbehindgroups = state.lookbehindgroups
                        if lookbehindgroups is None:
                            state.lookbehindgroups = state.groups
-                    p = _parse_sub(source, state)
+                    p = _parse_sub(source, state, verbose)
                    if dir < 0:
                        if lookbehindgroups is None:
                            state.lookbehindgroups = None
@ -718,19 +725,13 @@ def _parse(source, state):
                            raise source.error("invalid group reference",
                                               len(condname) + 1)
                    state.checklookbehindgroup(condgroup, source)
-                elif char in FLAGS:
+                elif char in FLAGS or char == "-":
                    # flags
-                    while True:
+                    flags = _parse_flags(source, state, char)
-                        state.flags |= FLAGS[char]
+                    if flags is None:  # global flags
                        char = sourceget()
                        if char is None:
                            raise source.error("missing )")
                        if char == ")":
                            break
                        if char not in FLAGS:
                            raise source.error("unknown flag", len(char))
                    verbose = state.flags & SRE_FLAG_VERBOSE
                        continue
                    add_flags, del_flags = flags
                    group = None
                else:
                    raise source.error("unknown extension ?" + char,
                                       len(char) + 1)
@ -742,15 +743,17 @@ def _parse(source, state):
                except error as err:
                    raise source.error(err.msg, len(name) + 1) from None
            if condgroup:
-                p = _parse_sub_cond(source, state, condgroup)
+                p = _parse_sub_cond(source, state, condgroup, verbose)
            else:
-                p = _parse_sub(source, state)
+                sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
                               not (del_flags & SRE_FLAG_VERBOSE))
                p = _parse_sub(source, state, sub_verbose)
            if not source.match(")"):
                raise source.error("missing ), unterminated subpattern",
                                   source.tell() - start)
            if group is not None:
                state.closegroup(group, p)
-            subpatternappend((SUBPATTERN, (group, p)))
+            subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))
@ -763,6 +766,53 @@ def _parse(source, state):
    return subpattern
 def _parse_flags(source, state, char):
    """Parse the flag letters of an inline "(?flags)" or "(?flags-flags:" group.

    *char* is the character the caller has already read after "(?": either
    a key of FLAGS or "-".  Further characters are read with source.get().

    Returns None for the global form "(?flags)", after OR-ing the flags
    into state.flags.  Returns the tuple (add_flags, del_flags) for the
    scoped form, which ends at ":"; state.flags is left untouched then.

    Raises Verbose when the global form turns on SRE_FLAG_VERBOSE and
    state.flags does not have it set yet.  Raises the exception built by
    source.error() for malformed or contradictory flag groups.
    """
    sourceget = source.get
    add_flags = 0
    del_flags = 0
    if char != "-":
        # Collect the flags to turn on, up to one of the terminators ")-:".
        while True:
            add_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                # A letter is reported as a bad flag; anything else as a
                # missing terminator.
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))
    if char == ")":
        # Global form "(?flags)": the flags apply to the whole pattern.
        if ((add_flags & SRE_FLAG_VERBOSE) and
            not (state.flags & SRE_FLAG_VERBOSE)):
            # Verbose mode was switched on by the pattern itself; signal
            # this with Verbose instead of updating state.flags here.
            raise Verbose
        state.flags |= add_flags
        return None
    # From here on this is the scoped form, where global flags are rejected.
    if add_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        # At least one flag letter must follow the "-".
        char = sourceget()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        # Collect the flags to turn off.  Only ":" ends this list, so
        # "(?-i)" is reported as "missing :".
        while True:
            del_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))
    assert char == ":"
    if del_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    # The same flag may not appear on both sides of the "-".
    if add_flags & del_flags:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return add_flags, del_flags
 def fix_flags(src, flags):
    # Check and fix flags according to the type of pattern (str or bytes)
    if isinstance(src, str):
@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None):
    pattern.flags = flags
    pattern.str = str
-    p = _parse_sub(source, pattern, 0)
+    try:
        p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
    except Verbose:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        pattern = Pattern()
        pattern.flags = flags | SRE_FLAG_VERBOSE
        pattern.str = str
        p = _parse_sub(source, pattern, True, False)
    p.pattern.flags = fix_flags(str, p.pattern.flags)
    if source.next is not None:
        assert source.next == ")"
        raise source.error("unbalanced parenthesis")
    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        return parse(str, p.pattern.flags)
    if flags & SRE_FLAG_DEBUG:
        p.dump()
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@ -1376,6 +1376,38 @@ class ReTests(unittest.TestCase):
        self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
        self.assertRaises(ValueError, re.compile, b'(?aL)')
    def test_scoped_flags(self):
        # Scoped inline flags, "(?flags-flags:...)", affect only the group
        # they introduce.

        # (?i:...) makes only the enclosed part case-insensitive.
        self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
        self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
        # (?-i:...) switches IGNORECASE off inside the group when it was
        # passed to the call.
        self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
        self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
        # Scoped groups nest: the inner group overrides the outer one.
        self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
        self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))
        # (?x:...) ignores whitespace inside the group only; the space
        # before "b" stays a literal.
        self.assertTrue(re.match(r'(?x: a) b', 'a b'))
        self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
        # (?-x:...) makes whitespace literal again inside the group while
        # re.VERBOSE still applies outside it.
        self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
        self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))
        # Global-only flags (here "a") can be neither turned on nor turned
        # off in a scoped group, and one flag cannot be on both sides.
        # NOTE(review): the last argument of checkPatternError is presumably
        # the expected error position -- confirm against the helper.
        self.checkPatternError(r'(?a:\w)',
                               'bad inline flags: cannot turn on global flag', 3)
        self.checkPatternError(r'(?a)(?-a:\w)',
                               'bad inline flags: cannot turn off global flag', 8)
        self.checkPatternError(r'(?i-i:a)',
                               'bad inline flags: flag turned on and off', 5)
        # Malformed groups after "-": a flag letter is required, and the
        # list must end with ":".
        self.checkPatternError(r'(?-', 'missing flag', 3)
        self.checkPatternError(r'(?-+', 'missing flag', 3)
        self.checkPatternError(r'(?-z', 'unknown flag', 3)
        self.checkPatternError(r'(?-i', 'missing :', 4)
        self.checkPatternError(r'(?-i)', 'missing :', 4)
        self.checkPatternError(r'(?-i+', 'missing :', 4)
        self.checkPatternError(r'(?-iz', 'unknown flag', 4)
        # Malformed groups without "-".
        self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
        self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
        self.checkPatternError(r'(?iz', 'unknown flag', 3)
    def test_bug_6509(self):
        # Replacement strings of both types must parse properly.
        # all strings
@ -1538,9 +1570,9 @@ class ReTests(unittest.TestCase):
        with captured_stdout() as out:
            re.compile(pat, re.DEBUG)
        dump = '''\
-SUBPATTERN 1
+SUBPATTERN 1 0 0
  LITERAL 46
-SUBPATTERN None
+SUBPATTERN None 0 0
  BRANCH
    IN
      LITERAL 99
@ -1548,7 +1580,7 @@ SUBPATTERN None
  OR
    LITERAL 112
    LITERAL 121
-SUBPATTERN None
+SUBPATTERN None 0 0
  GROUPREF_EXISTS 1
    AT AT_END
  ELSE
@ -1664,7 +1696,7 @@ SUBPATTERN None
        self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
        self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
        self.checkPatternError(r'(?iz)', 'unknown flag', 3)
-        self.checkPatternError(r'(?i', 'missing )', 3)
+        self.checkPatternError(r'(?i', 'missing -, : or )', 3)
        self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
        self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
        self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -120,6 +120,8 @@ Core and Builtins
 Library
 -------
 - Issue #433028: Added support of modifier spans in regular expressions.
 - Issue #24594: Validates persist parameter when opening MSI database
 - Issue #28047: Fixed calculation of line length used for the base64 CTE