mirror of
https://github.com/python/cpython.git
synced 2025-10-09 00:22:17 +00:00
Issue #433028: Added support of modifier spans in regular expressions.
This commit is contained in:
parent
ee73a65745
commit
be9a4e5c85
7 changed files with 180 additions and 66 deletions
114
Lib/sre_parse.py
114
Lib/sre_parse.py
|
@ -65,6 +65,12 @@ FLAGS = {
|
|||
"u": SRE_FLAG_UNICODE,
|
||||
}
|
||||
|
||||
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
|
||||
SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
|
||||
|
||||
class Verbose(Exception):
|
||||
pass
|
||||
|
||||
class Pattern:
|
||||
# master pattern object. keeps track of global attributes
|
||||
def __init__(self):
|
||||
|
@ -184,7 +190,7 @@ class SubPattern:
|
|||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op is SUBPATTERN:
|
||||
i, j = av[1].getwidth()
|
||||
i, j = av[-1].getwidth()
|
||||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op in _REPEATCODES:
|
||||
|
@ -395,7 +401,7 @@ def _escape(source, escape, state):
|
|||
pass
|
||||
raise source.error("bad escape %s" % escape, len(escape))
|
||||
|
||||
def _parse_sub(source, state, nested=True):
|
||||
def _parse_sub(source, state, verbose, nested=True):
|
||||
# parse an alternation: a|b|c
|
||||
|
||||
items = []
|
||||
|
@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True):
|
|||
sourcematch = source.match
|
||||
start = source.tell()
|
||||
while True:
|
||||
itemsappend(_parse(source, state))
|
||||
itemsappend(_parse(source, state, verbose))
|
||||
if not sourcematch("|"):
|
||||
break
|
||||
|
||||
|
@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True):
|
|||
subpattern.append((BRANCH, (None, items)))
|
||||
return subpattern
|
||||
|
||||
def _parse_sub_cond(source, state, condgroup):
|
||||
item_yes = _parse(source, state)
|
||||
def _parse_sub_cond(source, state, condgroup, verbose):
|
||||
item_yes = _parse(source, state, verbose)
|
||||
if source.match("|"):
|
||||
item_no = _parse(source, state)
|
||||
item_no = _parse(source, state, verbose)
|
||||
if source.next == "|":
|
||||
raise source.error("conditional backref with more than two branches")
|
||||
else:
|
||||
|
@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup):
|
|||
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
||||
return subpattern
|
||||
|
||||
def _parse(source, state):
|
||||
def _parse(source, state, verbose):
|
||||
# parse a simple pattern
|
||||
subpattern = SubPattern(state)
|
||||
|
||||
|
@ -467,7 +473,6 @@ def _parse(source, state):
|
|||
sourcematch = source.match
|
||||
_len = len
|
||||
_ord = ord
|
||||
verbose = state.flags & SRE_FLAG_VERBOSE
|
||||
|
||||
while True:
|
||||
|
||||
|
@ -621,6 +626,8 @@ def _parse(source, state):
|
|||
group = True
|
||||
name = None
|
||||
condgroup = None
|
||||
add_flags = 0
|
||||
del_flags = 0
|
||||
if sourcematch("?"):
|
||||
# options
|
||||
char = sourceget()
|
||||
|
@ -682,7 +689,7 @@ def _parse(source, state):
|
|||
lookbehindgroups = state.lookbehindgroups
|
||||
if lookbehindgroups is None:
|
||||
state.lookbehindgroups = state.groups
|
||||
p = _parse_sub(source, state)
|
||||
p = _parse_sub(source, state, verbose)
|
||||
if dir < 0:
|
||||
if lookbehindgroups is None:
|
||||
state.lookbehindgroups = None
|
||||
|
@ -718,19 +725,13 @@ def _parse(source, state):
|
|||
raise source.error("invalid group reference",
|
||||
len(condname) + 1)
|
||||
state.checklookbehindgroup(condgroup, source)
|
||||
elif char in FLAGS:
|
||||
elif char in FLAGS or char == "-":
|
||||
# flags
|
||||
while True:
|
||||
state.flags |= FLAGS[char]
|
||||
char = sourceget()
|
||||
if char is None:
|
||||
raise source.error("missing )")
|
||||
if char == ")":
|
||||
break
|
||||
if char not in FLAGS:
|
||||
raise source.error("unknown flag", len(char))
|
||||
verbose = state.flags & SRE_FLAG_VERBOSE
|
||||
continue
|
||||
flags = _parse_flags(source, state, char)
|
||||
if flags is None: # global flags
|
||||
continue
|
||||
add_flags, del_flags = flags
|
||||
group = None
|
||||
else:
|
||||
raise source.error("unknown extension ?" + char,
|
||||
len(char) + 1)
|
||||
|
@ -742,15 +743,17 @@ def _parse(source, state):
|
|||
except error as err:
|
||||
raise source.error(err.msg, len(name) + 1) from None
|
||||
if condgroup:
|
||||
p = _parse_sub_cond(source, state, condgroup)
|
||||
p = _parse_sub_cond(source, state, condgroup, verbose)
|
||||
else:
|
||||
p = _parse_sub(source, state)
|
||||
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
|
||||
not (del_flags & SRE_FLAG_VERBOSE))
|
||||
p = _parse_sub(source, state, sub_verbose)
|
||||
if not source.match(")"):
|
||||
raise source.error("missing ), unterminated subpattern",
|
||||
source.tell() - start)
|
||||
if group is not None:
|
||||
state.closegroup(group, p)
|
||||
subpatternappend((SUBPATTERN, (group, p)))
|
||||
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
|
||||
|
||||
elif this == "^":
|
||||
subpatternappend((AT, AT_BEGINNING))
|
||||
|
@ -763,6 +766,53 @@ def _parse(source, state):
|
|||
|
||||
return subpattern
|
||||
|
||||
def _parse_flags(source, state, char):
|
||||
sourceget = source.get
|
||||
add_flags = 0
|
||||
del_flags = 0
|
||||
if char != "-":
|
||||
while True:
|
||||
add_flags |= FLAGS[char]
|
||||
char = sourceget()
|
||||
if char is None:
|
||||
raise source.error("missing -, : or )")
|
||||
if char in ")-:":
|
||||
break
|
||||
if char not in FLAGS:
|
||||
msg = "unknown flag" if char.isalpha() else "missing -, : or )"
|
||||
raise source.error(msg, len(char))
|
||||
if char == ")":
|
||||
if ((add_flags & SRE_FLAG_VERBOSE) and
|
||||
not (state.flags & SRE_FLAG_VERBOSE)):
|
||||
raise Verbose
|
||||
state.flags |= add_flags
|
||||
return None
|
||||
if add_flags & GLOBAL_FLAGS:
|
||||
raise source.error("bad inline flags: cannot turn on global flag", 1)
|
||||
if char == "-":
|
||||
char = sourceget()
|
||||
if char is None:
|
||||
raise source.error("missing flag")
|
||||
if char not in FLAGS:
|
||||
msg = "unknown flag" if char.isalpha() else "missing flag"
|
||||
raise source.error(msg, len(char))
|
||||
while True:
|
||||
del_flags |= FLAGS[char]
|
||||
char = sourceget()
|
||||
if char is None:
|
||||
raise source.error("missing :")
|
||||
if char == ":":
|
||||
break
|
||||
if char not in FLAGS:
|
||||
msg = "unknown flag" if char.isalpha() else "missing :"
|
||||
raise source.error(msg, len(char))
|
||||
assert char == ":"
|
||||
if del_flags & GLOBAL_FLAGS:
|
||||
raise source.error("bad inline flags: cannot turn off global flag", 1)
|
||||
if add_flags & del_flags:
|
||||
raise source.error("bad inline flags: flag turned on and off", 1)
|
||||
return add_flags, del_flags
|
||||
|
||||
def fix_flags(src, flags):
|
||||
# Check and fix flags according to the type of pattern (str or bytes)
|
||||
if isinstance(src, str):
|
||||
|
@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None):
|
|||
pattern.flags = flags
|
||||
pattern.str = str
|
||||
|
||||
p = _parse_sub(source, pattern, 0)
|
||||
try:
|
||||
p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
|
||||
except Verbose:
|
||||
# the VERBOSE flag was switched on inside the pattern. to be
|
||||
# on the safe side, we'll parse the whole thing again...
|
||||
pattern = Pattern()
|
||||
pattern.flags = flags | SRE_FLAG_VERBOSE
|
||||
pattern.str = str
|
||||
p = _parse_sub(source, pattern, True, False)
|
||||
|
||||
p.pattern.flags = fix_flags(str, p.pattern.flags)
|
||||
|
||||
if source.next is not None:
|
||||
assert source.next == ")"
|
||||
raise source.error("unbalanced parenthesis")
|
||||
|
||||
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
|
||||
# the VERBOSE flag was switched on inside the pattern. to be
|
||||
# on the safe side, we'll parse the whole thing again...
|
||||
return parse(str, p.pattern.flags)
|
||||
|
||||
if flags & SRE_FLAG_DEBUG:
|
||||
p.dump()
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue