mirror of
https://github.com/python/cpython.git
synced 2025-10-09 16:34:44 +00:00
bpo-433030: Add support of atomic grouping in regular expressions (GH-31982)
* Atomic grouping: (?>...). * Possessive quantifiers: x++, x*+, x?+, x{m,n}+. Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}). Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
This commit is contained in:
parent
2bde6827ea
commit
345b390ed6
11 changed files with 593 additions and 92 deletions
|
@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
|
|||
|
||||
WHITESPACE = frozenset(" \t\n\r\v\f")
|
||||
|
||||
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
|
||||
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
|
||||
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
|
||||
|
||||
ESCAPES = {
|
||||
|
@ -190,6 +190,10 @@ class SubPattern:
|
|||
i, j = av.getwidth()
|
||||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op is ATOMIC_GROUP:
|
||||
i, j = av.getwidth()
|
||||
lo = lo + i
|
||||
hi = hi + j
|
||||
elif op is SUBPATTERN:
|
||||
i, j = av[-1].getwidth()
|
||||
lo = lo + i
|
||||
|
@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
if group is None and not add_flags and not del_flags:
|
||||
item = p
|
||||
if sourcematch("?"):
|
||||
# Non-Greedy Match
|
||||
subpattern[-1] = (MIN_REPEAT, (min, max, item))
|
||||
elif sourcematch("+"):
|
||||
# Possessive Match (Always Greedy)
|
||||
subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
|
||||
else:
|
||||
# Greedy Match
|
||||
subpattern[-1] = (MAX_REPEAT, (min, max, item))
|
||||
|
||||
elif this == ".":
|
||||
|
@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
|
||||
elif this == "(":
|
||||
start = source.tell() - 1
|
||||
group = True
|
||||
capture = True
|
||||
atomic = False
|
||||
name = None
|
||||
add_flags = 0
|
||||
del_flags = 0
|
||||
|
@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
len(char) + 2)
|
||||
elif char == ":":
|
||||
# non-capturing group
|
||||
group = None
|
||||
capture = False
|
||||
elif char == "#":
|
||||
# comment
|
||||
while True:
|
||||
|
@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
||||
continue
|
||||
|
||||
elif char == ">":
|
||||
# non-capturing, atomic group
|
||||
capture = False
|
||||
atomic = True
|
||||
elif char in FLAGS or char == "-":
|
||||
# flags
|
||||
flags = _parse_flags(source, state, char)
|
||||
|
@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
continue
|
||||
|
||||
add_flags, del_flags = flags
|
||||
group = None
|
||||
capture = False
|
||||
else:
|
||||
raise source.error("unknown extension ?" + char,
|
||||
len(char) + 1)
|
||||
|
||||
# parse group contents
|
||||
if group is not None:
|
||||
if capture:
|
||||
try:
|
||||
group = state.opengroup(name)
|
||||
except error as err:
|
||||
raise source.error(err.msg, len(name) + 1) from None
|
||||
else:
|
||||
group = None
|
||||
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
|
||||
not (del_flags & SRE_FLAG_VERBOSE))
|
||||
p = _parse_sub(source, state, sub_verbose, nested + 1)
|
||||
|
@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False):
|
|||
source.tell() - start)
|
||||
if group is not None:
|
||||
state.closegroup(group, p)
|
||||
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
|
||||
if atomic:
|
||||
assert group is None
|
||||
subpatternappend((ATOMIC_GROUP, p))
|
||||
else:
|
||||
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
|
||||
|
||||
elif this == "^":
|
||||
subpatternappend((AT, AT_BEGINNING))
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue