bpo-433030: Add support for atomic grouping in regular expressions (GH-31982)

* Atomic grouping: (?>...).
* Possessive quantifiers: x++, x*+, x?+, x{m,n}+.
  Equivalent to (?>x+), (?>x*), (?>x?), (?>x{m,n}), respectively.

Co-authored-by: Jeffrey C. Jacobs <timehorse@users.sourceforge.net>
This commit is contained in:
Serhiy Storchaka 2022-03-21 18:28:22 +02:00 committed by GitHub
parent 2bde6827ea
commit 345b390ed6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 593 additions and 92 deletions

View file

@ -25,7 +25,7 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
WHITESPACE = frozenset(" \t\n\r\v\f")
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
ESCAPES = {
@ -190,6 +190,10 @@ class SubPattern:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
elif op is ATOMIC_GROUP:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
i, j = av[-1].getwidth()
lo = lo + i
@ -675,8 +679,13 @@ def _parse(source, state, verbose, nested, first=False):
if group is None and not add_flags and not del_flags:
item = p
if sourcematch("?"):
# Non-Greedy Match
subpattern[-1] = (MIN_REPEAT, (min, max, item))
elif sourcematch("+"):
# Possessive Match (Always Greedy)
subpattern[-1] = (POSSESSIVE_REPEAT, (min, max, item))
else:
# Greedy Match
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
@ -684,7 +693,8 @@ def _parse(source, state, verbose, nested, first=False):
elif this == "(":
start = source.tell() - 1
group = True
capture = True
atomic = False
name = None
add_flags = 0
del_flags = 0
@ -726,7 +736,7 @@ def _parse(source, state, verbose, nested, first=False):
len(char) + 2)
elif char == ":":
# non-capturing group
group = None
capture = False
elif char == "#":
# comment
while True:
@ -800,6 +810,10 @@ def _parse(source, state, verbose, nested, first=False):
subpatternappend((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
continue
elif char == ">":
# non-capturing, atomic group
capture = False
atomic = True
elif char in FLAGS or char == "-":
# flags
flags = _parse_flags(source, state, char)
@ -813,17 +827,19 @@ def _parse(source, state, verbose, nested, first=False):
continue
add_flags, del_flags = flags
group = None
capture = False
else:
raise source.error("unknown extension ?" + char,
len(char) + 1)
# parse group contents
if group is not None:
if capture:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1) from None
else:
group = None
sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose, nested + 1)
@ -832,7 +848,11 @@ def _parse(source, state, verbose, nested, first=False):
source.tell() - start)
if group is not None:
state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
if atomic:
assert group is None
subpatternappend((ATOMIC_GROUP, p))
else:
subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))