Issues #814253, #9179: Group references and conditional group references now

work in lookbehind assertions in regular expressions.
This commit is contained in:
Serhiy Storchaka 2015-02-21 10:07:35 +02:00
parent df80706f14
commit 4eea62fd2e
5 changed files with 92 additions and 12 deletions

View file

@ -297,6 +297,9 @@ The special characters are:
>>> m.group(0) >>> m.group(0)
'egg' 'egg'
.. versionchanged:: 3.5
Added support for group references of fixed length.
``(?<!...)`` ``(?<!...)``
Matches if the current position in the string is not preceded by a match for Matches if the current position in the string is not preceded by a match for
``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to ``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to

View file

@ -351,10 +351,11 @@ class Scanner:
s = sre_parse.Pattern() s = sre_parse.Pattern()
s.flags = flags s.flags = flags
for phrase, action in lexicon: for phrase, action in lexicon:
gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [ p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
])) ]))
s.groups = len(p)+1 s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
self.scanner = sre_compile.compile(p) self.scanner = sre_compile.compile(p)
def scan(self, string): def scan(self, string):

View file

@ -68,12 +68,15 @@ class Pattern:
# master pattern object. keeps track of global attributes # master pattern object. keeps track of global attributes
def __init__(self): def __init__(self):
self.flags = 0 self.flags = 0
self.open = []
self.groups = 1
self.groupdict = {} self.groupdict = {}
self.subpatterns = [None] # group 0
self.lookbehindgroups = None
@property
def groups(self):
return len(self.subpatterns)
def opengroup(self, name=None): def opengroup(self, name=None):
gid = self.groups gid = self.groups
self.groups = gid + 1 self.subpatterns.append(None)
if self.groups > MAXGROUPS: if self.groups > MAXGROUPS:
raise error("groups number is too large") raise error("groups number is too large")
if name is not None: if name is not None:
@ -82,12 +85,19 @@ class Pattern:
raise error("redefinition of group name %r as group %d; " raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid)) "was group %d" % (name, gid, ogid))
self.groupdict[name] = gid self.groupdict[name] = gid
self.open.append(gid)
return gid return gid
def closegroup(self, gid): def closegroup(self, gid, p):
self.open.remove(gid) self.subpatterns[gid] = p
def checkgroup(self, gid): def checkgroup(self, gid):
return gid < self.groups and gid not in self.open return gid < self.groups and self.subpatterns[gid] is not None
def checklookbehindgroup(self, gid, source):
# Validate a reference to group `gid` that was parsed while inside a
# lookbehind assertion.  Does nothing when `self.lookbehindgroups` is None,
# i.e. when the parser is not currently inside a lookbehind.
#
# `self.lookbehindgroups` is set by the parser to the value of `self.groups`
# at the moment the outermost lookbehind is entered, and reset to None when
# that lookbehind has been parsed.  Group ids are handed out sequentially by
# opengroup(), so any gid >= that value was opened inside the lookbehind.
#
# Raises `source.error(...)` if the group is still open (or unknown), or if
# it was defined inside the same lookbehind subpattern.  Returns None.
if self.lookbehindgroups is not None:
# The referenced group must already be closed: checkgroup() requires
# that its subpattern has been recorded by closegroup().
if not self.checkgroup(gid):
raise source.error('cannot refer to an open group')
# Groups numbered at or above the recorded count were created after the
# lookbehind started, so they live inside it and may not be referenced.
if gid >= self.lookbehindgroups:
raise source.error('cannot refer to group defined in the same '
'lookbehind subpattern')
class SubPattern: class SubPattern:
# a subpattern, in intermediate form # a subpattern, in intermediate form
@ -183,7 +193,21 @@ class SubPattern:
elif op in _UNITCODES: elif op in _UNITCODES:
lo = lo + 1 lo = lo + 1
hi = hi + 1 hi = hi + 1
elif op == SUCCESS: elif op is GROUPREF:
i, j = self.pattern.subpatterns[av].getwidth()
lo = lo + i
hi = hi + j
elif op is GROUPREF_EXISTS:
i, j = av[1].getwidth()
if av[2] is not None:
l, h = av[2].getwidth()
i = min(i, l)
j = max(j, h)
else:
i = 0
lo = lo + i
hi = hi + j
elif op is SUCCESS:
break break
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
return self.width return self.width
@ -379,6 +403,7 @@ def _escape(source, escape, state):
if not state.checkgroup(group): if not state.checkgroup(group):
raise source.error("cannot refer to open group", raise source.error("cannot refer to open group",
len(escape)) len(escape))
state.checklookbehindgroup(group, source)
return GROUPREF, group return GROUPREF, group
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
@ -641,6 +666,7 @@ def _parse(source, state):
if gid is None: if gid is None:
msg = "unknown group name: {0!r}".format(name) msg = "unknown group name: {0!r}".format(name)
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
continue continue
else: else:
@ -668,7 +694,13 @@ def _parse(source, state):
if char is None or char not in "=!": if char is None or char not in "=!":
raise source.error("syntax error") raise source.error("syntax error")
dir = -1 # lookbehind dir = -1 # lookbehind
lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None:
state.lookbehindgroups = state.groups
p = _parse_sub(source, state) p = _parse_sub(source, state)
if dir < 0:
if lookbehindgroups is None:
state.lookbehindgroups = None
if not sourcematch(")"): if not sourcematch(")"):
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if char == "=": if char == "=":
@ -701,6 +733,7 @@ def _parse(source, state):
if condgroup >= MAXGROUPS: if condgroup >= MAXGROUPS:
raise source.error("the group number is too large", raise source.error("the group number is too large",
len(condname) + 1) len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
elif char in FLAGS: elif char in FLAGS:
# flags # flags
state.flags |= FLAGS[char] state.flags |= FLAGS[char]
@ -726,7 +759,7 @@ def _parse(source, state):
if not sourcematch(")"): if not sourcematch(")"):
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if group is not None: if group is not None:
state.closegroup(group) state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p))) subpatternappend((SUBPATTERN, (group, p)))
else: else:
while True: while True:

View file

@ -604,7 +604,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
"a\n\nb") "a\n\nb")
def test_non_consuming(self): def test_lookahead(self):
self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
@ -618,6 +618,46 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
# Group reference.
self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
# Conditional group reference.
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
# Group used before defined.
self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
def test_lookbehind(self):
# Exercise positive (?<=...) and negative (?<!...) lookbehind assertions,
# including group references and conditional group references inside them.
# Plain lookbehind on a literal character.
self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
# Group reference: \1 refers to a group closed before the lookbehind.
self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
# Conditional group reference: (?(N)yes|no) inside the lookbehind, where
# group N is defined before the lookbehind.
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
# Group used before defined: a conditional on group 2, which only appears
# after the lookbehind, must be rejected at compile time; conditionals on
# the already-defined group 1 still compile and match normally.
self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
# Group defined in the same lookbehind pattern: referring to it (by
# number, by name, in a conditional, or from a nested lookbehind) must be
# rejected at compile time.
self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
def test_ignore_case(self): def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")

View file

@ -13,6 +13,9 @@ Core and Builtins
Library Library
------- -------
- Issues #814253, #9179: Group references and conditional group references now
work in lookbehind assertions in regular expressions.
- Issue #23215: Multibyte codecs with custom error handlers that ignore errors - Issue #23215: Multibyte codecs with custom error handlers that ignore errors
consumed too much memory and raised SystemError or MemoryError. consumed too much memory and raised SystemError or MemoryError.
Original patch by Aleksi Torhamo. Original patch by Aleksi Torhamo.