mirror of
https://github.com/python/cpython.git
synced 2025-08-30 21:48:47 +00:00
-- SRE 0.9.6 sync. This includes:
+ added "regs" attribute
+ fixed "pos" and "endpos" attributes
+ reset "lastindex" and "lastgroup" in scanner methods
+ removed (?P#id) syntax; the "lastindex" and "lastgroup" attributes are now always set
+ removed string module dependencies in sre_parse
+ better debugging support in sre_parse
+ various tweaks to build under 1.5.2
This commit is contained in:
parent
4f1b2081e9
commit
8a3ebf8ca8
7 changed files with 1265 additions and 1138 deletions
155
Lib/sre_parse.py
155
Lib/sre_parse.py
|
@ -25,12 +25,12 @@ CHARMASK = 0xff
|
|||
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
||||
REPEAT_CHARS = "*+?{"
|
||||
|
||||
DIGITS = tuple(string.digits)
|
||||
# decimal digit characters, spelled out literally so that this constant
# does not depend on the string module.  all ten digits must be listed.
DIGITS = tuple("0123456789")
|
||||
|
||||
OCTDIGITS = tuple("01234567")
|
||||
HEXDIGITS = tuple("0123456789abcdefABCDEF")
|
||||
|
||||
WHITESPACE = tuple(string.whitespace)
|
||||
WHITESPACE = tuple(" \t\n\r\v\f")
|
||||
|
||||
ESCAPES = {
|
||||
r"\a": (LITERAL, 7),
|
||||
|
@ -68,7 +68,8 @@ FLAGS = {
|
|||
"u": SRE_FLAG_UNICODE,
|
||||
}
|
||||
|
||||
class State:
|
||||
class Pattern:
|
||||
# master pattern object. keeps track of global attributes
|
||||
def __init__(self):
|
||||
self.flags = 0
|
||||
self.groups = 1
|
||||
|
@ -88,6 +89,33 @@ class SubPattern:
|
|||
data = []
|
||||
self.data = data
|
||||
self.width = None
|
||||
def dump(self, level=0):
    # debugging aid: print the (op, av) items held in self.data to
    # stdout.  each item starts with level*" " followed by the op;
    # nested SubPattern arguments are dumped recursively at level+1.
    #
    # nl records whether the output is currently at the start of a
    # line (1) or whether a print with a trailing comma has left the
    # current line open (0); it is used to emit a terminating newline
    # only when one is still needed.
    nl = 1
    for op, av in self.data:
        print level*" " + op,; nl = 0
        if op == "in":
            # member sublanguage
            # finish the op line, then print each (op, a) member of
            # the set on a line of its own, one level deeper.
            # note: the inner loop rebinds op; it is not read again
            # before the outer loop assigns it for the next item.
            print; nl = 1
            for op, a in av:
                print (level+1)*" " + op, a
        elif op == "branch":
            # the alternatives are taken from av[1]; each one is
            # dumped at level+1, with an "or" line printed between
            # consecutive alternatives (i counts the ones done so far)
            print; nl = 1
            i = 0
            for a in av[1]:
                if i > 0:
                    print level*" " + "or"
                a.dump(level+1); nl = 1
                i = i + 1
        elif type(av) in (type(()), type([])):
            # tuple or list argument: SubPattern elements are dumped
            # on their own lines, anything else is printed inline
            # after the op
            for a in av:
                if isinstance(a, SubPattern):
                    # close the open line before the nested dump
                    if not nl: print
                    a.dump(level+1); nl = 1
                else:
                    print a, ; nl = 0
        else:
            # plain argument: print it inline after the op
            print av, ; nl = 0
        # terminate the line if this item left it open
        if not nl: print
|
||||
def __repr__(self):
|
||||
return repr(self.data)
|
||||
def __len__(self):
|
||||
|
@ -255,10 +283,25 @@ def _escape(source, escape, state):
|
|||
pass
|
||||
raise error, "bogus escape: %s" % repr(escape)
|
||||
|
||||
def _branch(pattern, items):
|
||||
# form a branch operator from a set of items
|
||||
def _parse_sub(source, state, nested=1):
|
||||
# parse an alternation: a|b|c
|
||||
|
||||
subpattern = SubPattern(pattern)
|
||||
items = []
|
||||
while 1:
|
||||
items.append(_parse(source, state))
|
||||
if source.match("|"):
|
||||
continue
|
||||
if not nested:
|
||||
break
|
||||
if not source.next or source.match(")"):
|
||||
break
|
||||
else:
|
||||
raise error, "pattern not properly closed"
|
||||
|
||||
if len(items) == 1:
|
||||
return items[0]
|
||||
|
||||
subpattern = SubPattern(state)
|
||||
|
||||
# check if all items share a common prefix
|
||||
while 1:
|
||||
|
@ -285,7 +328,7 @@ def _branch(pattern, items):
|
|||
break
|
||||
else:
|
||||
# we can store this as a character set instead of a
|
||||
# branch (FIXME: use a range if possible)
|
||||
# branch (the compiler may optimize this even more)
|
||||
set = []
|
||||
for item in items:
|
||||
set.append(item[0])
|
||||
|
@ -296,8 +339,7 @@ def _branch(pattern, items):
|
|||
return subpattern
|
||||
|
||||
def _parse(source, state):
|
||||
|
||||
# parse regular expression pattern into an operator list.
|
||||
# parse a simple pattern
|
||||
|
||||
subpattern = SubPattern(state)
|
||||
|
||||
|
@ -451,22 +493,6 @@ def _parse(source, state):
|
|||
if gid is None:
|
||||
raise error, "unknown group name"
|
||||
subpattern.append((GROUPREF, gid))
|
||||
elif source.match("#"):
|
||||
index = ""
|
||||
while 1:
|
||||
char = source.get()
|
||||
if char is None:
|
||||
raise error, "unterminated index"
|
||||
if char == ")":
|
||||
break
|
||||
index = index + char
|
||||
try:
|
||||
index = int(index)
|
||||
if index < 0 or index > MAXREPEAT:
|
||||
raise ValueError
|
||||
except ValueError:
|
||||
raise error, "illegal index"
|
||||
subpattern.append((INDEX, index))
|
||||
continue
|
||||
else:
|
||||
char = source.get()
|
||||
|
@ -491,48 +517,27 @@ def _parse(source, state):
|
|||
raise error, "syntax error"
|
||||
dir = -1 # lookbehind
|
||||
char = source.get()
|
||||
b = []
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
if source.next == ")":
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
if char == "=":
|
||||
subpattern.append((ASSERT, (dir, p)))
|
||||
else:
|
||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||
break
|
||||
elif source.match("|"):
|
||||
b.append(p)
|
||||
else:
|
||||
raise error, "pattern not properly closed"
|
||||
p = _parse_sub(source, state)
|
||||
if char == "=":
|
||||
subpattern.append((ASSERT, (dir, p)))
|
||||
else:
|
||||
subpattern.append((ASSERT_NOT, (dir, p)))
|
||||
continue
|
||||
else:
|
||||
# flags
|
||||
while FLAGS.has_key(source.next):
|
||||
state.flags = state.flags | FLAGS[source.get()]
|
||||
if group:
|
||||
# parse group contents
|
||||
b = []
|
||||
if group == 2:
|
||||
# anonymous group
|
||||
group = None
|
||||
else:
|
||||
group = state.getgroup(name)
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
if group is not None:
|
||||
p.append((INDEX, group))
|
||||
if source.match(")"):
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
subpattern.append((SUBPATTERN, (group, p)))
|
||||
break
|
||||
elif source.match("|"):
|
||||
b.append(p)
|
||||
else:
|
||||
raise error, "group not properly closed"
|
||||
p = _parse_sub(source, state)
|
||||
subpattern.append((SUBPATTERN, (group, p)))
|
||||
if group is not None:
|
||||
p.append((INDEX, group))
|
||||
else:
|
||||
while 1:
|
||||
char = source.get()
|
||||
|
@ -555,26 +560,24 @@ def _parse(source, state):
|
|||
|
||||
return subpattern
|
||||
|
||||
def parse(pattern, flags=0):
|
||||
def parse(str, flags=0):
|
||||
# parse 're' pattern into list of (opcode, argument) tuples
|
||||
source = Tokenizer(pattern)
|
||||
state = State()
|
||||
state.flags = flags
|
||||
b = []
|
||||
while 1:
|
||||
p = _parse(source, state)
|
||||
tail = source.get()
|
||||
if tail == "|":
|
||||
b.append(p)
|
||||
elif tail == ")":
|
||||
raise error, "unbalanced parenthesis"
|
||||
elif tail is None:
|
||||
if b:
|
||||
b.append(p)
|
||||
p = _branch(state, b)
|
||||
break
|
||||
else:
|
||||
raise error, "bogus characters at end of regular expression"
|
||||
|
||||
source = Tokenizer(str)
|
||||
|
||||
pattern = Pattern()
|
||||
pattern.flags = flags
|
||||
|
||||
p = _parse_sub(source, pattern, 0)
|
||||
|
||||
tail = source.get()
|
||||
if tail == ")":
|
||||
raise error, "unbalanced parenthesis"
|
||||
elif tail:
|
||||
raise error, "bogus characters at end of regular expression"
|
||||
|
||||
# p.dump()
|
||||
|
||||
return p
|
||||
|
||||
def parse_template(source, pattern):
|
||||
|
@ -656,4 +659,4 @@ def expand_template(template, match):
|
|||
if s is None:
|
||||
raise error, "empty group"
|
||||
a(s)
|
||||
return sep.join(p)
|
||||
return string.join(p, sep)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue