mirror of
https://github.com/python/cpython.git
synced 2025-07-25 20:24:11 +00:00

-- added pickling support (only works if sre is imported) -- fixed wordsize problems in engine (instead of casting literals down to the character size, cast characters up to the literal size (same as the code word size)). This prevents false hits when you're matching a unicode pattern against an 8-bit string. (Unfortunately, this broke another test, but I think the test should be changed in this case; more on that on python-dev.) -- added sre.purge function (unofficial, clears the cache)
630 lines
20 KiB
Python
630 lines
20 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert re-style regular expression to sre pattern
|
|
#
|
|
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# Portions of this engine have been developed in cooperation with
|
|
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
|
# other compatibility work.
|
|
#
|
|
|
|
import string, sys
|
|
|
|
import _sre
|
|
|
|
from sre_constants import *
|
|
|
|
# repeat count used for the open-ended repeats ("*", "+", "{m,}")
# FIXME: should be 65535, but the arraymodule is still broken
MAXREPEAT = 32767

# mask applied to the value of numeric (hex/octal) escapes
# FIXME: same here
CHARMASK = 0x7fff

# characters that are not plain literals in a pattern, and the subset
# that introduces a repeat operator
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"

# digit classes, stored as tuples of one-character strings (presumably
# so that membership tests against a None lookahead token are safe)
DIGITS = tuple(string.digits)
OCTDIGITS = tuple(string.octdigits)
HEXDIGITS = tuple(string.hexdigits)

# characters skipped by the parser in verbose mode
WHITESPACE = string.whitespace
|
|
|
|
# escapes that stand for one literal character, keyed by the
# two-character escape token and mapped to a (LITERAL, code) pair
ESCAPES = {
    r"\a": (LITERAL, ord("\a")),
    r"\b": (LITERAL, ord("\b")),
    r"\f": (LITERAL, ord("\f")),
    r"\n": (LITERAL, ord("\n")),
    r"\r": (LITERAL, ord("\r")),
    r"\t": (LITERAL, ord("\t")),
    r"\v": (LITERAL, ord("\v")),
    r"\\": (LITERAL, ord("\\"))
}
|
|
|
|
# escapes that stand for a position test or a character category,
# keyed by the two-character escape token
CATEGORIES = {
    # position tests
    r"\A": (AT, AT_BEGINNING), # start of string
    r"\Z": (AT, AT_END), # end of string
    r"\b": (AT, AT_BOUNDARY),
    r"\B": (AT, AT_NON_BOUNDARY),
    # character categories, each wrapped in a one-item IN set
    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
}
|
|
|
|
# flag letters accepted after "(?" in a pattern, mapped to the flag
# bit each one switches on
FLAGS = {
    # standard flags
    "i": SRE_FLAG_IGNORECASE,
    "m": SRE_FLAG_MULTILINE,
    "s": SRE_FLAG_DOTALL,
    "x": SRE_FLAG_VERBOSE,
    "L": SRE_FLAG_LOCALE,
    # extensions
    "t": SRE_FLAG_TEMPLATE,
    "u": SRE_FLAG_UNICODE,
}
|
|
|
|
class State:
    # per-pattern parser state: the flag bits collected so far and the
    # bookkeeping for group numbers and group names
    def __init__(self):
        self.flags = 0
        # id that the next call to getgroup will hand out
        self.groups = 1
        # group name -> group id
        self.groupdict = {}
    def getgroup(self, name=None):
        # reserve and return the next group id; a non-empty name is
        # also recorded in groupdict
        reserved = self.groups
        self.groups = reserved + 1
        if not name:
            return reserved
        self.groupdict[name] = reserved
        return reserved
|
|
|
|
class SubPattern:
    # a subpattern, in intermediate form
    #
    # wraps a list of (opcode, argument) pairs and forwards the usual
    # sequence operations to it.  pattern is stored as given and passed
    # on to the slices created by __getslice__.
    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if not data:
            data = []
        self.data = data
        # (min, max) width, filled in by the first getwidth call
        self.width = None
    def __repr__(self):
        return repr(self.data)
    def __len__(self):
        return len(self.data)
    def __delitem__(self, index):
        del self.data[index]
    def __getitem__(self, index):
        return self.data[index]
    def __setitem__(self, index, code):
        self.data[index] = code
    def __getslice__(self, start, stop):
        # slices are subpatterns too, sharing the same pattern
        return SubPattern(self.pattern, self.data[start:stop])
    def insert(self, index, code):
        self.data.insert(index, code)
    def append(self, code):
        self.data.append(code)
    def getwidth(self):
        # determine the width (min, max) for this subpattern.  the
        # result is cached in self.width and both bounds are clipped
        # to sys.maxint.
        if self.width:
            return self.width
        # accumulate in longs so that large repeat counts cannot
        # overflow before the final clipping
        lo = hi = long(0)
        for op, av in self.data:
            if op is BRANCH:
                # a branch is as narrow as its narrowest alternative
                # and as wide as its widest one
                i = sys.maxint
                j = 0
                for alternative in av[1]:
                    l, h = alternative.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in (MIN_REPEAT, MAX_REPEAT):
                # av is (min count, max count, repeated item)
                i, j = av[2].getwidth()
                lo = lo + long(i) * av[0]
                hi = hi + long(j) * av[1]
            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
                # these always consume exactly one character
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
        return self.width
|
|
|
|
class Tokenizer:
    # splits a string into tokens.  a token is a single character, or
    # a backslash together with the character that follows it.  the
    # next attribute always holds the upcoming token (None at the end
    # of the string).
    def __init__(self, string):
        self.string = string
        self.index = 0
        self.next = self.__next()
    def __next(self):
        # read the token at the current index and step past it
        position = self.index
        if position >= len(self.string):
            return None
        token = self.string[position]
        if token[0] == "\\":
            # an escape needs one more character
            try:
                follower = self.string[position + 1]
            except IndexError:
                raise error("bogus escape")
            token = token + follower
        self.index = position + len(token)
        return token
    def match(self, char):
        # consume the upcoming token if it equals char
        if char != self.next:
            return 0
        self.next = self.__next()
        return 1
    def match_set(self, set):
        # consume the upcoming token if it is a member of set
        if not self.next or self.next not in set:
            return 0
        self.next = self.__next()
        return 1
    def get(self):
        # return the upcoming token and advance
        token = self.next
        self.next = self.__next()
        return token
|
|
|
|
def isident(char):
    # true for an underscore or an ASCII letter
    return char == "_" or "a" <= char <= "z" or "A" <= char <= "Z"
|
|
|
|
def isdigit(char):
    # true if char sorts within the range "0".."9"
    return char >= "0" and char <= "9"
|
|
|
|
def isname(name):
    # check that group name is a valid string: an identifier
    # character first, followed by identifier characters or digits.
    # returns 1 for a valid name and 0 otherwise.
    if not name:
        # an empty name (as in "(?P<>...)") is not a name; without
        # this test name[0] would raise IndexError
        return 0
    if not isident(name[0]):
        return 0
    for char in name:
        if not isident(char) and not isdigit(char):
            return 0
    return 1
|
|
|
|
def _group(escape, groups):
|
|
# check if the escape string represents a valid group
|
|
try:
|
|
gid = int(escape[1:])
|
|
if gid and gid < groups:
|
|
return gid
|
|
except ValueError:
|
|
pass
|
|
return None # not a valid group
|
|
|
|
def _class_escape(source, escape):
    # handle escape code inside character class
    #
    # source is the Tokenizer (further digits of a numeric escape are
    # read from it), escape the two-character escape token.  returns
    # an (opcode, argument) pair; raises error if the escape cannot be
    # decoded.
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code and code[0] is IN:
        # only character categories can be members of a set.  the
        # position codes (\A, \B, \Z) fall through and are taken as
        # the literal character instead of ending up in the set.
        return code
    try:
        if escape[1:2] == "x":
            # hexadecimal escape: the last four digits count
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            return LITERAL, int(escape[-4:], 16) & CHARMASK
        elif str(escape[1:2]) in OCTDIGITS:
            # octal escape: the last six digits count
            while source.next in OCTDIGITS:
                escape = escape + source.get()
            escape = escape[1:]
            return LITERAL, int(escape[-6:], 8) & CHARMASK
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
|
|
|
def _escape(source, escape, state):
    # handle escape code in expression
    #
    # categories are looked up before the literal escapes, so an
    # escape present in both tables gets its category meaning here
    for table in (CATEGORIES, ESCAPES):
        code = table.get(escape)
        if code:
            return code
    try:
        selector = escape[1:2]
        if selector == "x":
            # hexadecimal escape: the last four digits count
            while source.next in HEXDIGITS:
                escape = escape + source.get()
            escape = escape[2:]
            return LITERAL, int(escape[-4:], 16) & CHARMASK
        elif selector in DIGITS:
            # group reference if the digits name an existing group
            # (the longest such number wins), octal escape otherwise
            while 1:
                group = _group(escape, state.groups)
                if group:
                    lookahead = source.next
                    if not lookahead or not _group(escape + lookahead,
                                                   state.groups):
                        return GROUP, group
                elif source.next not in OCTDIGITS:
                    break
                escape = escape + source.get()
            escape = escape[1:]
            return LITERAL, int(escape[-6:], 8) & CHARMASK
        if len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, ord(escape[1])
    except ValueError:
        pass
    raise error("bogus escape: %s" % repr(escape))
|
|
|
|
def _branch(pattern, items):
    # form a branch operator from a set of items
    #
    # items is the list of alternatives; the result is a new
    # SubPattern.  note that the alternatives are modified in place
    # when a common prefix is moved out of them.

    subpattern = SubPattern(pattern)

    # check if all items share a common prefix
    while 1:
        prefix = None
        shared = 1
        for item in items:
            if not item:
                shared = 0
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                shared = 0
                break
        if not shared:
            break
        # all subitems start with a common "prefix".
        # move it out of the branch, then check next one
        for item in items:
            del item[0]
        subpattern.append(prefix)

    # check if the branch can be replaced by a character set
    literals = []
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
        literals.append(item[0])
    else:
        # we can store this as a character set instead of a
        # branch (FIXME: use a range if possible)
        subpattern.append((IN, literals))
        return subpattern

    subpattern.append((BRANCH, (None, items)))
    return subpattern
|
|
|
|
def _parse(source, state, flags=0):

    # parse regular expression pattern into an operator list.
    #
    # source is the Tokenizer to read from and state the shared State
    # (flag bits, group numbering).  parsing stops in front of a "|"
    # or ")" token, or at the end of the pattern; the operators read
    # up to that point are returned as a SubPattern.  flags is only
    # handed on to the nested _parse calls.  raises error on
    # malformed input.

    subpattern = SubPattern(state)

    while 1:

        if source.next in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                while 1:
                    this = source.get()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpattern.append((LITERAL, ord(this)))

        elif this == "[":
            # character set
            charset = []
            if source.match("^"):
                charset.append((NEGATE, None))
            # check remaining characters.  a "]" only closes the set
            # once something has been added to it, so a leading "]"
            # is taken literally
            start = charset[:]
            while 1:
                this = source.get()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if source.match("-"):
                    # potential range
                    this = source.get()
                    if this is None:
                        # pattern ends right after the "-"
                        raise error("unexpected end of regular expression")
                    if this == "]":
                        # trailing "-" is a literal
                        charset.append(code1)
                        charset.append((LITERAL, ord("-")))
                        break
                    else:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("illegal range")
                        charset.append((RANGE, (code1[1], code2[1])))
                else:
                    if code1[0] is IN:
                        # unwrap the one-item set of a category escape
                        code1 = code1[1][0]
                    charset.append(code1)

            # FIXME: <fl> move set optimization to compiler!
            if len(charset)==1 and charset[0][0] is LITERAL:
                subpattern.append(charset[0]) # optimization
            elif (len(charset)==2 and charset[0][0] is NEGATE and
                  charset[1][0] is LITERAL):
                subpattern.append((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # FIXME: <fl> add charmap optimization
                subpattern.append((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                minimum, maximum = 0, 1
            elif this == "*":
                minimum, maximum = 0, MAXREPEAT
            elif this == "+":
                minimum, maximum = 1, MAXREPEAT
            elif this == "{":
                # {m}, {m,}, {,n} or {m,n}; a missing bound keeps its
                # default
                minimum, maximum = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + source.get()
                if source.match(","):
                    while source.next in DIGITS:
                        hi = hi + source.get()
                else:
                    hi = lo
                if not source.match("}"):
                    raise error("bogus range")
                if lo:
                    minimum = int(lo)
                if hi:
                    maximum = int(hi)
                if maximum < minimum:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                raise error("nothing to repeat")
            if source.match("?"):
                subpattern[-1] = (MIN_REPEAT, (minimum, maximum, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (minimum, maximum, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            # group: 1 = capturing group, 2 = non-capturing group,
            # 0 = an extension that has no group contents to parse
            group = 1
            name = None
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # python extensions
                    if source.match("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error("illegal character in group name")
                    elif source.match("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = source.get()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error("illegal character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpattern.append((GROUP, gid))
                        # the closing ")" was consumed by the name
                        # loop above; do not fall into the extension
                        # tail below, which would eat the next token
                        continue
                    else:
                        char = source.get()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                elif source.match("#"):
                    # comment
                    while 1:
                        if source.next is None or source.next == ")":
                            break
                        source.get()
                elif source.next in ("=", "!"):
                    # lookahead assertions
                    char = source.get()
                    b = []
                    while 1:
                        p = _parse(source, state, flags)
                        if source.next == ")":
                            # the ")" itself is consumed further down
                            if b:
                                b.append(p)
                                p = _branch(state, b)
                            if char == "=":
                                subpattern.append((ASSERT, p))
                            else:
                                subpattern.append((ASSERT_NOT, p))
                            break
                        elif source.match("|"):
                            b.append(p)
                        else:
                            raise error("pattern not properly closed")
                else:
                    # flags
                    while FLAGS.has_key(source.next):
                        state.flags = state.flags | FLAGS[source.get()]
            if group:
                # parse group contents
                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.getgroup(name)
                while 1:
                    p = _parse(source, state, flags)
                    if source.match(")"):
                        if b:
                            b.append(p)
                            p = _branch(state, b)
                        subpattern.append((SUBPATTERN, (group, p)))
                        break
                    elif source.match("|"):
                        b.append(p)
                    else:
                        raise error("group not properly closed")
            else:
                # extension without contents: only the closing ")"
                # (or the end of the pattern) may follow
                while 1:
                    char = source.get()
                    if char is None or char == ")":
                        break
                    raise error("unknown extension")

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpattern.append(code)

        else:
            raise error("parser error")

    return subpattern
|
|
|
|
def parse(pattern, flags=0):
    # parse 're' pattern into list of (opcode, argument) tuples
    #
    # pattern is the pattern string and flags the initial flag bits.
    # returns the SubPattern for the whole pattern; raises error on
    # malformed input.
    source = Tokenizer(pattern)
    state = State()
    # make flags given by the caller (e.g. SRE_FLAG_VERBOSE) visible
    # to the parser, which only looks at state.flags
    state.flags = flags
    b = []
    while 1:
        p = _parse(source, state, flags)
        tail = source.get()
        if tail == "|":
            # top-level alternative; collect it and parse the next one
            b.append(p)
        elif tail == ")":
            raise error("unbalanced parenthesis")
        elif tail is None:
            if b:
                b.append(p)
                p = _branch(state, b)
            break
        else:
            raise error("bogus characters at end of regular expression")
    return p
|
|
|
|
def parse_template(source, pattern):
    # parse 're' replacement string into list of literals and
    # group references
    #
    # source is the replacement string; pattern supplies groups and
    # groupindex for resolving group references.  returns a list of
    # (LITERAL, code) and (MARK, group) pairs.  raises error for a
    # malformed template and IndexError for an unknown group name.
    s = Tokenizer(source)
    p = []
    a = p.append
    while 1:
        this = s.get()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            if this == "\\g":
                # \g<name> or \g<number>
                name = ""
                if s.match("<"):
                    while 1:
                        char = s.get()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error("bad group name")
                try:
                    index = int(name)
                except ValueError:
                    if not isname(name):
                        raise error("illegal character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name")
                a((MARK, index))
            elif len(this) > 1 and this[1] in DIGITS:
                # group reference if the digits name an existing group
                # (the longest such number wins), octal escape
                # otherwise
                code = None
                while 1:
                    group = _group(this, pattern.groups+1)
                    if group:
                        if (not s.next or
                            not _group(this + s.next, pattern.groups+1)):
                            code = MARK, int(group)
                            break
                        # a longer group number is possible: take the
                        # next digit (without this the loop would
                        # never make progress)
                        this = this + s.get()
                    elif s.next in OCTDIGITS:
                        this = this + s.get()
                    else:
                        break
                if not code:
                    digits = this[1:]
                    try:
                        code = LITERAL, int(digits[-6:], 8) & CHARMASK
                    except ValueError:
                        # neither a group nor a valid octal number
                        raise error("bogus escape: %s" % repr(this))
                a(code)
            else:
                try:
                    a(ESCAPES[this])
                except KeyError:
                    # unknown escape: keep backslash and character
                    for c in this:
                        a((LITERAL, ord(c)))
        else:
            a((LITERAL, ord(this)))
    return p
|
|
|
|
def expand_template(template, match):
    # build the replacement text for match from a list of
    # (LITERAL, code) and (MARK, group) pairs.  raises error if a
    # referenced group is None.
    # FIXME: <fl> this is sooooo slow. drop in the slicelist
    # code instead
    # an empty slice of the matched string serves both as the join
    # separator and as the indicator for the string type
    empty = match.string[:0]
    if type(empty) is type(""):
        tochar = chr
    else:
        tochar = unichr
    pieces = []
    for opcode, argument in template:
        if opcode is LITERAL:
            pieces.append(tochar(argument))
            continue
        if opcode is MARK:
            text = match.group(argument)
            if text is None:
                raise error("empty group")
            pieces.append(text)
    return empty.join(pieces)
|