bpo-47152: Convert the re module into a package (GH-32177)

The sre_* modules are now deprecated.
This commit is contained in:
Serhiy Storchaka 2022-04-02 11:35:13 +03:00 committed by GitHub
parent 4ed8a9a589
commit 1be3260a90
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 2235 additions and 2182 deletions

View file

@ -96,14 +96,14 @@ Sample output (may vary depending on the architecture)::
Loaded modules: Loaded modules:
_types: _types:
copyreg: _inverted_registry,_slotnames,__all__ copyreg: _inverted_registry,_slotnames,__all__
sre_compile: isstring,_sre,_optimize_unicode re._compiler: isstring,_sre,_optimize_unicode
_sre: _sre:
sre_constants: REPEAT_ONE,makedict,AT_END_LINE re._constants: REPEAT_ONE,makedict,AT_END_LINE
sys: sys:
re: __module__,finditer,_expand re: __module__,finditer,_expand
itertools: itertools:
__main__: re,itertools,baconhameggs __main__: re,itertools,baconhameggs
sre_parse: _PATTERNENDERS,SRE_FLAG_UNICODE re._parser: _PATTERNENDERS,SRE_FLAG_UNICODE
array: array:
types: __module__,IntType,TypeType types: __module__,IntType,TypeType
--------------------------------------------------- ---------------------------------------------------

View file

@ -73,12 +73,12 @@ the following::
ncalls tottime percall cumtime percall filename:lineno(function) ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.002 0.002 {built-in method builtins.exec} 1 0.000 0.000 0.002 0.002 {built-in method builtins.exec}
1 0.000 0.000 0.001 0.001 <string>:1(<module>) 1 0.000 0.000 0.001 0.001 <string>:1(<module>)
1 0.000 0.000 0.001 0.001 re.py:250(compile) 1 0.000 0.000 0.001 0.001 __init__.py:250(compile)
1 0.000 0.000 0.001 0.001 re.py:289(_compile) 1 0.000 0.000 0.001 0.001 __init__.py:289(_compile)
1 0.000 0.000 0.000 0.000 sre_compile.py:759(compile) 1 0.000 0.000 0.000 0.000 _compiler.py:759(compile)
1 0.000 0.000 0.000 0.000 sre_parse.py:937(parse) 1 0.000 0.000 0.000 0.000 _parser.py:937(parse)
1 0.000 0.000 0.000 0.000 sre_compile.py:598(_code) 1 0.000 0.000 0.000 0.000 _compiler.py:598(_code)
1 0.000 0.000 0.000 0.000 sre_parse.py:435(_parse_sub) 1 0.000 0.000 0.000 0.000 _parser.py:435(_parse_sub)
The first line indicates that 214 calls were monitored. Of those calls, 207 The first line indicates that 214 calls were monitored. Of those calls, 207
were :dfn:`primitive`, meaning that the call was not induced via recursion. The were :dfn:`primitive`, meaning that the call was not induced via recursion. The

View file

@ -532,6 +532,10 @@ Deprecated
be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for
CPython). (Contributed by Victor Stinner in :issue:`40360`.) CPython). (Contributed by Victor Stinner in :issue:`40360`.)
* Undocumented modules ``sre_compile``, ``sre_constants`` and ``sre_parse``
are now deprecated.
(Contributed by Serhiy Storchaka in :issue:`47152`.)
* :class:`webbrowser.MacOSX` is deprecated and will be removed in Python 3.13. * :class:`webbrowser.MacOSX` is deprecated and will be removed in Python 3.13.
It is untested and undocumented and also not used by webbrowser itself. It is untested and undocumented and also not used by webbrowser itself.
(Contributed by Dong-hee Na in :issue:`42255`.) (Contributed by Dong-hee Na in :issue:`42255`.)

View file

@ -122,8 +122,7 @@ This module also defines an exception 'error'.
""" """
import enum import enum
import sre_compile from . import _compiler, _parser
import sre_parse
import functools import functools
try: try:
import _locale import _locale
@ -146,21 +145,21 @@ __version__ = "2.2.1"
@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP) @enum._simple_enum(enum.IntFlag, boundary=enum.KEEP)
class RegexFlag: class RegexFlag:
NOFLAG = 0 NOFLAG = 0
ASCII = A = sre_compile.SRE_FLAG_ASCII # assume ascii "locale" ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale"
IGNORECASE = I = sre_compile.SRE_FLAG_IGNORECASE # ignore case IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case
LOCALE = L = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale
UNICODE = U = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale" UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale"
MULTILINE = M = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline
DOTALL = S = sre_compile.SRE_FLAG_DOTALL # make dot match newline DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline
VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments
# sre extensions (experimental, don't rely on these) # sre extensions (experimental, don't rely on these)
TEMPLATE = T = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking TEMPLATE = T = _compiler.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation
__str__ = object.__str__ __str__ = object.__str__
_numeric_repr_ = hex _numeric_repr_ = hex
# sre exception # sre exception
error = sre_compile.error error = _compiler.error
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# public interface # public interface
@ -257,8 +256,8 @@ def escape(pattern):
pattern = str(pattern, 'latin1') pattern = str(pattern, 'latin1')
return pattern.translate(_special_chars_map).encode('latin1') return pattern.translate(_special_chars_map).encode('latin1')
Pattern = type(sre_compile.compile('', 0)) Pattern = type(_compiler.compile('', 0))
Match = type(sre_compile.compile('', 0).match('')) Match = type(_compiler.compile('', 0).match(''))
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# internals # internals
@ -279,9 +278,9 @@ def _compile(pattern, flags):
raise ValueError( raise ValueError(
"cannot process flags argument with a compiled pattern") "cannot process flags argument with a compiled pattern")
return pattern return pattern
if not sre_compile.isstring(pattern): if not _compiler.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern") raise TypeError("first argument must be string or compiled pattern")
p = sre_compile.compile(pattern, flags) p = _compiler.compile(pattern, flags)
if not (flags & DEBUG): if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE: if len(_cache) >= _MAXCACHE:
# Drop the oldest item # Drop the oldest item
@ -295,12 +294,12 @@ def _compile(pattern, flags):
@functools.lru_cache(_MAXCACHE) @functools.lru_cache(_MAXCACHE)
def _compile_repl(repl, pattern): def _compile_repl(repl, pattern):
# internal: compile replacement pattern # internal: compile replacement pattern
return sre_parse.parse_template(repl, pattern) return _parser.parse_template(repl, pattern)
def _expand(pattern, match, template): def _expand(pattern, match, template):
# internal: Match.expand implementation hook # internal: Match.expand implementation hook
template = sre_parse.parse_template(template, pattern) template = _parser.parse_template(template, pattern)
return sre_parse.expand_template(template, match) return _parser.expand_template(template, match)
def _subx(pattern, template): def _subx(pattern, template):
# internal: Pattern.sub/subn implementation helper # internal: Pattern.sub/subn implementation helper
@ -309,7 +308,7 @@ def _subx(pattern, template):
# literal replacement # literal replacement
return template[1][0] return template[1][0]
def filter(match, template=template): def filter(match, template=template):
return sre_parse.expand_template(template, match) return _parser.expand_template(template, match)
return filter return filter
# register myself for pickling # register myself for pickling
@ -326,22 +325,22 @@ copyreg.pickle(Pattern, _pickle, _compile)
class Scanner: class Scanner:
def __init__(self, lexicon, flags=0): def __init__(self, lexicon, flags=0):
from sre_constants import BRANCH, SUBPATTERN from ._constants import BRANCH, SUBPATTERN
if isinstance(flags, RegexFlag): if isinstance(flags, RegexFlag):
flags = flags.value flags = flags.value
self.lexicon = lexicon self.lexicon = lexicon
# combine phrases into a compound pattern # combine phrases into a compound pattern
p = [] p = []
s = sre_parse.State() s = _parser.State()
s.flags = flags s.flags = flags
for phrase, action in lexicon: for phrase, action in lexicon:
gid = s.opengroup() gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [ p.append(_parser.SubPattern(s, [
(SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))), (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
])) ]))
s.closegroup(gid, p[-1]) s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) p = _parser.SubPattern(s, [(BRANCH, (None, p))])
self.scanner = sre_compile.compile(p) self.scanner = _compiler.compile(p)
def scan(self, string): def scan(self, string):
result = [] result = []
append = result.append append = result.append

800
Lib/re/_compiler.py Normal file
View file

@ -0,0 +1,800 @@
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#
"""Internal support module for sre"""
import _sre
from . import _parser
from ._constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
# Opcode groupings used for membership tests in _compile() and _simple().
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
# Operators that _simple() accepts as the single item of a subpattern.
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}

# Repeat operator -> triple of opcodes.  _compile() emits element [2] when
# the repeated body passes _simple(); otherwise it emits element [0] before
# the body and element [1] after it.
_REPEATING_CODES = {
    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
# Sets of lowercase characters which have the same uppercase.
# Each tuple is one group of code points; the comment above a tuple names
# its members and the trailing comment shows the characters themselves.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ſtst
)

# Maps the lowercase code to lowercase codes which have the same uppercase.
# Every member of a group becomes a key; its value is the tuple of the other
# members of that group (the key itself is dropped by the ``i != j`` filter).
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
                     for t in _equivalences for i in t}
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=_parser.TYPE_FLAGS):
    """Return *flags* with *add_flags* switched on and *del_flags* switched off.

    When *add_flags* contains any TYPE_FLAGS bit, every TYPE_FLAGS bit
    already present in *flags* is cleared first, so the added bits replace
    the old ones instead of accumulating with them.
    """
    base = flags & ~TYPE_FLAGS if add_flags & TYPE_FLAGS else flags
    merged = base | add_flags
    return merged & ~del_flags
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Appends the code words for every (op, av) item of *pattern* to the
    # list *code*, calling itself for nested subpatterns.  Nothing is
    # returned.  Raises error for a repeat operator under SRE_FLAG_TEMPLATE,
    # for a look-behind whose width is not fixed, and for an operator that
    # no branch below handles.
    #
    # Recurring idiom: ``skip = _len(code); emit(0)`` reserves a slot, and
    # ``code[skip] = _len(code) - skip`` later overwrites it with the
    # distance from that slot to the current end of *code*.

    # local aliases for names used inside the loop
    emit = code.append
    _len = len
    LITERAL_CODES = _LITERAL_CODES
    REPEATING_CODES = _REPEATING_CODES
    SUCCESS_CODES = _SUCCESS_CODES
    ASSERT_CODES = _ASSERT_CODES
    # These stay None unless IGNORECASE is set without LOCALE; *fixes* is
    # only set in the UNICODE case, which is what the "# ascii" tests below
    # rely on.
    iscased = None
    tolower = None
    fixes = None
    if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
        if flags & SRE_FLAG_UNICODE:
            iscased = _sre.unicode_iscased
            tolower = _sre.unicode_tolower
            fixes = _ignorecase_fixes
        else:
            iscased = _sre.ascii_iscased
            tolower = _sre.ascii_tolower
    for op, av in pattern:
        if op in LITERAL_CODES:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
                emit(av)
            elif flags & SRE_FLAG_LOCALE:
                emit(OP_LOCALE_IGNORE[op])
                emit(av)
            elif not iscased(av):
                # an uncased character needs no case-insensitive opcode
                emit(op)
                emit(av)
            else:
                lo = tolower(av)
                if not fixes: # ascii
                    emit(OP_IGNORE[op])
                    emit(lo)
                elif lo not in fixes:
                    emit(OP_UNICODE_IGNORE[op])
                    emit(lo)
                else:
                    # lo belongs to an equivalence group: emit a small set
                    # containing lo and every other member of its group
                    emit(IN_UNI_IGNORE)
                    skip = _len(code); emit(0)
                    if op is NOT_LITERAL:
                        emit(NEGATE)
                    for k in (lo,) + fixes[lo]:
                        emit(LITERAL)
                        emit(k)
                    emit(FAILURE)
                    code[skip] = _len(code) - skip
        elif op is IN:
            charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
                emit(IN_LOC_IGNORE)
            elif not hascased:
                emit(IN)
            elif not fixes: # ascii
                emit(IN_IGNORE)
            else:
                emit(IN_UNI_IGNORE)
            skip = _len(code); emit(0)
            _compile_charset(charset, flags, code)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(ANY_ALL)
            else:
                emit(ANY)
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error("internal: unsupported template operator %r" % (op,))
            # av is indexed as (av[0], av[1], av[2]); av[0] and av[1] are
            # emitted as-is and av[2] is the repeated body
            if _simple(av[2]):
                emit(REPEATING_CODES[op][2])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(SUCCESS)
                code[skip] = _len(code) - skip
            else:
                emit(REPEATING_CODES[op][0])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                emit(REPEATING_CODES[op][1])
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            # a truthy group number is bracketed by two MARK words whose
            # arguments are (group-1)*2 and (group-1)*2+1
            if group:
                emit(MARK)
                emit((group-1)*2)
            # _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
            _compile(code, p, _combine_flags(flags, add_flags, del_flags))
            if group:
                emit(MARK)
                emit((group-1)*2+1)
        elif op is ATOMIC_GROUP:
            # Atomic Groups are handled by starting with an Atomic
            # Group op code, then putting in the atomic group pattern
            # and finally a success op code to tell any repeat
            # operations within the Atomic Group to stop eating and
            # pop their stack if they reach it
            emit(ATOMIC_GROUP)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op in SUCCESS_CODES:
            emit(op)
        elif op in ASSERT_CODES:
            emit(op)
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(op)
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(SUCCESS)
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(op)
            # the position code is remapped by MULTILINE first, then by
            # LOCALE or (failing that) UNICODE
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(av)
        elif op is BRANCH:
            emit(op)
            tail = []
            tailappend = tail.append
            # NOTE: the loop variable deliberately reuses the name ``av``;
            # av[1] is evaluated once, before the first rebinding
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(JUMP)
                # remember where this alternative's JUMP argument lives
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(FAILURE) # end of branch
            # NOTE: ``tail`` is rebound to each recorded position in turn;
            # every JUMP argument is patched to reach the current end
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(op)
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(av)
        elif op is GROUPREF:
            if not flags & SRE_FLAG_IGNORECASE:
                emit(op)
            elif flags & SRE_FLAG_LOCALE:
                emit(GROUPREF_LOC_IGNORE)
            elif not fixes: # ascii
                emit(GROUPREF_IGNORE)
            else:
                emit(GROUPREF_UNI_IGNORE)
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            emit(op)
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            # av[2] is the optional second body; when present a JUMP is
            # placed after the first body so it can skip over the second
            if av[2]:
                emit(JUMP)
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code):
    # internal: append the subprogram for *charset* to the list *code*,
    # finishing it with a FAILURE word
    push = code.append
    for opcode, arg in charset:
        # the operator word always goes first, even for an unsupported one
        push(opcode)
        if opcode is NEGATE:
            # NEGATE carries no argument
            continue
        if opcode is LITERAL:
            push(arg)
        elif opcode is RANGE or opcode is RANGE_UNI_IGNORE:
            push(arg[0])
            push(arg[1])
        elif opcode is CHARSET or opcode is BIGCHARSET:
            # the argument is already a sequence of code words
            code.extend(arg)
        elif opcode is CATEGORY:
            # LOCALE takes precedence over UNICODE when remapping
            if flags & SRE_FLAG_LOCALE:
                arg = CH_LOCALE[arg]
            elif flags & SRE_FLAG_UNICODE:
                arg = CH_UNICODE[arg]
            push(arg)
        else:
            raise error("internal: unsupported set operator %r" % (opcode,))
    push(FAILURE)
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
    # internal: optimize character set
    #
    # Returns a pair (items, hascased).  *items* is either the original
    # *charset* or a rewritten list of (op, av) pairs; *hascased* is a flag
    # that only becomes true when *fixup* is given.
    #   iscased -- predicate called on character codes (only when fixup is set)
    #   fixup   -- callable applied to each code before it is recorded
    #   fixes   -- mapping from a fixed-up code to extra codes to record
    out = []
    tail = []
    # one byte per character code; grown to 65536 entries on demand below
    charmap = bytearray(256)
    hascased = False
    for op, av in charset:
        # The loop body runs at most twice per item: an IndexError from a
        # 256-entry charmap enlarges it and retries via ``continue``.
        while True:
            try:
                if op is LITERAL:
                    if fixup:
                        lo = fixup(av)
                        charmap[lo] = 1
                        if fixes and lo in fixes:
                            for k in fixes[lo]:
                                charmap[k] = 1
                        if not hascased and iscased(av):
                            hascased = True
                    else:
                        charmap[av] = 1
                elif op is RANGE:
                    r = range(av[0], av[1]+1)
                    if fixup:
                        if fixes:
                            for i in map(fixup, r):
                                charmap[i] = 1
                                if i in fixes:
                                    for k in fixes[i]:
                                        charmap[k] = 1
                        else:
                            for i in map(fixup, r):
                                charmap[i] = 1
                        if not hascased:
                            hascased = any(map(iscased, r))
                    else:
                        for i in r:
                            charmap[i] = 1
                elif op is NEGATE:
                    out.append((op, av))
                else:
                    # anything else is passed through after the mapped part
                    tail.append((op, av))
            except IndexError:
                if len(charmap) == 256:
                    # character set contains non-UCS1 character codes
                    charmap += b'\0' * 0xff00
                    continue
                # Character set contains non-BMP character codes.
                if fixup:
                    hascased = True
                    # There are only two ranges of cased non-BMP characters:
                    # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
                    # and for both ranges RANGE_UNI_IGNORE works.
                    if op is RANGE:
                        op = RANGE_UNI_IGNORE
                tail.append((op, av))
            break

    # compress character map
    # Collect runs of consecutive set entries as (start, stop) pairs; once a
    # third run is found, runs is set to None and this form is abandoned.
    runs = []
    q = 0
    while True:
        p = charmap.find(1, q)
        if p < 0:
            break
        if len(runs) >= 2:
            runs = None
            break
        q = charmap.find(0, p)
        if q < 0:
            runs.append((p, len(charmap)))
            break
        runs.append((p, q))
    if runs is not None:
        # use literal/range
        for p, q in runs:
            if q - p == 1:
                out.append((LITERAL, p))
            else:
                out.append((RANGE, (p, q - 1)))
        out += tail
        # if the case was changed or new representation is more compact
        if hascased or len(out) < len(charset):
            return out, hascased
        # else original character set is good enough
        return charset, hascased

    # use bitmap
    if len(charmap) == 256:
        data = _mk_bitmap(charmap)
        out.append((CHARSET, data))
        out += tail
        return out, hascased

    # To represent a big charset, first a bitmap of all characters in the
    # set is constructed. Then, this bitmap is sliced into chunks of 256
    # characters, duplicate chunks are eliminated, and each chunk is
    # given a number. In the compiled expression, the charset is
    # represented by a 32-bit word sequence, consisting of one word for
    # the number of different chunks, a sequence of 256 bytes (64 words)
    # of chunk numbers indexed by their original chunk position, and a
    # sequence of 256-bit chunks (8 words each).

    # Compression is normally good: in a typical charset, large ranges of
    # Unicode will be either completely excluded (e.g. if only cyrillic
    # letters are to be matched), or completely included (e.g. if large
    # subranges of Kanji match). These ranges will be represented by
    # chunks of all one-bits or all zero-bits.

    # Matching can be also done efficiently: the more significant byte of
    # the Unicode character is an index into the chunk number, and the
    # less significant byte is a bit index in the chunk (just like the
    # CHARSET matching).

    charmap = bytes(charmap) # should be hashable
    comps = {}                  # chunk bytes -> number assigned to that chunk
    mapping = bytearray(256)    # chunk position -> chunk number
    block = 0                   # count of distinct chunks seen so far
    data = bytearray()          # the distinct chunks, concatenated
    for i in range(0, 65536, 256):
        chunk = charmap[i: i + 256]
        if chunk in comps:
            mapping[i // 256] = comps[chunk]
        else:
            mapping[i // 256] = comps[chunk] = block
            block += 1
            data += chunk
    data = _mk_bitmap(data)
    # prepend the chunk count and the position table to the bitmap words
    data[0:0] = [block] + _bytes_to_codes(mapping)
    out.append((BIGCHARSET, data))
    out += tail
    return out, hascased
# Number of bits in one code word, and the largest value a word can hold.
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
# Translation table: byte 0 becomes the digit '0', any other byte becomes '1'.
_BITS_TRANS = b'0' + b'1' * 255

def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    # internal: pack a bytes-like object of per-position marks into a list
    # of integers, _CODEBITS positions per integer; position 0 lands in the
    # least significant bit of the first integer
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    for stop in range(len(digits), 0, -_CODEBITS):
        words.append(_int(digits[stop - _CODEBITS: stop], 2))
    return words
def _bytes_to_codes(b):
    # internal: reinterpret the raw bytes of *b* as a list of native
    # unsigned ints (memoryview format 'I')
    words = memoryview(b).cast('I')
    word_size = words.itemsize
    # the 'I' item size must agree with the engine's code word size
    assert word_size == _sre.CODESIZE
    assert len(b) == len(words) * word_size
    return words.tolist()
def _simple(p):
    # internal: true when *p* holds exactly one item and that item is a unit
    # operator, possibly nested in subpatterns whose first field is None
    if len(p) == 1:
        op, av = p[0]
        if op is not SUBPATTERN:
            return op in _UNIT_CODES
        return av[0] is None and _simple(av[-1])
    return False
def _generate_overlap_table(prefix):
    """
    Build the self-overlap table of *prefix*.

    The result is a list as long as the prefix.  Entry i is the length k of
    the longest run ending at index i that is also a start of the prefix:
    - 0 means nothing ending at i matches a start of the prefix
    - k with 0 < k <= i means prefix[i-k+1:i+1] equals prefix[0:k]
    """
    size = len(prefix)
    overlap = [0] * size
    for pos in range(1, size):
        current = prefix[pos]
        matched = overlap[pos - 1]
        # fall back to shorter overlaps until one can be extended
        while matched and prefix[matched] != current:
            matched = overlap[matched - 1]
        if prefix[matched] == current:
            matched += 1
        overlap[pos] = matched
    return overlap
def _get_iscased(flags):
    # internal: return the _sre "is cased" predicate matching *flags*, or
    # None when SRE_FLAG_IGNORECASE is not set
    if flags & SRE_FLAG_IGNORECASE:
        if flags & SRE_FLAG_UNICODE:
            return _sre.unicode_iscased
        return _sre.ascii_iscased
    return None
def _get_literal_prefix(pattern, flags):
    # look for literal prefix
    #
    # Returns (prefix, prefix_skip, got_all):
    #   prefix      -- list of the LITERAL values found at the start
    #   prefix_skip -- None, or an index into prefix (see below)
    #   got_all     -- True only when every item of pattern.data was consumed
    prefix = []
    prefixappend = prefix.append
    prefix_skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            # a cased character under IGNORECASE ends the prefix
            if iscased and iscased(av):
                break
            prefixappend(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, p = av
            flags1 = _combine_flags(flags, add_flags, del_flags)
            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
                break
            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
            # prefix_skip is set once: to the prefix length reached when the
            # first subpattern with a non-None group starts, or to that
            # length plus the nested skip
            if prefix_skip is None:
                if group is not None:
                    prefix_skip = len(prefix)
                elif prefix_skip1 is not None:
                    prefix_skip = len(prefix) + prefix_skip1
            prefix.extend(prefix1)
            if not got_all:
                break
        else:
            break
    else:
        # the loop ended without a break: the whole pattern is literal
        return prefix, prefix_skip, True
    return prefix, prefix_skip, False
def _get_charset_prefix(pattern, flags):
    # internal: look for a character set that the first item of *pattern*
    # is restricted to.  Returns a list of (op, av) pairs, or None when no
    # such set is found.

    # Descend through leading SUBPATTERN items, folding their flag changes
    # into *flags*; *pattern* is rebound to the nested pattern each time.
    while True:
        if not pattern.data:
            return None
        op, av = pattern.data[0]
        if op is not SUBPATTERN:
            break
        group, add_flags, del_flags, pattern = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)
    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]
    elif op is BRANCH:
        # usable only if every alternative is non-empty and starts with a
        # LITERAL that is not cased under the active flags
        charset = []
        charsetappend = charset.append
        for p in av[1]:
            if not p:
                return None
            op, av = p[0]
            if op is LITERAL and not (iscased and iscased(av)):
                charsetappend((op, av))
            else:
                return None
        return charset
    elif op is IN:
        charset = av
        if iscased:
            # reject the set if any member is cased, or if a range extends
            # beyond 0xffff
            for op, av in charset:
                if op is LITERAL:
                    if iscased(av):
                        return None
                elif op is RANGE:
                    if av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(av[0], av[1]+1))):
                        return None
        return charset
    return None
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    #
    # The words are appended to the list *code*; nothing is returned.
    lo, hi = pattern.getwidth()
    if hi > MAXCODE:
        hi = MAXCODE
    if lo == 0:
        # minimum width 0: emit a fixed five-word block and stop
        code.extend([INFO, 4, 0, lo, hi])
        return
    # look for a literal prefix
    prefix = []
    prefix_skip = 0
    charset = [] # not used
    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
        # look for literal prefix
        # NOTE: got_all is only bound on this path; it is read below only
        # when prefix is non-empty, which also only happens on this path
        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
        # if no prefix, look for charset prefix
        if not prefix:
            charset = _get_charset_prefix(pattern, flags)
##     if prefix:
##         print("*** PREFIX", prefix, prefix_skip)
##     if charset:
##         print("*** CHARSET", charset)
    # add an info block
    emit = code.append
    emit(INFO)
    # reserve a slot; patched on the last line with the block length
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if prefix_skip is None and got_all:
            mask = mask | SRE_INFO_LITERAL
    elif charset:
        mask = mask | SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    emit(min(hi, MAXCODE))
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        if prefix_skip is None:
            prefix_skip = len(prefix)
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table
        code.extend(_generate_overlap_table(prefix))
    elif charset:
        charset, hascased = _optimize_charset(charset)
        assert not hascased
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
def isstring(obj):
    """Return True when *obj* is a str or bytes instance, otherwise False."""
    return isinstance(obj, str) or isinstance(obj, bytes)
def _code(p, flags):
    # internal: build the complete code list for the parsed pattern *p*:
    # an info block, then the pattern body, then a closing SUCCESS word
    combined = p.state.flags | flags
    program = []
    _compile_info(program, p, combined)
    _compile(program, p.data, combined)
    program.append(SUCCESS)
    return program
def _hex_code(code):
    # internal: render *code* as a bracketed, comma-separated list of
    # zero-padded hexadecimal words
    width = _sre.CODESIZE*2 + 2  # two hex digits per byte plus the '0x' prefix
    words = ['%#0*x' % (width, word) for word in code]
    return '[%s]' % ', '.join(words)
def dis(code):
    # Print a human-readable listing of the code word list *code* to
    # standard output.  Nothing is returned; an opcode that no branch
    # below recognises raises ValueError.
    import sys

    labels = set()      # offsets named as a target by an earlier line
    level = 0           # current nesting depth, used for indentation
    offset_width = len(str(len(code) - 1))

    def dis_(start, end):
        # List the words in code[start:end], recursing into nested blocks.
        def print_(*args, to=None):
            # Print one listing line prefixed with the current ``start``
            # offset; ``to`` records and displays a target offset.
            if to is not None:
                labels.add(to)
                args += ('(to %d)' % (to,),)
            print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
                  end=' '*(level-1))
            print(*args)

        def print_2(*args):
            # Print a continuation line that carries no offset prefix.
            print(end=' '*(offset_width + 2*level))
            print(*args)

        nonlocal level
        level += 1
        i = start
        while i < end:
            # ``start`` is rebound to the offset of the opcode being
            # decoded; print_() reads it through the closure
            start = i
            op = code[i]
            i += 1
            op = OPCODES[op]
            if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
                      MAX_UNTIL, MIN_UNTIL, NEGATE):
                # opcodes without an argument
                print_(op)
            elif op in (LITERAL, NOT_LITERAL,
                        LITERAL_IGNORE, NOT_LITERAL_IGNORE,
                        LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
                        LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, '%#02x (%r)' % (arg, chr(arg)))
            elif op is AT:
                arg = code[i]
                i += 1
                arg = str(ATCODES[arg])
                assert arg[:3] == 'AT_'
                print_(op, arg[3:])
            elif op is CATEGORY:
                arg = code[i]
                i += 1
                arg = str(CHCODES[arg])
                assert arg[:9] == 'CATEGORY_'
                print_(op, arg[9:])
            elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op in (RANGE, RANGE_UNI_IGNORE):
                lo, hi = code[i: i+2]
                i += 2
                print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
            elif op is CHARSET:
                # 256 bits of bitmap follow, packed into code words
                print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
                i += 256//_CODEBITS
            elif op is BIGCHARSET:
                # layout read here: chunk count, then 256 bytes of chunk
                # numbers packed into words, then ``arg`` bitmaps
                arg = code[i]
                i += 1
                mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
                                        for x in code[i: i + 256//_sre.CODESIZE]))
                print_(op, arg, mapping)
                i += 256//_sre.CODESIZE
                level += 1
                for j in range(arg):
                    print_2(_hex_code(code[i: i + 256//_CODEBITS]))
                    i += 256//_CODEBITS
                level -= 1
            elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
                        GROUPREF_LOC_IGNORE):
                arg = code[i]
                i += 1
                print_(op, arg)
            elif op is JUMP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                i += 1
            elif op is BRANCH:
                skip = code[i]
                print_(op, skip, to=i+skip)
                # each alternative starts with its own skip word; a skip
                # of 0 ends the list of alternatives
                while skip:
                    dis_(i+1, i+skip)
                    i += skip
                    start = i
                    skip = code[i]
                    if skip:
                        print_('branch', skip, to=i+skip)
                    else:
                        print_(FAILURE)
                i += 1
            elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
                        POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
                # NOTE: min and max shadow the builtins inside dis_()
                skip, min, max = code[i: i+3]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, min, max, to=i+skip)
                dis_(i+3, i+skip)
                i += skip
            elif op is GROUPREF_EXISTS:
                arg, skip = code[i: i+2]
                print_(op, arg, skip, to=i+skip)
                i += 2
            elif op in (ASSERT, ASSERT_NOT):
                skip, arg = code[i: i+2]
                print_(op, skip, arg, to=i+skip)
                dis_(i+2, i+skip)
                i += skip
            elif op is ATOMIC_GROUP:
                skip = code[i]
                print_(op, skip, to=i+skip)
                dis_(i+1, i+skip)
                i += skip
            elif op is INFO:
                skip, flags, min, max = code[i: i+4]
                if max == MAXREPEAT:
                    max = 'MAXREPEAT'
                print_(op, skip, bin(flags), min, max, to=i+skip)
                start = i+4
                if flags & SRE_INFO_PREFIX:
                    # prefix length and skip, then the prefix words, then
                    # an overlap table of the same length
                    prefix_len, prefix_skip = code[i+4: i+6]
                    print_2(' prefix_skip', prefix_skip)
                    start = i + 6
                    prefix = code[start: start+prefix_len]
                    print_2(' prefix',
                            '[%s]' % ', '.join('%#02x' % x for x in prefix),
                            '(%r)' % ''.join(map(chr, prefix)))
                    start += prefix_len
                    print_2(' overlap', code[start: start+prefix_len])
                    start += prefix_len
                if flags & SRE_INFO_CHARSET:
                    level += 1
                    print_2('in')
                    dis_(start, i+skip)
                    level -= 1
                i += skip
            else:
                raise ValueError(op)

        level -= 1

    dis_(0, len(code))
def compile(p, flags=0):
    # internal: turn a pattern string, or an already parsed pattern, into
    # the object returned by _sre.compile()
    source = None
    if isstring(p):
        source = p
        p = _parser.parse(source, flags)

    program = _code(p, flags)
    if flags & SRE_FLAG_DEBUG:
        print()
        dis(program)

    # build the group lookup in both directions: name -> index comes from
    # the parser state, index -> name is derived from it
    state = p.state
    name_to_index = state.groupdict
    index_to_name = [None] * state.groups
    for name, index in name_to_index.items():
        index_to_name[index] = name

    return _sre.compile(
        source, flags | state.flags, program,
        state.groups - 1,
        name_to_index, tuple(index_to_name),
    )

262
Lib/re/_constants.py Normal file
View file

@ -0,0 +1,262 @@
#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#
"""Internal support module for sre"""
# update when constants are added or removed
# The __main__ section of this file writes this value to the generated
# header as SRE_MAGIC.
MAGIC = 20220318
from _sre import MAXREPEAT, MAXGROUPS
# Exception type shared by the regular expression modules.  Its __module__
# is set to 're', so it is displayed as re.error.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The index in the pattern where compilation failed (may be None)
        lineno: The line corresponding to pos (may be None)
        colno: The column corresponding to pos (may be None)
    """

    __module__ = 're'

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is None or pos is None:
            # no location available: keep the message exactly as given
            self.lineno = self.colno = None
            super().__init__(msg)
            return
        text = '%s at position %d' % (msg, pos)
        # the pattern may be str or bytes; count newlines of the same type
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.lineno = pattern.count(newline, 0, pos) + 1
        self.colno = pos - pattern.rfind(newline, 0, pos)
        # line and column are only reported for multi-line patterns
        if newline in pattern:
            text = '%s (line %d, column %d)' % (text, self.lineno, self.colno)
        super().__init__(text)
class _NamedIntConstant(int):
    """An int that also carries a symbolic name and uses it as its repr."""

    def __new__(cls, value, name):
        constant = super().__new__(cls, value)
        constant.name = name
        return constant

    def __repr__(self):
        return self.name

# Rebind the imported limit as a named constant so that its repr is the
# string 'MAXREPEAT'.
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
def _makecodes(names):
    # internal: number the whitespace-separated *names* from 0 in order,
    # publish each resulting constant as a module global under its own
    # name, and return the constants as a list
    codes = []
    for number, label in enumerate(names.split()):
        codes.append(_NamedIntConstant(number, label))
    globals().update((code.name, code) for code in codes)
    return codes
# operators
# failure=0 success=1 (just because it looks better that way :-)
# _makecodes() numbers the names in the order written, so a name's position
# in the string is its numeric value.
OPCODES = _makecodes("""
FAILURE SUCCESS
ANY ANY_ALL
ASSERT ASSERT_NOT
AT
BRANCH
CALL
CATEGORY
CHARSET BIGCHARSET
GROUPREF GROUPREF_EXISTS
IN
INFO
JUMP
LITERAL
MARK
MAX_UNTIL
MIN_UNTIL
NOT_LITERAL
NEGATE
RANGE
REPEAT
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
ATOMIC_GROUP
POSSESSIVE_REPEAT
POSSESSIVE_REPEAT_ONE
GROUPREF_IGNORE
IN_IGNORE
LITERAL_IGNORE
NOT_LITERAL_IGNORE
GROUPREF_LOC_IGNORE
IN_LOC_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
GROUPREF_UNI_IGNORE
IN_UNI_IGNORE
LITERAL_UNI_IGNORE
NOT_LITERAL_UNI_IGNORE
RANGE_UNI_IGNORE
MIN_REPEAT MAX_REPEAT
""")
# The two names stay defined as module globals (set by _makecodes); they are
# only dropped from the OPCODES list.
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT

# positions
ATCODES = _makecodes("""
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
AT_BOUNDARY AT_NON_BOUNDARY
AT_END AT_END_LINE AT_END_STRING
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")

# categories
CHCODES = _makecodes("""
CATEGORY_DIGIT CATEGORY_NOT_DIGIT
CATEGORY_SPACE CATEGORY_NOT_SPACE
CATEGORY_WORD CATEGORY_NOT_WORD
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
# replacement operations for "ignore case" mode
# Each table maps a plain literal opcode to its case-insensitive variant:
# generic, locale-aware and Unicode-aware respectively.
OP_IGNORE = {
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE,
}

OP_LOCALE_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}

OP_UNICODE_IGNORE = {
    LITERAL: LITERAL_UNI_IGNORE,
    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}

# Position-code replacements: AT_MULTILINE swaps the plain beginning/end
# codes for their per-line forms; AT_LOCALE and AT_UNICODE swap the boundary
# codes for their locale/Unicode forms.
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE
}

AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}

AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}

# Category-code replacements.  In CH_LOCALE only the WORD / NOT_WORD entries
# map to a different code; every other category maps to itself.
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}

# In CH_UNICODE every category maps to its CATEGORY_UNI_* counterpart.
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
# flags
# Every value is a distinct power of two, so flags can be combined with |
# and tested with &.
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"

# flags for INFO primitive
# These are separate bit values with their own numbering, distinct from the
# SRE_FLAG_* set above.
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
# Script mode: write the constants defined above to a C header file named
# "sre_constants.h", opened by relative path, then print "done".
if __name__ == "__main__":
    def dump(f, d, prefix):
        # Write one "#define <prefix>_<item> <number>" line per entry of
        # *d*, in sorted order.  The same item is formatted with %s and %d;
        # presumably %s renders the symbolic name via the constant's
        # __repr__ -- TODO confirm.
        items = sorted(d)
        for item in items:
            f.write("#define %s_%s %d\n" % (prefix, item, item))
    with open("sre_constants.h", "w") as f:
        f.write("""\
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by Lib/re/_constants.py. If you need
* to change anything in here, edit Lib/re/_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")

        f.write("#define SRE_MAGIC %d\n" % MAGIC)

        # opcodes, then position codes, then category codes
        dump(f, OPCODES, "SRE_OP")
        dump(f, ATCODES, "SRE")
        dump(f, CHCODES, "SRE")

        # pattern flags
        f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
        f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
        f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
        f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
        f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
        f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
        f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
        f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
        f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)

        # INFO block flags
        f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
        f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
        f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)

    print("done")

1079
Lib/re/_parser.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,800 +1,7 @@
# import warnings
# Secret Labs' Regular Expression Engine warnings.warn(f"module {__name__!r} is deprecated",
# DeprecationWarning,
# convert template to internal format stacklevel=2)
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
"""Internal support module for sre""" from re import _compiler as _
globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'})
import _sre
import sre_parse
from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
# Opcode groups used for membership tests by _compile() and _simple().
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES.union({ANY, IN})

# For each parsed repeat operator: (opcode emitted before a general body,
# opcode emitted after a general body, opcode used when the body is
# "simple").  _compile() indexes these tuples with 0, 1 and 2.
_REPEATING_CODES = {
    MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
    MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
    POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
# Sets of lowercase characters which have the same uppercase.
# Each tuple lists code points; the comment above a tuple gives the
# character names and the trailing comment shows the characters.
_equivalences = (
    # LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
    (0x69, 0x131), # iı
    # LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
    (0x73, 0x17f), # sſ
    # MICRO SIGN, GREEK SMALL LETTER MU
    (0xb5, 0x3bc), # µμ
    # COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
    (0x345, 0x3b9, 0x1fbe), # \u0345ι
    # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
    (0x390, 0x1fd3), # ΐΐ
    # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
    (0x3b0, 0x1fe3), # ΰΰ
    # GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
    (0x3b2, 0x3d0), # βϐ
    # GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
    (0x3b5, 0x3f5), # εϵ
    # GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
    (0x3b8, 0x3d1), # θϑ
    # GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
    (0x3ba, 0x3f0), # κϰ
    # GREEK SMALL LETTER PI, GREEK PI SYMBOL
    (0x3c0, 0x3d6), # πϖ
    # GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
    (0x3c1, 0x3f1), # ρϱ
    # GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
    (0x3c2, 0x3c3), # ςσ
    # GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
    (0x3c6, 0x3d5), # φϕ
    # LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
    (0x1e61, 0x1e9b), # ṡẛ
    # LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
    (0xfb05, 0xfb06), # ſtst
)

# Maps the lowercase code to lowercase codes which have the same uppercase:
# every member of a group maps to a tuple of the group's other members, in
# the group's original order.
_ignorecase_fixes = {
    member: tuple(other for other in group if other != member)
    for group in _equivalences
    for member in group
}
def _combine_flags(flags, add_flags, del_flags,
                   TYPE_FLAGS=sre_parse.TYPE_FLAGS):
    """Return flags with add_flags switched on and del_flags switched off.

    When add_flags contains any bit of TYPE_FLAGS, all TYPE_FLAGS bits
    already present in flags are dropped first, so the added type flag
    replaces the previous one instead of combining with it.
    """
    base = flags & ~TYPE_FLAGS if add_flags & TYPE_FLAGS else flags
    return (base | add_flags) & ~del_flags
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
#
# Appends to the list `code` the words for every (op, av) item of
# `pattern`, calling itself for nested sub-patterns.  Nothing is returned;
# `code` is extended in place.  Raises `error` for repeat operators under
# SRE_FLAG_TEMPLATE, for variable-width look-behind, and for unknown ops.
emit = code.append
_len = len
LITERAL_CODES = _LITERAL_CODES
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
# Case helpers stay None unless IGNORECASE is set without LOCALE;
# `fixes` is only set in the UNICODE case.
iscased = None
tolower = None
fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
for op, av in pattern:
if op in LITERAL_CODES:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOCALE_IGNORE[op])
emit(av)
elif not iscased(av):
# uncased character: the case-sensitive opcode is emitted
emit(op)
emit(av)
else:
lo = tolower(av)
if not fixes: # ascii
emit(OP_IGNORE[op])
emit(lo)
elif lo not in fixes:
emit(OP_UNICODE_IGNORE[op])
emit(lo)
else:
# `lo` has entries in `fixes`: emit a small set holding `lo` and
# each of fixes[lo], negated when op is NOT_LITERAL.
emit(IN_UNI_IGNORE)
# Idiom used throughout: remember the index of a placeholder word,
# then overwrite it with the distance from that index to the current
# end of `code`.
skip = _len(code); emit(0)
if op is NOT_LITERAL:
emit(NEGATE)
for k in (lo,) + fixes[lo]:
emit(LITERAL)
emit(k)
emit(FAILURE)
code[skip] = _len(code) - skip
elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL)
else:
emit(ANY)
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
# av is indexed as av[0], av[1] (emitted as-is) and av[2] (the body).
if _simple(av[2]):
# simple body: single *_ONE opcode, body terminated by SUCCESS
emit(REPEATING_CODES[op][2])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
else:
# general body: opening opcode, body, then the closing opcode
# emitted after the skip word has been patched
emit(REPEATING_CODES[op][0])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
emit(REPEATING_CODES[op][1])
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
# A truthy group number is bracketed by two MARK words:
# (group-1)*2 before the body and (group-1)*2+1 after it.
if group:
emit(MARK)
emit((group-1)*2)
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
if group:
emit(MARK)
emit((group-1)*2+1)
elif op is ATOMIC_GROUP:
# Atomic Groups are handled by starting with an Atomic
# Group op code, then putting in the atomic group pattern
# and finally a success op code to tell any repeat
# operations within the Atomic Group to stop eating and
# pop their stack if they reach it
emit(ATOMIC_GROUP)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
emit(op)
skip = _len(code); emit(0)
# av[0] >= 0 selects look-ahead (width word 0); otherwise the fixed
# width reported by av[1].getwidth() is emitted.
if av[0] >= 0:
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
_compile(code, av[1], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is CALL:
emit(op)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is AT:
emit(op)
# Position codes are remapped per flag; codes without an entry in the
# table are left unchanged by .get(av, av).
if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av)
emit(av)
elif op is BRANCH:
emit(op)
# `tail` collects the index of each alternative's JUMP placeholder so
# they can all be patched once the end of the branch is known.
tail = []
tailappend = tail.append
# NOTE: this loop rebinds `av` to each alternative in turn.
for av in av[1]:
skip = _len(code); emit(0)
# _compile_info(code, av, flags)
_compile(code, av, flags)
emit(JUMP)
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
emit(FAILURE) # end of branch
# NOTE: this loop rebinds `tail` from the list to each stored index.
for tail in tail:
code[tail] = _len(code) - tail
elif op is CATEGORY:
emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
av = CH_UNICODE[av]
emit(av)
elif op is GROUPREF:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
elif flags & SRE_FLAG_LOCALE:
emit(GROUPREF_LOC_IGNORE)
elif not fixes: # ascii
emit(GROUPREF_IGNORE)
else:
emit(GROUPREF_UNI_IGNORE)
emit(av-1)
elif op is GROUPREF_EXISTS:
emit(op)
emit(av[0]-1)
skipyes = _len(code); emit(0)
_compile(code, av[1], flags)
# av[2] is the optional second body; when present a JUMP over it is
# emitted after the first body.  Both skipyes patches add 1 to the
# distance.
if av[2]:
emit(JUMP)
skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = _len(code) - skipno
else:
code[skipyes] = _len(code) - skipyes + 1
else:
raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code):
    """Append the words of a charset subprogram to code.

    charset is an iterable of (op, av) pairs.  Each op is emitted followed
    by its operand words, and the whole sequence is terminated with
    FAILURE.  Raises error for an op that is not a known set operator.
    """
    append = code.append
    for opcode, arg in charset:
        append(opcode)
        if opcode is NEGATE:
            # NEGATE carries no operand
            continue
        if opcode is LITERAL:
            append(arg)
        elif opcode is RANGE or opcode is RANGE_UNI_IGNORE:
            append(arg[0])
            append(arg[1])
        elif opcode is CHARSET or opcode is BIGCHARSET:
            # the operand is already a sequence of code words
            code.extend(arg)
        elif opcode is CATEGORY:
            # translate the category for the active "locale" flag, if any
            if flags & SRE_FLAG_LOCALE:
                arg = CH_LOCALE[arg]
            elif flags & SRE_FLAG_UNICODE:
                arg = CH_UNICODE[arg]
            append(arg)
        else:
            raise error("internal: unsupported set operator %r" % (opcode,))
    append(FAILURE)
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
#
# Takes a list of (op, av) set items and returns a pair
# (items, hascased).  LITERAL and RANGE items are folded into a byte map
# (one byte per code), which is then re-expressed as literals/ranges, a
# CHARSET bitmap or a BIGCHARSET.  `fixup` (when given) is applied to each
# code before it is recorded, `fixes` supplies extra codes to record, and
# `iscased` is used to compute `hascased`.
out = []
tail = []
charmap = bytearray(256)
hascased = False
for op, av in charset:
# The inner loop runs the body once, or twice when the first attempt
# hits an IndexError with a 256-entry map and the map is enlarged.
while True:
try:
if op is LITERAL:
if fixup:
lo = fixup(av)
charmap[lo] = 1
if fixes and lo in fixes:
for k in fixes[lo]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
if fixes:
for i in map(fixup, r):
charmap[i] = 1
if i in fixes:
for k in fixes[i]:
charmap[k] = 1
else:
for i in map(fixup, r):
charmap[i] = 1
if not hascased:
hascased = any(map(iscased, r))
else:
for i in r:
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
else:
# any other item is kept unchanged and re-appended after the
# optimized part
tail.append((op, av))
except IndexError:
if len(charmap) == 256:
# character set contains non-UCS1 character codes
# grow the map from 256 to 65536 entries and retry this item
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
if fixup:
hascased = True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_UNI_IGNORE works.
if op is RANGE:
op = RANGE_UNI_IGNORE
tail.append((op, av))
break
# compress character map
# Collect up to two runs of consecutive 1-bytes as (start, stop) pairs;
# `runs` becomes None as soon as a third run is found.
runs = []
q = 0
while True:
p = charmap.find(1, q)
if p < 0:
break
if len(runs) >= 2:
runs = None
break
q = charmap.find(0, p)
if q < 0:
runs.append((p, len(charmap)))
break
runs.append((p, q))
if runs is not None:
# use literal/range
for p, q in runs:
if q - p == 1:
out.append((LITERAL, p))
else:
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
if hascased or len(out) < len(charset):
return out, hascased
# else original character set is good enough
return charset, hascased
# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
out += tail
return out, hascased
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 32-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
charmap = bytes(charmap) # should be hashable
# comps: chunk bytes -> block number; mapping: chunk position -> block
# number; data: the distinct chunks concatenated in first-seen order.
comps = {}
mapping = bytearray(256)
block = 0
data = bytearray()
for i in range(0, 65536, 256):
chunk = charmap[i: i + 256]
if chunk in comps:
mapping[i // 256] = comps[chunk]
else:
mapping[i // 256] = comps[chunk] = block
block += 1
data += chunk
data = _mk_bitmap(data)
# prepend the block count and the chunk-number table to the bitmap words
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
return out, hascased
# Number of bits in one code word (_sre.CODESIZE is a byte count).
_CODEBITS = 8 * _sre.CODESIZE
# Largest value that fits in one code word.
MAXCODE = (1 << _CODEBITS) - 1
# Translation table used by _mk_bitmap(): byte 0 becomes b'0' and every
# other byte value becomes b'1'.
_BITS_TRANS = b'0' + 255 * b'1'
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
    """Pack a byte-per-flag map into a list of integer code words.

    Each byte of bits is translated to the character 0 or 1 through
    _BITS_TRANS and the result is reversed, so that bits[0] becomes the
    least significant bit of the first word.  Words are then read off in
    groups of _CODEBITS binary digits.
    """
    digits = bits.translate(_BITS_TRANS)[::-1]
    words = []
    # walk from the end of the reversed string back towards its start
    for stop in range(len(digits), 0, -_CODEBITS):
        words.append(_int(digits[stop - _CODEBITS: stop], 2))
    return words
def _bytes_to_codes(b):
    """Convert block indices to a word array.

    Reinterprets the bytes-like object b as unsigned ints (format 'I') and
    returns them as a list.  The assertions check that the item size equals
    _sre.CODESIZE and that no bytes were lost in the cast.
    """
    words = memoryview(b).cast('I')
    word_size = words.itemsize
    assert word_size == _sre.CODESIZE
    assert len(words) * word_size == len(b)
    return words.tolist()
def _simple(p):
    """Check if this subpattern is a "simple" operator.

    A pattern is simple when it has exactly one item whose op is in
    _UNIT_CODES, possibly wrapped in any number of SUBPATTERN items whose
    first field (av[0]) is None.
    """
    # Unwrap SUBPATTERN layers iteratively instead of recursing.
    while len(p) == 1:
        op, av = p[0]
        if op is not SUBPATTERN:
            return op in _UNIT_CODES
        if av[0] is not None:
            return False
        p = av[-1]
    return False
def _generate_overlap_table(prefix):
    """
    Build the self-overlap table of prefix.

    The result has one entry per element of prefix.  Entry i is the
    length k of the longest proper run ending at i that also starts the
    prefix, i.e. prefix[i-k+1:i+1] == prefix[0:k]; it is 0 when there is
    no such overlap.
    """
    size = len(prefix)
    table = [0] * size
    for pos in range(1, size):
        current = prefix[pos]
        # Start from the overlap of the previous position and fall back
        # through shorter overlaps until one can be extended.
        k = table[pos - 1]
        while k and current != prefix[k]:
            k = table[k - 1]
        if current == prefix[k]:
            table[pos] = k + 1
    return table
def _get_iscased(flags):
    """Return the _sre "is cased" predicate matching flags, or None.

    None is returned when SRE_FLAG_IGNORECASE is not set; otherwise the
    unicode predicate is used under SRE_FLAG_UNICODE and the ascii one in
    every other case.
    """
    if flags & SRE_FLAG_IGNORECASE:
        if flags & SRE_FLAG_UNICODE:
            return _sre.unicode_iscased
        return _sre.ascii_iscased
    return None
def _get_literal_prefix(pattern, flags):
    """Look for a literal prefix at the start of pattern.

    Returns (prefix, prefix_skip, got_all): prefix is the list of leading
    literal codes, prefix_skip is None or an offset into prefix recorded
    at the first SUBPATTERN that contributes one, and got_all is True only
    when every item of pattern.data was consumed.
    """
    literals = []
    skip = None
    iscased = _get_iscased(flags)
    for op, av in pattern.data:
        if op is LITERAL:
            # a cased literal under IGNORECASE ends the prefix
            if iscased and iscased(av):
                return literals, skip, False
            literals.append(av)
        elif op is SUBPATTERN:
            group, add_flags, del_flags, sub = av
            sub_flags = _combine_flags(flags, add_flags, del_flags)
            if sub_flags & SRE_FLAG_IGNORECASE and sub_flags & SRE_FLAG_LOCALE:
                return literals, skip, False
            sub_literals, sub_skip, complete = _get_literal_prefix(sub, sub_flags)
            # only the first skip position found is kept
            if skip is None:
                if group is not None:
                    skip = len(literals)
                elif sub_skip is not None:
                    skip = len(literals) + sub_skip
            literals.extend(sub_literals)
            if not complete:
                return literals, skip, False
        else:
            return literals, skip, False
    return literals, skip, True
def _get_charset_prefix(pattern, flags):
    """Return a charset describing the first item of pattern, or None.

    Leading SUBPATTERN wrappers are unwrapped (combining their flags)
    until a non-SUBPATTERN first item is found.  A LITERAL gives a
    one-item charset, a BRANCH of single leading literals gives the list
    of those literals, and an IN gives its own item list.  None is
    returned when the pattern is empty, when IGNORECASE and LOCALE are
    both set, or when a cased character is involved under IGNORECASE.
    """
    while True:
        if not pattern.data:
            return None
        op, av = pattern.data[0]
        if op is not SUBPATTERN:
            break
        group, add_flags, del_flags, pattern = av
        flags = _combine_flags(flags, add_flags, del_flags)
        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
            return None

    iscased = _get_iscased(flags)

    if op is LITERAL:
        if iscased and iscased(av):
            return None
        return [(op, av)]

    if op is BRANCH:
        collected = []
        for alternative in av[1]:
            if not alternative:
                return None
            op, av = alternative[0]
            # every alternative must start with an acceptable literal
            if op is not LITERAL or (iscased and iscased(av)):
                return None
            collected.append((op, av))
        return collected

    if op is IN:
        items = av
        if iscased:
            for op, av in items:
                if op is LITERAL:
                    if iscased(av):
                        return None
                elif op is RANGE:
                    if av[1] > 0xffff:
                        return None
                    if any(map(iscased, range(av[0], av[1]+1))):
                        return None
        return items

    return None
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
# this contains min/max pattern width, and an optional literal
# prefix or a character map
#
# Appends the block to the list `code`; nothing is returned.
lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
# zero minimum width: fixed five-word block, no prefix or charset
code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
# The prefix/charset search is skipped when IGNORECASE and LOCALE are
# both set.
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern, flags)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
## print("*** CHARSET", charset)
# add an info block
emit = code.append
emit(INFO)
# placeholder for the block length, patched on the last line
skip = len(code); emit(0)
# literal flag
mask = 0
if prefix:
mask = SRE_INFO_PREFIX
# `got_all` is only bound when the search above ran, which is also the
# only way `prefix` can be non-empty here
if prefix_skip is None and got_all:
mask = mask | SRE_INFO_LITERAL
elif charset:
mask = mask | SRE_INFO_CHARSET
emit(mask)
# pattern length
if lo < MAXCODE:
emit(lo)
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
# add literal prefix
if prefix:
emit(len(prefix)) # length
if prefix_skip is None:
prefix_skip = len(prefix)
emit(prefix_skip) # skip
code.extend(prefix)
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip
def isstring(obj):
    """Return True if obj is an instance of str or bytes."""
    return isinstance(obj, str) or isinstance(obj, bytes)
def _code(p, flags):
    """Return the complete code list for the parsed pattern p.

    The flags stored on p.state are merged with the flags argument, then
    the info block and the pattern body are compiled into a fresh list,
    which is terminated with SUCCESS.
    """
    flags = p.state.flags | flags
    words = []
    _compile_info(words, p, flags)   # info block first
    _compile(words, p.data, flags)   # then the pattern itself
    words.append(SUCCESS)
    return words
def _hex_code(code):
    """Format code words as a bracketed, comma-separated list of hex numbers.

    Each word is zero-padded to _sre.CODESIZE*2 hex digits plus the "0x"
    prefix.
    """
    width = _sre.CODESIZE*2 + 2
    rendered = ['%#0*x' % (width, word) for word in code]
    return '[%s]' % ', '.join(rendered)
def dis(code):
# Print a listing of the compiled code list `code` to standard output,
# one opcode per line with its operands; nested blocks are listed by
# recursive calls of dis_() with a deeper `level`.
import sys
# offsets that some printed instruction jumps to (collected via `to=`)
labels = set()
level = 0
# number of digits needed for the largest offset
offset_width = len(str(len(code) - 1))
def dis_(start, end):
# List code[start:end].
def print_(*args, to=None):
# Print one instruction line prefixed with the current `start` offset;
# the offset is followed by ':' when it is a known label, '.' otherwise.
if to is not None:
labels.add(to)
args += ('(to %d)' % (to,),)
print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
end=' '*(level-1))
print(*args)
def print_2(*args):
# Print a continuation line without an offset column.
print(end=' '*(offset_width + 2*level))
print(*args)
nonlocal level
level += 1
i = start
while i < end:
# `start` is rebound to the offset of the instruction being decoded so
# that print_() shows it
start = i
op = code[i]
i += 1
op = OPCODES[op]
if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
MAX_UNTIL, MIN_UNTIL, NEGATE):
# no operands
print_(op)
elif op in (LITERAL, NOT_LITERAL,
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
# one operand, shown as hex and as a character
arg = code[i]
i += 1
print_(op, '%#02x (%r)' % (arg, chr(arg)))
elif op is AT:
arg = code[i]
i += 1
arg = str(ATCODES[arg])
assert arg[:3] == 'AT_'
print_(op, arg[3:])
elif op is CATEGORY:
arg = code[i]
i += 1
arg = str(CHCODES[arg])
assert arg[:9] == 'CATEGORY_'
print_(op, arg[9:])
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
# skip word followed by a nested set, listed recursively
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op in (RANGE, RANGE_UNI_IGNORE):
lo, hi = code[i: i+2]
i += 2
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
elif op is CHARSET:
# 256 bits of bitmap, i.e. 256//_CODEBITS words
print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
elif op is BIGCHARSET:
# block count, then 256 bytes of chunk numbers packed into words,
# then `arg` bitmaps of 256 bits each
arg = code[i]
i += 1
mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
for x in code[i: i + 256//_sre.CODESIZE]))
print_(op, arg, mapping)
i += 256//_sre.CODESIZE
level += 1
for j in range(arg):
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
level -= 1
elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
GROUPREF_LOC_IGNORE):
arg = code[i]
i += 1
print_(op, arg)
elif op is JUMP:
skip = code[i]
print_(op, skip, to=i+skip)
i += 1
elif op is BRANCH:
# a chain of alternatives, each preceded by its skip word; a zero
# skip word ends the chain and is printed as FAILURE
skip = code[i]
print_(op, skip, to=i+skip)
while skip:
dis_(i+1, i+skip)
i += skip
start = i
skip = code[i]
if skip:
print_('branch', skip, to=i+skip)
else:
print_(FAILURE)
i += 1
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
skip, min, max = code[i: i+3]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, min, max, to=i+skip)
dis_(i+3, i+skip)
i += skip
elif op is GROUPREF_EXISTS:
arg, skip = code[i: i+2]
print_(op, arg, skip, to=i+skip)
i += 2
elif op in (ASSERT, ASSERT_NOT):
skip, arg = code[i: i+2]
print_(op, skip, arg, to=i+skip)
dis_(i+2, i+skip)
i += skip
elif op is ATOMIC_GROUP:
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op is INFO:
# skip word, flag mask, min and max width, then an optional prefix
# (length, skip, prefix words, overlap table) or an optional charset
skip, flags, min, max = code[i: i+4]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, bin(flags), min, max, to=i+skip)
start = i+4
if flags & SRE_INFO_PREFIX:
prefix_len, prefix_skip = code[i+4: i+6]
print_2(' prefix_skip', prefix_skip)
start = i + 6
prefix = code[start: start+prefix_len]
print_2(' prefix',
'[%s]' % ', '.join('%#02x' % x for x in prefix),
'(%r)' % ''.join(map(chr, prefix)))
start += prefix_len
print_2(' overlap', code[start: start+prefix_len])
start += prefix_len
if flags & SRE_INFO_CHARSET:
level += 1
print_2('in')
dis_(start, i+skip)
level -= 1
i += skip
else:
raise ValueError(op)
level -= 1
dis_(0, len(code))
def compile(p, flags=0):
    """Convert a pattern to the internal format (internal helper).

    p is either a pattern string (str or bytes), which is parsed with
    sre_parse.parse(), or an already parsed pattern.  Returns the object
    built by _sre.compile().  With SRE_FLAG_DEBUG set, the generated code
    is also printed through dis().
    """
    pattern = None
    if isstring(p):
        pattern = p
        p = sre_parse.parse(p, flags)

    code = _code(p, flags)

    if flags & SRE_FLAG_DEBUG:
        print()
        dis(code)

    # map in either direction: name -> index and index -> name
    state = p.state
    groupindex = state.groupdict
    indexgroup = [None] * state.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        pattern, flags | state.flags, code,
        state.groups-1,
        groupindex, tuple(indexgroup)
    )

View file

@ -1,262 +1,7 @@
# import warnings
# Secret Labs' Regular Expression Engine warnings.warn(f"module {__name__!r} is deprecated",
# DeprecationWarning,
# various symbols used by the regular expression engine. stacklevel=2)
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
"""Internal support module for sre""" from re import _constants as _
globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'})
# update when constants are added or removed
MAGIC = 20220318
from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error)
# should this really be here?
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The index in the pattern where compilation failed (may be None)
        lineno: The line corresponding to pos (may be None)
        colno: The column corresponding to pos (may be None)
    """

    __module__ = 're'

    def __init__(self, msg, pattern=None, pos=None):
        self.msg = msg
        self.pattern = pattern
        self.pos = pos
        if pattern is None or pos is None:
            # no location available
            self.lineno = self.colno = None
        else:
            msg = '%s at position %d' % (msg, pos)
            # use a newline of the same type as the pattern
            newline = '\n' if isinstance(pattern, str) else b'\n'
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)
            # line/column are only reported for multi-line patterns
            if newline in pattern:
                msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
        super().__init__(msg)
class _NamedIntConstant(int):
    """An int that carries a name and uses that name as its repr."""

    def __new__(cls, value, name):
        constant = super().__new__(cls, value)
        constant.name = name
        return constant

    def __repr__(self):
        return self.name
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
def _makecodes(names):
    """Create numbered constants from a whitespace-separated list of names.

    Each name becomes a _NamedIntConstant whose value is its position in
    the list.  Every constant is also stored in this module's globals
    under its own name.  Returns the list of constants.
    """
    items = []
    for value, name in enumerate(names.strip().split()):
        items.append(_NamedIntConstant(value, name))
    # publish only after all constants have been created
    namespace = globals()
    for item in items:
        namespace[item.name] = item
    return items
# operators
# failure=0 success=1 (just because it looks better that way :-)
# _makecodes() numbers the names by position, so the order of the names
# below determines each opcode's value.
OPCODES = _makecodes("""
FAILURE SUCCESS
ANY ANY_ALL
ASSERT ASSERT_NOT
AT
BRANCH
CALL
CATEGORY
CHARSET BIGCHARSET
GROUPREF GROUPREF_EXISTS
IN
INFO
JUMP
LITERAL
MARK
MAX_UNTIL
MIN_UNTIL
NOT_LITERAL
NEGATE
RANGE
REPEAT
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
ATOMIC_GROUP
POSSESSIVE_REPEAT
POSSESSIVE_REPEAT_ONE
GROUPREF_IGNORE
IN_IGNORE
LITERAL_IGNORE
NOT_LITERAL_IGNORE
GROUPREF_LOC_IGNORE
IN_LOC_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
GROUPREF_UNI_IGNORE
IN_UNI_IGNORE
LITERAL_UNI_IGNORE
NOT_LITERAL_UNI_IGNORE
RANGE_UNI_IGNORE
MIN_REPEAT MAX_REPEAT
""")
# The last two names stay defined as module-level constants (set by
# _makecodes) but are dropped from the OPCODES list itself.
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
# positions
# Numbered independently of OPCODES, starting again from 0.
ATCODES = _makecodes("""
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
AT_BOUNDARY AT_NON_BOUNDARY
AT_END AT_END_LINE AT_END_STRING
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")
# categories
# Numbered independently of OPCODES and ATCODES, starting again from 0.
CHCODES = _makecodes("""
CATEGORY_DIGIT CATEGORY_NOT_DIGIT
CATEGORY_SPACE CATEGORY_NOT_SPACE
CATEGORY_WORD CATEGORY_NOT_WORD
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
# replacement operations for "ignore case" mode

# Each table maps a base opcode to the variant used for one flavour of
# case-insensitive matching (plain, locale, unicode).
OP_IGNORE = {
    LITERAL: LITERAL_IGNORE,
    NOT_LITERAL: NOT_LITERAL_IGNORE,
}

OP_LOCALE_IGNORE = {
    LITERAL: LITERAL_LOC_IGNORE,
    NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}

OP_UNICODE_IGNORE = {
    LITERAL: LITERAL_UNI_IGNORE,
    NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}

# Position codes replaced under the corresponding flag; codes that are not
# listed have no entry in these tables.
AT_MULTILINE = {
    AT_BEGINNING: AT_BEGINNING_LINE,
    AT_END: AT_END_LINE,
}

AT_LOCALE = {
    AT_BOUNDARY: AT_LOC_BOUNDARY,
    AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY,
}

AT_UNICODE = {
    AT_BOUNDARY: AT_UNI_BOUNDARY,
    AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY,
}

# Category translation tables.  CH_LOCALE only swaps the WORD categories;
# every other category maps to itself.
CH_LOCALE = {
    CATEGORY_DIGIT: CATEGORY_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_LOC_WORD,
    CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK,
}

# CH_UNICODE maps every category to its CATEGORY_UNI_* counterpart.
CH_UNICODE = {
    CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
    CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
    CATEGORY_SPACE: CATEGORY_UNI_SPACE,
    CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
    CATEGORY_WORD: CATEGORY_UNI_WORD,
    CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
    CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
    CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK,
}

# flags (one bit each)
SRE_FLAG_TEMPLATE = 1 << 0    # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 1 << 1  # case insensitive
SRE_FLAG_LOCALE = 1 << 2      # honour system locale
SRE_FLAG_MULTILINE = 1 << 3   # treat target as multiline string
SRE_FLAG_DOTALL = 1 << 4      # treat target as a single string
SRE_FLAG_UNICODE = 1 << 5     # use unicode "locale"
SRE_FLAG_VERBOSE = 1 << 6     # ignore whitespace and comments
SRE_FLAG_DEBUG = 1 << 7       # debugging
SRE_FLAG_ASCII = 1 << 8       # use ascii "locale"

# flags for INFO primitive (one bit each)
SRE_INFO_PREFIX = 1 << 0   # has prefix
SRE_INFO_LITERAL = 1 << 1  # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 1 << 2  # pattern starts with character from given set
if __name__ == "__main__":
    def dump(f, d, prefix):
        # Write one "#define" line per entry of d, in sorted order.  Each
        # entry is formatted twice: with %s for the macro name suffix and
        # with %d for the macro value.
        f.writelines("#define %s_%s %d\n" % (prefix, entry, entry)
                     for entry in sorted(d))

    # Regenerate the C header in the current working directory.
    with open("sre_constants.h", "w") as f:
        f.write("""\
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")
        f.write("#define SRE_MAGIC %d\n" % MAGIC)
        dump(f, OPCODES, "SRE_OP")
        dump(f, ATCODES, "SRE")
        dump(f, CHCODES, "SRE")
        # Remaining scalar constants, written in this exact order.
        for template, value in (
            ("#define SRE_FLAG_TEMPLATE %d\n", SRE_FLAG_TEMPLATE),
            ("#define SRE_FLAG_IGNORECASE %d\n", SRE_FLAG_IGNORECASE),
            ("#define SRE_FLAG_LOCALE %d\n", SRE_FLAG_LOCALE),
            ("#define SRE_FLAG_MULTILINE %d\n", SRE_FLAG_MULTILINE),
            ("#define SRE_FLAG_DOTALL %d\n", SRE_FLAG_DOTALL),
            ("#define SRE_FLAG_UNICODE %d\n", SRE_FLAG_UNICODE),
            ("#define SRE_FLAG_VERBOSE %d\n", SRE_FLAG_VERBOSE),
            ("#define SRE_FLAG_DEBUG %d\n", SRE_FLAG_DEBUG),
            ("#define SRE_FLAG_ASCII %d\n", SRE_FLAG_ASCII),
            ("#define SRE_INFO_PREFIX %d\n", SRE_INFO_PREFIX),
            ("#define SRE_INFO_LITERAL %d\n", SRE_INFO_LITERAL),
            ("#define SRE_INFO_CHARSET %d\n", SRE_INFO_CHARSET),
        ):
            f.write(template % value)
    print("done")

File diff suppressed because it is too large Load diff

View file

@ -221,7 +221,7 @@ class PyclbrTest(TestCase):
cm('cgi', ignore=('log',)) # set with = in module cm('cgi', ignore=('log',)) # set with = in module
cm('pickle', ignore=('partial', 'PickleBuffer')) cm('pickle', ignore=('partial', 'PickleBuffer'))
cm('aifc', ignore=('_aifc_params',)) # set with = in module cm('aifc', ignore=('_aifc_params',)) # set with = in module
cm('sre_parse', ignore=('dump', 'groups', 'pos')) # from sre_constants import *; property cm('re._parser', ignore=('dump', 'groups', 'pos')) # from ._constants import *; property
cm( cm(
'pdb', 'pdb',
# pyclbr does not handle elegantly `typing` or properties # pyclbr does not handle elegantly `typing` or properties

View file

@ -3,8 +3,8 @@ from test.support import (gc_collect, bigmemtest, _2G,
check_disallow_instantiation, is_emscripten) check_disallow_instantiation, is_emscripten)
import locale import locale
import re import re
import sre_compile
import string import string
import sys
import time import time
import unittest import unittest
import warnings import warnings
@ -569,7 +569,7 @@ class ReTests(unittest.TestCase):
'two branches', 10) 'two branches', 10)
def test_re_groupref_overflow(self): def test_re_groupref_overflow(self):
from sre_constants import MAXGROUPS from re._constants import MAXGROUPS
self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx', self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
'invalid group reference %d' % MAXGROUPS, 3) 'invalid group reference %d' % MAXGROUPS, 3)
self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS, self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
@ -2433,7 +2433,7 @@ class ImplementationTest(unittest.TestCase):
tp.foo = 1 tp.foo = 1
def test_overlap_table(self): def test_overlap_table(self):
f = sre_compile._generate_overlap_table f = re._compiler._generate_overlap_table
self.assertEqual(f(""), []) self.assertEqual(f(""), [])
self.assertEqual(f("a"), [0]) self.assertEqual(f("a"), [0])
self.assertEqual(f("abcd"), [0, 0, 0, 0]) self.assertEqual(f("abcd"), [0, 0, 0, 0])
@ -2442,8 +2442,8 @@ class ImplementationTest(unittest.TestCase):
self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0]) self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
def test_signedness(self): def test_signedness(self):
self.assertGreaterEqual(sre_compile.MAXREPEAT, 0) self.assertGreaterEqual(re._compiler.MAXREPEAT, 0)
self.assertGreaterEqual(sre_compile.MAXGROUPS, 0) self.assertGreaterEqual(re._compiler.MAXGROUPS, 0)
@cpython_only @cpython_only
def test_disallow_instantiation(self): def test_disallow_instantiation(self):
@ -2453,6 +2453,32 @@ class ImplementationTest(unittest.TestCase):
pat = re.compile("") pat = re.compile("")
check_disallow_instantiation(self, type(pat.scanner(""))) check_disallow_instantiation(self, type(pat.scanner("")))
def test_deprecated_modules(self):
# Importing each legacy sre_* module must emit a DeprecationWarning
# attributed to this test file, and the module must still expose the
# listed attributes.
# module name -> attributes expected on the imported module
deprecated = {
'sre_compile': ['compile', 'error',
'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_compile_info'],
'sre_constants': ['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_NamedIntConstant'],
'sre_parse': ['SubPattern', 'parse',
'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_parse_sub'],
}
for name in deprecated:
with self.subTest(module=name):
# drop any cached copy so that the import below executes the module
sys.modules.pop(name, None)
with self.assertWarns(DeprecationWarning) as cm:
__import__(name)
self.assertEqual(str(cm.warnings[0].message),
f"module {name!r} is deprecated")
# the warning must be reported against the importing file
self.assertEqual(cm.warnings[0].filename, __file__)
self.assertIn(name, sys.modules)
mod = sys.modules[name]
self.assertEqual(mod.__name__, name)
self.assertEqual(mod.__package__, '')
for attr in deprecated[name]:
self.assertTrue(hasattr(mod, attr))
# remove the freshly imported module again
del sys.modules[name]
class ExternalTests(unittest.TestCase): class ExternalTests(unittest.TestCase):

View file

@ -523,7 +523,7 @@ class StartupImportTests(unittest.TestCase):
self.assertIn('site', modules) self.assertIn('site', modules)
# http://bugs.python.org/issue19205 # http://bugs.python.org/issue19205
re_mods = {'re', '_sre', 'sre_compile', 'sre_constants', 'sre_parse'} re_mods = {'re', '_sre', 're._compiler', 're._constants', 're._parser'}
self.assertFalse(modules.intersection(re_mods), stderr) self.assertFalse(modules.intersection(re_mods), stderr)
# http://bugs.python.org/issue9548 # http://bugs.python.org/issue9548

View file

@ -1862,6 +1862,7 @@ LIBSUBDIRS= asyncio \
logging \ logging \
multiprocessing multiprocessing/dummy \ multiprocessing multiprocessing/dummy \
pydoc_data \ pydoc_data \
re \
site-packages \ site-packages \
sqlite3 \ sqlite3 \
tkinter \ tkinter \

View file

@ -0,0 +1,2 @@
Convert the :mod:`re` module into a package. Deprecate modules ``sre_compile``,
``sre_constants`` and ``sre_parse``.

View file

@ -3,8 +3,8 @@
* *
* regular expression matching engine * regular expression matching engine
* *
* NOTE: This file is generated by sre_constants.py. If you need * NOTE: This file is generated by Lib/re/_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it. * to change anything in here, edit Lib/re/_constants.py and run it.
* *
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
* *