Fully support 32-bit codes. Enable BIGCHARSET in UCS-4 builds.

This commit is contained in:
Martin v. Löwis 2003-04-19 12:56:08 +00:00
parent 53d93adc46
commit 78e2f06cc6
4 changed files with 90 additions and 35 deletions

View file

@ -16,7 +16,10 @@ from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
MAXCODE = 65535
if _sre.CODESIZE == 2:
MAXCODE = 65535
else:
MAXCODE = 0xFFFFFFFFL
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
@ -191,9 +194,6 @@ def _optimize_charset(charset, fixup):
# XXX: could append to charmap tail
return charset # cannot compress
except IndexError:
if sys.maxunicode != 65535:
# XXX: big charsets don't work in UCS-4 builds
return charset
# character set contains unicode characters
return _optimize_unicode(charset, fixup)
# compress character map
@ -228,14 +228,18 @@ def _optimize_charset(charset, fixup):
def _mk_bitmap(bits):
data = []
m = 1; v = 0
if _sre.CODESIZE == 2:
start = (1, 0)
else:
start = (1L, 0L)
m, v = start
for c in bits:
if c:
v = v + m
m = m << 1
if m > MAXCODE:
data.append(v)
m = 1; v = 0
m, v = start
return data
# To represent a big charset, first a bitmap of all characters in the
@ -258,21 +262,38 @@ def _mk_bitmap(bits):
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536
negate = 0
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
try:
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters
return charset
if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
return charset
for i in range(65536):
charmap[i] = not charmap[i]
comps = {}
@ -287,12 +308,14 @@ def _optimize_unicode(charset, fixup):
block = block + 1
data = data + _mk_bitmap(chunk)
header = [block]
assert MAXCODE == 65535
for i in range(128):
if sys.byteorder == 'big':
header.append(256*mapping[2*i]+mapping[2*i+1])
else:
header.append(mapping[2*i]+256*mapping[2*i+1])
if MAXCODE == 65535:
code = 'H'
else:
code = 'L'
# Convert block indices to byte array of 256 bytes
mapping = array.array('b', mapping).tostring()
# Convert byte array to word array
header = header + array.array(code, mapping).tolist()
data[0:0] = header
return [(BIGCHARSET, data)]