mirror of
https://github.com/python/cpython.git
synced 2024-12-03 23:06:43 +08:00
1ce3eb5c5b
frombytes() and tobytes(), respectively, to avoid confusion. Furthermore, array.frombytes(), array.extend() as well as the array.array() constructor now accept bytearray objects. Patch by Thomas Jollans.
516 lines
16 KiB
Python
516 lines
16 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert template to internal format
|
|
#
|
|
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# See the sre.py file for information on usage and redistribution.
|
|
#
|
|
|
|
"""Internal support module for sre"""
|
|
|
|
import _sre, sys
|
|
import sre_parse
|
|
from sre_constants import *
|
|
|
|
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
|
|
|
|
if _sre.CODESIZE == 2:
|
|
MAXCODE = 65535
|
|
else:
|
|
MAXCODE = 0xFFFFFFFF
|
|
|
|
def _identityfunction(x):
|
|
return x
|
|
|
|
_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
|
|
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
|
|
_SUCCESS_CODES = set([SUCCESS, FAILURE])
|
|
_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
|
|
|
|
def _compile(code, pattern, flags):
|
|
# internal: compile a (sub)pattern
|
|
emit = code.append
|
|
_len = len
|
|
LITERAL_CODES = _LITERAL_CODES
|
|
REPEATING_CODES = _REPEATING_CODES
|
|
SUCCESS_CODES = _SUCCESS_CODES
|
|
ASSERT_CODES = _ASSERT_CODES
|
|
for op, av in pattern:
|
|
if op in LITERAL_CODES:
|
|
if flags & SRE_FLAG_IGNORECASE:
|
|
emit(OPCODES[OP_IGNORE[op]])
|
|
emit(_sre.getlower(av, flags))
|
|
else:
|
|
emit(OPCODES[op])
|
|
emit(av)
|
|
elif op is IN:
|
|
if flags & SRE_FLAG_IGNORECASE:
|
|
emit(OPCODES[OP_IGNORE[op]])
|
|
def fixup(literal, flags=flags):
|
|
return _sre.getlower(literal, flags)
|
|
else:
|
|
emit(OPCODES[op])
|
|
fixup = _identityfunction
|
|
skip = _len(code); emit(0)
|
|
_compile_charset(av, flags, code, fixup)
|
|
code[skip] = _len(code) - skip
|
|
elif op is ANY:
|
|
if flags & SRE_FLAG_DOTALL:
|
|
emit(OPCODES[ANY_ALL])
|
|
else:
|
|
emit(OPCODES[ANY])
|
|
elif op in REPEATING_CODES:
|
|
if flags & SRE_FLAG_TEMPLATE:
|
|
raise error("internal: unsupported template operator")
|
|
emit(OPCODES[REPEAT])
|
|
skip = _len(code); emit(0)
|
|
emit(av[0])
|
|
emit(av[1])
|
|
_compile(code, av[2], flags)
|
|
emit(OPCODES[SUCCESS])
|
|
code[skip] = _len(code) - skip
|
|
elif _simple(av) and op is not REPEAT:
|
|
if op is MAX_REPEAT:
|
|
emit(OPCODES[REPEAT_ONE])
|
|
else:
|
|
emit(OPCODES[MIN_REPEAT_ONE])
|
|
skip = _len(code); emit(0)
|
|
emit(av[0])
|
|
emit(av[1])
|
|
_compile(code, av[2], flags)
|
|
emit(OPCODES[SUCCESS])
|
|
code[skip] = _len(code) - skip
|
|
else:
|
|
emit(OPCODES[REPEAT])
|
|
skip = _len(code); emit(0)
|
|
emit(av[0])
|
|
emit(av[1])
|
|
_compile(code, av[2], flags)
|
|
code[skip] = _len(code) - skip
|
|
if op is MAX_REPEAT:
|
|
emit(OPCODES[MAX_UNTIL])
|
|
else:
|
|
emit(OPCODES[MIN_UNTIL])
|
|
elif op is SUBPATTERN:
|
|
if av[0]:
|
|
emit(OPCODES[MARK])
|
|
emit((av[0]-1)*2)
|
|
# _compile_info(code, av[1], flags)
|
|
_compile(code, av[1], flags)
|
|
if av[0]:
|
|
emit(OPCODES[MARK])
|
|
emit((av[0]-1)*2+1)
|
|
elif op in SUCCESS_CODES:
|
|
emit(OPCODES[op])
|
|
elif op in ASSERT_CODES:
|
|
emit(OPCODES[op])
|
|
skip = _len(code); emit(0)
|
|
if av[0] >= 0:
|
|
emit(0) # look ahead
|
|
else:
|
|
lo, hi = av[1].getwidth()
|
|
if lo != hi:
|
|
raise error("look-behind requires fixed-width pattern")
|
|
emit(lo) # look behind
|
|
_compile(code, av[1], flags)
|
|
emit(OPCODES[SUCCESS])
|
|
code[skip] = _len(code) - skip
|
|
elif op is CALL:
|
|
emit(OPCODES[op])
|
|
skip = _len(code); emit(0)
|
|
_compile(code, av, flags)
|
|
emit(OPCODES[SUCCESS])
|
|
code[skip] = _len(code) - skip
|
|
elif op is AT:
|
|
emit(OPCODES[op])
|
|
if flags & SRE_FLAG_MULTILINE:
|
|
av = AT_MULTILINE.get(av, av)
|
|
if flags & SRE_FLAG_LOCALE:
|
|
av = AT_LOCALE.get(av, av)
|
|
elif flags & SRE_FLAG_UNICODE:
|
|
av = AT_UNICODE.get(av, av)
|
|
emit(ATCODES[av])
|
|
elif op is BRANCH:
|
|
emit(OPCODES[op])
|
|
tail = []
|
|
tailappend = tail.append
|
|
for av in av[1]:
|
|
skip = _len(code); emit(0)
|
|
# _compile_info(code, av, flags)
|
|
_compile(code, av, flags)
|
|
emit(OPCODES[JUMP])
|
|
tailappend(_len(code)); emit(0)
|
|
code[skip] = _len(code) - skip
|
|
emit(0) # end of branch
|
|
for tail in tail:
|
|
code[tail] = _len(code) - tail
|
|
elif op is CATEGORY:
|
|
emit(OPCODES[op])
|
|
if flags & SRE_FLAG_LOCALE:
|
|
av = CH_LOCALE[av]
|
|
elif flags & SRE_FLAG_UNICODE:
|
|
av = CH_UNICODE[av]
|
|
emit(CHCODES[av])
|
|
elif op is GROUPREF:
|
|
if flags & SRE_FLAG_IGNORECASE:
|
|
emit(OPCODES[OP_IGNORE[op]])
|
|
else:
|
|
emit(OPCODES[op])
|
|
emit(av-1)
|
|
elif op is GROUPREF_EXISTS:
|
|
emit(OPCODES[op])
|
|
emit(av[0]-1)
|
|
skipyes = _len(code); emit(0)
|
|
_compile(code, av[1], flags)
|
|
if av[2]:
|
|
emit(OPCODES[JUMP])
|
|
skipno = _len(code); emit(0)
|
|
code[skipyes] = _len(code) - skipyes + 1
|
|
_compile(code, av[2], flags)
|
|
code[skipno] = _len(code) - skipno
|
|
else:
|
|
code[skipyes] = _len(code) - skipyes + 1
|
|
else:
|
|
raise ValueError("unsupported operand type", op)
|
|
|
|
def _compile_charset(charset, flags, code, fixup=None):
|
|
# compile charset subprogram
|
|
emit = code.append
|
|
if fixup is None:
|
|
fixup = _identityfunction
|
|
for op, av in _optimize_charset(charset, fixup):
|
|
emit(OPCODES[op])
|
|
if op is NEGATE:
|
|
pass
|
|
elif op is LITERAL:
|
|
emit(fixup(av))
|
|
elif op is RANGE:
|
|
emit(fixup(av[0]))
|
|
emit(fixup(av[1]))
|
|
elif op is CHARSET:
|
|
code.extend(av)
|
|
elif op is BIGCHARSET:
|
|
code.extend(av)
|
|
elif op is CATEGORY:
|
|
if flags & SRE_FLAG_LOCALE:
|
|
emit(CHCODES[CH_LOCALE[av]])
|
|
elif flags & SRE_FLAG_UNICODE:
|
|
emit(CHCODES[CH_UNICODE[av]])
|
|
else:
|
|
emit(CHCODES[av])
|
|
else:
|
|
raise error("internal: unsupported set operator")
|
|
emit(OPCODES[FAILURE])
|
|
|
|
def _optimize_charset(charset, fixup):
|
|
# internal: optimize character set
|
|
out = []
|
|
outappend = out.append
|
|
charmap = [0]*256
|
|
try:
|
|
for op, av in charset:
|
|
if op is NEGATE:
|
|
outappend((op, av))
|
|
elif op is LITERAL:
|
|
charmap[fixup(av)] = 1
|
|
elif op is RANGE:
|
|
for i in range(fixup(av[0]), fixup(av[1])+1):
|
|
charmap[i] = 1
|
|
elif op is CATEGORY:
|
|
# XXX: could append to charmap tail
|
|
return charset # cannot compress
|
|
except IndexError:
|
|
# character set contains unicode characters
|
|
return _optimize_unicode(charset, fixup)
|
|
# compress character map
|
|
i = p = n = 0
|
|
runs = []
|
|
runsappend = runs.append
|
|
for c in charmap:
|
|
if c:
|
|
if n == 0:
|
|
p = i
|
|
n = n + 1
|
|
elif n:
|
|
runsappend((p, n))
|
|
n = 0
|
|
i = i + 1
|
|
if n:
|
|
runsappend((p, n))
|
|
if len(runs) <= 2:
|
|
# use literal/range
|
|
for p, n in runs:
|
|
if n == 1:
|
|
outappend((LITERAL, p))
|
|
else:
|
|
outappend((RANGE, (p, p+n-1)))
|
|
if len(out) < len(charset):
|
|
return out
|
|
else:
|
|
# use bitmap
|
|
data = _mk_bitmap(charmap)
|
|
outappend((CHARSET, data))
|
|
return out
|
|
return charset
|
|
|
|
def _mk_bitmap(bits):
|
|
data = []
|
|
dataappend = data.append
|
|
if _sre.CODESIZE == 2:
|
|
start = (1, 0)
|
|
else:
|
|
start = (1, 0)
|
|
m, v = start
|
|
for c in bits:
|
|
if c:
|
|
v = v + m
|
|
m = m + m
|
|
if m > MAXCODE:
|
|
dataappend(v)
|
|
m, v = start
|
|
return data
|
|
|
|
# To represent a big charset, first a bitmap of all characters in the
|
|
# set is constructed. Then, this bitmap is sliced into chunks of 256
|
|
# characters, duplicate chunks are eliminated, and each chunk is
|
|
# given a number. In the compiled expression, the charset is
|
|
# represented by a 16-bit word sequence, consisting of one word for
|
|
# the number of different chunks, a sequence of 256 bytes (128 words)
|
|
# of chunk numbers indexed by their original chunk position, and a
|
|
# sequence of chunks (16 words each).
|
|
|
|
# Compression is normally good: in a typical charset, large ranges of
|
|
# Unicode will be either completely excluded (e.g. if only cyrillic
|
|
# letters are to be matched), or completely included (e.g. if large
|
|
# subranges of Kanji match). These ranges will be represented by
|
|
# chunks of all one-bits or all zero-bits.
|
|
|
|
# Matching can be also done efficiently: the more significant byte of
|
|
# the Unicode character is an index into the chunk number, and the
|
|
# less significant byte is a bit index in the chunk (just like the
|
|
# CHARSET matching).
|
|
|
|
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
|
|
# of the basic multilingual plane; an efficient representation
|
|
# for all of UTF-16 has not yet been developed. This means,
|
|
# in particular, that negated charsets cannot be represented as
|
|
# bigcharsets.
|
|
|
|
def _optimize_unicode(charset, fixup):
|
|
try:
|
|
import array
|
|
except ImportError:
|
|
return charset
|
|
charmap = [0]*65536
|
|
negate = 0
|
|
try:
|
|
for op, av in charset:
|
|
if op is NEGATE:
|
|
negate = 1
|
|
elif op is LITERAL:
|
|
charmap[fixup(av)] = 1
|
|
elif op is RANGE:
|
|
for i in range(fixup(av[0]), fixup(av[1])+1):
|
|
charmap[i] = 1
|
|
elif op is CATEGORY:
|
|
# XXX: could expand category
|
|
return charset # cannot compress
|
|
except IndexError:
|
|
# non-BMP characters
|
|
return charset
|
|
if negate:
|
|
if sys.maxunicode != 65535:
|
|
# XXX: negation does not work with big charsets
|
|
return charset
|
|
for i in range(65536):
|
|
charmap[i] = not charmap[i]
|
|
comps = {}
|
|
mapping = [0]*256
|
|
block = 0
|
|
data = []
|
|
for i in range(256):
|
|
chunk = tuple(charmap[i*256:(i+1)*256])
|
|
new = comps.setdefault(chunk, block)
|
|
mapping[i] = new
|
|
if new == block:
|
|
block = block + 1
|
|
data = data + _mk_bitmap(chunk)
|
|
header = [block]
|
|
if _sre.CODESIZE == 2:
|
|
code = 'H'
|
|
else:
|
|
code = 'I'
|
|
# Convert block indices to byte array of 256 bytes
|
|
mapping = array.array('b', mapping).tobytes()
|
|
# Convert byte array to word array
|
|
mapping = array.array(code, mapping)
|
|
assert mapping.itemsize == _sre.CODESIZE
|
|
assert len(mapping) * mapping.itemsize == 256
|
|
header = header + mapping.tolist()
|
|
data[0:0] = header
|
|
return [(BIGCHARSET, data)]
|
|
|
|
def _simple(av):
|
|
# check if av is a "simple" operator
|
|
lo, hi = av[2].getwidth()
|
|
if lo == 0 and hi == MAXREPEAT:
|
|
raise error("nothing to repeat")
|
|
return lo == hi == 1 and av[2][0][0] != SUBPATTERN
|
|
|
|
def _compile_info(code, pattern, flags):
|
|
# internal: compile an info block. in the current version,
|
|
# this contains min/max pattern width, and an optional literal
|
|
# prefix or a character map
|
|
lo, hi = pattern.getwidth()
|
|
if lo == 0:
|
|
return # not worth it
|
|
# look for a literal prefix
|
|
prefix = []
|
|
prefixappend = prefix.append
|
|
prefix_skip = 0
|
|
charset = [] # not used
|
|
charsetappend = charset.append
|
|
if not (flags & SRE_FLAG_IGNORECASE):
|
|
# look for literal prefix
|
|
for op, av in pattern.data:
|
|
if op is LITERAL:
|
|
if len(prefix) == prefix_skip:
|
|
prefix_skip = prefix_skip + 1
|
|
prefixappend(av)
|
|
elif op is SUBPATTERN and len(av[1]) == 1:
|
|
op, av = av[1][0]
|
|
if op is LITERAL:
|
|
prefixappend(av)
|
|
else:
|
|
break
|
|
else:
|
|
break
|
|
# if no prefix, look for charset prefix
|
|
if not prefix and pattern.data:
|
|
op, av = pattern.data[0]
|
|
if op is SUBPATTERN and av[1]:
|
|
op, av = av[1][0]
|
|
if op is LITERAL:
|
|
charsetappend((op, av))
|
|
elif op is BRANCH:
|
|
c = []
|
|
cappend = c.append
|
|
for p in av[1]:
|
|
if not p:
|
|
break
|
|
op, av = p[0]
|
|
if op is LITERAL:
|
|
cappend((op, av))
|
|
else:
|
|
break
|
|
else:
|
|
charset = c
|
|
elif op is BRANCH:
|
|
c = []
|
|
cappend = c.append
|
|
for p in av[1]:
|
|
if not p:
|
|
break
|
|
op, av = p[0]
|
|
if op is LITERAL:
|
|
cappend((op, av))
|
|
else:
|
|
break
|
|
else:
|
|
charset = c
|
|
elif op is IN:
|
|
charset = av
|
|
## if prefix:
|
|
## print "*** PREFIX", prefix, prefix_skip
|
|
## if charset:
|
|
## print "*** CHARSET", charset
|
|
# add an info block
|
|
emit = code.append
|
|
emit(OPCODES[INFO])
|
|
skip = len(code); emit(0)
|
|
# literal flag
|
|
mask = 0
|
|
if prefix:
|
|
mask = SRE_INFO_PREFIX
|
|
if len(prefix) == prefix_skip == len(pattern.data):
|
|
mask = mask + SRE_INFO_LITERAL
|
|
elif charset:
|
|
mask = mask + SRE_INFO_CHARSET
|
|
emit(mask)
|
|
# pattern length
|
|
if lo < MAXCODE:
|
|
emit(lo)
|
|
else:
|
|
emit(MAXCODE)
|
|
prefix = prefix[:MAXCODE]
|
|
if hi < MAXCODE:
|
|
emit(hi)
|
|
else:
|
|
emit(0)
|
|
# add literal prefix
|
|
if prefix:
|
|
emit(len(prefix)) # length
|
|
emit(prefix_skip) # skip
|
|
code.extend(prefix)
|
|
# generate overlap table
|
|
table = [-1] + ([0]*len(prefix))
|
|
for i in range(len(prefix)):
|
|
table[i+1] = table[i]+1
|
|
while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
|
|
table[i+1] = table[table[i+1]-1]+1
|
|
code.extend(table[1:]) # don't store first entry
|
|
elif charset:
|
|
_compile_charset(charset, flags, code)
|
|
code[skip] = len(code) - skip
|
|
|
|
def isstring(obj):
|
|
return isinstance(obj, (str, bytes))
|
|
|
|
def _code(p, flags):
|
|
|
|
flags = p.pattern.flags | flags
|
|
code = []
|
|
|
|
# compile info block
|
|
_compile_info(code, p, flags)
|
|
|
|
# compile the pattern
|
|
_compile(code, p.data, flags)
|
|
|
|
code.append(OPCODES[SUCCESS])
|
|
|
|
return code
|
|
|
|
def compile(p, flags=0):
|
|
# internal: convert pattern list to internal format
|
|
|
|
if isstring(p):
|
|
pattern = p
|
|
p = sre_parse.parse(p, flags)
|
|
else:
|
|
pattern = None
|
|
|
|
code = _code(p, flags)
|
|
|
|
# print code
|
|
|
|
# XXX: <fl> get rid of this limitation!
|
|
if p.pattern.groups > 100:
|
|
raise AssertionError(
|
|
"sorry, but this version only supports 100 named groups"
|
|
)
|
|
|
|
# map in either direction
|
|
groupindex = p.pattern.groupdict
|
|
indexgroup = [None] * p.pattern.groups
|
|
for k, i in groupindex.items():
|
|
indexgroup[i] = k
|
|
|
|
return _sre.compile(
|
|
pattern, flags | p.pattern.flags, code,
|
|
p.pattern.groups-1,
|
|
groupindex, indexgroup
|
|
)
|