mirror of
https://github.com/python/cpython.git
synced 2024-12-29 03:34:57 +08:00
8f36af7a4c
'latin-1' and 'utf-8'. These are optimized in the Python Unicode implementation to result in more direct processing, bypassing the codec registry. Also see issue11303.
817 lines
27 KiB
Python
817 lines
27 KiB
Python
#
|
|
# Secret Labs' Regular Expression Engine
|
|
#
|
|
# convert re-style regular expression to sre pattern
|
|
#
|
|
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
|
#
|
|
# See the sre.py file for information on usage and redistribution.
|
|
#
|
|
|
|
"""Internal support module for sre"""
|
|
|
|
# XXX: show string offset and offending character for all errors
|
|
|
|
import sys
|
|
|
|
from sre_constants import *
|
|
|
|
SPECIAL_CHARS = ".\\[{()*+?^$|"
|
|
REPEAT_CHARS = "*+?{"
|
|
|
|
DIGITS = set("0123456789")
|
|
|
|
OCTDIGITS = set("01234567")
|
|
HEXDIGITS = set("0123456789abcdefABCDEF")
|
|
|
|
WHITESPACE = set(" \t\n\r\v\f")
|
|
|
|
ESCAPES = {
|
|
r"\a": (LITERAL, ord("\a")),
|
|
r"\b": (LITERAL, ord("\b")),
|
|
r"\f": (LITERAL, ord("\f")),
|
|
r"\n": (LITERAL, ord("\n")),
|
|
r"\r": (LITERAL, ord("\r")),
|
|
r"\t": (LITERAL, ord("\t")),
|
|
r"\v": (LITERAL, ord("\v")),
|
|
r"\\": (LITERAL, ord("\\"))
|
|
}
|
|
|
|
CATEGORIES = {
|
|
r"\A": (AT, AT_BEGINNING_STRING), # start of string
|
|
r"\b": (AT, AT_BOUNDARY),
|
|
r"\B": (AT, AT_NON_BOUNDARY),
|
|
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
|
|
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
|
|
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
|
|
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
|
|
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
|
|
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
|
|
r"\Z": (AT, AT_END_STRING), # end of string
|
|
}
|
|
|
|
FLAGS = {
|
|
# standard flags
|
|
"i": SRE_FLAG_IGNORECASE,
|
|
"L": SRE_FLAG_LOCALE,
|
|
"m": SRE_FLAG_MULTILINE,
|
|
"s": SRE_FLAG_DOTALL,
|
|
"x": SRE_FLAG_VERBOSE,
|
|
# extensions
|
|
"a": SRE_FLAG_ASCII,
|
|
"t": SRE_FLAG_TEMPLATE,
|
|
"u": SRE_FLAG_UNICODE,
|
|
}
|
|
|
|
class Pattern:
|
|
# master pattern object. keeps track of global attributes
|
|
def __init__(self):
|
|
self.flags = 0
|
|
self.open = []
|
|
self.groups = 1
|
|
self.groupdict = {}
|
|
def opengroup(self, name=None):
|
|
gid = self.groups
|
|
self.groups = gid + 1
|
|
if name is not None:
|
|
ogid = self.groupdict.get(name, None)
|
|
if ogid is not None:
|
|
raise error("redefinition of group name %s as group %d; "
|
|
"was group %d" % (repr(name), gid, ogid))
|
|
self.groupdict[name] = gid
|
|
self.open.append(gid)
|
|
return gid
|
|
def closegroup(self, gid):
|
|
self.open.remove(gid)
|
|
def checkgroup(self, gid):
|
|
return gid < self.groups and gid not in self.open
|
|
|
|
class SubPattern:
|
|
# a subpattern, in intermediate form
|
|
def __init__(self, pattern, data=None):
|
|
self.pattern = pattern
|
|
if data is None:
|
|
data = []
|
|
self.data = data
|
|
self.width = None
|
|
def dump(self, level=0):
|
|
nl = 1
|
|
seqtypes = (tuple, list)
|
|
for op, av in self.data:
|
|
print(level*" " + op, end=' '); nl = 0
|
|
if op == "in":
|
|
# member sublanguage
|
|
print(); nl = 1
|
|
for op, a in av:
|
|
print((level+1)*" " + op, a)
|
|
elif op == "branch":
|
|
print(); nl = 1
|
|
i = 0
|
|
for a in av[1]:
|
|
if i > 0:
|
|
print(level*" " + "or")
|
|
a.dump(level+1); nl = 1
|
|
i = i + 1
|
|
elif isinstance(av, seqtypes):
|
|
for a in av:
|
|
if isinstance(a, SubPattern):
|
|
if not nl: print()
|
|
a.dump(level+1); nl = 1
|
|
else:
|
|
print(a, end=' ') ; nl = 0
|
|
else:
|
|
print(av, end=' ') ; nl = 0
|
|
if not nl: print()
|
|
def __repr__(self):
|
|
return repr(self.data)
|
|
def __len__(self):
|
|
return len(self.data)
|
|
def __delitem__(self, index):
|
|
del self.data[index]
|
|
def __getitem__(self, index):
|
|
if isinstance(index, slice):
|
|
return SubPattern(self.pattern, self.data[index])
|
|
return self.data[index]
|
|
def __setitem__(self, index, code):
|
|
self.data[index] = code
|
|
def insert(self, index, code):
|
|
self.data.insert(index, code)
|
|
def append(self, code):
|
|
self.data.append(code)
|
|
def getwidth(self):
|
|
# determine the width (min, max) for this subpattern
|
|
if self.width:
|
|
return self.width
|
|
lo = hi = 0
|
|
UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
|
|
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
|
|
for op, av in self.data:
|
|
if op is BRANCH:
|
|
i = sys.maxsize
|
|
j = 0
|
|
for av in av[1]:
|
|
l, h = av.getwidth()
|
|
i = min(i, l)
|
|
j = max(j, h)
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op is CALL:
|
|
i, j = av.getwidth()
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op is SUBPATTERN:
|
|
i, j = av[1].getwidth()
|
|
lo = lo + i
|
|
hi = hi + j
|
|
elif op in REPEATCODES:
|
|
i, j = av[2].getwidth()
|
|
lo = lo + int(i) * av[0]
|
|
hi = hi + int(j) * av[1]
|
|
elif op in UNITCODES:
|
|
lo = lo + 1
|
|
hi = hi + 1
|
|
elif op == SUCCESS:
|
|
break
|
|
self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize))
|
|
return self.width
|
|
|
|
class Tokenizer:
|
|
def __init__(self, string):
|
|
self.string = string
|
|
self.index = 0
|
|
self.__next()
|
|
def __next(self):
|
|
if self.index >= len(self.string):
|
|
self.next = None
|
|
return
|
|
char = self.string[self.index:self.index+1]
|
|
# Special case for the str8, since indexing returns a integer
|
|
# XXX This is only needed for test_bug_926075 in test_re.py
|
|
if char and isinstance(char, bytes):
|
|
char = chr(char[0])
|
|
if char == "\\":
|
|
try:
|
|
c = self.string[self.index + 1]
|
|
except IndexError:
|
|
raise error("bogus escape (end of line)")
|
|
if isinstance(self.string, bytes):
|
|
c = chr(c)
|
|
char = char + c
|
|
self.index = self.index + len(char)
|
|
self.next = char
|
|
def match(self, char, skip=1):
|
|
if char == self.next:
|
|
if skip:
|
|
self.__next()
|
|
return 1
|
|
return 0
|
|
def get(self):
|
|
this = self.next
|
|
self.__next()
|
|
return this
|
|
def tell(self):
|
|
return self.index, self.next
|
|
def seek(self, index):
|
|
self.index, self.next = index
|
|
|
|
def isident(char):
|
|
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
|
|
|
|
def isdigit(char):
|
|
return "0" <= char <= "9"
|
|
|
|
def isname(name):
|
|
# check that group name is a valid string
|
|
if not isident(name[0]):
|
|
return False
|
|
for char in name[1:]:
|
|
if not isident(char) and not isdigit(char):
|
|
return False
|
|
return True
|
|
|
|
def _class_escape(source, escape):
|
|
# handle escape code inside character class
|
|
code = ESCAPES.get(escape)
|
|
if code:
|
|
return code
|
|
code = CATEGORIES.get(escape)
|
|
if code:
|
|
return code
|
|
try:
|
|
c = escape[1:2]
|
|
if c == "x":
|
|
# hexadecimal escape (exactly two digits)
|
|
while source.next in HEXDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
escape = escape[2:]
|
|
if len(escape) != 2:
|
|
raise error("bogus escape: %s" % repr("\\" + escape))
|
|
return LITERAL, int(escape, 16) & 0xff
|
|
elif c in OCTDIGITS:
|
|
# octal escape (up to three digits)
|
|
while source.next in OCTDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
escape = escape[1:]
|
|
return LITERAL, int(escape, 8) & 0xff
|
|
elif c in DIGITS:
|
|
raise error("bogus escape: %s" % repr(escape))
|
|
if len(escape) == 2:
|
|
return LITERAL, ord(escape[1])
|
|
except ValueError:
|
|
pass
|
|
raise error("bogus escape: %s" % repr(escape))
|
|
|
|
def _escape(source, escape, state):
|
|
# handle escape code in expression
|
|
code = CATEGORIES.get(escape)
|
|
if code:
|
|
return code
|
|
code = ESCAPES.get(escape)
|
|
if code:
|
|
return code
|
|
try:
|
|
c = escape[1:2]
|
|
if c == "x":
|
|
# hexadecimal escape
|
|
while source.next in HEXDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
if len(escape) != 4:
|
|
raise ValueError
|
|
return LITERAL, int(escape[2:], 16) & 0xff
|
|
elif c == "0":
|
|
# octal escape
|
|
while source.next in OCTDIGITS and len(escape) < 4:
|
|
escape = escape + source.get()
|
|
return LITERAL, int(escape[1:], 8) & 0xff
|
|
elif c in DIGITS:
|
|
# octal escape *or* decimal group reference (sigh)
|
|
if source.next in DIGITS:
|
|
escape = escape + source.get()
|
|
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
|
|
source.next in OCTDIGITS):
|
|
# got three octal digits; this is an octal escape
|
|
escape = escape + source.get()
|
|
return LITERAL, int(escape[1:], 8) & 0xff
|
|
# not an octal escape, so this is a group reference
|
|
group = int(escape[1:])
|
|
if group < state.groups:
|
|
if not state.checkgroup(group):
|
|
raise error("cannot refer to open group")
|
|
return GROUPREF, group
|
|
raise ValueError
|
|
if len(escape) == 2:
|
|
return LITERAL, ord(escape[1])
|
|
except ValueError:
|
|
pass
|
|
raise error("bogus escape: %s" % repr(escape))
|
|
|
|
def _parse_sub(source, state, nested=1):
|
|
# parse an alternation: a|b|c
|
|
|
|
items = []
|
|
itemsappend = items.append
|
|
sourcematch = source.match
|
|
while 1:
|
|
itemsappend(_parse(source, state))
|
|
if sourcematch("|"):
|
|
continue
|
|
if not nested:
|
|
break
|
|
if not source.next or sourcematch(")", 0):
|
|
break
|
|
else:
|
|
raise error("pattern not properly closed")
|
|
|
|
if len(items) == 1:
|
|
return items[0]
|
|
|
|
subpattern = SubPattern(state)
|
|
subpatternappend = subpattern.append
|
|
|
|
# check if all items share a common prefix
|
|
while 1:
|
|
prefix = None
|
|
for item in items:
|
|
if not item:
|
|
break
|
|
if prefix is None:
|
|
prefix = item[0]
|
|
elif item[0] != prefix:
|
|
break
|
|
else:
|
|
# all subitems start with a common "prefix".
|
|
# move it out of the branch
|
|
for item in items:
|
|
del item[0]
|
|
subpatternappend(prefix)
|
|
continue # check next one
|
|
break
|
|
|
|
# check if the branch can be replaced by a character set
|
|
for item in items:
|
|
if len(item) != 1 or item[0][0] != LITERAL:
|
|
break
|
|
else:
|
|
# we can store this as a character set instead of a
|
|
# branch (the compiler may optimize this even more)
|
|
set = []
|
|
setappend = set.append
|
|
for item in items:
|
|
setappend(item[0])
|
|
subpatternappend((IN, set))
|
|
return subpattern
|
|
|
|
subpattern.append((BRANCH, (None, items)))
|
|
return subpattern
|
|
|
|
def _parse_sub_cond(source, state, condgroup):
|
|
item_yes = _parse(source, state)
|
|
if source.match("|"):
|
|
item_no = _parse(source, state)
|
|
if source.match("|"):
|
|
raise error("conditional backref with more than two branches")
|
|
else:
|
|
item_no = None
|
|
if source.next and not source.match(")", 0):
|
|
raise error("pattern not properly closed")
|
|
subpattern = SubPattern(state)
|
|
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
|
|
return subpattern
|
|
|
|
_PATTERNENDERS = set("|)")
|
|
_ASSERTCHARS = set("=!<")
|
|
_LOOKBEHINDASSERTCHARS = set("=!")
|
|
_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
|
|
|
|
def _parse(source, state):
|
|
# parse a simple pattern
|
|
subpattern = SubPattern(state)
|
|
|
|
# precompute constants into local variables
|
|
subpatternappend = subpattern.append
|
|
sourceget = source.get
|
|
sourcematch = source.match
|
|
_len = len
|
|
PATTERNENDERS = _PATTERNENDERS
|
|
ASSERTCHARS = _ASSERTCHARS
|
|
LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
|
|
REPEATCODES = _REPEATCODES
|
|
|
|
while 1:
|
|
|
|
if source.next in PATTERNENDERS:
|
|
break # end of subpattern
|
|
this = sourceget()
|
|
if this is None:
|
|
break # end of pattern
|
|
|
|
if state.flags & SRE_FLAG_VERBOSE:
|
|
# skip whitespace and comments
|
|
if this in WHITESPACE:
|
|
continue
|
|
if this == "#":
|
|
while 1:
|
|
this = sourceget()
|
|
if this in (None, "\n"):
|
|
break
|
|
continue
|
|
|
|
if this and this[0] not in SPECIAL_CHARS:
|
|
subpatternappend((LITERAL, ord(this)))
|
|
|
|
elif this == "[":
|
|
# character set
|
|
set = []
|
|
setappend = set.append
|
|
## if sourcematch(":"):
|
|
## pass # handle character classes
|
|
if sourcematch("^"):
|
|
setappend((NEGATE, None))
|
|
# check remaining characters
|
|
start = set[:]
|
|
while 1:
|
|
this = sourceget()
|
|
if this == "]" and set != start:
|
|
break
|
|
elif this and this[0] == "\\":
|
|
code1 = _class_escape(source, this)
|
|
elif this:
|
|
code1 = LITERAL, ord(this)
|
|
else:
|
|
raise error("unexpected end of regular expression")
|
|
if sourcematch("-"):
|
|
# potential range
|
|
this = sourceget()
|
|
if this == "]":
|
|
if code1[0] is IN:
|
|
code1 = code1[1][0]
|
|
setappend(code1)
|
|
setappend((LITERAL, ord("-")))
|
|
break
|
|
elif this:
|
|
if this[0] == "\\":
|
|
code2 = _class_escape(source, this)
|
|
else:
|
|
code2 = LITERAL, ord(this)
|
|
if code1[0] != LITERAL or code2[0] != LITERAL:
|
|
raise error("bad character range")
|
|
lo = code1[1]
|
|
hi = code2[1]
|
|
if hi < lo:
|
|
raise error("bad character range")
|
|
setappend((RANGE, (lo, hi)))
|
|
else:
|
|
raise error("unexpected end of regular expression")
|
|
else:
|
|
if code1[0] is IN:
|
|
code1 = code1[1][0]
|
|
setappend(code1)
|
|
|
|
# XXX: <fl> should move set optimization to compiler!
|
|
if _len(set)==1 and set[0][0] is LITERAL:
|
|
subpatternappend(set[0]) # optimization
|
|
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
|
|
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
|
|
else:
|
|
# XXX: <fl> should add charmap optimization here
|
|
subpatternappend((IN, set))
|
|
|
|
elif this and this[0] in REPEAT_CHARS:
|
|
# repeat previous item
|
|
if this == "?":
|
|
min, max = 0, 1
|
|
elif this == "*":
|
|
min, max = 0, MAXREPEAT
|
|
|
|
elif this == "+":
|
|
min, max = 1, MAXREPEAT
|
|
elif this == "{":
|
|
if source.next == "}":
|
|
subpatternappend((LITERAL, ord(this)))
|
|
continue
|
|
here = source.tell()
|
|
min, max = 0, MAXREPEAT
|
|
lo = hi = ""
|
|
while source.next in DIGITS:
|
|
lo = lo + source.get()
|
|
if sourcematch(","):
|
|
while source.next in DIGITS:
|
|
hi = hi + sourceget()
|
|
else:
|
|
hi = lo
|
|
if not sourcematch("}"):
|
|
subpatternappend((LITERAL, ord(this)))
|
|
source.seek(here)
|
|
continue
|
|
if lo:
|
|
min = int(lo)
|
|
if hi:
|
|
max = int(hi)
|
|
if max < min:
|
|
raise error("bad repeat interval")
|
|
else:
|
|
raise error("not supported")
|
|
# figure out which item to repeat
|
|
if subpattern:
|
|
item = subpattern[-1:]
|
|
else:
|
|
item = None
|
|
if not item or (_len(item) == 1 and item[0][0] == AT):
|
|
raise error("nothing to repeat")
|
|
if item[0][0] in REPEATCODES:
|
|
raise error("multiple repeat")
|
|
if sourcematch("?"):
|
|
subpattern[-1] = (MIN_REPEAT, (min, max, item))
|
|
else:
|
|
subpattern[-1] = (MAX_REPEAT, (min, max, item))
|
|
|
|
elif this == ".":
|
|
subpatternappend((ANY, None))
|
|
|
|
elif this == "(":
|
|
group = 1
|
|
name = None
|
|
condgroup = None
|
|
if sourcematch("?"):
|
|
group = 0
|
|
# options
|
|
if sourcematch("P"):
|
|
# python extensions
|
|
if sourcematch("<"):
|
|
# named group: skip forward to end of name
|
|
name = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error("unterminated name")
|
|
if char == ">":
|
|
break
|
|
name = name + char
|
|
group = 1
|
|
if not isname(name):
|
|
raise error("bad character in group name")
|
|
elif sourcematch("="):
|
|
# named backreference
|
|
name = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error("unterminated name")
|
|
if char == ")":
|
|
break
|
|
name = name + char
|
|
if not isname(name):
|
|
raise error("bad character in group name")
|
|
gid = state.groupdict.get(name)
|
|
if gid is None:
|
|
raise error("unknown group name")
|
|
subpatternappend((GROUPREF, gid))
|
|
continue
|
|
else:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error("unexpected end of pattern")
|
|
raise error("unknown specifier: ?P%s" % char)
|
|
elif sourcematch(":"):
|
|
# non-capturing group
|
|
group = 2
|
|
elif sourcematch("#"):
|
|
# comment
|
|
while 1:
|
|
if source.next is None or source.next == ")":
|
|
break
|
|
sourceget()
|
|
if not sourcematch(")"):
|
|
raise error("unbalanced parenthesis")
|
|
continue
|
|
elif source.next in ASSERTCHARS:
|
|
# lookahead assertions
|
|
char = sourceget()
|
|
dir = 1
|
|
if char == "<":
|
|
if source.next not in LOOKBEHINDASSERTCHARS:
|
|
raise error("syntax error")
|
|
dir = -1 # lookbehind
|
|
char = sourceget()
|
|
p = _parse_sub(source, state)
|
|
if not sourcematch(")"):
|
|
raise error("unbalanced parenthesis")
|
|
if char == "=":
|
|
subpatternappend((ASSERT, (dir, p)))
|
|
else:
|
|
subpatternappend((ASSERT_NOT, (dir, p)))
|
|
continue
|
|
elif sourcematch("("):
|
|
# conditional backreference group
|
|
condname = ""
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error("unterminated name")
|
|
if char == ")":
|
|
break
|
|
condname = condname + char
|
|
group = 2
|
|
if isname(condname):
|
|
condgroup = state.groupdict.get(condname)
|
|
if condgroup is None:
|
|
raise error("unknown group name")
|
|
else:
|
|
try:
|
|
condgroup = int(condname)
|
|
except ValueError:
|
|
raise error("bad character in group name")
|
|
else:
|
|
# flags
|
|
if not source.next in FLAGS:
|
|
raise error("unexpected end of pattern")
|
|
while source.next in FLAGS:
|
|
state.flags = state.flags | FLAGS[sourceget()]
|
|
if group:
|
|
# parse group contents
|
|
if group == 2:
|
|
# anonymous group
|
|
group = None
|
|
else:
|
|
group = state.opengroup(name)
|
|
if condgroup:
|
|
p = _parse_sub_cond(source, state, condgroup)
|
|
else:
|
|
p = _parse_sub(source, state)
|
|
if not sourcematch(")"):
|
|
raise error("unbalanced parenthesis")
|
|
if group is not None:
|
|
state.closegroup(group)
|
|
subpatternappend((SUBPATTERN, (group, p)))
|
|
else:
|
|
while 1:
|
|
char = sourceget()
|
|
if char is None:
|
|
raise error("unexpected end of pattern")
|
|
if char == ")":
|
|
break
|
|
raise error("unknown extension")
|
|
|
|
elif this == "^":
|
|
subpatternappend((AT, AT_BEGINNING))
|
|
|
|
elif this == "$":
|
|
subpattern.append((AT, AT_END))
|
|
|
|
elif this and this[0] == "\\":
|
|
code = _escape(source, this, state)
|
|
subpatternappend(code)
|
|
|
|
else:
|
|
raise error("parser error")
|
|
|
|
return subpattern
|
|
|
|
def fix_flags(src, flags):
|
|
# Check and fix flags according to the type of pattern (str or bytes)
|
|
if isinstance(src, str):
|
|
if not flags & SRE_FLAG_ASCII:
|
|
flags |= SRE_FLAG_UNICODE
|
|
elif flags & SRE_FLAG_UNICODE:
|
|
raise ValueError("ASCII and UNICODE flags are incompatible")
|
|
else:
|
|
if flags & SRE_FLAG_UNICODE:
|
|
raise ValueError("can't use UNICODE flag with a bytes pattern")
|
|
return flags
|
|
|
|
def parse(str, flags=0, pattern=None):
|
|
# parse 're' pattern into list of (opcode, argument) tuples
|
|
|
|
source = Tokenizer(str)
|
|
|
|
if pattern is None:
|
|
pattern = Pattern()
|
|
pattern.flags = flags
|
|
pattern.str = str
|
|
|
|
p = _parse_sub(source, pattern, 0)
|
|
p.pattern.flags = fix_flags(str, p.pattern.flags)
|
|
|
|
tail = source.get()
|
|
if tail == ")":
|
|
raise error("unbalanced parenthesis")
|
|
elif tail:
|
|
raise error("bogus characters at end of regular expression")
|
|
|
|
if flags & SRE_FLAG_DEBUG:
|
|
p.dump()
|
|
|
|
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
|
|
# the VERBOSE flag was switched on inside the pattern. to be
|
|
# on the safe side, we'll parse the whole thing again...
|
|
return parse(str, p.pattern.flags)
|
|
|
|
return p
|
|
|
|
def parse_template(source, pattern):
|
|
# parse 're' replacement string into list of literals and
|
|
# group references
|
|
s = Tokenizer(source)
|
|
sget = s.get
|
|
p = []
|
|
a = p.append
|
|
def literal(literal, p=p, pappend=a):
|
|
if p and p[-1][0] is LITERAL:
|
|
p[-1] = LITERAL, p[-1][1] + literal
|
|
else:
|
|
pappend((LITERAL, literal))
|
|
sep = source[:0]
|
|
if isinstance(sep, str):
|
|
makechar = chr
|
|
else:
|
|
makechar = chr
|
|
while 1:
|
|
this = sget()
|
|
if this is None:
|
|
break # end of replacement string
|
|
if this and this[0] == "\\":
|
|
# group
|
|
c = this[1:2]
|
|
if c == "g":
|
|
name = ""
|
|
if s.match("<"):
|
|
while 1:
|
|
char = sget()
|
|
if char is None:
|
|
raise error("unterminated group name")
|
|
if char == ">":
|
|
break
|
|
name = name + char
|
|
if not name:
|
|
raise error("bad group name")
|
|
try:
|
|
index = int(name)
|
|
if index < 0:
|
|
raise error("negative group number")
|
|
except ValueError:
|
|
if not isname(name):
|
|
raise error("bad character in group name")
|
|
try:
|
|
index = pattern.groupindex[name]
|
|
except KeyError:
|
|
raise IndexError("unknown group name")
|
|
a((MARK, index))
|
|
elif c == "0":
|
|
if s.next in OCTDIGITS:
|
|
this = this + sget()
|
|
if s.next in OCTDIGITS:
|
|
this = this + sget()
|
|
literal(makechar(int(this[1:], 8) & 0xff))
|
|
elif c in DIGITS:
|
|
isoctal = False
|
|
if s.next in DIGITS:
|
|
this = this + sget()
|
|
if (c in OCTDIGITS and this[2] in OCTDIGITS and
|
|
s.next in OCTDIGITS):
|
|
this = this + sget()
|
|
isoctal = True
|
|
literal(makechar(int(this[1:], 8) & 0xff))
|
|
if not isoctal:
|
|
a((MARK, int(this[1:])))
|
|
else:
|
|
try:
|
|
this = makechar(ESCAPES[this][1])
|
|
except KeyError:
|
|
pass
|
|
literal(this)
|
|
else:
|
|
literal(this)
|
|
# convert template to groups and literals lists
|
|
i = 0
|
|
groups = []
|
|
groupsappend = groups.append
|
|
literals = [None] * len(p)
|
|
if isinstance(source, str):
|
|
encode = lambda x: x
|
|
else:
|
|
# The tokenizer implicitly decodes bytes objects as latin-1, we must
|
|
# therefore re-encode the final representation.
|
|
encode = lambda x: x.encode('latin-1')
|
|
for c, s in p:
|
|
if c is MARK:
|
|
groupsappend((i, s))
|
|
# literal[i] is already None
|
|
else:
|
|
literals[i] = encode(s)
|
|
i = i + 1
|
|
return groups, literals
|
|
|
|
def expand_template(template, match):
|
|
g = match.group
|
|
sep = match.string[:0]
|
|
groups, literals = template
|
|
literals = literals[:]
|
|
try:
|
|
for index, group in groups:
|
|
literals[index] = s = g(group)
|
|
if s is None:
|
|
raise error("unmatched group")
|
|
except IndexError:
|
|
raise error("invalid group reference")
|
|
return sep.join(literals)
|