2000-03-31 22:58:54 +08:00
|
|
|
#
|
|
|
|
# Secret Labs' Regular Expression Engine
|
|
|
|
#
|
2000-06-09 22:08:07 +08:00
|
|
|
# convert re-style regular expression to sre pattern
|
2000-03-31 22:58:54 +08:00
|
|
|
#
|
2001-01-14 23:06:11 +08:00
|
|
|
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
|
2000-03-31 22:58:54 +08:00
|
|
|
#
|
2000-08-02 02:20:07 +08:00
|
|
|
# See the sre.py file for information on usage and redistribution.
|
2000-03-31 22:58:54 +08:00
|
|
|
#
|
|
|
|
|
2001-09-05 03:10:20 +08:00
|
|
|
"""Internal support module for sre"""
|
|
|
|
|
2001-01-15 05:00:44 +08:00
|
|
|
# XXX: show string offset and offending character for all errors
|
|
|
|
|
2004-08-25 10:22:30 +08:00
|
|
|
import sys
|
2000-03-31 22:58:54 +08:00
|
|
|
|
|
|
|
from sre_constants import *
|
|
|
|
|
2005-03-01 03:27:52 +08:00
|
|
|
def set(seq):
    """Return a dict-backed set: each element of *seq* is a key mapped to 1.

    The result is only meant for membership tests (``x in result``).
    """
    return dict.fromkeys(seq, 1)
|
|
|
|
|
2000-03-31 22:58:54 +08:00
|
|
|
# characters that are not plain literals when they start a token
SPECIAL_CHARS = ".\\[{()*+?^$|"
# presumably the characters that introduce a repeat operator -- the code
# using this table is not part of this section
REPEAT_CHARS = "*+?{"

# membership tables for classifying single characters
OCTDIGITS = set("01234567")
DIGITS = set("01234567" "89")
HEXDIGITS = set("0123456789" "abcdef" "ABCDEF")
WHITESPACE = set(" \t\n\r\v\f")
|
2000-06-09 22:08:07 +08:00
|
|
|
|
2000-03-31 22:58:54 +08:00
|
|
|
# escape sequences that stand for one literal character:
# escape text -> (LITERAL, character code)
ESCAPES = dict([
    (r"\a", (LITERAL, ord("\a"))),
    (r"\b", (LITERAL, ord("\b"))),
    (r"\f", (LITERAL, ord("\f"))),
    (r"\n", (LITERAL, ord("\n"))),
    (r"\r", (LITERAL, ord("\r"))),
    (r"\t", (LITERAL, ord("\t"))),
    (r"\v", (LITERAL, ord("\v"))),
    (r"\\", (LITERAL, ord("\\"))),
])
|
|
|
|
|
|
|
|
# escape sequences that stand for a position assertion (AT ...) or a
# character category (IN [...]): escape text -> (opcode, argument)
CATEGORIES = dict([
    (r"\A", (AT, AT_BEGINNING_STRING)),  # start of string
    (r"\b", (AT, AT_BOUNDARY)),
    (r"\B", (AT, AT_NON_BOUNDARY)),
    (r"\d", (IN, [(CATEGORY, CATEGORY_DIGIT)])),
    (r"\D", (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)])),
    (r"\s", (IN, [(CATEGORY, CATEGORY_SPACE)])),
    (r"\S", (IN, [(CATEGORY, CATEGORY_NOT_SPACE)])),
    (r"\w", (IN, [(CATEGORY, CATEGORY_WORD)])),
    (r"\W", (IN, [(CATEGORY, CATEGORY_NOT_WORD)])),
    (r"\Z", (AT, AT_END_STRING)),  # end of string
])
|
|
|
|
|
2000-06-09 22:08:07 +08:00
|
|
|
# single-letter flag names -> SRE_FLAG_* bit values
FLAGS = dict(
    # standard flags
    i=SRE_FLAG_IGNORECASE,
    L=SRE_FLAG_LOCALE,
    m=SRE_FLAG_MULTILINE,
    s=SRE_FLAG_DOTALL,
    x=SRE_FLAG_VERBOSE,
    # extensions
    t=SRE_FLAG_TEMPLATE,
    u=SRE_FLAG_UNICODE,
)
|
|
|
|
|
2000-07-24 05:46:17 +08:00
|
|
|
class Pattern:
    """Master pattern object; keeps track of global attributes.

    Attributes set by ``__init__``:
        flags     -- flag bits, starts at 0
        open      -- ids of groups that were opened and not yet closed
        groups    -- the id the next opened group will receive (starts at 1)
        groupdict -- maps group names to group ids
    """

    def __init__(self):
        self.flags = 0
        self.open = []
        self.groups = 1
        self.groupdict = {}

    def opengroup(self, name=None):
        """Allocate the next group id, mark it open and return it.

        If *name* is given it is registered in ``groupdict``.  Raises
        ``error`` when *name* is already bound to a group.
        """
        gid = self.groups
        self.groups = gid + 1
        if name is not None:
            ogid = self.groupdict.get(name)
            if ogid is not None:
                # call form of raise: valid on Python 2 and Python 3
                raise error("redefinition of group name %s as group %d; "
                            "was group %d" % (repr(name), gid, ogid))
            self.groupdict[name] = gid
        self.open.append(gid)
        return gid

    def closegroup(self, gid):
        """Mark group *gid* as closed (removes it from ``open``)."""
        self.open.remove(gid)

    def checkgroup(self, gid):
        """Return true if *gid* has been allocated and is not still open."""
        return gid < self.groups and gid not in self.open
|
2000-03-31 22:58:54 +08:00
|
|
|
|
|
|
|
class SubPattern:
    """A subpattern, in intermediate form.

    ``data`` is a list of ``(op, av)`` items, ``pattern`` is the object
    handed to the constructor (the parser passes its state object), and
    ``width`` caches the result of ``getwidth()``.
    """

    def __init__(self, pattern, data=None):
        self.pattern = pattern
        if data is None:
            data = []
        self.data = data
        self.width = None

    def dump(self, level=0):
        """Print an indented, human-readable listing of the items."""
        nl = 1  # true when the output cursor is at the start of a line
        seqtypes = (tuple, list)
        for op, av in self.data:
            print(level*" " + op, end=' '); nl = 0
            if op == "in":
                # member sublanguage
                print(); nl = 1
                for member_op, member_av in av:
                    print((level+1)*" " + member_op, member_av)
            elif op == "branch":
                print(); nl = 1
                for i, branch in enumerate(av[1]):
                    if i > 0:
                        print(level*" " + "or")
                    branch.dump(level+1); nl = 1
            elif isinstance(av, seqtypes):
                for a in av:
                    if isinstance(a, SubPattern):
                        if not nl:
                            print()
                        a.dump(level+1); nl = 1
                    else:
                        print(a, end=' '); nl = 0
            else:
                print(av, end=' '); nl = 0
            if not nl:
                print()

    def __repr__(self):
        return repr(self.data)

    def __len__(self):
        return len(self.data)

    def __delitem__(self, index):
        del self.data[index]

    def __getitem__(self, index):
        # a slice yields a new SubPattern sharing the same pattern object
        if isinstance(index, slice):
            return SubPattern(self.pattern, self.data[index])
        return self.data[index]

    def __setitem__(self, index, code):
        self.data[index] = code

    def __getslice__(self, start, stop):
        # kept for the Python 2 simple-slice protocol
        return SubPattern(self.pattern, self.data[start:stop])

    def insert(self, index, code):
        self.data.insert(index, code)

    def append(self, code):
        self.data.append(code)

    def getwidth(self):
        """Return the ``(min, max)`` width for this subpattern.

        The result is stored in ``self.width`` and reused by later calls.
        Both values are clamped to the platform integer limit.
        """
        if self.width:
            return self.width
        # sys.maxint does not exist on Python 3; use sys.maxsize there
        limit = getattr(sys, "maxint", sys.maxsize)
        lo = hi = 0
        UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
        REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
        for op, av in self.data:
            if op is BRANCH:
                # narrowest minimum / widest maximum over all alternatives
                i = limit
                j = 0
                for branch in av[1]:
                    l, h = branch.getwidth()
                    i = min(i, l)
                    j = max(j, h)
                lo = lo + i
                hi = hi + j
            elif op is CALL:
                i, j = av.getwidth()
                lo = lo + i
                hi = hi + j
            elif op is SUBPATTERN:
                i, j = av[1].getwidth()
                lo = lo + i
                hi = hi + j
            elif op in REPEATCODES:
                # av is (min count, max count, repeated item)
                i, j = av[2].getwidth()
                lo = lo + int(i) * av[0]
                hi = hi + int(j) * av[1]
            elif op in UNITCODES:
                lo = lo + 1
                hi = hi + 1
            elif op == SUCCESS:
                break
        self.width = int(min(lo, limit)), int(min(hi, limit))
        return self.width
|
2000-03-31 22:58:54 +08:00
|
|
|
|
|
|
|
class Tokenizer:
    """Split a pattern string into tokens with one token of lookahead.

    A token is a single character, or a backslash together with the
    character that follows it.  ``next`` holds the upcoming token (None at
    the end of the string) and ``index`` is the offset just past it.
    """

    def __init__(self, string):
        self.string = string
        self.index = 0
        self.__next()

    def __next(self):
        # load the following token into self.next
        if self.index >= len(self.string):
            self.next = None
            return
        char = self.string[self.index]
        if char[0] == "\\":
            try:
                c = self.string[self.index + 1]
            except IndexError:
                # call form of raise: valid on Python 2 and Python 3
                raise error("bogus escape (end of line)")
            char = char + c
        self.index = self.index + len(char)
        self.next = char

    def match(self, char, skip=1):
        """Return 1 if the upcoming token equals *char*, else 0.

        On a match the token is consumed unless *skip* is false.
        """
        if char == self.next:
            if skip:
                self.__next()
            return 1
        return 0

    def get(self):
        """Consume and return the upcoming token (None at end of input)."""
        this = self.next
        self.__next()
        return this

    def tell(self):
        """Return the current position as an ``(index, next)`` pair."""
        return self.index, self.next

    def seek(self, index):
        """Restore a position previously returned by ``tell()``."""
        self.index, self.next = index
|
2000-03-31 22:58:54 +08:00
|
|
|
|
2000-06-29 20:38:45 +08:00
|
|
|
def isident(char):
    """Return True if *char* is an ASCII letter or an underscore."""
    if "a" <= char <= "z":
        return True
    if "A" <= char <= "Z":
        return True
    return char == "_"
|
|
|
|
|
|
|
|
def isdigit(char):
    """Return True if *char* lies between "0" and "9" inclusive."""
    return "0" <= char and char <= "9"
|
|
|
|
|
|
|
|
def isname(name):
    """Check that a group name is a valid identifier-like string.

    The first character must satisfy ``isident``; every later character
    must satisfy ``isident`` or ``isdigit``.  An empty name is not valid
    and yields False (indexing it used to raise IndexError).
    """
    if not name or not isident(name[0]):
        return False
    for char in name[1:]:
        if not isident(char) and not isdigit(char):
            return False
    return True
|
2000-06-29 20:38:45 +08:00
|
|
|
|
2000-06-09 22:08:07 +08:00
|
|
|
def _class_escape(source, escape):
    """Handle an escape code inside a character class.

    *escape* is the backslash token already read from *source*; further
    digits are pulled from *source* as needed.  Returns the matching entry
    from ESCAPES or CATEGORIES, or a ``(LITERAL, code)`` pair.  Raises
    ``error`` for an escape that cannot be interpreted.
    """
    code = ESCAPES.get(escape)
    if code:
        return code
    code = CATEGORIES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape (exactly two digits)
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                # report the complete escape, including the "x"
                raise error("bogus escape: %s" % repr(escape))
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c in OCTDIGITS:
            # octal escape (up to three digits)
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # 8 and 9 cannot start an octal escape
            raise error("bogus escape: %s" % repr(escape))
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        # fall through to the generic error below
        pass
    raise error("bogus escape: %s" % repr(escape))
|
2000-06-09 22:08:07 +08:00
|
|
|
|
|
|
|
def _escape(source, escape, state):
    """Handle an escape code in an expression (outside a character class).

    *escape* is the backslash token already read from *source*; further
    digits are pulled from *source* as needed.  Returns the matching entry
    from CATEGORIES or ESCAPES, a ``(LITERAL, code)`` pair, or a
    ``(GROUPREF, group)`` pair for a numeric back-reference.  Raises
    ``error`` for an escape that cannot be interpreted or that refers to a
    group which is still open.
    """
    code = CATEGORIES.get(escape)
    if code:
        return code
    code = ESCAPES.get(escape)
    if code:
        return code
    try:
        c = escape[1:2]
        if c == "x":
            # hexadecimal escape
            while source.next in HEXDIGITS and len(escape) < 4:
                escape = escape + source.get()
            if len(escape) != 4:
                raise ValueError
            return LITERAL, int(escape[2:], 16) & 0xff
        elif c == "0":
            # octal escape
            while source.next in OCTDIGITS and len(escape) < 4:
                escape = escape + source.get()
            return LITERAL, int(escape[1:], 8) & 0xff
        elif c in DIGITS:
            # octal escape *or* decimal group reference (sigh)
            if source.next in DIGITS:
                escape = escape + source.get()
                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
                        source.next in OCTDIGITS):
                    # got three octal digits; this is an octal escape
                    escape = escape + source.get()
                    return LITERAL, int(escape[1:], 8) & 0xff
            # not an octal escape, so this is a group reference
            group = int(escape[1:])
            if group < state.groups:
                if not state.checkgroup(group):
                    raise error("cannot refer to open group")
                return GROUPREF, group
            # reference to a group number that has not been allocated
            raise ValueError
        if len(escape) == 2:
            return LITERAL, ord(escape[1])
    except ValueError:
        # fall through to the generic error below
        pass
    raise error("bogus escape: %s" % repr(escape))
|
2000-03-31 22:58:54 +08:00
|
|
|
|
2000-07-24 05:46:17 +08:00
|
|
|
def _parse_sub(source, state, nested=1):
    """Parse an alternation: a|b|c.

    Alternatives are parsed with ``_parse`` until no "|" follows.  With
    *nested* true, the alternation must end at the end of input or in front
    of a ")" (which is left unconsumed); otherwise ``error`` is raised.

    Returns the single alternative unchanged when there is only one.
    Otherwise returns a new SubPattern holding any prefix shared by all
    alternatives, followed by either one IN item (when every remainder is a
    single LITERAL) or one BRANCH item.
    """
    items = []
    itemsappend = items.append
    sourcematch = source.match
    while 1:
        itemsappend(_parse(source, state))
        if sourcematch("|"):
            continue
        if not nested:
            break
        if not source.next or sourcematch(")", 0):
            break
        raise error("pattern not properly closed")

    if len(items) == 1:
        return items[0]

    subpattern = SubPattern(state)
    subpatternappend = subpattern.append

    # check if all items share a common prefix
    while 1:
        prefix = None
        for item in items:
            if not item:
                break
            if prefix is None:
                prefix = item[0]
            elif item[0] != prefix:
                break
        else:
            # all subitems start with a common "prefix".
            # move it out of the branch
            for item in items:
                del item[0]
            subpatternappend(prefix)
            continue  # check next one
        break

    # check if the branch can be replaced by a character set
    for item in items:
        if len(item) != 1 or item[0][0] != LITERAL:
            break
    else:
        # we can store this as a character set instead of a
        # branch (the compiler may optimize this even more)
        charset = []
        charsetappend = charset.append
        for item in items:
            charsetappend(item[0])
        subpatternappend((IN, charset))
        return subpattern

    subpatternappend((BRANCH, (None, items)))
    return subpattern
|
2000-03-31 22:58:54 +08:00
|
|
|
|
2003-10-18 06:13:16 +08:00
|
|
|
def _parse_sub_cond(source, state, condgroup):
    """Parse the body of a conditional group: a "yes" part and an optional
    "no" part separated by "|".

    Returns a SubPattern with a single
    ``(GROUPREF_EXISTS, (condgroup, item_yes, item_no))`` item; ``item_no``
    is None when there is no second branch.  Raises ``error`` when a third
    branch follows or when the body is not followed by ")" or end of input.
    """
    item_yes = _parse(source, state)
    if source.match("|"):
        item_no = _parse(source, state)
        if source.match("|"):
            raise error("conditional backref with more than two branches")
    else:
        item_no = None
    # the closing ")" is only peeked at, not consumed
    if source.next and not source.match(")", 0):
        raise error("pattern not properly closed")
    subpattern = SubPattern(state)
    subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
    return subpattern
|
|
|
|
|
2005-03-01 03:27:52 +08:00
|
|
|
# lookup tables that _parse copies into local variables
_PATTERNENDERS = set(("|", ")"))
_ASSERTCHARS = set(("=", "!", "<"))
_LOOKBEHINDASSERTCHARS = set(("=", "!"))
_REPEATCODES = set((MIN_REPEAT, MAX_REPEAT))
|
|
|
|
|
2000-07-01 06:37:31 +08:00
|
|
|
def _parse(source, state):
    """Parse a simple pattern (one alternative) into a SubPattern.

    Items are read from `source` until the next token is "|" or ")" or
    the input is exhausted; the terminating token is left unconsumed for
    the caller.

    source -- tokenizer over the pattern string
    state  -- pattern state: supplies flags, groupdict, opengroup() and
              closegroup(); inline flags such as (?i) are OR-ed into
              state.flags as a side effect

    Returns a SubPattern of (opcode, argument) tuples.
    Raises error on malformed input.
    """
    subpattern = SubPattern(state)

    # precompute constants into local variables
    subpatternappend = subpattern.append
    sourceget = source.get
    sourcematch = source.match
    _len = len
    PATTERNENDERS = _PATTERNENDERS
    ASSERTCHARS = _ASSERTCHARS
    LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS
    REPEATCODES = _REPEATCODES

    while 1:

        if source.next in PATTERNENDERS:
            break # end of subpattern
        this = sourceget()
        if this is None:
            break # end of pattern

        if state.flags & SRE_FLAG_VERBOSE:
            # skip whitespace and comments
            if this in WHITESPACE:
                continue
            if this == "#":
                # discard everything up to and including the newline
                while 1:
                    this = sourceget()
                    if this in (None, "\n"):
                        break
                continue

        if this and this[0] not in SPECIAL_CHARS:
            subpatternappend((LITERAL, ord(this)))

        elif this == "[":
            # character set (named "charset" so that the module-level
            # set() helper is not shadowed)
            charset = []
            charsetappend = charset.append
##          if sourcematch(":"):
##              pass # handle character classes
            if sourcematch("^"):
                charsetappend((NEGATE, None))
            # check remaining characters; "start" remembers the initial
            # contents so that a "]" seen before any member has been added
            # is taken as a literal rather than as the end of the set
            start = charset[:]
            while 1:
                this = sourceget()
                if this == "]" and charset != start:
                    break
                elif this and this[0] == "\\":
                    code1 = _class_escape(source, this)
                elif this:
                    code1 = LITERAL, ord(this)
                else:
                    raise error("unexpected end of regular expression")
                if sourcematch("-"):
                    # potential range
                    this = sourceget()
                    if this == "]":
                        # trailing "-" just before "]" is a literal hyphen
                        if code1[0] is IN:
                            code1 = code1[1][0]
                        charsetappend(code1)
                        charsetappend((LITERAL, ord("-")))
                        break
                    elif this:
                        if this[0] == "\\":
                            code2 = _class_escape(source, this)
                        else:
                            code2 = LITERAL, ord(this)
                        # both end points must be plain characters
                        if code1[0] != LITERAL or code2[0] != LITERAL:
                            raise error("bad character range")
                        lo = code1[1]
                        hi = code2[1]
                        if hi < lo:
                            raise error("bad character range")
                        charsetappend((RANGE, (lo, hi)))
                    else:
                        raise error("unexpected end of regular expression")
                else:
                    # unwrap an IN item to its first member before storing
                    if code1[0] is IN:
                        code1 = code1[1][0]
                    charsetappend(code1)

            # XXX: <fl> should move set optimization to compiler!
            if _len(charset) == 1 and charset[0][0] is LITERAL:
                subpatternappend(charset[0]) # optimization
            elif (_len(charset) == 2 and charset[0][0] is NEGATE
                  and charset[1][0] is LITERAL):
                subpatternappend((NOT_LITERAL, charset[1][1])) # optimization
            else:
                # XXX: <fl> should add charmap optimization here
                subpatternappend((IN, charset))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                minimum, maximum = 0, 1
            elif this == "*":
                minimum, maximum = 0, MAXREPEAT

            elif this == "+":
                minimum, maximum = 1, MAXREPEAT
            elif this == "{":
                if source.next == "}":
                    # "{}" is not a repeat: emit the brace as a literal
                    subpatternappend((LITERAL, ord(this)))
                    continue
                here = source.tell()
                minimum, maximum = 0, MAXREPEAT
                lo = hi = ""
                while source.next in DIGITS:
                    lo = lo + sourceget()
                if sourcematch(","):
                    while source.next in DIGITS:
                        hi = hi + sourceget()
                else:
                    hi = lo
                if not sourcematch("}"):
                    # not a well-formed {m,n}: treat "{" as a literal and
                    # rewind so the following characters are parsed again
                    subpatternappend((LITERAL, ord(this)))
                    source.seek(here)
                    continue
                if lo:
                    minimum = int(lo)
                if hi:
                    maximum = int(hi)
                if maximum < minimum:
                    raise error("bad repeat interval")
            else:
                raise error("not supported")
            # figure out which item to repeat
            if subpattern:
                item = subpattern[-1:]
            else:
                item = None
            if not item or (_len(item) == 1 and item[0][0] == AT):
                raise error("nothing to repeat")
            if item[0][0] in REPEATCODES:
                raise error("multiple repeat")
            # a "?" suffix selects the MIN_REPEAT form of the repeat
            if sourcematch("?"):
                subpattern[-1] = (MIN_REPEAT, (minimum, maximum, item))
            else:
                subpattern[-1] = (MAX_REPEAT, (minimum, maximum, item))

        elif this == ".":
            subpatternappend((ANY, None))

        elif this == "(":
            # group: 1 = capturing, 2 = anonymous, 0 = no group contents
            # to parse (flags only)
            group = 1
            name = None
            condgroup = None
            if sourcematch("?"):
                group = 0
                # options
                if sourcematch("P"):
                    # python extensions
                    if sourcematch("<"):
                        # named group: skip forward to end of name
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ">":
                                break
                            name = name + char
                        group = 1
                        if not isname(name):
                            raise error("bad character in group name")
                    elif sourcematch("="):
                        # named backreference
                        name = ""
                        while 1:
                            char = sourceget()
                            if char is None:
                                raise error("unterminated name")
                            if char == ")":
                                break
                            name = name + char
                        if not isname(name):
                            raise error("bad character in group name")
                        gid = state.groupdict.get(name)
                        if gid is None:
                            raise error("unknown group name")
                        subpatternappend((GROUPREF, gid))
                        continue
                    else:
                        char = sourceget()
                        if char is None:
                            raise error("unexpected end of pattern")
                        raise error("unknown specifier: ?P%s" % char)
                elif sourcematch(":"):
                    # non-capturing group
                    group = 2
                elif sourcematch("#"):
                    # comment: skip everything up to the closing parenthesis
                    while source.next is not None and source.next != ")":
                        sourceget()
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    continue
                elif source.next in ASSERTCHARS:
                    # lookahead assertions
                    char = sourceget()
                    direction = 1
                    if char == "<":
                        if source.next not in LOOKBEHINDASSERTCHARS:
                            raise error("syntax error")
                        direction = -1 # lookbehind
                        char = sourceget()
                    p = _parse_sub(source, state)
                    if not sourcematch(")"):
                        raise error("unbalanced parenthesis")
                    if char == "=":
                        subpatternappend((ASSERT, (direction, p)))
                    else:
                        subpatternappend((ASSERT_NOT, (direction, p)))
                    continue
                elif sourcematch("("):
                    # conditional backreference group
                    condname = ""
                    while 1:
                        char = sourceget()
                        if char is None:
                            raise error("unterminated name")
                        if char == ")":
                            break
                        condname = condname + char
                    group = 2
                    if isname(condname):
                        condgroup = state.groupdict.get(condname)
                        if condgroup is None:
                            raise error("unknown group name")
                    else:
                        try:
                            condgroup = int(condname)
                        except ValueError:
                            raise error("bad character in group name")
                else:
                    # flags
                    if source.next not in FLAGS:
                        raise error("unexpected end of pattern")
                    while source.next in FLAGS:
                        state.flags = state.flags | FLAGS[sourceget()]
            if group:
                # parse group contents
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = state.opengroup(name)
                # NOTE(review): this is a truth test, so a numeric
                # condition of 0 is handled like "no condition" and the
                # contents are parsed as an ordinary group -- confirm that
                # this is intended.
                if condgroup:
                    p = _parse_sub_cond(source, state, condgroup)
                else:
                    p = _parse_sub(source, state)
                if not sourcematch(")"):
                    raise error("unbalanced parenthesis")
                if group is not None:
                    state.closegroup(group)
                subpatternappend((SUBPATTERN, (group, p)))
            else:
                # flags-only group: the very next character must be ")"
                char = sourceget()
                if char is None:
                    raise error("unexpected end of pattern")
                if char != ")":
                    raise error("unknown extension")

        elif this == "^":
            subpatternappend((AT, AT_BEGINNING))

        elif this == "$":
            subpatternappend((AT, AT_END))

        elif this and this[0] == "\\":
            code = _escape(source, this, state)
            subpatternappend(code)

        else:
            raise error("parser error")

    return subpattern
|
|
|
|
|
2000-08-08 04:59:04 +08:00
|
|
|
def parse(str, flags=0, pattern=None):
    """Parse an 're' pattern into a list of (opcode, argument) tuples.

    str     -- the pattern string (the parameter name shadows the builtin
               but is kept for compatibility with keyword callers)
    flags   -- initial SRE_FLAG_* bits
    pattern -- optional state object to parse into; a fresh Pattern() is
               created when omitted.  Its `flags` and `str` attributes are
               overwritten.

    Returns the parsed SubPattern.  Raises error if unparsed input (for
    example an unmatched ")") remains after the top-level parse.
    """

    source = Tokenizer(str)

    if pattern is None:
        pattern = Pattern()
    pattern.flags = flags
    pattern.str = str

    p = _parse_sub(source, pattern, 0)

    # anything left over means the top-level parse stopped early
    tail = source.get()
    if tail == ")":
        raise error("unbalanced parenthesis")
    elif tail:
        raise error("bogus characters at end of regular expression")

    if flags & SRE_FLAG_DEBUG:
        p.dump()

    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
        # the VERBOSE flag was switched on inside the pattern.  to be
        # on the safe side, we'll parse the whole thing again...
        # (the re-parse starts from a fresh Pattern(): the `pattern`
        # argument is not passed on)
        return parse(str, p.pattern.flags)

    return p
|
|
|
|
|
2000-06-29 16:58:44 +08:00
|
|
|
def parse_template(source, pattern):
    """Parse an 're' replacement string into literals and group references.

    source  -- the replacement template
    pattern -- object whose `groupindex` mapping resolves \\g<name>
               references to group numbers

    Returns a (groups, literals) pair.  `literals` has one slot per
    template item: the literal text, or None where a group's value is to
    be substituted.  `groups` lists (slot index, group number) pairs for
    those None slots.

    Raises error for a malformed group reference and IndexError for a
    group name that is not in pattern.groupindex.
    """
    s = Tokenizer(source)
    sget = s.get
    p = []
    a = p.append

    def literal(text, p=p, pappend=a):
        # merge adjacent literal text into a single LITERAL item
        if p and p[-1][0] is LITERAL:
            p[-1] = LITERAL, p[-1][1] + text
        else:
            pappend((LITERAL, text))

    # Both branches of the former type test on the template selected chr,
    # so the test (and its helper variable) has been dropped.
    makechar = chr
    while 1:
        this = sget()
        if this is None:
            break # end of replacement string
        if this and this[0] == "\\":
            # group
            c = this[1:2]
            if c == "g":
                # \g<name> or \g<number>
                name = ""
                if s.match("<"):
                    while 1:
                        char = sget()
                        if char is None:
                            raise error("unterminated group name")
                        if char == ">":
                            break
                        name = name + char
                if not name:
                    raise error("bad group name")
                try:
                    index = int(name)
                    if index < 0:
                        raise error("negative group number")
                except ValueError:
                    # not a number, so it has to be a known group name
                    if not isname(name):
                        raise error("bad character in group name")
                    try:
                        index = pattern.groupindex[name]
                    except KeyError:
                        raise IndexError("unknown group name")
                a((MARK, index))
            elif c == "0":
                # \0 followed by up to two more octal digits
                if s.next in OCTDIGITS:
                    this = this + sget()
                    if s.next in OCTDIGITS:
                        this = this + sget()
                literal(makechar(int(this[1:], 8) & 0xff))
            elif c in DIGITS:
                # one or two decimal digits are a group reference; three
                # octal digits are an octal character escape instead
                isoctal = False
                if s.next in DIGITS:
                    this = this + sget()
                    if (c in OCTDIGITS and this[2] in OCTDIGITS and
                        s.next in OCTDIGITS):
                        this = this + sget()
                        isoctal = True
                        literal(makechar(int(this[1:], 8) & 0xff))
                if not isoctal:
                    a((MARK, int(this[1:])))
            else:
                # known escapes are translated; anything else is copied
                # through unchanged, backslash included
                try:
                    this = makechar(ESCAPES[this][1])
                except KeyError:
                    pass
                literal(this)
        else:
            literal(this)
    # convert template to groups and literals lists
    groups = []
    groupsappend = groups.append
    literals = [None] * len(p)
    for i, (code, value) in enumerate(p):
        if code is MARK:
            groupsappend((i, value))
            # literals[i] is already None
        else:
            literals[i] = value
    return groups, literals
|
2000-06-09 22:08:07 +08:00
|
|
|
|
2000-06-29 16:58:44 +08:00
|
|
|
def expand_template(template, match):
    """Fill a parsed replacement template with the groups of `match`.

    template -- a (groups, literals) pair: `literals` is a list of text
                pieces with placeholder slots, and `groups` is a list of
                (slot index, group) pairs naming the group whose value
                goes into each slot
    match    -- object providing group(n) and a `string` attribute

    Returns the pieces joined with an empty slice of match.string, so the
    result has the same string type as the searched string.  The
    template's own `literals` list is copied, not modified.

    Raises error if a referenced group is None ("unmatched group") or if
    an IndexError occurs while filling the slots ("invalid group
    reference").
    """
    getgroup = match.group
    sep = match.string[:0]
    groups, literals = template
    # work on a copy so the template can be reused for other matches
    literals = literals[:]
    try:
        for index, group in groups:
            literals[index] = value = getgroup(group)
            if value is None:
                raise error("unmatched group")
    except IndexError:
        raise error("invalid group reference")
    return sep.join(literals)
|