mirror of
https://github.com/python/cpython.git
synced 2024-11-25 19:03:49 +08:00
1054 lines
25 KiB
C
1054 lines
25 KiB
C
/***********************************************************
|
|
Copyright 1991-1995 by Stichting Mathematisch Centrum, Amsterdam,
|
|
The Netherlands.
|
|
|
|
All Rights Reserved
|
|
|
|
Permission to use, copy, modify, and distribute this software and its
|
|
documentation for any purpose and without fee is hereby granted,
|
|
provided that the above copyright notice appear in all copies and that
|
|
both that copyright notice and this permission notice appear in
|
|
supporting documentation, and that the names of Stichting Mathematisch
|
|
Centrum or CWI or Corporation for National Research Initiatives or
|
|
CNRI not be used in advertising or publicity pertaining to
|
|
distribution of the software without specific, written prior
|
|
permission.
|
|
|
|
While CWI is the initial source for this software, a modified version
|
|
is made available by the Corporation for National Research Initiatives
|
|
(CNRI) at the Internet address ftp://ftp.python.org.
|
|
|
|
STICHTING MATHEMATISCH CENTRUM AND CNRI DISCLAIM ALL WARRANTIES WITH
|
|
REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF
|
|
MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL STICHTING MATHEMATISCH
|
|
CENTRUM OR CNRI BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL
|
|
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
|
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
|
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
******************************************************************/
|
|
|
|
/* $Id$ */
|
|
|
|
/* Regular expression objects */
|
|
/* This uses Tatu Ylonen's copyleft-free reimplementation of
|
|
GNU regular expressions */
|
|
|
|
#include "Python.h"
|
|
|
|
#include <ctype.h>
|
|
|
|
#include "regexpr.h"
|
|
|
|
static PyObject *ReopError; /* Exception */
|
|
|
|
/* Pattern flags.  Each flag is a distinct bit; in this file only
   IGNORECASE is tested (by reop_match and reop_search). */
#define IGNORECASE              (1 << 0)
#define MULTILINE               (1 << 1)
#define DOTALL                  (1 << 2)
#define VERBOSE                 (1 << 3)

/* Parsing contexts accepted as the optional third argument of
   expand_escape(). */
#define NORMAL                  0
#define CHARCLASS               1
#define REPLACEMENT             2

/* Type codes: the first element of the tuple that expand_escape()
   returns. */
#define CHAR                    0
#define MEMORY_REFERENCE        1
#define SYNTAX                  2
#define NOT_SYNTAX              3
#define SET                     4
#define WORD_BOUNDARY           5
#define NOT_WORD_BOUNDARY       6
#define BEGINNING_OF_BUFFER     7
#define END_OF_BUFFER           8
|
|
|
|
static PyObject *
|
|
makeresult(regs, num_regs)
|
|
struct re_registers *regs;
|
|
int num_regs;
|
|
{
|
|
PyObject *v;
|
|
int i;
|
|
static PyObject *filler = NULL;
|
|
|
|
if (filler == NULL) {
|
|
filler = Py_BuildValue("(ii)", -1, -1);
|
|
if (filler == NULL)
|
|
return NULL;
|
|
}
|
|
v = PyTuple_New(num_regs);
|
|
if (v == NULL)
|
|
return NULL;
|
|
|
|
for (i = 0; i < num_regs; i++) {
|
|
int lo = regs->start[i];
|
|
int hi = regs->end[i];
|
|
PyObject *w;
|
|
if (lo == -1 && hi == -1) {
|
|
w = filler;
|
|
Py_INCREF(w);
|
|
}
|
|
else
|
|
w = Py_BuildValue("(ii)", lo, hi);
|
|
if (w == NULL || PyTuple_SetItem(v, i, w) < 0) {
|
|
Py_DECREF(v);
|
|
return NULL;
|
|
}
|
|
}
|
|
return v;
|
|
}
|
|
|
|
static PyObject *
|
|
reop_match(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
unsigned char *string;
|
|
int fastmaplen, stringlen;
|
|
int can_be_null, anchor, i;
|
|
int flags, pos, result;
|
|
struct re_pattern_buffer bufp;
|
|
struct re_registers re_regs;
|
|
PyObject *modules = NULL;
|
|
PyObject *reopmodule = NULL;
|
|
PyObject *reopdict = NULL;
|
|
PyObject *casefold = NULL;
|
|
|
|
if (!PyArg_Parse(args, "(s#iiis#is#i)",
|
|
&(bufp.buffer), &(bufp.allocated),
|
|
&(bufp.num_registers), &flags, &can_be_null,
|
|
&(bufp.fastmap), &fastmaplen,
|
|
&anchor,
|
|
&string, &stringlen,
|
|
&pos))
|
|
return NULL;
|
|
|
|
/* XXX sanity-check the input data */
|
|
bufp.used=bufp.allocated;
|
|
if (flags & IGNORECASE)
|
|
{
|
|
if ((modules = PyImport_GetModuleDict()) == NULL)
|
|
return NULL;
|
|
|
|
if ((reopmodule = PyDict_GetItemString(modules,
|
|
"reop")) == NULL)
|
|
return NULL;
|
|
|
|
if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
|
|
return NULL;
|
|
|
|
if ((casefold = PyDict_GetItemString(reopdict,
|
|
"casefold")) == NULL)
|
|
return NULL;
|
|
|
|
bufp.translate = (unsigned char*)PyString_AsString(casefold);
|
|
}
|
|
else
|
|
bufp.translate=NULL;
|
|
bufp.fastmap_accurate=1;
|
|
bufp.can_be_null=can_be_null;
|
|
bufp.uses_registers=1;
|
|
bufp.anchor=anchor;
|
|
|
|
for(i=0; i<bufp.num_registers; i++) {
|
|
re_regs.start[i]=-1;
|
|
re_regs.end[i]=-1;
|
|
}
|
|
|
|
result = re_match(&bufp,
|
|
string, stringlen, pos,
|
|
&re_regs);
|
|
|
|
if (result < -1) {
|
|
/* Failure like stack overflow */
|
|
if (!PyErr_Occurred())
|
|
PyErr_SetString(ReopError, "match failure");
|
|
return NULL;
|
|
}
|
|
if (result == -1) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
return makeresult(&re_regs, bufp.num_registers);
|
|
}
|
|
|
|
#if 0
/* Disabled.  Takes a one-element tuple holding compiled pattern bytes
   and returns a fresh string object containing a copy of them.  A
   pattern buffer is pointed at the copy, but nothing is done with it
   yet. */
static PyObject *
reop_optimize(self, args)
    PyObject *self;
    PyObject *args;
{
    struct re_pattern_buffer bufp;
    unsigned char *code;
    int codelen;
    PyObject *copy;

    if (!PyArg_Parse(args, "(s#)", &code, &codelen))
        return NULL;

    /* The copy is where optimized code would be written. */
    copy = PyString_FromStringAndSize(code, codelen);
    if (copy == NULL)
        return NULL;

    bufp.buffer = PyString_AsString(copy);
    bufp.allocated = codelen;
    bufp.used = codelen;

    return copy;
}
#endif
|
|
|
|
static PyObject *
|
|
reop_search(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
unsigned char *string;
|
|
int fastmaplen, stringlen;
|
|
int can_be_null, anchor, i;
|
|
int flags, pos, result;
|
|
struct re_pattern_buffer bufp;
|
|
struct re_registers re_regs;
|
|
PyObject *modules = NULL;
|
|
PyObject *reopmodule = NULL;
|
|
PyObject *reopdict = NULL;
|
|
PyObject *casefold = NULL;
|
|
|
|
if (!PyArg_Parse(args, "(s#iiis#is#i)",
|
|
&(bufp.buffer), &(bufp.allocated),
|
|
&(bufp.num_registers), &flags, &can_be_null,
|
|
&(bufp.fastmap), &fastmaplen,
|
|
&anchor,
|
|
&string, &stringlen,
|
|
&pos))
|
|
return NULL;
|
|
|
|
/* XXX sanity-check the input data */
|
|
bufp.used=bufp.allocated;
|
|
if (flags & IGNORECASE)
|
|
{
|
|
if ((modules = PyImport_GetModuleDict()) == NULL)
|
|
return NULL;
|
|
|
|
if ((reopmodule = PyDict_GetItemString(modules,
|
|
"reop")) == NULL)
|
|
return NULL;
|
|
|
|
if ((reopdict = PyModule_GetDict(reopmodule)) == NULL)
|
|
return NULL;
|
|
|
|
if ((casefold = PyDict_GetItemString(reopdict,
|
|
"casefold")) == NULL)
|
|
return NULL;
|
|
|
|
bufp.translate = (unsigned char *)PyString_AsString(casefold);
|
|
}
|
|
else
|
|
bufp.translate=NULL;
|
|
bufp.fastmap_accurate=1;
|
|
bufp.can_be_null=can_be_null;
|
|
bufp.uses_registers=1;
|
|
bufp.anchor=anchor;
|
|
|
|
for(i = 0; i < bufp.num_registers; i++) {
|
|
re_regs.start[i] = -1;
|
|
re_regs.end[i] = -1;
|
|
}
|
|
|
|
result = re_search(&bufp,
|
|
string, stringlen, pos, stringlen-pos,
|
|
&re_regs);
|
|
|
|
if (result < -1) {
|
|
/* Failure like stack overflow */
|
|
if (!PyErr_Occurred())
|
|
PyErr_SetString(ReopError, "match failure");
|
|
return NULL;
|
|
}
|
|
|
|
if (result == -1) {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
return makeresult(&re_regs, bufp.num_registers);
|
|
}
|
|
|
|
static PyObject *
|
|
reop_expand_escape(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
unsigned char c, *pattern;
|
|
int index, context=NORMAL, pattern_len;
|
|
|
|
if (!PyArg_ParseTuple(args, "s#i|i", &pattern, &pattern_len, &index,
|
|
&context))
|
|
return NULL;
|
|
if (pattern_len<=index)
|
|
{
|
|
PyErr_SetString(ReopError, "escape ends too soon");
|
|
return NULL;
|
|
}
|
|
c=pattern[index]; index++;
|
|
switch (c)
|
|
{
|
|
case('t'):
|
|
return Py_BuildValue("ici", CHAR, (char)9, index);
|
|
break;
|
|
case('n'):
|
|
return Py_BuildValue("ici", CHAR, (char)10, index);
|
|
break;
|
|
case('v'):
|
|
return Py_BuildValue("ici", CHAR, (char)11, index);
|
|
break;
|
|
case('r'):
|
|
return Py_BuildValue("ici", CHAR, (char)13, index);
|
|
break;
|
|
case('f'):
|
|
return Py_BuildValue("ici", CHAR, (char)12, index);
|
|
break;
|
|
case('a'):
|
|
return Py_BuildValue("ici", CHAR, (char)7, index);
|
|
break;
|
|
case('x'):
|
|
{
|
|
int end, length;
|
|
unsigned char *string;
|
|
PyObject *v, *result;
|
|
|
|
end=index;
|
|
while (end<pattern_len &&
|
|
( re_syntax_table[ pattern[end] ] & Shexdigit ) )
|
|
end++;
|
|
if (end==index)
|
|
{
|
|
PyErr_SetString(ReopError, "\\x must be followed by hex digits");
|
|
return NULL;
|
|
}
|
|
length=end-index;
|
|
string=malloc(length+4+1);
|
|
if (string==NULL)
|
|
{
|
|
PyErr_SetString(PyExc_MemoryError, "can't allocate memory for \\x string");
|
|
return NULL;
|
|
}
|
|
/* Create a string containing "\x<hexdigits>", which will be
|
|
passed to eval() */
|
|
string[0]=string[length+3]='"';
|
|
string[1]='\\';
|
|
string[length+4]='\0';
|
|
memcpy(string+2, pattern+index-1, length+1);
|
|
v=PyRun_String((char *)string, Py_eval_input,
|
|
PyEval_GetGlobals(), PyEval_GetLocals());
|
|
free(string);
|
|
/* The evaluation raised an exception */
|
|
if (v==NULL) return NULL;
|
|
result=Py_BuildValue("iOi", CHAR, v, end);
|
|
Py_DECREF(v);
|
|
return result;
|
|
}
|
|
break;
|
|
|
|
case('b'):
|
|
if (context!=NORMAL)
|
|
return Py_BuildValue("ici", CHAR, (char)8, index);
|
|
else
|
|
{
|
|
unsigned char empty_string[1];
|
|
empty_string[0]='\0';
|
|
return Py_BuildValue("isi", WORD_BOUNDARY, empty_string, index);
|
|
}
|
|
break;
|
|
case('B'):
|
|
if (context!=NORMAL)
|
|
return Py_BuildValue("ici", CHAR, 'B', index);
|
|
else
|
|
{
|
|
unsigned char empty_string[1];
|
|
empty_string[0]='\0';
|
|
return Py_BuildValue("isi", NOT_WORD_BOUNDARY, empty_string, index);
|
|
}
|
|
break;
|
|
case('A'):
|
|
if (context!=NORMAL)
|
|
return Py_BuildValue("ici", CHAR, 'A', index);
|
|
else
|
|
{
|
|
unsigned char empty_string[1];
|
|
empty_string[0]='\0';
|
|
return Py_BuildValue("isi", BEGINNING_OF_BUFFER, empty_string, index);
|
|
}
|
|
break;
|
|
case('Z'):
|
|
if (context!=NORMAL)
|
|
return Py_BuildValue("ici", CHAR, 'Z', index);
|
|
else
|
|
{
|
|
unsigned char empty_string[1];
|
|
empty_string[0]='\0';
|
|
return Py_BuildValue("isi", END_OF_BUFFER, empty_string, index);
|
|
}
|
|
break;
|
|
case('E'): case('G'): case('L'): case('Q'):
|
|
case('U'): case('l'): case('u'):
|
|
{
|
|
char message[50];
|
|
sprintf(message, "\\%c is not allowed", c);
|
|
PyErr_SetString(ReopError, message);
|
|
return NULL;
|
|
}
|
|
|
|
case ('w'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", SYNTAX, Sword, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 'w', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if (re_syntax_table[i] & Sword)
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
case ('W'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", NOT_SYNTAX, Sword, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 'W', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if (! (re_syntax_table[i] & Sword))
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
case ('s'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", SYNTAX, Swhitespace, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 's', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if (re_syntax_table[i] & Swhitespace)
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
case ('S'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", NOT_SYNTAX, Swhitespace, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 'S', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if (! (re_syntax_table[i] & Swhitespace) )
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
|
|
case ('d'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", SYNTAX, Sdigit, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 'd', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if (re_syntax_table[i] & Sdigit)
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
case ('D'):
|
|
if (context==NORMAL)
|
|
return Py_BuildValue("iii", NOT_SYNTAX, Sdigit, index);
|
|
if (context!=CHARCLASS)
|
|
return Py_BuildValue("ici", CHAR, 'D', index);
|
|
{
|
|
/* context==CHARCLASS */
|
|
unsigned char set[256];
|
|
int i, j;
|
|
for(i=j=0; i<256; i++)
|
|
if ( !(re_syntax_table[i] & Sdigit) )
|
|
{
|
|
set[j++] = i;
|
|
}
|
|
return Py_BuildValue("is#i", SET, set, j, index);
|
|
}
|
|
break;
|
|
|
|
case('g'):
|
|
{
|
|
int end, valid, i;
|
|
if (context!=REPLACEMENT)
|
|
return Py_BuildValue("ici", CHAR, 'g', index);
|
|
if (pattern_len<=index)
|
|
{
|
|
PyErr_SetString(ReopError, "unfinished symbolic reference");
|
|
return NULL;
|
|
}
|
|
if (pattern[index]!='<')
|
|
{
|
|
PyErr_SetString(ReopError, "missing < in symbolic reference");
|
|
return NULL;
|
|
}
|
|
index++;
|
|
end=index;
|
|
while (end<pattern_len && pattern[end]!='>')
|
|
end++;
|
|
if (end==pattern_len)
|
|
{
|
|
PyErr_SetString(ReopError, "unfinished symbolic reference");
|
|
return NULL;
|
|
}
|
|
valid=1;
|
|
if (index==end /* Zero-length name */
|
|
|| !(re_syntax_table[pattern[index]] & Sword) /* First char. not alphanumeric */
|
|
|| (re_syntax_table[pattern[index]] & Sdigit) ) /* First char. a digit */
|
|
valid=0;
|
|
|
|
for(i=index+1; i<end; i++)
|
|
{
|
|
if (!(re_syntax_table[pattern[i]] & Sword) )
|
|
valid=0;
|
|
}
|
|
if (!valid)
|
|
{
|
|
/* XXX should include the text of the reference */
|
|
PyErr_SetString(ReopError, "illegal symbolic reference");
|
|
return NULL;
|
|
}
|
|
|
|
return Py_BuildValue("is#i", MEMORY_REFERENCE,
|
|
pattern+index, end-index,
|
|
end+1);
|
|
}
|
|
break;
|
|
|
|
case('0'):
|
|
{
|
|
/* \0 always indicates an octal escape, so we consume up to 3
|
|
characters, as long as they're all octal digits */
|
|
int octval=0, i;
|
|
index--;
|
|
for(i=index;
|
|
i<=index+2 && i<pattern_len
|
|
&& (re_syntax_table[ pattern[i] ] & Soctaldigit );
|
|
i++)
|
|
{
|
|
octval = octval * 8 + pattern[i] - '0';
|
|
}
|
|
if (octval>255)
|
|
{
|
|
PyErr_SetString(ReopError, "octal value out of range");
|
|
return NULL;
|
|
}
|
|
return Py_BuildValue("ici", CHAR, (unsigned char)octval, i);
|
|
}
|
|
break;
|
|
case('1'): case('2'): case('3'): case('4'):
|
|
case('5'): case('6'): case('7'): case('8'):
|
|
case('9'):
|
|
{
|
|
/* Handle \?, where ? is from 1 through 9 */
|
|
int value=0;
|
|
index--;
|
|
/* If it's at least a two-digit reference, like \34, it might
|
|
either be a 3-digit octal escape (\123) or a 2-digit
|
|
decimal memory reference (\34) */
|
|
|
|
if ( (index+1) <pattern_len &&
|
|
(re_syntax_table[ pattern[index+1] ] & Sdigit) )
|
|
{
|
|
if ( (index+2) <pattern_len &&
|
|
(re_syntax_table[ pattern[index+2] ] & Soctaldigit) &&
|
|
(re_syntax_table[ pattern[index+1] ] & Soctaldigit) &&
|
|
(re_syntax_table[ pattern[index ] ] & Soctaldigit)
|
|
)
|
|
{
|
|
/* 3 octal digits */
|
|
value= 8*8*(pattern[index ]-'0') +
|
|
8*(pattern[index+1]-'0') +
|
|
(pattern[index+2]-'0');
|
|
if (value>255)
|
|
{
|
|
PyErr_SetString(ReopError, "octal value out of range");
|
|
return NULL;
|
|
}
|
|
return Py_BuildValue("ici", CHAR, (unsigned char)value, index+3);
|
|
}
|
|
else
|
|
{
|
|
/* 2-digit form, so it's a memory reference */
|
|
if (context==CHARCLASS)
|
|
{
|
|
PyErr_SetString(ReopError, "cannot reference a register "
|
|
"from inside a character class");
|
|
return NULL;
|
|
}
|
|
value= 10*(pattern[index ]-'0') +
|
|
(pattern[index+1]-'0');
|
|
if (value<1 || RE_NREGS<=value)
|
|
{
|
|
PyErr_SetString(ReopError, "memory reference out of range");
|
|
return NULL;
|
|
}
|
|
return Py_BuildValue("iii", MEMORY_REFERENCE,
|
|
value, index+2);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Single-digit form, like \2, so it's a memory reference */
|
|
if (context==CHARCLASS)
|
|
{
|
|
PyErr_SetString(ReopError, "cannot reference a register "
|
|
"from inside a character class");
|
|
return NULL;
|
|
}
|
|
return Py_BuildValue("iii", MEMORY_REFERENCE,
|
|
pattern[index]-'0', index+1);
|
|
}
|
|
}
|
|
break;
|
|
|
|
default:
|
|
return Py_BuildValue("ici", CHAR, c, index);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static PyObject *
|
|
reop__expand(self, args)
|
|
PyObject *self;
|
|
PyObject *args;
|
|
{
|
|
PyObject *results, *match_obj;
|
|
PyObject *repl_obj, *newstring;
|
|
unsigned char *repl;
|
|
int size, total_len, i, start, pos;
|
|
|
|
if (!PyArg_ParseTuple(args, "OS", &match_obj, &repl_obj))
|
|
return NULL;
|
|
|
|
repl=(unsigned char *)PyString_AsString(repl_obj);
|
|
size=PyString_Size(repl_obj);
|
|
results=PyList_New(0);
|
|
if (results==NULL) return NULL;
|
|
for(start=total_len=i=0; i<size; i++)
|
|
{
|
|
if (repl[i]=='\\')
|
|
{
|
|
PyObject *args, *t, *value;
|
|
int escape_type;
|
|
|
|
if (start!=i)
|
|
{
|
|
PyList_Append(results,
|
|
PyString_FromStringAndSize((char *)repl+start, i-start));
|
|
total_len += i-start;
|
|
}
|
|
i++;
|
|
args=Py_BuildValue("Oii", repl_obj, i, REPLACEMENT);
|
|
t=reop_expand_escape(NULL, args);
|
|
Py_DECREF(args);
|
|
if (t==NULL)
|
|
{
|
|
/* reop_expand_escape triggered an exception of some sort,
|
|
so just return */
|
|
Py_DECREF(results);
|
|
return NULL;
|
|
}
|
|
value=PyTuple_GetItem(t, 1);
|
|
escape_type=PyInt_AsLong(PyTuple_GetItem(t, 0));
|
|
switch (escape_type)
|
|
{
|
|
case (CHAR):
|
|
PyList_Append(results, value);
|
|
total_len += PyString_Size(value);
|
|
break;
|
|
case(MEMORY_REFERENCE):
|
|
{
|
|
PyObject *r, *tuple, *result;
|
|
r=PyObject_GetAttrString(match_obj, "group");
|
|
tuple=PyTuple_New(1);
|
|
Py_INCREF(value);
|
|
PyTuple_SetItem(tuple, 0, value);
|
|
result=PyEval_CallObject(r, tuple);
|
|
Py_DECREF(r); Py_DECREF(tuple);
|
|
if (result==NULL)
|
|
{
|
|
/* The group() method trigged an exception of some sort */
|
|
Py_DECREF(results);
|
|
return NULL;
|
|
}
|
|
if (result==Py_None)
|
|
{
|
|
char message[50];
|
|
sprintf(message,
|
|
"group %li did not contribute to the match",
|
|
PyInt_AsLong(value));
|
|
PyErr_SetString(ReopError,
|
|
message);
|
|
Py_DECREF(result);
|
|
Py_DECREF(t);
|
|
Py_DECREF(results);
|
|
return NULL;
|
|
}
|
|
/* xxx typecheck that it's a string! */
|
|
PyList_Append(results, result);
|
|
total_len += PyString_Size(result);
|
|
Py_DECREF(result);
|
|
}
|
|
break;
|
|
default:
|
|
Py_DECREF(t);
|
|
Py_DECREF(results);
|
|
PyErr_SetString(ReopError,
|
|
"bad escape in replacement");
|
|
return NULL;
|
|
}
|
|
i=start=PyInt_AsLong(PyTuple_GetItem(t, 2));
|
|
i--; /* Decrement now, because the 'for' loop will increment it */
|
|
Py_DECREF(t);
|
|
}
|
|
} /* endif repl[i]!='\\' */
|
|
|
|
if (start!=i)
|
|
{
|
|
PyList_Append(results, PyString_FromStringAndSize((char *)repl+start, i-start));
|
|
total_len += i-start;
|
|
}
|
|
|
|
/* Whew! Now we've constructed a list containing various pieces of
|
|
strings that will make up our final result. So, iterate over
|
|
the list concatenating them. A new string measuring total_len
|
|
bytes is allocated and filled in. */
|
|
|
|
newstring=PyString_FromStringAndSize(NULL, total_len);
|
|
if (newstring==NULL)
|
|
{
|
|
Py_DECREF(results);
|
|
return NULL;
|
|
}
|
|
|
|
repl=(unsigned char *)PyString_AsString(newstring);
|
|
for (pos=i=0; i<PyList_Size(results); i++)
|
|
{
|
|
PyObject *item=PyList_GetItem(results, i);
|
|
memcpy(repl+pos, PyString_AsString(item), PyString_Size(item) );
|
|
pos += PyString_Size(item);
|
|
}
|
|
Py_DECREF(results);
|
|
return newstring;
|
|
}
|
|
|
|
|
|
#if 0
|
|
/* Functions originally in the regsub module.
|
|
Added June 1, 1997.
|
|
*/
|
|
|
|
/* A cache of previously used patterns is maintained. Notice that if
|
|
you change the reop syntax flag, entries in the cache are
|
|
invalidated.
|
|
XXX Solution: use (syntax flag, pattern) as keys? Clear the cache
|
|
every so often, or once it gets past a certain size?
|
|
*/
|
|
|
|
static PyObject *cache_dict=NULL;
|
|
|
|
/* Accept an object; if it's a reop pattern, Py_INCREF it and return
|
|
it. If it's a string, a reop object is compiled and cached.
|
|
*/
|
|
|
|
static reopobject *
|
|
cached_compile(pattern)
|
|
PyObject *pattern;
|
|
{
|
|
reopobject *p2;
|
|
|
|
if (!PyString_Check(pattern))
|
|
{
|
|
/* It's not a string, so assume it's a compiled reop object */
|
|
/* XXX check that! */
|
|
Py_INCREF(pattern);
|
|
return (reopobject*)pattern;
|
|
}
|
|
if (cache_dict==NULL)
|
|
{
|
|
cache_dict=PyDict_New();
|
|
if (cache_dict==NULL)
|
|
{
|
|
return (reopobject*)NULL;
|
|
}
|
|
}
|
|
|
|
/* See if the pattern has already been cached; if so, return that
|
|
reop object */
|
|
p2=(reopobject*)PyDict_GetItem(cache_dict, pattern);
|
|
if (p2)
|
|
{
|
|
Py_INCREF(p2);
|
|
return (reopobject*)p2;
|
|
}
|
|
|
|
/* Compile the pattern and cache it */
|
|
p2=(reopobject*)newreopobject(pattern, NULL, pattern, NULL);
|
|
if (!p2) return p2;
|
|
PyDict_SetItem(cache_dict, pattern, (PyObject*)p2);
|
|
return p2;
|
|
}
|
|
|
|
|
|
static PyObject *
|
|
internal_split(args, retain)
|
|
PyObject *args;
|
|
int retain;
|
|
{
|
|
PyObject *newlist, *s;
|
|
reopobject *pattern;
|
|
int maxsplit=0, count=0, length, next=0, result;
|
|
int match_end=0; /* match_start is defined below */
|
|
unsigned char *start;
|
|
|
|
if (!PyArg_ParseTuple(args, "s#Oi", &start, &length, &pattern,
|
|
&maxsplit))
|
|
{
|
|
PyErr_Clear();
|
|
if (!PyArg_ParseTuple(args, "s#O", &start, &length, &pattern))
|
|
return NULL;
|
|
}
|
|
pattern=cached_compile((PyObject *)pattern);
|
|
if (!pattern) return NULL;
|
|
|
|
newlist=PyList_New(0);
|
|
if (!newlist) return NULL;
|
|
|
|
do
|
|
{
|
|
result = re_search(&pattern->re_patbuf,
|
|
start, length, next, length-next,
|
|
&pattern->re_regs);
|
|
if (result < -1)
|
|
{ /* Erk... an error happened during the reop search */
|
|
Py_DECREF(newlist);
|
|
PyErr_SetString(ReopError, "match failure");
|
|
return NULL;
|
|
}
|
|
if (next<=result)
|
|
{
|
|
int match_start=pattern->re_regs.start[0];
|
|
int oldmatch_end=match_end;
|
|
match_end=pattern->re_regs.end[0];
|
|
|
|
if (match_start==match_end)
|
|
{ /* A zero-length match; increment to the next position */
|
|
next=result+1;
|
|
match_end=oldmatch_end;
|
|
continue;
|
|
}
|
|
|
|
/* Append the string up to the start of the match */
|
|
s=PyString_FromStringAndSize(start+oldmatch_end, match_start-oldmatch_end);
|
|
if (!s)
|
|
{
|
|
Py_DECREF(newlist);
|
|
return NULL;
|
|
}
|
|
PyList_Append(newlist, s);
|
|
Py_DECREF(s);
|
|
|
|
if (retain)
|
|
{
|
|
/* Append a string containing whatever matched */
|
|
s=PyString_FromStringAndSize(start+match_start, match_end-match_start);
|
|
if (!s)
|
|
{
|
|
Py_DECREF(newlist);
|
|
return NULL;
|
|
}
|
|
PyList_Append(newlist, s);
|
|
Py_DECREF(s);
|
|
}
|
|
/* Update the pointer, and increment the count of splits */
|
|
next=match_end; count++;
|
|
}
|
|
} while (result!=-1 && !(maxsplit && maxsplit==count) &&
|
|
next<length);
|
|
s=PyString_FromStringAndSize(start+match_end, length-match_end);
|
|
if (!s)
|
|
{
|
|
Py_DECREF(newlist);
|
|
return NULL;
|
|
}
|
|
PyList_Append(newlist, s);
|
|
Py_DECREF(s);
|
|
Py_DECREF(pattern);
|
|
return newlist;
|
|
}
|
|
|
|
/* split(): delegate to internal_split() with retain off, so the
   matched delimiters are not included in the result. */
static PyObject *
reop_split(self, args)
    PyObject *self;
    PyObject *args;
{
    int retain_delimiters = 0;

    return internal_split(args, retain_delimiters);
}
|
|
|
|
/* splitx(): delegate to internal_split() with retain on, so the
   matched delimiters are included in the result. */
static PyObject *
reop_splitx(self, args)
    PyObject *self;
    PyObject *args;
{
    int retain_delimiters = 1;

    return internal_split(args, retain_delimiters);
}
|
|
#endif
|
|
|
|
static struct PyMethodDef reop_global_methods[] = {
|
|
{"match", reop_match, 0},
|
|
{"search", reop_search, 0},
|
|
{"expand_escape", reop_expand_escape, 1},
|
|
{"_expand", reop__expand, 1},
|
|
#if 0
|
|
{"_optimize", reop_optimize, 0},
|
|
{"split", reop_split, 0},
|
|
{"splitx", reop_splitx, 0},
|
|
#endif
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
void
|
|
initreop()
|
|
{
|
|
PyObject *m, *d, *k, *v, *o;
|
|
int i;
|
|
unsigned char *s;
|
|
unsigned char j[2];
|
|
|
|
re_compile_initialize();
|
|
|
|
m = Py_InitModule("reop", reop_global_methods);
|
|
d = PyModule_GetDict(m);
|
|
|
|
/* Initialize reop.error exception */
|
|
v = ReopError = PyString_FromString("reop.error");
|
|
if (v == NULL || PyDict_SetItemString(d, "error", v) != 0)
|
|
goto finally;
|
|
|
|
/* Initialize reop.casefold constant */
|
|
if (!(v = PyString_FromStringAndSize((char *)NULL, 256)))
|
|
goto finally;
|
|
|
|
if (!(s = (unsigned char *)PyString_AsString(v)))
|
|
goto finally;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
if (isupper(i))
|
|
s[i] = tolower(i);
|
|
else
|
|
s[i] = i;
|
|
}
|
|
|
|
if (PyDict_SetItemString(d, "casefold", v) < 0)
|
|
goto finally;
|
|
Py_DECREF(v);
|
|
|
|
/* Initialize the syntax table */
|
|
|
|
o = PyDict_New();
|
|
if (o == NULL)
|
|
goto finally;
|
|
|
|
j[1] = '\0';
|
|
for (i = 0; i < 256; i++)
|
|
{
|
|
j[0] = i;
|
|
k = PyString_FromStringAndSize((char *)j, 1);
|
|
if (k == NULL)
|
|
goto finally;
|
|
v = PyInt_FromLong(re_syntax_table[i]);
|
|
if (v == NULL)
|
|
goto finally;
|
|
if (PyDict_SetItem(o, k, v) < 0)
|
|
goto finally;
|
|
Py_DECREF(k);
|
|
Py_DECREF(v);
|
|
}
|
|
|
|
if (PyDict_SetItemString(d, "syntax_table", o) < 0)
|
|
goto finally;
|
|
Py_DECREF(o);
|
|
|
|
v = PyInt_FromLong(Sword);
|
|
if (v == NULL)
|
|
goto finally;
|
|
|
|
if (PyDict_SetItemString(d, "word", v) < 0)
|
|
goto finally;
|
|
Py_DECREF(v);
|
|
|
|
v = PyInt_FromLong(Swhitespace);
|
|
if (v == NULL)
|
|
goto finally;
|
|
|
|
if (PyDict_SetItemString(d, "whitespace", v) < 0)
|
|
goto finally;
|
|
Py_DECREF(v);
|
|
|
|
v = PyInt_FromLong(Sdigit);
|
|
if (v == NULL)
|
|
goto finally;
|
|
|
|
if (PyDict_SetItemString(d, "digit", v) < 0)
|
|
goto finally;
|
|
Py_DECREF(v);
|
|
|
|
PyDict_SetItemString(d, "NORMAL", PyInt_FromLong(NORMAL));
|
|
PyDict_SetItemString(d, "CHARCLASS", PyInt_FromLong(CHARCLASS));
|
|
PyDict_SetItemString(d, "REPLACEMENT", PyInt_FromLong(REPLACEMENT));
|
|
|
|
PyDict_SetItemString(d, "CHAR", PyInt_FromLong(CHAR));
|
|
PyDict_SetItemString(d, "MEMORY_REFERENCE", PyInt_FromLong(MEMORY_REFERENCE));
|
|
PyDict_SetItemString(d, "SYNTAX", PyInt_FromLong(SYNTAX));
|
|
PyDict_SetItemString(d, "NOT_SYNTAX", PyInt_FromLong(NOT_SYNTAX));
|
|
PyDict_SetItemString(d, "SET", PyInt_FromLong(SET));
|
|
PyDict_SetItemString(d, "WORD_BOUNDARY", PyInt_FromLong(WORD_BOUNDARY));
|
|
PyDict_SetItemString(d, "NOT_WORD_BOUNDARY", PyInt_FromLong(NOT_WORD_BOUNDARY));
|
|
PyDict_SetItemString(d, "BEGINNING_OF_BUFFER", PyInt_FromLong(BEGINNING_OF_BUFFER));
|
|
PyDict_SetItemString(d, "END_OF_BUFFER", PyInt_FromLong(END_OF_BUFFER));
|
|
|
|
if (!PyErr_Occurred())
|
|
return;
|
|
|
|
finally:
|
|
Py_FatalError("can't initialize reop module");
|
|
}
|
|
|