mirror of
https://github.com/python/cpython.git
synced 2024-11-23 09:54:58 +08:00
bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)
Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it for a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatibles with the current parser. This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN) that can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so is always kept in sync and avoids having to maintain duplicate token definitions.
This commit is contained in:
parent
7eebbbd5b3
commit
1f24a719e7
2
.gitignore
vendored
2
.gitignore
vendored
@ -73,8 +73,6 @@ PCbuild/arm32/
|
||||
PCbuild/obj/
|
||||
PCbuild/win32/
|
||||
.purify
|
||||
Parser/pgen
|
||||
Parser/pgen.exe
|
||||
__pycache__
|
||||
autom4te.cache
|
||||
build/
|
||||
|
@ -1,18 +0,0 @@
|
||||
#ifndef Py_METAGRAMMAR_H
|
||||
#define Py_METAGRAMMAR_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#define MSTART 256
|
||||
#define RULE 257
|
||||
#define RHS 258
|
||||
#define ALT 259
|
||||
#define ITEM 260
|
||||
#define ATOM 261
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* !Py_METAGRAMMAR_H */
|
@ -12,10 +12,7 @@ extern "C" {
|
||||
|
||||
typedef struct {
|
||||
int error;
|
||||
#ifndef PGEN
|
||||
/* The filename is useless for pgen, see comment in tok_state structure */
|
||||
PyObject *filename;
|
||||
#endif
|
||||
int lineno;
|
||||
int offset;
|
||||
char *text; /* UTF-8-encoded string */
|
||||
|
@ -1,18 +0,0 @@
|
||||
#ifndef Py_PGEN_H
|
||||
#define Py_PGEN_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/* Parser generator interface */
|
||||
|
||||
extern grammar *meta_grammar(void);
|
||||
|
||||
struct _node;
|
||||
extern grammar *pgen(struct _node *);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif /* !Py_PGEN_H */
|
@ -290,40 +290,21 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@
|
||||
|
||||
##########################################################################
|
||||
# Parser
|
||||
PGEN= Parser/pgen$(EXE)
|
||||
|
||||
POBJS= \
|
||||
Parser/acceler.o \
|
||||
Parser/grammar1.o \
|
||||
Parser/listnode.o \
|
||||
Parser/node.o \
|
||||
Parser/parser.o \
|
||||
Parser/bitset.o \
|
||||
Parser/metagrammar.o \
|
||||
Parser/firstsets.o \
|
||||
Parser/grammar.o \
|
||||
Parser/token.o \
|
||||
Parser/pgen.o
|
||||
|
||||
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
|
||||
|
||||
PGOBJS= \
|
||||
Objects/obmalloc.o \
|
||||
Python/dynamic_annotations.o \
|
||||
Python/mysnprintf.o \
|
||||
Python/pyctype.o \
|
||||
Parser/tokenizer_pgen.o \
|
||||
Parser/printgrammar.o \
|
||||
Parser/parsetok_pgen.o \
|
||||
Parser/pgenmain.o
|
||||
|
||||
PARSER_HEADERS= \
|
||||
$(srcdir)/Parser/parser.h \
|
||||
$(srcdir)/Include/parsetok.h \
|
||||
$(srcdir)/Parser/tokenizer.h
|
||||
|
||||
PGENOBJS= $(POBJS) $(PGOBJS)
|
||||
|
||||
##########################################################################
|
||||
# Python
|
||||
|
||||
@ -802,31 +783,18 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile
|
||||
|
||||
$(IO_OBJS): $(IO_H)
|
||||
|
||||
$(PGEN): $(PGENOBJS)
|
||||
$(CC) $(OPT) $(PY_CORE_LDFLAGS) $(PGENOBJS) $(LIBS) -o $(PGEN)
|
||||
|
||||
.PHONY: regen-grammar
|
||||
regen-grammar: $(PGEN)
|
||||
regen-grammar: regen-token
|
||||
# Regenerate Include/graminit.h and Python/graminit.c
|
||||
# from Grammar/Grammar using pgen
|
||||
@$(MKDIR_P) Include
|
||||
$(PGEN) $(srcdir)/Grammar/Grammar \
|
||||
$(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
|
||||
$(srcdir)/Grammar/Tokens \
|
||||
$(srcdir)/Include/graminit.h.new \
|
||||
$(srcdir)/Python/graminit.c.new
|
||||
$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new
|
||||
$(UPDATE_FILE) $(srcdir)/Python/graminit.c $(srcdir)/Python/graminit.c.new
|
||||
|
||||
Parser/grammar.o: $(srcdir)/Parser/grammar.c \
|
||||
$(srcdir)/Include/token.h \
|
||||
$(srcdir)/Include/grammar.h
|
||||
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
|
||||
|
||||
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
|
||||
Parser/parsetok_pgen.o: $(srcdir)/Parser/parsetok.c
|
||||
Parser/printgrammar.o: $(srcdir)/Parser/printgrammar.c
|
||||
|
||||
Parser/pgenmain.o: $(srcdir)/Include/parsetok.h
|
||||
|
||||
.PHONY=regen-ast
|
||||
regen-ast:
|
||||
# Regenerate Include/Python-ast.h using Parser/asdl_c.py -h
|
||||
@ -1016,7 +984,6 @@ PYTHON_HEADERS= \
|
||||
$(srcdir)/Include/longobject.h \
|
||||
$(srcdir)/Include/marshal.h \
|
||||
$(srcdir)/Include/memoryobject.h \
|
||||
$(srcdir)/Include/metagrammar.h \
|
||||
$(srcdir)/Include/methodobject.h \
|
||||
$(srcdir)/Include/modsupport.h \
|
||||
$(srcdir)/Include/moduleobject.h \
|
||||
@ -1028,7 +995,6 @@ PYTHON_HEADERS= \
|
||||
$(srcdir)/Include/osdefs.h \
|
||||
$(srcdir)/Include/osmodule.h \
|
||||
$(srcdir)/Include/patchlevel.h \
|
||||
$(srcdir)/Include/pgen.h \
|
||||
$(srcdir)/Include/pgenheaders.h \
|
||||
$(srcdir)/Include/pyarena.h \
|
||||
$(srcdir)/Include/pycapsule.h \
|
||||
@ -1771,7 +1737,7 @@ profile-removal:
|
||||
rm -f profile-run-stamp
|
||||
|
||||
clobber: clean profile-removal
|
||||
-rm -f $(BUILDPYTHON) $(PGEN) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \
|
||||
-rm -f $(BUILDPYTHON) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \
|
||||
tags TAGS \
|
||||
config.cache config.log pyconfig.h Modules/config.c
|
||||
-rm -rf build platform
|
||||
|
@ -0,0 +1,2 @@
|
||||
Retire pgen and use a modified version of pgen2 to generate the parser.
|
||||
Patch by Pablo Galindo.
|
@ -92,14 +92,6 @@ wchar_t *Py_DecodeLocale(const char* arg, size_t *size)
|
||||
return w;
|
||||
}
|
||||
|
||||
/* Parser/pgenmain.c */
|
||||
grammar *getgrammar(const char *filename)
|
||||
{
|
||||
grammar *g;
|
||||
__coverity_tainted_data_sink__(filename);
|
||||
return g;
|
||||
}
|
||||
|
||||
/* Python/marshal.c */
|
||||
|
||||
static Py_ssize_t r_string(char *s, Py_ssize_t n, RFILE *p)
|
||||
|
@ -161,7 +161,6 @@
|
||||
<ClInclude Include="..\Include\longobject.h" />
|
||||
<ClInclude Include="..\Include\marshal.h" />
|
||||
<ClInclude Include="..\Include\memoryobject.h" />
|
||||
<ClInclude Include="..\Include\metagrammar.h" />
|
||||
<ClInclude Include="..\Include\methodobject.h" />
|
||||
<ClInclude Include="..\Include\modsupport.h" />
|
||||
<ClInclude Include="..\Include\moduleobject.h" />
|
||||
@ -175,7 +174,6 @@
|
||||
<ClInclude Include="..\Include\osmodule.h" />
|
||||
<ClInclude Include="..\Include\parsetok.h" />
|
||||
<ClInclude Include="..\Include\patchlevel.h" />
|
||||
<ClInclude Include="..\Include\pgen.h" />
|
||||
<ClInclude Include="..\Include\pgenheaders.h" />
|
||||
<ClInclude Include="..\Include\pyhash.h" />
|
||||
<ClInclude Include="..\Include\py_curses.h" />
|
||||
@ -372,12 +370,8 @@
|
||||
<ClCompile Include="..\Objects\unicodeobject.c" />
|
||||
<ClCompile Include="..\Objects\weakrefobject.c" />
|
||||
<ClCompile Include="..\Parser\acceler.c" />
|
||||
<ClCompile Include="..\Parser\bitset.c" />
|
||||
<ClCompile Include="..\Parser\firstsets.c" />
|
||||
<ClCompile Include="..\Parser\grammar.c" />
|
||||
<ClCompile Include="..\Parser\grammar1.c" />
|
||||
<ClCompile Include="..\Parser\listnode.c" />
|
||||
<ClCompile Include="..\Parser\metagrammar.c" />
|
||||
<ClCompile Include="..\Parser\myreadline.c" />
|
||||
<ClCompile Include="..\Parser\node.c" />
|
||||
<ClCompile Include="..\Parser\parser.c" />
|
||||
|
@ -234,9 +234,6 @@
|
||||
<ClInclude Include="..\Include\memoryobject.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\metagrammar.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\methodobject.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
@ -270,9 +267,6 @@
|
||||
<ClInclude Include="..\Include\patchlevel.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\pgen.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\Include\pgenheaders.h">
|
||||
<Filter>Include</Filter>
|
||||
</ClInclude>
|
||||
@ -836,24 +830,12 @@
|
||||
<ClCompile Include="..\Parser\acceler.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\bitset.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\firstsets.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\grammar.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\grammar1.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\listnode.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\metagrammar.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\Parser\myreadline.c">
|
||||
<Filter>Parser</Filter>
|
||||
</ClCompile>
|
||||
|
@ -1,66 +0,0 @@
|
||||
|
||||
/* Bitset primitives used by the parser generator */
|
||||
|
||||
#include "pgenheaders.h"
|
||||
#include "bitset.h"
|
||||
|
||||
bitset
|
||||
newbitset(int nbits)
|
||||
{
|
||||
int nbytes = NBYTES(nbits);
|
||||
bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes);
|
||||
|
||||
if (ss == NULL)
|
||||
Py_FatalError("no mem for bitset");
|
||||
|
||||
ss += nbytes;
|
||||
while (--nbytes >= 0)
|
||||
*--ss = 0;
|
||||
return ss;
|
||||
}
|
||||
|
||||
void
|
||||
delbitset(bitset ss)
|
||||
{
|
||||
PyObject_FREE(ss);
|
||||
}
|
||||
|
||||
int
|
||||
addbit(bitset ss, int ibit)
|
||||
{
|
||||
int ibyte = BIT2BYTE(ibit);
|
||||
BYTE mask = BIT2MASK(ibit);
|
||||
|
||||
if (ss[ibyte] & mask)
|
||||
return 0; /* Bit already set */
|
||||
ss[ibyte] |= mask;
|
||||
return 1;
|
||||
}
|
||||
|
||||
#if 0 /* Now a macro */
|
||||
int
|
||||
testbit(bitset ss, int ibit)
|
||||
{
|
||||
return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
int
|
||||
samebitset(bitset ss1, bitset ss2, int nbits)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = NBYTES(nbits); --i >= 0; )
|
||||
if (*ss1++ != *ss2++)
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
void
|
||||
mergebitset(bitset ss1, bitset ss2, int nbits)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = NBYTES(nbits); --i >= 0; )
|
||||
*ss1++ |= *ss2++;
|
||||
}
|
@ -1,113 +0,0 @@
|
||||
|
||||
/* Computation of FIRST stets */
|
||||
|
||||
#include "pgenheaders.h"
|
||||
#include "grammar.h"
|
||||
#include "token.h"
|
||||
|
||||
extern int Py_DebugFlag;
|
||||
|
||||
/* Forward */
|
||||
static void calcfirstset(grammar *, dfa *);
|
||||
|
||||
void
|
||||
addfirstsets(grammar *g)
|
||||
{
|
||||
int i;
|
||||
dfa *d;
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printf("Adding FIRST sets ...\n");
|
||||
for (i = 0; i < g->g_ndfas; i++) {
|
||||
d = &g->g_dfa[i];
|
||||
if (d->d_first == NULL)
|
||||
calcfirstset(g, d);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
calcfirstset(grammar *g, dfa *d)
|
||||
{
|
||||
int i, j;
|
||||
state *s;
|
||||
arc *a;
|
||||
int nsyms;
|
||||
int *sym;
|
||||
int nbits;
|
||||
static bitset dummy;
|
||||
bitset result;
|
||||
int type;
|
||||
dfa *d1;
|
||||
label *l0;
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printf("Calculate FIRST set for '%s'\n", d->d_name);
|
||||
|
||||
if (dummy == NULL)
|
||||
dummy = newbitset(1);
|
||||
if (d->d_first == dummy) {
|
||||
fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
|
||||
return;
|
||||
}
|
||||
if (d->d_first != NULL) {
|
||||
fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
|
||||
d->d_name);
|
||||
}
|
||||
d->d_first = dummy;
|
||||
|
||||
l0 = g->g_ll.ll_label;
|
||||
nbits = g->g_ll.ll_nlabels;
|
||||
result = newbitset(nbits);
|
||||
|
||||
sym = (int *)PyObject_MALLOC(sizeof(int));
|
||||
if (sym == NULL)
|
||||
Py_FatalError("no mem for new sym in calcfirstset");
|
||||
nsyms = 1;
|
||||
sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);
|
||||
|
||||
s = &d->d_state[d->d_initial];
|
||||
for (i = 0; i < s->s_narcs; i++) {
|
||||
a = &s->s_arc[i];
|
||||
for (j = 0; j < nsyms; j++) {
|
||||
if (sym[j] == a->a_lbl)
|
||||
break;
|
||||
}
|
||||
if (j >= nsyms) { /* New label */
|
||||
sym = (int *)PyObject_REALLOC(sym,
|
||||
sizeof(int) * (nsyms + 1));
|
||||
if (sym == NULL)
|
||||
Py_FatalError(
|
||||
"no mem to resize sym in calcfirstset");
|
||||
sym[nsyms++] = a->a_lbl;
|
||||
type = l0[a->a_lbl].lb_type;
|
||||
if (ISNONTERMINAL(type)) {
|
||||
d1 = PyGrammar_FindDFA(g, type);
|
||||
if (d1->d_first == dummy) {
|
||||
fprintf(stderr,
|
||||
"Left-recursion below '%s'\n",
|
||||
d->d_name);
|
||||
}
|
||||
else {
|
||||
if (d1->d_first == NULL)
|
||||
calcfirstset(g, d1);
|
||||
mergebitset(result,
|
||||
d1->d_first, nbits);
|
||||
}
|
||||
}
|
||||
else if (ISTERMINAL(type)) {
|
||||
addbit(result, a->a_lbl);
|
||||
}
|
||||
}
|
||||
}
|
||||
d->d_first = result;
|
||||
if (Py_DebugFlag) {
|
||||
printf("FIRST set for '%s': {", d->d_name);
|
||||
for (i = 0; i < nbits; i++) {
|
||||
if (testbit(result, i))
|
||||
printf(" %s", PyGrammar_LabelRepr(&l0[i]));
|
||||
}
|
||||
printf(" }\n");
|
||||
}
|
||||
|
||||
PyObject_FREE(sym);
|
||||
}
|
273
Parser/grammar.c
273
Parser/grammar.c
@ -1,273 +0,0 @@
|
||||
|
||||
/* Grammar implementation */
|
||||
|
||||
#include "Python.h"
|
||||
#include "pgenheaders.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include "token.h"
|
||||
#include "grammar.h"
|
||||
|
||||
extern int Py_DebugFlag;
|
||||
|
||||
grammar *
|
||||
newgrammar(int start)
|
||||
{
|
||||
grammar *g;
|
||||
|
||||
g = (grammar *)PyObject_MALLOC(sizeof(grammar));
|
||||
if (g == NULL)
|
||||
Py_FatalError("no mem for new grammar");
|
||||
g->g_ndfas = 0;
|
||||
g->g_dfa = NULL;
|
||||
g->g_start = start;
|
||||
g->g_ll.ll_nlabels = 0;
|
||||
g->g_ll.ll_label = NULL;
|
||||
g->g_accel = 0;
|
||||
return g;
|
||||
}
|
||||
|
||||
void
|
||||
freegrammar(grammar *g)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < g->g_ndfas; i++) {
|
||||
free(g->g_dfa[i].d_name);
|
||||
for (int j = 0; j < g->g_dfa[i].d_nstates; j++)
|
||||
PyObject_FREE(g->g_dfa[i].d_state[j].s_arc);
|
||||
PyObject_FREE(g->g_dfa[i].d_state);
|
||||
}
|
||||
PyObject_FREE(g->g_dfa);
|
||||
for (i = 0; i < g->g_ll.ll_nlabels; i++)
|
||||
free(g->g_ll.ll_label[i].lb_str);
|
||||
PyObject_FREE(g->g_ll.ll_label);
|
||||
PyObject_FREE(g);
|
||||
}
|
||||
|
||||
dfa *
|
||||
adddfa(grammar *g, int type, const char *name)
|
||||
{
|
||||
dfa *d;
|
||||
|
||||
g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa,
|
||||
sizeof(dfa) * (g->g_ndfas + 1));
|
||||
if (g->g_dfa == NULL)
|
||||
Py_FatalError("no mem to resize dfa in adddfa");
|
||||
d = &g->g_dfa[g->g_ndfas++];
|
||||
d->d_type = type;
|
||||
d->d_name = strdup(name);
|
||||
d->d_nstates = 0;
|
||||
d->d_state = NULL;
|
||||
d->d_initial = -1;
|
||||
d->d_first = NULL;
|
||||
return d; /* Only use while fresh! */
|
||||
}
|
||||
|
||||
int
|
||||
addstate(dfa *d)
|
||||
{
|
||||
state *s;
|
||||
|
||||
d->d_state = (state *)PyObject_REALLOC(d->d_state,
|
||||
sizeof(state) * (d->d_nstates + 1));
|
||||
if (d->d_state == NULL)
|
||||
Py_FatalError("no mem to resize state in addstate");
|
||||
s = &d->d_state[d->d_nstates++];
|
||||
s->s_narcs = 0;
|
||||
s->s_arc = NULL;
|
||||
s->s_lower = 0;
|
||||
s->s_upper = 0;
|
||||
s->s_accel = NULL;
|
||||
s->s_accept = 0;
|
||||
return Py_SAFE_DOWNCAST(s - d->d_state, intptr_t, int);
|
||||
}
|
||||
|
||||
void
|
||||
addarc(dfa *d, int from, int to, int lbl)
|
||||
{
|
||||
state *s;
|
||||
arc *a;
|
||||
|
||||
assert(0 <= from && from < d->d_nstates);
|
||||
assert(0 <= to && to < d->d_nstates);
|
||||
|
||||
s = &d->d_state[from];
|
||||
s->s_arc = (arc *)PyObject_REALLOC(s->s_arc, sizeof(arc) * (s->s_narcs + 1));
|
||||
if (s->s_arc == NULL)
|
||||
Py_FatalError("no mem to resize arc list in addarc");
|
||||
a = &s->s_arc[s->s_narcs++];
|
||||
a->a_lbl = lbl;
|
||||
a->a_arrow = to;
|
||||
}
|
||||
|
||||
int
|
||||
addlabel(labellist *ll, int type, const char *str)
|
||||
{
|
||||
int i;
|
||||
label *lb;
|
||||
|
||||
for (i = 0; i < ll->ll_nlabels; i++) {
|
||||
if (ll->ll_label[i].lb_type == type &&
|
||||
strcmp(ll->ll_label[i].lb_str, str) == 0)
|
||||
return i;
|
||||
}
|
||||
ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label,
|
||||
sizeof(label) * (ll->ll_nlabels + 1));
|
||||
if (ll->ll_label == NULL)
|
||||
Py_FatalError("no mem to resize labellist in addlabel");
|
||||
lb = &ll->ll_label[ll->ll_nlabels++];
|
||||
lb->lb_type = type;
|
||||
lb->lb_str = strdup(str);
|
||||
if (Py_DebugFlag)
|
||||
printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels,
|
||||
PyGrammar_LabelRepr(lb));
|
||||
return Py_SAFE_DOWNCAST(lb - ll->ll_label, intptr_t, int);
|
||||
}
|
||||
|
||||
/* Same, but rather dies than adds */
|
||||
|
||||
int
|
||||
findlabel(labellist *ll, int type, const char *str)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ll->ll_nlabels; i++) {
|
||||
if (ll->ll_label[i].lb_type == type /*&&
|
||||
strcmp(ll->ll_label[i].lb_str, str) == 0*/)
|
||||
return i;
|
||||
}
|
||||
fprintf(stderr, "Label %d/'%s' not found\n", type, str);
|
||||
Py_FatalError("grammar.c:findlabel()");
|
||||
|
||||
/* Py_FatalError() is declared with __attribute__((__noreturn__)).
|
||||
GCC emits a warning without "return 0;" (compiler bug!), but Clang is
|
||||
smarter and emits a warning on the return... */
|
||||
#ifndef __clang__
|
||||
return 0; /* Make gcc -Wall happy */
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Forward */
|
||||
static void translabel(grammar *, label *);
|
||||
|
||||
void
|
||||
translatelabels(grammar *g)
|
||||
{
|
||||
int i;
|
||||
|
||||
#ifdef Py_DEBUG
|
||||
printf("Translating labels ...\n");
|
||||
#endif
|
||||
/* Don't translate EMPTY */
|
||||
for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
|
||||
translabel(g, &g->g_ll.ll_label[i]);
|
||||
}
|
||||
|
||||
static void
|
||||
translabel(grammar *g, label *lb)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb));
|
||||
|
||||
if (lb->lb_type == NAME) {
|
||||
for (i = 0; i < g->g_ndfas; i++) {
|
||||
if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
|
||||
if (Py_DebugFlag)
|
||||
printf(
|
||||
"Label %s is non-terminal %d.\n",
|
||||
lb->lb_str,
|
||||
g->g_dfa[i].d_type);
|
||||
lb->lb_type = g->g_dfa[i].d_type;
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < (int)N_TOKENS; i++) {
|
||||
if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) {
|
||||
if (Py_DebugFlag)
|
||||
printf("Label %s is terminal %d.\n",
|
||||
lb->lb_str, i);
|
||||
lb->lb_type = i;
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = NULL;
|
||||
return;
|
||||
}
|
||||
}
|
||||
printf("Can't translate NAME label '%s'\n", lb->lb_str);
|
||||
return;
|
||||
}
|
||||
|
||||
if (lb->lb_type == STRING) {
|
||||
if (isalpha(Py_CHARMASK(lb->lb_str[1])) ||
|
||||
lb->lb_str[1] == '_') {
|
||||
char *p;
|
||||
char *src;
|
||||
char *dest;
|
||||
size_t name_len;
|
||||
if (Py_DebugFlag)
|
||||
printf("Label %s is a keyword\n", lb->lb_str);
|
||||
lb->lb_type = NAME;
|
||||
src = lb->lb_str + 1;
|
||||
p = strchr(src, '\'');
|
||||
if (p)
|
||||
name_len = p - src;
|
||||
else
|
||||
name_len = strlen(src);
|
||||
dest = (char *)malloc(name_len + 1);
|
||||
if (!dest) {
|
||||
printf("Can't alloc dest '%s'\n", src);
|
||||
return;
|
||||
}
|
||||
strncpy(dest, src, name_len);
|
||||
dest[name_len] = '\0';
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = dest;
|
||||
}
|
||||
else if (lb->lb_str[2] == lb->lb_str[0]) {
|
||||
int type = (int) PyToken_OneChar(lb->lb_str[1]);
|
||||
if (type != OP) {
|
||||
lb->lb_type = type;
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = NULL;
|
||||
}
|
||||
else
|
||||
printf("Unknown OP label %s\n",
|
||||
lb->lb_str);
|
||||
}
|
||||
else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) {
|
||||
int type = (int) PyToken_TwoChars(lb->lb_str[1],
|
||||
lb->lb_str[2]);
|
||||
if (type != OP) {
|
||||
lb->lb_type = type;
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = NULL;
|
||||
}
|
||||
else
|
||||
printf("Unknown OP label %s\n",
|
||||
lb->lb_str);
|
||||
}
|
||||
else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) {
|
||||
int type = (int) PyToken_ThreeChars(lb->lb_str[1],
|
||||
lb->lb_str[2],
|
||||
lb->lb_str[3]);
|
||||
if (type != OP) {
|
||||
lb->lb_type = type;
|
||||
free(lb->lb_str);
|
||||
lb->lb_str = NULL;
|
||||
}
|
||||
else
|
||||
printf("Unknown OP label %s\n",
|
||||
lb->lb_str);
|
||||
}
|
||||
else
|
||||
printf("Can't translate STRING label %s\n",
|
||||
lb->lb_str);
|
||||
}
|
||||
else
|
||||
printf("Can't translate label '%s'\n",
|
||||
PyGrammar_LabelRepr(lb));
|
||||
}
|
@ -1,159 +0,0 @@
|
||||
|
||||
#include "pgenheaders.h"
|
||||
#include "metagrammar.h"
|
||||
#include "grammar.h"
|
||||
#include "pgen.h"
|
||||
static arc arcs_0_0[3] = {
|
||||
{2, 0},
|
||||
{3, 0},
|
||||
{4, 1},
|
||||
};
|
||||
static arc arcs_0_1[1] = {
|
||||
{0, 1},
|
||||
};
|
||||
static state states_0[2] = {
|
||||
{3, arcs_0_0},
|
||||
{1, arcs_0_1},
|
||||
};
|
||||
static arc arcs_1_0[1] = {
|
||||
{5, 1},
|
||||
};
|
||||
static arc arcs_1_1[1] = {
|
||||
{6, 2},
|
||||
};
|
||||
static arc arcs_1_2[1] = {
|
||||
{7, 3},
|
||||
};
|
||||
static arc arcs_1_3[1] = {
|
||||
{3, 4},
|
||||
};
|
||||
static arc arcs_1_4[1] = {
|
||||
{0, 4},
|
||||
};
|
||||
static state states_1[5] = {
|
||||
{1, arcs_1_0},
|
||||
{1, arcs_1_1},
|
||||
{1, arcs_1_2},
|
||||
{1, arcs_1_3},
|
||||
{1, arcs_1_4},
|
||||
};
|
||||
static arc arcs_2_0[1] = {
|
||||
{8, 1},
|
||||
};
|
||||
static arc arcs_2_1[2] = {
|
||||
{9, 0},
|
||||
{0, 1},
|
||||
};
|
||||
static state states_2[2] = {
|
||||
{1, arcs_2_0},
|
||||
{2, arcs_2_1},
|
||||
};
|
||||
static arc arcs_3_0[1] = {
|
||||
{10, 1},
|
||||
};
|
||||
static arc arcs_3_1[2] = {
|
||||
{10, 1},
|
||||
{0, 1},
|
||||
};
|
||||
static state states_3[2] = {
|
||||
{1, arcs_3_0},
|
||||
{2, arcs_3_1},
|
||||
};
|
||||
static arc arcs_4_0[2] = {
|
||||
{11, 1},
|
||||
{13, 2},
|
||||
};
|
||||
static arc arcs_4_1[1] = {
|
||||
{7, 3},
|
||||
};
|
||||
static arc arcs_4_2[3] = {
|
||||
{14, 4},
|
||||
{15, 4},
|
||||
{0, 2},
|
||||
};
|
||||
static arc arcs_4_3[1] = {
|
||||
{12, 4},
|
||||
};
|
||||
static arc arcs_4_4[1] = {
|
||||
{0, 4},
|
||||
};
|
||||
static state states_4[5] = {
|
||||
{2, arcs_4_0},
|
||||
{1, arcs_4_1},
|
||||
{3, arcs_4_2},
|
||||
{1, arcs_4_3},
|
||||
{1, arcs_4_4},
|
||||
};
|
||||
static arc arcs_5_0[3] = {
|
||||
{5, 1},
|
||||
{16, 1},
|
||||
{17, 2},
|
||||
};
|
||||
static arc arcs_5_1[1] = {
|
||||
{0, 1},
|
||||
};
|
||||
static arc arcs_5_2[1] = {
|
||||
{7, 3},
|
||||
};
|
||||
static arc arcs_5_3[1] = {
|
||||
{18, 1},
|
||||
};
|
||||
static state states_5[4] = {
|
||||
{3, arcs_5_0},
|
||||
{1, arcs_5_1},
|
||||
{1, arcs_5_2},
|
||||
{1, arcs_5_3},
|
||||
};
|
||||
static dfa dfas[6] = {
|
||||
{256, "MSTART", 0, 2, states_0,
|
||||
"\070\000\000"},
|
||||
{257, "RULE", 0, 5, states_1,
|
||||
"\040\000\000"},
|
||||
{258, "RHS", 0, 2, states_2,
|
||||
"\040\010\003"},
|
||||
{259, "ALT", 0, 2, states_3,
|
||||
"\040\010\003"},
|
||||
{260, "ITEM", 0, 5, states_4,
|
||||
"\040\010\003"},
|
||||
{261, "ATOM", 0, 4, states_5,
|
||||
"\040\000\003"},
|
||||
};
|
||||
static label labels[19] = {
|
||||
{0, "EMPTY"},
|
||||
{256, 0},
|
||||
{257, 0},
|
||||
{4, 0},
|
||||
{0, 0},
|
||||
{1, 0},
|
||||
{11, 0},
|
||||
{258, 0},
|
||||
{259, 0},
|
||||
{18, 0},
|
||||
{260, 0},
|
||||
{9, 0},
|
||||
{10, 0},
|
||||
{261, 0},
|
||||
{16, 0},
|
||||
{14, 0},
|
||||
{3, 0},
|
||||
{7, 0},
|
||||
{8, 0},
|
||||
};
|
||||
static grammar _PyParser_Grammar = {
|
||||
6,
|
||||
dfas,
|
||||
{19, labels},
|
||||
256
|
||||
};
|
||||
|
||||
grammar *
|
||||
meta_grammar(void)
|
||||
{
|
||||
return &_PyParser_Grammar;
|
||||
}
|
||||
|
||||
grammar *
|
||||
Py_meta_grammar(void)
|
||||
{
|
||||
return meta_grammar();
|
||||
}
|
@ -99,10 +99,8 @@ PyParser_ParseStringObject(const char *s, PyObject *filename,
|
||||
tok->type_comments = 1;
|
||||
}
|
||||
|
||||
#ifndef PGEN
|
||||
Py_INCREF(err_ret->filename);
|
||||
tok->filename = err_ret->filename;
|
||||
#endif
|
||||
return parsetok(tok, g, start, err_ret, flags);
|
||||
}
|
||||
|
||||
@ -113,7 +111,6 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
|
||||
{
|
||||
node *n;
|
||||
PyObject *filename = NULL;
|
||||
#ifndef PGEN
|
||||
if (filename_str != NULL) {
|
||||
filename = PyUnicode_DecodeFSDefault(filename_str);
|
||||
if (filename == NULL) {
|
||||
@ -121,11 +118,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
|
||||
#ifndef PGEN
|
||||
Py_XDECREF(filename);
|
||||
#endif
|
||||
return n;
|
||||
}
|
||||
|
||||
@ -169,10 +163,8 @@ PyParser_ParseFileObject(FILE *fp, PyObject *filename,
|
||||
if (*flags & PyPARSE_TYPE_COMMENTS) {
|
||||
tok->type_comments = 1;
|
||||
}
|
||||
#ifndef PGEN
|
||||
Py_INCREF(err_ret->filename);
|
||||
tok->filename = err_ret->filename;
|
||||
#endif
|
||||
return parsetok(tok, g, start, err_ret, flags);
|
||||
}
|
||||
|
||||
@ -184,7 +176,6 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
|
||||
{
|
||||
node *n;
|
||||
PyObject *fileobj = NULL;
|
||||
#ifndef PGEN
|
||||
if (filename != NULL) {
|
||||
fileobj = PyUnicode_DecodeFSDefault(filename);
|
||||
if (fileobj == NULL) {
|
||||
@ -192,12 +183,9 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
n = PyParser_ParseFileObject(fp, fileobj, enc, g,
|
||||
start, ps1, ps2, err_ret, flags);
|
||||
#ifndef PGEN
|
||||
Py_XDECREF(fileobj);
|
||||
#endif
|
||||
return n;
|
||||
}
|
||||
|
||||
@ -371,7 +359,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PGEN
|
||||
/* Check that the source for a single input statement really
|
||||
is a single statement by looking at what is left in the
|
||||
buffer after parsing. Trailing whitespace and comments
|
||||
@ -399,7 +386,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
|
||||
c = *++cur;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
n = NULL;
|
||||
@ -470,7 +456,6 @@ initerr(perrdetail *err_ret, PyObject *filename)
|
||||
err_ret->text = NULL;
|
||||
err_ret->token = -1;
|
||||
err_ret->expected = -1;
|
||||
#ifndef PGEN
|
||||
if (filename) {
|
||||
Py_INCREF(filename);
|
||||
err_ret->filename = filename;
|
||||
@ -482,6 +467,5 @@ initerr(perrdetail *err_ret, PyObject *filename)
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
@ -1,2 +0,0 @@
|
||||
#define PGEN
|
||||
#include "parsetok.c"
|
724
Parser/pgen.c
724
Parser/pgen.c
@ -1,724 +0,0 @@
|
||||
/* Parser generator */
|
||||
|
||||
/* For a description, see the comments at end of this file */
|
||||
|
||||
#include "Python.h"
|
||||
#include "pgenheaders.h"
|
||||
#include "token.h"
|
||||
#include "node.h"
|
||||
#include "grammar.h"
|
||||
#include "metagrammar.h"
|
||||
#include "pgen.h"
|
||||
|
||||
extern int Py_DebugFlag;
|
||||
extern int Py_IgnoreEnvironmentFlag; /* needed by Py_GETENV */
|
||||
|
||||
|
||||
/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
|
||||
|
||||
typedef struct _nfaarc {
|
||||
int ar_label;
|
||||
int ar_arrow;
|
||||
} nfaarc;
|
||||
|
||||
typedef struct _nfastate {
|
||||
int st_narcs;
|
||||
nfaarc *st_arc;
|
||||
} nfastate;
|
||||
|
||||
typedef struct _nfa {
|
||||
int nf_type;
|
||||
char *nf_name;
|
||||
int nf_nstates;
|
||||
nfastate *nf_state;
|
||||
int nf_start, nf_finish;
|
||||
} nfa;
|
||||
|
||||
/* Forward */
|
||||
static void compile_rhs(labellist *ll,
|
||||
nfa *nf, node *n, int *pa, int *pb);
|
||||
static void compile_alt(labellist *ll,
|
||||
nfa *nf, node *n, int *pa, int *pb);
|
||||
static void compile_item(labellist *ll,
|
||||
nfa *nf, node *n, int *pa, int *pb);
|
||||
static void compile_atom(labellist *ll,
|
||||
nfa *nf, node *n, int *pa, int *pb);
|
||||
|
||||
static int
|
||||
addnfastate(nfa *nf)
|
||||
{
|
||||
nfastate *st;
|
||||
|
||||
nf->nf_state = (nfastate *)PyObject_REALLOC(nf->nf_state,
|
||||
sizeof(nfastate) * (nf->nf_nstates + 1));
|
||||
if (nf->nf_state == NULL)
|
||||
Py_FatalError("out of mem");
|
||||
st = &nf->nf_state[nf->nf_nstates++];
|
||||
st->st_narcs = 0;
|
||||
st->st_arc = NULL;
|
||||
return st - nf->nf_state;
|
||||
}
|
||||
|
||||
static void
|
||||
addnfaarc(nfa *nf, int from, int to, int lbl)
|
||||
{
|
||||
nfastate *st;
|
||||
nfaarc *ar;
|
||||
|
||||
st = &nf->nf_state[from];
|
||||
st->st_arc = (nfaarc *)PyObject_REALLOC(st->st_arc,
|
||||
sizeof(nfaarc) * (st->st_narcs + 1));
|
||||
if (st->st_arc == NULL)
|
||||
Py_FatalError("out of mem");
|
||||
ar = &st->st_arc[st->st_narcs++];
|
||||
ar->ar_label = lbl;
|
||||
ar->ar_arrow = to;
|
||||
}
|
||||
|
||||
static nfa *
|
||||
newnfa(char *name)
|
||||
{
|
||||
nfa *nf;
|
||||
static int type = NT_OFFSET; /* All types will be disjunct */
|
||||
|
||||
nf = (nfa *)PyObject_MALLOC(sizeof(nfa));
|
||||
if (nf == NULL)
|
||||
Py_FatalError("no mem for new nfa");
|
||||
nf->nf_type = type++;
|
||||
nf->nf_name = name; /* XXX strdup(name) ??? */
|
||||
nf->nf_nstates = 0;
|
||||
nf->nf_state = NULL;
|
||||
nf->nf_start = nf->nf_finish = -1;
|
||||
return nf;
|
||||
}
|
||||
|
||||
typedef struct _nfagrammar {
|
||||
int gr_nnfas;
|
||||
nfa **gr_nfa;
|
||||
labellist gr_ll;
|
||||
} nfagrammar;
|
||||
|
||||
/* Forward */
|
||||
static void compile_rule(nfagrammar *gr, node *n);
|
||||
|
||||
static nfagrammar *
|
||||
newnfagrammar(void)
|
||||
{
|
||||
nfagrammar *gr;
|
||||
|
||||
gr = (nfagrammar *)PyObject_MALLOC(sizeof(nfagrammar));
|
||||
if (gr == NULL)
|
||||
Py_FatalError("no mem for new nfa grammar");
|
||||
gr->gr_nnfas = 0;
|
||||
gr->gr_nfa = NULL;
|
||||
gr->gr_ll.ll_nlabels = 0;
|
||||
gr->gr_ll.ll_label = NULL;
|
||||
addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
|
||||
return gr;
|
||||
}
|
||||
|
||||
static void
|
||||
freenfagrammar(nfagrammar *gr)
|
||||
{
|
||||
for (int i = 0; i < gr->gr_nnfas; i++) {
|
||||
PyObject_FREE(gr->gr_nfa[i]->nf_state);
|
||||
}
|
||||
PyObject_FREE(gr->gr_nfa);
|
||||
PyObject_FREE(gr);
|
||||
}
|
||||
|
||||
static nfa *
|
||||
addnfa(nfagrammar *gr, char *name)
|
||||
{
|
||||
nfa *nf;
|
||||
|
||||
nf = newnfa(name);
|
||||
gr->gr_nfa = (nfa **)PyObject_REALLOC(gr->gr_nfa,
|
||||
sizeof(nfa*) * (gr->gr_nnfas + 1));
|
||||
if (gr->gr_nfa == NULL)
|
||||
Py_FatalError("out of mem");
|
||||
gr->gr_nfa[gr->gr_nnfas++] = nf;
|
||||
addlabel(&gr->gr_ll, NAME, nf->nf_name);
|
||||
return nf;
|
||||
}
|
||||
|
||||
#ifdef Py_DEBUG
|
||||
|
||||
static const char REQNFMT[] = "metacompile: less than %d children\n";
|
||||
|
||||
#define REQN(i, count) do { \
|
||||
if (i < count) { \
|
||||
fprintf(stderr, REQNFMT, count); \
|
||||
Py_FatalError("REQN"); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
#define REQN(i, count) /* empty */
|
||||
#endif
|
||||
|
||||
static nfagrammar *
|
||||
metacompile(node *n)
|
||||
{
|
||||
nfagrammar *gr;
|
||||
int i;
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printf("Compiling (meta-) parse tree into NFA grammar\n");
|
||||
gr = newnfagrammar();
|
||||
REQ(n, MSTART);
|
||||
i = n->n_nchildren - 1; /* Last child is ENDMARKER */
|
||||
n = n->n_child;
|
||||
for (; --i >= 0; n++) {
|
||||
if (n->n_type != NEWLINE)
|
||||
compile_rule(gr, n);
|
||||
}
|
||||
return gr;
|
||||
}
|
||||
|
||||
static void
|
||||
compile_rule(nfagrammar *gr, node *n)
|
||||
{
|
||||
nfa *nf;
|
||||
|
||||
REQ(n, RULE);
|
||||
REQN(n->n_nchildren, 4);
|
||||
n = n->n_child;
|
||||
REQ(n, NAME);
|
||||
nf = addnfa(gr, n->n_str);
|
||||
n++;
|
||||
REQ(n, COLON);
|
||||
n++;
|
||||
REQ(n, RHS);
|
||||
compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
|
||||
n++;
|
||||
REQ(n, NEWLINE);
|
||||
}
|
||||
|
||||
static void
|
||||
compile_rhs(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
|
||||
{
|
||||
int i;
|
||||
int a, b;
|
||||
|
||||
REQ(n, RHS);
|
||||
i = n->n_nchildren;
|
||||
REQN(i, 1);
|
||||
n = n->n_child;
|
||||
REQ(n, ALT);
|
||||
compile_alt(ll, nf, n, pa, pb);
|
||||
if (--i <= 0)
|
||||
return;
|
||||
n++;
|
||||
a = *pa;
|
||||
b = *pb;
|
||||
*pa = addnfastate(nf);
|
||||
*pb = addnfastate(nf);
|
||||
addnfaarc(nf, *pa, a, EMPTY);
|
||||
addnfaarc(nf, b, *pb, EMPTY);
|
||||
for (; --i >= 0; n++) {
|
||||
REQ(n, VBAR);
|
||||
REQN(i, 1);
|
||||
--i;
|
||||
n++;
|
||||
REQ(n, ALT);
|
||||
compile_alt(ll, nf, n, &a, &b);
|
||||
addnfaarc(nf, *pa, a, EMPTY);
|
||||
addnfaarc(nf, b, *pb, EMPTY);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
compile_alt(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
|
||||
{
|
||||
int i;
|
||||
int a, b;
|
||||
|
||||
REQ(n, ALT);
|
||||
i = n->n_nchildren;
|
||||
REQN(i, 1);
|
||||
n = n->n_child;
|
||||
REQ(n, ITEM);
|
||||
compile_item(ll, nf, n, pa, pb);
|
||||
--i;
|
||||
n++;
|
||||
for (; --i >= 0; n++) {
|
||||
REQ(n, ITEM);
|
||||
compile_item(ll, nf, n, &a, &b);
|
||||
addnfaarc(nf, *pb, a, EMPTY);
|
||||
*pb = b;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
compile_item(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
|
||||
{
|
||||
int i;
|
||||
int a, b;
|
||||
|
||||
REQ(n, ITEM);
|
||||
i = n->n_nchildren;
|
||||
REQN(i, 1);
|
||||
n = n->n_child;
|
||||
if (n->n_type == LSQB) {
|
||||
REQN(i, 3);
|
||||
n++;
|
||||
REQ(n, RHS);
|
||||
*pa = addnfastate(nf);
|
||||
*pb = addnfastate(nf);
|
||||
addnfaarc(nf, *pa, *pb, EMPTY);
|
||||
compile_rhs(ll, nf, n, &a, &b);
|
||||
addnfaarc(nf, *pa, a, EMPTY);
|
||||
addnfaarc(nf, b, *pb, EMPTY);
|
||||
REQN(i, 1);
|
||||
n++;
|
||||
REQ(n, RSQB);
|
||||
}
|
||||
else {
|
||||
compile_atom(ll, nf, n, pa, pb);
|
||||
if (--i <= 0)
|
||||
return;
|
||||
n++;
|
||||
addnfaarc(nf, *pb, *pa, EMPTY);
|
||||
if (n->n_type == STAR)
|
||||
*pb = *pa;
|
||||
else
|
||||
REQ(n, PLUS);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
compile_atom(labellist *ll, nfa *nf, node *n, int *pa, int *pb)
|
||||
{
|
||||
int i;
|
||||
|
||||
REQ(n, ATOM);
|
||||
i = n->n_nchildren;
|
||||
(void)i; /* Don't warn about set but unused */
|
||||
REQN(i, 1);
|
||||
n = n->n_child;
|
||||
if (n->n_type == LPAR) {
|
||||
REQN(i, 3);
|
||||
n++;
|
||||
REQ(n, RHS);
|
||||
compile_rhs(ll, nf, n, pa, pb);
|
||||
n++;
|
||||
REQ(n, RPAR);
|
||||
}
|
||||
else if (n->n_type == NAME || n->n_type == STRING) {
|
||||
*pa = addnfastate(nf);
|
||||
*pb = addnfastate(nf);
|
||||
addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
|
||||
}
|
||||
else
|
||||
REQ(n, NAME);
|
||||
}
|
||||
|
||||
static void
|
||||
dumpstate(labellist *ll, nfa *nf, int istate)
|
||||
{
|
||||
nfastate *st;
|
||||
int i;
|
||||
nfaarc *ar;
|
||||
|
||||
printf("%c%2d%c",
|
||||
istate == nf->nf_start ? '*' : ' ',
|
||||
istate,
|
||||
istate == nf->nf_finish ? '.' : ' ');
|
||||
st = &nf->nf_state[istate];
|
||||
ar = st->st_arc;
|
||||
for (i = 0; i < st->st_narcs; i++) {
|
||||
if (i > 0)
|
||||
printf("\n ");
|
||||
printf("-> %2d %s", ar->ar_arrow,
|
||||
PyGrammar_LabelRepr(&ll->ll_label[ar->ar_label]));
|
||||
ar++;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void
|
||||
dumpnfa(labellist *ll, nfa *nf)
|
||||
{
|
||||
int i;
|
||||
|
||||
printf("NFA '%s' has %d states; start %d, finish %d\n",
|
||||
nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
|
||||
for (i = 0; i < nf->nf_nstates; i++)
|
||||
dumpstate(ll, nf, i);
|
||||
}
|
||||
|
||||
|
||||
/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
|
||||
|
||||
static void
|
||||
addclosure(bitset ss, nfa *nf, int istate)
|
||||
{
|
||||
if (addbit(ss, istate)) {
|
||||
nfastate *st = &nf->nf_state[istate];
|
||||
nfaarc *ar = st->st_arc;
|
||||
int i;
|
||||
|
||||
for (i = st->st_narcs; --i >= 0; ) {
|
||||
if (ar->ar_label == EMPTY)
|
||||
addclosure(ss, nf, ar->ar_arrow);
|
||||
ar++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct _ss_arc {
|
||||
bitset sa_bitset;
|
||||
int sa_arrow;
|
||||
int sa_label;
|
||||
} ss_arc;
|
||||
|
||||
typedef struct _ss_state {
|
||||
bitset ss_ss;
|
||||
int ss_narcs;
|
||||
struct _ss_arc *ss_arc;
|
||||
int ss_deleted;
|
||||
int ss_finish;
|
||||
int ss_rename;
|
||||
} ss_state;
|
||||
|
||||
typedef struct _ss_dfa {
|
||||
int sd_nstates;
|
||||
ss_state *sd_state;
|
||||
} ss_dfa;
|
||||
|
||||
/* Forward */
|
||||
static void printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
|
||||
labellist *ll, const char *msg);
|
||||
static void simplify(int xx_nstates, ss_state *xx_state);
|
||||
static void convert(dfa *d, int xx_nstates, ss_state *xx_state);
|
||||
|
||||
static void
|
||||
makedfa(nfagrammar *gr, nfa *nf, dfa *d)
|
||||
{
|
||||
int nbits = nf->nf_nstates;
|
||||
bitset ss;
|
||||
int xx_nstates;
|
||||
ss_state *xx_state, *yy;
|
||||
ss_arc *zz;
|
||||
int istate, jstate, iarc, jarc, ibit;
|
||||
nfastate *st;
|
||||
nfaarc *ar;
|
||||
|
||||
ss = newbitset(nbits);
|
||||
addclosure(ss, nf, nf->nf_start);
|
||||
xx_state = (ss_state *)PyObject_MALLOC(sizeof(ss_state));
|
||||
if (xx_state == NULL)
|
||||
Py_FatalError("no mem for xx_state in makedfa");
|
||||
xx_nstates = 1;
|
||||
yy = &xx_state[0];
|
||||
yy->ss_ss = ss;
|
||||
yy->ss_narcs = 0;
|
||||
yy->ss_arc = NULL;
|
||||
yy->ss_deleted = 0;
|
||||
yy->ss_finish = testbit(ss, nf->nf_finish);
|
||||
if (yy->ss_finish)
|
||||
printf("Error: nonterminal '%s' may produce empty.\n",
|
||||
nf->nf_name);
|
||||
|
||||
/* This algorithm is from a book written before
|
||||
the invention of structured programming... */
|
||||
|
||||
/* For each unmarked state... */
|
||||
for (istate = 0; istate < xx_nstates; ++istate) {
|
||||
size_t size;
|
||||
yy = &xx_state[istate];
|
||||
ss = yy->ss_ss;
|
||||
/* For all its states... */
|
||||
for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
|
||||
if (!testbit(ss, ibit))
|
||||
continue;
|
||||
st = &nf->nf_state[ibit];
|
||||
/* For all non-empty arcs from this state... */
|
||||
for (iarc = 0; iarc < st->st_narcs; iarc++) {
|
||||
ar = &st->st_arc[iarc];
|
||||
if (ar->ar_label == EMPTY)
|
||||
continue;
|
||||
/* Look up in list of arcs from this state */
|
||||
for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
|
||||
zz = &yy->ss_arc[jarc];
|
||||
if (ar->ar_label == zz->sa_label)
|
||||
goto found;
|
||||
}
|
||||
/* Add new arc for this state */
|
||||
size = sizeof(ss_arc) * (yy->ss_narcs + 1);
|
||||
yy->ss_arc = (ss_arc *)PyObject_REALLOC(
|
||||
yy->ss_arc, size);
|
||||
if (yy->ss_arc == NULL)
|
||||
Py_FatalError("out of mem");
|
||||
zz = &yy->ss_arc[yy->ss_narcs++];
|
||||
zz->sa_label = ar->ar_label;
|
||||
zz->sa_bitset = newbitset(nbits);
|
||||
zz->sa_arrow = -1;
|
||||
found: ;
|
||||
/* Add destination */
|
||||
addclosure(zz->sa_bitset, nf, ar->ar_arrow);
|
||||
}
|
||||
}
|
||||
/* Now look up all the arrow states */
|
||||
for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
|
||||
zz = &xx_state[istate].ss_arc[jarc];
|
||||
for (jstate = 0; jstate < xx_nstates; jstate++) {
|
||||
if (samebitset(zz->sa_bitset,
|
||||
xx_state[jstate].ss_ss, nbits)) {
|
||||
zz->sa_arrow = jstate;
|
||||
goto done;
|
||||
}
|
||||
}
|
||||
size = sizeof(ss_state) * (xx_nstates + 1);
|
||||
xx_state = (ss_state *)PyObject_REALLOC(xx_state,
|
||||
size);
|
||||
if (xx_state == NULL)
|
||||
Py_FatalError("out of mem");
|
||||
zz->sa_arrow = xx_nstates;
|
||||
yy = &xx_state[xx_nstates++];
|
||||
yy->ss_ss = zz->sa_bitset;
|
||||
yy->ss_narcs = 0;
|
||||
yy->ss_arc = NULL;
|
||||
yy->ss_deleted = 0;
|
||||
yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
|
||||
done: ;
|
||||
}
|
||||
}
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
|
||||
"before minimizing");
|
||||
|
||||
simplify(xx_nstates, xx_state);
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
|
||||
"after minimizing");
|
||||
|
||||
convert(d, xx_nstates, xx_state);
|
||||
|
||||
for (int i = 0; i < xx_nstates; i++) {
|
||||
for (int j = 0; j < xx_state[i].ss_narcs; j++)
|
||||
delbitset(xx_state[i].ss_arc[j].sa_bitset);
|
||||
PyObject_FREE(xx_state[i].ss_arc);
|
||||
}
|
||||
PyObject_FREE(xx_state);
|
||||
}
|
||||
|
||||
static void
|
||||
printssdfa(int xx_nstates, ss_state *xx_state, int nbits,
|
||||
labellist *ll, const char *msg)
|
||||
{
|
||||
int i, ibit, iarc;
|
||||
ss_state *yy;
|
||||
ss_arc *zz;
|
||||
|
||||
printf("Subset DFA %s\n", msg);
|
||||
for (i = 0; i < xx_nstates; i++) {
|
||||
yy = &xx_state[i];
|
||||
if (yy->ss_deleted)
|
||||
continue;
|
||||
printf(" Subset %d", i);
|
||||
if (yy->ss_finish)
|
||||
printf(" (finish)");
|
||||
printf(" { ");
|
||||
for (ibit = 0; ibit < nbits; ibit++) {
|
||||
if (testbit(yy->ss_ss, ibit))
|
||||
printf("%d ", ibit);
|
||||
}
|
||||
printf("}\n");
|
||||
for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
|
||||
zz = &yy->ss_arc[iarc];
|
||||
printf(" Arc to state %d, label %s\n",
|
||||
zz->sa_arrow,
|
||||
PyGrammar_LabelRepr(
|
||||
&ll->ll_label[zz->sa_label]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* PART THREE -- SIMPLIFY DFA */
|
||||
|
||||
/* Simplify the DFA by repeatedly eliminating states that are
|
||||
equivalent to another oner. This is NOT Algorithm 3.3 from
|
||||
[Aho&Ullman 77]. It does not always finds the minimal DFA,
|
||||
but it does usually make a much smaller one... (For an example
|
||||
of sub-optimal behavior, try S: x a b+ | y a b+.)
|
||||
*/
|
||||
|
||||
static int
|
||||
samestate(ss_state *s1, ss_state *s2)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
|
||||
return 0;
|
||||
for (i = 0; i < s1->ss_narcs; i++) {
|
||||
if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
|
||||
s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
static void
|
||||
renamestates(int xx_nstates, ss_state *xx_state, int from, int to)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
if (Py_DebugFlag)
|
||||
printf("Rename state %d to %d.\n", from, to);
|
||||
for (i = 0; i < xx_nstates; i++) {
|
||||
if (xx_state[i].ss_deleted)
|
||||
continue;
|
||||
for (j = 0; j < xx_state[i].ss_narcs; j++) {
|
||||
if (xx_state[i].ss_arc[j].sa_arrow == from)
|
||||
xx_state[i].ss_arc[j].sa_arrow = to;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
simplify(int xx_nstates, ss_state *xx_state)
|
||||
{
|
||||
int changes;
|
||||
int i, j;
|
||||
|
||||
do {
|
||||
changes = 0;
|
||||
for (i = 1; i < xx_nstates; i++) {
|
||||
if (xx_state[i].ss_deleted)
|
||||
continue;
|
||||
for (j = 0; j < i; j++) {
|
||||
if (xx_state[j].ss_deleted)
|
||||
continue;
|
||||
if (samestate(&xx_state[i], &xx_state[j])) {
|
||||
xx_state[i].ss_deleted++;
|
||||
renamestates(xx_nstates, xx_state,
|
||||
i, j);
|
||||
changes++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (changes);
|
||||
}
|
||||
|
||||
|
||||
/* PART FOUR -- GENERATE PARSING TABLES */
|
||||
|
||||
/* Convert the DFA into a grammar that can be used by our parser */
|
||||
|
||||
static void
|
||||
convert(dfa *d, int xx_nstates, ss_state *xx_state)
|
||||
{
|
||||
int i, j;
|
||||
ss_state *yy;
|
||||
ss_arc *zz;
|
||||
|
||||
for (i = 0; i < xx_nstates; i++) {
|
||||
yy = &xx_state[i];
|
||||
if (yy->ss_deleted)
|
||||
continue;
|
||||
yy->ss_rename = addstate(d);
|
||||
}
|
||||
|
||||
for (i = 0; i < xx_nstates; i++) {
|
||||
yy = &xx_state[i];
|
||||
if (yy->ss_deleted)
|
||||
continue;
|
||||
for (j = 0; j < yy->ss_narcs; j++) {
|
||||
zz = &yy->ss_arc[j];
|
||||
addarc(d, yy->ss_rename,
|
||||
xx_state[zz->sa_arrow].ss_rename,
|
||||
zz->sa_label);
|
||||
}
|
||||
if (yy->ss_finish)
|
||||
addarc(d, yy->ss_rename, yy->ss_rename, 0);
|
||||
}
|
||||
|
||||
d->d_initial = 0;
|
||||
}
|
||||
|
||||
|
||||
/* PART FIVE -- GLUE IT ALL TOGETHER */
|
||||
|
||||
static grammar *
|
||||
maketables(nfagrammar *gr)
|
||||
{
|
||||
int i;
|
||||
nfa *nf;
|
||||
dfa *d;
|
||||
grammar *g;
|
||||
|
||||
if (gr->gr_nnfas == 0)
|
||||
return NULL;
|
||||
g = newgrammar(gr->gr_nfa[0]->nf_type);
|
||||
/* XXX first rule must be start rule */
|
||||
g->g_ll = gr->gr_ll;
|
||||
|
||||
for (i = 0; i < gr->gr_nnfas; i++) {
|
||||
nf = gr->gr_nfa[i];
|
||||
if (Py_DebugFlag) {
|
||||
printf("Dump of NFA for '%s' ...\n", nf->nf_name);
|
||||
dumpnfa(&gr->gr_ll, nf);
|
||||
printf("Making DFA for '%s' ...\n", nf->nf_name);
|
||||
}
|
||||
d = adddfa(g, nf->nf_type, nf->nf_name);
|
||||
makedfa(gr, gr->gr_nfa[i], d);
|
||||
}
|
||||
|
||||
return g;
|
||||
}
|
||||
|
||||
grammar *
|
||||
pgen(node *n)
|
||||
{
|
||||
nfagrammar *gr;
|
||||
grammar *g;
|
||||
|
||||
gr = metacompile(n);
|
||||
g = maketables(gr);
|
||||
translatelabels(g);
|
||||
addfirstsets(g);
|
||||
freenfagrammar(gr);
|
||||
return g;
|
||||
}
|
||||
|
||||
grammar *
|
||||
Py_pgen(node *n)
|
||||
{
|
||||
return pgen(n);
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Description
|
||||
-----------
|
||||
|
||||
Input is a grammar in extended BNF (using * for repetition, + for
|
||||
at-least-once repetition, [] for optional parts, | for alternatives and
|
||||
() for grouping). This has already been parsed and turned into a parse
|
||||
tree.
|
||||
|
||||
Each rule is considered as a regular expression in its own right.
|
||||
It is turned into a Non-deterministic Finite Automaton (NFA), which
|
||||
is then turned into a Deterministic Finite Automaton (DFA), which is then
|
||||
optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
|
||||
or similar compiler books (this technique is more often used for lexical
|
||||
analyzers).
|
||||
|
||||
The DFA's are used by the parser as parsing tables in a special way
|
||||
that's probably unique. Before they are usable, the FIRST sets of all
|
||||
non-terminals are computed.
|
||||
|
||||
Reference
|
||||
---------
|
||||
|
||||
[Aho&Ullman 77]
|
||||
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
|
||||
(first edition)
|
||||
|
||||
*/
|
0
Parser/pgen/__init__.py
Normal file
0
Parser/pgen/__init__.py
Normal file
34
Parser/pgen/__main__.py
Normal file
34
Parser/pgen/__main__.py
Normal file
@ -0,0 +1,34 @@
|
||||
import argparse
|
||||
|
||||
from .pgen import ParserGenerator
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Parser generator main program.")
|
||||
parser.add_argument(
|
||||
"grammar", type=str, help="The file with the grammar definition in EBNF format"
|
||||
)
|
||||
parser.add_argument(
|
||||
"tokens", type=str, help="The file with the token definitions"
|
||||
)
|
||||
parser.add_argument(
|
||||
"graminit_h",
|
||||
type=argparse.FileType('w'),
|
||||
help="The path to write the grammar's non-terminals as #defines",
|
||||
)
|
||||
parser.add_argument(
|
||||
"graminit_c",
|
||||
type=argparse.FileType('w'),
|
||||
help="The path to write the grammar as initialized data",
|
||||
)
|
||||
|
||||
parser.add_argument("--verbose", "-v", action="count")
|
||||
args = parser.parse_args()
|
||||
|
||||
p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
|
||||
grammar = p.make_grammar()
|
||||
grammar.produce_graminit_h(args.graminit_h.write)
|
||||
grammar.produce_graminit_c(args.graminit_c.write)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
160
Parser/pgen/grammar.py
Normal file
160
Parser/pgen/grammar.py
Normal file
@ -0,0 +1,160 @@
|
||||
import collections
|
||||
|
||||
class Grammar:
|
||||
"""Pgen parsing tables conversion class.
|
||||
|
||||
Once initialized, this class supplies the grammar tables for the
|
||||
parsing engine implemented by parse.py. The parsing engine
|
||||
accesses the instance variables directly. The class here does not
|
||||
provide initialization of the tables; several subclasses exist to
|
||||
do this (see the conv and pgen modules).
|
||||
|
||||
The load() method reads the tables from a pickle file, which is
|
||||
much faster than the other ways offered by subclasses. The pickle
|
||||
file is written by calling dump() (after loading the grammar
|
||||
tables using a subclass). The report() method prints a readable
|
||||
representation of the tables to stdout, for debugging.
|
||||
|
||||
The instance variables are as follows:
|
||||
|
||||
symbol2number -- a dict mapping symbol names to numbers. Symbol
|
||||
numbers are always 256 or higher, to distinguish
|
||||
them from token numbers, which are between 0 and
|
||||
255 (inclusive).
|
||||
|
||||
number2symbol -- a dict mapping numbers to symbol names;
|
||||
these two are each other's inverse.
|
||||
|
||||
states -- a list of DFAs, where each DFA is a list of
|
||||
states, each state is a list of arcs, and each
|
||||
arc is a (i, j) pair where i is a label and j is
|
||||
a state number. The DFA number is the index into
|
||||
this list. (This name is slightly confusing.)
|
||||
Final states are represented by a special arc of
|
||||
the form (0, j) where j is its own state number.
|
||||
|
||||
dfas -- a dict mapping symbol numbers to (DFA, first)
|
||||
pairs, where DFA is an item from the states list
|
||||
above, and first is a set of tokens that can
|
||||
begin this grammar rule (represented by a dict
|
||||
whose values are always 1).
|
||||
|
||||
labels -- a list of (x, y) pairs where x is either a token
|
||||
number or a symbol number, and y is either None
|
||||
or a string; the strings are keywords. The label
|
||||
number is the index in this list; label numbers
|
||||
are used to mark state transitions (arcs) in the
|
||||
DFAs.
|
||||
|
||||
start -- the number of the grammar's start symbol.
|
||||
|
||||
keywords -- a dict mapping keyword strings to arc labels.
|
||||
|
||||
tokens -- a dict mapping token numbers to arc labels.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.symbol2number = collections.OrderedDict()
|
||||
self.number2symbol = collections.OrderedDict()
|
||||
self.states = []
|
||||
self.dfas = collections.OrderedDict()
|
||||
self.labels = [(0, "EMPTY")]
|
||||
self.keywords = collections.OrderedDict()
|
||||
self.tokens = collections.OrderedDict()
|
||||
self.symbol2label = collections.OrderedDict()
|
||||
self.start = 256
|
||||
|
||||
def produce_graminit_h(self, writer):
|
||||
writer("/* Generated by Parser/pgen */\n\n")
|
||||
for number, symbol in self.number2symbol.items():
|
||||
writer("#define {} {}\n".format(symbol, number))
|
||||
|
||||
def produce_graminit_c(self, writer):
|
||||
writer("/* Generated by Parser/pgen */\n\n")
|
||||
|
||||
writer('#include "pgenheaders.h"\n')
|
||||
writer('#include "grammar.h"\n')
|
||||
writer("grammar _PyParser_Grammar;\n")
|
||||
|
||||
self.print_dfas(writer)
|
||||
self.print_labels(writer)
|
||||
|
||||
writer("grammar _PyParser_Grammar = {\n")
|
||||
writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
|
||||
writer(" dfas,\n")
|
||||
writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
|
||||
writer(" {start_number}\n".format(start_number=self.start))
|
||||
writer("};\n")
|
||||
|
||||
def print_labels(self, writer):
|
||||
writer(
|
||||
"static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
|
||||
)
|
||||
for label, name in self.labels:
|
||||
if name is None:
|
||||
writer(" {{{label}, 0}},\n".format(label=label))
|
||||
else:
|
||||
writer(
|
||||
' {{{label}, "{label_name}"}},\n'.format(
|
||||
label=label, label_name=name
|
||||
)
|
||||
)
|
||||
writer("};\n")
|
||||
|
||||
def print_dfas(self, writer):
|
||||
self.print_states(writer)
|
||||
writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
|
||||
for dfaindex, dfa_elem in enumerate(self.dfas.items()):
|
||||
symbol, (dfa, first_sets) = dfa_elem
|
||||
writer(
|
||||
' {{{dfa_symbol}, "{symbol_name}", '.format(
|
||||
dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
|
||||
)
|
||||
+ "0, {n_states}, states_{dfa_index},\n".format(
|
||||
n_states=len(dfa), dfa_index=dfaindex
|
||||
)
|
||||
)
|
||||
writer(' "')
|
||||
|
||||
k = [name for label, name in self.labels if label in first_sets]
|
||||
bitset = bytearray((len(self.labels) >> 3) + 1)
|
||||
for token in first_sets:
|
||||
bitset[token >> 3] |= 1 << (token & 7)
|
||||
for byte in bitset:
|
||||
writer("\\%03o" % (byte & 0xFF))
|
||||
writer('"},\n')
|
||||
writer("};\n")
|
||||
|
||||
def print_states(self, write):
|
||||
for dfaindex, dfa in enumerate(self.states):
|
||||
self.print_arcs(write, dfaindex, dfa)
|
||||
write(
|
||||
"static state states_{dfa_index}[{n_states}] = {{\n".format(
|
||||
dfa_index=dfaindex, n_states=len(dfa)
|
||||
)
|
||||
)
|
||||
for stateindex, state in enumerate(dfa):
|
||||
narcs = len(state)
|
||||
write(
|
||||
" {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
|
||||
n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
|
||||
)
|
||||
)
|
||||
write("};\n")
|
||||
|
||||
def print_arcs(self, write, dfaindex, states):
|
||||
for stateindex, state in enumerate(states):
|
||||
narcs = len(state)
|
||||
write(
|
||||
"static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
|
||||
dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
|
||||
)
|
||||
)
|
||||
for a, b in state:
|
||||
write(
|
||||
" {{{from_label}, {to_state}}},\n".format(
|
||||
from_label=a, to_state=b
|
||||
)
|
||||
)
|
||||
write("};\n")
|
408
Parser/pgen/pgen.py
Normal file
408
Parser/pgen/pgen.py
Normal file
@ -0,0 +1,408 @@
|
||||
import collections
|
||||
import tokenize # from stdlib
|
||||
|
||||
from . import grammar, token
|
||||
|
||||
class ParserGenerator(object):
|
||||
|
||||
def __init__(self, grammar_file, token_file, stream=None, verbose=False):
|
||||
close_stream = None
|
||||
if stream is None:
|
||||
stream = open(grammar_file)
|
||||
close_stream = stream.close
|
||||
with open(token_file) as tok_file:
|
||||
token_lines = tok_file.readlines()
|
||||
self.tokens = dict(token.generate_tokens(token_lines))
|
||||
self.opmap = dict(token.generate_opmap(token_lines))
|
||||
# Manually add <> so it does not collide with !=
|
||||
self.opmap['<>'] = "NOTEQUAL"
|
||||
self.verbose = verbose
|
||||
self.filename = grammar_file
|
||||
self.stream = stream
|
||||
self.generator = tokenize.generate_tokens(stream.readline)
|
||||
self.gettoken() # Initialize lookahead
|
||||
self.dfas, self.startsymbol = self.parse()
|
||||
if close_stream is not None:
|
||||
close_stream()
|
||||
self.first = {} # map from symbol name to set of tokens
|
||||
self.addfirstsets()
|
||||
|
||||
def make_grammar(self):
|
||||
c = grammar.Grammar()
|
||||
names = list(self.dfas.keys())
|
||||
names.remove(self.startsymbol)
|
||||
names.insert(0, self.startsymbol)
|
||||
for name in names:
|
||||
i = 256 + len(c.symbol2number)
|
||||
c.symbol2number[name] = i
|
||||
c.number2symbol[i] = name
|
||||
for name in names:
|
||||
self.make_label(c, name)
|
||||
dfa = self.dfas[name]
|
||||
states = []
|
||||
for state in dfa:
|
||||
arcs = []
|
||||
for label, next in sorted(state.arcs.items()):
|
||||
arcs.append((self.make_label(c, label), dfa.index(next)))
|
||||
if state.isfinal:
|
||||
arcs.append((0, dfa.index(state)))
|
||||
states.append(arcs)
|
||||
c.states.append(states)
|
||||
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
|
||||
c.start = c.symbol2number[self.startsymbol]
|
||||
|
||||
if self.verbose:
|
||||
print("")
|
||||
print("Grammar summary")
|
||||
print("===============")
|
||||
|
||||
print("- {n_labels} labels".format(n_labels=len(c.labels)))
|
||||
print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
|
||||
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
|
||||
print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
|
||||
print(
|
||||
"- Start symbol: {start_symbol}".format(
|
||||
start_symbol=c.number2symbol[c.start]
|
||||
)
|
||||
)
|
||||
return c
|
||||
|
||||
def make_first(self, c, name):
|
||||
rawfirst = self.first[name]
|
||||
first = set()
|
||||
for label in sorted(rawfirst):
|
||||
ilabel = self.make_label(c, label)
|
||||
##assert ilabel not in first # XXX failed on <> ... !=
|
||||
first.add(ilabel)
|
||||
return first
|
||||
|
||||
def make_label(self, c, label):
|
||||
# XXX Maybe this should be a method on a subclass of converter?
|
||||
ilabel = len(c.labels)
|
||||
if label[0].isalpha():
|
||||
# Either a symbol name or a named token
|
||||
if label in c.symbol2number:
|
||||
# A symbol name (a non-terminal)
|
||||
if label in c.symbol2label:
|
||||
return c.symbol2label[label]
|
||||
else:
|
||||
c.labels.append((c.symbol2number[label], None))
|
||||
c.symbol2label[label] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
# A named token (NAME, NUMBER, STRING)
|
||||
itoken = self.tokens.get(label, None)
|
||||
assert isinstance(itoken, int), label
|
||||
assert itoken in self.tokens.values(), label
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
# Either a keyword or an operator
|
||||
assert label[0] in ('"', "'"), label
|
||||
value = eval(label)
|
||||
if value[0].isalpha():
|
||||
# A keyword
|
||||
if value in c.keywords:
|
||||
return c.keywords[value]
|
||||
else:
|
||||
c.labels.append((self.tokens["NAME"], value))
|
||||
c.keywords[value] = ilabel
|
||||
return ilabel
|
||||
else:
|
||||
# An operator (any non-numeric token)
|
||||
tok_name = self.opmap[value] # Fails if unknown token
|
||||
itoken = self.tokens[tok_name]
|
||||
if itoken in c.tokens:
|
||||
return c.tokens[itoken]
|
||||
else:
|
||||
c.labels.append((itoken, None))
|
||||
c.tokens[itoken] = ilabel
|
||||
return ilabel
|
||||
|
||||
def addfirstsets(self):
|
||||
names = list(self.dfas.keys())
|
||||
for name in names:
|
||||
if name not in self.first:
|
||||
self.calcfirst(name)
|
||||
|
||||
if self.verbose:
|
||||
print("First set for {dfa_name}".format(dfa_name=name))
|
||||
for item in self.first[name]:
|
||||
print(" - {terminal}".format(terminal=item))
|
||||
|
||||
def calcfirst(self, name):
|
||||
dfa = self.dfas[name]
|
||||
self.first[name] = None # dummy to detect left recursion
|
||||
state = dfa[0]
|
||||
totalset = set()
|
||||
overlapcheck = {}
|
||||
for label, next in state.arcs.items():
|
||||
if label in self.dfas:
|
||||
if label in self.first:
|
||||
fset = self.first[label]
|
||||
if fset is None:
|
||||
raise ValueError("recursion for rule %r" % name)
|
||||
else:
|
||||
self.calcfirst(label)
|
||||
fset = self.first[label]
|
||||
totalset.update(fset)
|
||||
overlapcheck[label] = fset
|
||||
else:
|
||||
totalset.add(label)
|
||||
overlapcheck[label] = {label}
|
||||
inverse = {}
|
||||
for label, itsfirst in overlapcheck.items():
|
||||
for symbol in itsfirst:
|
||||
if symbol in inverse:
|
||||
raise ValueError("rule %s is ambiguous; %s is in the"
|
||||
" first sets of %s as well as %s" %
|
||||
(name, symbol, label, inverse[symbol]))
|
||||
inverse[symbol] = label
|
||||
self.first[name] = totalset
|
||||
|
||||
def parse(self):
|
||||
dfas = collections.OrderedDict()
|
||||
startsymbol = None
|
||||
# MSTART: (NEWLINE | RULE)* ENDMARKER
|
||||
while self.type != tokenize.ENDMARKER:
|
||||
while self.type == tokenize.NEWLINE:
|
||||
self.gettoken()
|
||||
# RULE: NAME ':' RHS NEWLINE
|
||||
name = self.expect(tokenize.NAME)
|
||||
if self.verbose:
|
||||
print("Processing rule {dfa_name}".format(dfa_name=name))
|
||||
self.expect(tokenize.OP, ":")
|
||||
a, z = self.parse_rhs()
|
||||
self.expect(tokenize.NEWLINE)
|
||||
if self.verbose:
|
||||
self.dump_nfa(name, a, z)
|
||||
dfa = self.make_dfa(a, z)
|
||||
if self.verbose:
|
||||
self.dump_dfa(name, dfa)
|
||||
oldlen = len(dfa)
|
||||
self.simplify_dfa(dfa)
|
||||
newlen = len(dfa)
|
||||
dfas[name] = dfa
|
||||
#print name, oldlen, newlen
|
||||
if startsymbol is None:
|
||||
startsymbol = name
|
||||
return dfas, startsymbol
|
||||
|
||||
def make_dfa(self, start, finish):
|
||||
# To turn an NFA into a DFA, we define the states of the DFA
|
||||
# to correspond to *sets* of states of the NFA. Then do some
|
||||
# state reduction. Let's represent sets as dicts with 1 for
|
||||
# values.
|
||||
assert isinstance(start, NFAState)
|
||||
assert isinstance(finish, NFAState)
|
||||
def closure(state):
|
||||
base = set()
|
||||
addclosure(state, base)
|
||||
return base
|
||||
def addclosure(state, base):
|
||||
assert isinstance(state, NFAState)
|
||||
if state in base:
|
||||
return
|
||||
base.add(state)
|
||||
for label, next in state.arcs:
|
||||
if label is None:
|
||||
addclosure(next, base)
|
||||
states = [DFAState(closure(start), finish)]
|
||||
for state in states: # NB states grows while we're iterating
|
||||
arcs = {}
|
||||
for nfastate in state.nfaset:
|
||||
for label, next in nfastate.arcs:
|
||||
if label is not None:
|
||||
addclosure(next, arcs.setdefault(label, set()))
|
||||
for label, nfaset in sorted(arcs.items()):
|
||||
for st in states:
|
||||
if st.nfaset == nfaset:
|
||||
break
|
||||
else:
|
||||
st = DFAState(nfaset, finish)
|
||||
states.append(st)
|
||||
state.addarc(st, label)
|
||||
return states # List of DFAState instances; first one is start
|
||||
|
||||
def dump_nfa(self, name, start, finish):
|
||||
print("Dump of NFA for", name)
|
||||
todo = [start]
|
||||
for i, state in enumerate(todo):
|
||||
print(" State", i, state is finish and "(final)" or "")
|
||||
for label, next in state.arcs:
|
||||
if next in todo:
|
||||
j = todo.index(next)
|
||||
else:
|
||||
j = len(todo)
|
||||
todo.append(next)
|
||||
if label is None:
|
||||
print(" -> %d" % j)
|
||||
else:
|
||||
print(" %s -> %d" % (label, j))
|
||||
|
||||
def dump_dfa(self, name, dfa):
|
||||
print("Dump of DFA for", name)
|
||||
for i, state in enumerate(dfa):
|
||||
print(" State", i, state.isfinal and "(final)" or "")
|
||||
for label, next in sorted(state.arcs.items()):
|
||||
print(" %s -> %d" % (label, dfa.index(next)))
|
||||
|
||||
def simplify_dfa(self, dfa):
|
||||
# This is not theoretically optimal, but works well enough.
|
||||
# Algorithm: repeatedly look for two states that have the same
|
||||
# set of arcs (same labels pointing to the same nodes) and
|
||||
# unify them, until things stop changing.
|
||||
|
||||
# dfa is a list of DFAState instances
|
||||
changes = True
|
||||
while changes:
|
||||
changes = False
|
||||
for i, state_i in enumerate(dfa):
|
||||
for j in range(i+1, len(dfa)):
|
||||
state_j = dfa[j]
|
||||
if state_i == state_j:
|
||||
#print " unify", i, j
|
||||
del dfa[j]
|
||||
for state in dfa:
|
||||
state.unifystate(state_j, state_i)
|
||||
changes = True
|
||||
break
|
||||
|
||||
def parse_rhs(self):
|
||||
# RHS: ALT ('|' ALT)*
|
||||
a, z = self.parse_alt()
|
||||
if self.value != "|":
|
||||
return a, z
|
||||
else:
|
||||
aa = NFAState()
|
||||
zz = NFAState()
|
||||
aa.addarc(a)
|
||||
z.addarc(zz)
|
||||
while self.value == "|":
|
||||
self.gettoken()
|
||||
a, z = self.parse_alt()
|
||||
aa.addarc(a)
|
||||
z.addarc(zz)
|
||||
return aa, zz
|
||||
|
||||
def parse_alt(self):
|
||||
# ALT: ITEM+
|
||||
a, b = self.parse_item()
|
||||
while (self.value in ("(", "[") or
|
||||
self.type in (tokenize.NAME, tokenize.STRING)):
|
||||
c, d = self.parse_item()
|
||||
b.addarc(c)
|
||||
b = d
|
||||
return a, b
|
||||
|
||||
def parse_item(self):
|
||||
# ITEM: '[' RHS ']' | ATOM ['+' | '*']
|
||||
if self.value == "[":
|
||||
self.gettoken()
|
||||
a, z = self.parse_rhs()
|
||||
self.expect(tokenize.OP, "]")
|
||||
a.addarc(z)
|
||||
return a, z
|
||||
else:
|
||||
a, z = self.parse_atom()
|
||||
value = self.value
|
||||
if value not in ("+", "*"):
|
||||
return a, z
|
||||
self.gettoken()
|
||||
z.addarc(a)
|
||||
if value == "+":
|
||||
return a, z
|
||||
else:
|
||||
return a, a
|
||||
|
||||
def parse_atom(self):
|
||||
# ATOM: '(' RHS ')' | NAME | STRING
|
||||
if self.value == "(":
|
||||
self.gettoken()
|
||||
a, z = self.parse_rhs()
|
||||
self.expect(tokenize.OP, ")")
|
||||
return a, z
|
||||
elif self.type in (tokenize.NAME, tokenize.STRING):
|
||||
a = NFAState()
|
||||
z = NFAState()
|
||||
a.addarc(z, self.value)
|
||||
self.gettoken()
|
||||
return a, z
|
||||
else:
|
||||
self.raise_error("expected (...) or NAME or STRING, got %s/%s",
|
||||
self.type, self.value)
|
||||
|
||||
def expect(self, type, value=None):
|
||||
if self.type != type or (value is not None and self.value != value):
|
||||
self.raise_error("expected %s/%s, got %s/%s",
|
||||
type, value, self.type, self.value)
|
||||
value = self.value
|
||||
self.gettoken()
|
||||
return value
|
||||
|
||||
def gettoken(self):
|
||||
tup = next(self.generator)
|
||||
while tup[0] in (tokenize.COMMENT, tokenize.NL):
|
||||
tup = next(self.generator)
|
||||
self.type, self.value, self.begin, self.end, self.line = tup
|
||||
# print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))
|
||||
|
||||
def raise_error(self, msg, *args):
|
||||
if args:
|
||||
try:
|
||||
msg = msg % args
|
||||
except:
|
||||
msg = " ".join([msg] + list(map(str, args)))
|
||||
raise SyntaxError(msg, (self.filename, self.end[0],
|
||||
self.end[1], self.line))
|
||||
|
||||
class NFAState(object):
|
||||
|
||||
def __init__(self):
|
||||
self.arcs = [] # list of (label, NFAState) pairs
|
||||
|
||||
def addarc(self, next, label=None):
|
||||
assert label is None or isinstance(label, str)
|
||||
assert isinstance(next, NFAState)
|
||||
self.arcs.append((label, next))
|
||||
|
||||
class DFAState(object):
|
||||
|
||||
def __init__(self, nfaset, final):
|
||||
assert isinstance(nfaset, set)
|
||||
assert isinstance(next(iter(nfaset)), NFAState)
|
||||
assert isinstance(final, NFAState)
|
||||
self.nfaset = nfaset
|
||||
self.isfinal = final in nfaset
|
||||
self.arcs = {} # map from label to DFAState
|
||||
|
||||
def addarc(self, next, label):
|
||||
assert isinstance(label, str)
|
||||
assert label not in self.arcs
|
||||
assert isinstance(next, DFAState)
|
||||
self.arcs[label] = next
|
||||
|
||||
def unifystate(self, old, new):
|
||||
for label, next in self.arcs.items():
|
||||
if next is old:
|
||||
self.arcs[label] = new
|
||||
|
||||
def __eq__(self, other):
|
||||
# Equality test -- ignore the nfaset instance variable
|
||||
assert isinstance(other, DFAState)
|
||||
if self.isfinal != other.isfinal:
|
||||
return False
|
||||
# Can't just return self.arcs == other.arcs, because that
|
||||
# would invoke this method recursively, with cycles...
|
||||
if len(self.arcs) != len(other.arcs):
|
||||
return False
|
||||
for label, next in self.arcs.items():
|
||||
if next is not other.arcs.get(label):
|
||||
return False
|
||||
return True
|
||||
|
||||
__hash__ = None # For Py3 compatibility.
|
40
Parser/pgen/token.py
Normal file
40
Parser/pgen/token.py
Normal file
@ -0,0 +1,40 @@
|
||||
import itertools
|
||||
|
||||
def generate_tokens(tokens):
|
||||
numbers = itertools.count(0)
|
||||
for line in tokens:
|
||||
line = line.strip()
|
||||
|
||||
if not line:
|
||||
continue
|
||||
if line.strip().startswith('#'):
|
||||
continue
|
||||
|
||||
name = line.split()[0]
|
||||
yield (name, next(numbers))
|
||||
|
||||
yield ('N_TOKENS', next(numbers))
|
||||
yield ('NT_OFFSET', 256)
|
||||
|
||||
def generate_opmap(tokens):
|
||||
for line in tokens:
|
||||
line = line.strip()
|
||||
|
||||
if not line:
|
||||
continue
|
||||
if line.strip().startswith('#'):
|
||||
continue
|
||||
|
||||
pieces = line.split()
|
||||
|
||||
if len(pieces) != 2:
|
||||
continue
|
||||
|
||||
name, op = pieces
|
||||
yield (op.strip("'"), name)
|
||||
|
||||
# Yield independently <>. This is needed so it does not collide
|
||||
# with the token generation in "generate_tokens" because if this
|
||||
# symbol is included in Grammar/Tokens, it will collide with !=
|
||||
# as it has the same name (NOTEQUAL).
|
||||
yield ('<>', 'NOTEQUAL')
|
@ -1,188 +0,0 @@
|
||||
|
||||
/* Parser generator main program */
|
||||
|
||||
/* This expects a filename containing the grammar as argv[1] (UNIX)
|
||||
or asks the console for such a file name (THINK C).
|
||||
It writes its output on two files in the current directory:
|
||||
- "graminit.c" gets the grammar as a bunch of initialized data
|
||||
- "graminit.h" gets the grammar's non-terminals as #defines.
|
||||
Error messages and status info during the generation process are
|
||||
written to stdout, or sometimes to stderr. */
|
||||
|
||||
/* XXX TO DO:
|
||||
- check for duplicate definitions of names (instead of fatal err)
|
||||
*/
|
||||
|
||||
#define PGEN
|
||||
|
||||
#include "Python.h"
|
||||
#include "pycore_pymem.h"
|
||||
#include "pycore_pystate.h"
|
||||
#include "pgenheaders.h"
|
||||
#include "grammar.h"
|
||||
#include "node.h"
|
||||
#include "parsetok.h"
|
||||
#include "pgen.h"
|
||||
|
||||
int Py_DebugFlag = 0;
|
||||
int Py_VerboseFlag = 0;
|
||||
int Py_IgnoreEnvironmentFlag = 0;
|
||||
_PyRuntimeState _PyRuntime = _PyRuntimeState_INIT;
|
||||
|
||||
/* Forward */
|
||||
grammar *getgrammar(const char *filename);
|
||||
|
||||
void
|
||||
Py_Exit(int sts)
|
||||
{
|
||||
exit(sts);
|
||||
}
|
||||
|
||||
/* Needed by obmalloc.c */
|
||||
int PyGILState_Check(void)
|
||||
{ return 1; }
|
||||
|
||||
void _PyMem_DumpTraceback(int fd, const void *ptr)
|
||||
{}
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
grammar *g;
|
||||
FILE *fp;
|
||||
char *filename, *graminit_h, *graminit_c;
|
||||
|
||||
if (argc != 4) {
|
||||
fprintf(stderr,
|
||||
"usage: %s grammar graminit.h graminit.c\n", argv[0]);
|
||||
Py_Exit(2);
|
||||
}
|
||||
filename = argv[1];
|
||||
graminit_h = argv[2];
|
||||
graminit_c = argv[3];
|
||||
g = getgrammar(filename);
|
||||
fp = fopen(graminit_c, "w");
|
||||
if (fp == NULL) {
|
||||
perror(graminit_c);
|
||||
Py_Exit(1);
|
||||
}
|
||||
if (Py_DebugFlag)
|
||||
printf("Writing %s ...\n", graminit_c);
|
||||
printgrammar(g, fp);
|
||||
fclose(fp);
|
||||
fp = fopen(graminit_h, "w");
|
||||
if (fp == NULL) {
|
||||
perror(graminit_h);
|
||||
Py_Exit(1);
|
||||
}
|
||||
if (Py_DebugFlag)
|
||||
printf("Writing %s ...\n", graminit_h);
|
||||
printnonterminals(g, fp);
|
||||
fclose(fp);
|
||||
freegrammar(g);
|
||||
Py_Exit(0);
|
||||
return 0; /* Make gcc -Wall happy */
|
||||
}
|
||||
|
||||
grammar *
|
||||
getgrammar(const char *filename)
|
||||
{
|
||||
FILE *fp;
|
||||
node *n;
|
||||
grammar *g0, *g;
|
||||
perrdetail err;
|
||||
|
||||
fp = fopen(filename, "r");
|
||||
if (fp == NULL) {
|
||||
perror(filename);
|
||||
Py_Exit(1);
|
||||
}
|
||||
g0 = meta_grammar();
|
||||
n = PyParser_ParseFile(fp, filename, g0, g0->g_start,
|
||||
(char *)NULL, (char *)NULL, &err);
|
||||
fclose(fp);
|
||||
if (n == NULL) {
|
||||
fprintf(stderr, "Parsing error %d, line %d.\n",
|
||||
err.error, err.lineno);
|
||||
if (err.text != NULL) {
|
||||
size_t len;
|
||||
int i;
|
||||
fprintf(stderr, "%s", err.text);
|
||||
len = strlen(err.text);
|
||||
if (len == 0 || err.text[len-1] != '\n')
|
||||
fprintf(stderr, "\n");
|
||||
for (i = 0; i < err.offset; i++) {
|
||||
if (err.text[i] == '\t')
|
||||
putc('\t', stderr);
|
||||
else
|
||||
putc(' ', stderr);
|
||||
}
|
||||
fprintf(stderr, "^\n");
|
||||
PyObject_FREE(err.text);
|
||||
}
|
||||
Py_Exit(1);
|
||||
}
|
||||
g = pgen(n);
|
||||
PyNode_Free(n);
|
||||
if (g == NULL) {
|
||||
printf("Bad grammar.\n");
|
||||
Py_Exit(1);
|
||||
}
|
||||
return g;
|
||||
}
|
||||
|
||||
/* Can't happen in pgen */
|
||||
PyObject*
|
||||
PyErr_Occurred()
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
Py_FatalError(const char *msg)
|
||||
{
|
||||
fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
|
||||
Py_Exit(1);
|
||||
}
|
||||
|
||||
/* No-nonsense my_readline() for tokenizer.c */
|
||||
|
||||
char *
|
||||
PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
|
||||
{
|
||||
size_t n = 1000;
|
||||
char *p = (char *)PyMem_MALLOC(n);
|
||||
char *q;
|
||||
if (p == NULL)
|
||||
return NULL;
|
||||
fprintf(stderr, "%s", prompt);
|
||||
q = fgets(p, n, sys_stdin);
|
||||
if (q == NULL) {
|
||||
*p = '\0';
|
||||
return p;
|
||||
}
|
||||
n = strlen(p);
|
||||
if (n > 0 && p[n-1] != '\n')
|
||||
p[n-1] = '\n';
|
||||
return (char *)PyMem_REALLOC(p, n+1);
|
||||
}
|
||||
|
||||
/* No-nonsense fgets */
|
||||
char *
|
||||
Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
|
||||
{
|
||||
return fgets(buf, n, stream);
|
||||
}
|
||||
|
||||
|
||||
#include <stdarg.h>
|
||||
|
||||
void
|
||||
PySys_WriteStderr(const char *format, ...)
|
||||
{
|
||||
va_list va;
|
||||
|
||||
va_start(va, format);
|
||||
vfprintf(stderr, format, va);
|
||||
va_end(va);
|
||||
}
|
@ -1,120 +0,0 @@
|
||||
|
||||
/* Print a bunch of C initializers that represent a grammar */
|
||||
|
||||
#define PGEN
|
||||
|
||||
#include "pgenheaders.h"
|
||||
#include "grammar.h"
|
||||
|
||||
/* Forward */
|
||||
static void printarcs(int, dfa *, FILE *);
|
||||
static void printstates(grammar *, FILE *);
|
||||
static void printdfas(grammar *, FILE *);
|
||||
static void printlabels(grammar *, FILE *);
|
||||
|
||||
void
|
||||
printgrammar(grammar *g, FILE *fp)
|
||||
{
|
||||
fprintf(fp, "/* Generated by Parser/pgen */\n\n");
|
||||
fprintf(fp, "#include \"pgenheaders.h\"\n");
|
||||
fprintf(fp, "#include \"grammar.h\"\n");
|
||||
fprintf(fp, "grammar _PyParser_Grammar;\n");
|
||||
printdfas(g, fp);
|
||||
printlabels(g, fp);
|
||||
fprintf(fp, "grammar _PyParser_Grammar = {\n");
|
||||
fprintf(fp, " %d,\n", g->g_ndfas);
|
||||
fprintf(fp, " dfas,\n");
|
||||
fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels);
|
||||
fprintf(fp, " %d\n", g->g_start);
|
||||
fprintf(fp, "};\n");
|
||||
}
|
||||
|
||||
void
|
||||
printnonterminals(grammar *g, FILE *fp)
|
||||
{
|
||||
dfa *d;
|
||||
int i;
|
||||
|
||||
fprintf(fp, "/* Generated by Parser/pgen */\n\n");
|
||||
|
||||
d = g->g_dfa;
|
||||
for (i = g->g_ndfas; --i >= 0; d++)
|
||||
fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
|
||||
}
|
||||
|
||||
static void
|
||||
printarcs(int i, dfa *d, FILE *fp)
|
||||
{
|
||||
arc *a;
|
||||
state *s;
|
||||
int j, k;
|
||||
|
||||
s = d->d_state;
|
||||
for (j = 0; j < d->d_nstates; j++, s++) {
|
||||
fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
|
||||
i, j, s->s_narcs);
|
||||
a = s->s_arc;
|
||||
for (k = 0; k < s->s_narcs; k++, a++)
|
||||
fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow);
|
||||
fprintf(fp, "};\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
printstates(grammar *g, FILE *fp)
|
||||
{
|
||||
state *s;
|
||||
dfa *d;
|
||||
int i, j;
|
||||
|
||||
d = g->g_dfa;
|
||||
for (i = 0; i < g->g_ndfas; i++, d++) {
|
||||
printarcs(i, d, fp);
|
||||
fprintf(fp, "static state states_%d[%d] = {\n",
|
||||
i, d->d_nstates);
|
||||
s = d->d_state;
|
||||
for (j = 0; j < d->d_nstates; j++, s++)
|
||||
fprintf(fp, " {%d, arcs_%d_%d},\n",
|
||||
s->s_narcs, i, j);
|
||||
fprintf(fp, "};\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
printdfas(grammar *g, FILE *fp)
|
||||
{
|
||||
dfa *d;
|
||||
int i, j, n;
|
||||
|
||||
printstates(g, fp);
|
||||
fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
|
||||
d = g->g_dfa;
|
||||
for (i = 0; i < g->g_ndfas; i++, d++) {
|
||||
fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n",
|
||||
d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
|
||||
fprintf(fp, " \"");
|
||||
n = NBYTES(g->g_ll.ll_nlabels);
|
||||
for (j = 0; j < n; j++)
|
||||
fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
|
||||
fprintf(fp, "\"},\n");
|
||||
}
|
||||
fprintf(fp, "};\n");
|
||||
}
|
||||
|
||||
static void
|
||||
printlabels(grammar *g, FILE *fp)
|
||||
{
|
||||
label *l;
|
||||
int i;
|
||||
|
||||
fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
|
||||
l = g->g_ll.ll_label;
|
||||
for (i = g->g_ll.ll_nlabels; --i >= 0; l++) {
|
||||
if (l->lb_str == NULL)
|
||||
fprintf(fp, " {%d, 0},\n", l->lb_type);
|
||||
else
|
||||
fprintf(fp, " {%d, \"%s\"},\n",
|
||||
l->lb_type, l->lb_str);
|
||||
}
|
||||
fprintf(fp, "};\n");
|
||||
}
|
@ -10,13 +10,11 @@
|
||||
#include "tokenizer.h"
|
||||
#include "errcode.h"
|
||||
|
||||
#ifndef PGEN
|
||||
#include "unicodeobject.h"
|
||||
#include "bytesobject.h"
|
||||
#include "fileobject.h"
|
||||
#include "codecs.h"
|
||||
#include "abstract.h"
|
||||
#endif /* PGEN */
|
||||
|
||||
/* Alternate tab spacing */
|
||||
#define ALTTABSIZE 1
|
||||
@ -81,11 +79,9 @@ tok_new(void)
|
||||
tok->enc = NULL;
|
||||
tok->encoding = NULL;
|
||||
tok->cont_line = 0;
|
||||
#ifndef PGEN
|
||||
tok->filename = NULL;
|
||||
tok->decoding_readline = NULL;
|
||||
tok->decoding_buffer = NULL;
|
||||
#endif
|
||||
tok->type_comments = 0;
|
||||
|
||||
return tok;
|
||||
@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
|
||||
return result;
|
||||
}
|
||||
|
||||
#ifdef PGEN
|
||||
|
||||
static char *
|
||||
decoding_fgets(char *s, int size, struct tok_state *tok)
|
||||
{
|
||||
return fgets(s, size, tok->fp);
|
||||
}
|
||||
|
||||
static int
|
||||
decoding_feof(struct tok_state *tok)
|
||||
{
|
||||
return feof(tok->fp);
|
||||
}
|
||||
|
||||
static char *
|
||||
decode_str(const char *str, int exec_input, struct tok_state *tok)
|
||||
{
|
||||
return new_string(str, strlen(str), tok);
|
||||
}
|
||||
|
||||
#else /* PGEN */
|
||||
|
||||
static char *
|
||||
error_ret(struct tok_state *tok) /* XXX */
|
||||
{
|
||||
@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
|
||||
return error_ret(tok);
|
||||
}
|
||||
}
|
||||
#ifndef PGEN
|
||||
/* The default encoding is UTF-8, so make sure we don't have any
|
||||
non-UTF-8 sequences in it. */
|
||||
if (line && !tok->encoding) {
|
||||
@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
|
||||
badchar, tok->filename, tok->lineno + 1);
|
||||
return error_ret(tok);
|
||||
}
|
||||
#endif
|
||||
return line;
|
||||
}
|
||||
|
||||
@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok)
|
||||
return str;
|
||||
}
|
||||
|
||||
#endif /* PGEN */
|
||||
|
||||
/* Set up tokenizer for string */
|
||||
|
||||
struct tok_state *
|
||||
@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input)
|
||||
struct tok_state *tok = tok_new();
|
||||
if (tok == NULL)
|
||||
return NULL;
|
||||
#ifndef PGEN
|
||||
tok->input = str = translate_newlines(str, exec_input, tok);
|
||||
#endif
|
||||
if (str == NULL) {
|
||||
PyTokenizer_Free(tok);
|
||||
return NULL;
|
||||
@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok)
|
||||
{
|
||||
if (tok->encoding != NULL)
|
||||
PyMem_FREE(tok->encoding);
|
||||
#ifndef PGEN
|
||||
Py_XDECREF(tok->decoding_readline);
|
||||
Py_XDECREF(tok->decoding_buffer);
|
||||
Py_XDECREF(tok->filename);
|
||||
#endif
|
||||
if (tok->fp != NULL && tok->buf != NULL)
|
||||
PyMem_FREE(tok->buf);
|
||||
if (tok->input)
|
||||
@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok)
|
||||
}
|
||||
if (tok->prompt != NULL) {
|
||||
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
|
||||
#ifndef PGEN
|
||||
if (newtok != NULL) {
|
||||
char *translated = translate_newlines(newtok, 0, tok);
|
||||
PyMem_FREE(newtok);
|
||||
@ -900,7 +865,6 @@ tok_nextc(struct tok_state *tok)
|
||||
strcpy(newtok, buf);
|
||||
Py_DECREF(u);
|
||||
}
|
||||
#endif
|
||||
if (tok->nextprompt != NULL)
|
||||
tok->prompt = tok->nextprompt;
|
||||
if (newtok == NULL)
|
||||
@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c)
|
||||
static int
|
||||
syntaxerror(struct tok_state *tok, const char *format, ...)
|
||||
{
|
||||
#ifndef PGEN
|
||||
va_list vargs;
|
||||
#ifdef HAVE_STDARG_PROTOTYPES
|
||||
va_start(vargs, format);
|
||||
@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
|
||||
tok->lineno,
|
||||
(int)(tok->cur - tok->line_start));
|
||||
tok->done = E_ERROR;
|
||||
#else
|
||||
tok->done = E_TOKEN;
|
||||
#endif
|
||||
return ERRORTOKEN;
|
||||
}
|
||||
|
||||
@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok)
|
||||
return ERRORTOKEN;
|
||||
}
|
||||
|
||||
#ifdef PGEN
|
||||
#define verify_identifier(tok) 1
|
||||
#else
|
||||
/* Verify that the identifier follows PEP 3131.
|
||||
All identifier strings are guaranteed to be "ready" unicode objects.
|
||||
*/
|
||||
@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok)
|
||||
tok->done = E_IDENTIFIER;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
tok_decimal_tail(struct tok_state *tok)
|
||||
@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
|
||||
case '(':
|
||||
case '[':
|
||||
case '{':
|
||||
#ifndef PGEN
|
||||
if (tok->level >= MAXLEVEL) {
|
||||
return syntaxerror(tok, "too many nested parentheses");
|
||||
}
|
||||
tok->parenstack[tok->level] = c;
|
||||
tok->parenlinenostack[tok->level] = tok->lineno;
|
||||
#endif
|
||||
tok->level++;
|
||||
break;
|
||||
case ')':
|
||||
case ']':
|
||||
case '}':
|
||||
#ifndef PGEN
|
||||
if (!tok->level) {
|
||||
return syntaxerror(tok, "unmatched '%c'", c);
|
||||
}
|
||||
#endif
|
||||
tok->level--;
|
||||
#ifndef PGEN
|
||||
int opening = tok->parenstack[tok->level];
|
||||
if (!((opening == '(' && c == ')') ||
|
||||
(opening == '[' && c == ']') ||
|
||||
@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
|
||||
c, opening);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
|
||||
FILE *fp;
|
||||
char *p_start =NULL , *p_end =NULL , *encoding = NULL;
|
||||
|
||||
#ifndef PGEN
|
||||
fd = _Py_dup(fd);
|
||||
#else
|
||||
fd = dup(fd);
|
||||
#endif
|
||||
if (fd < 0) {
|
||||
return NULL;
|
||||
}
|
||||
@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
|
||||
fclose(fp);
|
||||
return NULL;
|
||||
}
|
||||
#ifndef PGEN
|
||||
if (filename != NULL) {
|
||||
Py_INCREF(filename);
|
||||
tok->filename = filename;
|
||||
@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
|
||||
return encoding;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
while (tok->lineno < 2 && tok->done == E_OK) {
|
||||
PyTokenizer_Get(tok, &p_start, &p_end);
|
||||
}
|
||||
|
@ -42,15 +42,9 @@ struct tok_state {
|
||||
expression (cf. issue 16806) */
|
||||
int level; /* () [] {} Parentheses nesting level */
|
||||
/* Used to allow free continuations inside them */
|
||||
#ifndef PGEN
|
||||
char parenstack[MAXLEVEL];
|
||||
int parenlinenostack[MAXLEVEL];
|
||||
/* pgen doesn't have access to Python codecs, it cannot decode the input
|
||||
filename. The bytes filename might be kept, but it is only used by
|
||||
indenterror() and it is not really needed: pgen only compiles one file
|
||||
(Grammar/Grammar). */
|
||||
PyObject *filename;
|
||||
#endif
|
||||
/* Stuff for checking on different tab sizes */
|
||||
int altindstack[MAXINDENT]; /* Stack of alternate indents */
|
||||
/* Stuff for PEP 0263 */
|
||||
@ -63,10 +57,8 @@ struct tok_state {
|
||||
const char* multi_line_start; /* pointer to start of first line of
|
||||
a single line or multi line string
|
||||
expression (cf. issue 16806) */
|
||||
#ifndef PGEN
|
||||
PyObject *decoding_readline; /* open(...).readline */
|
||||
PyObject *decoding_buffer;
|
||||
#endif
|
||||
const char* enc; /* Encoding for the current str. */
|
||||
const char* str;
|
||||
const char* input; /* Tokenizer's newline translated copy of the string. */
|
||||
|
@ -1,2 +0,0 @@
|
||||
#define PGEN
|
||||
#include "tokenizer.c"
|
1840
Python/graminit.c
1840
Python/graminit.c
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user