Initial revision

This commit is contained in:
Guido van Rossum 1990-10-14 12:07:46 +00:00
parent c636014c43
commit 85a5fbbdfe
78 changed files with 13589 additions and 0 deletions

101
Parser/acceler.c Normal file
View file

@ -0,0 +1,101 @@
/* Parser accelerator module */
#include <stdio.h>
#include "PROTO.h"
#include "grammar.h"
#include "token.h"
#include "malloc.h"
static void
fixstate(g, d, s)
grammar *g;
dfa *d;
state *s;
{
arc *a;
int k;
int *accel;
int nl = g->g_ll.ll_nlabels;
s->s_accept = 0;
accel = NEW(int, nl);
for (k = 0; k < nl; k++)
accel[k] = -1;
a = s->s_arc;
for (k = s->s_narcs; --k >= 0; a++) {
int lbl = a->a_lbl;
label *l = &g->g_ll.ll_label[lbl];
int type = l->lb_type;
if (a->a_arrow >= (1 << 7)) {
printf("XXX too many states!\n");
continue;
}
if (ISNONTERMINAL(type)) {
dfa *d1 = finddfa(g, type);
int ibit;
if (type - NT_OFFSET >= (1 << 7)) {
printf("XXX too high nonterminal number!\n");
continue;
}
for (ibit = 0; ibit < g->g_ll.ll_nlabels; ibit++) {
if (testbit(d1->d_first, ibit)) {
if (accel[ibit] != -1)
printf("XXX ambiguity!\n");
accel[ibit] = a->a_arrow | (1 << 7) |
((type - NT_OFFSET) << 8);
}
}
}
else if (lbl == EMPTY)
s->s_accept = 1;
else if (lbl >= 0 && lbl < nl)
accel[lbl] = a->a_arrow;
}
while (nl > 0 && accel[nl-1] == -1)
nl--;
for (k = 0; k < nl && accel[k] == -1;)
k++;
if (k < nl) {
int i;
s->s_accel = NEW(int, nl-k);
if (s->s_accel == NULL) {
fprintf(stderr, "no mem to add parser accelerators\n");
exit(1);
}
s->s_lower = k;
s->s_upper = nl;
for (i = 0; k < nl; i++, k++)
s->s_accel[i] = accel[k];
}
DEL(accel);
}
static void
fixdfa(g, d)
grammar *g;
dfa *d;
{
state *s;
int j;
s = d->d_state;
for (j = 0; j < d->d_nstates; j++, s++)
fixstate(g, d, s);
}
void
addaccelerators(g)
grammar *g;
{
dfa *d;
int i;
#ifdef DEBUG
printf("Adding parser accellerators ...\n");
#endif
d = g->g_dfa;
for (i = g->g_ndfas; --i >= 0; d++)
fixdfa(g, d);
g->g_accel = 1;
#ifdef DEBUG
printf("Done.\n");
#endif
}

1
Parser/assert.h Normal file
View file

@ -0,0 +1 @@
#define assert(e) { if (!(e)) { printf("Assertion failed\n"); abort(); } }

76
Parser/bitset.c Normal file
View file

@ -0,0 +1,76 @@
/* Bitset primitives */
#include "PROTO.h"
#include "malloc.h"
#include "bitset.h"
bitset
newbitset(nbits)
int nbits;
{
int nbytes = NBYTES(nbits);
bitset ss = NEW(BYTE, nbytes);
if (ss == NULL)
fatal("no mem for bitset");
ss += nbytes;
while (--nbytes >= 0)
*--ss = 0;
return ss;
}
void
delbitset(ss)
bitset ss;
{
DEL(ss);
}
int
addbit(ss, ibit)
bitset ss;
int ibit;
{
int ibyte = BIT2BYTE(ibit);
BYTE mask = BIT2MASK(ibit);
if (ss[ibyte] & mask)
return 0; /* Bit already set */
ss[ibyte] |= mask;
return 1;
}
#if 0 /* Now a macro */
int
testbit(ss, ibit)
bitset ss;
int ibit;
{
return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
}
#endif
int
samebitset(ss1, ss2, nbits)
bitset ss1, ss2;
int nbits;
{
int i;
for (i = NBYTES(nbits); --i >= 0; )
if (*ss1++ != *ss2++)
return 0;
return 1;
}
void
mergebitset(ss1, ss2, nbits)
bitset ss1, ss2;
int nbits;
{
int i;
for (i = NBYTES(nbits); --i >= 0; )
*ss1++ |= *ss2++;
}

109
Parser/firstsets.c Normal file
View file

@ -0,0 +1,109 @@
/* Computation of FIRST stets */
#include <stdio.h>
#include "PROTO.h"
#include "malloc.h"
#include "grammar.h"
#include "token.h"
extern int debugging;
static void
calcfirstset(g, d)
grammar *g;
dfa *d;
{
int i, j;
state *s;
arc *a;
int nsyms;
int *sym;
int nbits;
static bitset dummy;
bitset result;
int type;
dfa *d1;
label *l0;
if (debugging)
printf("Calculate FIRST set for '%s'\n", d->d_name);
if (dummy == NULL)
dummy = newbitset(1);
if (d->d_first == dummy) {
fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
return;
}
if (d->d_first != NULL) {
fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
d->d_name);
}
d->d_first = dummy;
l0 = g->g_ll.ll_label;
nbits = g->g_ll.ll_nlabels;
result = newbitset(nbits);
sym = NEW(int, 1);
if (sym == NULL)
fatal("no mem for new sym in calcfirstset");
nsyms = 1;
sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);
s = &d->d_state[d->d_initial];
for (i = 0; i < s->s_narcs; i++) {
a = &s->s_arc[i];
for (j = 0; j < nsyms; j++) {
if (sym[j] == a->a_lbl)
break;
}
if (j >= nsyms) { /* New label */
RESIZE(sym, int, nsyms + 1);
if (sym == NULL)
fatal("no mem to resize sym in calcfirstset");
sym[nsyms++] = a->a_lbl;
type = l0[a->a_lbl].lb_type;
if (ISNONTERMINAL(type)) {
d1 = finddfa(g, type);
if (d1->d_first == dummy) {
fprintf(stderr,
"Left-recursion below '%s'\n",
d->d_name);
}
else {
if (d1->d_first == NULL)
calcfirstset(g, d1);
mergebitset(result, d1->d_first, nbits);
}
}
else if (ISTERMINAL(type)) {
addbit(result, a->a_lbl);
}
}
}
d->d_first = result;
if (debugging) {
printf("FIRST set for '%s': {", d->d_name);
for (i = 0; i < nbits; i++) {
if (testbit(result, i))
printf(" %s", labelrepr(&l0[i]));
}
printf(" }\n");
}
}
void
addfirstsets(g)
grammar *g;
{
int i;
dfa *d;
printf("Adding FIRST sets ...\n");
for (i = 0; i < g->g_ndfas; i++) {
d = &g->g_dfa[i];
if (d->d_first == NULL)
calcfirstset(g, d);
}
}

207
Parser/grammar.c Normal file
View file

@ -0,0 +1,207 @@
/* Grammar implementation */
#include <stdio.h>
#include <ctype.h>
#include "PROTO.h"
#include "malloc.h"
#include "assert.h"
#include "token.h"
#include "grammar.h"
extern int debugging;
grammar *
newgrammar(start)
int start;
{
grammar *g;
g = NEW(grammar, 1);
if (g == NULL)
fatal("no mem for new grammar");
g->g_ndfas = 0;
g->g_dfa = NULL;
g->g_start = start;
g->g_ll.ll_nlabels = 0;
g->g_ll.ll_label = NULL;
return g;
}
dfa *
adddfa(g, type, name)
grammar *g;
int type;
char *name;
{
dfa *d;
RESIZE(g->g_dfa, dfa, g->g_ndfas + 1);
if (g->g_dfa == NULL)
fatal("no mem to resize dfa in adddfa");
d = &g->g_dfa[g->g_ndfas++];
d->d_type = type;
d->d_name = name;
d->d_nstates = 0;
d->d_state = NULL;
d->d_initial = -1;
d->d_first = NULL;
return d; /* Only use while fresh! */
}
int
addstate(d)
dfa *d;
{
state *s;
RESIZE(d->d_state, state, d->d_nstates + 1);
if (d->d_state == NULL)
fatal("no mem to resize state in addstate");
s = &d->d_state[d->d_nstates++];
s->s_narcs = 0;
s->s_arc = NULL;
return s - d->d_state;
}
void
addarc(d, from, to, lbl)
dfa *d;
int lbl;
{
state *s;
arc *a;
assert(0 <= from && from < d->d_nstates);
assert(0 <= to && to < d->d_nstates);
s = &d->d_state[from];
RESIZE(s->s_arc, arc, s->s_narcs + 1);
if (s->s_arc == NULL)
fatal("no mem to resize arc list in addarc");
a = &s->s_arc[s->s_narcs++];
a->a_lbl = lbl;
a->a_arrow = to;
}
int
addlabel(ll, type, str)
labellist *ll;
int type;
char *str;
{
int i;
label *lb;
for (i = 0; i < ll->ll_nlabels; i++) {
if (ll->ll_label[i].lb_type == type &&
strcmp(ll->ll_label[i].lb_str, str) == 0)
return i;
}
RESIZE(ll->ll_label, label, ll->ll_nlabels + 1);
if (ll->ll_label == NULL)
fatal("no mem to resize labellist in addlabel");
lb = &ll->ll_label[ll->ll_nlabels++];
lb->lb_type = type;
lb->lb_str = str; /* XXX strdup(str) ??? */
return lb - ll->ll_label;
}
/* Same, but rather dies than adds */
int
findlabel(ll, type, str)
labellist *ll;
int type;
char *str;
{
int i;
label *lb;
for (i = 0; i < ll->ll_nlabels; i++) {
if (ll->ll_label[i].lb_type == type /*&&
strcmp(ll->ll_label[i].lb_str, str) == 0*/)
return i;
}
fprintf(stderr, "Label %d/'%s' not found\n", type, str);
abort();
}
static void
translabel(g, lb)
grammar *g;
label *lb;
{
int i;
if (debugging)
printf("Translating label %s ...\n", labelrepr(lb));
if (lb->lb_type == NAME) {
for (i = 0; i < g->g_ndfas; i++) {
if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
if (debugging)
printf("Label %s is non-terminal %d.\n",
lb->lb_str,
g->g_dfa[i].d_type);
lb->lb_type = g->g_dfa[i].d_type;
lb->lb_str = NULL;
return;
}
}
for (i = 0; i < (int)N_TOKENS; i++) {
if (strcmp(lb->lb_str, tok_name[i]) == 0) {
if (debugging)
printf("Label %s is terminal %d.\n",
lb->lb_str, i);
lb->lb_type = i;
lb->lb_str = NULL;
return;
}
}
printf("Can't translate NAME label '%s'\n", lb->lb_str);
return;
}
if (lb->lb_type == STRING) {
if (isalpha(lb->lb_str[1])) {
char *p, *strchr();
if (debugging)
printf("Label %s is a keyword\n", lb->lb_str);
lb->lb_type = NAME;
lb->lb_str++;
p = strchr(lb->lb_str, '\'');
if (p)
*p = '\0';
}
else {
if (lb->lb_str[2] == lb->lb_str[0]) {
int type = (int) tok_1char(lb->lb_str[1]);
if (type != OP) {
lb->lb_type = type;
lb->lb_str = NULL;
}
else
printf("Unknown OP label %s\n",
lb->lb_str);
}
else
printf("Can't translate STRING label %s\n",
lb->lb_str);
}
}
else
printf("Can't translate label '%s'\n", labelrepr(lb));
}
void
translatelabels(g)
grammar *g;
{
int i;
printf("Translating labels ...\n");
/* Don't translate EMPTY */
for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
translabel(g, &g->g_ll.ll_label[i]);
}

52
Parser/grammar1.c Normal file
View file

@ -0,0 +1,52 @@
/* Grammar subroutines needed by parser */
#include "PROTO.h"
#define NULL 0
#include "assert.h"
#include "grammar.h"
#include "token.h"
/* Return the DFA for the given type */
dfa *
finddfa(g, type)
grammar *g;
register int type;
{
register int i;
register dfa *d;
for (i = g->g_ndfas, d = g->g_dfa; --i >= 0; d++) {
if (d->d_type == type)
return d;
}
assert(0);
/* NOTREACHED */
}
char *
labelrepr(lb)
label *lb;
{
static char buf[100];
if (lb->lb_type == ENDMARKER)
return "EMPTY";
else if (ISNONTERMINAL(lb->lb_type)) {
if (lb->lb_str == NULL) {
sprintf(buf, "NT%d", lb->lb_type);
return buf;
}
else
return lb->lb_str;
}
else {
if (lb->lb_str == NULL)
return tok_name[lb->lb_type];
else {
sprintf(buf, "%.32s(%.32s)",
tok_name[lb->lb_type], lb->lb_str);
return buf;
}
}
}

95
Parser/intrcheck.c Normal file
View file

@ -0,0 +1,95 @@
/* Check for interrupts */
#ifdef MSDOS
/* This might work for MS-DOS: */
void
initintr()
{
}
int
intrcheck()
{
int interrupted = 0;
while (kbhit()) {
if (getch() == '\003')
interrupted = 1;
}
return interrupted;
}
#define OK
#endif
#ifdef THINK_C
#include <MacHeaders>
void
initintr()
{
}
int
intrcheck()
{
/* Static to make it faster(?) only */
static EventRecord e;
/* XXX This fails if the user first types ahead and then
decides to interrupt -- repeating Command-. until the
event queue overflows may work though. */
if (EventAvail(keyDownMask|autoKeyMask, &e) &&
(e.modifiers & cmdKey) &&
(e.message & charCodeMask) == '.') {
(void) GetNextEvent(keyDownMask|autoKeyMask, &e);
return 1;
}
return 0;
}
#define OK
#endif /* THINK_C */
#ifndef OK
/* Default version -- should work for Unix and Standard C */
#include <stdio.h>
#include <signal.h>
#include "sigtype.h"
static int interrupted;
static SIGTYPE
intcatcher(sig)
int sig;
{
interrupted = 1;
signal(SIGINT, intcatcher);
}
void
initintr()
{
if (signal(SIGINT, SIG_IGN) != SIG_IGN)
signal(SIGINT, intcatcher);
}
int
intrcheck()
{
if (!interrupted)
return 0;
interrupted = 0;
return 1;
}
#endif /* !OK */

68
Parser/listnode.c Normal file
View file

@ -0,0 +1,68 @@
/* List a node on a file */
#include <stdio.h>
#include "PROTO.h"
#include "token.h"
#include "node.h"
static int level, atbol;
static void
list1node(fp, n)
FILE *fp;
node *n;
{
if (n == 0)
return;
if (ISNONTERMINAL(TYPE(n))) {
int i;
for (i = 0; i < NCH(n); i++)
list1node(fp, CHILD(n, i));
}
else if (ISTERMINAL(TYPE(n))) {
switch (TYPE(n)) {
case INDENT:
++level;
break;
case DEDENT:
--level;
break;
default:
if (atbol) {
int i;
for (i = 0; i < level; ++i)
fprintf(fp, "\t");
atbol = 0;
}
if (TYPE(n) == NEWLINE) {
if (STR(n) != NULL)
fprintf(fp, "%s", STR(n));
fprintf(fp, "\n");
atbol = 1;
}
else
fprintf(fp, "%s ", STR(n));
break;
}
}
else
fprintf(fp, "? ");
}
void
listnode(fp, n)
FILE *fp;
node *n;
{
level = 0;
atbol = 1;
list1node(fp, n);
}
void
listtree(n)
node *n;
{
listnode(stdout, n);
}

151
Parser/metagrammar.c Normal file
View file

@ -0,0 +1,151 @@
#include "PROTO.h"
#include "metagrammar.h"
#include "grammar.h"
static arc arcs_0_0[3] = {
{2, 0},
{3, 0},
{4, 1},
};
static arc arcs_0_1[1] = {
{0, 1},
};
static state states_0[2] = {
{3, arcs_0_0},
{1, arcs_0_1},
};
static arc arcs_1_0[1] = {
{5, 1},
};
static arc arcs_1_1[1] = {
{6, 2},
};
static arc arcs_1_2[1] = {
{7, 3},
};
static arc arcs_1_3[1] = {
{3, 4},
};
static arc arcs_1_4[1] = {
{0, 4},
};
static state states_1[5] = {
{1, arcs_1_0},
{1, arcs_1_1},
{1, arcs_1_2},
{1, arcs_1_3},
{1, arcs_1_4},
};
static arc arcs_2_0[1] = {
{8, 1},
};
static arc arcs_2_1[2] = {
{9, 0},
{0, 1},
};
static state states_2[2] = {
{1, arcs_2_0},
{2, arcs_2_1},
};
static arc arcs_3_0[1] = {
{10, 1},
};
static arc arcs_3_1[2] = {
{10, 1},
{0, 1},
};
static state states_3[2] = {
{1, arcs_3_0},
{2, arcs_3_1},
};
static arc arcs_4_0[2] = {
{11, 1},
{13, 2},
};
static arc arcs_4_1[1] = {
{7, 3},
};
static arc arcs_4_2[3] = {
{14, 4},
{15, 4},
{0, 2},
};
static arc arcs_4_3[1] = {
{12, 4},
};
static arc arcs_4_4[1] = {
{0, 4},
};
static state states_4[5] = {
{2, arcs_4_0},
{1, arcs_4_1},
{3, arcs_4_2},
{1, arcs_4_3},
{1, arcs_4_4},
};
static arc arcs_5_0[3] = {
{5, 1},
{16, 1},
{17, 2},
};
static arc arcs_5_1[1] = {
{0, 1},
};
static arc arcs_5_2[1] = {
{7, 3},
};
static arc arcs_5_3[1] = {
{18, 1},
};
static state states_5[4] = {
{3, arcs_5_0},
{1, arcs_5_1},
{1, arcs_5_2},
{1, arcs_5_3},
};
static dfa dfas[6] = {
{256, "MSTART", 0, 2, states_0,
"\070\000\000"},
{257, "RULE", 0, 5, states_1,
"\040\000\000"},
{258, "RHS", 0, 2, states_2,
"\040\010\003"},
{259, "ALT", 0, 2, states_3,
"\040\010\003"},
{260, "ITEM", 0, 5, states_4,
"\040\010\003"},
{261, "ATOM", 0, 4, states_5,
"\040\000\003"},
};
static label labels[19] = {
{0, "EMPTY"},
{256, 0},
{257, 0},
{4, 0},
{0, 0},
{1, 0},
{11, 0},
{258, 0},
{259, 0},
{18, 0},
{260, 0},
{9, 0},
{10, 0},
{261, 0},
{16, 0},
{14, 0},
{3, 0},
{7, 0},
{8, 0},
};
static grammar gram = {
6,
dfas,
{19, labels},
256
};
grammar *
meta_grammar()
{
return &gram;
}

47
Parser/node.c Normal file
View file

@ -0,0 +1,47 @@
/* Parse tree node implementation */
#include "PROTO.h"
#include "malloc.h"
#include "node.h"
node *
newnode(type)
int type;
{
node *n = NEW(node, 1);
if (n == NULL)
return NULL;
n->n_type = type;
n->n_str = NULL;
n->n_nchildren = 0;
n->n_child = NULL;
return n;
}
#define XXX 3 /* Node alignment factor to speed up realloc */
#define XXXROUNDUP(n) ((n) == 1 ? 1 : ((n) + XXX - 1) / XXX * XXX)
node *
addchild(n1, type, str)
register node *n1;
int type;
char *str;
{
register int nch = n1->n_nchildren;
register int nch1 = nch+1;
register node *n;
if (XXXROUNDUP(nch) < nch1) {
n = n1->n_child;
nch1 = XXXROUNDUP(nch1);
RESIZE(n, node, nch1);
if (n == NULL)
return NULL;
n1->n_child = n;
}
n = &n1->n_child[n1->n_nchildren++];
n->n_type = type;
n->n_str = str;
n->n_nchildren = 0;
n->n_child = NULL;
return n;
}

396
Parser/parser.c Normal file
View file

@ -0,0 +1,396 @@
/* Parser implementation */
/* For a description, see the comments at end of this file */
/* XXX To do: error recovery */
#include <stdio.h>
#include "assert.h"
#include "PROTO.h"
#include "malloc.h"
#include "token.h"
#include "grammar.h"
#include "node.h"
#include "parser.h"
#include "errcode.h"
extern int debugging;
#ifdef DEBUG
#define D(x) if (!debugging); else x
#else
#define D(x)
#endif
/* STACK DATA TYPE */
static void s_reset PROTO((stack *));
static void
s_reset(s)
stack *s;
{
s->s_top = &s->s_base[MAXSTACK];
}
#define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
static int s_push PROTO((stack *, dfa *, node *));
static int
s_push(s, d, parent)
register stack *s;
dfa *d;
node *parent;
{
register stackentry *top;
if (s->s_top == s->s_base) {
fprintf(stderr, "s_push: parser stack overflow\n");
return -1;
}
top = --s->s_top;
top->s_dfa = d;
top->s_parent = parent;
top->s_state = 0;
return 0;
}
#ifdef DEBUG
static void s_pop PROTO((stack *));
static void
s_pop(s)
register stack *s;
{
if (s_empty(s)) {
fprintf(stderr, "s_pop: parser stack underflow -- FATAL\n");
abort();
}
s->s_top++;
}
#else /* !DEBUG */
#define s_pop(s) (s)->s_top++
#endif
/* PARSER CREATION */
parser_state *
newparser(g, start)
grammar *g;
int start;
{
parser_state *ps;
if (!g->g_accel)
addaccelerators(g);
ps = NEW(parser_state, 1);
if (ps == NULL)
return NULL;
ps->p_grammar = g;
ps->p_tree = newnode(start);
if (ps->p_tree == NULL) {
if (ps->p_tree != NULL)
DEL(ps->p_tree); /* XXX freeing a node!?! */
DEL(ps);
return NULL;
}
s_reset(&ps->p_stack);
(void) s_push(&ps->p_stack, finddfa(g, start), ps->p_tree);
return ps;
}
void
delparser(ps)
parser_state *ps;
{
DEL(ps);
}
/* PARSER STACK OPERATIONS */
static int shift PROTO((stack *, int, char *, int));
static int
shift(s, type, str, newstate)
register stack *s;
int type;
char *str;
int newstate;
{
assert(!s_empty(s));
if (addchild(s->s_top->s_parent, type, str) == NULL) {
fprintf(stderr, "shift: no mem in addchild\n");
return -1;
}
s->s_top->s_state = newstate;
return 0;
}
static int push PROTO((stack *, int, dfa *, int));
static int
push(s, type, d, newstate)
register stack *s;
int type;
dfa *d;
int newstate;
{
register node *n;
n = s->s_top->s_parent;
assert(!s_empty(s));
if (addchild(n, type, (char *)NULL) == NULL) {
fprintf(stderr, "push: no mem in addchild\n");
return -1;
}
s->s_top->s_state = newstate;
return s_push(s, d, CHILD(n, NCH(n)-1));
}
/* PARSER PROPER */
static int classify PROTO((grammar *, int, char *));
static int
classify(g, type, str)
grammar *g;
register int type;
char *str;
{
register int n = g->g_ll.ll_nlabels;
if (type == NAME) {
register char *s = str;
register label *l = g->g_ll.ll_label;
register int i;
for (i = n; i > 0; i--, l++) {
if (l->lb_type == NAME && l->lb_str != NULL &&
l->lb_str[0] == s[0] &&
strcmp(l->lb_str, s) == 0) {
D(printf("It's a keyword\n"));
return n - i;
}
}
}
{
register label *l = g->g_ll.ll_label;
register int i;
for (i = n; i > 0; i--, l++) {
if (l->lb_type == type && l->lb_str == NULL) {
D(printf("It's a token we know\n"));
return n - i;
}
}
}
D(printf("Illegal token\n"));
return -1;
}
int
addtoken(ps, type, str)
register parser_state *ps;
register int type;
char *str;
{
register int ilabel;
D(printf("Token %s/'%s' ... ", tok_name[type], str));
/* Find out which label this token is */
ilabel = classify(ps->p_grammar, type, str);
if (ilabel < 0)
return E_SYNTAX;
/* Loop until the token is shifted or an error occurred */
for (;;) {
/* Fetch the current dfa and state */
register dfa *d = ps->p_stack.s_top->s_dfa;
register state *s = &d->d_state[ps->p_stack.s_top->s_state];
D(printf(" DFA '%s', state %d:",
d->d_name, ps->p_stack.s_top->s_state));
/* Check accelerator */
if (s->s_lower <= ilabel && ilabel < s->s_upper) {
register int x = s->s_accel[ilabel - s->s_lower];
if (x != -1) {
if (x & (1<<7)) {
/* Push non-terminal */
int nt = (x >> 8) + NT_OFFSET;
int arrow = x & ((1<<7)-1);
dfa *d1 = finddfa(ps->p_grammar, nt);
if (push(&ps->p_stack, nt, d1, arrow) < 0) {
D(printf(" MemError: push.\n"));
return E_NOMEM;
}
D(printf(" Push ...\n"));
continue;
}
/* Shift the token */
if (shift(&ps->p_stack, type, str, x) < 0) {
D(printf(" MemError: shift.\n"));
return E_NOMEM;
}
D(printf(" Shift.\n"));
/* Pop while we are in an accept-only state */
while (s = &d->d_state
[ps->p_stack.s_top->s_state],
s->s_accept && s->s_narcs == 1) {
D(printf(" Direct pop.\n"));
s_pop(&ps->p_stack);
if (s_empty(&ps->p_stack)) {
D(printf(" ACCEPT.\n"));
return E_DONE;
}
d = ps->p_stack.s_top->s_dfa;
}
return E_OK;
}
}
if (s->s_accept) {
/* Pop this dfa and try again */
s_pop(&ps->p_stack);
D(printf(" Pop ...\n"));
if (s_empty(&ps->p_stack)) {
D(printf(" Error: bottom of stack.\n"));
return E_SYNTAX;
}
continue;
}
/* Stuck, report syntax error */
D(printf(" Error.\n"));
return E_SYNTAX;
}
}
#ifdef DEBUG
/* DEBUG OUTPUT */
void
dumptree(g, n)
grammar *g;
node *n;
{
int i;
if (n == NULL)
printf("NIL");
else {
label l;
l.lb_type = TYPE(n);
l.lb_str = TYPE(str);
printf("%s", labelrepr(&l));
if (ISNONTERMINAL(TYPE(n))) {
printf("(");
for (i = 0; i < NCH(n); i++) {
if (i > 0)
printf(",");
dumptree(g, CHILD(n, i));
}
printf(")");
}
}
}
void
showtree(g, n)
grammar *g;
node *n;
{
int i;
if (n == NULL)
return;
if (ISNONTERMINAL(TYPE(n))) {
for (i = 0; i < NCH(n); i++)
showtree(g, CHILD(n, i));
}
else if (ISTERMINAL(TYPE(n))) {
printf("%s", tok_name[TYPE(n)]);
if (TYPE(n) == NUMBER || TYPE(n) == NAME)
printf("(%s)", STR(n));
printf(" ");
}
else
printf("? ");
}
void
printtree(ps)
parser_state *ps;
{
if (debugging) {
printf("Parse tree:\n");
dumptree(ps->p_grammar, ps->p_tree);
printf("\n");
printf("Tokens:\n");
showtree(ps->p_grammar, ps->p_tree);
printf("\n");
}
printf("Listing:\n");
listtree(ps->p_tree);
printf("\n");
}
#endif /* DEBUG */
/*
Description
-----------
The parser's interface is different than usual: the function addtoken()
must be called for each token in the input. This makes it possible to
turn it into an incremental parsing system later. The parsing system
constructs a parse tree as it goes.
A parsing rule is represented as a Deterministic Finite-state Automaton
(DFA). A node in a DFA represents a state of the parser; an arc represents
a transition. Transitions are either labeled with terminal symbols or
with non-terminals. When the parser decides to follow an arc labeled
with a non-terminal, it is invoked recursively with the DFA representing
the parsing rule for that as its initial state; when that DFA accepts,
the parser that invoked it continues. The parse tree constructed by the
recursively called parser is inserted as a child in the current parse tree.
The DFA's can be constructed automatically from a more conventional
language description. An extended LL(1) grammar (ELL(1)) is suitable.
Certain restrictions make the parser's life easier: rules that can produce
the empty string should be outlawed (there are other ways to put loops
or optional parts in the language). To avoid the need to construct
FIRST sets, we can require that all but the last alternative of a rule
(really: arc going out of a DFA's state) must begin with a terminal
symbol.
As an example, consider this grammar:
expr: term (OP term)*
term: CONSTANT | '(' expr ')'
The DFA corresponding to the rule for expr is:
------->.---term-->.------->
^ |
| |
\----OP----/
The parse tree generated for the input a+b is:
(expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
*/

25
Parser/parser.h Normal file
View file

@ -0,0 +1,25 @@
/* Parser interface */
#define MAXSTACK 100
typedef struct _stackentry {
int s_state; /* State in current DFA */
dfa *s_dfa; /* Current DFA */
node *s_parent; /* Where to add next node */
} stackentry;
typedef struct _stack {
stackentry *s_top; /* Top entry */
stackentry s_base[MAXSTACK];/* Array of stack entries */
/* NB The stack grows down */
} stack;
typedef struct {
struct _stack p_stack; /* Stack of parser states */
struct _grammar *p_grammar; /* Grammar to use */
struct _node *p_tree; /* Top of parse tree */
} parser_state;
parser_state *newparser PROTO((struct _grammar *g, int start));
void delparser PROTO((parser_state *ps));
int addtoken PROTO((parser_state *ps, int type, char *str));

131
Parser/parsetok.c Normal file
View file

@ -0,0 +1,131 @@
/* Parser-tokenizer link implementation */
#include <stdio.h>
#include "PROTO.h"
#include "malloc.h"
#include "tokenizer.h"
#include "node.h"
#include "grammar.h"
#include "parser.h"
#include "errcode.h"
extern int debugging;
/* Parse input coming from the given tokenizer structure.
Return error code. */
static int
parsetok(tok, g, start, n_ret)
struct tok_state *tok;
grammar *g;
int start;
node **n_ret;
{
parser_state *ps;
int ret;
if ((ps = newparser(g, start)) == NULL) {
fprintf(stderr, "no mem for new parser\n");
return E_NOMEM;
}
for (;;) {
char *a, *b;
int type;
int len;
char *str;
type = tok_get(tok, &a, &b);
if (type == ERRORTOKEN) {
ret = tok->done;
break;
}
len = b - a;
str = NEW(char, len + 1);
if (str == NULL) {
fprintf(stderr, "no mem for next token\n");
ret = E_NOMEM;
break;
}
strncpy(str, a, len);
str[len] = '\0';
ret = addtoken(ps, (int)type, str);
if (ret != E_OK) {
if (ret == E_DONE)
*n_ret = ps->p_tree;
else if (tok->lineno <= 1 && tok->done == E_EOF)
ret = E_EOF;
break;
}
}
delparser(ps);
return ret;
}
/* Parse input coming from a string. Return error code. */
int
parsestring(s, g, start, n_ret)
char *s;
grammar *g;
int start;
node **n_ret;
{
struct tok_state *tok = tok_setups(s);
int ret;
if (tok == NULL) {
fprintf(stderr, "no mem for tok_setups\n");
return E_NOMEM;
}
ret = parsetok(tok, g, start, n_ret);
if (ret == E_TOKEN || ret == E_SYNTAX) {
fprintf(stderr, "String parsing error at line %d\n",
tok->lineno);
}
tok_free(tok);
return ret;
}
/* Parse input coming from a file. Return error code. */
int
parsefile(fp, g, start, ps1, ps2, n_ret)
FILE *fp;
grammar *g;
int start;
char *ps1, *ps2;
node **n_ret;
{
struct tok_state *tok = tok_setupf(fp, ps1, ps2);
int ret;
if (tok == NULL) {
fprintf(stderr, "no mem for tok_setupf\n");
return E_NOMEM;
}
ret = parsetok(tok, g, start, n_ret);
if (ret == E_TOKEN || ret == E_SYNTAX) {
char *p;
fprintf(stderr, "Parsing error at line %d:\n",
tok->lineno);
*tok->inp = '\0';
if (tok->inp > tok->buf && tok->inp[-1] == '\n')
tok->inp[-1] = '\0';
fprintf(stderr, "%s\n", tok->buf);
for (p = tok->buf; p < tok->cur; p++) {
if (*p == '\t')
putc('\t', stderr);
else
putc(' ', stderr);
}
fprintf(stderr, "^\n");
}
tok_free(tok);
return ret;
}

729
Parser/pgen.c Normal file
View file

@ -0,0 +1,729 @@
/* Parser generator */
/* For a description, see the comments at end of this file */
#include <stdio.h>
#include "assert.h"
#include "PROTO.h"
#include "malloc.h"
#include "token.h"
#include "node.h"
#include "grammar.h"
#include "metagrammar.h"
#include "pgen.h"
extern int debugging;
/* PART ONE -- CONSTRUCT NFA -- Cf. Algorithm 3.2 from [Aho&Ullman 77] */
typedef struct _nfaarc {
int ar_label;
int ar_arrow;
} nfaarc;
typedef struct _nfastate {
int st_narcs;
nfaarc *st_arc;
} nfastate;
typedef struct _nfa {
int nf_type;
char *nf_name;
int nf_nstates;
nfastate *nf_state;
int nf_start, nf_finish;
} nfa;
static int
addnfastate(nf)
nfa *nf;
{
nfastate *st;
RESIZE(nf->nf_state, nfastate, nf->nf_nstates + 1);
if (nf->nf_state == NULL)
fatal("out of mem");
st = &nf->nf_state[nf->nf_nstates++];
st->st_narcs = 0;
st->st_arc = NULL;
return st - nf->nf_state;
}
static void
addnfaarc(nf, from, to, lbl)
nfa *nf;
int from, to, lbl;
{
nfastate *st;
nfaarc *ar;
st = &nf->nf_state[from];
RESIZE(st->st_arc, nfaarc, st->st_narcs + 1);
if (st->st_arc == NULL)
fatal("out of mem");
ar = &st->st_arc[st->st_narcs++];
ar->ar_label = lbl;
ar->ar_arrow = to;
}
static nfa *
newnfa(name)
char *name;
{
nfa *nf;
static type = NT_OFFSET; /* All types will be disjunct */
nf = NEW(nfa, 1);
if (nf == NULL)
fatal("no mem for new nfa");
nf->nf_type = type++;
nf->nf_name = name; /* XXX strdup(name) ??? */
nf->nf_nstates = 0;
nf->nf_state = NULL;
nf->nf_start = nf->nf_finish = -1;
return nf;
}
typedef struct _nfagrammar {
int gr_nnfas;
nfa **gr_nfa;
labellist gr_ll;
} nfagrammar;
static nfagrammar *
newnfagrammar()
{
nfagrammar *gr;
gr = NEW(nfagrammar, 1);
if (gr == NULL)
fatal("no mem for new nfa grammar");
gr->gr_nnfas = 0;
gr->gr_nfa = NULL;
gr->gr_ll.ll_nlabels = 0;
gr->gr_ll.ll_label = NULL;
addlabel(&gr->gr_ll, ENDMARKER, "EMPTY");
return gr;
}
static nfa *
addnfa(gr, name)
nfagrammar *gr;
char *name;
{
nfa *nf;
nf = newnfa(name);
RESIZE(gr->gr_nfa, nfa *, gr->gr_nnfas + 1);
if (gr->gr_nfa == NULL)
fatal("out of mem");
gr->gr_nfa[gr->gr_nnfas++] = nf;
addlabel(&gr->gr_ll, NAME, nf->nf_name);
return nf;
}
#ifdef DEBUG
static char REQNFMT[] = "metacompile: less than %d children\n";
#define REQN(i, count) \
if (i < count) { \
fprintf(stderr, REQNFMT, count); \
abort(); \
} else
#else
#define REQN(i, count) /* empty */
#endif
static nfagrammar *
metacompile(n)
node *n;
{
nfagrammar *gr;
int i;
printf("Compiling (meta-) parse tree into NFA grammar\n");
gr = newnfagrammar();
REQ(n, MSTART);
i = n->n_nchildren - 1; /* Last child is ENDMARKER */
n = n->n_child;
for (; --i >= 0; n++) {
if (n->n_type != NEWLINE)
compile_rule(gr, n);
}
return gr;
}
static
compile_rule(gr, n)
nfagrammar *gr;
node *n;
{
nfa *nf;
REQ(n, RULE);
REQN(n->n_nchildren, 4);
n = n->n_child;
REQ(n, NAME);
nf = addnfa(gr, n->n_str);
n++;
REQ(n, COLON);
n++;
REQ(n, RHS);
compile_rhs(&gr->gr_ll, nf, n, &nf->nf_start, &nf->nf_finish);
n++;
REQ(n, NEWLINE);
}
static
compile_rhs(ll, nf, n, pa, pb)
labellist *ll;
nfa *nf;
node *n;
int *pa, *pb;
{
int i;
int a, b;
REQ(n, RHS);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
REQ(n, ALT);
compile_alt(ll, nf, n, pa, pb);
if (--i <= 0)
return;
n++;
a = *pa;
b = *pb;
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
for (; --i >= 0; n++) {
REQ(n, VBAR);
REQN(i, 1);
--i;
n++;
REQ(n, ALT);
compile_alt(ll, nf, n, &a, &b);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
}
}
static
compile_alt(ll, nf, n, pa, pb)
labellist *ll;
nfa *nf;
node *n;
int *pa, *pb;
{
int i;
int a, b;
REQ(n, ALT);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
REQ(n, ITEM);
compile_item(ll, nf, n, pa, pb);
--i;
n++;
for (; --i >= 0; n++) {
if (n->n_type == COMMA) { /* XXX Temporary */
REQN(i, 1);
--i;
n++;
}
REQ(n, ITEM);
compile_item(ll, nf, n, &a, &b);
addnfaarc(nf, *pb, a, EMPTY);
*pb = b;
}
}
static
compile_item(ll, nf, n, pa, pb)
labellist *ll;
nfa *nf;
node *n;
int *pa, *pb;
{
int i;
int a, b;
REQ(n, ITEM);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
if (n->n_type == LSQB) {
REQN(i, 3);
n++;
REQ(n, RHS);
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, *pb, EMPTY);
compile_rhs(ll, nf, n, &a, &b);
addnfaarc(nf, *pa, a, EMPTY);
addnfaarc(nf, b, *pb, EMPTY);
REQN(i, 1);
n++;
REQ(n, RSQB);
}
else {
compile_atom(ll, nf, n, pa, pb);
if (--i <= 0)
return;
n++;
addnfaarc(nf, *pb, *pa, EMPTY);
if (n->n_type == STAR)
*pb = *pa;
else
REQ(n, PLUS);
}
}
static
compile_atom(ll, nf, n, pa, pb)
labellist *ll;
nfa *nf;
node *n;
int *pa, *pb;
{
int i;
REQ(n, ATOM);
i = n->n_nchildren;
REQN(i, 1);
n = n->n_child;
if (n->n_type == LPAR) {
REQN(i, 3);
n++;
REQ(n, RHS);
compile_rhs(ll, nf, n, pa, pb);
n++;
REQ(n, RPAR);
}
else if (n->n_type == NAME || n->n_type == STRING) {
*pa = addnfastate(nf);
*pb = addnfastate(nf);
addnfaarc(nf, *pa, *pb, addlabel(ll, n->n_type, n->n_str));
}
else
REQ(n, NAME);
}
static void
dumpstate(ll, nf, istate)
labellist *ll;
nfa *nf;
int istate;
{
nfastate *st;
int i;
nfaarc *ar;
printf("%c%2d%c",
istate == nf->nf_start ? '*' : ' ',
istate,
istate == nf->nf_finish ? '.' : ' ');
st = &nf->nf_state[istate];
ar = st->st_arc;
for (i = 0; i < st->st_narcs; i++) {
if (i > 0)
printf("\n ");
printf("-> %2d %s", ar->ar_arrow,
labelrepr(&ll->ll_label[ar->ar_label]));
ar++;
}
printf("\n");
}
static void
dumpnfa(ll, nf)
labellist *ll;
nfa *nf;
{
int i;
printf("NFA '%s' has %d states; start %d, finish %d\n",
nf->nf_name, nf->nf_nstates, nf->nf_start, nf->nf_finish);
for (i = 0; i < nf->nf_nstates; i++)
dumpstate(ll, nf, i);
}
/* PART TWO -- CONSTRUCT DFA -- Algorithm 3.1 from [Aho&Ullman 77] */
static int
addclosure(ss, nf, istate)
bitset ss;
nfa *nf;
int istate;
{
if (addbit(ss, istate)) {
nfastate *st = &nf->nf_state[istate];
nfaarc *ar = st->st_arc;
int i;
for (i = st->st_narcs; --i >= 0; ) {
if (ar->ar_label == EMPTY)
addclosure(ss, nf, ar->ar_arrow);
ar++;
}
}
}
typedef struct _ss_arc {
bitset sa_bitset;
int sa_arrow;
int sa_label;
} ss_arc;
typedef struct _ss_state {
bitset ss_ss;
int ss_narcs;
ss_arc *ss_arc;
int ss_deleted;
int ss_finish;
int ss_rename;
} ss_state;
typedef struct _ss_dfa {
int sd_nstates;
ss_state *sd_state;
} ss_dfa;
static
makedfa(gr, nf, d)
nfagrammar *gr;
nfa *nf;
dfa *d;
{
int nbits = nf->nf_nstates;
bitset ss;
int xx_nstates;
ss_state *xx_state, *yy;
ss_arc *zz;
int istate, jstate, iarc, jarc, ibit;
nfastate *st;
nfaarc *ar;
ss = newbitset(nbits);
addclosure(ss, nf, nf->nf_start);
xx_state = NEW(ss_state, 1);
if (xx_state == NULL)
fatal("no mem for xx_state in makedfa");
xx_nstates = 1;
yy = &xx_state[0];
yy->ss_ss = ss;
yy->ss_narcs = 0;
yy->ss_arc = NULL;
yy->ss_deleted = 0;
yy->ss_finish = testbit(ss, nf->nf_finish);
if (yy->ss_finish)
printf("Error: nonterminal '%s' may produce empty.\n",
nf->nf_name);
/* This algorithm is from a book written before
the invention of structured programming... */
/* For each unmarked state... */
for (istate = 0; istate < xx_nstates; ++istate) {
yy = &xx_state[istate];
ss = yy->ss_ss;
/* For all its states... */
for (ibit = 0; ibit < nf->nf_nstates; ++ibit) {
if (!testbit(ss, ibit))
continue;
st = &nf->nf_state[ibit];
/* For all non-empty arcs from this state... */
for (iarc = 0; iarc < st->st_narcs; iarc++) {
ar = &st->st_arc[iarc];
if (ar->ar_label == EMPTY)
continue;
/* Look up in list of arcs from this state */
for (jarc = 0; jarc < yy->ss_narcs; ++jarc) {
zz = &yy->ss_arc[jarc];
if (ar->ar_label == zz->sa_label)
goto found;
}
/* Add new arc for this state */
RESIZE(yy->ss_arc, ss_arc, yy->ss_narcs + 1);
if (yy->ss_arc == NULL)
fatal("out of mem");
zz = &yy->ss_arc[yy->ss_narcs++];
zz->sa_label = ar->ar_label;
zz->sa_bitset = newbitset(nbits);
zz->sa_arrow = -1;
found: ;
/* Add destination */
addclosure(zz->sa_bitset, nf, ar->ar_arrow);
}
}
/* Now look up all the arrow states */
for (jarc = 0; jarc < xx_state[istate].ss_narcs; jarc++) {
zz = &xx_state[istate].ss_arc[jarc];
for (jstate = 0; jstate < xx_nstates; jstate++) {
if (samebitset(zz->sa_bitset,
xx_state[jstate].ss_ss, nbits)) {
zz->sa_arrow = jstate;
goto done;
}
}
RESIZE(xx_state, ss_state, xx_nstates + 1);
if (xx_state == NULL)
fatal("out of mem");
zz->sa_arrow = xx_nstates;
yy = &xx_state[xx_nstates++];
yy->ss_ss = zz->sa_bitset;
yy->ss_narcs = 0;
yy->ss_arc = NULL;
yy->ss_deleted = 0;
yy->ss_finish = testbit(yy->ss_ss, nf->nf_finish);
done: ;
}
}
if (debugging)
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
"before minimizing");
simplify(xx_nstates, xx_state);
if (debugging)
printssdfa(xx_nstates, xx_state, nbits, &gr->gr_ll,
"after minimizing");
convert(d, xx_nstates, xx_state);
/* XXX cleanup */
}
static
printssdfa(xx_nstates, xx_state, nbits, ll, msg)
int xx_nstates;
ss_state *xx_state;
int nbits;
labellist *ll;
char *msg;
{
int i, ibit, iarc;
ss_state *yy;
ss_arc *zz;
printf("Subset DFA %s\n", msg);
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
printf(" Subset %d", i);
if (yy->ss_finish)
printf(" (finish)");
printf(" { ");
for (ibit = 0; ibit < nbits; ibit++) {
if (testbit(yy->ss_ss, ibit))
printf("%d ", ibit);
}
printf("}\n");
for (iarc = 0; iarc < yy->ss_narcs; iarc++) {
zz = &yy->ss_arc[iarc];
printf(" Arc to state %d, label %s\n",
zz->sa_arrow,
labelrepr(&ll->ll_label[zz->sa_label]));
}
}
}
/* PART THREE -- SIMPLIFY DFA */
/* Simplify the DFA by repeatedly eliminating states that are
equivalent to another oner. This is NOT Algorithm 3.3 from
[Aho&Ullman 77]. It does not always finds the minimal DFA,
but it does usually make a much smaller one... (For an example
of sub-optimal behaviour, try S: x a b+ | y a b+.)
*/
static int
samestate(s1, s2)
ss_state *s1, *s2;
{
int i;
if (s1->ss_narcs != s2->ss_narcs || s1->ss_finish != s2->ss_finish)
return 0;
for (i = 0; i < s1->ss_narcs; i++) {
if (s1->ss_arc[i].sa_arrow != s2->ss_arc[i].sa_arrow ||
s1->ss_arc[i].sa_label != s2->ss_arc[i].sa_label)
return 0;
}
return 1;
}
static void
renamestates(xx_nstates, xx_state, from, to)
int xx_nstates;
ss_state *xx_state;
int from, to;
{
int i, j;
if (debugging)
printf("Rename state %d to %d.\n", from, to);
for (i = 0; i < xx_nstates; i++) {
if (xx_state[i].ss_deleted)
continue;
for (j = 0; j < xx_state[i].ss_narcs; j++) {
if (xx_state[i].ss_arc[j].sa_arrow == from)
xx_state[i].ss_arc[j].sa_arrow = to;
}
}
}
static
simplify(xx_nstates, xx_state)
int xx_nstates;
ss_state *xx_state;
{
int changes;
int i, j, k;
do {
changes = 0;
for (i = 1; i < xx_nstates; i++) {
if (xx_state[i].ss_deleted)
continue;
for (j = 0; j < i; j++) {
if (xx_state[j].ss_deleted)
continue;
if (samestate(&xx_state[i], &xx_state[j])) {
xx_state[i].ss_deleted++;
renamestates(xx_nstates, xx_state, i, j);
changes++;
break;
}
}
}
} while (changes);
}
/* PART FOUR -- GENERATE PARSING TABLES */
/* Convert the DFA into a grammar that can be used by our parser */
static
convert(d, xx_nstates, xx_state)
dfa *d;
int xx_nstates;
ss_state *xx_state;
{
int i, j;
ss_state *yy;
ss_arc *zz;
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
yy->ss_rename = addstate(d);
}
for (i = 0; i < xx_nstates; i++) {
yy = &xx_state[i];
if (yy->ss_deleted)
continue;
for (j = 0; j < yy->ss_narcs; j++) {
zz = &yy->ss_arc[j];
addarc(d, yy->ss_rename,
xx_state[zz->sa_arrow].ss_rename,
zz->sa_label);
}
if (yy->ss_finish)
addarc(d, yy->ss_rename, yy->ss_rename, 0);
}
d->d_initial = 0;
}
/* PART FIVE -- GLUE IT ALL TOGETHER */
static grammar *
maketables(gr)
nfagrammar *gr;
{
int i;
nfa *nf;
dfa *d;
grammar *g;
if (gr->gr_nnfas == 0)
return NULL;
g = newgrammar(gr->gr_nfa[0]->nf_type);
/* XXX first rule must be start rule */
g->g_ll = gr->gr_ll;
for (i = 0; i < gr->gr_nnfas; i++) {
nf = gr->gr_nfa[i];
if (debugging) {
printf("Dump of NFA for '%s' ...\n", nf->nf_name);
dumpnfa(&gr->gr_ll, nf);
}
printf("Making DFA for '%s' ...\n", nf->nf_name);
d = adddfa(g, nf->nf_type, nf->nf_name);
makedfa(gr, gr->gr_nfa[i], d);
}
return g;
}
grammar *
pgen(n)
node *n;
{
nfagrammar *gr;
grammar *g;
gr = metacompile(n);
g = maketables(gr);
translatelabels(g);
addfirstsets(g);
return g;
}
/*
Description
-----------
Input is a grammar in extended BNF (using * for repetition, + for
at-least-once repetition, [] for optional parts, | for alternatives and
() for grouping). This has already been parsed and turned into a parse
tree.
Each rule is considered as a regular expression in its own right.
It is turned into a Non-deterministic Finite Automaton (NFA), which
is then turned into a Deterministic Finite Automaton (DFA), which is then
optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
or similar compiler books (this technique is more often used for lexical
analyzers).
The DFA's are used by the parser as parsing tables in a special way
that's probably unique. Before they are usable, the FIRST sets of all
non-terminals are computed.
Reference
---------
[Aho&Ullman 77]
Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
(first edition)
*/

6
Parser/pgen.h Normal file
View file

@ -0,0 +1,6 @@
/* Parser generator interface */
extern grammar gram;
extern grammar *meta_grammar PROTO((void));
extern grammar *pgen PROTO((node *));

111
Parser/pgenmain.c Normal file
View file

@ -0,0 +1,111 @@
/* Parser generator main program */
#include <stdio.h>
#include "PROTO.h"
#include "grammar.h"
#include "node.h"
#include "parsetok.h"
#include "pgen.h"
int debugging;
#ifdef THINK_C
char *
askfile()
{
char buf[256];
static char name[256];
printf("Input file name: ");
if (fgets(buf, sizeof buf, stdin) == NULL) {
printf("EOF\n");
exit(1);
}
if (sscanf(buf, " %s ", name) != 1) {
printf("No file\n");
exit(1);
}
return name;
}
#endif
grammar *
getgrammar(filename)
char *filename;
{
FILE *fp;
node *n;
grammar *g0, *g;
fp = fopen(filename, "r");
if (fp == NULL) {
perror(filename);
exit(1);
}
g0 = meta_grammar();
n = NULL;
parsefile(fp, g0, g0->g_start, (char *)NULL, (char *)NULL, &n);
fclose(fp);
if (n == NULL) {
fprintf(stderr, "Parsing error.\n");
exit(1);
}
g = pgen(n);
if (g == NULL) {
printf("Bad grammar.\n");
exit(1);
}
return g;
}
main(argc, argv)
int argc;
char **argv;
{
grammar *g;
node *n;
FILE *fp;
char *filename;
#ifdef THINK_C
filename = askfile();
#else
if (argc != 2) {
fprintf(stderr, "usage: %s grammar\n", argv[0]);
exit(2);
}
filename = argv[1];
#endif
g = getgrammar(filename);
fp = fopen("graminit.c", "w");
if (fp == NULL) {
perror("graminit.c");
exit(1);
}
printf("Writing graminit.c ...\n");
printgrammar(g, fp);
fclose(fp);
fp = fopen("graminit.h", "w");
if (fp == NULL) {
perror("graminit.h");
exit(1);
}
printf("Writing graminit.h ...\n");
printnonterminals(g, fp);
fclose(fp);
exit(0);
}
void
fatal(msg)
char *msg;
{
fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
exit(1);
}
/* TO DO:
- improve user interface
- check for duplicate definitions of names (instead of fatal err)
*/

121
Parser/printgrammar.c Normal file
View file

@ -0,0 +1,121 @@
/* Print a bunch of C initializers that represent a grammar */
#include <stdio.h>
#include "PROTO.h"
#include "grammar.h"
static void
printarcs(i, d, fp)
int i;
dfa *d;
FILE *fp;
{
arc *a;
state *s;
int j, k;
s = d->d_state;
for (j = 0; j < d->d_nstates; j++, s++) {
fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
i, j, s->s_narcs);
a = s->s_arc;
for (k = 0; k < s->s_narcs; k++, a++)
fprintf(fp, "\t{%d, %d},\n", a->a_lbl, a->a_arrow);
fprintf(fp, "};\n");
}
}
static void
printstates(g, fp)
grammar *g;
FILE *fp;
{
state *s;
dfa *d;
int i, j;
d = g->g_dfa;
for (i = 0; i < g->g_ndfas; i++, d++) {
printarcs(i, d, fp);
fprintf(fp, "static state states_%d[%d] = {\n",
i, d->d_nstates);
s = d->d_state;
for (j = 0; j < d->d_nstates; j++, s++)
fprintf(fp, "\t{%d, arcs_%d_%d},\n",
s->s_narcs, i, j);
fprintf(fp, "};\n");
}
}
static void
printdfas(g, fp)
grammar *g;
FILE *fp;
{
dfa *d;
int i, j;
printstates(g, fp);
fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
d = g->g_dfa;
for (i = 0; i < g->g_ndfas; i++, d++) {
fprintf(fp, "\t{%d, \"%s\", %d, %d, states_%d,\n",
d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
fprintf(fp, "\t \"");
for (j = 0; j < NBYTES(g->g_ll.ll_nlabels); j++)
fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
fprintf(fp, "\"},\n");
}
fprintf(fp, "};\n");
}
static void
printlabels(g, fp)
grammar *g;
FILE *fp;
{
label *l;
int i;
fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
l = g->g_ll.ll_label;
for (i = g->g_ll.ll_nlabels; --i >= 0; l++) {
if (l->lb_str == NULL)
fprintf(fp, "\t{%d, 0},\n", l->lb_type);
else
fprintf(fp, "\t{%d, \"%s\"},\n",
l->lb_type, l->lb_str);
}
fprintf(fp, "};\n");
}
void
printgrammar(g, fp)
grammar *g;
FILE *fp;
{
fprintf(fp, "#include \"PROTO.h\"\n");
fprintf(fp, "#include \"grammar.h\"\n");
printdfas(g, fp);
printlabels(g, fp);
fprintf(fp, "grammar gram = {\n");
fprintf(fp, "\t%d,\n", g->g_ndfas);
fprintf(fp, "\tdfas,\n");
fprintf(fp, "\t{%d, labels},\n", g->g_ll.ll_nlabels);
fprintf(fp, "\t%d\n", g->g_start);
fprintf(fp, "};\n");
}
void
printnonterminals(g, fp)
grammar *g;
FILE *fp;
{
dfa *d;
int i;
d = g->g_dfa;
for (i = g->g_ndfas; --i >= 0; d++)
fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
}

490
Parser/tokenizer.c Normal file
View file

@ -0,0 +1,490 @@
/* Tokenizer implementation */
/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
#include <stdio.h>
#include <ctype.h>
#include "string.h"
#include "PROTO.h"
#include "malloc.h"
#include "tokenizer.h"
#include "errcode.h"
#ifdef THINK_C
#define TABSIZE 4
#endif
#ifndef TABSIZE
#define TABSIZE 8
#endif
/* Token names */
char *tok_name[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"BACKQUOTE",
"LBRACE",
"RBRACE",
"OP",
"<ERRORTOKEN>",
"<N_TOKENS>"
};
/* Create and initialize a new tok_state structure */
static struct tok_state *
tok_new()
{
struct tok_state *tok = NEW(struct tok_state, 1);
if (tok == NULL)
return NULL;
tok->buf = tok->cur = tok->end = tok->inp = NULL;
tok->done = E_OK;
tok->fp = NULL;
tok->tabsize = TABSIZE;
tok->indent = 0;
tok->indstack[0] = 0;
tok->atbol = 1;
tok->pendin = 0;
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
return tok;
}
/* Set up tokenizer for string */
struct tok_state *
tok_setups(str)
char *str;
{
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
tok->buf = tok->cur = str;
tok->end = tok->inp = strchr(str, '\0');
return tok;
}
/* Set up tokenizer for string */
struct tok_state *
tok_setupf(fp, ps1, ps2)
FILE *fp;
char *ps1, *ps2;
{
struct tok_state *tok = tok_new();
if (tok == NULL)
return NULL;
if ((tok->buf = NEW(char, BUFSIZ)) == NULL) {
DEL(tok);
return NULL;
}
tok->cur = tok->inp = tok->buf;
tok->end = tok->buf + BUFSIZ;
tok->fp = fp;
tok->prompt = ps1;
tok->nextprompt = ps2;
return tok;
}
/* Free a tok_state structure */
void
tok_free(tok)
struct tok_state *tok;
{
/* XXX really need a separate flag to say 'my buffer' */
if (tok->fp != NULL && tok->buf != NULL)
DEL(tok->buf);
DEL(tok);
}
/* Get next char, updating state; error code goes into tok->done */
static int
tok_nextc(tok)
register struct tok_state *tok;
{
if (tok->done != E_OK)
return EOF;
for (;;) {
if (tok->cur < tok->inp)
return *tok->cur++;
if (tok->fp == NULL) {
tok->done = E_EOF;
return EOF;
}
if (tok->inp > tok->buf && tok->inp[-1] == '\n')
tok->inp = tok->buf;
if (tok->inp == tok->end) {
int n = tok->end - tok->buf;
char *new = tok->buf;
RESIZE(new, char, n+n);
if (new == NULL) {
fprintf(stderr, "tokenizer out of mem\n");
tok->done = E_NOMEM;
return EOF;
}
tok->buf = new;
tok->inp = tok->buf + n;
tok->end = tok->inp + n;
}
#ifdef USE_READLINE
if (tok->prompt != NULL) {
extern char *readline PROTO((char *prompt));
static int been_here;
if (!been_here) {
/* Force rebind of TAB to insert-tab */
extern int rl_insert();
rl_bind_key('\t', rl_insert);
been_here++;
}
if (tok->buf != NULL)
free(tok->buf);
tok->buf = readline(tok->prompt);
(void) intrcheck(); /* Clear pending interrupt */
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
/* XXX different semantics w/o readline()! */
if (tok->buf == NULL) {
tok->done = E_EOF;
}
else {
unsigned int n = strlen(tok->buf);
if (n > 0)
add_history(tok->buf);
/* Append the '\n' that readline()
doesn't give us, for the tokenizer... */
tok->buf = realloc(tok->buf, n+2);
if (tok->buf == NULL)
tok->done = E_NOMEM;
else {
tok->end = tok->buf + n;
*tok->end++ = '\n';
*tok->end = '\0';
tok->inp = tok->end;
tok->cur = tok->buf;
}
}
}
else
#endif
{
tok->cur = tok->inp;
if (tok->prompt != NULL && tok->inp == tok->buf) {
fprintf(stderr, "%s", tok->prompt);
tok->prompt = tok->nextprompt;
}
tok->done = fgets_intr(tok->inp,
(int)(tok->end - tok->inp), tok->fp);
}
if (tok->done != E_OK) {
if (tok->prompt != NULL)
fprintf(stderr, "\n");
return EOF;
}
tok->inp = strchr(tok->inp, '\0');
}
}
/* Back-up one character */
static void
tok_backup(tok, c)
register struct tok_state *tok;
register int c;
{
if (c != EOF) {
if (--tok->cur < tok->buf) {
fprintf(stderr, "tok_backup: begin of buffer\n");
abort();
}
if (*tok->cur != c)
*tok->cur = c;
}
}
/* Return the token corresponding to a single character */
int
tok_1char(c)
int c;
{
switch (c) {
case '(': return LPAR;
case ')': return RPAR;
case '[': return LSQB;
case ']': return RSQB;
case ':': return COLON;
case ',': return COMMA;
case ';': return SEMI;
case '+': return PLUS;
case '-': return MINUS;
case '*': return STAR;
case '/': return SLASH;
case '|': return VBAR;
case '&': return AMPER;
case '<': return LESS;
case '>': return GREATER;
case '=': return EQUAL;
case '.': return DOT;
case '%': return PERCENT;
case '`': return BACKQUOTE;
case '{': return LBRACE;
case '}': return RBRACE;
default: return OP;
}
}
/* Get next token, after space stripping etc. */
int
tok_get(tok, p_start, p_end)
register struct tok_state *tok; /* In/out: tokenizer state */
char **p_start, **p_end; /* Out: point to start/end of token */
{
register int c;
/* Get indentation level */
if (tok->atbol) {
register int col = 0;
tok->atbol = 0;
tok->lineno++;
for (;;) {
c = tok_nextc(tok);
if (c == ' ')
col++;
else if (c == '\t')
col = (col/tok->tabsize + 1) * tok->tabsize;
else
break;
}
tok_backup(tok, c);
if (col == tok->indstack[tok->indent]) {
/* No change */
}
else if (col > tok->indstack[tok->indent]) {
/* Indent -- always one */
if (tok->indent+1 >= MAXINDENT) {
fprintf(stderr, "excessive indent\n");
tok->done = E_TOKEN;
return ERRORTOKEN;
}
tok->pendin++;
tok->indstack[++tok->indent] = col;
}
else /* col < tok->indstack[tok->indent] */ {
/* Dedent -- any number, must be consistent */
while (tok->indent > 0 &&
col < tok->indstack[tok->indent]) {
tok->indent--;
tok->pendin--;
}
if (col != tok->indstack[tok->indent]) {
fprintf(stderr, "inconsistent dedent\n");
tok->done = E_TOKEN;
return ERRORTOKEN;
}
}
}
*p_start = *p_end = tok->cur;
/* Return pending indents/dedents */
if (tok->pendin != 0) {
if (tok->pendin < 0) {
tok->pendin++;
return DEDENT;
}
else {
tok->pendin--;
return INDENT;
}
}
again:
/* Skip spaces */
do {
c = tok_nextc(tok);
} while (c == ' ' || c == '\t');
/* Set start of current token */
*p_start = tok->cur - 1;
/* Skip comment */
if (c == '#') {
/* Hack to allow overriding the tabsize in the file.
This is also recognized by vi, when it occurs near the
beginning or end of the file. (Will vi never die...?) */
int x;
if (sscanf(tok->cur, " vi:set tabsize=%d:", &x) == 1 &&
x >= 1 && x <= 40) {
fprintf(stderr, "# vi:set tabsize=%d:\n", x);
tok->tabsize = x;
}
do {
c = tok_nextc(tok);
} while (c != EOF && c != '\n');
}
/* Check for EOF and errors now */
if (c == EOF)
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
/* Identifier (most frequent token!) */
if (isalpha(c) || c == '_') {
do {
c = tok_nextc(tok);
} while (isalnum(c) || c == '_');
tok_backup(tok, c);
*p_end = tok->cur;
return NAME;
}
/* Newline */
if (c == '\n') {
tok->atbol = 1;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
return NEWLINE;
}
/* Number */
if (isdigit(c)) {
if (c == '0') {
/* Hex or octal */
c = tok_nextc(tok);
if (c == '.')
goto fraction;
if (c == 'x' || c == 'X') {
/* Hex */
do {
c = tok_nextc(tok);
} while (isxdigit(c));
}
else {
/* Octal; c is first char of it */
/* There's no 'isoctdigit' macro, sigh */
while ('0' <= c && c < '8') {
c = tok_nextc(tok);
}
}
}
else {
/* Decimal */
do {
c = tok_nextc(tok);
} while (isdigit(c));
/* Accept floating point numbers.
XXX This accepts incomplete things like 12e or 1e+;
worry about that at run-time.
XXX Doesn't accept numbers starting with a dot */
if (c == '.') {
fraction:
/* Fraction */
do {
c = tok_nextc(tok);
} while (isdigit(c));
}
if (c == 'e' || c == 'E') {
/* Exponent part */
c = tok_nextc(tok);
if (c == '+' || c == '-')
c = tok_nextc(tok);
while (isdigit(c)) {
c = tok_nextc(tok);
}
}
}
tok_backup(tok, c);
*p_end = tok->cur;
return NUMBER;
}
/* String */
if (c == '\'') {
for (;;) {
c = tok_nextc(tok);
if (c == '\n' || c == EOF) {
tok->done = E_TOKEN;
return ERRORTOKEN;
}
if (c == '\\') {
c = tok_nextc(tok);
*p_end = tok->cur;
if (c == '\n' || c == EOF) {
tok->done = E_TOKEN;
return ERRORTOKEN;
}
continue;
}
if (c == '\'')
break;
}
*p_end = tok->cur;
return STRING;
}
/* Line continuation */
if (c == '\\') {
c = tok_nextc(tok);
if (c != '\n') {
tok->done = E_TOKEN;
return ERRORTOKEN;
}
goto again; /* Read next line */
}
/* Punctuation character */
*p_end = tok->cur;
return tok_1char(c);
}
#ifdef DEBUG
void
tok_dump(type, start, end)
int type;
char *start, *end;
{
printf("%s", tok_name[type]);
if (type == NAME || type == NUMBER || type == STRING || type == OP)
printf("(%.*s)", (int)(end - start), start);
}
#endif

29
Parser/tokenizer.h Normal file
View file

@ -0,0 +1,29 @@
/* Tokenizer interface */
#include "token.h" /* For token types */
#define MAXINDENT 100 /* Max indentation level */
/* Tokenizer state */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
/* NB an entire token must fit in the buffer */
char *buf; /* Input buffer */
char *cur; /* Next character in buffer */
char *inp; /* End of data in buffer */
char *end; /* End of input buffer */
int done; /* 0 normally, 1 at EOF, -1 after error */
FILE *fp; /* Rest of input; NULL if tokenizing a string */
int tabsize; /* Tab spacing */
int indent; /* Current indentation index */
int indstack[MAXINDENT]; /* Stack of indents */
int atbol; /* Nonzero if at begin of new line */
int pendin; /* Pending indents (if > 0) or dedents (if < 0) */
char *prompt, *nextprompt; /* For interactive prompting */
int lineno; /* Current line number */
};
extern struct tok_state *tok_setups PROTO((char *));
extern struct tok_state *tok_setupf PROTO((FILE *, char *ps1, char *ps2));
extern void tok_free PROTO((struct tok_state *));
extern int tok_get PROTO((struct tok_state *, char **, char **));