* Parser/tokenizer.c: backup over illegal newline in string

literal (for "completeness" test)
This commit is contained in:
Guido van Rossum 1994-08-29 12:43:07 +00:00
parent bd0389d5fd
commit f4b1a64a21

View File

@ -1,5 +1,5 @@
/***********************************************************
Copyright 1991, 1992, 1993 by Stichting Mathematisch Centrum,
Copyright 1991, 1992, 1993, 1994 by Stichting Mathematisch Centrum,
Amsterdam, The Netherlands.
All Rights Reserved
@ -24,19 +24,18 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/* Tokenizer implementation */
/* XXX This is rather old, should be restructured perhaps */
/* XXX Need a better interface to report errors than writing to stderr */
/* XXX Should use editor resource to fetch true tab size on Macintosh */
#include "pgenheaders.h"
#include <ctype.h>
#include "string.h"
#include "fgetsintr.h"
#include "tokenizer.h"
#include "errcode.h"
extern char *my_readline PROTO((char *));
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
NULL if interrupted */
/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
@ -99,7 +98,7 @@ tok_new()
struct tok_state *tok = NEW(struct tok_state, 1);
if (tok == NULL)
return NULL;
tok->buf = tok->cur = tok->end = tok->inp = NULL;
tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
tok->done = E_OK;
tok->fp = NULL;
tok->tabsize = TABSIZE;
@ -158,7 +157,6 @@ void
tok_free(tok)
struct tok_state *tok;
{
/* XXX really need a separate flag to say 'my buffer' */
if (tok->fp != NULL && tok->buf != NULL)
DEL(tok->buf);
DEL(tok);
@ -180,58 +178,78 @@ tok_nextc(tok)
tok->done = E_EOF;
return EOF;
}
#ifdef USE_READLINE
if (tok->prompt != NULL) {
extern char *readline PROTO((char *prompt));
static int been_here;
if (!been_here) {
/* Force rebind of TAB to insert-tab */
extern int rl_insert();
rl_bind_key('\t', rl_insert);
been_here++;
}
if (tok->buf != NULL)
free(tok->buf);
tok->buf = readline(tok->prompt);
(void) intrcheck(); /* Clear pending interrupt */
char *new = my_readline(tok->prompt);
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
if (tok->buf == NULL) {
if (new == NULL)
tok->done = E_INTR;
else if (*new == '\0') {
free(new);
tok->done = E_EOF;
}
else {
tok->end = strchr(tok->buf, '\0');
if (tok->end > tok->buf)
add_history(tok->buf);
/* Replace trailing '\n' by '\0'
(we don't need a '\0', but the
tokenizer wants a '\n'...) */
*tok->end++ = '\n';
tok->inp = tok->end;
tok->cur = tok->buf;
}
}
else
#endif
{
if (tok->prompt != NULL) {
fprintf(stderr, "%s", tok->prompt);
if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt;
}
if (tok->buf == NULL) {
tok->buf = NEW(char, BUFSIZ);
if (tok->buf == NULL) {
else if (tok->start != NULL) {
int start = tok->start - tok->buf;
int oldlen = tok->cur - tok->buf;
int newlen = oldlen + strlen(new);
char *buf = realloc(tok->buf, newlen+1);
tok->lineno++;
if (buf == NULL) {
free(tok->buf);
free(new);
tok->done = E_NOMEM;
return EOF;
}
tok->end = tok->buf + BUFSIZ;
tok->buf = buf;
tok->cur = tok->buf + oldlen;
strcpy(tok->buf + oldlen, new);
free(new);
tok->inp = tok->buf + newlen;
tok->end = tok->inp + 1;
tok->start = tok->buf + start;
}
tok->done = fgets_intr(tok->buf,
(int)(tok->end - tok->buf), tok->fp);
tok->inp = strchr(tok->buf, '\0');
else {
tok->lineno++;
if (tok->buf != NULL)
free(tok->buf);
tok->buf = new;
tok->cur = tok->buf;
tok->inp = strchr(tok->buf, '\0');
tok->end = tok->inp + 1;
}
}
else {
int done = 0;
int cur = 0;
if (tok->start == NULL) {
if (tok->buf == NULL) {
tok->buf = NEW(char, BUFSIZ);
if (tok->buf == NULL) {
tok->done = E_NOMEM;
return EOF;
}
tok->end = tok->buf + BUFSIZ;
}
if (fgets(tok->buf, (int)(tok->end - tok->buf),
tok->fp) == NULL) {
tok->done = E_EOF;
done = 1;
}
else {
tok->done = E_OK;
tok->inp = strchr(tok->buf, '\0');
done = tok->inp[-1] == '\n';
}
}
else {
cur = tok->cur - tok->buf;
tok->done = E_OK;
}
tok->lineno++;
/* Read until '\n' or EOF */
while (tok->inp+1==tok->end && tok->inp[-1]!='\n') {
while (!done) {
int curstart = tok->start == NULL ? -1 :
tok->start - tok->buf;
int curvalid = tok->inp - tok->buf;
int cursize = tok->end - tok->buf;
int newsize = cursize + BUFSIZ;
@ -245,13 +263,19 @@ tok_nextc(tok)
tok->buf = newbuf;
tok->inp = tok->buf + curvalid;
tok->end = tok->buf + newsize;
if (fgets_intr(tok->inp,
tok->start = curstart < 0 ? NULL :
tok->buf + curstart;
if (fgets(tok->inp,
(int)(tok->end - tok->inp),
tok->fp) != E_OK)
break;
tok->fp) == NULL) {
/* Last line does not end in \n,
fake one */
strcpy(tok->inp, "\n");
}
tok->inp = strchr(tok->inp, '\0');
done = tok->inp[-1] == '\n';
}
tok->cur = tok->buf;
tok->cur = tok->buf + cur;
}
if (tok->done != E_OK) {
if (tok->prompt != NULL)
@ -360,14 +384,15 @@ tok_get(tok, p_start, p_end)
register int c;
int blankline;
*p_start = *p_end = NULL;
nextline:
tok->start = NULL;
blankline = 0;
/* Get indentation level */
if (tok->atbol) {
register int col = 0;
tok->atbol = 0;
tok->lineno++;
for (;;) {
c = tok_nextc(tok);
if (c == ' ')
@ -423,7 +448,7 @@ tok_get(tok, p_start, p_end)
}
}
*p_start = *p_end = tok->cur;
tok->start = tok->cur;
/* Return pending indents/dedents */
if (tok->pendin != 0) {
@ -438,13 +463,14 @@ tok_get(tok, p_start, p_end)
}
again:
tok->start = NULL;
/* Skip spaces */
do {
c = tok_nextc(tok);
} while (c == ' ' || c == '\t');
/* Set start of current token */
*p_start = tok->cur - 1;
tok->start = tok->cur - 1;
/* Skip comment */
if (c == '#') {
@ -467,7 +493,6 @@ tok_get(tok, p_start, p_end)
/* Check for EOF and errors now */
if (c == EOF) {
*p_start = *p_end = tok->cur;
return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
}
@ -477,6 +502,7 @@ tok_get(tok, p_start, p_end)
c = tok_nextc(tok);
} while (isalnum(c) || c == '_');
tok_backup(tok, c);
*p_start = tok->start;
*p_end = tok->cur;
return NAME;
}
@ -486,6 +512,7 @@ tok_get(tok, p_start, p_end)
tok->atbol = 1;
if (blankline || tok->level > 0)
goto nextline;
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
return NEWLINE;
}
@ -498,6 +525,7 @@ tok_get(tok, p_start, p_end)
}
else {
tok_backup(tok, c);
*p_start = tok->start;
*p_end = tok->cur;
return DOT;
}
@ -538,9 +566,7 @@ tok_get(tok, p_start, p_end)
else {
/* Accept floating point numbers.
XXX This accepts incomplete things like
XXX 12e or 1e+; worry run-time.
XXX Doesn't accept numbers
XXX starting with a dot */
XXX 12e or 1e+; worry run-time */
if (c == '.') {
fraction:
/* Fraction */
@ -560,58 +586,58 @@ tok_get(tok, p_start, p_end)
}
}
tok_backup(tok, c);
*p_start = tok->start;
*p_end = tok->cur;
return NUMBER;
}
/* String (single quotes) */
if (c == '\'') {
/* String */
if (c == '\'' || c == '"') {
int quote = c;
int triple = 0;
int tripcount = 0;
for (;;) {
c = tok_nextc(tok);
if (c == '\n' || c == EOF) {
if (c == '\n') {
if (!triple) {
tok->done = E_TOKEN;
tok_backup(tok, c);
return ERRORTOKEN;
}
tripcount = 0;
}
else if (c == EOF) {
tok->done = E_TOKEN;
tok->cur = tok->inp;
return ERRORTOKEN;
}
if (c == '\\') {
else if (c == quote) {
tripcount++;
if (tok->cur == tok->start+2) {
c = tok_nextc(tok);
if (c == quote) {
triple = 1;
tripcount = 0;
continue;
}
tok_backup(tok, c);
}
if (!triple || tripcount == 3)
break;
}
else if (c == '\\') {
tripcount = 0;
c = tok_nextc(tok);
*p_end = tok->cur;
if (c == '\n' || c == EOF) {
if (c == EOF) {
tok->done = E_TOKEN;
tok->cur = tok->inp;
return ERRORTOKEN;
}
continue;
}
if (c == '\'')
break;
}
*p_end = tok->cur;
return STRING;
}
/* String (double quotes) */
if (c == '\"') {
for (;;) {
c = tok_nextc(tok);
if (c == '\n' || c == EOF) {
tok->done = E_TOKEN;
tok->cur = tok->inp;
return ERRORTOKEN;
}
if (c == '\\') {
c = tok_nextc(tok);
*p_end = tok->cur;
if (c == '\n' || c == EOF) {
tok->done = E_TOKEN;
tok->cur = tok->inp;
return ERRORTOKEN;
}
continue;
}
if (c == '\"')
break;
else
tripcount = 0;
}
*p_start = tok->start;
*p_end = tok->cur;
return STRING;
}
@ -624,7 +650,6 @@ tok_get(tok, p_start, p_end)
tok->cur = tok->inp;
return ERRORTOKEN;
}
tok->lineno++;
goto again; /* Read next line */
}
@ -633,13 +658,14 @@ tok_get(tok, p_start, p_end)
int c2 = tok_nextc(tok);
int token = tok_2char(c, c2);
if (token != OP) {
*p_start = tok->start;
*p_end = tok->cur;
return token;
}
tok_backup(tok, c2);
}
/* Keep track of parenteses nesting level */
/* Keep track of parentheses nesting level */
switch (c) {
case '(':
case '[':
@ -654,6 +680,7 @@ tok_get(tok, p_start, p_end)
}
/* Punctuation character */
*p_start = tok->start;
*p_end = tok->cur;
return tok_1char(c);
}