scc

simple C compiler
git clone git://git.2f30.org/scc
Log | Files | Refs | README | LICENSE

commit 490b2ddc64c800327ab07b4b3e5e56ed434fd38e
parent 8adc67d19572d6edb933a5af8960fbc86123e1f4
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Wed, 18 Jan 2017 11:16:06 +0100

[cc1] Rewrite the input system

The input system was broken for a lot of different reasons. Some of them
were due to the fact that the responsibility of the different functions
was not clear, and the kind of input wasn't explicit. This new version
tries to split clearly the responsibility of every function (for
example, spaces are skipped only in skipspaces now).

Diffstat:
Mcc1/cc1.h | 3+--
Mcc1/code.c | 2+-
Mcc1/cpp.c | 67+++++++++++++++++++++++++++++++++++++++++++++++++------------------
Mcc1/lex.c | 204+++++++++++++++++++++++++++++++++++++++++++------------------------------------
4 files changed, 163 insertions(+), 113 deletions(-)

diff --git a/cc1/cc1.h b/cc1/cc1.h @@ -400,7 +400,6 @@ extern void decl(void); /* lex.c */ extern char ahead(void); extern unsigned next(void); -extern int moreinput(void); extern void expect(unsigned tok); extern void discard(void); extern int addinput(char *fname, Symbol *hide, char *buffer); @@ -451,7 +450,7 @@ extern struct yystype yylval; extern char yytext[]; extern unsigned yytoken; extern unsigned short yylen; -extern int cppoff, disexpand; +extern int disexpand; extern unsigned cppctx; extern Input *input; extern int lexmode, namespace, onlycpp; diff --git a/cc1/code.c b/cc1/code.c @@ -159,7 +159,7 @@ emit(unsigned op, void *arg) { extern int failure; - if (failure) + if (failure || onlycpp) return; (*opcode[op])(op, arg); } diff --git a/cc1/cpp.c b/cc1/cpp.c @@ -16,7 +16,7 @@ static unsigned arglen; static unsigned ncmdlines; static Symbol *symline, *symfile; static unsigned char ifstatus[NR_COND]; -static int ninclude; +static int ninclude, cppoff; static char **dirinclude; unsigned cppctx; @@ -25,7 +25,7 @@ int disexpand; void defdefine(char *macro, char *val, char *source) { - char *def, *fmt = "#define %s %s"; + char *def, *fmt = "#define %s %s\n"; Symbol dummy = {.flags = SDECLARED}; if (!val) @@ -345,7 +345,7 @@ getdefs(Symbol *args[NR_MACROARG], int nargs, char *bp, size_t bufsiz) cpperror("'#' is not followed by a macro parameter"); return 0; } - if (yytoken == EOFTOK) + if (yytoken == '\n') break; if ((len = strlen(yytext)) >= bufsiz) { @@ -449,7 +449,7 @@ includefile(char *dir, char *file, size_t filelen) static void include(void) { - char *file, *p, **bp; + char file[FILENAME_MAX], *p, **bp; size_t filelen; static char *sysinclude[] = { PREFIX "/include/scc/" ARCH "/", @@ -467,19 +467,31 @@ include(void) switch (*yytext) { case '<': - if ((p = strchr(input->begin, '>')) == NULL || p == yytext + 1) + if ((p = strchr(input->begin, '>')) == NULL || p[-1] == '<') goto bad_include; - *p = '\0'; - file = input->begin; - filelen = strlen(file); + 
filelen = p - input->begin; + if (filelen >= FILENAME_MAX) + goto too_long; + memcpy(file, input->begin, filelen); + file[filelen] = '\0'; + input->begin = input->p = p+1; + if (next() != '\n') + goto trailing_characters; + break; case '"': - if ((p = strchr(yytext + 1, '"')) == NULL || p == yytext + 1) + if (yylen < 3) goto bad_include; - *p = '\0'; - file = yytext+1; - filelen = strlen(file); + filelen = yylen-2; + if (filelen >= FILENAME_MAX) + goto too_long; + memcpy(file, yytext+1, filelen); + file[filelen] = '\0'; + + if (next() != '\n') + goto trailing_characters; + if (includefile(NULL, file, filelen)) goto its_done; break; @@ -499,7 +511,14 @@ include(void) cpperror("included file '%s' not found", file); its_done: - next(); + return; + +trailing_characters: + cpperror("trailing characters after preprocessor directive"); + return; + +too_long: + cpperror("too long file name in #include"); return; bad_include: @@ -709,10 +728,14 @@ cpp(void) {0, NULL} }; int ns; + char *p; - if (*input->p != '#') - return 0; - ++input->p; + for (p = input->p; isspace(*p); ++p) + /* nothing */; + + if (*p != '#') + return cppoff; + input->p = p+1; disexpand = 1; lexmode = CPPMODE; @@ -724,15 +747,23 @@ cpp(void) for (bp = clauses; bp->token && bp->token != yytoken; ++bp) /* nothing */; if (!bp->token) { - errorp("incorrect preprocessor directive"); + errorp("incorrect preprocessor directive '%s'", yytext); goto error; } + DBG("CPP %s", yytext); + pushctx(); /* create a new context to avoid polish */ (*bp->fun)(); /* the current context, and to get all */ popctx(); /* the symbols freed at the end */ - if (yytoken != EOFTOK && !cppoff) + /* + * #include changes the content of input->line, so the correctness + * of the line must be checked in the own include(), and we have + * to skip this tests. 
For the same reason include() is the only + * function which does not prepare the next token + */ + if (yytoken != '\n' && !cppoff && bp->token != INCLUDE) errorp("trailing characters after preprocessor directive"); error: diff --git a/cc1/lex.c b/cc1/lex.c @@ -1,5 +1,6 @@ /* See LICENSE file for copyright and license details. */ static char sccsid[] = "@(#) ./cc1/lex.c"; +#include <assert.h> #include <ctype.h> #include <errno.h> #include <limits.h> @@ -16,11 +17,10 @@ unsigned yytoken; struct yystype yylval; char yytext[STRINGSIZ+3]; unsigned short yylen; -int cppoff; int lexmode = CCMODE; int namespace = NS_IDEN; -static int safe, eof; +static int safe; Input *input; void @@ -83,7 +83,7 @@ addinput(char *fname, Symbol *hide, char *buffer) if (hide->hide == UCHAR_MAX) die("Too many macro expansions"); ++hide->hide; - flags = IMACRO|IEOF; + flags = IMACRO; } else if (fname) { /* a new file */ if ((fp = fopen(fname, "r")) == NULL) @@ -126,24 +126,12 @@ delinput(void) if (fclose(ip->fp)) die("error: failed to read from input file '%s'", ip->fname); - if (!ip->next) - eof = 1; break; case IMACRO: + assert(hide->hide == 1); --hide->hide; - /* - * If the symbol is not declared then it was - * an expansion due to a #if directive with - * a non declared symbol (expanded to 0), - * thus we have to kill the symbol - * TODO: review this comment and code - */ - if ((hide->flags & SDECLARED) == 0) - killsym(hide); break; } - if (eof) - return; input = ip->next; free(ip->fname); free(ip->line); @@ -156,19 +144,18 @@ newline(void) die("error: input file '%s' too long", input->fname); } +/* + * Read the next character from the input file, counting number of lines + * and joining lines escaped with \ + */ static int readchar(void) { FILE *fp = input->fp; int c; - if (eof || !fp) - return 0; repeat: switch (c = getc(fp)) { - case EOF: - c = '\0'; - break; case '\\': if ((c = getc(fp)) == '\n') { newline(); @@ -185,85 +172,111 @@ repeat: return c; } +/* + * discard a C comment. 
This function is only called from readline + * because it is impossible to have a comment in a macro, because + * comments are always discarded before processing any cpp directive + */ static void comment(int type) { int c; - c = -1; repeat: - do { - if (!c || eof) { - errorp("unterminated comment"); - return; - } - } while ((c = readchar()) != type); + while ((c = readchar()) != EOF && c != type) + /* nothing */; + + if (c == EOF) { + errorp("unterminated comment"); + return; + } if (type == '*' && (c = readchar()) != '/') goto repeat; } +/* + * readline is used to read a full logic line from a file. + * It discards comments and check that the line fits in + * the input buffer + */ static int readline(void) { char *bp, *lim; - char c, peekc = 0; - -repeat: + int c, peekc = 0; - if (eof) - return 0; - if (!input->fp) { - delinput(); - return 1; - } if (feof(input->fp)) { - delinput(); - goto repeat; + input->flags |= IEOF; + return 0; } *input->line = '\0'; - input->begin = input->p = input->line; lim = &input->line[INPUTSIZ-1]; - for (bp = input->line; bp < lim; *bp++ = c) { + for (bp = input->line; bp < lim-1; *bp++ = c) { c = (peekc) ? peekc : readchar(); peekc = 0; - if (c == '\n' || c == '\0') + if (c == '\n' || c == EOF) break; - if (c != '/' || (peekc = readchar()) != '*' && peekc != '/') + if (c != '/') + continue; + + /* check for /* or // */ + peekc = readchar(); + if (peekc != '*' && peekc != '/') continue; - comment((peekc == '/') ? '\n' : peekc); + comment((peekc == '/') ? '\n' : '/'); peekc = 0; c = ' '; } - if (bp == lim) - error("line too long"); + input->begin = input->p = input->line; + if (bp == lim-1) { + errorp("line too long"); + --bp; + } + *bp++ = '\n'; *bp = '\0'; + return 1; } -int +/* + * moreinput gets more bytes to be passed to the lexer. + * It can take more bytes from macro expansions or + * directly reading from files. 
When a cpp directive + * is processed the line is discarded because it must not + * be passed to the lexer + */ +static int moreinput(void) { - static char file[FILENAME_MAX]; - static unsigned nline; - char *s; - int wasexpand; + int wasexpand = 0; repeat: - wasexpand = input->hide != NULL; - if (!readline()) + if (!input) return 0; - while (isspace(*input->p)) - ++input->p; - input->begin = input->p; - if (*input->p == '\0' || cpp() || cppoff) { - *input->begin = '\0'; - goto repeat; + + if (*input->p == '\0') { + if ((input->flags&ITYPE) == IMACRO) { + wasexpand = 1; + input->flags |= IEOF; + } + if (input->flags & IEOF) { + delinput(); + goto repeat; + } + if (!readline() || cpp()) { + *input->p = '\0'; + goto repeat; + } } if (onlycpp && !wasexpand) { + static char file[FILENAME_MAX]; + static unsigned nline; + char *s; + putchar('\n'); if (strcmp(file, input->fname)) { strcpy(file, input->fname); @@ -276,7 +289,6 @@ repeat: nline = input->nline; printf(s, nline, file); } - input->begin = input->p; return 1; } @@ -483,7 +495,7 @@ character(void) c = *input->p; ++input->p; if (*input->p != '\'') - error("invalid character constant"); + errorp("invalid character constant"); else ++input->p; @@ -643,47 +655,50 @@ operator(void) /* TODO: Ensure that namespace is NS_IDEN after a recovery */ -static void +/* + * skip all the spaces until the next token. 
When we are in + * CPPMODE \n is not considered a whitespace + */ +static int skipspaces(void) { -repeat: - while (isspace(*input->p)) - ++input->p; - input->begin = input->p; - - if (*input->p != '\0') - return; + int c; - if (lexmode == CPPMODE) { - /* - * If we are in cpp mode, we only return eof when - * we don't have more inputs, or when the next - * next input is from a file - */ - if (!input || !input->next || input->next->fp) - return; + for (;;) { + switch (c = *input->p) { + case '\n': + if (lexmode == CPPMODE) + goto return_byte; + ++input->p; + case '\0': + if (!moreinput()) + return EOF; + break; + case ' ': + case '\t': + case '\v': + case '\r': + case '\f': + ++input->p; + break; + default: + goto return_byte; + } } - if (!moreinput()) - return; - goto repeat; + +return_byte: + input->begin = input->p; + return c; } unsigned next(void) { - char c; + int c; - skipspaces(); - c = *input->begin; - if ((eof || lexmode == CPPMODE) && c == '\0') { - strcpy(yytext, "<EOF>"); - if (cppctx && eof) - error("#endif expected"); + if ((c = skipspaces()) == EOF) yytoken = EOFTOK; - goto exit; - } - - if (isalpha(c) || c == '_') + else if (isalpha(c) || c == '_') yytoken = iden(); else if (isdigit(c)) yytoken = number(); @@ -694,7 +709,12 @@ next(void) else yytoken = operator(); -exit: + if (yytoken == EOF) { + strcpy(yytext, "<EOF>"); + if (cppctx) + errorp("#endif expected"); + } + DBG("TOKEN %s", yytext); return yytoken; }