scc

simple C compiler
git clone git://git.2f30.org/scc
Log | Files | Refs | README | LICENSE

commit 1ae4a875f9b9215126fd406329cbfd267271492d
parent 3f1b9619b57cbd1e479f281c86f8ed3fabe2b024
Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date:   Sat,  9 May 2015 17:01:22 +0200

Rewrite lexical analyzer

This new lexer uses the full line as input, and it does not read
character by character.

Diffstat:
Mcc1/cc1.h | 1-
Mcc1/lex.c | 552+++++++++++++++++++++++++++++++++----------------------------------------------
Mcc1/symbol.c | 38++++++--------------------------------
3 files changed, 238 insertions(+), 353 deletions(-)

diff --git a/cc1/cc1.h b/cc1/cc1.h @@ -40,7 +40,6 @@ struct symbol { int i; char *s; uint8_t token; - void (*fun)(char *); } u; struct symbol *next; struct symbol *hash; diff --git a/cc1/lex.c b/cc1/lex.c @@ -1,5 +1,6 @@ #include <assert.h> +#include <errno.h> #include <inttypes.h> #include <setjmp.h> #include <stdio.h> @@ -18,21 +19,17 @@ typedef struct input Input; struct input { char *fname; unsigned short nline; - int cnt; FILE *fp; - char *line, *ptr; + char *line, *begin, *p; struct input *next; }; -#define nextchar() ((--input->cnt >= 0) ? \ - (unsigned char) *input->ptr++ : readline()) - uint8_t lex_ns = NS_IDEN; uint8_t yytoken; struct yystype yylval; char yytext[IDENTSIZ + 1]; -static uint8_t safe, comment, commentline; +static uint8_t safe; static Input *input; bool @@ -59,7 +56,6 @@ addinput(char *fname) ip->fname = fname; ip->next = input; ip->line = NULL; - ip->cnt = 0; ip->nline = nline; ip->fp = fp; input = ip; @@ -95,53 +91,13 @@ fileline(void) return input->nline; } -static void -newline(void) -{ - if (++input->nline == 0) - die("input file '%s' too long", input->fname); -} - /* TODO: preprocessor error must not rise recover */ -static void -preprocessor(void) -{ - char str[IDENTSIZ+1], *p, *q; - unsigned short cnt, n; - Symbol *sym; - - p = input->ptr; - q = &p[input->cnt-1]; - while (q > p && isspace(*q)) - ++q; - while (isspace(*p)) - ++p; - for (q = p; isalpha(*q); ++q) - /* nothing */; - if ((n = q - p) > IDENTSIZ) - goto bad_directive; - strncpy(str, p, n); - str[n] = '\0'; - - /* discard this line for the lexer */ - input->cnt = 0; - if ((sym = lookup(str, NS_CPP)) == NULL) - goto bad_directive; - (*sym->u.fun)(q); - return; - -bad_directive: - error("incorrect preprocessor directive"); -} - -void +char * include(char *s) { char fname[FILENAME_MAX], delim, c, *p; size_t len; - while (isspace(*s)) - ++s; if ((c = *s++) == '>') delim = '>'; else if (c == '"') @@ -162,11 +118,11 @@ include(char *s) fname[len] = '\0'; if (!addinput(fname)) goto 
not_found; - return; + } else { + abort(); } - abort(); - return; + return p+1; not_found: error("included file '%s' not found", fname); @@ -176,34 +132,41 @@ bad_include: error("#include expects \"FILENAME\" or <FILENAME>"); } -void -define(char *str) -{ - -} - -void -undef(char *str) -{ - fprintf(stderr, "Esto en un undef\n"); -} - -void -ifdef(char *str) -{ - fprintf(stderr, "Esto en un ifdef\n"); -} - -void -ifndef(char *str) -{ - fprintf(stderr, "Esto en un ifndef\n"); -} +static char * +preprocessor(char *p) +{ + char str[IDENTSIZ+1], *q; + unsigned short n; + static char **bp, *cmds[] = { + "include", + NULL + }; + static char *(*funs[])(char *) = { + include + }; -void -endif(char *str) -{ - fprintf(stderr, "Esto en un endif\n"); + while (isspace(*p)) + ++p; + if (*p != '#') + return p; + for (++p; isspace(*p); ++p) + /* nothing */; + for (q = p; isalpha(*q); ++q) + /* nothing */; + n = q - p; + while (isspace(*q)) + ++q; + for (bp = cmds; *bp; ++bp) { + if (strncmp(*bp, p, n)) + continue; + q = (*funs[bp - cmds])(q); + while (isspace(*q++)) + /* nothing */; + if (*q != '\0') + error("trailing characters after preprocessor directive"); + return NULL; + } + error("incorrect preprocessor directive"); } static int @@ -222,31 +185,15 @@ repeat: return c; } -static int +static void readline(void) { - char *bp, *ptr; - uint8_t n; + static int comment, commentline; + char *bp, *lim; int c; - FILE *fp; -repeat: - if (!input) - return EOF; - fp = input->fp; - if (!input->line) - input->line = xmalloc(INPUTSIZ); - bp = ptr = input->ptr = input->line; - - while ((c = getc(fp)) != EOF && isspace(c)) { - if (c == '\n') - newline(); - } - if (c == EOF) { - delinput(); - goto repeat; - } - ungetc(c, fp); + bp = input->line; + lim = bp + INPUTSIZ-1; for (;;) { c = readchar(); @@ -268,7 +215,7 @@ repeat: } if (c == '\n') break; - if (bp == &ptr[INPUTSIZ-1]) + if (bp == lim) die("line %d too big in file '%s'", input->nline, input->fname); if (c == '/') { @@ -284,60 +231,88 
@@ repeat: } *bp++ = c; } + ungetc(c, input->fp); + *bp = '\0'; +} + +static bool +fill(void) +{ + int c; + char *p; + FILE *fp; - *bp = ' '; - input->cnt = bp - ptr; +repeat: + if (!input) + return 0; + if (input->begin && *input->begin != '\0') + return 1; - if ((c = *input->ptr++) == '#') { - *bp = '\0'; - preprocessor(); + fp = input->fp; + if (!input->line) + input->line = xmalloc(INPUTSIZ); + + while ((c = getc(fp)) != EOF && (c == '\n')) { + if (++input->nline == 0) + die("input file '%s' too long", input->fname); + } + if (c == EOF) { + delinput(); goto repeat; } - return c; + ungetc(c, fp); + readline(); + if ((p = preprocessor(input->line)) == NULL) + goto repeat; + input->begin = input->p = p; + return 1; } -static int -backchar(int c) +static void +tok2str(void) { - if (!input) { - assert(c == EOF); - return c; - } - ++input->cnt; - return *--input->ptr = c; + size_t len; + + if ((len = input->p - input->begin) > IDENTSIZ) + error("token too big"); + strncpy(yytext, input->begin, len); + yytext[len] = '\0'; + fprintf(stderr ,"%s\n", yytext); + input->begin = input->p; } static uint8_t integer(char *s, char base) { - static Type *tp; - static Symbol *sym; - static char ch, size, sign; - static long v; - - size = sign = 0; -type: - switch (ch = toupper(nextchar())) { - case 'L': - if (size == LLONG) - goto wrong_type; - size = (size == LONG) ? LLONG : LONG; - goto type; - case 'U': - if (sign == UNSIGNED) - goto wrong_type; - goto type; - default: - backchar(ch); - tp = ctype(INT, sign, size); - break; - wrong_type: - error("invalid suffix in integer constant"); + Type *tp; + Symbol *sym; + uint8_t size, sign; + long v; + + for (size = sign = 0; ; ++input->p) { + switch (toupper(*input->p)) { + case 'L': + if (size == LLONG) + goto wrong_type; + size = (size == LONG) ? 
LLONG : LONG; + continue; + case 'U': + if (sign == UNSIGNED) + goto wrong_type; + sign = UNSIGNED; + continue; + default: + goto convert; + wrong_type: + error("invalid suffix in integer constant"); + } } +convert: + tp = ctype(INT, sign, size); sym = install("", NS_IDEN); sym->type = tp; - v = strtol(yytext, NULL, base); + v = strtol(s, NULL, base); if (tp == inttype) sym->u.i = v; yylval.sym = sym; @@ -347,30 +322,27 @@ type: static char * digits(uint8_t base) { - char ch, *bp; + char c, *p; - for (bp = yytext ; bp < &yytext[IDENTSIZ]; *bp++ = ch) { - ch = nextchar(); + for (p = input->p; c = *p; ++p) { switch (base) { case 8: - if (ch >= '7') + if (c > '7' || c < '0') goto end; - /* passthru */ + break; case 10: - if (!isdigit(ch)) + if (!isdigit(c)) goto end; break; case 16: - if (!isxdigit(ch)) + if (!isxdigit(c)) goto end; break; } } end: - if (bp == &yytext[IDENTSIZ]) - error("number too long %s", yytext); - *bp = '\0'; - backchar(ch); + input->p = p; + tok2str(); return yytext; } @@ -378,63 +350,51 @@ static uint8_t number(void) { int ch; - static char base; + char base; - if ((ch = nextchar()) == '0') { - if (toupper(ch = nextchar()) == 'X') { + if (*input->p != '0') { + base = 10; + } else { + if (toupper(*++input->p) == 'X') { + ++input->p; base = 16; } else { base = 8; - backchar(ch); } - } else { - base = 10; - backchar(ch); } return integer(digits(base), base); } -static char * -escape(char *s) -{ - uint8_t base; - int c; - -repeat: - switch (nextchar()) { - case '\\': c = '\''; break; - case 'a': c = '\a'; break; - case 'f': c = '\f'; break; - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'v': c = '\v'; break; - case '\'': c = '\\'; break; - case '"': c ='"'; break; - case '?': c = '?'; break; - case 'u': /* TODO: */ - case 'x': - base = 16; - goto number; - case '0': - base = 8; - number: - if ((c = atoi(digits(base))) > 255) - warn("character constant out of range"); - break; - case '\n': - newline(); 
- if ((c = nextchar()) == '\\') - goto repeat; - break; +static char +escape(void) +{ + int c, base; + + ++input->p; + switch (*input->p++) { + case '\\': return '\\'; + case 'a': return '\a'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + case '\'': return '\\'; + case '"': return'"'; + case '?': return '?'; + case 'u': base = 10; break; + case 'x': base = 16; break; + case '0': base = 8; break; default: warn("unknown escape sequence"); - return s; + return ' '; } - - *s = c; - return ++s; + errno = 0; + c = strtoul(input->p, &input->p, base); + if (errno || c > 255) + warn("character constant out of range"); + return c; } static uint8_t @@ -443,12 +403,12 @@ character(void) static char c; Symbol *sym; - nextchar(); /* discard the initial ' */ - c = nextchar(); - if (c == '\\') - escape(&c); - if (nextchar() != '\'') + if ((c = *++input->p) == '\\') + c = escape(); + if (*input->p != '\'') error("invalid character constant"); + ++input->p; + sym = install("", NS_IDEN); sym->u.i = c; sym->type = inttype; @@ -460,29 +420,20 @@ static uint8_t string(void) { static char buf[STRINGSIZ+1]; - char *bp; - int c; - static Symbol *sym; - - nextchar(); /* discard the initial " */ - - for (bp = buf; bp < &buf[STRINGSIZ]; ) { - switch (c = nextchar()) { - case EOF: - error("found EOF while parsing"); - case '"': - goto end_string; - case '\\': - bp = escape(bp); + Symbol *sym; + char *bp = buf, c; + + assert(STRINGSIZ <= INPUTSIZ); + for (++input->p; (c = *input->p) != '\0'; ++input->p) { + if (c == '"') break; - default: - *bp++ = c; - } + if (c == '\\') + c = escape(); + *bp++ = c; } -end_string: - if (bp == &buf[IDENTSIZ]) - error("string too long"); + if (c == '\0') + error("missing terminating '\"' character"); *bp = '\0'; sym = install("", NS_IDEN); sym->u.s = xstrdup(buf); @@ -494,19 +445,13 @@ end_string: static uint8_t iden(void) { - char *bp; - int c; + char c, *p; Symbol *sym; - for (bp 
= yytext; bp < &yytext[IDENTSIZ]; *bp++ = c) { - if (!isalnum(c = nextchar()) && c != '_') - break; - } - if (bp == &yytext[IDENTSIZ]) - error("identifier too long %s", yytext); - *bp = '\0'; - backchar(c); - + for (p = input->p; isalpha(*p); ++p) + /* nothing */; + input->p = p; + tok2str(); sym = yylval.sym = lookup(yytext, lex_ns); if (!sym || sym->token == IDEN) return IDEN; @@ -517,146 +462,106 @@ iden(void) static uint8_t follow(int expect, int ifyes, int ifno) { - int c = nextchar(); - - if (c == expect) { - yytext[1] = c; - yytext[2] = 0; + if (*input->p++) return ifyes; - } - backchar(c); + --input->p; return ifno; } static uint8_t minus(void) { - int c = nextchar(); - - yytext[1] = c; - yytext[2] = '\0'; - switch (c) { + switch (*input->p++) { case '-': return DEC; case '>': return INDIR; case '=': return SUB_EQ; - default: - yytext[1] = '\0'; - backchar(c); - return '-'; + default: --input->p; return '-'; } } static uint8_t plus(void) { - int c = nextchar(); - - yytext[1] = c; - yytext[2] = '\0'; - switch (c) { + switch (*input->p++) { case '+': return INC; case '=': return ADD_EQ; - default: - yytext[1] = '\0'; - backchar(c); - return '+'; + default: --input->p; return '+'; } } static uint8_t relational(uint8_t op, uint8_t equal, uint8_t shift, uint8_t assig) { - int c = nextchar(); - - yytext[1] = c; - yytext[2] = '\0'; + char c; - if (c == '=') + if ((c = *input->p++) == '=') return equal; if (c == op) return follow('=', assig, shift); - backchar(c); - yytext[1] = '\0'; + --input->p; return op; } static uint8_t logic(uint8_t op, uint8_t equal, uint8_t logic) { - int c = nextchar(); + char c; - yytext[1] = c; - yytext[2] = '\0'; - - if (c == '=') + if ((c = *input->p++) == equal) return equal; if (c == op) return logic; - backchar(c); - yytext[1] = '\0'; + --input->p; return op; } static uint8_t dot(void) { - int c; + char c; - if ((c = nextchar()) != '.') { - backchar(c); + if (c = *input->p != '.') return '.'; - } else if ((c = nextchar()) != '.') { 
- error("incorrect token '%s'", yytext); - } else { - yytext[2] = yytext[1] = '.'; - yytext[3] = '\0'; - return ELLIPSIS; - } + if ((c = *++input->p) != '.') + error("incorrect token '..'"); + ++input->p; + return ELLIPSIS; } static uint8_t operator(void) { - uint8_t c = nextchar(); - - yytext[0] = c; - yytext[1] = '\0'; - switch (c) { - case '<': return relational('<', LE, SHL, SHL_EQ); - case '>': return relational('>', GE, SHR, SHR_EQ); - case '&': return logic('&', AND_EQ, AND); - case '|': return logic('|', OR_EQ, OR); - case '=': return follow('=', EQ, '='); - case '^': return follow('=', XOR_EQ, '^'); - case '*': return follow('=', MUL_EQ, '*'); - case '/': return follow('=', DIV_EQ, '/'); - case '!': return follow('=', NE, '!'); - case '-': return minus(); - case '+': return plus(); - case '.': return dot(); - default: return c; + uint8_t t; + + switch (t = *input->p++) { + case '<': t = relational('<', LE, SHL, SHL_EQ); break; + case '>': t = relational('>', GE, SHR, SHR_EQ); break; + case '&': t = logic('&', AND_EQ, AND); break; + case '|': t = logic('|', OR_EQ, OR); break; + case '=': t = follow('=', EQ, '='); break; + case '^': t = follow('=', XOR_EQ, '^'); break; + case '*': t = follow('=', MUL_EQ, '*'); break; + case '/': t = follow('=', DIV_EQ, '/'); break; + case '!': t = follow('=', NE, '!'); break; + case '-': t = minus(); break; + case '+': t = plus(); break; + case '.': t = dot(); break; } -} - -static int -skipspaces(void) -{ - - int c; - - while (isspace(c = nextchar())) { - if (c == '\n') - newline(); - } - return c; + tok2str(); + return t; } uint8_t next(void) { - int c; + char c; - backchar(c = skipspaces()); + if (!fill()) + return EOFTOK; + + while (isspace(*input->begin)) + ++input->begin; + c = *(input->p = input->begin); if (isalpha(c) || c == '_') { yytoken = iden(); @@ -666,9 +571,6 @@ next(void) yytoken = string(); } else if (c == '\'') { yytoken = character(); - } else if (c == EOF) { - strcpy(yytext, "EOF"); - yytoken = EOFTOK; } 
else { yytoken = operator(); } @@ -693,8 +595,15 @@ ahead(void) { int c; - backchar(c = skipspaces()); - +repeat: + if (!input) + return EOFTOK; + while (isspace(c = *input->begin)) + ; + if (c == '\0') { + fill(); + goto repeat; + } return c; } @@ -708,10 +617,9 @@ void discard(void) { extern jmp_buf recover; - int c; + char c; - c = yytoken; - do { + for (c = yytoken; ; c = *input->p++) { switch (safe) { case END_COMP: if (c == '}') @@ -730,7 +638,11 @@ discard(void) goto jump; break; } - } while ((c = nextchar()) != EOF); + if (*input->p == '\0') + fill(); + if (!input) + break; + } c = EOFTOK; jump: diff --git a/cc1/symbol.c b/cc1/symbol.c @@ -107,13 +107,10 @@ install(char *s, uint8_t ns) void ikeywords(void) { - extern void include(char *), define(char *), undef(char *); - extern void ifdef(char *), ifndef(char *), endif(char *); - static struct words { + static struct { char *str; uint8_t token, value; - void (*fun)(char *); - } ccwords[] = { + } *bp, buff[] = { {"auto", SCLASS, AUTO}, {"break", BREAK, BREAK}, {"_Bool", TYPE, BOOL}, @@ -149,36 +146,13 @@ ikeywords(void) {"volatile", TQUALIFIER, VOLATILE}, {"while", WHILE, WHILE}, {NULL, 0, 0}, - }, cppwords[] = { - {"include", 0, 0, include}, - {"define", 0, 0, define}, - {"undef", 0, 0, undef}, - {"ifdef", 0, 0, ifdef}, - {"ifndef", 0, 0, ifndef}, - {"endif", 0, 0, endif}, - {NULL, 0, 0} }; - static struct wordlist { - struct words *words; - uint8_t ns; - } wordlist [] = { - {ccwords, NS_IDEN}, - {cppwords, NS_CPP}, - {NULL, 0} - }; - struct wordlist *lp; - struct words *bp; Symbol *sym; - for (lp = wordlist; lp->words; ++lp) { - for (bp = lp->words; bp->str; ++bp) { - sym = install(bp->str, lp->ns); - sym->token = bp->token; - if (bp->fun) - sym->u.fun = bp->fun; - else - sym->u.token = bp->value; - } + for (bp = buff; bp->str; ++bp) { + sym = install(bp->str, NS_IDEN); + sym->token = bp->token; + sym->u.token = bp->value; } globalcnt = 0; }