lex.c (14092B)
1 static char sccsid[] = "@(#) ./cc1/lex.c"; 2 #include <assert.h> 3 #include <ctype.h> 4 #include <errno.h> 5 #include <limits.h> 6 #include <setjmp.h> 7 #include <stdio.h> 8 #include <stdlib.h> 9 #include <string.h> 10 11 #include <cstd.h> 12 #include "../inc/scc.h" 13 #include "cc1.h" 14 15 int yytoken; 16 struct yystype yylval; 17 char yytext[STRINGSIZ+3]; 18 unsigned short yylen; 19 int lexmode = CCMODE; 20 unsigned lineno; 21 char filenam[FILENAME_MAX]; 22 23 int namespace = NS_IDEN; 24 static int safe; 25 Input *input; 26 27 void 28 ilex(void) 29 { 30 static struct keyword keys[] = { 31 {"auto", SCLASS, AUTO}, 32 {"break", BREAK, BREAK}, 33 {"_Bool", TYPE, BOOL}, 34 {"__builtin_va_list", TYPE, VA_LIST}, 35 {"case", CASE, CASE}, 36 {"char", TYPE, CHAR}, 37 {"const", TQUALIFIER, CONST}, 38 {"continue", CONTINUE, CONTINUE}, 39 {"default", DEFAULT, DEFAULT}, 40 {"do", DO, DO}, 41 {"double", TYPE, DOUBLE}, 42 {"else", ELSE, ELSE}, 43 {"enum", TYPE, ENUM}, 44 {"extern", SCLASS, EXTERN}, 45 {"float", TYPE, FLOAT}, 46 {"for", FOR, FOR}, 47 {"goto", GOTO, GOTO}, 48 {"if", IF, IF}, 49 {"inline", TQUALIFIER, INLINE}, 50 {"int", TYPE, INT}, 51 {"long", TYPE, LONG}, 52 {"register", SCLASS, REGISTER}, 53 {"restrict", TQUALIFIER, RESTRICT}, 54 {"return", RETURN, RETURN}, 55 {"short", TYPE, SHORT}, 56 {"signed", TYPE, SIGNED}, 57 {"sizeof", SIZEOF, SIZEOF}, 58 {"static", SCLASS, STATIC}, 59 {"struct", TYPE, STRUCT}, 60 {"switch", SWITCH, SWITCH}, 61 {"typedef", SCLASS, TYPEDEF}, 62 {"union", TYPE, UNION}, 63 {"unsigned", TYPE, UNSIGNED}, 64 {"void", TYPE, VOID}, 65 {"volatile", TQUALIFIER, VOLATILE}, 66 {"while", WHILE, WHILE}, 67 {NULL, 0, 0}, 68 }; 69 keywords(keys, NS_KEYWORD); 70 } 71 72 int 73 setloc(char *fname, unsigned line) 74 { 75 size_t len; 76 77 if ((len = strlen(fname)) >= FILENAME_MAX) 78 die("file name too long: '%s'", fname); 79 memmove(filenam, fname, len); 80 filenam[len] = '\0'; 81 82 free(input->filenam); 83 input->filenam = xstrdup(fname); 84 lineno = input->lineno = line; 85 return 1; 86 } 87 88 int 89 addinput(char *fname, Symbol *hide, char *buffer) 90 { 91 FILE *fp; 92 char *extp; 93 unsigned flags; 94 int infileln; 95 Input *newip, *curip = input; 96 97 if (hide) { 98 /* this is a macro expansion */ 99 fp = NULL; 100 if (hide->hide == UCHAR_MAX) 101 die("Too many macro expansions"); 102 ++hide->hide; 103 flags = IMACRO; 104 } else if (fname) { 105 /* a new file */ 106 if ((fp = fopen(fname, "r")) == NULL) 107 return 0; 108 flags = IFILE; 109 if (curip && onlyheader) { 110 infileln = strlen(infile); 111 if (extp = strrchr(infile, '.')) 112 infileln -= strlen(extp); 113 printf("%.*s.o: %s %s\n", 114 infileln, infile, infile, fname); 115 } 116 } else { 117 /* reading from stdin */ 118 fp = stdin; 119 fname = "<stdin>"; 120 flags = ISTDIN; 121 } 122 123 newip = xmalloc(sizeof(*newip)); 124 125 if (!buffer) { 126 buffer = xmalloc(INPUTSIZ); 127 buffer[0] = '\0'; 128 } 129 130 if (curip) 131 curip->lineno = lineno; 132 133 newip->p = newip->begin = newip->line = buffer; 134 newip->filenam = NULL; 135 newip->lineno = 0; 136 newip->next = curip; 137 newip->fp = fp; 138 newip->hide = hide; 139 newip->flags = flags; 140 input = newip; 141 142 return setloc(fname, (curip) ? curip->lineno : newip->lineno); 143 } 144 145 void 146 delinput(void) 147 { 148 Input *ip = input; 149 Symbol *hide = ip->hide; 150 151 switch (ip->flags & ITYPE) { 152 case IFILE: 153 if (fclose(ip->fp)) 154 die("error: failed to read from input file '%s'", 155 ip->filenam); 156 break; 157 case IMACRO: 158 assert(hide->hide == 1); 159 --hide->hide; 160 break; 161 } 162 input = ip->next; 163 free(ip->filenam); 164 free(ip->line); 165 if (input) { 166 lineno = input->lineno; 167 strcpy(filenam, input->filenam); 168 } 169 } 170 171 static void 172 newline(void) 173 { 174 if (++lineno == 0) 175 die("error: input file '%s' too long", filenam); 176 } 177 178 /* 179 * Read the next character from the input file, counting number of lines 180 * and joining lines escaped with \ 181 */ 182 static int 183 readchar(void) 184 { 185 FILE *fp = input->fp; 186 int c; 187 188 repeat: 189 switch (c = getc(fp)) { 190 case '\\': 191 if ((c = getc(fp)) == '\n') { 192 newline(); 193 goto repeat; 194 } 195 ungetc(c, fp); 196 c = '\\'; 197 break; 198 case '\n': 199 newline(); 200 break; 201 default: 202 if (!isprint(c) && !ispunct(c)) 203 warn("invalid input character. The shame of UB is yours"); 204 break; 205 } 206 207 return c; 208 } 209 210 /* 211 * discard a C comment. This function is only called from readline 212 * because it is impossible to have a comment in a macro, because 213 * comments are always discarded before processing any cpp directive 214 */ 215 static void 216 comment(int type) 217 { 218 int c; 219 220 repeat: 221 while ((c = readchar()) != EOF && c != type) 222 /* nothing */; 223 224 if (c == EOF) { 225 errorp("unterminated comment"); 226 return; 227 } 228 229 if (type == '*' && (c = readchar()) != '/') 230 goto repeat; 231 } 232 233 /* 234 * readline is used to read a full logic line from a file. 235 * It discards comments and check that the line fits in 236 * the input buffer 237 */ 238 static int 239 readline(void) 240 { 241 char *bp, *lim; 242 int c, peekc = 0; 243 244 if (feof(input->fp)) { 245 input->flags |= IEOF; 246 return 0; 247 } 248 249 *input->line = '\0'; 250 lim = &input->line[INPUTSIZ-1]; 251 for (bp = input->line; bp < lim-1; *bp++ = c) { 252 c = (peekc) ? peekc : readchar(); 253 peekc = 0; 254 if (c == '\n' || c == EOF) 255 break; 256 if (c != '/') 257 continue; 258 259 /* check for /* or // */ 260 peekc = readchar(); 261 if (peekc != '*' && peekc != '/') 262 continue; 263 comment((peekc == '/') ? '\n' : '*'); 264 peekc = 0; 265 c = ' '; 266 } 267 268 input->begin = input->p = input->line; 269 if (bp == lim-1) { 270 errorp("line too long"); 271 --bp; 272 } 273 *bp++ = '\n'; 274 *bp = '\0'; 275 276 return 1; 277 } 278 279 /* 280 * moreinput gets more bytes to be passed to the lexer. 281 * It can take more bytes from macro expansions or 282 * directly reading from files. When a cpp directive 283 * is processed the line is discarded because it must not 284 * be passed to the lexer 285 */ 286 static int 287 moreinput(void) 288 { 289 int wasexpand = 0; 290 291 repeat: 292 if (!input) 293 return 0; 294 295 if (*input->p == '\0') { 296 if ((input->flags&ITYPE) == IMACRO) { 297 wasexpand = 1; 298 input->flags |= IEOF; 299 } 300 if (input->flags & IEOF) { 301 delinput(); 302 goto repeat; 303 } 304 if (!readline() || cpp()) { 305 *input->p = '\0'; 306 goto repeat; 307 } 308 } 309 310 if (onlycpp && !wasexpand) 311 ppragmaln(); 312 return 1; 313 } 314 315 static void 316 tok2str(void) 317 { 318 if ((yylen = input->p - input->begin) > INTIDENTSIZ) 319 error("token too big"); 320 memcpy(yytext, input->begin, yylen); 321 yytext[yylen] = '\0'; 322 input->begin = input->p; 323 } 324 325 static Symbol * 326 readint(char *s, int base, int sign, Symbol *sym) 327 { 328 Type *tp = sym->type; 329 struct limits *lim; 330 TUINT u, val, max; 331 int c; 332 333 lim = getlimits(tp); 334 max = lim->max.i; 335 if (*s == '0') 336 ++s; 337 if (toupper(*s) == 'X') 338 ++s; 339 340 for (u = 0; isxdigit(c = *s++); u = u*base + val) { 341 static char letters[] = "0123456789ABCDEF"; 342 val = strchr(letters, toupper(c)) - letters; 343 repeat: 344 if (u <= max/base && u*base <= max - val) 345 continue; 346 if (tp->prop & TSIGNED) { 347 if (tp == inttype) 348 tp = (base==10) ? longtype : uinttype; 349 else if (tp == longtype) 350 tp = (base==10) ? llongtype : ulongtype; 351 else 352 goto overflow; 353 } else { 354 if (tp == uinttype) 355 tp = (sign==UNSIGNED) ? ulongtype : longtype; 356 else if (tp == ulongtype) 357 tp = (sign==UNSIGNED) ? ullongtype : llongtype; 358 else 359 goto overflow; 360 } 361 sym->type = tp; 362 lim = getlimits(tp); 363 max = lim->max.i; 364 goto repeat; 365 } 366 367 if (tp->prop & TSIGNED) 368 sym->u.i = u; 369 else 370 sym->u.u = u; 371 372 return sym; 373 374 overflow: 375 errorp("overflow in integer constant"); 376 return sym; 377 } 378 379 static int 380 integer(char *s, int base) 381 { 382 Type *tp; 383 Symbol *sym; 384 unsigned size, sign; 385 386 for (size = sign = 0; ; ++input->p) { 387 switch (toupper(*input->p)) { 388 case 'L': 389 if (size == LLONG) 390 goto wrong_type; 391 size = (size == LONG) ? LLONG : LONG; 392 continue; 393 case 'U': 394 if (sign == UNSIGNED) 395 goto wrong_type; 396 sign = UNSIGNED; 397 continue; 398 default: 399 goto convert; 400 wrong_type: 401 error("invalid suffix in integer constant"); 402 } 403 } 404 405 convert: 406 tp = ctype(INT, sign, size); 407 sym = newsym(NS_IDEN, NULL); 408 sym->type = tp; 409 sym->flags |= SCONSTANT; 410 yylval.sym = readint(s, base, sign, sym); 411 return CONSTANT; 412 } 413 414 static char * 415 digits(int base) 416 { 417 char *p; 418 int c; 419 420 for (p = input->p; c = *p; ++p) { 421 switch (base) { 422 case 8: 423 if (!strchr("01234567", c)) 424 goto end; 425 break; 426 case 10: 427 if (!isdigit(c)) 428 goto end; 429 break; 430 case 16: 431 if (!isxdigit(c)) 432 goto end; 433 break; 434 } 435 } 436 end: 437 input->p = p; 438 tok2str(); 439 return yytext; 440 } 441 442 static int 443 number(void) 444 { 445 int base; 446 447 if (*input->p != '0') { 448 base = 10; 449 } else { 450 if (toupper(*++input->p) == 'X') { 451 ++input->p; 452 base = 16; 453 } else { 454 base = 8; 455 } 456 } 457 458 return integer(digits(base), base); 459 } 460 461 static int 462 escape(void) 463 { 464 int c, base; 465 466 switch (*++input->p) { 467 case 'a': return '\a'; 468 case 'f': return '\f'; 469 case 'n': return '\n'; 470 case 'r': return '\r'; 471 case 't': return '\t'; 472 case 'v': return '\v'; 473 case '"': return '"'; 474 case '\'': return '\''; 475 case '\\': return '\\'; 476 case '\?': return '\?'; 477 case 'u': 478 /* 479 * FIXME: universal constants are not correctly handled 480 */ 481 if (!isdigit(*++input->p)) 482 warn("incorrect digit for numerical character constant"); 483 base = 10; 484 break; 485 case 'x': 486 if (!isxdigit(*++input->p)) 487 warn("\\x used with no following hex digits"); 488 base = 16; 489 break; 490 case '0': 491 if (!strchr("01234567", *++input->p)) 492 warn("\\0 used with no following octal digits"); 493 base = 8; 494 break; 495 default: 496 warn("unknown escape sequence"); 497 return ' '; 498 } 499 errno = 0; 500 c = strtoul(input->p, &input->p, base); 501 if (errno || c > 255) 502 warn("character constant out of range"); 503 --input->p; 504 return c; 505 } 506 507 static int 508 character(void) 509 { 510 int c; 511 Symbol *sym; 512 513 if ((c = *++input->p) == '\\') 514 c = escape(); 515 else 516 c = *input->p; 517 ++input->p; 518 if (*input->p != '\'') 519 errorp("invalid character constant"); 520 else 521 ++input->p; 522 523 sym = newsym(NS_IDEN, NULL); 524 sym->u.i = c; 525 sym->type = inttype; 526 yylval.sym = sym; 527 tok2str(); 528 return CONSTANT; 529 } 530 531 static int 532 string(void) 533 { 534 char *bp = yytext; 535 int c; 536 537 *bp++ = '"'; 538 for (++input->p; (c = *input->p) != '"'; ++input->p) { 539 if (c == '\0') { 540 errorp("missing terminating '\"' character"); 541 break; 542 } 543 if (c == '\\') 544 c = escape(); 545 if (bp == &yytext[STRINGSIZ+1]) { 546 /* TODO: proper error handling here */ 547 error("string too long"); 548 } 549 *bp++ = c; 550 } 551 552 input->begin = ++input->p; 553 *bp = '\0'; 554 555 yylen = bp - yytext + 1; 556 yylval.sym = newstring(yytext+1, yylen-1); 557 *bp++ = '"'; 558 *bp = '\0'; 559 return STRING; 560 } 561 562 static int 563 iden(void) 564 { 565 Symbol *sym; 566 char *p, *begin; 567 568 begin = input->p; 569 for (p = begin; isalnum(*p) || *p == '_'; ++p) 570 /* nothing */; 571 input->p = p; 572 tok2str(); 573 if ((sym = lookup(NS_CPP, yytext, NOALLOC)) != NULL) { 574 if (!disexpand && !sym->hide && expand(begin, sym)) 575 return next(); 576 } 577 sym = lookup(namespace, yytext, ALLOC); 578 yylval.sym = sym; 579 if (sym->flags & SCONSTANT) 580 return CONSTANT; 581 if (sym->token != IDEN) 582 yylval.token = sym->u.token; 583 return sym->token; 584 } 585 586 static int 587 follow(int expect, int ifyes, int ifno) 588 { 589 if (*input->p++ == expect) 590 return ifyes; 591 --input->p; 592 return ifno; 593 } 594 595 static int 596 minus(void) 597 { 598 switch (*input->p++) { 599 case '-': return DEC; 600 case '>': return INDIR; 601 case '=': return SUB_EQ; 602 default: --input->p; return '-'; 603 } 604 } 605 606 static int 607 plus(void) 608 { 609 switch (*input->p++) { 610 case '+': return INC; 611 case '=': return ADD_EQ; 612 default: --input->p; return '+'; 613 } 614 } 615 616 static int 617 relational(int op, int equal, int shift, int assig) 618 { 619 int c; 620 621 if ((c = *input->p++) == '=') 622 return equal; 623 if (c == op) 624 return follow('=', assig, shift); 625 --input->p; 626 return op; 627 } 628 629 static int 630 logic(int op, int equal, int logic) 631 { 632 int c; 633 634 if ((c = *input->p++) == '=') 635 return equal; 636 if (c == op) 637 return logic; 638 --input->p; 639 return op; 640 } 641 642 static int 643 dot(void) 644 { 645 int c; 646 647 if ((c = *input->p) != '.') 648 return '.'; 649 if ((c = *++input->p) != '.') 650 error("incorrect token '..'"); 651 ++input->p; 652 return ELLIPSIS; 653 } 654 655 static int 656 operator(void) 657 { 658 int t; 659 660 switch (t = *input->p++) { 661 case '<': t = relational('<', LE, SHL, SHL_EQ); break; 662 case '>': t = relational('>', GE, SHR, SHR_EQ); break; 663 case '&': t = logic('&', AND_EQ, AND); break; 664 case '|': t = logic('|', OR_EQ, OR); break; 665 case '=': t = follow('=', EQ, '='); break; 666 case '^': t = follow('=', XOR_EQ, '^'); break; 667 case '*': t = follow('=', MUL_EQ, '*'); break; 668 case '/': t = follow('=', DIV_EQ, '/'); break; 669 case '!': t = follow('=', NE, '!'); break; 670 case '#': t = follow('#', '$', '#'); break; 671 case '-': t = minus(); break; 672 case '+': t = plus(); break; 673 case '.': t = dot(); break; 674 } 675 tok2str(); 676 return t; 677 } 678 679 /* TODO: Ensure that namespace is NS_IDEN after a recovery */ 680 681 /* 682 * skip all the spaces until the next token. When we are in 683 * CPPMODE \n is not considered a whitespace 684 */ 685 static int 686 skipspaces(void) 687 { 688 int c; 689 690 for (;;) { 691 switch (c = *input->p) { 692 case '\n': 693 if (lexmode == CPPMODE) 694 goto return_byte; 695 ++input->p; 696 case '\0': 697 if (!moreinput()) 698 return EOF; 699 break; 700 case ' ': 701 case '\t': 702 case '\v': 703 case '\r': 704 case '\f': 705 ++input->p; 706 break; 707 default: 708 goto return_byte; 709 } 710 } 711 712 return_byte: 713 input->begin = input->p; 714 return c; 715 } 716 717 int 718 next(void) 719 { 720 int c; 721 722 if ((c = skipspaces()) == EOF) 723 yytoken = EOFTOK; 724 else if (isalpha(c) || c == '_') 725 yytoken = iden(); 726 else if (isdigit(c)) 727 yytoken = number(); 728 else if (c == '"') 729 yytoken = string(); 730 else if (c == '\'') 731 yytoken = character(); 732 else 733 yytoken = operator(); 734 735 if (yytoken == EOF) { 736 strcpy(yytext, "<EOF>"); 737 if (cppctx) 738 errorp("#endif expected"); 739 } 740 741 DBG("TOKEN %s", yytext); 742 return yytoken; 743 } 744 745 void 746 expect(int tok) 747 { 748 if (yytoken != tok) { 749 if (isgraph(tok)) 750 errorp("expected '%c' before '%s'", tok, yytext); 751 else 752 errorp("unexpected '%s'", yytext); 753 } else { 754 next(); 755 } 756 } 757 758 int 759 ahead(void) 760 { 761 skipspaces(); 762 return *input->begin; 763 } 764 765 void 766 setsafe(int type) 767 { 768 safe = type; 769 } 770 771 void 772 discard(void) 773 { 774 extern jmp_buf recover; 775 int c; 776 777 input->begin = input->p; 778 for (c = yytoken; ; c = *input->begin++) { 779 switch (safe) { 780 case END_COMP: 781 if (c == '}') 782 goto jump; 783 goto semicolon; 784 case END_COND: 785 if (c == ')') 786 goto jump; 787 break; 788 case END_LDECL: 789 if (c == ',') 790 goto jump; 791 case END_DECL: 792 semicolon: 793 if (c == ';') 794 goto jump; 795 break; 796 } 797 if (c == '\0' && !moreinput()) 798 exit(1); 799 } 800 jump: 801 yytoken = c; 802 longjmp(recover, 1); 803 }