lex.c (13533B)
1 /* $OpenBSD: lex.c,v 1.12 2011/09/28 19:27:18 millert Exp $ */ 2 /**************************************************************** 3 Copyright (C) Lucent Technologies 1997 4 All Rights Reserved 5 6 Permission to use, copy, modify, and distribute this software and 7 its documentation for any purpose and without fee is hereby 8 granted, provided that the above copyright notice appear in all 9 copies and that both that the copyright notice and this 10 permission notice and warranty disclaimer appear in supporting 11 documentation, and that the name Lucent Technologies or any of 12 its entities not be used in advertising or publicity pertaining 13 to distribution of the software without specific, written prior 14 permission. 15 16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, 17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. 18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY 19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF 23 THIS SOFTWARE. 24 ****************************************************************/ 25 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 #include <ctype.h> 30 #include "awk.h" 31 #include "ytab.h" 32 33 extern YYSTYPE yylval; 34 extern int infunc; 35 36 int lineno = 1; 37 int bracecnt = 0; 38 int brackcnt = 0; 39 int parencnt = 0; 40 41 typedef struct Keyword { 42 const char *word; 43 int sub; 44 int type; 45 } Keyword; 46 47 Keyword keywords[] ={ /* keep sorted: binary searched */ 48 { "BEGIN", XBEGIN, XBEGIN }, 49 { "END", XEND, XEND }, 50 { "NF", VARNF, VARNF }, 51 { "and", FAND, BLTIN }, 52 { "atan2", FATAN, BLTIN }, 53 { "break", BREAK, BREAK }, 54 { "close", CLOSE, CLOSE }, 55 { "compl", FCOMPL, BLTIN }, 56 { "continue", CONTINUE, CONTINUE }, 57 { "cos", FCOS, BLTIN }, 58 { "delete", DELETE, DELETE }, 59 { "do", DO, DO }, 60 { "else", ELSE, ELSE }, 61 { "exit", EXIT, EXIT }, 62 { "exp", FEXP, BLTIN }, 63 { "fflush", FFLUSH, BLTIN }, 64 { "for", FOR, FOR }, 65 { "func", FUNC, FUNC }, 66 { "function", FUNC, FUNC }, 67 { "getline", GETLINE, GETLINE }, 68 { "gsub", GSUB, GSUB }, 69 { "if", IF, IF }, 70 { "in", IN, IN }, 71 { "index", INDEX, INDEX }, 72 { "int", FINT, BLTIN }, 73 { "length", FLENGTH, BLTIN }, 74 { "log", FLOG, BLTIN }, 75 { "lshift", FLSHIFT, BLTIN }, 76 { "match", MATCHFCN, MATCHFCN }, 77 { "next", NEXT, NEXT }, 78 { "nextfile", NEXTFILE, NEXTFILE }, 79 { "or", FFOR, BLTIN }, 80 { "print", PRINT, PRINT }, 81 { "printf", PRINTF, PRINTF }, 82 { "rand", FRAND, BLTIN }, 83 { "return", RETURN, RETURN }, 84 { "rshift", FRSHIFT, BLTIN }, 85 { "sin", FSIN, BLTIN }, 86 { "split", SPLIT, SPLIT }, 87 { "sprintf", SPRINTF, SPRINTF }, 88 { "sqrt", FSQRT, BLTIN }, 89 { "srand", FSRAND, BLTIN }, 90 { "sub", SUB, SUB }, 91 { "substr", SUBSTR, SUBSTR }, 92 { "system", FSYSTEM, BLTIN }, 93 { "tolower", FTOLOWER, BLTIN }, 94 { "toupper", FTOUPPER, BLTIN }, 95 { "while", WHILE, WHILE }, 96 { "xor", FXOR, BLTIN }, 97 }; 98 99 #define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); } 100 101 int peek(void); 102 int gettok(char **, int *); 103 int binsearch(char *, Keyword *, int); 104 105 int peek(void) 106 { 107 int c = input(); 108 unput(c); 109 return c; 110 } 111 112 int gettok(char **pbuf, int *psz) /* get next input token */ 113 { 114 int c, retc; 115 char *buf = *pbuf; 116 int sz = *psz; 117 char *bp = buf; 118 119 c = input(); 120 if (c == 0) 121 return 0; 122 buf[0] = c; 123 buf[1] = 0; 124 if (!isalnum(c) && c != '.' && c != '_') 125 return c; 126 127 *bp++ = c; 128 if (isalpha(c) || c == '_') { /* it's a varname */ 129 for ( ; (c = input()) != 0; ) { 130 if (bp-buf >= sz) 131 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 132 FATAL( "out of space for name %.10s...", buf ); 133 if (isalnum(c) || c == '_') 134 *bp++ = c; 135 else { 136 *bp = 0; 137 unput(c); 138 break; 139 } 140 } 141 *bp = 0; 142 retc = 'a'; /* alphanumeric */ 143 } else { /* maybe it's a number, but could be . */ 144 char *rem; 145 /* read input until can't be a number */ 146 for ( ; (c = input()) != 0; ) { 147 if (bp-buf >= sz) 148 if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok")) 149 FATAL( "out of space for number %.10s...", buf ); 150 if (isdigit(c) || c == 'e' || c == 'E' 151 || c == '.' || c == '+' || c == '-') 152 *bp++ = c; 153 else { 154 unput(c); 155 break; 156 } 157 } 158 *bp = 0; 159 strtod(buf, &rem); /* parse the number */ 160 if (rem == buf) { /* it wasn't a valid number at all */ 161 buf[1] = 0; /* return one character as token */ 162 retc = buf[0]; /* character is its own type */ 163 unputstr(rem+1); /* put rest back for later */ 164 } else { /* some prefix was a number */ 165 unputstr(rem); /* put rest back for later */ 166 rem[0] = 0; /* truncate buf after number part */ 167 retc = '0'; /* type is number */ 168 } 169 } 170 *pbuf = buf; 171 *psz = sz; 172 return retc; 173 } 174 175 int word(char *); 176 int string(void); 177 int regexpr(void); 178 int sc = 0; /* 1 => return a } right now */ 179 int reg = 0; /* 1 => return a REGEXPR now */ 180 181 int yylex(void) 182 { 183 int c; 184 static char *buf = 0; 185 static int bufsize = 5; /* BUG: setting this small causes core dump! */ 186 187 if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL) 188 FATAL( "out of space in yylex" ); 189 if (sc) { 190 sc = 0; 191 RET('}'); 192 } 193 if (reg) { 194 reg = 0; 195 return regexpr(); 196 } 197 for (;;) { 198 c = gettok(&buf, &bufsize); 199 if (c == 0) 200 return 0; 201 if (isalpha(c) || c == '_') 202 return word(buf); 203 if (isdigit(c)) { 204 yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab); 205 /* should this also have STR set? */ 206 RET(NUMBER); 207 } 208 209 yylval.i = c; 210 switch (c) { 211 case '\n': /* {EOL} */ 212 RET(NL); 213 case '\r': /* assume \n is coming */ 214 case ' ': /* {WS}+ */ 215 case '\t': 216 break; 217 case '#': /* #.* strip comments */ 218 while ((c = input()) != '\n' && c != 0) 219 ; 220 unput(c); 221 break; 222 case ';': 223 RET(';'); 224 case '\\': 225 if (peek() == '\n') { 226 input(); 227 } else if (peek() == '\r') { 228 input(); input(); /* \n */ 229 lineno++; 230 } else { 231 RET(c); 232 } 233 break; 234 case '&': 235 if (peek() == '&') { 236 input(); RET(AND); 237 } else 238 RET('&'); 239 case '|': 240 if (peek() == '|') { 241 input(); RET(BOR); 242 } else 243 RET('|'); 244 case '!': 245 if (peek() == '=') { 246 input(); yylval.i = NE; RET(NE); 247 } else if (peek() == '~') { 248 input(); yylval.i = NOTMATCH; RET(MATCHOP); 249 } else 250 RET(NOT); 251 case '~': 252 yylval.i = MATCH; 253 RET(MATCHOP); 254 case '<': 255 if (peek() == '=') { 256 input(); yylval.i = LE; RET(LE); 257 } else { 258 yylval.i = LT; RET(LT); 259 } 260 case '=': 261 if (peek() == '=') { 262 input(); yylval.i = EQ; RET(EQ); 263 } else { 264 yylval.i = ASSIGN; RET(ASGNOP); 265 } 266 case '>': 267 if (peek() == '=') { 268 input(); yylval.i = GE; RET(GE); 269 } else if (peek() == '>') { 270 input(); yylval.i = APPEND; RET(APPEND); 271 } else { 272 yylval.i = GT; RET(GT); 273 } 274 case '+': 275 if (peek() == '+') { 276 input(); yylval.i = INCR; RET(INCR); 277 } else if (peek() == '=') { 278 input(); yylval.i = ADDEQ; RET(ASGNOP); 279 } else 280 RET('+'); 281 case '-': 282 if (peek() == '-') { 283 input(); yylval.i = DECR; RET(DECR); 284 } else if (peek() == '=') { 285 input(); yylval.i = SUBEQ; RET(ASGNOP); 286 } else 287 RET('-'); 288 case '*': 289 if (peek() == '=') { /* *= */ 290 input(); yylval.i = MULTEQ; RET(ASGNOP); 291 } else if (peek() == '*') { /* ** or **= */ 292 input(); /* eat 2nd * */ 293 if (peek() == '=') { 294 input(); yylval.i = POWEQ; RET(ASGNOP); 295 } else { 296 RET(POWER); 297 } 298 } else 299 RET('*'); 300 case '/': 301 RET('/'); 302 case '%': 303 if (peek() == '=') { 304 input(); yylval.i = MODEQ; RET(ASGNOP); 305 } else 306 RET('%'); 307 case '^': 308 if (peek() == '=') { 309 input(); yylval.i = POWEQ; RET(ASGNOP); 310 } else 311 RET(POWER); 312 313 case '$': 314 /* BUG: awkward, if not wrong */ 315 c = gettok(&buf, &bufsize); 316 if (isalpha(c)) { 317 if (strcmp(buf, "NF") == 0) { /* very special */ 318 unputstr("(NF)"); 319 RET(INDIRECT); 320 } 321 c = peek(); 322 if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) { 323 unputstr(buf); 324 RET(INDIRECT); 325 } 326 yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab); 327 RET(IVAR); 328 } else if (c == 0) { /* */ 329 SYNTAX( "unexpected end of input after $" ); 330 RET(';'); 331 } else { 332 unputstr(buf); 333 RET(INDIRECT); 334 } 335 336 case '}': 337 if (--bracecnt < 0) 338 SYNTAX( "extra }" ); 339 sc = 1; 340 RET(';'); 341 case ']': 342 if (--brackcnt < 0) 343 SYNTAX( "extra ]" ); 344 RET(']'); 345 case ')': 346 if (--parencnt < 0) 347 SYNTAX( "extra )" ); 348 RET(')'); 349 case '{': 350 bracecnt++; 351 RET('{'); 352 case '[': 353 brackcnt++; 354 RET('['); 355 case '(': 356 parencnt++; 357 RET('('); 358 359 case '"': 360 return string(); /* BUG: should be like tran.c ? */ 361 362 default: 363 RET(c); 364 } 365 } 366 } 367 368 int string(void) 369 { 370 int c, n; 371 char *s, *bp; 372 static char *buf = 0; 373 static int bufsz = 500; 374 375 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 376 FATAL("out of space for strings"); 377 for (bp = buf; (c = input()) != '"'; ) { 378 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string")) 379 FATAL("out of space for string %.10s...", buf); 380 switch (c) { 381 case '\n': 382 case '\r': 383 case 0: 384 SYNTAX( "non-terminated string %.10s...", buf ); 385 lineno++; 386 if (c == 0) /* hopeless */ 387 FATAL( "giving up" ); 388 break; 389 case '\\': 390 c = input(); 391 switch (c) { 392 case '"': *bp++ = '"'; break; 393 case 'n': *bp++ = '\n'; break; 394 case 't': *bp++ = '\t'; break; 395 case 'f': *bp++ = '\f'; break; 396 case 'r': *bp++ = '\r'; break; 397 case 'b': *bp++ = '\b'; break; 398 case 'v': *bp++ = '\v'; break; 399 case 'a': *bp++ = '\007'; break; 400 case '\\': *bp++ = '\\'; break; 401 402 case '0': case '1': case '2': /* octal: \d \dd \ddd */ 403 case '3': case '4': case '5': case '6': case '7': 404 n = c - '0'; 405 if ((c = peek()) >= '0' && c < '8') { 406 n = 8 * n + input() - '0'; 407 if ((c = peek()) >= '0' && c < '8') 408 n = 8 * n + input() - '0'; 409 } 410 *bp++ = n; 411 break; 412 413 case 'x': /* hex \x0-9a-fA-F + */ 414 { char xbuf[100], *px; 415 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) { 416 if (isdigit(c) 417 || (c >= 'a' && c <= 'f') 418 || (c >= 'A' && c <= 'F')) 419 *px++ = c; 420 else 421 break; 422 } 423 *px = 0; 424 unput(c); 425 sscanf(xbuf, "%x", (unsigned int *) &n); 426 *bp++ = n; 427 break; 428 } 429 430 default: 431 *bp++ = c; 432 break; 433 } 434 break; 435 default: 436 *bp++ = c; 437 break; 438 } 439 } 440 *bp = 0; 441 s = tostring(buf); 442 *bp++ = ' '; *bp++ = 0; 443 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab); 444 RET(STRING); 445 } 446 447 448 int binsearch(char *w, Keyword *kp, int n) 449 { 450 int cond, low, mid, high; 451 452 low = 0; 453 high = n - 1; 454 while (low <= high) { 455 mid = (low + high) / 2; 456 if ((cond = strcmp(w, kp[mid].word)) < 0) 457 high = mid - 1; 458 else if (cond > 0) 459 low = mid + 1; 460 else 461 return mid; 462 } 463 return -1; 464 } 465 466 int word(char *w) 467 { 468 Keyword *kp; 469 int c, n; 470 471 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0])); 472 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */ 473 kp = keywords + n; 474 if (n != -1) { /* found in table */ 475 yylval.i = kp->sub; 476 switch (kp->type) { /* special handling */ 477 case BLTIN: 478 if (kp->sub == FSYSTEM && safe) 479 SYNTAX( "system is unsafe" ); 480 RET(kp->type); 481 case FUNC: 482 if (infunc) 483 SYNTAX( "illegal nested function" ); 484 RET(kp->type); 485 case RETURN: 486 if (!infunc) 487 SYNTAX( "return not in function" ); 488 RET(kp->type); 489 case VARNF: 490 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab); 491 RET(VARNF); 492 default: 493 RET(kp->type); 494 } 495 } 496 c = peek(); /* look for '(' */ 497 if (c != '(' && infunc && (n=isarg(w)) >= 0) { 498 yylval.i = n; 499 RET(ARG); 500 } else { 501 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab); 502 if (c == '(') { 503 RET(CALL); 504 } else { 505 RET(VAR); 506 } 507 } 508 } 509 510 void startreg(void) /* next call to yylex will return a regular expression */ 511 { 512 reg = 1; 513 } 514 515 int regexpr(void) 516 { 517 int c, openclass = 0; 518 static char *buf = 0; 519 static int bufsz = 500; 520 char *bp; 521 522 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL) 523 FATAL("out of space for rex expr"); 524 bp = buf; 525 for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) { 526 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr")) 527 FATAL("out of space for reg expr %.10s...", buf); 528 if (c == '\n') { 529 SYNTAX( "newline in regular expression %.10s...", buf ); 530 unput('\n'); 531 break; 532 } else if (c == '\\') { 533 *bp++ = '\\'; 534 *bp++ = input(); 535 } else { 536 if (c == '[') 537 openclass = 1; 538 else if (c == ']') 539 openclass = 0; 540 *bp++ = c; 541 } 542 } 543 *bp = 0; 544 if (c == 0) 545 SYNTAX("non-terminated regular expression %.10s...", buf); 546 yylval.s = tostring(buf); 547 unput('/'); 548 RET(REGEXPR); 549 } 550 551 /* low-level lexical stuff, sort of inherited from lex */ 552 553 char ebuf[300]; 554 char *ep = ebuf; 555 char yysbuf[100]; /* pushback buffer */ 556 char *yysptr = yysbuf; 557 FILE *yyin = 0; 558 559 int input(void) /* get next lexical input character */ 560 { 561 int c; 562 extern char *lexprog; 563 564 if (yysptr > yysbuf) 565 c = (uschar)*--yysptr; 566 else if (lexprog != NULL) { /* awk '...' */ 567 if ((c = (uschar)*lexprog) != 0) 568 lexprog++; 569 } else /* awk -f ... */ 570 c = pgetc(); 571 if (c == '\n') 572 lineno++; 573 else if (c == EOF) 574 c = 0; 575 if (ep >= ebuf + sizeof ebuf) 576 ep = ebuf; 577 return *ep++ = c; 578 } 579 580 void unput(int c) /* put lexical character back on input */ 581 { 582 if (c == '\n') 583 lineno--; 584 if (yysptr >= yysbuf + sizeof(yysbuf)) 585 FATAL("pushed back too much: %.20s...", yysbuf); 586 *yysptr++ = c; 587 if (--ep < ebuf) 588 ep = ebuf + sizeof(ebuf) - 1; 589 } 590 591 void unputstr(const char *s) /* put a string back on input */ 592 { 593 int i; 594 595 for (i = strlen(s)-1; i >= 0; i--) 596 unput(s[i]); 597 }