morpheus-base

morpheus base system
git clone git://git.2f30.org/morpheus-base
Log | Files | Refs

lex.c (13533B)


      1 /*	$OpenBSD: lex.c,v 1.12 2011/09/28 19:27:18 millert Exp $	*/
      2 /****************************************************************
      3 Copyright (C) Lucent Technologies 1997
      4 All Rights Reserved
      5 
      6 Permission to use, copy, modify, and distribute this software and
      7 its documentation for any purpose and without fee is hereby
      8 granted, provided that the above copyright notice appear in all
      9 copies and that both that the copyright notice and this
     10 permission notice and warranty disclaimer appear in supporting
     11 documentation, and that the name Lucent Technologies or any of
     12 its entities not be used in advertising or publicity pertaining
     13 to distribution of the software without specific, written prior
     14 permission.
     15 
     16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
     17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
     18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
     19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
     21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
     22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
     23 THIS SOFTWARE.
     24 ****************************************************************/
     25 
     26 #include <stdio.h>
     27 #include <stdlib.h>
     28 #include <string.h>
     29 #include <ctype.h>
     30 #include "awk.h"
     31 #include "ytab.h"
     32 
     33 extern YYSTYPE	yylval;	/* token value for the parser; yylex(), word(), string() and regexpr() store into it */
     34 extern int	infunc;	/* defined elsewhere; word() and yylex() test it (presumably nonzero while inside a function definition -- confirm) */
     35 
     36 int	lineno	= 1;	/* current input line: input() adds one per '\n' read, unput() takes it back */
     37 int	bracecnt = 0;	/* '{' seen minus '}' seen; yylex() reports "extra }" when it goes negative */
     38 int	brackcnt  = 0;	/* '[' seen minus ']' seen; yylex() reports "extra ]" when it goes negative */
     39 int	parencnt = 0;	/* '(' seen minus ')' seen; yylex() reports "extra )" when it goes negative */
     40 
     41 typedef struct Keyword {	/* one row of the keywords[] table */
     42 	const char *word;	/* spelling of the keyword; the key binsearch() compares against */
     43 	int	sub;	/* value word() stores in yylval.i when this keyword is matched */
     44 	int	type;	/* token code word() returns for this keyword */
     45 } Keyword;
     46 
     47 Keyword keywords[] ={	/* keep sorted in strcmp() order (upper case before lower case): word() looks names up with binsearch() */
     48 	{ "BEGIN",	XBEGIN,		XBEGIN },
     49 	{ "END",	XEND,		XEND },
     50 	{ "NF",		VARNF,		VARNF },
     51 	{ "and",	FAND,		BLTIN },
     52 	{ "atan2",	FATAN,		BLTIN },
     53 	{ "break",	BREAK,		BREAK },
     54 	{ "close",	CLOSE,		CLOSE },
     55 	{ "compl",	FCOMPL,		BLTIN },
     56 	{ "continue",	CONTINUE,	CONTINUE },
     57 	{ "cos",	FCOS,		BLTIN },
     58 	{ "delete",	DELETE,		DELETE },
     59 	{ "do",		DO,		DO },
     60 	{ "else",	ELSE,		ELSE },
     61 	{ "exit",	EXIT,		EXIT },
     62 	{ "exp",	FEXP,		BLTIN },
     63 	{ "fflush",	FFLUSH,		BLTIN },
     64 	{ "for",	FOR,		FOR },
     65 	{ "func",	FUNC,		FUNC },	/* same entry as "function" */
     66 	{ "function",	FUNC,		FUNC },
     67 	{ "getline",	GETLINE,	GETLINE },
     68 	{ "gsub",	GSUB,		GSUB },
     69 	{ "if",		IF,		IF },
     70 	{ "in",		IN,		IN },
     71 	{ "index",	INDEX,		INDEX },
     72 	{ "int",	FINT,		BLTIN },
     73 	{ "length",	FLENGTH,	BLTIN },
     74 	{ "log",	FLOG,		BLTIN },
     75 	{ "lshift",	FLSHIFT,	BLTIN },
     76 	{ "match",	MATCHFCN,	MATCHFCN },
     77 	{ "next",	NEXT,		NEXT },
     78 	{ "nextfile",	NEXTFILE,	NEXTFILE },
     79 	{ "or",		FFOR,		BLTIN },
     80 	{ "print",	PRINT,		PRINT },
     81 	{ "printf",	PRINTF,		PRINTF },
     82 	{ "rand",	FRAND,		BLTIN },
     83 	{ "return",	RETURN,		RETURN },
     84 	{ "rshift",	FRSHIFT,	BLTIN },
     85 	{ "sin",	FSIN,		BLTIN },
     86 	{ "split",	SPLIT,		SPLIT },
     87 	{ "sprintf",	SPRINTF,	SPRINTF },
     88 	{ "sqrt",	FSQRT,		BLTIN },
     89 	{ "srand",	FSRAND,		BLTIN },
     90 	{ "sub",	SUB,		SUB },
     91 	{ "substr",	SUBSTR,		SUBSTR },
     92 	{ "system",	FSYSTEM,	BLTIN },	/* word() rejects this one when `safe` is set */
     93 	{ "tolower",	FTOLOWER,	BLTIN },
     94 	{ "toupper",	FTOUPPER,	BLTIN },
     95 	{ "while",	WHILE,		WHILE },
     96 	{ "xor",	FXOR,		BLTIN },
     97 };
     98 
     99 #define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }	/* return token x from the enclosing function, printing its name first when dbg is set (x is then evaluated twice) */
    100 
    101 int peek(void);	/* look at the next input character without consuming it */
    102 int gettok(char **, int *);	/* read one raw token into a growable buffer */
    103 int binsearch(char *, Keyword *, int);	/* index of a word in a sorted Keyword table, or -1 */
    104 
    105 int peek(void)
    106 {
    107 	int c = input();
    108 	unput(c);
    109 	return c;
    110 }
    111 
    112 int gettok(char **pbuf, int *psz)	/* get next input token */
    113 {
    114 	int c, retc;
    115 	char *buf = *pbuf;
    116 	int sz = *psz;
    117 	char *bp = buf;
    118 
    119 	c = input();
    120 	if (c == 0)
    121 		return 0;
    122 	buf[0] = c;
    123 	buf[1] = 0;
    124 	if (!isalnum(c) && c != '.' && c != '_')
    125 		return c;
    126 
    127 	*bp++ = c;
    128 	if (isalpha(c) || c == '_') {	/* it's a varname */
    129 		for ( ; (c = input()) != 0; ) {
    130 			if (bp-buf >= sz)
    131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
    132 					FATAL( "out of space for name %.10s...", buf );
    133 			if (isalnum(c) || c == '_')
    134 				*bp++ = c;
    135 			else {
    136 				*bp = 0;
    137 				unput(c);
    138 				break;
    139 			}
    140 		}
    141 		*bp = 0;
    142 		retc = 'a';	/* alphanumeric */
    143 	} else {	/* maybe it's a number, but could be . */
    144 		char *rem;
    145 		/* read input until can't be a number */
    146 		for ( ; (c = input()) != 0; ) {
    147 			if (bp-buf >= sz)
    148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
    149 					FATAL( "out of space for number %.10s...", buf );
    150 			if (isdigit(c) || c == 'e' || c == 'E' 
    151 			  || c == '.' || c == '+' || c == '-')
    152 				*bp++ = c;
    153 			else {
    154 				unput(c);
    155 				break;
    156 			}
    157 		}
    158 		*bp = 0;
    159 		strtod(buf, &rem);	/* parse the number */
    160 		if (rem == buf) {	/* it wasn't a valid number at all */
    161 			buf[1] = 0;	/* return one character as token */
    162 			retc = buf[0];	/* character is its own type */
    163 			unputstr(rem+1); /* put rest back for later */
    164 		} else {	/* some prefix was a number */
    165 			unputstr(rem);	/* put rest back for later */
    166 			rem[0] = 0;	/* truncate buf after number part */
    167 			retc = '0';	/* type is number */
    168 		}
    169 	}
    170 	*pbuf = buf;
    171 	*psz = sz;
    172 	return retc;
    173 }
    174 
    175 int	word(char *);	/* classify a name: keyword, function argument, call or variable */
    176 int	string(void);	/* scan the rest of a "..." string constant */
    177 int	regexpr(void);	/* scan the rest of a /.../ regular expression */
    178 int	sc	= 0;	/* 1 => return a } right now; set by yylex() after it returned ';' in place of a '}' */
    179 int	reg	= 0;	/* 1 => return a REGEXPR now; set by startreg(), cleared by yylex() */
    180 
    181 int yylex(void)
    182 {
    183 	int c;
    184 	static char *buf = 0;
    185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
    186 
    187 	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
    188 		FATAL( "out of space in yylex" );
    189 	if (sc) {
    190 		sc = 0;
    191 		RET('}');
    192 	}
    193 	if (reg) {
    194 		reg = 0;
    195 		return regexpr();
    196 	}
    197 	for (;;) {
    198 		c = gettok(&buf, &bufsize);
    199 		if (c == 0)
    200 			return 0;
    201 		if (isalpha(c) || c == '_')
    202 			return word(buf);
    203 		if (isdigit(c)) {
    204 			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
    205 			/* should this also have STR set? */
    206 			RET(NUMBER);
    207 		}
    208 	
    209 		yylval.i = c;
    210 		switch (c) {
    211 		case '\n':	/* {EOL} */
    212 			RET(NL);
    213 		case '\r':	/* assume \n is coming */
    214 		case ' ':	/* {WS}+ */
    215 		case '\t':
    216 			break;
    217 		case '#':	/* #.* strip comments */
    218 			while ((c = input()) != '\n' && c != 0)
    219 				;
    220 			unput(c);
    221 			break;
    222 		case ';':
    223 			RET(';');
    224 		case '\\':
    225 			if (peek() == '\n') {
    226 				input();
    227 			} else if (peek() == '\r') {
    228 				input(); input();	/* \n */
    229 				lineno++;
    230 			} else {
    231 				RET(c);
    232 			}
    233 			break;
    234 		case '&':
    235 			if (peek() == '&') {
    236 				input(); RET(AND);
    237 			} else 
    238 				RET('&');
    239 		case '|':
    240 			if (peek() == '|') {
    241 				input(); RET(BOR);
    242 			} else
    243 				RET('|');
    244 		case '!':
    245 			if (peek() == '=') {
    246 				input(); yylval.i = NE; RET(NE);
    247 			} else if (peek() == '~') {
    248 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
    249 			} else
    250 				RET(NOT);
    251 		case '~':
    252 			yylval.i = MATCH;
    253 			RET(MATCHOP);
    254 		case '<':
    255 			if (peek() == '=') {
    256 				input(); yylval.i = LE; RET(LE);
    257 			} else {
    258 				yylval.i = LT; RET(LT);
    259 			}
    260 		case '=':
    261 			if (peek() == '=') {
    262 				input(); yylval.i = EQ; RET(EQ);
    263 			} else {
    264 				yylval.i = ASSIGN; RET(ASGNOP);
    265 			}
    266 		case '>':
    267 			if (peek() == '=') {
    268 				input(); yylval.i = GE; RET(GE);
    269 			} else if (peek() == '>') {
    270 				input(); yylval.i = APPEND; RET(APPEND);
    271 			} else {
    272 				yylval.i = GT; RET(GT);
    273 			}
    274 		case '+':
    275 			if (peek() == '+') {
    276 				input(); yylval.i = INCR; RET(INCR);
    277 			} else if (peek() == '=') {
    278 				input(); yylval.i = ADDEQ; RET(ASGNOP);
    279 			} else
    280 				RET('+');
    281 		case '-':
    282 			if (peek() == '-') {
    283 				input(); yylval.i = DECR; RET(DECR);
    284 			} else if (peek() == '=') {
    285 				input(); yylval.i = SUBEQ; RET(ASGNOP);
    286 			} else
    287 				RET('-');
    288 		case '*':
    289 			if (peek() == '=') {	/* *= */
    290 				input(); yylval.i = MULTEQ; RET(ASGNOP);
    291 			} else if (peek() == '*') {	/* ** or **= */
    292 				input();	/* eat 2nd * */
    293 				if (peek() == '=') {
    294 					input(); yylval.i = POWEQ; RET(ASGNOP);
    295 				} else {
    296 					RET(POWER);
    297 				}
    298 			} else
    299 				RET('*');
    300 		case '/':
    301 			RET('/');
    302 		case '%':
    303 			if (peek() == '=') {
    304 				input(); yylval.i = MODEQ; RET(ASGNOP);
    305 			} else
    306 				RET('%');
    307 		case '^':
    308 			if (peek() == '=') {
    309 				input(); yylval.i = POWEQ; RET(ASGNOP);
    310 			} else
    311 				RET(POWER);
    312 
    313 		case '$':
    314 			/* BUG: awkward, if not wrong */
    315 			c = gettok(&buf, &bufsize);
    316 			if (isalpha(c)) {
    317 				if (strcmp(buf, "NF") == 0) {	/* very special */
    318 					unputstr("(NF)");
    319 					RET(INDIRECT);
    320 				}
    321 				c = peek();
    322 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
    323 					unputstr(buf);
    324 					RET(INDIRECT);
    325 				}
    326 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
    327 				RET(IVAR);
    328 			} else if (c == 0) {	/*  */
    329 				SYNTAX( "unexpected end of input after $" );
    330 				RET(';');
    331 			} else {
    332 				unputstr(buf);
    333 				RET(INDIRECT);
    334 			}
    335 	
    336 		case '}':
    337 			if (--bracecnt < 0)
    338 				SYNTAX( "extra }" );
    339 			sc = 1;
    340 			RET(';');
    341 		case ']':
    342 			if (--brackcnt < 0)
    343 				SYNTAX( "extra ]" );
    344 			RET(']');
    345 		case ')':
    346 			if (--parencnt < 0)
    347 				SYNTAX( "extra )" );
    348 			RET(')');
    349 		case '{':
    350 			bracecnt++;
    351 			RET('{');
    352 		case '[':
    353 			brackcnt++;
    354 			RET('[');
    355 		case '(':
    356 			parencnt++;
    357 			RET('(');
    358 	
    359 		case '"':
    360 			return string();	/* BUG: should be like tran.c ? */
    361 	
    362 		default:
    363 			RET(c);
    364 		}
    365 	}
    366 }
    367 
    368 int string(void)
    369 {
    370 	int c, n;
    371 	char *s, *bp;
    372 	static char *buf = 0;
    373 	static int bufsz = 500;
    374 
    375 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
    376 		FATAL("out of space for strings");
    377 	for (bp = buf; (c = input()) != '"'; ) {
    378 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
    379 			FATAL("out of space for string %.10s...", buf);
    380 		switch (c) {
    381 		case '\n':
    382 		case '\r':
    383 		case 0:
    384 			SYNTAX( "non-terminated string %.10s...", buf );
    385 			lineno++;
    386 			if (c == 0)	/* hopeless */
    387 				FATAL( "giving up" );
    388 			break;
    389 		case '\\':
    390 			c = input();
    391 			switch (c) {
    392 			case '"': *bp++ = '"'; break;
    393 			case 'n': *bp++ = '\n'; break;	
    394 			case 't': *bp++ = '\t'; break;
    395 			case 'f': *bp++ = '\f'; break;
    396 			case 'r': *bp++ = '\r'; break;
    397 			case 'b': *bp++ = '\b'; break;
    398 			case 'v': *bp++ = '\v'; break;
    399 			case 'a': *bp++ = '\007'; break;
    400 			case '\\': *bp++ = '\\'; break;
    401 
    402 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
    403 			case '3': case '4': case '5': case '6': case '7':
    404 				n = c - '0';
    405 				if ((c = peek()) >= '0' && c < '8') {
    406 					n = 8 * n + input() - '0';
    407 					if ((c = peek()) >= '0' && c < '8')
    408 						n = 8 * n + input() - '0';
    409 				}
    410 				*bp++ = n;
    411 				break;
    412 
    413 			case 'x':	/* hex  \x0-9a-fA-F + */
    414 			    {	char xbuf[100], *px;
    415 				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
    416 					if (isdigit(c)
    417 					 || (c >= 'a' && c <= 'f')
    418 					 || (c >= 'A' && c <= 'F'))
    419 						*px++ = c;
    420 					else
    421 						break;
    422 				}
    423 				*px = 0;
    424 				unput(c);
    425 	  			sscanf(xbuf, "%x", (unsigned int *) &n);
    426 				*bp++ = n;
    427 				break;
    428 			    }
    429 
    430 			default: 
    431 				*bp++ = c;
    432 				break;
    433 			}
    434 			break;
    435 		default:
    436 			*bp++ = c;
    437 			break;
    438 		}
    439 	}
    440 	*bp = 0; 
    441 	s = tostring(buf);
    442 	*bp++ = ' '; *bp++ = 0;
    443 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
    444 	RET(STRING);
    445 }
    446 
    447 
    448 int binsearch(char *w, Keyword *kp, int n)
    449 {
    450 	int cond, low, mid, high;
    451 
    452 	low = 0;
    453 	high = n - 1;
    454 	while (low <= high) {
    455 		mid = (low + high) / 2;
    456 		if ((cond = strcmp(w, kp[mid].word)) < 0)
    457 			high = mid - 1;
    458 		else if (cond > 0)
    459 			low = mid + 1;
    460 		else
    461 			return mid;
    462 	}
    463 	return -1;
    464 }
    465 
    466 int word(char *w) 
    467 {
    468 	Keyword *kp;
    469 	int c, n;
    470 
    471 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
    472 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
    473 	kp = keywords + n;
    474 	if (n != -1) {	/* found in table */
    475 		yylval.i = kp->sub;
    476 		switch (kp->type) {	/* special handling */
    477 		case BLTIN:
    478 			if (kp->sub == FSYSTEM && safe)
    479 				SYNTAX( "system is unsafe" );
    480 			RET(kp->type);
    481 		case FUNC:
    482 			if (infunc)
    483 				SYNTAX( "illegal nested function" );
    484 			RET(kp->type);
    485 		case RETURN:
    486 			if (!infunc)
    487 				SYNTAX( "return not in function" );
    488 			RET(kp->type);
    489 		case VARNF:
    490 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
    491 			RET(VARNF);
    492 		default:
    493 			RET(kp->type);
    494 		}
    495 	}
    496 	c = peek();	/* look for '(' */
    497 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
    498 		yylval.i = n;
    499 		RET(ARG);
    500 	} else {
    501 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
    502 		if (c == '(') {
    503 			RET(CALL);
    504 		} else {
    505 			RET(VAR);
    506 		}
    507 	}
    508 }
    509 
    510 void startreg(void)	/* next call to yylex will return a regular expression */
    511 {
    512 	reg = 1;
    513 }
    514 
    515 int regexpr(void)
    516 {
    517 	int c, openclass = 0;
    518 	static char *buf = 0;
    519 	static int bufsz = 500;
    520 	char *bp;
    521 
    522 	if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
    523 		FATAL("out of space for rex expr");
    524 	bp = buf;
    525 	for ( ; ((c = input()) != '/' || openclass == 1) && c != 0; ) {
    526 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
    527 			FATAL("out of space for reg expr %.10s...", buf);
    528 		if (c == '\n') {
    529 			SYNTAX( "newline in regular expression %.10s...", buf ); 
    530 			unput('\n');
    531 			break;
    532 		} else if (c == '\\') {
    533 			*bp++ = '\\'; 
    534 			*bp++ = input();
    535 		} else {
    536 			if (c == '[')
    537 				openclass = 1;
    538 			else if (c == ']')
    539 				openclass = 0;
    540 			*bp++ = c;
    541 		}
    542 	}
    543 	*bp = 0;
    544 	if (c == 0)
    545 		SYNTAX("non-terminated regular expression %.10s...", buf);
    546 	yylval.s = tostring(buf);
    547 	unput('/');
    548 	RET(REGEXPR);
    549 }
    550 
    551 /* low-level lexical stuff, sort of inherited from lex */
    552 
    553 char	ebuf[300];	/* ring of the most recent characters returned by input(); unput() steps back in it */
    554 char	*ep = ebuf;	/* next slot of ebuf that input() will write */
    555 char	yysbuf[100];	/* pushback buffer: characters put back by unput(), read again last-in first-out */
    556 char	*yysptr = yysbuf;	/* one past the most recently pushed-back character; equals yysbuf when nothing is pushed back */
    557 FILE	*yyin = 0;	/* not used by the code visible in this file */
    558 
    559 int input(void)	/* get next lexical input character */
    560 {
    561 	int c;
    562 	extern char *lexprog;
    563 
    564 	if (yysptr > yysbuf)
    565 		c = (uschar)*--yysptr;
    566 	else if (lexprog != NULL) {	/* awk '...' */
    567 		if ((c = (uschar)*lexprog) != 0)
    568 			lexprog++;
    569 	} else				/* awk -f ... */
    570 		c = pgetc();
    571 	if (c == '\n')
    572 		lineno++;
    573 	else if (c == EOF)
    574 		c = 0;
    575 	if (ep >= ebuf + sizeof ebuf)
    576 		ep = ebuf;
    577 	return *ep++ = c;
    578 }
    579 
    580 void unput(int c)	/* put lexical character back on input */
    581 {
    582 	if (c == '\n')
    583 		lineno--;
    584 	if (yysptr >= yysbuf + sizeof(yysbuf))
    585 		FATAL("pushed back too much: %.20s...", yysbuf);
    586 	*yysptr++ = c;
    587 	if (--ep < ebuf)
    588 		ep = ebuf + sizeof(ebuf) - 1;
    589 }
    590 
    591 void unputstr(const char *s)	/* put a string back on input */
    592 {
    593 	int i;
    594 
    595 	for (i = strlen(s)-1; i >= 0; i--)
    596 		unput(s[i]);
    597 }