hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

regcomp.c (9743B)


      1 #include "lib9.h"
      2 #include "regexp9.h"
      3 #include "regcomp.h"
      4 
      5 #define	TRUE	1
      6 #define	FALSE	0
      7 
      8 /*
      9  * Parser Information
     10  */
     11 typedef
     12 struct Node
     13 {
     14 	Reinst*	first;
     15 	Reinst*	last;
     16 }Node;
     17 
     18 #define	NSTACK	20
     19 static	Node	andstack[NSTACK];
     20 static	Node	*andp;
     21 static	int	atorstack[NSTACK];
     22 static	int*	atorp;
     23 static	int	cursubid;		/* id of current subexpression */
     24 static	int	subidstack[NSTACK];	/* parallel to atorstack */
     25 static	int*	subidp;
     26 static	int	lastwasand;	/* Last token was operand */
     27 static	int	nbra;
     28 static	char*	exprp;		/* pointer to next character in source expression */
     29 static	int	lexdone;
     30 static	int	nclass;
     31 static	Reclass*classp;
     32 static	Reinst*	freep;
     33 static	int	errors;
     34 static	Rune	yyrune;		/* last lex'd rune */
     35 static	Reclass*yyclassp;	/* last lex'd class */
     36 
     37 /* predeclared crap */
     38 static	void	operator(int);
     39 static	void	pushand(Reinst*, Reinst*);
     40 static	void	pushator(int);
     41 static	void	evaluntil(int);
     42 static	int	bldcclass(void);
     43 
     44 static jmp_buf regkaboom;
     45 
     46 static	void
     47 rcerror(char *s)
     48 {
     49 	errors++;
     50 	regerror(s);
     51 	longjmp(regkaboom, 1);
     52 }
     53 
     54 static	Reinst*
     55 newinst(int t)
     56 {
     57 	freep->type = t;
     58 	freep->u2.left = 0;
     59 	freep->u1.right = 0;
     60 	return freep++;
     61 }
     62 
     63 static	void
     64 operand(int t)
     65 {
     66 	Reinst *i;
     67 
     68 	if(lastwasand)
     69 		operator(CAT);	/* catenate is implicit */
     70 	i = newinst(t);
     71 
     72 	if(t == CCLASS || t == NCCLASS)
     73 		i->u1.cp = yyclassp;
     74 	if(t == RUNE)
     75 		i->u1.r = yyrune;
     76 
     77 	pushand(i, i);
     78 	lastwasand = TRUE;
     79 }
     80 
     81 static	void
     82 operator(int t)
     83 {
     84 	if(t==RBRA && --nbra<0)
     85 		rcerror("unmatched right paren");
     86 	if(t==LBRA){
     87 		if(++cursubid >= NSUBEXP)
     88 			rcerror ("too many subexpressions");
     89 		nbra++;
     90 		if(lastwasand)
     91 			operator(CAT);
     92 	} else
     93 		evaluntil(t);
     94 	if(t != RBRA)
     95 		pushator(t);
     96 	lastwasand = FALSE;
     97 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
     98 		lastwasand = TRUE;	/* these look like operands */
     99 }
    100 
    101 static	void
    102 regerr2(char *s, int c)
    103 {
    104 	char buf[100];
    105 	char *cp = buf;
    106 	while(*s)
    107 		*cp++ = *s++;
    108 	*cp++ = c;
    109 	*cp = '\0'; 
    110 	rcerror(buf);
    111 }
    112 
    113 static	void
    114 cant(char *s)
    115 {
    116 	char buf[100];
    117 	strcpy(buf, "can't happen: ");
    118 	strcat(buf, s);
    119 	rcerror(buf);
    120 }
    121 
    122 static	void
    123 pushand(Reinst *f, Reinst *l)
    124 {
    125 	if(andp >= &andstack[NSTACK])
    126 		cant("operand stack overflow");
    127 	andp->first = f;
    128 	andp->last = l;
    129 	andp++;
    130 }
    131 
    132 static	void
    133 pushator(int t)
    134 {
    135 	if(atorp >= &atorstack[NSTACK])
    136 		cant("operator stack overflow");
    137 	*atorp++ = t;
    138 	*subidp++ = cursubid;
    139 }
    140 
    141 static	Node*
    142 popand(int op)
    143 {
    144 	Reinst *inst;
    145 
    146 	if(andp <= &andstack[0]){
    147 		regerr2("missing operand for ", op);
    148 		inst = newinst(NOP);
    149 		pushand(inst,inst);
    150 	}
    151 	return --andp;
    152 }
    153 
    154 static	int
    155 popator(void)
    156 {
    157 	if(atorp <= &atorstack[0])
    158 		cant("operator stack underflow");
    159 	--subidp;
    160 	return *--atorp;
    161 }
    162 
    163 static	void
    164 evaluntil(int pri)
    165 {
    166 	Node *op1, *op2;
    167 	Reinst *inst1, *inst2;
    168 
    169 	while(pri==RBRA || atorp[-1]>=pri){
    170 		switch(popator()){
    171 		default:
    172 			rcerror("unknown operator in evaluntil");
    173 			break;
    174 		case LBRA:		/* must have been RBRA */
    175 			op1 = popand('(');
    176 			inst2 = newinst(RBRA);
    177 			inst2->u1.subid = *subidp;
    178 			op1->last->u2.next = inst2;
    179 			inst1 = newinst(LBRA);
    180 			inst1->u1.subid = *subidp;
    181 			inst1->u2.next = op1->first;
    182 			pushand(inst1, inst2);
    183 			return;
    184 		case OR:
    185 			op2 = popand('|');
    186 			op1 = popand('|');
    187 			inst2 = newinst(NOP);
    188 			op2->last->u2.next = inst2;
    189 			op1->last->u2.next = inst2;
    190 			inst1 = newinst(OR);
    191 			inst1->u1.right = op1->first;
    192 			inst1->u2.left = op2->first;
    193 			pushand(inst1, inst2);
    194 			break;
    195 		case CAT:
    196 			op2 = popand(0);
    197 			op1 = popand(0);
    198 			op1->last->u2.next = op2->first;
    199 			pushand(op1->first, op2->last);
    200 			break;
    201 		case STAR:
    202 			op2 = popand('*');
    203 			inst1 = newinst(OR);
    204 			op2->last->u2.next = inst1;
    205 			inst1->u1.right = op2->first;
    206 			pushand(inst1, inst1);
    207 			break;
    208 		case PLUS:
    209 			op2 = popand('+');
    210 			inst1 = newinst(OR);
    211 			op2->last->u2.next = inst1;
    212 			inst1->u1.right = op2->first;
    213 			pushand(op2->first, inst1);
    214 			break;
    215 		case QUEST:
    216 			op2 = popand('?');
    217 			inst1 = newinst(OR);
    218 			inst2 = newinst(NOP);
    219 			inst1->u2.left = inst2;
    220 			inst1->u1.right = op2->first;
    221 			op2->last->u2.next = inst2;
    222 			pushand(inst1, inst2);
    223 			break;
    224 		}
    225 	}
    226 }
    227 
    228 static	Reprog*
    229 optimize(Reprog *pp)
    230 {
    231 	Reinst *inst, *target;
    232 	int size;
    233 	Reprog *npp;
    234 	Reclass *cl;
    235 	int diff;
    236 
    237 	/*
    238 	 *  get rid of NOOP chains
    239 	 */
    240 	for(inst=pp->firstinst; inst->type!=END; inst++){
    241 		target = inst->u2.next;
    242 		while(target->type == NOP)
    243 			target = target->u2.next;
    244 		inst->u2.next = target;
    245 	}
    246 
    247 	/*
    248 	 *  The original allocation is for an area larger than
    249 	 *  necessary.  Reallocate to the actual space used
    250 	 *  and then relocate the code.
    251 	 */
    252 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
    253 	npp = realloc(pp, size);
    254 	if(npp==0 || npp==pp)
    255 		return pp;
    256 	diff = (char *)npp - (char *)pp;
    257 	freep = (Reinst *)((char *)freep + diff);
    258 	for(inst=npp->firstinst; inst<freep; inst++){
    259 		switch(inst->type){
    260 		case OR:
    261 		case STAR:
    262 		case PLUS:
    263 		case QUEST:
    264 			inst->u1.right = (void*)((char*)inst->u1.right + diff);
    265 			break;
    266 		case CCLASS:
    267 		case NCCLASS:
    268 			inst->u1.right = (void*)((char*)inst->u1.right + diff);
    269 			cl = inst->u1.cp;
    270 			cl->end = (void*)((char*)cl->end + diff);
    271 			break;
    272 		}
    273 		inst->u2.left = (void*)((char*)inst->u2.left + diff);
    274 	}
    275 	npp->startinst = (void*)((char*)npp->startinst + diff);
    276 	return npp;
    277 }
    278 
    279 #ifdef	DEBUG
    280 static	void
    281 dumpstack(void){
    282 	Node *stk;
    283 	int *ip;
    284 
    285 	print("operators\n");
    286 	for(ip=atorstack; ip<atorp; ip++)
    287 		print("0%o\n", *ip);
    288 	print("operands\n");
    289 	for(stk=andstack; stk<andp; stk++)
    290 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
    291 }
    292 
    293 static	void
    294 dump(Reprog *pp)
    295 {
    296 	Reinst *l;
    297 	Rune *p;
    298 
    299 	l = pp->firstinst;
    300 	do{
    301 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
    302 			l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
    303 		if(l->type == RUNE)
    304 			print("\t%C\n", l->u1.r);
    305 		else if(l->type == CCLASS || l->type == NCCLASS){
    306 			print("\t[");
    307 			if(l->type == NCCLASS)
    308 				print("^");
    309 			for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
    310 				if(p[0] == p[1])
    311 					print("%C", p[0]);
    312 				else
    313 					print("%C-%C", p[0], p[1]);
    314 			print("]\n");
    315 		} else
    316 			print("\n");
    317 	}while(l++->type);
    318 }
    319 #endif
    320 
    321 static	Reclass*
    322 newclass(void)
    323 {
    324 	if(nclass >= NCLASS)
    325 		regerr2("too many character classes; limit", NCLASS+'0');
    326 	return &(classp[nclass++]);
    327 }
    328 
    329 static	int
    330 nextc(Rune *rp)
    331 {
    332 	if(lexdone){
    333 		*rp = 0;
    334 		return 1;
    335 	}
    336 	exprp += chartorune(rp, exprp);
    337 	if(*rp == '\\'){
    338 		exprp += chartorune(rp, exprp);
    339 		return 1;
    340 	}
    341 	if(*rp == 0)
    342 		lexdone = 1;
    343 	return 0;
    344 }
    345 
    346 static	int
    347 lex(int literal, int dot_type)
    348 {
    349 	int quoted;
    350 
    351 	quoted = nextc(&yyrune);
    352 	if(literal || quoted){
    353 		if(yyrune == 0)
    354 			return END;
    355 		return RUNE;
    356 	}
    357 
    358 	switch(yyrune){
    359 	case 0:
    360 		return END;
    361 	case '*':
    362 		return STAR;
    363 	case '?':
    364 		return QUEST;
    365 	case '+':
    366 		return PLUS;
    367 	case '|':
    368 		return OR;
    369 	case '.':
    370 		return dot_type;
    371 	case '(':
    372 		return LBRA;
    373 	case ')':
    374 		return RBRA;
    375 	case '^':
    376 		return BOL;
    377 	case '$':
    378 		return EOL;
    379 	case '[':
    380 		return bldcclass();
    381 	}
    382 	return RUNE;
    383 }
    384 
    385 static int
    386 bldcclass(void)
    387 {
    388 	int type;
    389 	Rune r[NCCRUNE];
    390 	Rune *p, *ep, *np;
    391 	Rune rune;
    392 	int quoted;
    393 
    394 	/* we have already seen the '[' */
    395 	type = CCLASS;
    396 	yyclassp = newclass();
    397 
    398 	/* look ahead for negation */
    399 	/* SPECIAL CASE!!! negated classes don't match \n */
    400 	ep = r;
    401 	quoted = nextc(&rune);
    402 	if(!quoted && rune == '^'){
    403 		type = NCCLASS;
    404 		quoted = nextc(&rune);
    405 		*ep++ = '\n';
    406 		*ep++ = '\n';
    407 	}
    408 
    409 	/* parse class into a set of spans */
    410 	for(; ep<&r[NCCRUNE];){
    411 		if(rune == 0){
    412 			rcerror("malformed '[]'");
    413 			return 0;
    414 		}
    415 		if(!quoted && rune == ']')
    416 			break;
    417 		if(!quoted && rune == '-'){
    418 			if(ep == r){
    419 				rcerror("malformed '[]'");
    420 				return 0;
    421 			}
    422 			quoted = nextc(&rune);
    423 			if((!quoted && rune == ']') || rune == 0){
    424 				rcerror("malformed '[]'");
    425 				return 0;
    426 			}
    427 			*(ep-1) = rune;
    428 		} else {
    429 			*ep++ = rune;
    430 			*ep++ = rune;
    431 		}
    432 		quoted = nextc(&rune);
    433 	}
    434 
    435 	/* sort on span start */
    436 	for(p = r; p < ep; p += 2){
    437 		for(np = p; np < ep; np += 2)
    438 			if(*np < *p){
    439 				rune = np[0];
    440 				np[0] = p[0];
    441 				p[0] = rune;
    442 				rune = np[1];
    443 				np[1] = p[1];
    444 				p[1] = rune;
    445 			}
    446 	}
    447 
    448 	/* merge spans */
    449 	np = yyclassp->spans;
    450 	p = r;
    451 	if(r == ep)
    452 		yyclassp->end = np;
    453 	else {
    454 		np[0] = *p++;
    455 		np[1] = *p++;
    456 		for(; p < ep; p += 2)
    457 			if(p[0] <= np[1]){
    458 				if(p[1] > np[1])
    459 					np[1] = p[1];
    460 			} else {
    461 				np += 2;
    462 				np[0] = p[0];
    463 				np[1] = p[1];
    464 			}
    465 		yyclassp->end = np+2;
    466 	}
    467 
    468 	return type;
    469 }
    470 
    471 static	Reprog*
    472 regcomp1(char *s, int literal, int dot_type)
    473 {
    474 	int token;
    475 	Reprog *volatile pp;
    476 
    477 	/* get memory for the program */
    478 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
    479 	if(pp == 0){
    480 		regerror("out of memory");
    481 		return 0;
    482 	}
    483 	freep = pp->firstinst;
    484 	classp = pp->class;
    485 	errors = 0;
    486 
    487 	if(setjmp(regkaboom))
    488 		goto out;
    489 
    490 	/* go compile the sucker */
    491 	lexdone = 0;
    492 	exprp = s;
    493 	nclass = 0;
    494 	nbra = 0;
    495 	atorp = atorstack;
    496 	andp = andstack;
    497 	subidp = subidstack;
    498 	lastwasand = FALSE;
    499 	cursubid = 0;
    500 
    501 	/* Start with a low priority operator to prime parser */
    502 	pushator(START-1);
    503 	while((token = lex(literal, dot_type)) != END){
    504 		if((token&0300) == OPERATOR)
    505 			operator(token);
    506 		else
    507 			operand(token);
    508 	}
    509 
    510 	/* Close with a low priority operator */
    511 	evaluntil(START);
    512 
    513 	/* Force END */
    514 	operand(END);
    515 	evaluntil(START);
    516 #ifdef DEBUG
    517 	dumpstack();
    518 #endif
    519 	if(nbra)
    520 		rcerror("unmatched left paren");
    521 	--andp;	/* points to first and only operand */
    522 	pp->startinst = andp->first;
    523 #ifdef DEBUG
    524 	dump(pp);
    525 #endif
    526 	pp = optimize(pp);
    527 #ifdef DEBUG
    528 	print("start: %d\n", andp->first-pp->firstinst);
    529 	dump(pp);
    530 #endif
    531 out:
    532 	if(errors){
    533 		free(pp);
    534 		pp = 0;
    535 	}
    536 	return pp;
    537 }
    538 
    539 extern	Reprog*
    540 regcomp(char *s)
    541 {
    542 	return regcomp1(s, 0, ANY);
    543 }
    544 
    545 extern	Reprog*
    546 regcomplit(char *s)
    547 {
    548 	return regcomp1(s, 1, ANY);
    549 }
    550 
    551 extern	Reprog*
    552 regcompnl(char *s)
    553 {
    554 	return regcomp1(s, 0, ANYNL);
    555 }