regcomp.c (9743B)
1 #include "lib9.h" 2 #include "regexp9.h" 3 #include "regcomp.h" 4 5 #define TRUE 1 6 #define FALSE 0 7 8 /* 9 * Parser Information 10 */ 11 typedef 12 struct Node 13 { 14 Reinst* first; 15 Reinst* last; 16 }Node; 17 18 #define NSTACK 20 19 static Node andstack[NSTACK]; 20 static Node *andp; 21 static int atorstack[NSTACK]; 22 static int* atorp; 23 static int cursubid; /* id of current subexpression */ 24 static int subidstack[NSTACK]; /* parallel to atorstack */ 25 static int* subidp; 26 static int lastwasand; /* Last token was operand */ 27 static int nbra; 28 static char* exprp; /* pointer to next character in source expression */ 29 static int lexdone; 30 static int nclass; 31 static Reclass*classp; 32 static Reinst* freep; 33 static int errors; 34 static Rune yyrune; /* last lex'd rune */ 35 static Reclass*yyclassp; /* last lex'd class */ 36 37 /* predeclared crap */ 38 static void operator(int); 39 static void pushand(Reinst*, Reinst*); 40 static void pushator(int); 41 static void evaluntil(int); 42 static int bldcclass(void); 43 44 static jmp_buf regkaboom; 45 46 static void 47 rcerror(char *s) 48 { 49 errors++; 50 regerror(s); 51 longjmp(regkaboom, 1); 52 } 53 54 static Reinst* 55 newinst(int t) 56 { 57 freep->type = t; 58 freep->u2.left = 0; 59 freep->u1.right = 0; 60 return freep++; 61 } 62 63 static void 64 operand(int t) 65 { 66 Reinst *i; 67 68 if(lastwasand) 69 operator(CAT); /* catenate is implicit */ 70 i = newinst(t); 71 72 if(t == CCLASS || t == NCCLASS) 73 i->u1.cp = yyclassp; 74 if(t == RUNE) 75 i->u1.r = yyrune; 76 77 pushand(i, i); 78 lastwasand = TRUE; 79 } 80 81 static void 82 operator(int t) 83 { 84 if(t==RBRA && --nbra<0) 85 rcerror("unmatched right paren"); 86 if(t==LBRA){ 87 if(++cursubid >= NSUBEXP) 88 rcerror ("too many subexpressions"); 89 nbra++; 90 if(lastwasand) 91 operator(CAT); 92 } else 93 evaluntil(t); 94 if(t != RBRA) 95 pushator(t); 96 lastwasand = FALSE; 97 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 98 lastwasand = TRUE; /* these look like operands */ 99 } 100 101 static void 102 regerr2(char *s, int c) 103 { 104 char buf[100]; 105 char *cp = buf; 106 while(*s) 107 *cp++ = *s++; 108 *cp++ = c; 109 *cp = '\0'; 110 rcerror(buf); 111 } 112 113 static void 114 cant(char *s) 115 { 116 char buf[100]; 117 strcpy(buf, "can't happen: "); 118 strcat(buf, s); 119 rcerror(buf); 120 } 121 122 static void 123 pushand(Reinst *f, Reinst *l) 124 { 125 if(andp >= &andstack[NSTACK]) 126 cant("operand stack overflow"); 127 andp->first = f; 128 andp->last = l; 129 andp++; 130 } 131 132 static void 133 pushator(int t) 134 { 135 if(atorp >= &atorstack[NSTACK]) 136 cant("operator stack overflow"); 137 *atorp++ = t; 138 *subidp++ = cursubid; 139 } 140 141 static Node* 142 popand(int op) 143 { 144 Reinst *inst; 145 146 if(andp <= &andstack[0]){ 147 regerr2("missing operand for ", op); 148 inst = newinst(NOP); 149 pushand(inst,inst); 150 } 151 return --andp; 152 } 153 154 static int 155 popator(void) 156 { 157 if(atorp <= &atorstack[0]) 158 cant("operator stack underflow"); 159 --subidp; 160 return *--atorp; 161 } 162 163 static void 164 evaluntil(int pri) 165 { 166 Node *op1, *op2; 167 Reinst *inst1, *inst2; 168 169 while(pri==RBRA || atorp[-1]>=pri){ 170 switch(popator()){ 171 default: 172 rcerror("unknown operator in evaluntil"); 173 break; 174 case LBRA: /* must have been RBRA */ 175 op1 = popand('('); 176 inst2 = newinst(RBRA); 177 inst2->u1.subid = *subidp; 178 op1->last->u2.next = inst2; 179 inst1 = newinst(LBRA); 180 inst1->u1.subid = *subidp; 181 inst1->u2.next = op1->first; 182 pushand(inst1, inst2); 183 return; 184 case OR: 185 op2 = popand('|'); 186 op1 = popand('|'); 187 inst2 = newinst(NOP); 188 op2->last->u2.next = inst2; 189 op1->last->u2.next = inst2; 190 inst1 = newinst(OR); 191 inst1->u1.right = op1->first; 192 inst1->u2.left = op2->first; 193 pushand(inst1, inst2); 194 break; 195 case CAT: 196 op2 = popand(0); 197 op1 = popand(0); 198 op1->last->u2.next = op2->first; 199 pushand(op1->first, op2->last); 200 break; 201 case STAR: 202 op2 = popand('*'); 203 inst1 = newinst(OR); 204 op2->last->u2.next = inst1; 205 inst1->u1.right = op2->first; 206 pushand(inst1, inst1); 207 break; 208 case PLUS: 209 op2 = popand('+'); 210 inst1 = newinst(OR); 211 op2->last->u2.next = inst1; 212 inst1->u1.right = op2->first; 213 pushand(op2->first, inst1); 214 break; 215 case QUEST: 216 op2 = popand('?'); 217 inst1 = newinst(OR); 218 inst2 = newinst(NOP); 219 inst1->u2.left = inst2; 220 inst1->u1.right = op2->first; 221 op2->last->u2.next = inst2; 222 pushand(inst1, inst2); 223 break; 224 } 225 } 226 } 227 228 static Reprog* 229 optimize(Reprog *pp) 230 { 231 Reinst *inst, *target; 232 int size; 233 Reprog *npp; 234 Reclass *cl; 235 int diff; 236 237 /* 238 * get rid of NOOP chains 239 */ 240 for(inst=pp->firstinst; inst->type!=END; inst++){ 241 target = inst->u2.next; 242 while(target->type == NOP) 243 target = target->u2.next; 244 inst->u2.next = target; 245 } 246 247 /* 248 * The original allocation is for an area larger than 249 * necessary. Reallocate to the actual space used 250 * and then relocate the code. 251 */ 252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 253 npp = realloc(pp, size); 254 if(npp==0 || npp==pp) 255 return pp; 256 diff = (char *)npp - (char *)pp; 257 freep = (Reinst *)((char *)freep + diff); 258 for(inst=npp->firstinst; inst<freep; inst++){ 259 switch(inst->type){ 260 case OR: 261 case STAR: 262 case PLUS: 263 case QUEST: 264 inst->u1.right = (void*)((char*)inst->u1.right + diff); 265 break; 266 case CCLASS: 267 case NCCLASS: 268 inst->u1.right = (void*)((char*)inst->u1.right + diff); 269 cl = inst->u1.cp; 270 cl->end = (void*)((char*)cl->end + diff); 271 break; 272 } 273 inst->u2.left = (void*)((char*)inst->u2.left + diff); 274 } 275 npp->startinst = (void*)((char*)npp->startinst + diff); 276 return npp; 277 } 278 279 #ifdef DEBUG 280 static void 281 dumpstack(void){ 282 Node *stk; 283 int *ip; 284 285 print("operators\n"); 286 for(ip=atorstack; ip<atorp; ip++) 287 print("0%o\n", *ip); 288 print("operands\n"); 289 for(stk=andstack; stk<andp; stk++) 290 print("0%o\t0%o\n", stk->first->type, stk->last->type); 291 } 292 293 static void 294 dump(Reprog *pp) 295 { 296 Reinst *l; 297 Rune *p; 298 299 l = pp->firstinst; 300 do{ 301 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 302 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst); 303 if(l->type == RUNE) 304 print("\t%C\n", l->u1.r); 305 else if(l->type == CCLASS || l->type == NCCLASS){ 306 print("\t["); 307 if(l->type == NCCLASS) 308 print("^"); 309 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2) 310 if(p[0] == p[1]) 311 print("%C", p[0]); 312 else 313 print("%C-%C", p[0], p[1]); 314 print("]\n"); 315 } else 316 print("\n"); 317 }while(l++->type); 318 } 319 #endif 320 321 static Reclass* 322 newclass(void) 323 { 324 if(nclass >= NCLASS) 325 regerr2("too many character classes; limit", NCLASS+'0'); 326 return &(classp[nclass++]); 327 } 328 329 static int 330 nextc(Rune *rp) 331 { 332 if(lexdone){ 333 *rp = 0; 334 return 1; 335 } 336 exprp += chartorune(rp, exprp); 337 if(*rp == '\\'){ 338 exprp += chartorune(rp, exprp); 339 return 1; 340 } 341 if(*rp == 0) 342 lexdone = 1; 343 return 0; 344 } 345 346 static int 347 lex(int literal, int dot_type) 348 { 349 int quoted; 350 351 quoted = nextc(&yyrune); 352 if(literal || quoted){ 353 if(yyrune == 0) 354 return END; 355 return RUNE; 356 } 357 358 switch(yyrune){ 359 case 0: 360 return END; 361 case '*': 362 return STAR; 363 case '?': 364 return QUEST; 365 case '+': 366 return PLUS; 367 case '|': 368 return OR; 369 case '.': 370 return dot_type; 371 case '(': 372 return LBRA; 373 case ')': 374 return RBRA; 375 case '^': 376 return BOL; 377 case '$': 378 return EOL; 379 case '[': 380 return bldcclass(); 381 } 382 return RUNE; 383 } 384 385 static int 386 bldcclass(void) 387 { 388 int type; 389 Rune r[NCCRUNE]; 390 Rune *p, *ep, *np; 391 Rune rune; 392 int quoted; 393 394 /* we have already seen the '[' */ 395 type = CCLASS; 396 yyclassp = newclass(); 397 398 /* look ahead for negation */ 399 /* SPECIAL CASE!!! negated classes don't match \n */ 400 ep = r; 401 quoted = nextc(&rune); 402 if(!quoted && rune == '^'){ 403 type = NCCLASS; 404 quoted = nextc(&rune); 405 *ep++ = '\n'; 406 *ep++ = '\n'; 407 } 408 409 /* parse class into a set of spans */ 410 for(; ep<&r[NCCRUNE];){ 411 if(rune == 0){ 412 rcerror("malformed '[]'"); 413 return 0; 414 } 415 if(!quoted && rune == ']') 416 break; 417 if(!quoted && rune == '-'){ 418 if(ep == r){ 419 rcerror("malformed '[]'"); 420 return 0; 421 } 422 quoted = nextc(&rune); 423 if((!quoted && rune == ']') || rune == 0){ 424 rcerror("malformed '[]'"); 425 return 0; 426 } 427 *(ep-1) = rune; 428 } else { 429 *ep++ = rune; 430 *ep++ = rune; 431 } 432 quoted = nextc(&rune); 433 } 434 435 /* sort on span start */ 436 for(p = r; p < ep; p += 2){ 437 for(np = p; np < ep; np += 2) 438 if(*np < *p){ 439 rune = np[0]; 440 np[0] = p[0]; 441 p[0] = rune; 442 rune = np[1]; 443 np[1] = p[1]; 444 p[1] = rune; 445 } 446 } 447 448 /* merge spans */ 449 np = yyclassp->spans; 450 p = r; 451 if(r == ep) 452 yyclassp->end = np; 453 else { 454 np[0] = *p++; 455 np[1] = *p++; 456 for(; p < ep; p += 2) 457 if(p[0] <= np[1]){ 458 if(p[1] > np[1]) 459 np[1] = p[1]; 460 } else { 461 np += 2; 462 np[0] = p[0]; 463 np[1] = p[1]; 464 } 465 yyclassp->end = np+2; 466 } 467 468 return type; 469 } 470 471 static Reprog* 472 regcomp1(char *s, int literal, int dot_type) 473 { 474 int token; 475 Reprog *volatile pp; 476 477 /* get memory for the program */ 478 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 479 if(pp == 0){ 480 regerror("out of memory"); 481 return 0; 482 } 483 freep = pp->firstinst; 484 classp = pp->class; 485 errors = 0; 486 487 if(setjmp(regkaboom)) 488 goto out; 489 490 /* go compile the sucker */ 491 lexdone = 0; 492 exprp = s; 493 nclass = 0; 494 nbra = 0; 495 atorp = atorstack; 496 andp = andstack; 497 subidp = subidstack; 498 lastwasand = FALSE; 499 cursubid = 0; 500 501 /* Start with a low priority operator to prime parser */ 502 pushator(START-1); 503 while((token = lex(literal, dot_type)) != END){ 504 if((token&0300) == OPERATOR) 505 operator(token); 506 else 507 operand(token); 508 } 509 510 /* Close with a low priority operator */ 511 evaluntil(START); 512 513 /* Force END */ 514 operand(END); 515 evaluntil(START); 516 #ifdef DEBUG 517 dumpstack(); 518 #endif 519 if(nbra) 520 rcerror("unmatched left paren"); 521 --andp; /* points to first and only operand */ 522 pp->startinst = andp->first; 523 #ifdef DEBUG 524 dump(pp); 525 #endif 526 pp = optimize(pp); 527 #ifdef DEBUG 528 print("start: %d\n", andp->first-pp->firstinst); 529 dump(pp); 530 #endif 531 out: 532 if(errors){ 533 free(pp); 534 pp = 0; 535 } 536 return pp; 537 } 538 539 extern Reprog* 540 regcomp(char *s) 541 { 542 return regcomp1(s, 0, ANY); 543 } 544 545 extern Reprog* 546 regcomplit(char *s) 547 { 548 return regcomp1(s, 1, ANY); 549 } 550 551 extern Reprog* 552 regcompnl(char *s) 553 { 554 return regcomp1(s, 0, ANYNL); 555 }