regdfa.c (19532B)
1 /* 2 * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002. 3 * 4 * Sccsid @(#)regdfa.c 1.9 (gritter) 9/22/03 5 */ 6 /* UNIX(R) Regular Expresssion Library 7 * 8 * Note: Code is released under the GNU LGPL 9 * 10 * Copyright (C) 2001 Caldera International, Inc. 11 * 12 * This library is free software; you can redistribute it and/or 13 * modify it under the terms of the GNU Lesser General Public 14 * License as published by the Free Software Foundation; either 15 * version 2 of the License, or (at your option) any later version. 16 * 17 * This library is distributed in the hope that it will be useful, 18 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 * Lesser General Public License for more details. 21 * 22 * You should have received a copy of the GNU Lesser General Public 23 * License along with this library; if not, write to: 24 * Free Software Foundation, Inc. 25 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 26 */ 27 28 /* #include "synonyms.h" */ 29 #include <stdlib.h> 30 #include <string.h> 31 #include <ctype.h> 32 #include "regdfa.h" 33 34 /* 35 * Deterministic Finite Automata. 36 */ 37 38 /* 39 * Postorder traversal that returns a copy of the subtree, 40 * except that ROP_BKT becomes ROP_BKTCOPY (since they 41 * share the same pointed to Bracket object). 42 */ 43 static Tree * 44 copy(regex_t *ep, Tree *tp) 45 { 46 Tree *np; 47 48 if ((np = malloc(sizeof(Tree))) == 0) 49 return 0; 50 switch (np->op = tp->op) /* almost always correct */ 51 { 52 case ROP_BKT: 53 np->op = ROP_BKTCOPY; 54 /*FALLTHROUGH*/ 55 case ROP_BKTCOPY: 56 np->right.info.bkt = tp->right.info.bkt; 57 /*FALLTHROUGH*/ 58 default: 59 np->left.pos = ep->re_dfa->nposn++; 60 /*FALLTHROUGH*/ 61 case ROP_EMPTY: 62 return np; 63 case ROP_CAT: 64 case ROP_OR: 65 if ((np->right.ptr = copy(ep, tp->right.ptr)) == 0) 66 { 67 free(np); 68 return 0; 69 } 70 np->right.ptr->parent = np; 71 /*FALLTHROUGH*/ 72 case ROP_STAR: 73 case ROP_PLUS: 74 case ROP_QUEST: 75 case ROP_LP: 76 if ((np->left.ptr = copy(ep, tp->left.ptr)) == 0) 77 break; 78 np->left.ptr->parent = np; 79 return np; 80 } 81 libuxre_regdeltree(np, 1); 82 return 0; 83 } 84 85 /* 86 * Postorder traversal. 87 * Assign unique ascending integer values to the leaves. 88 * Since the right child is traversed before the left, 89 * the position for ROP_END is guaranteed to be zero. 90 * The parse tree is rewritten in two cases: 91 * - Each ROP_BRACE is replaced by an equivalent--sometimes 92 * large--subtree using only ROP_CAT, ROP_QUEST, and 93 * ROP_PLUS. 94 * - If REG_ICASE, replace each simple character that has 95 * an uppercase equivalent with a ROP_OR subtree over the 96 * two versions. 97 * Since these rewrites occur bottom up, they have already 98 * been applied before any subtrees passed to copy(). 99 */ 100 static Tree * 101 findposn(regex_t *ep, Tree *tp, int mb_cur_max) 102 { 103 unsigned int lo, hi; 104 Tree *ptr, *par; 105 w_type wc; 106 107 switch (tp->op) 108 { 109 default: 110 if (ep->re_flags & REG_ICASE 111 && (wc = to_upper(tp->op)) != tp->op) 112 { 113 if ((ptr = libuxre_reg1tree(tp->op, 0)) == 0) 114 return 0; 115 ptr->parent = tp; 116 ptr->left.pos = ep->re_dfa->nposn++; 117 tp->op = ROP_OR; 118 tp->left.ptr = ptr; 119 ptr = libuxre_reg1tree(wc, 0); 120 if ((tp->right.ptr = ptr) == 0) 121 return 0; 122 ptr->parent = tp; 123 ptr->left.pos = ep->re_dfa->nposn++; 124 return tp; 125 } 126 /*FALLTHROUGH*/ 127 case ROP_BOL: 128 case ROP_EOL: 129 case ROP_ALL: 130 case ROP_ANYCH: 131 case ROP_NOTNL: 132 case ROP_NONE: 133 case ROP_BKT: 134 case ROP_BKTCOPY: 135 case ROP_END: 136 tp->left.pos = ep->re_dfa->nposn++; 137 return tp; 138 case ROP_EMPTY: 139 return tp; 140 case ROP_OR: 141 case ROP_CAT: 142 if ((tp->right.ptr = findposn(ep, tp->right.ptr, 143 mb_cur_max)) == 0) 144 return 0; 145 /*FALLTHROUGH*/ 146 case ROP_STAR: 147 case ROP_PLUS: 148 case ROP_QUEST: 149 case ROP_LP: 150 if ((tp->left.ptr = findposn(ep, tp->left.ptr, 151 mb_cur_max)) == 0) 152 return 0; 153 return tp; 154 case ROP_BRACE: 155 if ((tp->left.ptr = findposn(ep, tp->left.ptr, 156 mb_cur_max)) == 0) 157 return 0; 158 break; 159 } 160 /* 161 * ROP_BRACE as is cannot be handled in a DFA. This code 162 * duplicates the ROP_BRACE subtree as a left-towering 163 * series of ROP_CAT nodes, the first "lo" of which are 164 * direct copies of the original subtree. The tail of 165 * the series are either some number of ROP_QUESTs over 166 * copies of the original subtree, or a single ROP_PLUS 167 * over a copy (when "hi" is infinity). 168 * 169 * All interesting cases {lo,hi}: 170 * {0,0} -> ROP_EMPTY, parsing, temporary 171 * {0,1} -> ROP_QUEST, parsing 172 * {0,2} -> CAT(QUEST(left), QUEST(copy)) 173 * {0,n} -> CAT({0,n-1}, QUEST(copy)) 174 * {0,} -> ROP_STAR, parsing 175 * 176 * {1,1} -> ROP_NOP, parsing, temporary 177 * {1,2} -> CAT(left, QUEST(copy)) 178 * {1,n} -> CAT({1,n-1}, QUEST(copy)) 179 * {1,} -> ROP_PLUS, parsing 180 * 181 * {2,2} -> CAT(left, copy) 182 * {2,n} -> CAT({2,n-1}, QUEST(copy)) 183 * {2,} -> CAT(left, PLUS(copy)) 184 * 185 * {3,3} -> CAT({2,2}, copy) 186 * {3,n} -> CAT({3,n-1}, QUEST(copy)) 187 * {3,} -> CAT({2,2}, PLUS(copy)) 188 * 189 * {n,} -> CAT({n-1,n-1}, PLUS(copy)) 190 * 191 * In all cases, the ROP_BRACE node is turned into the 192 * left-most ROP_CAT, and a copy of its original subtree 193 * is connected as the right child. Note that the bottom- 194 * up nature of this duplication guarantees that copy() 195 * never sees a ROP_BRACE node. 196 */ 197 par = tp->parent; 198 lo = tp->right.info.num[0]; 199 hi = tp->right.info.num[1]; 200 if ((ptr = copy(ep, tp->left.ptr)) == 0) 201 return 0; 202 ptr->parent = tp; 203 tp->op = ROP_CAT; 204 tp->right.ptr = ptr; 205 if (lo == 0) 206 { 207 if ((tp->left.ptr = libuxre_reg1tree(ROP_QUEST, tp->left.ptr)) 208 == 0) 209 return 0; 210 tp->left.ptr->parent = tp; 211 } 212 else 213 { 214 if (hi == BRACE_INF || (hi -= lo) == 0) 215 lo--; /* lo > 1; no extra needed */ 216 while (--lo != 0) 217 { 218 if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) 219 == 0) 220 return 0; 221 } 222 } 223 if (hi == BRACE_INF) 224 { 225 if ((tp->right.ptr = libuxre_reg1tree(ROP_PLUS, tp->right.ptr)) 226 == 0) 227 return 0; 228 tp->right.ptr->parent = tp; 229 } 230 else if (hi != 0) 231 { 232 if ((tp->right.ptr = libuxre_reg1tree(ROP_QUEST, tp->right.ptr)) 233 == 0) 234 return 0; 235 ptr = tp->right.ptr; 236 ptr->parent = tp; 237 while (--hi != 0) 238 { 239 if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr))) 240 == 0) 241 return 0; 242 } 243 } 244 tp->parent = par; 245 return tp; 246 } 247 248 /* 249 * Postorder traversal, but not always entire subtree. 250 * For each leaf reachable by the empty string, add it 251 * to the set. Return 0 if the subtree can match empty. 252 */ 253 static int 254 first(Dfa *dp, Tree *tp) 255 { 256 switch (tp->op) 257 { 258 case ROP_BOL: 259 if (dp->flags & REG_NOTBOL) 260 return 0; 261 break; 262 case ROP_EOL: 263 if (dp->flags & REG_NOTEOL) 264 return 0; 265 break; 266 case ROP_EMPTY: 267 return 0; 268 case ROP_OR: 269 return first(dp, tp->left.ptr) & first(dp, tp->right.ptr); 270 case ROP_CAT: 271 if (first(dp, tp->left.ptr) != 0) 272 return 1; 273 return first(dp, tp->right.ptr); 274 case ROP_BRACE: 275 if (tp->right.info.num[0] != 0 && first(dp, tp->left.ptr) != 0) 276 return 1; 277 /*FALLTHROUGH*/ 278 case ROP_STAR: 279 case ROP_QUEST: 280 first(dp, tp->left.ptr); 281 return 0; 282 case ROP_LP: 283 case ROP_PLUS: 284 return first(dp, tp->left.ptr); 285 } 286 if (dp->posset[tp->left.pos] == 0) 287 { 288 dp->posset[tp->left.pos] = 1; 289 dp->nset++; 290 } 291 return 1; 292 } 293 294 /* 295 * Walk from leaf up (most likely not to root). 296 * Determine follow set for the leaf by filling 297 * set[] with the positions reachable. 298 */ 299 static void 300 follow(Dfa *dp, Tree *tp) 301 { 302 Tree *pp; 303 304 switch ((pp = tp->parent)->op) 305 { 306 case ROP_CAT: 307 if (pp->left.ptr == tp && first(dp, pp->right.ptr) != 0) 308 break; 309 /*FALLTHROUGH*/ 310 case ROP_OR: 311 case ROP_QUEST: 312 case ROP_LP: 313 follow(dp, pp); 314 break; 315 case ROP_STAR: 316 case ROP_PLUS: 317 case ROP_BRACE: 318 first(dp, tp); 319 follow(dp, pp); 320 break; 321 } 322 } 323 324 /* 325 * Postorder traversal. 326 * At each leaf, copy it into posn[] and assign its follow set. 327 * Because the left-most subtree is ROP_ALL under ROP_STAR, the 328 * follow set for its leaf (position dp->nposn-1) is the same 329 * as the initial state's signature (prior to any ROP_BOL). 330 */ 331 static int 332 posnfoll(Dfa *dp, Tree *tp) 333 { 334 unsigned char *s; 335 size_t i, n; 336 size_t *fp; 337 Posn *p; 338 int ret; 339 340 switch (tp->op) 341 { 342 case ROP_OR: 343 case ROP_CAT: 344 if ((ret = posnfoll(dp, tp->right.ptr)) != 0) 345 return ret; 346 /*FALLTHROUGH*/ 347 case ROP_STAR: 348 case ROP_PLUS: 349 case ROP_QUEST: 350 case ROP_LP: 351 if ((ret = posnfoll(dp, tp->left.ptr)) != 0) 352 return ret; 353 return 0; 354 case ROP_END: /* keeps follow() from walking above the root */ 355 p = &dp->posn[tp->left.pos]; 356 p->op = tp->op; 357 p->seti = 0; 358 p->nset = 0; 359 return 0; 360 case ROP_BKT: 361 case ROP_BKTCOPY: 362 p = &dp->posn[tp->left.pos]; 363 p->bkt = tp->right.info.bkt; 364 goto skip; 365 case ROP_BOL: 366 dp->flags |= REG_NOTBOL; /* adjacent ROP_BOLs match empty */ 367 break; 368 case ROP_EOL: 369 dp->flags |= REG_NOTEOL; /* adjacent ROP_EOLs match empty */ 370 break; 371 } 372 p = &dp->posn[tp->left.pos]; 373 skip:; 374 p->op = tp->op; 375 memset(dp->posset, 0, dp->nposn); 376 dp->nset = 0; 377 follow(dp, tp); 378 dp->flags &= ~(REG_NOTBOL | REG_NOTEOL); 379 fp = dp->posfoll; 380 if ((p->nset = dp->nset) > dp->avail) /* need more */ 381 { 382 if ((n = p->nset << 1) < dp->nposn) 383 n = dp->nposn; 384 dp->avail += n; 385 if ((fp = realloc(dp->posfoll, 386 sizeof(size_t) * (dp->avail + dp->used))) == 0) 387 { 388 return REG_ESPACE; 389 } 390 dp->posfoll = fp; 391 } 392 p->seti = dp->used; 393 if ((i = dp->nset) != 0) 394 { 395 dp->used += i; 396 dp->avail -= i; 397 fp += p->seti; 398 s = dp->posset; 399 n = 0; 400 do 401 { 402 if (*s++ != 0) 403 { 404 *fp++ = n; 405 if (--i == 0) 406 break; 407 } 408 } while (++n != dp->nposn); 409 } 410 return 0; 411 } 412 413 static int 414 addstate(Dfa *dp) /* install state if unique; return its index */ 415 { 416 size_t *sp, *fp; 417 size_t t, n, i; 418 int flushed; 419 420 /* 421 * Compare dp->nset/dp->cursig[] against remembered states. 422 */ 423 t = dp->top; 424 do 425 { 426 if (dp->nsig[--t] != dp->nset) 427 continue; 428 if ((n = dp->nset) != 0) 429 { 430 fp = &dp->sigfoll[dp->sigi[t]]; 431 sp = &dp->cursig[0]; 432 loop:; 433 if (*fp++ != *sp++) 434 continue; /* to the do-while */ 435 if (--n != 0) 436 goto loop; 437 } 438 return t + 1; 439 } while (t != 0); 440 /* 441 * Not in currently cached states; add it. 442 */ 443 flushed = 0; 444 if ((t = dp->top) >= CACHESZ) /* need to flush the cache */ 445 { 446 flushed = 1; 447 n = dp->anybol; 448 n = dp->sigi[n] + dp->nsig[n]; /* past invariant states */ 449 dp->avail += dp->used - n; 450 dp->used = n; 451 dp->top = n = dp->nfix; 452 memset((void *)&dp->trans, 0, sizeof(dp->trans)); 453 memset((void *)&dp->acc[n], 0, CACHESZ - n); 454 t = n; 455 } 456 dp->top++; 457 fp = dp->sigfoll; 458 if ((n = dp->nset) > dp->avail) /* grow strip */ 459 { 460 i = dp->avail + n << 1; 461 if ((fp = realloc(fp, sizeof(size_t) * (i + dp->used))) == 0) 462 return 0; 463 dp->avail = i; 464 dp->sigfoll = fp; 465 } 466 dp->acc[t] = 0; 467 if ((dp->nsig[t] = n) != 0) 468 { 469 sp = dp->cursig; 470 if (sp[0] == 0) 471 dp->acc[t] = 1; 472 dp->sigi[t] = i = dp->used; 473 dp->used += n; 474 dp->avail -= n; 475 fp += i; 476 do 477 *fp++ = *sp++; 478 while (--n != 0); 479 } 480 t++; 481 if (flushed) 482 return -t; 483 return t; 484 } 485 486 void 487 libuxre_regdeldfa(Dfa *dp) 488 { 489 Posn *pp; 490 size_t np; 491 492 if (dp->posfoll != 0) 493 free(dp->posfoll); 494 if (dp->sigfoll != 0) 495 free(dp->sigfoll); 496 if (dp->cursig != 0) 497 free(dp->cursig); 498 if ((pp = dp->posn) != 0) 499 { 500 /* 501 * Need to walk the positions list to free any 502 * space used for ROP_BKTs. 503 */ 504 np = dp->nposn; 505 do 506 { 507 if (pp->op == ROP_BKT) 508 { 509 libuxre_bktfree(pp->bkt); 510 free(pp->bkt); 511 } 512 } while (++pp, --np != 0); 513 free(dp->posn); 514 } 515 free(dp); 516 } 517 518 int 519 regtrans(Dfa *dp, int st, w_type wc, int mb_cur_max) 520 { 521 const unsigned char *s; 522 size_t *fp, *sp; 523 size_t i, n; 524 Posn *pp; 525 int nst; 526 527 if ((n = dp->nsig[st]) == 0) /* dead state */ 528 return st + 1; /* stay here */ 529 memset(dp->posset, 0, dp->nposn); 530 dp->nset = 0; 531 fp = &dp->sigfoll[dp->sigi[st]]; 532 do 533 { 534 pp = &dp->posn[*fp]; 535 switch (pp->op) 536 { 537 case ROP_EOL: 538 if (wc == '\0' && (dp->flags & REG_NOTEOL) == 0) 539 break; 540 /*FALLTHROUGH*/ 541 case ROP_BOL: 542 default: 543 if (pp->op == wc) 544 break; 545 /*FALLTHROUGH*/ 546 case ROP_END: 547 case ROP_NONE: 548 continue; 549 case ROP_NOTNL: 550 if (wc == '\n') 551 continue; 552 /*FALLTHROUGH*/ 553 case ROP_ANYCH: 554 if (wc <= '\0') 555 continue; 556 break; 557 case ROP_ALL: 558 if (wc == '\0') 559 continue; 560 break; 561 case ROP_BKT: 562 case ROP_BKTCOPY: 563 /* 564 * Note that multiple character bracket matches 565 * are precluded from DFAs. (See regparse.c and 566 * regcomp.c.) Thus, the continuation string 567 * argument is not used in libuxre_bktmbexec(). 568 */ 569 if (wc > '\0' && 570 libuxre_bktmbexec(pp->bkt, wc, 0, mb_cur_max) == 0) 571 break; 572 continue; 573 } 574 /* 575 * Current character matches this position. 576 * For each position in its follow list, 577 * add that position to the new state's signature. 578 */ 579 i = pp->nset; 580 sp = &dp->posfoll[pp->seti]; 581 do 582 { 583 if (dp->posset[*sp] == 0) 584 { 585 dp->posset[*sp] = 1; 586 dp->nset++; 587 } 588 } while (++sp, --i != 0); 589 } while (++fp, --n != 0); 590 /* 591 * Move the signature (if any) into cursig[] and install it. 592 */ 593 if ((i = dp->nset) != 0) 594 { 595 fp = dp->cursig; 596 s = dp->posset; 597 for (n = 0;; n++) 598 { 599 if (*s++ != 0) 600 { 601 *fp++ = n; 602 if (--i == 0) 603 break; 604 } 605 } 606 } 607 if ((nst = addstate(dp)) < 0) /* flushed cache */ 608 nst = -nst; 609 else if (nst > 0 && (wc & ~(long)(NCHAR - 1)) == 0) 610 dp->trans[st][wc] = nst; 611 return nst; 612 } 613 614 LIBUXRE_STATIC int 615 libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp) 616 { 617 Tree *lp; 618 Dfa *dp; 619 Posn *p; 620 int st; 621 622 /* 623 * It's convenient to insert an STAR(ALL) subtree to the 624 * immediate left of the current tree. This makes the 625 * "any match" libuxre_regdfaexec() not a special case, 626 * and the initial state signature will fall out when 627 * building the follow sets for all the leaves. 628 */ 629 if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0 630 || (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0 631 || (tp->left.ptr = lp 632 = libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0) 633 { 634 return REG_ESPACE; 635 } 636 lp->parent = tp; 637 if ((dp = calloc(1, sizeof(Dfa))) == 0) 638 return REG_ESPACE; 639 ep->re_dfa = dp; 640 /* 641 * Just in case null pointers aren't just all bits zero... 642 */ 643 dp->posfoll = 0; 644 dp->sigfoll = 0; 645 dp->cursig = 0; 646 dp->posn = 0; 647 /* 648 * Assign position values to each of the tree's leaves 649 * (the important parts), meanwhile potentially rewriting 650 * the parse tree so that it fits within the restrictions 651 * of our DFA. 652 */ 653 if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0) 654 goto err; 655 /* 656 * Get space for the array of positions and current set, 657 * now that the number of positions is known. 658 */ 659 if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0) 660 goto err; 661 dp->posset = (unsigned char *)&dp->posn[dp->nposn]; 662 /* 663 * Get follow sets for each position. 664 */ 665 if (posnfoll(dp, tp) != 0) 666 goto err; 667 /* 668 * Set up the special invariant states: 669 * - dead state (no valid transitions); index 0. 670 * - initial state for any match [STAR(ALL) follow set]; index 1. 671 * - initial state for any match after ROP_BOL. 672 * - initial state for left-most longest if REG_NOTBOL. 673 * - initial state for left-most longest after ROP_BOL. 674 * The final two are not allocated if leftmost() cannot be called. 675 * The pairs of initial states are the same if there is no 676 * explicit ROP_BOL transition. 677 */ 678 dp->avail += dp->used; 679 dp->used = 0; 680 if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0) 681 goto err; 682 p = &dp->posn[dp->nposn - 1]; /* same as first(root) */ 683 dp->cursig = &dp->posfoll[p->seti]; 684 dp->nset = p->nset; 685 dp->top = 1; /* index 0 is dead state */ 686 addstate(dp); /* must be state index 1 (returns 2) */ 687 if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0) 688 goto err; 689 dp->nfix = 2; 690 if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0) 691 goto err; 692 if ((dp->anybol = st - 1) == 2) /* new state */ 693 dp->nfix = 3; 694 if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */ 695 { 696 /* 697 * leftmost() initial states are the same as the 698 * "any match" ones without the STAR(ALL) position. 699 */ 700 dp->sigi[dp->nfix] = 0; 701 dp->nsig[dp->nfix] = dp->nsig[1] - 1; 702 dp->acc[dp->nfix] = dp->acc[1]; 703 dp->leftbol = dp->leftmost = dp->nfix; 704 dp->nfix++; 705 if (dp->anybol != 1) /* distinct state w/BOL */ 706 { 707 dp->sigi[dp->nfix] = dp->sigi[2]; 708 dp->nsig[dp->nfix] = dp->nsig[2] - 1; 709 dp->acc[dp->nfix] = dp->acc[2]; 710 dp->leftbol = dp->nfix; 711 dp->nfix++; 712 } 713 dp->top = dp->nfix; 714 } 715 return 0; 716 err:; 717 libuxre_regdeldfa(dp); 718 return REG_ESPACE; 719 } 720 721 static int 722 leftmost(Dfa *dp, Exec *xp) 723 { 724 const unsigned char *s, *beg, *end; 725 int i, nst, st, mb_cur_max; 726 w_type wc; 727 728 mb_cur_max = xp->mb_cur_max; 729 beg = s = xp->str; 730 end = 0; 731 st = dp->leftbol; 732 if (xp->flags & REG_NOTBOL) 733 st = dp->leftmost; 734 if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) 735 end = s; /* initial empty match allowed */ 736 for (;;) 737 { 738 if ((wc = *s++) == '\n') 739 { 740 if (xp->flags & REG_NEWLINE) 741 wc = ROP_EOL; 742 } 743 else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) 744 s += i; 745 if ((wc & ~(long)(NCHAR - 1)) != 0 746 || (nst = dp->trans[st][wc]) == 0) 747 { 748 if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) 749 return REG_ESPACE; 750 if (wc == ROP_EOL) /* REG_NEWLINE only */ 751 { 752 if (dp->acc[nst - 1]) 753 { 754 if (end == 0 || end < s) 755 end = s; 756 break; 757 } 758 beg = s; 759 st = dp->leftbol; 760 goto newst; 761 } 762 } 763 if ((st = nst - 1) == 0) /* dead state */ 764 { 765 if (end != 0) 766 break; 767 if ((wc = *beg++) == '\0') 768 return REG_NOMATCH; 769 else if (!ISONEBYTE(wc) && 770 (i = libuxre_mb2wc(&wc, beg)) > 0) 771 beg += i; 772 s = beg; 773 st = dp->leftmost; 774 goto newst; 775 } 776 if (wc == '\0') 777 { 778 if (dp->acc[st]) 779 { 780 s--; /* don't include \0 */ 781 if (end == 0 || end < s) 782 end = s; 783 break; 784 } 785 if (end != 0) 786 break; 787 return REG_NOMATCH; 788 } 789 newst:; 790 if (dp->acc[st]) 791 { 792 if (end == 0 || end < s) 793 end = s; 794 } 795 } 796 xp->match[0].rm_so = beg - xp->str; 797 xp->match[0].rm_eo = end - xp->str; 798 return 0; 799 } 800 801 /* 802 * Optimization by simplification: singlebyte locale and REG_NEWLINE not set. 803 * Performance gain for grep is 25% so it's worth the hack. 804 */ 805 static int 806 regdfaexec_opt(Dfa *dp, Exec *xp) 807 { 808 const unsigned char *s; 809 int nst, st; 810 811 s = xp->str; 812 st = dp->anybol; 813 if (xp->flags & REG_NOTBOL) 814 st = 1; 815 if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) 816 return 0; /* initial empty match allowed */ 817 do 818 { 819 if ((nst = dp->trans[st][*s]) == 0) 820 { 821 if ((nst = regtrans(dp, st, *s, 1)) == 0) 822 return REG_ESPACE; 823 } 824 if (dp->acc[st = nst - 1]) 825 return 0; 826 } while (*s++ != '\0'); /* st != 0 */ 827 return REG_NOMATCH; 828 } 829 830 LIBUXRE_STATIC int 831 libuxre_regdfaexec(Dfa *dp, Exec *xp) 832 { 833 const unsigned char *s; 834 int i, nst, st, mb_cur_max; 835 w_type wc; 836 837 dp->flags = xp->flags & REG_NOTEOL; /* for regtrans() */ 838 mb_cur_max = xp->mb_cur_max; 839 if (xp->nmatch != 0) 840 return leftmost(dp, xp); 841 if (mb_cur_max == 1 && (xp->flags & REG_NEWLINE) == 0) 842 return regdfaexec_opt(dp, xp); 843 s = xp->str; 844 st = dp->anybol; 845 if (xp->flags & REG_NOTBOL) 846 st = 1; 847 if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0) 848 return 0; /* initial empty match allowed */ 849 for (;;) 850 { 851 if ((wc = *s++) == '\n') 852 { 853 if (xp->flags & REG_NEWLINE) 854 wc = ROP_EOL; 855 } 856 else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0) 857 s += i; 858 if ((wc & ~(long)(NCHAR - 1)) != 0 859 || (nst = dp->trans[st][wc]) == 0) 860 { 861 if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0) 862 return REG_ESPACE; 863 if (wc == ROP_EOL) /* REG_NEWLINE only */ 864 { 865 if (dp->acc[nst - 1]) 866 return 0; 867 if (dp->acc[st = dp->anybol]) 868 return 0; 869 continue; 870 } 871 } 872 if (dp->acc[st = nst - 1]) 873 return 0; 874 if (wc == '\0') /* st == 0 */ 875 return REG_NOMATCH; 876 } 877 }