hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

regdfa.c (19532B)


      1 /*
      2  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
      3  *
      4  * Sccsid @(#)regdfa.c	1.9 (gritter) 9/22/03
      5  */
      6 /*  UNIX(R) Regular Expresssion Library
      7  *
      8  *  Note: Code is released under the GNU LGPL
      9  *
     10  *  Copyright (C) 2001 Caldera International, Inc.
     11  *
     12  *  This library is free software; you can redistribute it and/or
     13  *  modify it under the terms of the GNU Lesser General Public
     14  *  License as published by the Free Software Foundation; either
     15  *  version 2 of the License, or (at your option) any later version.
     16  *
     17  *  This library is distributed in the hope that it will be useful,
     18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20  *  Lesser General Public License for more details.
     21  *
     22  *  You should have received a copy of the GNU Lesser General Public
     23  *  License along with this library; if not, write to:
     24  *        Free Software Foundation, Inc.
     25  *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     26  */
     27 
     28 /*	#include "synonyms.h"	*/
     29 #include <stdlib.h>
     30 #include <string.h>
     31 #include <ctype.h>
     32 #include "regdfa.h"
     33 
     34 /*
     35 * Deterministic Finite Automata.
     36 */
     37 
     38 	/*
     39 	* Postorder traversal that returns a copy of the subtree,
     40 	* except that ROP_BKT becomes ROP_BKTCOPY (since they
     41 	* share the same pointed to Bracket object).
     42 	*/
     43 static Tree *
     44 copy(regex_t *ep, Tree *tp)
     45 {
     46 	Tree *np;
     47 
     48 	if ((np = malloc(sizeof(Tree))) == 0)
     49 		return 0;
     50 	switch (np->op = tp->op) /* almost always correct */
     51 	{
     52 	case ROP_BKT:
     53 		np->op = ROP_BKTCOPY;
     54 		/*FALLTHROUGH*/
     55 	case ROP_BKTCOPY:
     56 		np->right.info.bkt = tp->right.info.bkt;
     57 		/*FALLTHROUGH*/
     58 	default:
     59 		np->left.pos = ep->re_dfa->nposn++;
     60 		/*FALLTHROUGH*/
     61 	case ROP_EMPTY:
     62 		return np;
     63 	case ROP_CAT:
     64 	case ROP_OR:
     65 		if ((np->right.ptr = copy(ep, tp->right.ptr)) == 0)
     66 		{
     67 			free(np);
     68 			return 0;
     69 		}
     70 		np->right.ptr->parent = np;
     71 		/*FALLTHROUGH*/
     72 	case ROP_STAR:
     73 	case ROP_PLUS:
     74 	case ROP_QUEST:
     75 	case ROP_LP:
     76 		if ((np->left.ptr = copy(ep, tp->left.ptr)) == 0)
     77 			break;
     78 		np->left.ptr->parent = np;
     79 		return np;
     80 	}
     81 	libuxre_regdeltree(np, 1);
     82 	return 0;
     83 }
     84 
     85 	/*
     86 	* Postorder traversal.
     87 	* Assign unique ascending integer values to the leaves.
     88 	* Since the right child is traversed before the left,
     89 	* the position for ROP_END is guaranteed to be zero.
     90 	* The parse tree is rewritten in two cases:
     91 	* - Each ROP_BRACE is replaced by an equivalent--sometimes
     92 	*   large--subtree using only ROP_CAT, ROP_QUEST, and
     93 	*   ROP_PLUS.
     94 	* - If REG_ICASE, replace each simple character that has
     95 	*   an uppercase equivalent with a ROP_OR subtree over the
     96 	*   two versions.
     97 	* Since these rewrites occur bottom up, they have already
     98 	* been applied before any subtrees passed to copy().
     99 	*/
    100 static Tree *
    101 findposn(regex_t *ep, Tree *tp, int mb_cur_max)
    102 {
    103 	unsigned int lo, hi;
    104 	Tree *ptr, *par;
    105 	w_type wc;
    106 
    107 	switch (tp->op)
    108 	{
    109 	default:
    110 		if (ep->re_flags & REG_ICASE
    111 			&& (wc = to_upper(tp->op)) != tp->op)
    112 		{
    113 			if ((ptr = libuxre_reg1tree(tp->op, 0)) == 0)
    114 				return 0;
    115 			ptr->parent = tp;
    116 			ptr->left.pos = ep->re_dfa->nposn++;
    117 			tp->op = ROP_OR;
    118 			tp->left.ptr = ptr;
    119 			ptr = libuxre_reg1tree(wc, 0);
    120 			if ((tp->right.ptr = ptr) == 0)
    121 				return 0;
    122 			ptr->parent = tp;
    123 			ptr->left.pos = ep->re_dfa->nposn++;
    124 			return tp;
    125 		}
    126 		/*FALLTHROUGH*/
    127 	case ROP_BOL:
    128 	case ROP_EOL:
    129 	case ROP_ALL:
    130 	case ROP_ANYCH:
    131 	case ROP_NOTNL:
    132 	case ROP_NONE:
    133 	case ROP_BKT:
    134 	case ROP_BKTCOPY:
    135 	case ROP_END:
    136 		tp->left.pos = ep->re_dfa->nposn++;
    137 		return tp;
    138 	case ROP_EMPTY:
    139 		return tp;
    140 	case ROP_OR:
    141 	case ROP_CAT:
    142 		if ((tp->right.ptr = findposn(ep, tp->right.ptr,
    143 						mb_cur_max)) == 0)
    144 			return 0;
    145 		/*FALLTHROUGH*/
    146 	case ROP_STAR:
    147 	case ROP_PLUS:
    148 	case ROP_QUEST:
    149 	case ROP_LP:
    150 		if ((tp->left.ptr = findposn(ep, tp->left.ptr,
    151 						mb_cur_max)) == 0)
    152 			return 0;
    153 		return tp;
    154 	case ROP_BRACE:
    155 		if ((tp->left.ptr = findposn(ep, tp->left.ptr,
    156 						mb_cur_max)) == 0)
    157 			return 0;
    158 		break;
    159 	}
    160 	/*
    161 	* ROP_BRACE as is cannot be handled in a DFA.  This code
    162 	* duplicates the ROP_BRACE subtree as a left-towering
    163 	* series of ROP_CAT nodes, the first "lo" of which are
    164 	* direct copies of the original subtree.  The tail of
    165 	* the series are either some number of ROP_QUESTs over
    166 	* copies of the original subtree, or a single ROP_PLUS
    167 	* over a copy (when "hi" is infinity).
    168 	*
    169 	* All interesting cases {lo,hi}:
    170 	*  {0,0} -> ROP_EMPTY, parsing, temporary
    171 	*  {0,1} -> ROP_QUEST, parsing
    172 	*  {0,2} -> CAT(QUEST(left), QUEST(copy))
    173 	*  {0,n} -> CAT({0,n-1}, QUEST(copy))
    174 	*  {0,}  -> ROP_STAR, parsing
    175 	*
    176 	*  {1,1} -> ROP_NOP, parsing, temporary
    177 	*  {1,2} -> CAT(left, QUEST(copy))
    178 	*  {1,n} -> CAT({1,n-1}, QUEST(copy))
    179 	*  {1,}  -> ROP_PLUS, parsing
    180 	*
    181 	*  {2,2} -> CAT(left, copy)
    182 	*  {2,n} -> CAT({2,n-1}, QUEST(copy))
    183 	*  {2,}  -> CAT(left, PLUS(copy))
    184 	*
    185 	*  {3,3} -> CAT({2,2}, copy)
    186 	*  {3,n} -> CAT({3,n-1}, QUEST(copy))
    187 	*  {3,}  -> CAT({2,2}, PLUS(copy))
    188 	*
    189 	*  {n,}  -> CAT({n-1,n-1}, PLUS(copy))
    190 	*
    191 	* In all cases, the ROP_BRACE node is turned into the
    192 	* left-most ROP_CAT, and a copy of its original subtree
    193 	* is connected as the right child.  Note that the bottom-
    194 	* up nature of this duplication guarantees that copy()
    195 	* never sees a ROP_BRACE node.
    196 	*/
    197 	par = tp->parent;
    198 	lo = tp->right.info.num[0];
    199 	hi = tp->right.info.num[1];
    200 	if ((ptr = copy(ep, tp->left.ptr)) == 0)
    201 		return 0;
    202 	ptr->parent = tp;
    203 	tp->op = ROP_CAT;
    204 	tp->right.ptr = ptr;
    205 	if (lo == 0)
    206 	{
    207 		if ((tp->left.ptr = libuxre_reg1tree(ROP_QUEST, tp->left.ptr))
    208 				== 0)
    209 			return 0;
    210 		tp->left.ptr->parent = tp;
    211 	}
    212 	else
    213 	{
    214 		if (hi == BRACE_INF || (hi -= lo) == 0)
    215 			lo--;	/* lo > 1; no extra needed */
    216 		while (--lo != 0)
    217 		{
    218 			if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr)))
    219 					== 0)
    220 				return 0;
    221 		}
    222 	}
    223 	if (hi == BRACE_INF)
    224 	{
    225 		if ((tp->right.ptr = libuxre_reg1tree(ROP_PLUS, tp->right.ptr))
    226 				== 0)
    227 			return 0;
    228 		tp->right.ptr->parent = tp;
    229 	}
    230 	else if (hi != 0)
    231 	{
    232 		if ((tp->right.ptr = libuxre_reg1tree(ROP_QUEST, tp->right.ptr))
    233 				== 0)
    234 			return 0;
    235 		ptr = tp->right.ptr;
    236 		ptr->parent = tp;
    237 		while (--hi != 0)
    238 		{
    239 			if ((tp = libuxre_reg2tree(ROP_CAT, tp, copy(ep, ptr)))
    240 					== 0)
    241 				return 0;
    242 		}
    243 	}
    244 	tp->parent = par;
    245 	return tp;
    246 }
    247 
    248 	/*
    249 	* Postorder traversal, but not always entire subtree.
    250 	* For each leaf reachable by the empty string, add it
    251 	* to the set.  Return 0 if the subtree can match empty.
    252 	*/
    253 static int
    254 first(Dfa *dp, Tree *tp)
    255 {
    256 	switch (tp->op)
    257 	{
    258 	case ROP_BOL:
    259 		if (dp->flags & REG_NOTBOL)
    260 			return 0;
    261 		break;
    262 	case ROP_EOL:
    263 		if (dp->flags & REG_NOTEOL)
    264 			return 0;
    265 		break;
    266 	case ROP_EMPTY:
    267 		return 0;
    268 	case ROP_OR:
    269 		return first(dp, tp->left.ptr) & first(dp, tp->right.ptr);
    270 	case ROP_CAT:
    271 		if (first(dp, tp->left.ptr) != 0)
    272 			return 1;
    273 		return first(dp, tp->right.ptr);
    274 	case ROP_BRACE:
    275 		if (tp->right.info.num[0] != 0 && first(dp, tp->left.ptr) != 0)
    276 			return 1;
    277 		/*FALLTHROUGH*/
    278 	case ROP_STAR:
    279 	case ROP_QUEST:
    280 		first(dp, tp->left.ptr);
    281 		return 0;
    282 	case ROP_LP:
    283 	case ROP_PLUS:
    284 		return first(dp, tp->left.ptr);
    285 	}
    286 	if (dp->posset[tp->left.pos] == 0)
    287 	{
    288 		dp->posset[tp->left.pos] = 1;
    289 		dp->nset++;
    290 	}
    291 	return 1;
    292 }
    293 
    294 	/*
    295 	* Walk from leaf up (most likely not to root).
    296 	* Determine follow set for the leaf by filling
    297 	* set[] with the positions reachable.
    298 	*/
    299 static void
    300 follow(Dfa *dp, Tree *tp)
    301 {
    302 	Tree *pp;
    303 
    304 	switch ((pp = tp->parent)->op)
    305 	{
    306 	case ROP_CAT:
    307 		if (pp->left.ptr == tp && first(dp, pp->right.ptr) != 0)
    308 			break;
    309 		/*FALLTHROUGH*/
    310 	case ROP_OR:
    311 	case ROP_QUEST:
    312 	case ROP_LP:
    313 		follow(dp, pp);
    314 		break;
    315 	case ROP_STAR:
    316 	case ROP_PLUS:
    317 	case ROP_BRACE:
    318 		first(dp, tp);
    319 		follow(dp, pp);
    320 		break;
    321 	}
    322 }
    323 
    324 	/*
    325 	* Postorder traversal.
    326 	* At each leaf, copy it into posn[] and assign its follow set.
    327 	* Because the left-most subtree is ROP_ALL under ROP_STAR, the
    328 	* follow set for its leaf (position dp->nposn-1) is the same
    329 	* as the initial state's signature (prior to any ROP_BOL).
    330 	*/
    331 static int
    332 posnfoll(Dfa *dp, Tree *tp)
    333 {
    334 	unsigned char *s;
    335 	size_t i, n;
    336 	size_t *fp;
    337 	Posn *p;
    338 	int ret;
    339 
    340 	switch (tp->op)
    341 	{
    342 	case ROP_OR:
    343 	case ROP_CAT:
    344 		if ((ret = posnfoll(dp, tp->right.ptr)) != 0)
    345 			return ret;
    346 		/*FALLTHROUGH*/
    347 	case ROP_STAR:
    348 	case ROP_PLUS:
    349 	case ROP_QUEST:
    350 	case ROP_LP:
    351 		if ((ret = posnfoll(dp, tp->left.ptr)) != 0)
    352 			return ret;
    353 		return 0;
    354 	case ROP_END:	/* keeps follow() from walking above the root */
    355 		p = &dp->posn[tp->left.pos];
    356 		p->op = tp->op;
    357 		p->seti = 0;
    358 		p->nset = 0;
    359 		return 0;
    360 	case ROP_BKT:
    361 	case ROP_BKTCOPY:
    362 		p = &dp->posn[tp->left.pos];
    363 		p->bkt = tp->right.info.bkt;
    364 		goto skip;
    365 	case ROP_BOL:
    366 		dp->flags |= REG_NOTBOL; /* adjacent ROP_BOLs match empty */
    367 		break;
    368 	case ROP_EOL:
    369 		dp->flags |= REG_NOTEOL; /* adjacent ROP_EOLs match empty */
    370 		break;
    371 	}
    372 	p = &dp->posn[tp->left.pos];
    373 skip:;
    374 	p->op = tp->op;
    375 	memset(dp->posset, 0, dp->nposn);
    376 	dp->nset = 0;
    377 	follow(dp, tp);
    378 	dp->flags &= ~(REG_NOTBOL | REG_NOTEOL);
    379 	fp = dp->posfoll;
    380 	if ((p->nset = dp->nset) > dp->avail) /* need more */
    381 	{
    382 		if ((n = p->nset << 1) < dp->nposn)
    383 			n = dp->nposn;	
    384 		dp->avail += n;
    385 		if ((fp = realloc(dp->posfoll,
    386 			sizeof(size_t) * (dp->avail + dp->used))) == 0)
    387 		{
    388 			return REG_ESPACE;
    389 		}
    390 		dp->posfoll = fp;
    391 	}
    392 	p->seti = dp->used;
    393 	if ((i = dp->nset) != 0)
    394 	{
    395 		dp->used += i;
    396 		dp->avail -= i;
    397 		fp += p->seti;
    398 		s = dp->posset;
    399 		n = 0;
    400 		do
    401 		{
    402 			if (*s++ != 0)
    403 			{
    404 				*fp++ = n;
    405 				if (--i == 0)
    406 					break;
    407 			}
    408 		} while (++n != dp->nposn);
    409 	}
    410 	return 0;
    411 }
    412 
    413 static int
    414 addstate(Dfa *dp) /* install state if unique; return its index */
    415 {
    416 	size_t *sp, *fp;
    417 	size_t t, n, i;
    418 	int flushed;
    419 
    420 	/*
    421 	* Compare dp->nset/dp->cursig[] against remembered states.
    422 	*/
    423 	t = dp->top;
    424 	do
    425 	{
    426 		if (dp->nsig[--t] != dp->nset)
    427 			continue;
    428 		if ((n = dp->nset) != 0)
    429 		{
    430 			fp = &dp->sigfoll[dp->sigi[t]];
    431 			sp = &dp->cursig[0];
    432 		loop:;
    433 			if (*fp++ != *sp++)
    434 				continue; /* to the do-while */
    435 			if (--n != 0)
    436 				goto loop;
    437 		}
    438 		return t + 1;
    439 	} while (t != 0);
    440 	/*
    441 	* Not in currently cached states; add it.
    442 	*/
    443 	flushed = 0;
    444 	if ((t = dp->top) >= CACHESZ)	/* need to flush the cache */
    445 	{
    446 		flushed = 1;
    447 		n = dp->anybol;
    448 		n = dp->sigi[n] + dp->nsig[n];	/* past invariant states */
    449 		dp->avail += dp->used - n;
    450 		dp->used = n;
    451 		dp->top = n = dp->nfix;
    452 		memset((void *)&dp->trans, 0, sizeof(dp->trans));
    453 		memset((void *)&dp->acc[n], 0, CACHESZ - n);
    454 		t = n;
    455 	}
    456 	dp->top++;
    457 	fp = dp->sigfoll;
    458 	if ((n = dp->nset) > dp->avail)	/* grow strip */
    459 	{
    460 		i = dp->avail + n << 1;
    461 		if ((fp = realloc(fp, sizeof(size_t) * (i + dp->used))) == 0)
    462 			return 0;
    463 		dp->avail = i;
    464 		dp->sigfoll = fp;
    465 	}
    466 	dp->acc[t] = 0;
    467 	if ((dp->nsig[t] = n) != 0)
    468 	{
    469 		sp = dp->cursig;
    470 		if (sp[0] == 0)
    471 			dp->acc[t] = 1;
    472 		dp->sigi[t] = i = dp->used;
    473 		dp->used += n;
    474 		dp->avail -= n;
    475 		fp += i;
    476 		do
    477 			*fp++ = *sp++;
    478 		while (--n != 0);
    479 	}
    480 	t++;
    481 	if (flushed)
    482 		return -t;
    483 	return t;
    484 }
    485 
    486 void
    487 libuxre_regdeldfa(Dfa *dp)
    488 {
    489 	Posn *pp;
    490 	size_t np;
    491 
    492 	if (dp->posfoll != 0)
    493 		free(dp->posfoll);
    494 	if (dp->sigfoll != 0)
    495 		free(dp->sigfoll);
    496 	if (dp->cursig != 0)
    497 		free(dp->cursig);
    498 	if ((pp = dp->posn) != 0)
    499 	{
    500 		/*
    501 		* Need to walk the positions list to free any
    502 		* space used for ROP_BKTs.
    503 		*/
    504 		np = dp->nposn;
    505 		do
    506 		{
    507 			if (pp->op == ROP_BKT)
    508 			{
    509 				libuxre_bktfree(pp->bkt);
    510 				free(pp->bkt);
    511 			}
    512 		} while (++pp, --np != 0);
    513 		free(dp->posn);
    514 	}
    515 	free(dp);
    516 }
    517 
    518 int
    519 regtrans(Dfa *dp, int st, w_type wc, int mb_cur_max)
    520 {
    521 	const unsigned char *s;
    522 	size_t *fp, *sp;
    523 	size_t i, n;
    524 	Posn *pp;
    525 	int nst;
    526 
    527 	if ((n = dp->nsig[st]) == 0)	/* dead state */
    528 		return st + 1;		/* stay here */
    529 	memset(dp->posset, 0, dp->nposn);
    530 	dp->nset = 0;
    531 	fp = &dp->sigfoll[dp->sigi[st]];
    532 	do
    533 	{
    534 		pp = &dp->posn[*fp];
    535 		switch (pp->op)
    536 		{
    537 		case ROP_EOL:
    538 			if (wc == '\0' && (dp->flags & REG_NOTEOL) == 0)
    539 				break;
    540 			/*FALLTHROUGH*/
    541 		case ROP_BOL:
    542 		default:
    543 			if (pp->op == wc)
    544 				break;
    545 			/*FALLTHROUGH*/
    546 		case ROP_END:
    547 		case ROP_NONE:
    548 			continue;
    549 		case ROP_NOTNL:
    550 			if (wc == '\n')
    551 				continue;
    552 			/*FALLTHROUGH*/
    553 		case ROP_ANYCH:
    554 			if (wc <= '\0')
    555 				continue;
    556 			break;
    557 		case ROP_ALL:
    558 			if (wc == '\0')
    559 				continue;
    560 			break;
    561 		case ROP_BKT:
    562 		case ROP_BKTCOPY:
    563 			/*
    564 			* Note that multiple character bracket matches
    565 			* are precluded from DFAs.  (See regparse.c and
    566 			* regcomp.c.)  Thus, the continuation string
    567 			* argument is not used in libuxre_bktmbexec().
    568 			*/
    569 			if (wc > '\0' &&
    570 			    libuxre_bktmbexec(pp->bkt, wc, 0, mb_cur_max) == 0)
    571 				break;
    572 			continue;
    573 		}
    574 		/*
    575 		* Current character matches this position.
    576 		* For each position in its follow list,
    577 		* add that position to the new state's signature.
    578 		*/
    579 		i = pp->nset;
    580 		sp = &dp->posfoll[pp->seti];
    581 		do
    582 		{
    583 			if (dp->posset[*sp] == 0)
    584 			{
    585 				dp->posset[*sp] = 1;
    586 				dp->nset++;
    587 			}
    588 		} while (++sp, --i != 0);
    589 	} while (++fp, --n != 0);
    590 	/*
    591 	* Move the signature (if any) into cursig[] and install it.
    592 	*/
    593 	if ((i = dp->nset) != 0)
    594 	{
    595 		fp = dp->cursig;
    596 		s = dp->posset;
    597 		for (n = 0;; n++)
    598 		{
    599 			if (*s++ != 0)
    600 			{
    601 				*fp++ = n;
    602 				if (--i == 0)
    603 					break;
    604 			}
    605 		}
    606 	}
    607 	if ((nst = addstate(dp)) < 0) /* flushed cache */
    608 		nst = -nst;
    609 	else if (nst > 0 && (wc & ~(long)(NCHAR - 1)) == 0)
    610 		dp->trans[st][wc] = nst;
    611 	return nst;
    612 }
    613 
    614 LIBUXRE_STATIC int
    615 libuxre_regdfacomp(regex_t *ep, Tree *tp, Lex *lxp)
    616 {
    617 	Tree *lp;
    618 	Dfa *dp;
    619 	Posn *p;
    620 	int st;
    621 
    622 	/*
    623 	* It's convenient to insert an STAR(ALL) subtree to the
    624 	* immediate left of the current tree.  This makes the
    625 	* "any match" libuxre_regdfaexec() not a special case,
    626 	* and the initial state signature will fall out when
    627 	* building the follow sets for all the leaves.
    628 	*/
    629 	if ((lp = libuxre_reg1tree(ROP_ALL, 0)) == 0
    630 		|| (lp = libuxre_reg1tree(ROP_STAR, lp)) == 0
    631 		|| (tp->left.ptr = lp
    632 			= libuxre_reg2tree(ROP_CAT, lp, tp->left.ptr)) == 0)
    633 	{
    634 		return REG_ESPACE;
    635 	}
    636 	lp->parent = tp;
    637 	if ((dp = calloc(1, sizeof(Dfa))) == 0)
    638 		return REG_ESPACE;
    639 	ep->re_dfa = dp;
    640 	/*
    641 	* Just in case null pointers aren't just all bits zero...
    642 	*/
    643 	dp->posfoll = 0;
    644 	dp->sigfoll = 0;
    645 	dp->cursig = 0;
    646 	dp->posn = 0;
    647 	/*
    648 	* Assign position values to each of the tree's leaves
    649 	* (the important parts), meanwhile potentially rewriting
    650 	* the parse tree so that it fits within the restrictions
    651 	* of our DFA.
    652 	*/
    653 	if ((tp = findposn(ep, tp, lxp->mb_cur_max)) == 0)
    654 		goto err;
    655 	/*
    656 	* Get space for the array of positions and current set,
    657 	* now that the number of positions is known.
    658 	*/
    659 	if ((dp->posn = malloc(sizeof(Posn) * dp->nposn + dp->nposn)) == 0)
    660 		goto err;
    661 	dp->posset = (unsigned char *)&dp->posn[dp->nposn];
    662 	/*
    663 	* Get follow sets for each position.
    664 	*/
    665 	if (posnfoll(dp, tp) != 0)
    666 		goto err;
    667 	/*
    668 	* Set up the special invariant states:
    669 	*  - dead state (no valid transitions); index 0.
    670 	*  - initial state for any match [STAR(ALL) follow set]; index 1.
    671 	*  - initial state for any match after ROP_BOL.
    672 	*  - initial state for left-most longest if REG_NOTBOL.
    673 	*  - initial state for left-most longest after ROP_BOL.
    674 	* The final two are not allocated if leftmost() cannot be called.
    675 	* The pairs of initial states are the same if there is no
    676 	* explicit ROP_BOL transition.
    677 	*/
    678 	dp->avail += dp->used;
    679 	dp->used = 0;
    680 	if ((dp->sigfoll = malloc(sizeof(size_t) * dp->avail)) == 0)
    681 		goto err;
    682 	p = &dp->posn[dp->nposn - 1];	/* same as first(root) */
    683 	dp->cursig = &dp->posfoll[p->seti];
    684 	dp->nset = p->nset;
    685 	dp->top = 1;	/* index 0 is dead state */
    686 	addstate(dp);	/* must be state index 1 (returns 2) */
    687 	if ((dp->cursig = malloc(sizeof(size_t) * dp->nposn)) == 0)
    688 		goto err;
    689 	dp->nfix = 2;
    690 	if ((st = regtrans(dp, 1, ROP_BOL, lxp->mb_cur_max)) == 0)
    691 		goto err;
    692 	if ((dp->anybol = st - 1) == 2) /* new state */
    693 		dp->nfix = 3;
    694 	if ((ep->re_flags & REG_NOSUB) == 0) /* leftmost() might be called */
    695 	{
    696 		/*
    697 		* leftmost() initial states are the same as the
    698 		* "any match" ones without the STAR(ALL) position.
    699 		*/
    700 		dp->sigi[dp->nfix] = 0;
    701 		dp->nsig[dp->nfix] = dp->nsig[1] - 1;
    702 		dp->acc[dp->nfix] = dp->acc[1];
    703 		dp->leftbol = dp->leftmost = dp->nfix;
    704 		dp->nfix++;
    705 		if (dp->anybol != 1)	/* distinct state w/BOL */
    706 		{
    707 			dp->sigi[dp->nfix] = dp->sigi[2];
    708 			dp->nsig[dp->nfix] = dp->nsig[2] - 1;
    709 			dp->acc[dp->nfix] = dp->acc[2];
    710 			dp->leftbol = dp->nfix;
    711 			dp->nfix++;
    712 		}
    713 		dp->top = dp->nfix;
    714 	}
    715 	return 0;
    716 err:;
    717 	libuxre_regdeldfa(dp);
    718 	return REG_ESPACE;
    719 }
    720 
    721 static int
    722 leftmost(Dfa *dp, Exec *xp)
    723 {
    724 	const unsigned char *s, *beg, *end;
    725 	int i, nst, st, mb_cur_max;
    726 	w_type wc;
    727 
    728 	mb_cur_max = xp->mb_cur_max;
    729 	beg = s = xp->str;
    730 	end = 0;
    731 	st = dp->leftbol;
    732 	if (xp->flags & REG_NOTBOL)
    733 		st = dp->leftmost;
    734 	if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
    735 		end = s;	/* initial empty match allowed */
    736 	for (;;)
    737 	{
    738 		if ((wc = *s++) == '\n')
    739 		{
    740 			if (xp->flags & REG_NEWLINE)
    741 				wc = ROP_EOL;
    742 		}
    743 		else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0)
    744 			s += i;
    745 		if ((wc & ~(long)(NCHAR - 1)) != 0
    746 			|| (nst = dp->trans[st][wc]) == 0)
    747 		{
    748 			if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0)
    749 				return REG_ESPACE;
    750 			if (wc == ROP_EOL) /* REG_NEWLINE only */
    751 			{
    752 				if (dp->acc[nst - 1])
    753 				{
    754 					if (end == 0 || end < s)
    755 						end = s;
    756 					break;
    757 				}
    758 				beg = s;
    759 				st = dp->leftbol;
    760 				goto newst;
    761 			}
    762 		}
    763 		if ((st = nst - 1) == 0) /* dead state */
    764 		{
    765 			if (end != 0)
    766 				break;
    767 			if ((wc = *beg++) == '\0')
    768 				return REG_NOMATCH;
    769 			else if (!ISONEBYTE(wc) &&
    770 					(i = libuxre_mb2wc(&wc, beg)) > 0)
    771 				beg += i;
    772 			s = beg;
    773 			st = dp->leftmost;
    774 			goto newst;
    775 		}
    776 		if (wc == '\0')
    777 		{
    778 			if (dp->acc[st])
    779 			{
    780 				s--;	/* don't include \0 */
    781 				if (end == 0 || end < s)
    782 					end = s;
    783 				break;
    784 			}
    785 			if (end != 0)
    786 				break;
    787 			return REG_NOMATCH;
    788 		}
    789 	newst:;
    790 		if (dp->acc[st])
    791 		{
    792 			if (end == 0 || end < s)
    793 				end = s;
    794 		}
    795 	}
    796 	xp->match[0].rm_so = beg - xp->str;
    797 	xp->match[0].rm_eo = end - xp->str;
    798 	return 0;
    799 }
    800 
    801 /*
    802 * Optimization by simplification: singlebyte locale and REG_NEWLINE not set.
    803 * Performance gain for grep is 25% so it's worth the hack.
    804 */
    805 static int
    806 regdfaexec_opt(Dfa *dp, Exec *xp)
    807 {
    808 	const unsigned char *s;
    809 	int nst, st;
    810 
    811 	s = xp->str;
    812 	st = dp->anybol;
    813 	if (xp->flags & REG_NOTBOL)
    814 		st = 1;
    815 	if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
    816 		return 0;	/* initial empty match allowed */
    817 	do
    818 	{
    819 		if ((nst = dp->trans[st][*s]) == 0)
    820 		{
    821 			if ((nst = regtrans(dp, st, *s, 1)) == 0)
    822 				return REG_ESPACE;
    823 		}
    824 		if (dp->acc[st = nst - 1])
    825 			return 0;
    826 	} while (*s++ != '\0');	/* st != 0 */
    827 	return REG_NOMATCH;
    828 }
    829 
    830 LIBUXRE_STATIC int
    831 libuxre_regdfaexec(Dfa *dp, Exec *xp)
    832 {
    833 	const unsigned char *s;
    834 	int i, nst, st, mb_cur_max;
    835 	w_type wc;
    836 
    837 	dp->flags = xp->flags & REG_NOTEOL;	/* for regtrans() */
    838 	mb_cur_max = xp->mb_cur_max;
    839 	if (xp->nmatch != 0)
    840 		return leftmost(dp, xp);
    841 	if (mb_cur_max == 1 && (xp->flags & REG_NEWLINE) == 0)
    842 		return regdfaexec_opt(dp, xp);
    843 	s = xp->str;
    844 	st = dp->anybol;
    845 	if (xp->flags & REG_NOTBOL)
    846 		st = 1;
    847 	if (dp->acc[st] && (xp->flags & REG_NONEMPTY) == 0)
    848 		return 0;	/* initial empty match allowed */
    849 	for (;;)
    850 	{
    851 		if ((wc = *s++) == '\n')
    852 		{
    853 			if (xp->flags & REG_NEWLINE)
    854 				wc = ROP_EOL;
    855 		}
    856 		else if (!ISONEBYTE(wc) && (i = libuxre_mb2wc(&wc, s)) > 0)
    857 			s += i;
    858 		if ((wc & ~(long)(NCHAR - 1)) != 0
    859 			|| (nst = dp->trans[st][wc]) == 0)
    860 		{
    861 			if ((nst=regtrans(dp, st, wc, mb_cur_max)) == 0)
    862 				return REG_ESPACE;
    863 			if (wc == ROP_EOL) /* REG_NEWLINE only */
    864 			{
    865 				if (dp->acc[nst - 1])
    866 					return 0;
    867 				if (dp->acc[st = dp->anybol])
    868 					return 0;
    869 				continue;
    870 			}
    871 		}
    872 		if (dp->acc[st = nst - 1])
    873 			return 0;
    874 		if (wc == '\0')	/* st == 0 */
    875 			return REG_NOMATCH;
    876 	}
    877 }