bracket.c - hbase - heirloom base

bracket.c (18517B)
      1 /*
      2  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
      3  *
      4  * Sccsid @(#)bracket.c	1.14 (gritter) 10/18/03
      5  */
      6 /*  UNIX(R) Regular Expression Library
      7  *
      8  *  Note: Code is released under the GNU LGPL
      9  *
     10  *  Copyright (C) 2001 Caldera International, Inc.
     11  *
     12  *  This library is free software; you can redistribute it and/or
     13  *  modify it under the terms of the GNU Lesser General Public
     14  *  License as published by the Free Software Foundation; either
     15  *  version 2 of the License, or (at your option) any later version.
     16  *
     17  *  This library is distributed in the hope that it will be useful,
     18  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     19  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20  *  Lesser General Public License for more details.
     21  *
     22  *  You should have received a copy of the GNU Lesser General Public
     23  *  License along with this library; if not, write to:
     24  *        Free Software Foundation, Inc.
     25  *        59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
     26  */
     27 
     28 /*	#include "synonyms.h"	*/
     29 #include <ctype.h>
     30 #include <stdlib.h>
     31 #include <string.h>
     32 #include "re.h"
     33 
     34 /*
     35 * Build and match the [...] part of REs.
     36 *
     37 * In general, each compiled bracket construct holds a set of mapped
     38 * wide character values and a set of character classifications.
     39 * The mapping applied (when the current LC_COLLATE is not CHF_ENCODED)
     40 * is the "basic" weight (cep->weight[0]); otherwise the actual wide
     41 * character is used.
     42 *
     43 * To support simplified range handling, this code assumes that a w_type,
     44 * a signed integer type, can hold all valid basic weight values (as well
     45 * as all wide character values for CHF_ENCODED locales) and that these
     46 * are all positive.  Negative values indicate error conditions (BKT_*);
     47 * zero (which must be the same as WGHT_IGNORE) indicates success, but
     48 * that the item installed is not a range endpoint.
     49 */
     50 
     51 static int
     52 addwide(Bracket *bp, wchar_t ord)
     53 {
     54 	unsigned int nw;
     55 
     56 	if ((nw = bp->nwide) < NWIDE)
     57 		bp->wide[nw] = ord;
     58 	else
     59 	{
     60 		if (nw % NWIDE == 0 && (bp->exwide =
     61 			realloc(bp->exwide, nw * sizeof(wchar_t))) == 0)
     62 		{
     63 			return BKT_ESPACE;
     64 		}
     65 		nw -= NWIDE;
     66 		bp->exwide[nw] = ord;
     67 	}
     68 	bp->nwide++;
     69 	return 0;
     70 }
     71 
     72 #if USHRT_MAX == 65535	/* have 16 bits */
     73 #define PLIND(n)	((n) >> 4)
     74 #define PLBIT(n)	(1 << ((n) & 0xf))
     75 #else
     76 #define PLIND(n)	((n) / CHAR_BIT)
     77 #define PLBIT(n)	(1 << ((n) % CHAR_BIT))
     78 #endif
     79 
     80 #define RANGE	((wchar_t)'-')	/* separates wide chars in ranges */
     81 
     82 static int
     83 addrange(Bracket *bp, wchar_t ord, w_type prev)
     84 {
     85 	int ret;
     86 
     87 	if (prev > 0 && prev != ord) /* try for range */
     88 	{
     89 		if (prev > ord)
     90 		{
     91 			if (bp->flags & BKT_ODDRANGE)	/* prev only - done */
     92 				return 0;
     93 			else if ((bp->flags & BKT_BADRANGE) == 0)
     94 				return BKT_ERANGE;
     95 		}
     96 		else
     97 		{
     98 			if (++prev <= UCHAR_MAX) /* "prev" already there */
     99 			{
    100 				do
    101 				{
    102 					bp->byte[PLIND(prev)] |= PLBIT(prev);
    103 					if (prev == ord)
    104 						return 0;
    105 				} while (++prev <= UCHAR_MAX);
    106 			}
    107 			if ((ret = addwide(bp, prev)) != 0)
    108 				return ret;
    109 			if (++prev > ord)
    110 				return 0;
    111 			if (prev < ord && (ret = addwide(bp, RANGE)) != 0)
    112 				return ret;
    113 			return addwide(bp, ord);
    114 		}
    115 	}
    116 	if (ord <= UCHAR_MAX)
    117 	{
    118 		bp->byte[PLIND(ord)] |= PLBIT(ord);
    119 		return 0;
    120 	}
    121 	if (prev == ord) /* don't bother */
    122 		return 0;
    123 	return addwide(bp, ord);
    124 }
    125 
    126 static w_type
    127 place(Bracket *bp, wchar_t wc, w_type prev, int mb_cur_max)
    128 {
    129 	const CollElem *cep;
    130 	CollElem spare;
    131 	int ret;
    132 
    133 	if ((cep = libuxre_collelem(bp->col, &spare, wc)) != ELEM_ENCODED)
    134 	{
    135 		if (cep == ELEM_BADCHAR)
    136 			return BKT_BADCHAR;
    137 		wc = cep->weight[0];
    138 	}
    139 	if ((ret = addrange(bp, wc, prev)) != 0)
    140 		return ret;
    141 	return wc;
    142 }
    143 
    144 #ifndef CHARCLASS_NAME_MAX
    145 #   define CHARCLASS_NAME_MAX	127
    146 #endif
    147 
    148 static w_type
    149 chcls(Bracket *bp, const unsigned char *s, int n)
    150 {
    151 	char clsstr[CHARCLASS_NAME_MAX + 1];
    152 	unsigned int nt;
    153 	wctype_t wct;
    154 
    155 	if (n > CHARCLASS_NAME_MAX)
    156 		return BKT_ECTYPE;
    157 	(void)memcpy(clsstr, s, n);
    158 	clsstr[n] = '\0';
    159 	if ((wct = wctype(clsstr)) == 0)
    160 		return BKT_ECTYPE;
    161 	if ((nt = bp->ntype) < NTYPE)
    162 		bp->type[nt] = wct;
    163 	else
    164 	{
    165 		if (nt % NTYPE == 0 && (bp->extype =
    166 			realloc(bp->extype, nt * sizeof(wctype_t))) == 0)
    167 		{
    168 			return BKT_ESPACE;
    169 		}
    170 		nt -= NTYPE;
    171 		bp->extype[nt] = wct;
    172 	}
    173 	bp->ntype++;
    174 	return 0; /* cannot be end point of a range */
    175 }
    176 
    177 	/*
    178 	* The purpose of mcce() and its Mcce structure is to locate
    179 	* the next full collation element from "wc" and "s".  It is
    180 	* called both at compile and execute time.  These two differ
    181 	* primarily in that at compile time there is an exact number
    182 	* of bytes to be consumed, while at execute time the longest
    183 	* valid collation element is to be found.
    184 	*
    185 	* When BKT_ONECASE is set, MCCEs become particularly messy.
    186 	* There is no guarantee that all possible combinations of
    187 	* upper/lower case are defined as MCCEs.  Thus, this code
    188 	* tries both lower- and uppercase (in that order) for each
    189 	* character that might be part of an MCCE.
    190 	*/
    191 
    192 typedef struct
    193 {
    194 	const unsigned char	*max;	/* restriction by caller */
    195 	const unsigned char	*aft;	/* longest successful */
    196 	Bracket			*bp;	/* readonly */
    197 	struct lc_collate	*col;	/* readonly */
    198 	const CollElem		*cep;	/* entry matching longest */
    199 	wchar_t			ch;	/* initial character (if any) */
    200 	w_type			wc;	/* character matching "aft" */
    201 } Mcce;
    202 
    203 static int
    204 mcce(Mcce *mcp, const CollElem *cep, const unsigned char *s, int mb_cur_max,
    205 		int compile_time)
    206 {
    207 	const CollElem *nxt;
    208 	CollElem spare;
    209 	w_type ch, wc;
    210 	int i;
    211 
    212 	/*
    213 	* Get next character.
    214 	*/
    215 	if ((wc = mcp->ch) != '\0')
    216 	{
    217 		mcp->ch = '\0';
    218 	}
    219 	else if (ISONEBYTE(wc = *s++))
    220 	{
    221 		if (wc == '\0')
    222 			return 0;
    223 	}
    224 	else if ((i = libuxre_mb2wc(&wc, s)) > 0)
    225 	{
    226 		s += i;
    227 		if (mcp->max != 0 && s > mcp->max)
    228 			return 0;
    229 	}
    230 	else if (i < 0)
    231 		return BKT_ILLSEQ;
    232 	/*
    233 	* Try out the this character as part of an MCCE.
    234 	* If BKT_ONECASE is set, this code tries both the lower- and
    235 	* uppercase version, continuing if it matches so far.
    236 	*/
    237 	ch = wc;
    238 	if (mcp->bp->flags & BKT_ONECASE)
    239 	{
    240 		if ((wc = to_lower(wc)) == ch)
    241 			ch = to_upper(wc);
    242 	}
    243 	for (;;) /* at most twice */
    244 	{
    245 		if (cep == ELEM_BADCHAR) /* first character */
    246 		{
    247 			if ((nxt = libuxre_collelem(mcp->col, &spare, wc))
    248 				== ELEM_ENCODED
    249 				|| (mcp->col->flags & CHF_MULTICH) == 0
    250 				|| s == mcp->max)
    251 			{
    252 				mcp->aft = s;
    253 				mcp->cep = nxt;
    254 				mcp->wc = wc;
    255 				break;
    256 			}
    257 		}
    258 		else
    259 		{
    260 			nxt = libuxre_collmult(mcp->col, cep, wc);
    261 		} 
    262 		if (nxt != ELEM_BADCHAR)
    263 		{
    264 			/*
    265 			* Okay so far.  Record this collating element
    266 			* if it's really one (not WGHT_IGNORE) and
    267 			* we've reached a new high point or it's the
    268 			* first match.
    269 			*
    270 			* If there's a possibility for more, call mcce()
    271 			* recursively for the subsequent characters.
    272 			*/
    273 			if (nxt->weight[0] != WGHT_IGNORE
    274 				&& (mcp->aft < s || mcp->cep == ELEM_BADCHAR))
    275 			{
    276 				mcp->aft = s;
    277 				mcp->cep = nxt;
    278 				mcp->wc = wc;
    279 			}
    280 			if (nxt->multbeg != 0
    281 				&& (mcp->max == 0 || s < mcp->max))
    282 			{
    283 				if ((i = mcce(mcp, nxt, s, mb_cur_max,
    284 						compile_time)) != 0)
    285 					return i;
    286 			}
    287 		}
    288 		if (wc == ch)
    289 			break;
    290 		wc = ch;
    291 	}
    292 	return 0;
    293 }
    294 
    295 static w_type
    296 eqcls(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
    297 {
    298 	w_type last;
    299 	Mcce mcbuf;
    300 	int err;
    301 
    302 	mcbuf.max = &s[n];
    303 	mcbuf.aft = &s[0];
    304 	mcbuf.bp = bp;
    305 	mcbuf.col = bp->col;
    306 	mcbuf.cep = ELEM_BADCHAR;
    307 	mcbuf.ch = '\0';
    308 	if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
    309 		return err;
    310 	if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
    311 		return BKT_EEQUIV;
    312 	last = mcbuf.wc;
    313 	if (mcbuf.cep != ELEM_ENCODED && mcbuf.col->nweight > 1)
    314 	{
    315 		const CollElem *cep;
    316 
    317 		/*
    318 		* The first and last weight[0] values for equivalence
    319 		* classes are stuffed into the terminator for the
    320 		* multiple character lists.  If these values are
    321 		* scattered (elements that are not part of this
    322 		* equivalence class have weight[0] values between the
    323 		* two end points), then SUBN_SPECIAL is placed in
    324 		* this terminator.  Note that weight[1] of the
    325 		* terminator must be other than WGHT_IGNORE, too.
    326 		*/
    327 		last = mcbuf.cep->weight[0];
    328 		if ((cep = libuxre_collmult(bp->col, mcbuf.cep, 0))
    329 				!= ELEM_BADCHAR
    330 			&& cep->weight[1] != WGHT_IGNORE)
    331 		{
    332 			last = cep->weight[1];
    333 			if (cep->subnbeg == SUBN_SPECIAL)
    334 			{
    335 				unsigned int nq;
    336 
    337 				/*
    338 				* Permit ranges up to the first and
    339 				* after the last.
    340 				*/
    341 				if (prev > 0 && prev != cep->weight[0]
    342 					&& (prev = addrange(bp,
    343 						cep->weight[0], prev)) != 0)
    344 				{
    345 					return prev;
    346 				}
    347 				/*
    348 				* Record the equivalence class by storing
    349 				* the primary weight.
    350 				*/
    351 				if ((nq = bp->nquiv) < NQUIV)
    352 					bp->quiv[nq] = mcbuf.cep->weight[1];
    353 				else
    354 				{
    355 					if (nq % NQUIV == 0 && (bp->exquiv = 
    356 						realloc(bp->exquiv,
    357 							nq * sizeof(wuchar_type)))
    358 						== 0)
    359 					{
    360 						return REG_ESPACE;
    361 					}
    362 					nq -= NQUIV;
    363 					bp->exquiv[nq] = mcbuf.cep->weight[1];
    364 				}
    365 				bp->nquiv++;
    366 				return last;
    367 			}
    368 			mcbuf.cep = cep;
    369 		}
    370 		mcbuf.wc = mcbuf.cep->weight[0];
    371 	}
    372 	/*
    373 	* Determine range, if any, to install.
    374 	*
    375 	* If there's a pending low (prev > 0), then try to use it.
    376 	*
    377 	* Otherwise, try to use mcbuf.wc as the low end of the range.
    378 	* Since addrange() assumes that the low point has already been
    379 	* placed, we try to fool it by using a prev of one less than
    380 	* mcbuf.wc.  But, if that value would not look like a valid
    381 	* low point of a range, we have to explicitly place mcbuf.wc.
    382 	*/
    383 	if (prev <= 0 && (prev = mcbuf.wc - 1) <= 0)
    384 	{
    385 		if ((prev = addrange(bp, mcbuf.wc, 0)) != 0)
    386 			return prev;
    387 	}
    388 	if ((mcbuf.wc = addrange(bp, last, prev)) != 0)
    389 		return mcbuf.wc;
    390 	return last;
    391 }
    392 
    393 static w_type
    394 clsym(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
    395 {
    396 	Mcce mcbuf;
    397 	int err;
    398 
    399 	mcbuf.max = &s[n];
    400 	mcbuf.aft = &s[0];
    401 	mcbuf.bp = bp;
    402 	mcbuf.col = bp->col;
    403 	mcbuf.cep = ELEM_BADCHAR;
    404 	mcbuf.ch = '\0';
    405 	if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
    406 		return err;
    407 	if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
    408 		return BKT_ECOLLATE;
    409 	if (mcbuf.cep != ELEM_ENCODED)
    410 		mcbuf.wc = mcbuf.cep->weight[0];
    411 	if ((err = addrange(bp, mcbuf.wc, prev)) != 0)
    412 		return err;
    413 	return mcbuf.wc;
    414 }
    415 
    416 	/*
    417 	* Scans the rest of a bracket construction within a regular
    418 	* expression and fills in a description for it.
    419 	* The leading [ and the optional set complement indicator
    420 	* were handled already by the caller.
    421 	* Returns:
    422 	*	<0 error (a BKT_* value)
    423 	*	>0 success; equals how many bytes were scanned.
    424 	*/
    425 LIBUXRE_STATIC int
    426 libuxre_bktmbcomp(Bracket *bp, const unsigned char *pat0,
    427 		int flags, int mb_cur_max)
    428 {
    429 	static const Bracket zero = {0};
    430 	const unsigned char *pat = pat0;
    431 	struct lc_collate *savecol;
    432 	w_type n, wc, prev = 0;
    433 
    434 	/*
    435 	* Set represented set to empty.  Easiest to copy an empty
    436 	* version over the caller's, (re)setting col and flags.
    437 	*/
    438 	savecol = bp->col;
    439 	*bp = zero;
    440 	bp->col = savecol;
    441 	bp->flags = flags
    442 		& (BKT_NEGATED | BKT_ONECASE | BKT_NOTNL | BKT_BADRANGE |
    443 				BKT_ODDRANGE);
    444 	/*
    445 	* Handle optional "empty" brackets; typically only used
    446 	* in combination with BKT_QUOTE or BKT_ESCAPE.
    447 	*/
    448 	if ((wc = *pat) == ']' && (flags & BKT_EMPTY) != 0)
    449 		return 1;
    450 	/*
    451 	* Populate *bp.
    452 	*/
    453 	for (;; prev = n)
    454 	{
    455 		switch (wc)
    456 		{
    457 		case '\0':
    458 		ebrack:;
    459 			n = BKT_EBRACK;
    460 			goto err;
    461 		case '\n':
    462 			if (flags & BKT_NLBAD)
    463 				goto ebrack;
    464 			goto regular;
    465 		case '/':
    466 			if (flags & BKT_SLASHBAD)
    467 				goto ebrack;
    468 			goto regular;
    469 		case '\\':
    470 			if ((flags & (BKT_ESCAPE | BKT_QUOTE
    471 				| BKT_ESCNL | BKT_ESCSEQ)) == 0)
    472 			{
    473 				goto regular;
    474 			}
    475 			switch (wc = *++pat)
    476 			{
    477 			default:
    478 			noesc:;
    479 				if ((flags & BKT_ESCAPE) == 0)
    480 				{
    481 					wc = '\\';
    482 					pat--;
    483 				}
    484 				break;
    485 			case '\\':
    486 			case ']':
    487 			case '-':
    488 			case '^':
    489 				if ((flags & BKT_QUOTE) == 0)
    490 					goto noesc;
    491 				break;
    492 			case 'a':
    493 				if ((flags & BKT_ESCSEQ) == 0 ||
    494 						(flags & BKT_OLDESC))
    495 					goto noesc;
    496 				wc = '\a';
    497 				break;
    498 			case 'b':
    499 				if ((flags & BKT_ESCSEQ) == 0)
    500 					goto noesc;
    501 				wc = '\b';
    502 				break;
    503 			case 'f':
    504 				if ((flags & BKT_ESCSEQ) == 0)
    505 					goto noesc;
    506 				wc = '\f';
    507 				break;
    508 			case 'n':
    509 				if ((flags & (BKT_ESCSEQ | BKT_ESCNL)) == 0)
    510 					goto noesc;
    511 				wc = '\n';
    512 				break;
    513 			case 'r':
    514 				if ((flags & BKT_ESCSEQ) == 0)
    515 					goto noesc;
    516 				wc = '\r';
    517 				break;
    518 			case 't':
    519 				if ((flags & BKT_ESCSEQ) == 0)
    520 					goto noesc;
    521 				wc = '\t';
    522 				break;
    523 			case 'v':
    524 				if ((flags & BKT_ESCSEQ) == 0 ||
    525 						(flags & BKT_OLDESC))
    526 					goto noesc;
    527 				wc = '\v';
    528 				break;
    529 			case 'x':
    530 				if ((flags & BKT_ESCSEQ) == 0 ||
    531 						(flags & BKT_OLDESC))
    532 					goto noesc;
    533 				if (!isxdigit(wc = *++pat))
    534 				{
    535 					pat--;
    536 					goto noesc;
    537 				}
    538 				/*
    539 				* Take as many hex digits as possible,
    540 				* ignoring overflows.
    541 				* Any positive result is okay.
    542 				*/
    543 				n = 0;
    544 				do
    545 				{
    546 					if (isdigit(wc))
    547 						wc -= '0';
    548 					else if (isupper(wc))
    549 						wc -= 'A' + 10;
    550 					else
    551 						wc -= 'a' + 10;
    552 					n <<= 4;
    553 					n |= wc;
    554 				} while (isxdigit(wc = *++pat));
    555 				pat--;
    556 				if ((wc = n) <= 0)
    557 				{
    558 					n = BKT_BADESC;
    559 					goto err;
    560 				}
    561 				break;
    562 			case '0':
    563 			case '1':
    564 			case '2':
    565 			case '3':
    566 			case '4':
    567 			case '5':
    568 			case '6':
    569 			case '7':
    570 			case '8':
    571 			case '9':
    572 				if ((flags & BKT_ESCSEQ) == 0 ||
    573 						(flags & BKT_OLDESC))
    574 					goto noesc;
    575 				/*
    576 				* For compatibility (w/awk),
    577 				* permit "octal" 8 and 9.
    578 				*/
    579 				n = wc - '0';
    580 				if ((wc = *++pat) >= '0' && wc <= '9')
    581 				{
    582 					n <<= 3;
    583 					n += wc - '0';
    584 					if ((wc = *++pat) >= '0' && wc <= '9')
    585 					{
    586 						n <<= 3;
    587 						n += wc - '0';
    588 					}
    589 				}
    590 				pat--;
    591 				if ((wc = n) <= 0)
    592 				{
    593 					n = BKT_BADESC;
    594 					goto err;
    595 				}
    596 				break;
    597 			}
    598 			goto regular;
    599 		case '[':
    600 			if (((wc = *++pat) == ':' || wc == '=' || wc == '.') &&
    601 					(flags & BKT_NOI18N) == 0)
    602 			{
    603 				n = 0;
    604 				while (*++pat != wc || pat[1] != ']')
    605 				{
    606 					if (*pat == '\0')
    607 					{
    608 					badpat:;
    609 						n = BKT_BADPAT;
    610 						goto err;
    611 					}
    612 					else if (*pat == '/')
    613 					{
    614 						if (flags & BKT_SLASHBAD)
    615 							goto badpat;
    616 					}
    617 					else if (*pat == '\n')
    618 					{
    619 						if (flags & BKT_NLBAD)
    620 							goto badpat;
    621 					}
    622 					n++;
    623 				}
    624 				if (n == 0)
    625 				{
    626 					n = BKT_EMPTYSUBBKT;
    627 					goto err;
    628 				}
    629 				if (wc == ':')
    630 					n = chcls(bp, &pat[-n], n);
    631 				else if (wc == '=')
    632 					n = eqcls(bp, &pat[-n], n, prev,
    633 							mb_cur_max);
    634 				else /* wc == '.' */
    635 					n = clsym(bp, &pat[-n], n, prev,
    636 							mb_cur_max);
    637 				pat++;
    638 				break;
    639 			}
    640 			wc = '[';
    641 			pat--;
    642 			goto regular;
    643 		default:
    644 			if (!ISONEBYTE(wc) &&
    645 					(n = libuxre_mb2wc(&wc, pat + 1)) > 0)
    646 				pat += n;
    647 		regular:;
    648 			n = place(bp, wc, prev, mb_cur_max);
    649 			break;
    650 		}
    651 		if (n < 0) {
    652 			n = BKT_ILLSEQ;
    653 			goto err;
    654 		}
    655 		if ((wc = *++pat) == ']')
    656 			break;
    657 		if (wc == '-' && n != 0)
    658 		{
    659 			if (prev == 0 || (flags & BKT_SEPRANGE) == 0)
    660 			{
    661 				if ((wc = *++pat) != ']')
    662 					continue; /* valid range */
    663 				wc = '-';
    664 				pat--;
    665 			}
    666 		}
    667 		n = 0;	/* no range this time */
    668 	}
    669 	return pat - pat0 + 1;
    670 err:;
    671 	libuxre_bktfree(bp);
    672 	return n;
    673 }
    674 
    675 LIBUXRE_STATIC void
    676 libuxre_bktfree(Bracket *bp)
    677 {
    678 	if (bp->extype != 0)
    679 		free(bp->extype);
    680 	if (bp->exquiv != 0)
    681 		free(bp->exquiv);
    682 	if (bp->exwide != 0)
    683 		free(bp->exwide);
    684 }
    685 
    686 LIBUXRE_STATIC int
    687 libuxre_bktmbexec(Bracket *bp, wchar_t wc,
    688 		const unsigned char *str, int mb_cur_max)
    689 {
    690 	unsigned int i;
    691 	wchar_t lc, uc;
    692 	Mcce mcbuf;
    693 
    694 	mcbuf.aft = str; /* in case of match in character classes */
    695 	mcbuf.ch = wc;
    696 	/*
    697 	* First: check the single wc against any character classes.
    698 	* Since multiple character collating elements are not part
    699 	* of this world, they don't apply here.
    700 	*/
    701 	if ((i = bp->ntype) != 0)
    702 	{
    703 		wctype_t *wctp = &bp->type[0];
    704 
    705 		if (bp->flags & BKT_ONECASE)
    706 		{
    707 			if ((wc = to_lower(wc)) == mcbuf.ch)
    708 				mcbuf.ch = to_upper(wc);
    709 		}
    710 		for (;;)
    711 		{
    712 			if (iswctype(mb_cur_max==1?btowc(wc):wc, *wctp))
    713 				goto match;
    714 			if (wc != mcbuf.ch &&
    715 			    iswctype(mb_cur_max==1?btowc(mcbuf.ch):mcbuf.ch,
    716 				    *wctp))
    717 				goto match;
    718 			if (--i == 0)
    719 				break;
    720 			if (++wctp == &bp->type[NTYPE])
    721 				wctp = &bp->extype[0];
    722 		}
    723 	}
    724 	/*
    725 	* The main match is determined by the weight[0] value
    726 	* of the character (or characters, if the input can be
    727 	* taken as a multiple character collating element).
    728 	*/
    729 	mcbuf.max = 0;
    730 	mcbuf.bp = bp;
    731 	mcbuf.col = bp->col;
    732 	mcbuf.cep = ELEM_BADCHAR;
    733 	mcce(&mcbuf, ELEM_BADCHAR, str, mb_cur_max, 0);
    734 	if (mcbuf.cep == ELEM_BADCHAR)
    735 		return -1;	/* never matches */
    736 	if (mcbuf.cep != ELEM_ENCODED)
    737 		mcbuf.wc = mcbuf.cep->weight[0];
    738 	/*
    739 	* POSIX.2 demands that both a character and its case counterpart
    740 	* can match if REG_ICASE is set. This means that [B-z] matches
    741 	* 'A', 'a', and '['.
    742 	*/
    743 	if (bp->flags & BKT_ONECASE)
    744 	{
    745 		lc = to_lower(mcbuf.wc);
    746 		uc = to_upper(mcbuf.wc);
    747 	}
    748 	else
    749 		lc = uc = mcbuf.wc;
    750 	/*
    751 	* See if it's in the set.  Note that the list of true wide
    752 	* character values has explicit ranges.
    753 	*/
    754 	if (mcbuf.wc <= UCHAR_MAX)
    755 	{
    756 		if (bp->byte[PLIND(lc)] & PLBIT(lc))
    757 			goto match;
    758 		if (lc != uc && (bp->byte[PLIND(uc)] & PLBIT(uc)))
    759 			goto match;
    760 	}
    761 	else if ((i = bp->nwide) != 0)
    762 	{
    763 		wchar_t *wcp = &bp->wide[0];
    764 		long lcmp, ucmp;
    765 
    766 		for (;;)
    767 		{
    768 			if ((lcmp = lc - *wcp) == 0)
    769 				goto match;
    770 			ucmp = uc - *wcp;
    771 			if (lc != uc && ucmp == 0)
    772 				goto match;
    773 			if (--i == 0)
    774 				break;
    775 			if (++wcp == &bp->wide[NWIDE])
    776 				wcp = &bp->exwide[0];
    777 			if (*wcp == RANGE)
    778 			{
    779 				if (++wcp == &bp->wide[NWIDE])
    780 					wcp = &bp->exwide[0];
    781 				if (lcmp > 0 && lc <= *wcp)
    782 					goto match;
    783 				if (lc != uc && ucmp > 0 && uc < *wcp)
    784 					goto match;
    785 				if ((i -= 2) == 0)
    786 					break;
    787 				if (++wcp == &bp->wide[NWIDE])
    788 					wcp = &bp->exwide[0];
    789 			}
    790 		}
    791 	}
    792 	/*
    793 	* The last chance for a match is if an equivalence class
    794 	* was specified for which the primary weights are scattered
    795 	* through the weight[0]s.
    796 	*/
    797 	if ((i = bp->nquiv) != 0 && mcbuf.cep != ELEM_ENCODED)
    798 	{
    799 		wuchar_type *wucp = &bp->quiv[0];
    800 
    801 		mcbuf.wc = mcbuf.cep->weight[1];
    802 		for (;;)
    803 		{
    804 			if (mcbuf.wc == *wucp)
    805 				goto match;
    806 			if (--i == 0)
    807 				break;
    808 			if (++wucp == &bp->quiv[NQUIV])
    809 				wucp = &bp->exquiv[0];
    810 		}
    811 	}
    812 	/*
    813 	* Only here when no match against the set was found.
    814 	* One final special case w/r/t newline.
    815 	*/
    816 	if (bp->flags & BKT_NEGATED)
    817 	{
    818 		if (wc != '\n' || (bp->flags & BKT_NOTNL) == 0)
    819 			return mcbuf.aft - str;
    820 	}
    821 	return -1;
    822 match:;
    823 	/*
    824 	* Only here when a match against the described set is found.
    825 	*/
    826 	if (bp->flags & BKT_NEGATED)
    827 		return -1;
    828 	return mcbuf.aft - str;
    829 }
	hbase heirloom base
	git clone git://git.2f30.org/hbase
	Log \| Files \| Refs \| README