hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

rcomp.c (8795B)


      1 /*
      2  * grep - search a file for a pattern
      3  *
      4  * Gunnar Ritter, Freiburg i. Br., Germany, April 2001.
      5  */
      6 /*
      7  * Copyright (c) 2003 Gunnar Ritter
      8  *
      9  * This software is provided 'as-is', without any express or implied
     10  * warranty. In no event will the authors be held liable for any damages
     11  * arising from the use of this software.
     12  *
     13  * Permission is granted to anyone to use this software for any purpose,
     14  * including commercial applications, and to alter it and redistribute
     15  * it freely, subject to the following restrictions:
     16  *
     17  * 1. The origin of this software must not be misrepresented; you must not
     18  *    claim that you wrote the original software. If you use this software
     19  *    in a product, an acknowledgment in the product documentation would be
     20  *    appreciated but is not required.
     21  *
     22  * 2. Altered source versions must be plainly marked as such, and must not be
     23  *    misrepresented as being the original software.
     24  *
     25  * 3. This notice may not be removed or altered from any source distribution.
     26  */
     27 
     28 /*	Sccsid @(#)rcomp.c	1.27 (gritter) 2/6/05>	*/
     29 
     30 /*
     31  * Code involving POSIX.2 regcomp()/regexec() routines.
     32  */
     33 
     34 #include	"grep.h"
     35 #include	"alloc.h"
     36 #include	<stdio.h>
     37 #include	<stdlib.h>
     38 #include	<string.h>
     39 #include	<mbtowi.h>
     40 
     41 static int	emptypat;	/* set to 1 by rc_build() when a pattern's e_len is not positive; read by rc_match() */
     42 
     43 #ifdef	UXRE
     44 #include	<regdfa.h>
        /* DFA-based range searchers; rc_build() stores one of them in range. */
     45 static int	rc_range(struct iblok *, char *);
     46 static int	rc_rangew(struct iblok *, char *);
     47 #endif
     48 
     49 /*
     50  * Check whether line matches any pattern of the pattern list.
     51  */
     52 static int
     53 rc_match(const char *str, size_t sz)
     54 {
     55 #ifndef	UXRE
     56 	struct expr *e;
     57 #endif
     58 	regmatch_t pmatch[1];
     59 	int gotcha = 0;
     60 
     61 	if (emptypat) {
     62 		if (xflag) {
     63 			if (*str == '\0')
     64 				return 1;
     65 		} else
     66 			return 1;
     67 	}
     68 #ifdef	UXRE
     69 	if (e0->e_exp)
     70 		gotcha = (regexec(e0->e_exp, str, 1, pmatch, 0) == 0);
     71 #else	/* !UXRE */
     72 	for (e = e0; e; e = e->e_nxt) {
     73 		if (e->e_exp) {
     74 			gotcha = (regexec(e->e_exp, str, 1, pmatch, 0) == 0);
     75 			if (gotcha)
     76 				break;
     77 		}
     78 	}
     79 #endif	/* !UXRE */
     80 	if (gotcha)
     81 		if (!xflag || (pmatch[0].rm_so == 0
     82 				&& pmatch[0].rm_eo == sz))
     83 			return 1;
     84 	return 0;
     85 }
     86 
     87 /*
     88  * Compile a pattern structure using regcomp().
     89  */
     90 static void
     91 rc_build(void)
     92 {
     93 	int rerror = REG_BADPAT;
     94 	int rflags = 0;
     95 	size_t sz;
     96 #ifdef	UXRE
     97 	char *pat, *cp;
     98 #endif	/* UXRE */
     99 	struct expr *e;
    100 
    101 	if ((e0->e_flg & E_NULL) == 0) {
    102 		for (sz = 0, e = e0; e; e = e->e_nxt) {
    103 			if (e->e_len > 0)
    104 				sz += e->e_len + 1;
    105 			else
    106 				emptypat = 1;
    107 		}
    108 	} else
    109 		sz = 1;
    110 	if ((e0->e_flg & E_NULL || emptypat) && sus == 0)
    111 		rc_error(e0, rerror);
    112 	if (sz == 0 || (emptypat && xflag == 0)) {
    113 		e0->e_exp = NULL;
    114 		return;
    115 	}
    116 #ifdef	UXRE
    117 	pat = smalloc(sz);
    118 	for (cp = pat, e = e0; e; e = e->e_nxt) {
    119 		if (e->e_len > 0) {
    120 			memcpy(cp, e->e_pat, e->e_len);
    121 			cp[e->e_len] = '\n';
    122 			cp = &cp[e->e_len + 1];
    123 		}
    124 	}
    125 	pat[sz - 1] = '\0';
    126 	if (iflag)
    127 		rflags |= REG_ICASE;
    128 	if (Eflag)
    129 		rflags |= (sus ? REG_EXTENDED : REG_OLDERE|REG_NOI18N) |
    130 			REG_MTPARENBAD;
    131 	else {
    132 		rflags |= REG_ANGLES;
    133 		if (sus >= 3)
    134 			rflags |= REG_AVOIDNULL;
    135 	}
    136 	if (xflag)
    137 		rflags |= REG_ONESUB;
    138 	else
    139 		rflags |= REG_NOSUB;
    140 	if ((e = e0)->e_nxt)
    141 		rflags |= REG_NLALT;
    142 	e->e_exp = (regex_t *)smalloc(sizeof *e->e_exp);
    143 	if ((rerror = regcomp(e->e_exp, pat, rflags)) != 0)
    144 		rc_error(e, rerror);
    145 	free(pat);
    146 	if (!xflag && e->e_exp->re_flags & REG_DFA)
    147 		range = mbcode ? rc_rangew : rc_range;
    148 #else	/* !UXRE */
    149 	if (iflag)
    150 		rflags |= REG_ICASE;
    151 	if (Eflag)
    152 		rflags |= REG_EXTENDED;
    153 	if (!xflag)
    154 		rflags |= REG_NOSUB;
    155 	for (e = e0; e; e = e->e_nxt) {
    156 		e->e_exp = (regex_t *)smalloc(sizeof *e->e_exp);
    157 		if ((rerror = regcomp(e->e_exp, e->e_pat, rflags)) != 0)
    158 			rc_error(e, rerror);
    159 	}
    160 #endif	/* !UXRE */
    161 }
    162 
    163 void
    164 rc_select(void)
    165 {
    166 	build = rc_build;
    167 	match = rc_match;
    168 	matchflags |= MF_NULTERM;
    169 	matchflags &= ~MF_LOCONV;
    170 }
    171 
    172 /*
    173  * Derived from Unix 32V /usr/src/cmd/egrep.y
    174  *
    175  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, April 2001.
    176  */
    177 /*
    178  * Copyright(C) Caldera International Inc. 2001-2002. All rights reserved.
    179  *
    180  * Redistribution and use in source and binary forms, with or without
    181  * modification, are permitted provided that the following conditions
    182  * are met:
    183  *   Redistributions of source code and documentation must retain the
    184  *    above copyright notice, this list of conditions and the following
    185  *    disclaimer.
    186  *   Redistributions in binary form must reproduce the above copyright
    187  *    notice, this list of conditions and the following disclaimer in the
    188  *    documentation and/or other materials provided with the distribution.
    189  *   All advertising materials mentioning features or use of this software
    190  *    must display the following acknowledgement:
    191  *      This product includes software developed or owned by Caldera
    192  *      International, Inc.
    193  *   Neither the name of Caldera International, Inc. nor the names of
    194  *    other contributors may be used to endorse or promote products
    195  *    derived from this software without specific prior written permission.
    196  *
    197  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
    198  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
    199  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    200  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    201  * ARE DISCLAIMED. IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE
    202  * LIABLE FOR ANY DIRECT, INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR
    203  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    204  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
    205  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
    206  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
    207  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
    208  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    209  */
    210 #ifdef	UXRE
    211 /*
    212  * Range search for singlebyte locales using the modified UNIX(R) Regular
    213  * Expression Library DFA.
         *
         * Scanning starts at ip->ib_cur and ends once the start of the next
         * line lies beyond last. Lines are selected according to vflag and
         * handed to outline(); lineno is incremented for every line begun.
         *
         * Returns 1 as soon as a line has been output while qflag or lflag
         * is set, and 0 when the range is exhausted.
    214  */
    215 static int
    216 rc_range(struct iblok *ip, char *last)
    217 {
    218 	char	*p;
    219 	int	c, cstat, nstat;
    220 	Dfa	*dp = e0->e_exp->re_dfa;
    221 
    222 	p = ip->ib_cur;
    223 	lineno++;
        	/* anybol is the state the scan is reset to at every line start */
    224 	cstat = dp->anybol;
        	/* accepting before any input is read: the line matches as is */
    225 	if (dp->acc[cstat])
    226 		goto found;
    227 	for (;;) {
        		/* a zero table entry means no stored transition: ask regtrans() */
    228 		if ((nstat = dp->trans[cstat][*p & 0377]) == 0) {
    229 			/*
    230 			 * '\0' is used to indicate end-of-line. If a '\0'
    231 			 * character appears in input, it matches '$' but
    232 			 * the DFA remains in dead state afterwards; there
    233 			 * is thus no need to handle this condition
    234 			 * specially to get the same behavior as in plain
    235 			 * regexec().
    236 			 */
    237 			if ((c = *p & 0377) == '\n')
    238 				c = '\0';
        			/* regtrans() yields no next state: give up on this line */
    239 			if ((nstat = regtrans(dp, cstat, c, 1)) == 0)
    240 				goto fail;
        			/* keep the '\n' slot of this state in step with the '\0' slot */
    241 			dp->trans[cstat]['\n'] = dp->trans[cstat]['\0'];
    242 		}
        		/* table values are state + 1, which leaves 0 free to mean "missing" */
    243 		if (dp->acc[cstat = nstat - 1]) {
        			/* one pass of this inner loop per line that is decided at once */
    244 		found:	for (;;) {
    245 				if (vflag == 0) {
        					/*
        					 * Output the line. Also reached by goto from the
        					 * end-of-line case below when vflag is set, i.e.
        					 * for a line that ended without a match.
        					 * NOTE(review): ip->ib_cur is reread afterwards,
        					 * so outline() presumably advances it past the
        					 * line - confirm.
        					 */
    246 		succeed:		outline(ip, last, p - ip->ib_cur);
    247 					if (qflag || lflag)
    248 						return 1;
    249 				} else {
        					/*
        					 * Skip to the start of the next line without
        					 * output. Reached by goto when no match is
        					 * possible, and by fall-through when a match was
        					 * found while vflag is set. The loop is unbounded;
        					 * it assumes a '\n' at or after p - TODO confirm
        					 * that the buffer guarantees this.
        					 */
    250 		fail:			ip->ib_cur = p;
    251 					while (*ip->ib_cur++ != '\n');
    252 				}
        				/* next line starts beyond the range: done */
    253 				if ((p = ip->ib_cur) > last)
    254 					return 0;
    255 				lineno++;
        				/* start state not accepting: go scan the new line */
    256 				if (dp->acc[cstat = dp->anybol] == 0)
    257 					goto brk2;
    258 			}
    259 		}
        		/* consume the byte; a newline ends a line that did not match */
    260 		if (*p++ == '\n') {
    261 			if (vflag) {
        				/* step back onto the newline before emitting the line */
    262 				p--;
    263 				goto succeed;
    264 			}
    265 			if ((ip->ib_cur = p) > last)
    266 				return 0;
    267 			lineno++;
    268 			if (dp->acc[cstat = dp->anybol])
    269 				goto found;
    270 		}
        		/* re-enter the scan loop without consuming a byte */
    271 		brk2:;
    272 	}
    273 }
    274 
    275 /*
    276  * Range search for multibyte locales using the modified UNIX(R) Regular
    277  * Expression Library DFA.
         *
         * Scanning starts at ip->ib_cur and ends once the start of the next
         * line lies beyond last. Lines are selected according to vflag and
         * handed to outline(); lineno is incremented for every line begun.
         *
         * Returns 1 as soon as a line has been output while qflag or lflag
         * is set, and 0 when the range is exhausted.
    278  */
    279 static int
    280 rc_rangew(struct iblok *ip, char *last)
    281 {
    282 	char	*p;
    283 	int	n, cstat, nstat;
    284 	wint_t	wc;
    285 	Dfa	*dp = e0->e_exp->re_dfa;
    286 
    287 	p = ip->ib_cur;
    288 	lineno++;
        	/* anybol is the state the scan is reset to at every line start */
    289 	cstat = dp->anybol;
        	/* accepting before any input is read: the line matches as is */
    290 	if (dp->acc[cstat])
    291 		goto found;
    292 	for (;;) {
        		/*
        		 * Decode one character into wc and its byte length into n.
        		 * Bytes with the high bit set go through mbtowi(); a negative
        		 * result is handled as a single byte with wc set to WEOF.
        		 */
    293 		if (*p & 0200) {
    294 			if ((n = mbtowi(&wc, p, last + 1 - p)) < 0) {
    295 				n = 1;
    296 				wc = WEOF;
    297 			}
    298 		} else {
    299 			wc = *p;
    300 			n = 1;
    301 		}
        		/*
        		 * Values with bits set above NCHAR-1 are never looked up in
        		 * the table; they, like zero table entries, go to regtrans().
        		 * NOTE(review): the mask assumes NCHAR is a power of two -
        		 * confirm.
        		 */
    302 		if ((wc & ~(wchar_t)(NCHAR-1)) != 0 ||
    303 				(nstat = dp->trans[cstat][wc]) == 0) {
    304 			/*
    305 			 * '\0' is used to indicate end-of-line. If a '\0'
    306 			 * character appears in input, it matches '$' but
    307 			 * the DFA remains in dead state afterwards; there
    308 			 * is thus no need to handle this condition
    309 			 * specially to get the same behavior as in plain
    310 			 * regexec().
    311 			 */
    312 			if (wc == '\n')
    313 				wc = '\0';
        			/* regtrans() yields no next state: give up on this line */
    314 			if ((nstat = regtrans(dp, cstat, wc, mb_cur_max)) == 0)
    315 				goto fail;
        			/* keep the '\n' slot of this state in step with the '\0' slot */
    316 			dp->trans[cstat]['\n'] = dp->trans[cstat]['\0'];
    317 		}
        		/* table values are state + 1, which leaves 0 free to mean "missing" */
    318 		if (dp->acc[cstat = nstat - 1]) {
        			/* one pass of this inner loop per line that is decided at once */
    319 		found:	for (;;) {
    320 				if (vflag == 0) {
        					/*
        					 * Output the line. Also reached by goto from the
        					 * end-of-line case below when vflag is set, i.e.
        					 * for a line that ended without a match.
        					 * NOTE(review): ip->ib_cur is reread afterwards,
        					 * so outline() presumably advances it past the
        					 * line - confirm.
        					 */
    321 		succeed:		outline(ip, last, p - ip->ib_cur);
    322 					if (qflag || lflag)
    323 						return 1;
    324 				} else {
        					/*
        					 * Skip to the start of the next line without
        					 * output. Reached by goto when no match is
        					 * possible, and by fall-through when a match was
        					 * found while vflag is set. The loop is unbounded;
        					 * it assumes a '\n' at or after p - TODO confirm
        					 * that the buffer guarantees this.
        					 */
    325 		fail:			ip->ib_cur = p;
    326 					while (*ip->ib_cur++ != '\n');
    327 				}
        				/* next line starts beyond the range: done */
    328 				if ((p = ip->ib_cur) > last)
    329 					return 0;
    330 				lineno++;
        				/* start state not accepting: go scan the new line */
    331 				if (dp->acc[cstat = dp->anybol] == 0)
    332 					goto brk2;
    333 			}
    334 		}
        		/* consume the character, then test whether it was the newline */
    335 		p += n;
    336 		if (p[-n] == '\n') {
    337 			if (vflag) {
        				/* step back onto the newline before emitting the line */
    338 				p--;
    339 				goto succeed;
    340 			}
    341 			if ((ip->ib_cur = p) > last)
    342 				return 0;
    343 			lineno++;
    344 			if (dp->acc[cstat = dp->anybol])
    345 				goto found;
    346 		}
        		/* re-enter the scan loop without consuming a character */
    347 		brk2:;
    348 	}
    349 }
    350 #endif	/* UXRE */