hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

ac.c (10812B)


      1 /*
      2  * Aho-Corasick algorithm derived from Unix 32V /usr/src/cmd/fgrep.c,
      3  * additionally incorporating the fix from the v7 addenda tape.
      4  *
      5  * Changes by Gunnar Ritter, Freiburg i. Br., Germany, September 2002.
      6  */
      7 /*
      8  * Copyright(C) Caldera International Inc. 2001-2002. All rights reserved.
      9  *
     10  * Redistribution and use in source and binary forms, with or without
     11  * modification, are permitted provided that the following conditions
     12  * are met:
     13  *   Redistributions of source code and documentation must retain the
     14  *    above copyright notice, this list of conditions and the following
     15  *    disclaimer.
     16  *   Redistributions in binary form must reproduce the above copyright
     17  *    notice, this list of conditions and the following disclaimer in the
     18  *    documentation and/or other materials provided with the distribution.
     19  *   All advertising materials mentioning features or use of this software
     20  *    must display the following acknowledgement:
     21  *      This product includes software developed or owned by Caldera
     22  *      International, Inc.
     23  *   Neither the name of Caldera International, Inc. nor the names of
     24  *    other contributors may be used to endorse or promote products
     25  *    derived from this software without specific prior written permission.
     26  *
     27  * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
     28  * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
     29  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     30  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     31  * ARE DISCLAIMED. IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE
     32  * LIABLE FOR ANY DIRECT, INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR
     33  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
     34  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
     35  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
     36  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
     37  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
     38  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     39  */
     40 
     41 #if __GNUC__ >= 3 && __GNUC_MINOR__ >= 4 || __GNUC__ >= 4
     42 #define	USED	__attribute__ ((used))
     43 #elif defined __GNUC__
     44 #define	USED	__attribute__ ((unused))
     45 #else
     46 #define	USED
     47 #endif
     48 static const char sccsid[] USED = "@(#)fgrep.sl	2.10 (gritter) 5/29/05";
     49 
     50 #include <sys/types.h>
     51 #include <string.h>
     52 #include <stdlib.h>
     53 #include <stdio.h>
     54 #include <limits.h>
     55 #include "grep.h"
     56 #include "alloc.h"
     57 
     58 #include <mbtowi.h>
     59 
     60 #define	MAXSIZ	256
     61 #define	QSIZE	128
     62 
     63 struct words {
     64 	struct words	*nst;
     65 	struct words	*link;
     66 	struct words	*fail;
     67 	int 	inp;
     68 	char	out;
     69 };
     70 
     71 static	struct words	*w, *wcur;
     72 static	struct words	*smax;
     73 static	struct words	*q;
     74 
     75 static void	ac_build(void);
     76 static int	ac_match(const char *, size_t);
     77 static int	ac_matchw(const char *, size_t);
     78 static int	ac_range(struct iblok *, char *);
     79 static int	ac_rangew(struct iblok *, char *);
     80 static void	cgotofn(void);
     81 static void	check(int, int);
     82 static void	woverflo(void);
     83 static void	qoverflo(struct words ***queue, int *qsize);
     84 static void	cfail(void);
     85 static int	a0_match(const char *, size_t);
     86 static int	a1_match(const char *, size_t);
     87 
     88 void
     89 ac_select(void)
     90 {
     91 	build = ac_build;
     92 	match = mbcode ? ac_matchw : ac_match;
     93 	matchflags &= ~MF_NULTERM;
     94 	matchflags |= MF_LOCONV;
     95 }
     96 
     97 static void
     98 ac_build(void)
     99 {
    100 	struct expr *e;
    101 
    102 	if (e0->e_flg & E_NULL) {
    103 		match = a0_match;
    104 		return;
    105 	}
    106 	for (e = e0; e; e = e->e_nxt) {
    107 		if (e->e_len == 0 && !xflag) {
    108 			match = a1_match;
    109 			return;
    110 		}
    111 	}
    112 	cgotofn();
    113 	cfail();
    114 	if (!iflag)
    115 		range = mbcode ? ac_rangew : ac_range;
    116 }
    117 
    118 static int
    119 ac_match(const char *line, size_t sz)
    120 {
    121 	register const char *p;
    122 	register int z;
    123 	register struct words *c;
    124 	int failed;
    125 
    126 	p = line;
    127 	failed = 0;
    128 	c = w;
    129 	if (p == &line[sz])
    130 		z = '\n';
    131 	else
    132 		z = *p & 0377;
    133 	for (;;) {
    134 		nstate:
    135 			if (c->inp == z) {
    136 				c = c->nst;
    137 			}
    138 			else if (c->link != 0) {
    139 				c = c->link;
    140 				goto nstate;
    141 			}
    142 			else {
    143 				c = c->fail;
    144 				failed = 1;
    145 				if (c==0) {
    146 					c = w;
    147 					istate:
    148 					if (c->inp == z) {
    149 						c = c->nst;
    150 					}
    151 					else if (c->link != 0) {
    152 						c = c->link;
    153 						goto istate;
    154 					}
    155 				}
    156 				else goto nstate;
    157 			}
    158 		if (c->out) {
    159 			if (xflag) {
    160 				if (failed || p < &line[sz])
    161 					return 0;
    162 			}
    163 			return 1;
    164 		}
    165 		if (++p >= &line[sz]) {
    166 			if (z == '\n')
    167 				return 0;
    168 			else
    169 				z = '\n';
    170 		} else
    171 			z = *p & 0377;
    172 	}
    173 }
    174 
    175 static int
    176 ac_range(struct iblok *ip, char *last)
    177 {
    178 	register char *p;
    179 	register struct words *c;
    180 	int failed;
    181 
    182 	p = ip->ib_cur;
    183 	lineno++;
    184 	failed = 0;
    185 	c = w;
    186 	for (;;) {
    187 		nstate:
    188 			if (c->inp == (*p & 0377)) {
    189 				c = c->nst;
    190 			}
    191 			else if (c->link != 0) {
    192 				c = c->link;
    193 				goto nstate;
    194 			}
    195 			else {
    196 				c = c->fail;
    197 				failed = 1;
    198 				if (c==0) {
    199 					c = w;
    200 					istate:
    201 					if (c->inp == (*p & 0377)) {
    202 						c = c->nst;
    203 					}
    204 					else if (c->link != 0) {
    205 						c = c->link;
    206 						goto istate;
    207 					}
    208 				}
    209 				else goto nstate;
    210 			}
    211 		if (c->out) {
    212 			if (xflag) {
    213 				register char *ep = p;
    214 				while (*ep != '\n')
    215 					ep++;
    216 				if ((failed || ep > p) && vflag == 0) {
    217 					ip->ib_cur = &ep[1];
    218 					goto nogood;
    219 				}
    220 			}
    221 			if (vflag == 0) {
    222 		succeed:	outline(ip, last, p - ip->ib_cur);
    223 				if (qflag || lflag)
    224 					return 1;
    225 			} else {
    226 				ip->ib_cur = p;
    227 				while (*ip->ib_cur++ != '\n');
    228 			}
    229 		nogood:	if ((p = ip->ib_cur) > last)
    230 				return 0;
    231 			lineno++;
    232 			c = w;
    233 			failed = 0;
    234 			continue;
    235 		}
    236 		if (*p++ == '\n') {
    237 			if (vflag) {
    238 				p--;
    239 				goto succeed;
    240 			}
    241 			if ((ip->ib_cur = p) > last)
    242 				return 0;
    243 			lineno++;
    244 			c = w;
    245 			failed = 0;
    246 		}
    247 	}
    248 }
    249 
    250 static int
    251 ac_matchw(const char *line, size_t sz)
    252 {
    253 	register const char *p;
    254 	wint_t z;
    255 	register struct words *c;
    256 	int failed, n = 0;
    257 
    258 	p = line;
    259 	failed = 0;
    260 	c = w;
    261 	if (p == &line[sz])
    262 		z = '\n';
    263 	else {
    264 		if (*p & 0200) {
    265 			if ((n = mbtowi(&z, p, &line[sz] - p)) < 0) {
    266 				n = 1;
    267 				z = WEOF;
    268 			}
    269 		} else {
    270 			z = *p;
    271 			n = 1;
    272 		}
    273 	}
    274 	for (;;) {
    275 		nstate:
    276 			if (c->inp == z) {
    277 				c = c->nst;
    278 			}
    279 			else if (c->link != 0) {
    280 				c = c->link;
    281 				goto nstate;
    282 			}
    283 			else {
    284 				c = c->fail;
    285 				failed = 1;
    286 				if (c==0) {
    287 					c = w;
    288 					istate:
    289 					if (c->inp == z) {
    290 						c = c->nst;
    291 					}
    292 					else if (c->link != 0) {
    293 						c = c->link;
    294 						goto istate;
    295 					}
    296 				}
    297 				else goto nstate;
    298 			}
    299 		if (c->out) {
    300 			if (xflag) {
    301 				if (failed || p < &line[sz])
    302 					return 0;
    303 			}
    304 			return 1;
    305 		}
    306 		p += n;
    307 		if (p >= &line[sz]) {
    308 			if (z == '\n')
    309 				return 0;
    310 			else
    311 				z = '\n';
    312 		} else {
    313 			if (*p & 0200) {
    314 				if ((n = mbtowi(&z, p, &line[sz] - p)) < 0) {
    315 					n = 1;
    316 					z = WEOF;
    317 				}
    318 			} else {
    319 				z = *p;
    320 				n = 1;
    321 			}
    322 		}
    323 	}
    324 }
    325 
    326 static int
    327 ac_rangew(struct iblok *ip, char *last)
    328 {
    329 	register char *p;
    330 	wint_t	z;
    331 	register struct words *c;
    332 	int failed, n = 0;
    333 
    334 	p = ip->ib_cur;
    335 	lineno++;
    336 	failed = 0;
    337 	c = w;
    338 	for (;;) {
    339 		nstate:
    340 			if (*p & 0200) {
    341 				if ((n = mbtowi(&z, p, last + 1 - p)) < 0) {
    342 					n = 1;
    343 					z = WEOF;
    344 				}
    345 			} else {
    346 				z = *p;
    347 				n = 1;
    348 			}
    349 			if (c->inp == z) {
    350 				c = c->nst;
    351 			}
    352 			else if (c->link != 0) {
    353 				c = c->link;
    354 				goto nstate;
    355 			}
    356 			else {
    357 				c = c->fail;
    358 				failed = 1;
    359 				if (c==0) {
    360 					c = w;
    361 					istate:
    362 					if (c->inp == z) {
    363 						c = c->nst;
    364 					}
    365 					else if (c->link != 0) {
    366 						c = c->link;
    367 						goto istate;
    368 					}
    369 				}
    370 				else goto nstate;
    371 			}
    372 		if (c->out) {
    373 			if (xflag) {
    374 				register char *ep = p;
    375 				while (*ep != '\n')
    376 					ep++;
    377 				if ((failed || ep > p) && vflag == 0) {
    378 					ip->ib_cur = &ep[1];
    379 					goto nogood;
    380 				}
    381 			}
    382 			if (vflag == 0) {
    383 		succeed:	outline(ip, last, p - ip->ib_cur);
    384 				if (qflag || lflag)
    385 					return 1;
    386 			} else {
    387 				ip->ib_cur = p;
    388 				while (*ip->ib_cur++ != '\n');
    389 			}
    390 		nogood:	if ((p = ip->ib_cur) > last)
    391 				return 0;
    392 			lineno++;
    393 			c = w;
    394 			failed = 0;
    395 			continue;
    396 		}
    397 		p += n;
    398 		if (p[-n] == '\n') {
    399 			if (vflag) {
    400 				p--;
    401 				goto succeed;
    402 			}
    403 			if ((ip->ib_cur = p) > last)
    404 				return 0;
    405 			lineno++;
    406 			c = w;
    407 			failed = 0;
    408 		}
    409 	}
    410 }
    411 
    412 static void
    413 cgotofn(void)
    414 {
    415 	register int c;
    416 	register struct words *s;
    417 
    418 	woverflo();
    419 	s = smax = w = wcur;
    420 nword:	for(;;) {
    421 		c = nextch();
    422 		if (c==EOF)
    423 			return;
    424 		if (c == '\n') {
    425 			if (xflag) {
    426 				for(;;) {
    427 					if (s->inp == c) {
    428 						s = s->nst;
    429 						break;
    430 					}
    431 					if (s->inp == 0) goto nenter;
    432 					if (s->link == 0) {
    433 						if (++smax >= &wcur[MAXSIZ])
    434 							woverflo();
    435 						s->link = smax;
    436 						s = smax;
    437 						goto nenter;
    438 					}
    439 					s = s->link;
    440 				}
    441 			}
    442 			s->out = 1;
    443 			s = w;
    444 		} else {
    445 		loop:	if (s->inp == c) {
    446 				s = s->nst;
    447 				continue;
    448 			}
    449 			if (s->inp == 0) goto enter;
    450 			if (s->link == 0) {
    451 				if (++smax >= &wcur[MAXSIZ])
    452 					woverflo();
    453 				s->link = smax;
    454 				s = smax;
    455 				goto enter;
    456 			}
    457 			s = s->link;
    458 			goto loop;
    459 		}
    460 	}
    461 
    462 	enter:
    463 	do {
    464 		s->inp = c;
    465 		if (++smax >= &wcur[MAXSIZ])
    466 			woverflo();
    467 		s->nst = smax;
    468 		s = smax;
    469 	} while ((c = nextch()) != '\n' && c!=EOF);
    470 	if (xflag) {
    471 	nenter:	s->inp = '\n';
    472 		if (++smax >= &wcur[MAXSIZ])
    473 			woverflo();
    474 		s->nst = smax;
    475 	}
    476 	smax->out = 1;
    477 	s = w;
    478 	if (c != EOF)
    479 		goto nword;
    480 }
    481 
    482 static void
    483 check(int val, int incr)
    484 {
    485 	if ((unsigned)(val + incr) >= INT_MAX) {
    486 		fprintf(stderr, "%s: wordlist too large\n", progname);
    487 		exit(2);
    488 	}
    489 }
    490 
    491 static void
    492 woverflo(void)
    493 {
    494 	wcur = smax = scalloc(MAXSIZ, sizeof *smax);
    495 }
    496 
    497 static void
    498 qoverflo(struct words ***queue, int *qsize)
    499 {
    500 	check(*qsize, QSIZE);
    501 	*queue = srealloc(*queue, (*qsize += QSIZE) * sizeof **queue);
    502 }
    503 
    504 static void
    505 cfail(void)
    506 {
    507 	struct words **queue = NULL;
    508 	int front, rear;
    509 	int qsize = 0;
    510 	struct words *state;
    511 	int bstart;
    512 	register char c;
    513 	register struct words *s;
    514 	qoverflo(&queue, &qsize);
    515 	s = w;
    516 	front = rear = 0;
    517 init:	if ((s->inp) != 0) {
    518 		queue[rear++] = s->nst;
    519 		if (rear >= qsize - 1)
    520 			qoverflo(&queue, &qsize);
    521 	}
    522 	if ((s = s->link) != 0) {
    523 		goto init;
    524 	}
    525 
    526 	while (rear!=front) {
    527 		s = queue[front];
    528 		if (front == qsize-1)
    529 			qoverflo(&queue, &qsize);
    530 		front++;
    531 	cloop:	if ((c = s->inp) != 0) {
    532 			bstart = 0;
    533 			q = s->nst;
    534 			queue[rear] = q;
    535 			if (front < rear) {
    536 				if (rear >= qsize-1)
    537 					qoverflo(&queue, &qsize);
    538 				rear++;
    539 			} else
    540 				if (++rear == front)
    541 					qoverflo(&queue, &qsize);
    542 			state = s->fail;
    543 		floop:	if (state == 0) {
    544 				state = w;
    545 				bstart = 1;
    546 			}
    547 			if (state->inp == c) {
    548 			qloop:	q->fail = state->nst;
    549 				if ((state->nst)->out == 1)
    550 					q->out = 1;
    551 				if ((q = q->link) != 0) goto qloop;
    552 			}
    553 			else if ((state = state->link) != 0)
    554 				goto floop;
    555 			else if (bstart == 0) {
    556 				state = 0;
    557 				goto floop;
    558 			}
    559 		}
    560 		if ((s = s->link) != 0)
    561 			goto cloop;
    562 	}
    563 	free(queue);
    564 }
    565 
    566 /*ARGSUSED*/
    567 static int
    568 a0_match(const char *str, size_t sz)
    569 {
    570 	return 0;
    571 }
    572 
    573 /*ARGSUSED*/
    574 static int
    575 a1_match(const char *str, size_t sz)
    576 {
    577 	return 1;
    578 }