ac.c (10812B)
1 /* 2 * Aho-Corasick algorithm derived from Unix 32V /usr/src/cmd/fgrep.c, 3 * additionally incorporating the fix from the v7 addenda tape. 4 * 5 * Changes by Gunnar Ritter, Freiburg i. Br., Germany, September 2002. 6 */ 7 /* 8 * Copyright(C) Caldera International Inc. 2001-2002. All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * Redistributions of source code and documentation must retain the 14 * above copyright notice, this list of conditions and the following 15 * disclaimer. 16 * Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed or owned by Caldera 22 * International, Inc. 23 * Neither the name of Caldera International, Inc. nor the names of 24 * other contributors may be used to endorse or promote products 25 * derived from this software without specific prior written permission. 26 * 27 * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA 28 * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR 29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 30 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 31 * ARE DISCLAIMED. IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE 32 * LIABLE FOR ANY DIRECT, INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR 33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 35 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 36 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 37 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 38 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39 */ 40 41 #if __GNUC__ >= 3 && __GNUC_MINOR__ >= 4 || __GNUC__ >= 4 42 #define USED __attribute__ ((used)) 43 #elif defined __GNUC__ 44 #define USED __attribute__ ((unused)) 45 #else 46 #define USED 47 #endif 48 static const char sccsid[] USED = "@(#)fgrep.sl 2.10 (gritter) 5/29/05"; 49 50 #include <sys/types.h> 51 #include <string.h> 52 #include <stdlib.h> 53 #include <stdio.h> 54 #include <limits.h> 55 #include "grep.h" 56 #include "alloc.h" 57 58 #include <mbtowi.h> 59 60 #define MAXSIZ 256 61 #define QSIZE 128 62 63 struct words { 64 struct words *nst; 65 struct words *link; 66 struct words *fail; 67 int inp; 68 char out; 69 }; 70 71 static struct words *w, *wcur; 72 static struct words *smax; 73 static struct words *q; 74 75 static void ac_build(void); 76 static int ac_match(const char *, size_t); 77 static int ac_matchw(const char *, size_t); 78 static int ac_range(struct iblok *, char *); 79 static int ac_rangew(struct iblok *, char *); 80 static void cgotofn(void); 81 static void check(int, int); 82 static void woverflo(void); 83 static void qoverflo(struct words ***queue, int *qsize); 84 static void cfail(void); 85 static int a0_match(const char *, size_t); 86 static int a1_match(const char *, size_t); 87 88 void 89 ac_select(void) 90 { 91 build = ac_build; 92 match = mbcode ? ac_matchw : ac_match; 93 matchflags &= ~MF_NULTERM; 94 matchflags |= MF_LOCONV; 95 } 96 97 static void 98 ac_build(void) 99 { 100 struct expr *e; 101 102 if (e0->e_flg & E_NULL) { 103 match = a0_match; 104 return; 105 } 106 for (e = e0; e; e = e->e_nxt) { 107 if (e->e_len == 0 && !xflag) { 108 match = a1_match; 109 return; 110 } 111 } 112 cgotofn(); 113 cfail(); 114 if (!iflag) 115 range = mbcode ? ac_rangew : ac_range; 116 } 117 118 static int 119 ac_match(const char *line, size_t sz) 120 { 121 register const char *p; 122 register int z; 123 register struct words *c; 124 int failed; 125 126 p = line; 127 failed = 0; 128 c = w; 129 if (p == &line[sz]) 130 z = '\n'; 131 else 132 z = *p & 0377; 133 for (;;) { 134 nstate: 135 if (c->inp == z) { 136 c = c->nst; 137 } 138 else if (c->link != 0) { 139 c = c->link; 140 goto nstate; 141 } 142 else { 143 c = c->fail; 144 failed = 1; 145 if (c==0) { 146 c = w; 147 istate: 148 if (c->inp == z) { 149 c = c->nst; 150 } 151 else if (c->link != 0) { 152 c = c->link; 153 goto istate; 154 } 155 } 156 else goto nstate; 157 } 158 if (c->out) { 159 if (xflag) { 160 if (failed || p < &line[sz]) 161 return 0; 162 } 163 return 1; 164 } 165 if (++p >= &line[sz]) { 166 if (z == '\n') 167 return 0; 168 else 169 z = '\n'; 170 } else 171 z = *p & 0377; 172 } 173 } 174 175 static int 176 ac_range(struct iblok *ip, char *last) 177 { 178 register char *p; 179 register struct words *c; 180 int failed; 181 182 p = ip->ib_cur; 183 lineno++; 184 failed = 0; 185 c = w; 186 for (;;) { 187 nstate: 188 if (c->inp == (*p & 0377)) { 189 c = c->nst; 190 } 191 else if (c->link != 0) { 192 c = c->link; 193 goto nstate; 194 } 195 else { 196 c = c->fail; 197 failed = 1; 198 if (c==0) { 199 c = w; 200 istate: 201 if (c->inp == (*p & 0377)) { 202 c = c->nst; 203 } 204 else if (c->link != 0) { 205 c = c->link; 206 goto istate; 207 } 208 } 209 else goto nstate; 210 } 211 if (c->out) { 212 if (xflag) { 213 register char *ep = p; 214 while (*ep != '\n') 215 ep++; 216 if ((failed || ep > p) && vflag == 0) { 217 ip->ib_cur = &ep[1]; 218 goto nogood; 219 } 220 } 221 if (vflag == 0) { 222 succeed: outline(ip, last, p - ip->ib_cur); 223 if (qflag || lflag) 224 return 1; 225 } else { 226 ip->ib_cur = p; 227 while (*ip->ib_cur++ != '\n'); 228 } 229 nogood: if ((p = ip->ib_cur) > last) 230 return 0; 231 lineno++; 232 c = w; 233 failed = 0; 234 continue; 235 } 236 if (*p++ == '\n') { 237 if (vflag) { 238 p--; 239 goto succeed; 240 } 241 if ((ip->ib_cur = p) > last) 242 return 0; 243 lineno++; 244 c = w; 245 failed = 0; 246 } 247 } 248 } 249 250 static int 251 ac_matchw(const char *line, size_t sz) 252 { 253 register const char *p; 254 wint_t z; 255 register struct words *c; 256 int failed, n = 0; 257 258 p = line; 259 failed = 0; 260 c = w; 261 if (p == &line[sz]) 262 z = '\n'; 263 else { 264 if (*p & 0200) { 265 if ((n = mbtowi(&z, p, &line[sz] - p)) < 0) { 266 n = 1; 267 z = WEOF; 268 } 269 } else { 270 z = *p; 271 n = 1; 272 } 273 } 274 for (;;) { 275 nstate: 276 if (c->inp == z) { 277 c = c->nst; 278 } 279 else if (c->link != 0) { 280 c = c->link; 281 goto nstate; 282 } 283 else { 284 c = c->fail; 285 failed = 1; 286 if (c==0) { 287 c = w; 288 istate: 289 if (c->inp == z) { 290 c = c->nst; 291 } 292 else if (c->link != 0) { 293 c = c->link; 294 goto istate; 295 } 296 } 297 else goto nstate; 298 } 299 if (c->out) { 300 if (xflag) { 301 if (failed || p < &line[sz]) 302 return 0; 303 } 304 return 1; 305 } 306 p += n; 307 if (p >= &line[sz]) { 308 if (z == '\n') 309 return 0; 310 else 311 z = '\n'; 312 } else { 313 if (*p & 0200) { 314 if ((n = mbtowi(&z, p, &line[sz] - p)) < 0) { 315 n = 1; 316 z = WEOF; 317 } 318 } else { 319 z = *p; 320 n = 1; 321 } 322 } 323 } 324 } 325 326 static int 327 ac_rangew(struct iblok *ip, char *last) 328 { 329 register char *p; 330 wint_t z; 331 register struct words *c; 332 int failed, n = 0; 333 334 p = ip->ib_cur; 335 lineno++; 336 failed = 0; 337 c = w; 338 for (;;) { 339 nstate: 340 if (*p & 0200) { 341 if ((n = mbtowi(&z, p, last + 1 - p)) < 0) { 342 n = 1; 343 z = WEOF; 344 } 345 } else { 346 z = *p; 347 n = 1; 348 } 349 if (c->inp == z) { 350 c = c->nst; 351 } 352 else if (c->link != 0) { 353 c = c->link; 354 goto nstate; 355 } 356 else { 357 c = c->fail; 358 failed = 1; 359 if (c==0) { 360 c = w; 361 istate: 362 if (c->inp == z) { 363 c = c->nst; 364 } 365 else if (c->link != 0) { 366 c = c->link; 367 goto istate; 368 } 369 } 370 else goto nstate; 371 } 372 if (c->out) { 373 if (xflag) { 374 register char *ep = p; 375 while (*ep != '\n') 376 ep++; 377 if ((failed || ep > p) && vflag == 0) { 378 ip->ib_cur = &ep[1]; 379 goto nogood; 380 } 381 } 382 if (vflag == 0) { 383 succeed: outline(ip, last, p - ip->ib_cur); 384 if (qflag || lflag) 385 return 1; 386 } else { 387 ip->ib_cur = p; 388 while (*ip->ib_cur++ != '\n'); 389 } 390 nogood: if ((p = ip->ib_cur) > last) 391 return 0; 392 lineno++; 393 c = w; 394 failed = 0; 395 continue; 396 } 397 p += n; 398 if (p[-n] == '\n') { 399 if (vflag) { 400 p--; 401 goto succeed; 402 } 403 if ((ip->ib_cur = p) > last) 404 return 0; 405 lineno++; 406 c = w; 407 failed = 0; 408 } 409 } 410 } 411 412 static void 413 cgotofn(void) 414 { 415 register int c; 416 register struct words *s; 417 418 woverflo(); 419 s = smax = w = wcur; 420 nword: for(;;) { 421 c = nextch(); 422 if (c==EOF) 423 return; 424 if (c == '\n') { 425 if (xflag) { 426 for(;;) { 427 if (s->inp == c) { 428 s = s->nst; 429 break; 430 } 431 if (s->inp == 0) goto nenter; 432 if (s->link == 0) { 433 if (++smax >= &wcur[MAXSIZ]) 434 woverflo(); 435 s->link = smax; 436 s = smax; 437 goto nenter; 438 } 439 s = s->link; 440 } 441 } 442 s->out = 1; 443 s = w; 444 } else { 445 loop: if (s->inp == c) { 446 s = s->nst; 447 continue; 448 } 449 if (s->inp == 0) goto enter; 450 if (s->link == 0) { 451 if (++smax >= &wcur[MAXSIZ]) 452 woverflo(); 453 s->link = smax; 454 s = smax; 455 goto enter; 456 } 457 s = s->link; 458 goto loop; 459 } 460 } 461 462 enter: 463 do { 464 s->inp = c; 465 if (++smax >= &wcur[MAXSIZ]) 466 woverflo(); 467 s->nst = smax; 468 s = smax; 469 } while ((c = nextch()) != '\n' && c!=EOF); 470 if (xflag) { 471 nenter: s->inp = '\n'; 472 if (++smax >= &wcur[MAXSIZ]) 473 woverflo(); 474 s->nst = smax; 475 } 476 smax->out = 1; 477 s = w; 478 if (c != EOF) 479 goto nword; 480 } 481 482 static void 483 check(int val, int incr) 484 { 485 if ((unsigned)(val + incr) >= INT_MAX) { 486 fprintf(stderr, "%s: wordlist too large\n", progname); 487 exit(2); 488 } 489 } 490 491 static void 492 woverflo(void) 493 { 494 wcur = smax = scalloc(MAXSIZ, sizeof *smax); 495 } 496 497 static void 498 qoverflo(struct words ***queue, int *qsize) 499 { 500 check(*qsize, QSIZE); 501 *queue = srealloc(*queue, (*qsize += QSIZE) * sizeof **queue); 502 } 503 504 static void 505 cfail(void) 506 { 507 struct words **queue = NULL; 508 int front, rear; 509 int qsize = 0; 510 struct words *state; 511 int bstart; 512 register char c; 513 register struct words *s; 514 qoverflo(&queue, &qsize); 515 s = w; 516 front = rear = 0; 517 init: if ((s->inp) != 0) { 518 queue[rear++] = s->nst; 519 if (rear >= qsize - 1) 520 qoverflo(&queue, &qsize); 521 } 522 if ((s = s->link) != 0) { 523 goto init; 524 } 525 526 while (rear!=front) { 527 s = queue[front]; 528 if (front == qsize-1) 529 qoverflo(&queue, &qsize); 530 front++; 531 cloop: if ((c = s->inp) != 0) { 532 bstart = 0; 533 q = s->nst; 534 queue[rear] = q; 535 if (front < rear) { 536 if (rear >= qsize-1) 537 qoverflo(&queue, &qsize); 538 rear++; 539 } else 540 if (++rear == front) 541 qoverflo(&queue, &qsize); 542 state = s->fail; 543 floop: if (state == 0) { 544 state = w; 545 bstart = 1; 546 } 547 if (state->inp == c) { 548 qloop: q->fail = state->nst; 549 if ((state->nst)->out == 1) 550 q->out = 1; 551 if ((q = q->link) != 0) goto qloop; 552 } 553 else if ((state = state->link) != 0) 554 goto floop; 555 else if (bstart == 0) { 556 state = 0; 557 goto floop; 558 } 559 } 560 if ((s = s->link) != 0) 561 goto cloop; 562 } 563 free(queue); 564 } 565 566 /*ARGSUSED*/ 567 static int 568 a0_match(const char *str, size_t sz) 569 { 570 return 0; 571 } 572 573 /*ARGSUSED*/ 574 static int 575 a1_match(const char *str, size_t sz) 576 { 577 return 1; 578 }