nfa.c (17417B)
1 /* $OpenBSD: nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $ */ 2 3 /* nfa - NFA construction routines */ 4 5 /*- 6 * Copyright (c) 1990 The Regents of the University of California. 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * Vern Paxson. 11 * 12 * The United States Government has rights in this work pursuant 13 * to contract no. DE-AC03-76SF00098 between the United States 14 * Department of Energy and the University of California. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted provided that the following conditions 18 * are met: 19 * 20 * 1. Redistributions of source code must retain the above copyright 21 * notice, this list of conditions and the following disclaimer. 22 * 2. Redistributions in binary form must reproduce the above copyright 23 * notice, this list of conditions and the following disclaimer in the 24 * documentation and/or other materials provided with the distribution. 25 * 26 * Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 31 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 32 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 33 * PURPOSE. 34 */ 35 36 /* $Header: /cvs/src/usr.bin/lex/nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $ */ 37 38 #include "flexdef.h" 39 40 41 /* declare functions that have forward references */ 42 43 int dupmachine PROTO((int)); 44 void mkxtion PROTO((int, int)); 45 46 47 /* add_accept - add an accepting state to a machine 48 * 49 * accepting_number becomes mach's accepting number. 50 */ 51 52 void add_accept( mach, accepting_number ) 53 int mach, accepting_number; 54 { 55 /* Hang the accepting number off an epsilon state. if it is associated 56 * with a state that has a non-epsilon out-transition, then the state 57 * will accept BEFORE it makes that transition, i.e., one character 58 * too soon. 59 */ 60 61 if ( transchar[finalst[mach]] == SYM_EPSILON ) 62 accptnum[finalst[mach]] = accepting_number; 63 64 else 65 { 66 int astate = mkstate( SYM_EPSILON ); 67 accptnum[astate] = accepting_number; 68 (void) link_machines( mach, astate ); 69 } 70 } 71 72 73 /* copysingl - make a given number of copies of a singleton machine 74 * 75 * synopsis 76 * 77 * newsng = copysingl( singl, num ); 78 * 79 * newsng - a new singleton composed of num copies of singl 80 * singl - a singleton machine 81 * num - the number of copies of singl to be present in newsng 82 */ 83 84 int copysingl( singl, num ) 85 int singl, num; 86 { 87 int copy, i; 88 89 copy = mkstate( SYM_EPSILON ); 90 91 for ( i = 1; i <= num; ++i ) 92 copy = link_machines( copy, dupmachine( singl ) ); 93 94 return copy; 95 } 96 97 98 /* dumpnfa - debugging routine to write out an nfa */ 99 100 void dumpnfa( state1 ) 101 int state1; 102 103 { 104 int sym, tsp1, tsp2, anum, ns; 105 106 fprintf( stderr, 107 _( "\n\n********** beginning dump of nfa with start state %d\n" ), 108 state1 ); 109 110 /* We probably should loop starting at firstst[state1] and going to 111 * lastst[state1], but they're not maintained properly when we "or" 112 * all of the rules together. So we use our knowledge that the machine 113 * starts at state 1 and ends at lastnfa. 114 */ 115 116 /* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */ 117 for ( ns = 1; ns <= lastnfa; ++ns ) 118 { 119 fprintf( stderr, _( "state # %4d\t" ), ns ); 120 121 sym = transchar[ns]; 122 tsp1 = trans1[ns]; 123 tsp2 = trans2[ns]; 124 anum = accptnum[ns]; 125 126 fprintf( stderr, "%3d: %4d, %4d", sym, tsp1, tsp2 ); 127 128 if ( anum != NIL ) 129 fprintf( stderr, " [%d]", anum ); 130 131 fprintf( stderr, "\n" ); 132 } 133 134 fprintf( stderr, _( "********** end of dump\n" ) ); 135 } 136 137 138 /* dupmachine - make a duplicate of a given machine 139 * 140 * synopsis 141 * 142 * copy = dupmachine( mach ); 143 * 144 * copy - holds duplicate of mach 145 * mach - machine to be duplicated 146 * 147 * note that the copy of mach is NOT an exact duplicate; rather, all the 148 * transition states values are adjusted so that the copy is self-contained, 149 * as the original should have been. 150 * 151 * also note that the original MUST be contiguous, with its low and high 152 * states accessible by the arrays firstst and lastst 153 */ 154 155 int dupmachine( mach ) 156 int mach; 157 { 158 int i, init, state_offset; 159 int state = 0; 160 int last = lastst[mach]; 161 162 for ( i = firstst[mach]; i <= last; ++i ) 163 { 164 state = mkstate( transchar[i] ); 165 166 if ( trans1[i] != NO_TRANSITION ) 167 { 168 mkxtion( finalst[state], trans1[i] + state - i ); 169 170 if ( transchar[i] == SYM_EPSILON && 171 trans2[i] != NO_TRANSITION ) 172 mkxtion( finalst[state], 173 trans2[i] + state - i ); 174 } 175 176 accptnum[state] = accptnum[i]; 177 } 178 179 if ( state == 0 ) 180 flexfatal( _( "empty machine in dupmachine()" ) ); 181 182 state_offset = state - i + 1; 183 184 init = mach + state_offset; 185 firstst[init] = firstst[mach] + state_offset; 186 finalst[init] = finalst[mach] + state_offset; 187 lastst[init] = lastst[mach] + state_offset; 188 189 return init; 190 } 191 192 193 /* finish_rule - finish up the processing for a rule 194 * 195 * An accepting number is added to the given machine. If variable_trail_rule 196 * is true then the rule has trailing context and both the head and trail 197 * are variable size. Otherwise if headcnt or trailcnt is non-zero then 198 * the machine recognizes a pattern with trailing context and headcnt is 199 * the number of characters in the matched part of the pattern, or zero 200 * if the matched part has variable length. trailcnt is the number of 201 * trailing context characters in the pattern, or zero if the trailing 202 * context has variable length. 203 */ 204 205 void finish_rule( mach, variable_trail_rule, headcnt, trailcnt ) 206 int mach, variable_trail_rule, headcnt, trailcnt; 207 { 208 char action_text[MAXLINE]; 209 210 add_accept( mach, num_rules ); 211 212 /* We did this in new_rule(), but it often gets the wrong 213 * number because we do it before we start parsing the current rule. 214 */ 215 rule_linenum[num_rules] = linenum; 216 217 /* If this is a continued action, then the line-number has already 218 * been updated, giving us the wrong number. 219 */ 220 if ( continued_action ) 221 --rule_linenum[num_rules]; 222 223 snprintf( action_text, sizeof action_text, "case %d:\n", num_rules ); 224 add_action( action_text ); 225 226 if ( variable_trail_rule ) 227 { 228 rule_type[num_rules] = RULE_VARIABLE; 229 230 if ( performance_report > 0 ) 231 fprintf( stderr, 232 _( "Variable trailing context rule at line %d\n" ), 233 rule_linenum[num_rules] ); 234 235 variable_trailing_context_rules = true; 236 } 237 238 else 239 { 240 rule_type[num_rules] = RULE_NORMAL; 241 242 if ( headcnt > 0 || trailcnt > 0 ) 243 { 244 /* Do trailing context magic to not match the trailing 245 * characters. 246 */ 247 char *scanner_cp = "yy_c_buf_p = yy_cp"; 248 char *scanner_bp = "yy_bp"; 249 250 add_action( 251 "*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" ); 252 253 if ( headcnt > 0 ) 254 { 255 snprintf( action_text, sizeof action_text, 256 "%s = %s + %d;\n", 257 scanner_cp, scanner_bp, headcnt ); 258 add_action( action_text ); 259 } 260 261 else 262 { 263 snprintf( action_text, sizeof action_text, 264 "%s -= %d;\n", 265 scanner_cp, trailcnt ); 266 add_action( action_text ); 267 } 268 269 add_action( 270 "YY_DO_BEFORE_ACTION; /* set up yytext again */\n" ); 271 } 272 } 273 274 /* Okay, in the action code at this point yytext and yyleng have 275 * their proper final values for this rule, so here's the point 276 * to do any user action. But don't do it for continued actions, 277 * as that'll result in multiple YY_RULE_SETUP's. 278 */ 279 if ( ! continued_action ) 280 add_action( "YY_RULE_SETUP\n" ); 281 282 line_directive_out( (FILE *) 0, 1 ); 283 } 284 285 286 /* link_machines - connect two machines together 287 * 288 * synopsis 289 * 290 * new = link_machines( first, last ); 291 * 292 * new - a machine constructed by connecting first to last 293 * first - the machine whose successor is to be last 294 * last - the machine whose predecessor is to be first 295 * 296 * note: this routine concatenates the machine first with the machine 297 * last to produce a machine new which will pattern-match first first 298 * and then last, and will fail if either of the sub-patterns fails. 299 * FIRST is set to new by the operation. last is unmolested. 300 */ 301 302 int link_machines( first, last ) 303 int first, last; 304 { 305 if ( first == NIL ) 306 return last; 307 308 else if ( last == NIL ) 309 return first; 310 311 else 312 { 313 mkxtion( finalst[first], last ); 314 finalst[first] = finalst[last]; 315 lastst[first] = MAX( lastst[first], lastst[last] ); 316 firstst[first] = MIN( firstst[first], firstst[last] ); 317 318 return first; 319 } 320 } 321 322 323 /* mark_beginning_as_normal - mark each "beginning" state in a machine 324 * as being a "normal" (i.e., not trailing context- 325 * associated) states 326 * 327 * The "beginning" states are the epsilon closure of the first state 328 */ 329 330 void mark_beginning_as_normal( mach ) 331 int mach; 332 { 333 switch ( state_type[mach] ) 334 { 335 case STATE_NORMAL: 336 /* Oh, we've already visited here. */ 337 return; 338 339 case STATE_TRAILING_CONTEXT: 340 state_type[mach] = STATE_NORMAL; 341 342 if ( transchar[mach] == SYM_EPSILON ) 343 { 344 if ( trans1[mach] != NO_TRANSITION ) 345 mark_beginning_as_normal( 346 trans1[mach] ); 347 348 if ( trans2[mach] != NO_TRANSITION ) 349 mark_beginning_as_normal( 350 trans2[mach] ); 351 } 352 break; 353 354 default: 355 flexerror( 356 _( "bad state type in mark_beginning_as_normal()" ) ); 357 break; 358 } 359 } 360 361 362 /* mkbranch - make a machine that branches to two machines 363 * 364 * synopsis 365 * 366 * branch = mkbranch( first, second ); 367 * 368 * branch - a machine which matches either first's pattern or second's 369 * first, second - machines whose patterns are to be or'ed (the | operator) 370 * 371 * Note that first and second are NEITHER destroyed by the operation. Also, 372 * the resulting machine CANNOT be used with any other "mk" operation except 373 * more mkbranch's. Compare with mkor() 374 */ 375 376 int mkbranch( first, second ) 377 int first, second; 378 { 379 int eps; 380 381 if ( first == NO_TRANSITION ) 382 return second; 383 384 else if ( second == NO_TRANSITION ) 385 return first; 386 387 eps = mkstate( SYM_EPSILON ); 388 389 mkxtion( eps, first ); 390 mkxtion( eps, second ); 391 392 return eps; 393 } 394 395 396 /* mkclos - convert a machine into a closure 397 * 398 * synopsis 399 * new = mkclos( state ); 400 * 401 * new - a new state which matches the closure of "state" 402 */ 403 404 int mkclos( state ) 405 int state; 406 { 407 return mkopt( mkposcl( state ) ); 408 } 409 410 411 /* mkopt - make a machine optional 412 * 413 * synopsis 414 * 415 * new = mkopt( mach ); 416 * 417 * new - a machine which optionally matches whatever mach matched 418 * mach - the machine to make optional 419 * 420 * notes: 421 * 1. mach must be the last machine created 422 * 2. mach is destroyed by the call 423 */ 424 425 int mkopt( mach ) 426 int mach; 427 { 428 int eps; 429 430 if ( ! SUPER_FREE_EPSILON(finalst[mach]) ) 431 { 432 eps = mkstate( SYM_EPSILON ); 433 mach = link_machines( mach, eps ); 434 } 435 436 /* Can't skimp on the following if FREE_EPSILON(mach) is true because 437 * some state interior to "mach" might point back to the beginning 438 * for a closure. 439 */ 440 eps = mkstate( SYM_EPSILON ); 441 mach = link_machines( eps, mach ); 442 443 mkxtion( mach, finalst[mach] ); 444 445 return mach; 446 } 447 448 449 /* mkor - make a machine that matches either one of two machines 450 * 451 * synopsis 452 * 453 * new = mkor( first, second ); 454 * 455 * new - a machine which matches either first's pattern or second's 456 * first, second - machines whose patterns are to be or'ed (the | operator) 457 * 458 * note that first and second are both destroyed by the operation 459 * the code is rather convoluted because an attempt is made to minimize 460 * the number of epsilon states needed 461 */ 462 463 int mkor( first, second ) 464 int first, second; 465 { 466 int eps, orend; 467 468 if ( first == NIL ) 469 return second; 470 471 else if ( second == NIL ) 472 return first; 473 474 else 475 { 476 /* See comment in mkopt() about why we can't use the first 477 * state of "first" or "second" if they satisfy "FREE_EPSILON". 478 */ 479 eps = mkstate( SYM_EPSILON ); 480 481 first = link_machines( eps, first ); 482 483 mkxtion( first, second ); 484 485 if ( SUPER_FREE_EPSILON(finalst[first]) && 486 accptnum[finalst[first]] == NIL ) 487 { 488 orend = finalst[first]; 489 mkxtion( finalst[second], orend ); 490 } 491 492 else if ( SUPER_FREE_EPSILON(finalst[second]) && 493 accptnum[finalst[second]] == NIL ) 494 { 495 orend = finalst[second]; 496 mkxtion( finalst[first], orend ); 497 } 498 499 else 500 { 501 eps = mkstate( SYM_EPSILON ); 502 503 first = link_machines( first, eps ); 504 orend = finalst[first]; 505 506 mkxtion( finalst[second], orend ); 507 } 508 } 509 510 finalst[first] = orend; 511 return first; 512 } 513 514 515 /* mkposcl - convert a machine into a positive closure 516 * 517 * synopsis 518 * new = mkposcl( state ); 519 * 520 * new - a machine matching the positive closure of "state" 521 */ 522 523 int mkposcl( state ) 524 int state; 525 { 526 int eps; 527 528 if ( SUPER_FREE_EPSILON(finalst[state]) ) 529 { 530 mkxtion( finalst[state], state ); 531 return state; 532 } 533 534 else 535 { 536 eps = mkstate( SYM_EPSILON ); 537 mkxtion( eps, state ); 538 return link_machines( state, eps ); 539 } 540 } 541 542 543 /* mkrep - make a replicated machine 544 * 545 * synopsis 546 * new = mkrep( mach, lb, ub ); 547 * 548 * new - a machine that matches whatever "mach" matched from "lb" 549 * number of times to "ub" number of times 550 * 551 * note 552 * if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach" 553 */ 554 555 int mkrep( mach, lb, ub ) 556 int mach, lb, ub; 557 { 558 int base_mach, tail, copy, i; 559 560 base_mach = copysingl( mach, lb - 1 ); 561 562 if ( ub == INFINITY ) 563 { 564 copy = dupmachine( mach ); 565 mach = link_machines( mach, 566 link_machines( base_mach, mkclos( copy ) ) ); 567 } 568 569 else 570 { 571 tail = mkstate( SYM_EPSILON ); 572 573 for ( i = lb; i < ub; ++i ) 574 { 575 copy = dupmachine( mach ); 576 tail = mkopt( link_machines( copy, tail ) ); 577 } 578 579 mach = link_machines( mach, link_machines( base_mach, tail ) ); 580 } 581 582 return mach; 583 } 584 585 586 /* mkstate - create a state with a transition on a given symbol 587 * 588 * synopsis 589 * 590 * state = mkstate( sym ); 591 * 592 * state - a new state matching sym 593 * sym - the symbol the new state is to have an out-transition on 594 * 595 * note that this routine makes new states in ascending order through the 596 * state array (and increments LASTNFA accordingly). The routine DUPMACHINE 597 * relies on machines being made in ascending order and that they are 598 * CONTIGUOUS. Change it and you will have to rewrite DUPMACHINE (kludge 599 * that it admittedly is) 600 */ 601 602 int mkstate( sym ) 603 int sym; 604 { 605 if ( ++lastnfa >= current_mns ) 606 { 607 if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS ) 608 lerrif( 609 _( "input rules are too complicated (>= %d NFA states)" ), 610 current_mns ); 611 612 ++num_reallocs; 613 614 firstst = reallocate_integer_array( firstst, current_mns ); 615 lastst = reallocate_integer_array( lastst, current_mns ); 616 finalst = reallocate_integer_array( finalst, current_mns ); 617 transchar = reallocate_integer_array( transchar, current_mns ); 618 trans1 = reallocate_integer_array( trans1, current_mns ); 619 trans2 = reallocate_integer_array( trans2, current_mns ); 620 accptnum = reallocate_integer_array( accptnum, current_mns ); 621 assoc_rule = 622 reallocate_integer_array( assoc_rule, current_mns ); 623 state_type = 624 reallocate_integer_array( state_type, current_mns ); 625 } 626 627 firstst[lastnfa] = lastnfa; 628 finalst[lastnfa] = lastnfa; 629 lastst[lastnfa] = lastnfa; 630 transchar[lastnfa] = sym; 631 trans1[lastnfa] = NO_TRANSITION; 632 trans2[lastnfa] = NO_TRANSITION; 633 accptnum[lastnfa] = NIL; 634 assoc_rule[lastnfa] = num_rules; 635 state_type[lastnfa] = current_state_type; 636 637 /* Fix up equivalence classes base on this transition. Note that any 638 * character which has its own transition gets its own equivalence 639 * class. Thus only characters which are only in character classes 640 * have a chance at being in the same equivalence class. E.g. "a|b" 641 * puts 'a' and 'b' into two different equivalence classes. "[ab]" 642 * puts them in the same equivalence class (barring other differences 643 * elsewhere in the input). 644 */ 645 646 if ( sym < 0 ) 647 { 648 /* We don't have to update the equivalence classes since 649 * that was already done when the ccl was created for the 650 * first time. 651 */ 652 } 653 654 else if ( sym == SYM_EPSILON ) 655 ++numeps; 656 657 else 658 { 659 check_char( sym ); 660 661 if ( useecs ) 662 /* Map NUL's to csize. */ 663 mkechar( sym ? sym : csize, nextecm, ecgroup ); 664 } 665 666 return lastnfa; 667 } 668 669 670 /* mkxtion - make a transition from one state to another 671 * 672 * synopsis 673 * 674 * mkxtion( statefrom, stateto ); 675 * 676 * statefrom - the state from which the transition is to be made 677 * stateto - the state to which the transition is to be made 678 */ 679 680 void mkxtion( statefrom, stateto ) 681 int statefrom, stateto; 682 { 683 if ( trans1[statefrom] == NO_TRANSITION ) 684 trans1[statefrom] = stateto; 685 686 else if ( (transchar[statefrom] != SYM_EPSILON) || 687 (trans2[statefrom] != NO_TRANSITION) ) 688 flexfatal( _( "found too many transitions in mkxtion()" ) ); 689 690 else 691 { /* second out-transition for an epsilon state */ 692 ++eps2; 693 trans2[statefrom] = stateto; 694 } 695 } 696 697 /* new_rule - initialize for a new rule */ 698 699 void new_rule() 700 { 701 if ( ++num_rules >= current_max_rules ) 702 { 703 ++num_reallocs; 704 current_max_rules += MAX_RULES_INCREMENT; 705 rule_type = reallocate_integer_array( rule_type, 706 current_max_rules ); 707 rule_linenum = reallocate_integer_array( rule_linenum, 708 current_max_rules ); 709 rule_useful = reallocate_integer_array( rule_useful, 710 current_max_rules ); 711 } 712 713 if ( num_rules > MAX_RULE ) 714 lerrif( _( "too many rules (> %d)!" ), MAX_RULE ); 715 716 rule_linenum[num_rules] = linenum; 717 rule_useful[num_rules] = false; 718 }