fatbase

portable OpenBSD tools
git clone git://git.2f30.org/fatbase
Log | Files | Refs

nfa.c (17417B)


      1 /*	$OpenBSD: nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $	*/
      2 
      3 /* nfa - NFA construction routines */
      4 
      5 /*-
      6  * Copyright (c) 1990 The Regents of the University of California.
      7  * All rights reserved.
      8  *
      9  * This code is derived from software contributed to Berkeley by
     10  * Vern Paxson.
     11  * 
     12  * The United States Government has rights in this work pursuant
     13  * to contract no. DE-AC03-76SF00098 between the United States
     14  * Department of Energy and the University of California.
     15  *
     16  * Redistribution and use in source and binary forms, with or without
     17  * modification, are permitted provided that the following conditions
     18  * are met:
     19  *
     20  * 1. Redistributions of source code must retain the above copyright
     21  *    notice, this list of conditions and the following disclaimer.
     22  * 2. Redistributions in binary form must reproduce the above copyright
     23  *    notice, this list of conditions and the following disclaimer in the
     24  *    documentation and/or other materials provided with the distribution.
     25  *
     26  * Neither the name of the University nor the names of its contributors
     27  * may be used to endorse or promote products derived from this software
     28  * without specific prior written permission.
     29  *
     30  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
     31  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
     32  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     33  * PURPOSE.
     34  */
     35 
     36 /* $Header: /cvs/src/usr.bin/lex/nfa.c,v 1.9 2003/06/04 17:34:44 millert Exp $ */
     37 
     38 #include "flexdef.h"
     39 
     40 
     41 /* declare functions that have forward references */
     42 
     43 int dupmachine PROTO((int));
     44 void mkxtion PROTO((int, int));
     45 
     46 
     47 /* add_accept - add an accepting state to a machine
     48  *
     49  * accepting_number becomes mach's accepting number.
     50  */
     51 
     52 void add_accept( mach, accepting_number )
     53 int mach, accepting_number;
     54 	{
     55 	/* Hang the accepting number off an epsilon state.  if it is associated
     56 	 * with a state that has a non-epsilon out-transition, then the state
     57 	 * will accept BEFORE it makes that transition, i.e., one character
     58 	 * too soon.
     59 	 */
     60 
     61 	if ( transchar[finalst[mach]] == SYM_EPSILON )
     62 		accptnum[finalst[mach]] = accepting_number;
     63 
     64 	else
     65 		{
     66 		int astate = mkstate( SYM_EPSILON );
     67 		accptnum[astate] = accepting_number;
     68 		(void) link_machines( mach, astate );
     69 		}
     70 	}
     71 
     72 
     73 /* copysingl - make a given number of copies of a singleton machine
     74  *
     75  * synopsis
     76  *
     77  *   newsng = copysingl( singl, num );
     78  *
     79  *     newsng - a new singleton composed of num copies of singl
     80  *     singl  - a singleton machine
     81  *     num    - the number of copies of singl to be present in newsng
     82  */
     83 
     84 int copysingl( singl, num )
     85 int singl, num;
     86 	{
     87 	int copy, i;
     88 
     89 	copy = mkstate( SYM_EPSILON );
     90 
     91 	for ( i = 1; i <= num; ++i )
     92 		copy = link_machines( copy, dupmachine( singl ) );
     93 
     94 	return copy;
     95 	}
     96 
     97 
     98 /* dumpnfa - debugging routine to write out an nfa */
     99 
    100 void dumpnfa( state1 )
    101 int state1;
    102 
    103 	{
    104 	int sym, tsp1, tsp2, anum, ns;
    105 
    106 	fprintf( stderr,
    107 	_( "\n\n********** beginning dump of nfa with start state %d\n" ),
    108 		state1 );
    109 
    110 	/* We probably should loop starting at firstst[state1] and going to
    111 	 * lastst[state1], but they're not maintained properly when we "or"
    112 	 * all of the rules together.  So we use our knowledge that the machine
    113 	 * starts at state 1 and ends at lastnfa.
    114 	 */
    115 
    116 	/* for ( ns = firstst[state1]; ns <= lastst[state1]; ++ns ) */
    117 	for ( ns = 1; ns <= lastnfa; ++ns )
    118 		{
    119 		fprintf( stderr, _( "state # %4d\t" ), ns );
    120 
    121 		sym = transchar[ns];
    122 		tsp1 = trans1[ns];
    123 		tsp2 = trans2[ns];
    124 		anum = accptnum[ns];
    125 
    126 		fprintf( stderr, "%3d:  %4d, %4d", sym, tsp1, tsp2 );
    127 
    128 		if ( anum != NIL )
    129 			fprintf( stderr, "  [%d]", anum );
    130 
    131 		fprintf( stderr, "\n" );
    132 		}
    133 
    134 	fprintf( stderr, _( "********** end of dump\n" ) );
    135 	}
    136 
    137 
    138 /* dupmachine - make a duplicate of a given machine
    139  *
    140  * synopsis
    141  *
    142  *   copy = dupmachine( mach );
    143  *
    144  *     copy - holds duplicate of mach
    145  *     mach - machine to be duplicated
    146  *
    147  * note that the copy of mach is NOT an exact duplicate; rather, all the
    148  * transition states values are adjusted so that the copy is self-contained,
    149  * as the original should have been.
    150  *
    151  * also note that the original MUST be contiguous, with its low and high
    152  * states accessible by the arrays firstst and lastst
    153  */
    154 
    155 int dupmachine( mach )
    156 int mach;
    157 	{
    158 	int i, init, state_offset;
    159 	int state = 0;
    160 	int last = lastst[mach];
    161 
    162 	for ( i = firstst[mach]; i <= last; ++i )
    163 		{
    164 		state = mkstate( transchar[i] );
    165 
    166 		if ( trans1[i] != NO_TRANSITION )
    167 			{
    168 			mkxtion( finalst[state], trans1[i] + state - i );
    169 
    170 			if ( transchar[i] == SYM_EPSILON &&
    171 			     trans2[i] != NO_TRANSITION )
    172 				mkxtion( finalst[state],
    173 					trans2[i] + state - i );
    174 			}
    175 
    176 		accptnum[state] = accptnum[i];
    177 		}
    178 
    179 	if ( state == 0 )
    180 		flexfatal( _( "empty machine in dupmachine()" ) );
    181 
    182 	state_offset = state - i + 1;
    183 
    184 	init = mach + state_offset;
    185 	firstst[init] = firstst[mach] + state_offset;
    186 	finalst[init] = finalst[mach] + state_offset;
    187 	lastst[init] = lastst[mach] + state_offset;
    188 
    189 	return init;
    190 	}
    191 
    192 
    193 /* finish_rule - finish up the processing for a rule
    194  *
    195  * An accepting number is added to the given machine.  If variable_trail_rule
    196  * is true then the rule has trailing context and both the head and trail
    197  * are variable size.  Otherwise if headcnt or trailcnt is non-zero then
    198  * the machine recognizes a pattern with trailing context and headcnt is
    199  * the number of characters in the matched part of the pattern, or zero
    200  * if the matched part has variable length.  trailcnt is the number of
    201  * trailing context characters in the pattern, or zero if the trailing
    202  * context has variable length.
    203  */
    204 
    205 void finish_rule( mach, variable_trail_rule, headcnt, trailcnt )
    206 int mach, variable_trail_rule, headcnt, trailcnt;
    207 	{
    208 	char action_text[MAXLINE];
    209 
    210 	add_accept( mach, num_rules );
    211 
    212 	/* We did this in new_rule(), but it often gets the wrong
    213 	 * number because we do it before we start parsing the current rule.
    214 	 */
    215 	rule_linenum[num_rules] = linenum;
    216 
    217 	/* If this is a continued action, then the line-number has already
    218 	 * been updated, giving us the wrong number.
    219 	 */
    220 	if ( continued_action )
    221 		--rule_linenum[num_rules];
    222 
    223 	snprintf( action_text, sizeof action_text, "case %d:\n", num_rules );
    224 	add_action( action_text );
    225 
    226 	if ( variable_trail_rule )
    227 		{
    228 		rule_type[num_rules] = RULE_VARIABLE;
    229 
    230 		if ( performance_report > 0 )
    231 			fprintf( stderr,
    232 			_( "Variable trailing context rule at line %d\n" ),
    233 				rule_linenum[num_rules] );
    234 
    235 		variable_trailing_context_rules = true;
    236 		}
    237 
    238 	else
    239 		{
    240 		rule_type[num_rules] = RULE_NORMAL;
    241 
    242 		if ( headcnt > 0 || trailcnt > 0 )
    243 			{
    244 			/* Do trailing context magic to not match the trailing
    245 			 * characters.
    246 			 */
    247 			char *scanner_cp = "yy_c_buf_p = yy_cp";
    248 			char *scanner_bp = "yy_bp";
    249 
    250 			add_action(
    251 	"*yy_cp = yy_hold_char; /* undo effects of setting up yytext */\n" );
    252 
    253 			if ( headcnt > 0 )
    254 				{
    255 				snprintf( action_text, sizeof action_text,
    256 					"%s = %s + %d;\n",
    257 					scanner_cp, scanner_bp, headcnt );
    258 				add_action( action_text );
    259 				}
    260 
    261 			else
    262 				{
    263 				snprintf( action_text, sizeof action_text,
    264 					"%s -= %d;\n",
    265 					scanner_cp, trailcnt );
    266 				add_action( action_text );
    267 				}
    268 
    269 			add_action(
    270 			"YY_DO_BEFORE_ACTION; /* set up yytext again */\n" );
    271 			}
    272 		}
    273 
    274 	/* Okay, in the action code at this point yytext and yyleng have
    275 	 * their proper final values for this rule, so here's the point
    276 	 * to do any user action.  But don't do it for continued actions,
    277 	 * as that'll result in multiple YY_RULE_SETUP's.
    278 	 */
    279 	if ( ! continued_action )
    280 		add_action( "YY_RULE_SETUP\n" );
    281 
    282 	line_directive_out( (FILE *) 0, 1 );
    283 	}
    284 
    285 
    286 /* link_machines - connect two machines together
    287  *
    288  * synopsis
    289  *
    290  *   new = link_machines( first, last );
    291  *
    292  *     new    - a machine constructed by connecting first to last
    293  *     first  - the machine whose successor is to be last
    294  *     last   - the machine whose predecessor is to be first
    295  *
    296  * note: this routine concatenates the machine first with the machine
    297  *  last to produce a machine new which will pattern-match first first
    298  *  and then last, and will fail if either of the sub-patterns fails.
    299  *  FIRST is set to new by the operation.  last is unmolested.
    300  */
    301 
    302 int link_machines( first, last )
    303 int first, last;
    304 	{
    305 	if ( first == NIL )
    306 		return last;
    307 
    308 	else if ( last == NIL )
    309 		return first;
    310 
    311 	else
    312 		{
    313 		mkxtion( finalst[first], last );
    314 		finalst[first] = finalst[last];
    315 		lastst[first] = MAX( lastst[first], lastst[last] );
    316 		firstst[first] = MIN( firstst[first], firstst[last] );
    317 
    318 		return first;
    319 		}
    320 	}
    321 
    322 
    323 /* mark_beginning_as_normal - mark each "beginning" state in a machine
    324  *                            as being a "normal" (i.e., not trailing context-
    325  *                            associated) states
    326  *
    327  * The "beginning" states are the epsilon closure of the first state
    328  */
    329 
    330 void mark_beginning_as_normal( mach )
    331 int mach;
    332 	{
    333 	switch ( state_type[mach] )
    334 		{
    335 		case STATE_NORMAL:
    336 			/* Oh, we've already visited here. */
    337 			return;
    338 
    339 		case STATE_TRAILING_CONTEXT:
    340 			state_type[mach] = STATE_NORMAL;
    341 
    342 			if ( transchar[mach] == SYM_EPSILON )
    343 				{
    344 				if ( trans1[mach] != NO_TRANSITION )
    345 					mark_beginning_as_normal(
    346 						trans1[mach] );
    347 
    348 				if ( trans2[mach] != NO_TRANSITION )
    349 					mark_beginning_as_normal(
    350 						trans2[mach] );
    351 				}
    352 			break;
    353 
    354 		default:
    355 			flexerror(
    356 			_( "bad state type in mark_beginning_as_normal()" ) );
    357 			break;
    358 		}
    359 	}
    360 
    361 
    362 /* mkbranch - make a machine that branches to two machines
    363  *
    364  * synopsis
    365  *
    366  *   branch = mkbranch( first, second );
    367  *
    368  *     branch - a machine which matches either first's pattern or second's
    369  *     first, second - machines whose patterns are to be or'ed (the | operator)
    370  *
    371  * Note that first and second are NEITHER destroyed by the operation.  Also,
    372  * the resulting machine CANNOT be used with any other "mk" operation except
    373  * more mkbranch's.  Compare with mkor()
    374  */
    375 
    376 int mkbranch( first, second )
    377 int first, second;
    378 	{
    379 	int eps;
    380 
    381 	if ( first == NO_TRANSITION )
    382 		return second;
    383 
    384 	else if ( second == NO_TRANSITION )
    385 		return first;
    386 
    387 	eps = mkstate( SYM_EPSILON );
    388 
    389 	mkxtion( eps, first );
    390 	mkxtion( eps, second );
    391 
    392 	return eps;
    393 	}
    394 
    395 
    396 /* mkclos - convert a machine into a closure
    397  *
    398  * synopsis
    399  *   new = mkclos( state );
    400  *
    401  * new - a new state which matches the closure of "state"
    402  */
    403 
    404 int mkclos( state )
    405 int state;
    406 	{
    407 	return mkopt( mkposcl( state ) );
    408 	}
    409 
    410 
    411 /* mkopt - make a machine optional
    412  *
    413  * synopsis
    414  *
    415  *   new = mkopt( mach );
    416  *
    417  *     new  - a machine which optionally matches whatever mach matched
    418  *     mach - the machine to make optional
    419  *
    420  * notes:
    421  *     1. mach must be the last machine created
    422  *     2. mach is destroyed by the call
    423  */
    424 
    425 int mkopt( mach )
    426 int mach;
    427 	{
    428 	int eps;
    429 
    430 	if ( ! SUPER_FREE_EPSILON(finalst[mach]) )
    431 		{
    432 		eps = mkstate( SYM_EPSILON );
    433 		mach = link_machines( mach, eps );
    434 		}
    435 
    436 	/* Can't skimp on the following if FREE_EPSILON(mach) is true because
    437 	 * some state interior to "mach" might point back to the beginning
    438 	 * for a closure.
    439 	 */
    440 	eps = mkstate( SYM_EPSILON );
    441 	mach = link_machines( eps, mach );
    442 
    443 	mkxtion( mach, finalst[mach] );
    444 
    445 	return mach;
    446 	}
    447 
    448 
    449 /* mkor - make a machine that matches either one of two machines
    450  *
    451  * synopsis
    452  *
    453  *   new = mkor( first, second );
    454  *
    455  *     new - a machine which matches either first's pattern or second's
    456  *     first, second - machines whose patterns are to be or'ed (the | operator)
    457  *
    458  * note that first and second are both destroyed by the operation
    459  * the code is rather convoluted because an attempt is made to minimize
    460  * the number of epsilon states needed
    461  */
    462 
    463 int mkor( first, second )
    464 int first, second;
    465 	{
    466 	int eps, orend;
    467 
    468 	if ( first == NIL )
    469 		return second;
    470 
    471 	else if ( second == NIL )
    472 		return first;
    473 
    474 	else
    475 		{
    476 		/* See comment in mkopt() about why we can't use the first
    477 		 * state of "first" or "second" if they satisfy "FREE_EPSILON".
    478 		 */
    479 		eps = mkstate( SYM_EPSILON );
    480 
    481 		first = link_machines( eps, first );
    482 
    483 		mkxtion( first, second );
    484 
    485 		if ( SUPER_FREE_EPSILON(finalst[first]) &&
    486 		     accptnum[finalst[first]] == NIL )
    487 			{
    488 			orend = finalst[first];
    489 			mkxtion( finalst[second], orend );
    490 			}
    491 
    492 		else if ( SUPER_FREE_EPSILON(finalst[second]) &&
    493 			  accptnum[finalst[second]] == NIL )
    494 			{
    495 			orend = finalst[second];
    496 			mkxtion( finalst[first], orend );
    497 			}
    498 
    499 		else
    500 			{
    501 			eps = mkstate( SYM_EPSILON );
    502 
    503 			first = link_machines( first, eps );
    504 			orend = finalst[first];
    505 
    506 			mkxtion( finalst[second], orend );
    507 			}
    508 		}
    509 
    510 	finalst[first] = orend;
    511 	return first;
    512 	}
    513 
    514 
    515 /* mkposcl - convert a machine into a positive closure
    516  *
    517  * synopsis
    518  *   new = mkposcl( state );
    519  *
    520  *    new - a machine matching the positive closure of "state"
    521  */
    522 
    523 int mkposcl( state )
    524 int state;
    525 	{
    526 	int eps;
    527 
    528 	if ( SUPER_FREE_EPSILON(finalst[state]) )
    529 		{
    530 		mkxtion( finalst[state], state );
    531 		return state;
    532 		}
    533 
    534 	else
    535 		{
    536 		eps = mkstate( SYM_EPSILON );
    537 		mkxtion( eps, state );
    538 		return link_machines( state, eps );
    539 		}
    540 	}
    541 
    542 
    543 /* mkrep - make a replicated machine
    544  *
    545  * synopsis
    546  *   new = mkrep( mach, lb, ub );
    547  *
    548  *    new - a machine that matches whatever "mach" matched from "lb"
    549  *          number of times to "ub" number of times
    550  *
    551  * note
    552  *   if "ub" is INFINITY then "new" matches "lb" or more occurrences of "mach"
    553  */
    554 
    555 int mkrep( mach, lb, ub )
    556 int mach, lb, ub;
    557 	{
    558 	int base_mach, tail, copy, i;
    559 
    560 	base_mach = copysingl( mach, lb - 1 );
    561 
    562 	if ( ub == INFINITY )
    563 		{
    564 		copy = dupmachine( mach );
    565 		mach = link_machines( mach,
    566 		link_machines( base_mach, mkclos( copy ) ) );
    567 		}
    568 
    569 	else
    570 		{
    571 		tail = mkstate( SYM_EPSILON );
    572 
    573 		for ( i = lb; i < ub; ++i )
    574 			{
    575 			copy = dupmachine( mach );
    576 			tail = mkopt( link_machines( copy, tail ) );
    577 			}
    578 
    579 		mach = link_machines( mach, link_machines( base_mach, tail ) );
    580 		}
    581 
    582 	return mach;
    583 	}
    584 
    585 
    586 /* mkstate - create a state with a transition on a given symbol
    587  *
    588  * synopsis
    589  *
    590  *   state = mkstate( sym );
    591  *
    592  *     state - a new state matching sym
    593  *     sym   - the symbol the new state is to have an out-transition on
    594  *
    595  * note that this routine makes new states in ascending order through the
    596  * state array (and increments LASTNFA accordingly).  The routine DUPMACHINE
    597  * relies on machines being made in ascending order and that they are
    598  * CONTIGUOUS.  Change it and you will have to rewrite DUPMACHINE (kludge
    599  * that it admittedly is)
    600  */
    601 
    602 int mkstate( sym )
    603 int sym;
    604 	{
    605 	if ( ++lastnfa >= current_mns )
    606 		{
    607 		if ( (current_mns += MNS_INCREMENT) >= MAXIMUM_MNS )
    608 			lerrif(
    609 		_( "input rules are too complicated (>= %d NFA states)" ),
    610 				current_mns );
    611 
    612 		++num_reallocs;
    613 
    614 		firstst = reallocate_integer_array( firstst, current_mns );
    615 		lastst = reallocate_integer_array( lastst, current_mns );
    616 		finalst = reallocate_integer_array( finalst, current_mns );
    617 		transchar = reallocate_integer_array( transchar, current_mns );
    618 		trans1 = reallocate_integer_array( trans1, current_mns );
    619 		trans2 = reallocate_integer_array( trans2, current_mns );
    620 		accptnum = reallocate_integer_array( accptnum, current_mns );
    621 		assoc_rule =
    622 			reallocate_integer_array( assoc_rule, current_mns );
    623 		state_type =
    624 			reallocate_integer_array( state_type, current_mns );
    625 		}
    626 
    627 	firstst[lastnfa] = lastnfa;
    628 	finalst[lastnfa] = lastnfa;
    629 	lastst[lastnfa] = lastnfa;
    630 	transchar[lastnfa] = sym;
    631 	trans1[lastnfa] = NO_TRANSITION;
    632 	trans2[lastnfa] = NO_TRANSITION;
    633 	accptnum[lastnfa] = NIL;
    634 	assoc_rule[lastnfa] = num_rules;
    635 	state_type[lastnfa] = current_state_type;
    636 
    637 	/* Fix up equivalence classes base on this transition.  Note that any
    638 	 * character which has its own transition gets its own equivalence
    639 	 * class.  Thus only characters which are only in character classes
    640 	 * have a chance at being in the same equivalence class.  E.g. "a|b"
    641 	 * puts 'a' and 'b' into two different equivalence classes.  "[ab]"
    642 	 * puts them in the same equivalence class (barring other differences
    643 	 * elsewhere in the input).
    644 	 */
    645 
    646 	if ( sym < 0 )
    647 		{
    648 		/* We don't have to update the equivalence classes since
    649 		 * that was already done when the ccl was created for the
    650 		 * first time.
    651 		 */
    652 		}
    653 
    654 	else if ( sym == SYM_EPSILON )
    655 		++numeps;
    656 
    657 	else
    658 		{
    659 		check_char( sym );
    660 
    661 		if ( useecs )
    662 			/* Map NUL's to csize. */
    663 			mkechar( sym ? sym : csize, nextecm, ecgroup );
    664 		}
    665 
    666 	return lastnfa;
    667 	}
    668 
    669 
    670 /* mkxtion - make a transition from one state to another
    671  *
    672  * synopsis
    673  *
    674  *   mkxtion( statefrom, stateto );
    675  *
    676  *     statefrom - the state from which the transition is to be made
    677  *     stateto   - the state to which the transition is to be made
    678  */
    679 
    680 void mkxtion( statefrom, stateto )
    681 int statefrom, stateto;
    682 	{
    683 	if ( trans1[statefrom] == NO_TRANSITION )
    684 		trans1[statefrom] = stateto;
    685 
    686 	else if ( (transchar[statefrom] != SYM_EPSILON) ||
    687 		  (trans2[statefrom] != NO_TRANSITION) )
    688 		flexfatal( _( "found too many transitions in mkxtion()" ) );
    689 
    690 	else
    691 		{ /* second out-transition for an epsilon state */
    692 		++eps2;
    693 		trans2[statefrom] = stateto;
    694 		}
    695 	}
    696 
    697 /* new_rule - initialize for a new rule */
    698 
    699 void new_rule()
    700 	{
    701 	if ( ++num_rules >= current_max_rules )
    702 		{
    703 		++num_reallocs;
    704 		current_max_rules += MAX_RULES_INCREMENT;
    705 		rule_type = reallocate_integer_array( rule_type,
    706 							current_max_rules );
    707 		rule_linenum = reallocate_integer_array( rule_linenum,
    708 							current_max_rules );
    709 		rule_useful = reallocate_integer_array( rule_useful,
    710 							current_max_rules );
    711 		}
    712 
    713 	if ( num_rules > MAX_RULE )
    714 		lerrif( _( "too many rules (> %d)!" ), MAX_RULE );
    715 
    716 	rule_linenum[num_rules] = linenum;
    717 	rule_useful[num_rules] = false;
    718 	}