sub3.c - hbase - heirloom base

sub3.c (10944B)
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License, Version 1.0 only
      6  * (the "License").  You may not use this file except in compliance
      7  * with the License.
      8  *
      9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
     10  * or http://www.opensolaris.org/os/licensing.
     11  * See the License for the specific language governing permissions
     12  * and limitations under the License.
     13  *
     14  * When distributing Covered Code, include this CDDL HEADER in each
     15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     16  * If applicable, add the following below this CDDL HEADER, with the
     17  * fields enclosed by brackets "[]" replaced with your own identifying
     18  * information: Portions Copyright [yyyy] [name of copyright owner]
     19  *
     20  * CDDL HEADER END
     21  */
     22 /*
     23  * Copyright 2005 Sun Microsystems, Inc.
     24  * All rights reserved.
     25  * Use is subject to license terms.
     26  */
     27 
     28 /*	from OpenSolaris "sub3.c	1.8	05/06/08 SMI"	*/
     29 
     30 /*
     31  * Portions Copyright (c) 2005 Gunnar Ritter, Freiburg i. Br., Germany
     32  *
     33  * Sccsid @(#)sub3.c	1.4 (gritter) 11/26/05
     34  */
     35 
     36 /*
     37  * sub3.c ... ALE enhancement.
     38  * Since a typical Asian language has a huge character set, it is not
     39  * ideal to index an array by a character code itself, which requires
     40  * as large as 2**16 entries per array.
     41  * To get arround this problem, we identify a set of characters that
     42  * causes the same transition on all states and call it character group.
     43  * Every character in a same character group has a unique number called
     44  * character group id.  A function yycgid(c) maps the character c (in process
     45  * code) to the id.  This mapping is determined by analyzing all regular
     46  * expressions in the lex program.
     47  *
     48  */
     49 #include	<stdlib.h>
     50 #ifdef	__sun
     51 #include	<widec.h>
     52 #endif
     53 #include	"search.h"
     54 #include	"ldefs.c"
     55 #include	<ctype.h>
     56 
     57 /*
     58  * "lchar" stands for linearized character.  It is a variant of
     59  * process code.  AT&T's 16-bit process code has a drawback in which
     60  * for three three process code C, D and E where C <= D <= E,
     61  * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
     62  * In other words, four codesets alternates as the magnitude
     63  * of character increases.
     64  * The lchar representation holds this property:
     65  *   If three lchar C', D' and E' have the relationship C' < D' <  E' and
     66  *   codeset(C') == codeset(E') then D' is guaranteed to belong to
     67  *   the same codeset as C' and E'.
     68  * lchar is implemented as 32 bit entities and the function linearize()
     69  * that maps a wchar_t to lchar is defined below.  There is no
     70  * reverse function for it though.
     71  * The 32-bit process code by AT&T, used only for Taiwanese version at the
     72  * time of wrting, has no such problem and we use it as it is.
     73  */
     74 
     75 lchar	yycgidtbl[MAXNCG] = {
     76 	0, 		/* For ease of computation of the id. */
     77 	'\n', 		/* Newline is always special because '.' exclude it. */
     78 	0x000000ff, 	/* The upper limit of codeset 0. */
     79 	0x20ffffff,	/* The upper limit of codeset 2. */
     80 	0x40ffffff	/* The upper limit of codeset 3. */
     81 /*	0x60ffffff	   The upper limit of codeset 1. */
     82 	/* Above assumes the number of significant bits of wchar_t is <= 24. */
     83 };
     84 int	ncgidtbl = 5; /* # elements in yycgidtbl. */
     85 int	ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
     86 		/* returns plus 1. */
     87 
     88 static void setsymbol(int i);
     89 
     90 /*
     91  * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
     92  *
     93  * 	wc: axxxxxxbyyyyyyy
     94  *
     95  * returns: 0ab0000000000000axxxxxxxbyyyyyyy
     96  *
     97  * linearize() doesn't do any if compiled with 32-bit wchar_t, use of
     98  * which is flagged with LONG_WCHAR_T macro.
     99  * NOTE:
    100  * The implementation is highly depends on the process code representation.
    101  * This function should be modified when 32-bit process code is used.
    102  * There is no need to keep 'a' and 'b' bits in the lower half of lchar.
    103  * You can actually omit these and squeeze the xxxxxx part one bit right.
    104  * We don't do that here just in sake of speed.
    105  */
    106 lchar
    107 linearize(wchar_t wc)
    108 {
    109 #ifdef LONG_WCHAR_T
    110 	return ((lchar)wc); /* Don't do anything. */
    111 #else
    112 
    113 	lchar	prefix;
    114 	switch (wc&0x8080) {
    115 	case 0x0000: prefix = 0x00000000; break;
    116 	case 0x0080: prefix = 0x20000000; break;
    117 	case 0x8000: prefix = 0x40000000; break;
    118 	case 0x8080: prefix = 0x60000000; break;
    119 	}
    120 	return (prefix|wc);
    121 #endif
    122 }
    123 
    124 /* compare liniear characters pointed to by pc1 and pc2 */
    125 int
    126 cmplc(const void *arg1, const void *arg2)
    127 {
    128 	lchar *pc1 = (lchar *)arg1;
    129 	lchar *pc2 = (lchar *)arg2;
    130 
    131 	if (*pc1 > *pc2)
    132 		return (1);
    133 	else if (*pc1 == *pc2)
    134 		return (0);
    135 	else
    136 		return (-1);
    137 }
    138 
    139 void
    140 remch(wchar_t c)
    141 {
    142 	lchar	lc = linearize(c);
    143 
    144 	/*
    145 	 * User-friendliness consideration:
    146 	 * Make sure no EUC chars are used in reg. exp.
    147 	 */
    148 	if (!handleeuc) {
    149 		if (!isascii(c))
    150 		    if (iswprint(c))
    151 			warning(
    152 "Non-ASCII character '%lc' in pattern; use -w or -e lex option.", c);
    153 		    else warning(
    154 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
    155 		/* In any case, we don't need to construct ncgidtbl[]. */
    156 		return;
    157 	}
    158 
    159 	xlsearch(&lc, yycgidtbl,
    160 		(unsigned *)&ncgidtbl, sizeof (lchar), cmplc);
    161 }
    162 
    163 void
    164 sortcgidtbl(void)
    165 {
    166 	if (!handleeuc)
    167 		return;
    168 	qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
    169 }
    170 
    171 /*
    172  * int yycgid(wchar_t c)
    173  *	Takes c and returns its character group id, determind by the
    174  *	following algorithm.  The program also uses the binary search
    175  *	algorithm, generalized from Knuth (6.2.1) Algorithm B.
    176  *
    177  *	This function computes the "character group id" based on
    178  *	a table yycgidtbl of which each lchar entry is pre-sorted
    179  *	in ascending sequence  The number of valid entries is given
    180  *	by YYNCGIDTBL.  There is no duplicate entries in yycgidtbl.
    181  *		const int YYNCGIDTBL;
    182  *		lchar	yycgidtbl[YYNCGIDTBL];
    183  *
    184  *	yycgidtbl[0] is guaranteed to have zero.
    185  *
    186  *	For given c, yycgid(c) returns:
    187  *		2*i	iff yycgidtbl[i] == lc
    188  *		2*i+1	iff yycgidtbl[i] < lc < yycgidtbl[i+1]
    189  *		YYNCGIDTBL*2-1
    190  *			iff yycgidtbl[YYNCGIDTBL-1] < lc
    191  *	where lc=linearize(c).
    192  *
    193  *	Some interesting properties.:
    194  *	1.  For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
    195  *	2.  yycgid(c) == 0  iff  c == 0.
    196  *	3.  For any wchar_t c and d, if linearize(c) < linearize(d) then
    197  *	    yycgid(c) <= yycgid(d).
    198  *	4.  For any wchar_t c and d, if yycgid(c) < yycgid(d) then
    199  *	    linearize(c) < linearize(d).
    200  */
    201 #define	YYNCGIDTBL ncgidtbl
    202 
    203 int
    204 yycgid(wchar_t c)
    205 {
    206 	int first = 0;
    207 	int last = YYNCGIDTBL - 1;
    208 	lchar lc;
    209 
    210 	/*
    211 	 * In ASCII compat. mode, each character forms a "group" and the
    212 	 * group-id is itself...
    213 	 */
    214 	if (!handleeuc)
    215 		return (c);
    216 
    217 	lc = linearize(c);
    218 
    219 	/* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
    220 	if (yycgidtbl[YYNCGIDTBL - 1] < lc)
    221 		return (YYNCGIDTBL*2 - 1);
    222 
    223 	while (last >= 0) {
    224 		int i = (first+last)/2;
    225 		if (lc == yycgidtbl[i])
    226 		    return (2*i);	/* lc  exactly matches an element. */
    227 		else if (yycgidtbl[i] < lc) {
    228 			if (lc < yycgidtbl[i+1])
    229 			    return (2*i+1); /* lc is in between two elements. */
    230 			else
    231 			    first = i + 1;
    232 		} else
    233 		    last = i - 1;
    234 	}
    235 	error(
    236 	"system error in yycgid():binary search failed for c=0x%04x\n", c);
    237 	return (0);
    238 }
    239 
    240 /*
    241  * repbycgid --- replaces each character in the parsing tree by its
    242  * character group id.   This, however, should be called even in
    243  * the ASCII compat. mode to process DOT nodes and to call cclinter()
    244  * for the DOT and CCL nodes.
    245  */
    246 void
    247 repbycgid(void)
    248 {
    249 	int i, c;
    250 
    251 	for (i = 0; i < tptr; ++i) {
    252 		c = name[i];
    253 		if (!ISOPERATOR(c)) {
    254 		/* If not an operator, it must be a char.  */
    255 		    name[i] = yycgid((wchar_t)c); /* So replace it. */
    256 #ifdef DEBUG
    257 		    if (debug) {
    258 			    printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
    259 		    }
    260 #endif
    261 		} else if (c == RSTR) {
    262 			c = right[i];
    263 			right[i] = yycgid((wchar_t)c);
    264 #ifdef DEBUG
    265 		    if (debug) {
    266 			    printf(
    267 			    "name[%d].right:'%c'->%d;\n", i, c, right[i]);
    268 		    }
    269 #endif
    270 		} else if ((c == RCCL) || (c == RNCCL)) {
    271 			CHR cc, *s;
    272 			int j;
    273 			CHR ccltoken[CCLSIZE];
    274 			CHR *ccp;
    275 			int m;
    276 			/*
    277 			 * This node represetns a character class RE [ccccc]
    278 			 * s points to the string of characters that forms
    279 			 * the class and/or a special prefix notation
    280 			 * <RANGE>XY which corresponds to the RE X-Y,
    281 			 * characters in the range of X and Y.  Here,
    282 			 * X <= Y is guranteed.
    283 			 * We transform these characters into a string
    284 			 * of sorted character group ids.
    285 			 *
    286 			 * There is another mechanism of packing tables
    287 			 * that is inherited from the ASCII lex.  Call of
    288 			 * cclinter() is required for this packing.
    289 			 * This used to be done as yylex() reads the lex
    290 			 * rules but we have to do this here because the
    291 			 * transition table is made to work on the char-group
    292 			 * ids and the mapping cannot be determined until
    293 			 * the entire file is read.
    294 			 */
    295 #ifdef DEBUG
    296 			if (debug) {
    297 				printf("name[%d]:R[N]CCL of \"", i);
    298 				strpt((CHR *)left[i]);
    299 				printf(" -> {");
    300 			}
    301 #endif
    302 			/* Prepare symbol[] for cclinter(). */
    303 			for (j = 0; j < ncg; ++j)
    304 				symbol[j] = FALSE;
    305 
    306 			s = (CHR *) left[i];
    307 			while (cc = *s++) {
    308 				if (cc == RANGE) {
    309 					int	low, high, i;
    310 					/*
    311 					 * Special form: <RANGE>XY
    312 					 * This means the range X-Y.
    313 					 * We mark all symbols[]
    314 					 * elements for yycgid(X) thru
    315 					 * yycgid(Y), inclusively.
    316 					 */
    317 					low = yycgid(*s++);
    318 					high = yycgid(*s++);
    319 					for (i = low; i <= high; ++i)
    320 					    setsymbol(i);
    321 				} else {
    322 					setsymbol(yycgid(cc));
    323 				}
    324 			}
    325 
    326 			/* Now make a transformed string of cgids. */
    327 			s = ccptr;
    328 			m = 0;
    329 			for (j = 0; j < ncg; ++j)
    330 			    if (symbol[j]) {
    331 				    ccltoken[m++] = (CHR)j;
    332 #ifdef DEBUG
    333 				    if (debug) printf("%d, ", j);
    334 #endif
    335 			    }
    336 
    337 #ifdef DEBUG
    338 			if (debug) printf("}\n");
    339 #endif
    340 			ccltoken[m] = 0;
    341 			ccp = ccl;
    342 			while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
    343 				ccp++;
    344 			if (ccp < ccptr) {  /* character class found in ccl */
    345 			    left[i] = (intptr_t)ccp;
    346 			} else { /* not in ccl, add it */
    347 			    left[i] = (intptr_t)ccptr;
    348 			    scopy(ccltoken, ccptr);
    349 			    ccptr += slength(ccltoken) + 1;
    350 			    if (ccptr > ccl + CCLSIZE)
    351 				error("Too many large character classes");
    352 			}
    353 			cclinter(c == RCCL);
    354 		} else if (c == DOT) {
    355 			if (psave == 0) { /* First DOT node. */
    356 				int j, nlid;
    357 				/*
    358 				 * Make symbol[k]=TRUE for all k
    359 				 *  except k == yycgid('\n').
    360 				 */
    361 				nlid = yycgid('\n');
    362 				psave = ccptr;
    363 				for (j = 1; j < ncg; ++j) {
    364 					if (j == nlid) {
    365 						symbol[j] = FALSE;
    366 					} else {
    367 						symbol[j] = TRUE;
    368 						*ccptr++ = (CHR) j;
    369 					}
    370 				}
    371 				*ccptr++ = 0;
    372 				if (ccptr > ccl + CCLSIZE)
    373 				    error("Too many large character classes");
    374 			}
    375 			/* Mimic mn1(RCCL,psave)... */
    376 			name[i] = RCCL;
    377 			left[i] = (intptr_t)psave;
    378 			cclinter(1);
    379 		}
    380 	}
    381 #ifdef DEBUG
    382 	if (debug) {
    383 		printf("treedump after repbycgid().\n");
    384 		treedump();
    385 	}
    386 #endif
    387 }
    388 
    389 static void
    390 setsymbol(int i)
    391 {
    392 	if (i > sizeof (symbol))
    393 		error("setsymbol: (SYSERR) %d out of range", i);
    394 	symbol[i] = TRUE;
    395 }
	hbase heirloom base
	git clone git://git.2f30.org/hbase
	Log \| Files \| Refs \| README