sub3.c (10944B)
1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. 24 * All rights reserved. 25 * Use is subject to license terms. 26 */ 27 28 /* from OpenSolaris "sub3.c 1.8 05/06/08 SMI" */ 29 30 /* 31 * Portions Copyright (c) 2005 Gunnar Ritter, Freiburg i. Br., Germany 32 * 33 * Sccsid @(#)sub3.c 1.4 (gritter) 11/26/05 34 */ 35 36 /* 37 * sub3.c ... ALE enhancement. 38 * Since a typical Asian language has a huge character set, it is not 39 * ideal to index an array by a character code itself, which requires 40 * as large as 2**16 entries per array. 41 * To get arround this problem, we identify a set of characters that 42 * causes the same transition on all states and call it character group. 43 * Every character in a same character group has a unique number called 44 * character group id. A function yycgid(c) maps the character c (in process 45 * code) to the id. This mapping is determined by analyzing all regular 46 * expressions in the lex program. 47 * 48 */ 49 #include <stdlib.h> 50 #ifdef __sun 51 #include <widec.h> 52 #endif 53 #include "search.h" 54 #include "ldefs.c" 55 #include <ctype.h> 56 57 /* 58 * "lchar" stands for linearized character. It is a variant of 59 * process code. AT&T's 16-bit process code has a drawback in which 60 * for three three process code C, D and E where C <= D <= E, 61 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 62 * In other words, four codesets alternates as the magnitude 63 * of character increases. 64 * The lchar representation holds this property: 65 * If three lchar C', D' and E' have the relationship C' < D' < E' and 66 * codeset(C') == codeset(E') then D' is guaranteed to belong to 67 * the same codeset as C' and E'. 68 * lchar is implemented as 32 bit entities and the function linearize() 69 * that maps a wchar_t to lchar is defined below. There is no 70 * reverse function for it though. 71 * The 32-bit process code by AT&T, used only for Taiwanese version at the 72 * time of wrting, has no such problem and we use it as it is. 73 */ 74 75 lchar yycgidtbl[MAXNCG] = { 76 0, /* For ease of computation of the id. */ 77 '\n', /* Newline is always special because '.' exclude it. */ 78 0x000000ff, /* The upper limit of codeset 0. */ 79 0x20ffffff, /* The upper limit of codeset 2. */ 80 0x40ffffff /* The upper limit of codeset 3. */ 81 /* 0x60ffffff The upper limit of codeset 1. */ 82 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 83 }; 84 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 85 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 86 /* returns plus 1. */ 87 88 static void setsymbol(int i); 89 90 /* 91 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 92 * 93 * wc: axxxxxxbyyyyyyy 94 * 95 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 96 * 97 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 98 * which is flagged with LONG_WCHAR_T macro. 99 * NOTE: 100 * The implementation is highly depends on the process code representation. 101 * This function should be modified when 32-bit process code is used. 102 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 103 * You can actually omit these and squeeze the xxxxxx part one bit right. 104 * We don't do that here just in sake of speed. 105 */ 106 lchar 107 linearize(wchar_t wc) 108 { 109 #ifdef LONG_WCHAR_T 110 return ((lchar)wc); /* Don't do anything. */ 111 #else 112 113 lchar prefix; 114 switch (wc&0x8080) { 115 case 0x0000: prefix = 0x00000000; break; 116 case 0x0080: prefix = 0x20000000; break; 117 case 0x8000: prefix = 0x40000000; break; 118 case 0x8080: prefix = 0x60000000; break; 119 } 120 return (prefix|wc); 121 #endif 122 } 123 124 /* compare liniear characters pointed to by pc1 and pc2 */ 125 int 126 cmplc(const void *arg1, const void *arg2) 127 { 128 lchar *pc1 = (lchar *)arg1; 129 lchar *pc2 = (lchar *)arg2; 130 131 if (*pc1 > *pc2) 132 return (1); 133 else if (*pc1 == *pc2) 134 return (0); 135 else 136 return (-1); 137 } 138 139 void 140 remch(wchar_t c) 141 { 142 lchar lc = linearize(c); 143 144 /* 145 * User-friendliness consideration: 146 * Make sure no EUC chars are used in reg. exp. 147 */ 148 if (!handleeuc) { 149 if (!isascii(c)) 150 if (iswprint(c)) 151 warning( 152 "Non-ASCII character '%lc' in pattern; use -w or -e lex option.", c); 153 else warning( 154 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 155 /* In any case, we don't need to construct ncgidtbl[]. */ 156 return; 157 } 158 159 xlsearch(&lc, yycgidtbl, 160 (unsigned *)&ncgidtbl, sizeof (lchar), cmplc); 161 } 162 163 void 164 sortcgidtbl(void) 165 { 166 if (!handleeuc) 167 return; 168 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 169 } 170 171 /* 172 * int yycgid(wchar_t c) 173 * Takes c and returns its character group id, determind by the 174 * following algorithm. The program also uses the binary search 175 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 176 * 177 * This function computes the "character group id" based on 178 * a table yycgidtbl of which each lchar entry is pre-sorted 179 * in ascending sequence The number of valid entries is given 180 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 181 * const int YYNCGIDTBL; 182 * lchar yycgidtbl[YYNCGIDTBL]; 183 * 184 * yycgidtbl[0] is guaranteed to have zero. 185 * 186 * For given c, yycgid(c) returns: 187 * 2*i iff yycgidtbl[i] == lc 188 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 189 * YYNCGIDTBL*2-1 190 * iff yycgidtbl[YYNCGIDTBL-1] < lc 191 * where lc=linearize(c). 192 * 193 * Some interesting properties.: 194 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 195 * 2. yycgid(c) == 0 iff c == 0. 196 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 197 * yycgid(c) <= yycgid(d). 198 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 199 * linearize(c) < linearize(d). 200 */ 201 #define YYNCGIDTBL ncgidtbl 202 203 int 204 yycgid(wchar_t c) 205 { 206 int first = 0; 207 int last = YYNCGIDTBL - 1; 208 lchar lc; 209 210 /* 211 * In ASCII compat. mode, each character forms a "group" and the 212 * group-id is itself... 213 */ 214 if (!handleeuc) 215 return (c); 216 217 lc = linearize(c); 218 219 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 220 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 221 return (YYNCGIDTBL*2 - 1); 222 223 while (last >= 0) { 224 int i = (first+last)/2; 225 if (lc == yycgidtbl[i]) 226 return (2*i); /* lc exactly matches an element. */ 227 else if (yycgidtbl[i] < lc) { 228 if (lc < yycgidtbl[i+1]) 229 return (2*i+1); /* lc is in between two elements. */ 230 else 231 first = i + 1; 232 } else 233 last = i - 1; 234 } 235 error( 236 "system error in yycgid():binary search failed for c=0x%04x\n", c); 237 return (0); 238 } 239 240 /* 241 * repbycgid --- replaces each character in the parsing tree by its 242 * character group id. This, however, should be called even in 243 * the ASCII compat. mode to process DOT nodes and to call cclinter() 244 * for the DOT and CCL nodes. 245 */ 246 void 247 repbycgid(void) 248 { 249 int i, c; 250 251 for (i = 0; i < tptr; ++i) { 252 c = name[i]; 253 if (!ISOPERATOR(c)) { 254 /* If not an operator, it must be a char. */ 255 name[i] = yycgid((wchar_t)c); /* So replace it. */ 256 #ifdef DEBUG 257 if (debug) { 258 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 259 } 260 #endif 261 } else if (c == RSTR) { 262 c = right[i]; 263 right[i] = yycgid((wchar_t)c); 264 #ifdef DEBUG 265 if (debug) { 266 printf( 267 "name[%d].right:'%c'->%d;\n", i, c, right[i]); 268 } 269 #endif 270 } else if ((c == RCCL) || (c == RNCCL)) { 271 CHR cc, *s; 272 int j; 273 CHR ccltoken[CCLSIZE]; 274 CHR *ccp; 275 int m; 276 /* 277 * This node represetns a character class RE [ccccc] 278 * s points to the string of characters that forms 279 * the class and/or a special prefix notation 280 * <RANGE>XY which corresponds to the RE X-Y, 281 * characters in the range of X and Y. Here, 282 * X <= Y is guranteed. 283 * We transform these characters into a string 284 * of sorted character group ids. 285 * 286 * There is another mechanism of packing tables 287 * that is inherited from the ASCII lex. Call of 288 * cclinter() is required for this packing. 289 * This used to be done as yylex() reads the lex 290 * rules but we have to do this here because the 291 * transition table is made to work on the char-group 292 * ids and the mapping cannot be determined until 293 * the entire file is read. 294 */ 295 #ifdef DEBUG 296 if (debug) { 297 printf("name[%d]:R[N]CCL of \"", i); 298 strpt((CHR *)left[i]); 299 printf(" -> {"); 300 } 301 #endif 302 /* Prepare symbol[] for cclinter(). */ 303 for (j = 0; j < ncg; ++j) 304 symbol[j] = FALSE; 305 306 s = (CHR *) left[i]; 307 while (cc = *s++) { 308 if (cc == RANGE) { 309 int low, high, i; 310 /* 311 * Special form: <RANGE>XY 312 * This means the range X-Y. 313 * We mark all symbols[] 314 * elements for yycgid(X) thru 315 * yycgid(Y), inclusively. 316 */ 317 low = yycgid(*s++); 318 high = yycgid(*s++); 319 for (i = low; i <= high; ++i) 320 setsymbol(i); 321 } else { 322 setsymbol(yycgid(cc)); 323 } 324 } 325 326 /* Now make a transformed string of cgids. */ 327 s = ccptr; 328 m = 0; 329 for (j = 0; j < ncg; ++j) 330 if (symbol[j]) { 331 ccltoken[m++] = (CHR)j; 332 #ifdef DEBUG 333 if (debug) printf("%d, ", j); 334 #endif 335 } 336 337 #ifdef DEBUG 338 if (debug) printf("}\n"); 339 #endif 340 ccltoken[m] = 0; 341 ccp = ccl; 342 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 343 ccp++; 344 if (ccp < ccptr) { /* character class found in ccl */ 345 left[i] = (intptr_t)ccp; 346 } else { /* not in ccl, add it */ 347 left[i] = (intptr_t)ccptr; 348 scopy(ccltoken, ccptr); 349 ccptr += slength(ccltoken) + 1; 350 if (ccptr > ccl + CCLSIZE) 351 error("Too many large character classes"); 352 } 353 cclinter(c == RCCL); 354 } else if (c == DOT) { 355 if (psave == 0) { /* First DOT node. */ 356 int j, nlid; 357 /* 358 * Make symbol[k]=TRUE for all k 359 * except k == yycgid('\n'). 360 */ 361 nlid = yycgid('\n'); 362 psave = ccptr; 363 for (j = 1; j < ncg; ++j) { 364 if (j == nlid) { 365 symbol[j] = FALSE; 366 } else { 367 symbol[j] = TRUE; 368 *ccptr++ = (CHR) j; 369 } 370 } 371 *ccptr++ = 0; 372 if (ccptr > ccl + CCLSIZE) 373 error("Too many large character classes"); 374 } 375 /* Mimic mn1(RCCL,psave)... */ 376 name[i] = RCCL; 377 left[i] = (intptr_t)psave; 378 cclinter(1); 379 } 380 } 381 #ifdef DEBUG 382 if (debug) { 383 printf("treedump after repbycgid().\n"); 384 treedump(); 385 } 386 #endif 387 } 388 389 static void 390 setsymbol(int i) 391 { 392 if (i > sizeof (symbol)) 393 error("setsymbol: (SYSERR) %d out of range", i); 394 symbol[i] = TRUE; 395 }