hbase

heirloom base
git clone git://git.2f30.org/hbase
Log | Files | Refs | README

rune.c (3963B)


      1 /*
      2  * The authors of this software are Rob Pike and Ken Thompson.
      3  *              Copyright (c) 2002 by Lucent Technologies.
      4  * Permission to use, copy, modify, and distribute this software for any
      5  * purpose without fee is hereby granted, provided that this entire notice
      6  * is included in all copies of any software which is or includes a copy
      7  * or modification of this software and in all copies of the supporting
      8  * documentation for such software.
      9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
     10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
     11  * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
     12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
     13  */
     14 #include <stdarg.h>
     15 #include <string.h>
     16 #include "plan9.h"
     17 #include "utf.h"
     18 
     19 enum
     20 {
     21 	Bit1	= 7,
     22 	Bitx	= 6,
     23 	Bit2	= 5,
     24 	Bit3	= 4,
     25 	Bit4	= 3,
     26 	Bit5	= 2,
     27 
     28 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
     29 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
     30 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
     31 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
     32 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
     33 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
     34 
     35 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
     36 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
     37 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
     38 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0011 1111 1111 1111 1111 1111 */
     39 
     40 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
     41 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
     42 
     43 	Bad	= Runeerror
     44 };
     45 
     46 int
     47 chartorune(Rune *rune, char *str)
     48 {
     49 	int c, c1, c2, c3;
     50 	long l;
     51 
     52 	/*
     53 	 * one character sequence
     54 	 *	00000-0007F => T1
     55 	 */
     56 	c = *(uchar*)str;
     57 	if(c < Tx) {
     58 		*rune = c;
     59 		return 1;
     60 	}
     61 
     62 	/*
     63 	 * two character sequence
     64 	 *	0080-07FF => T2 Tx
     65 	 */
     66 	c1 = *(uchar*)(str+1) ^ Tx;
     67 	if(c1 & Testx)
     68 		goto bad;
     69 	if(c < T3) {
     70 		if(c < T2)
     71 			goto bad;
     72 		l = ((c << Bitx) | c1) & Rune2;
     73 		if(l <= Rune1)
     74 			goto bad;
     75 		*rune = l;
     76 		return 2;
     77 	}
     78 
     79 	/*
     80 	 * three character sequence
     81 	 *	0800-FFFF => T3 Tx Tx
     82 	 */
     83 	c2 = *(uchar*)(str+2) ^ Tx;
     84 	if(c2 & Testx)
     85 		goto bad;
     86 	if(c < T4) {
     87 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
     88 		if(l <= Rune2)
     89 			goto bad;
     90 		*rune = l;
     91 		return 3;
     92 	}
     93 
     94 	/*
     95 	 * four character sequence
     96 	 *	10000-10FFFF => T4 Tx Tx Tx
     97 	 */
     98 	if(UTFmax >= 4) {
     99 		c3 = *(uchar*)(str+3) ^ Tx;
    100 		if(c3 & Testx)
    101 			goto bad;
    102 		if(c < T5) {
    103 			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
    104 			if(l <= Rune3)
    105 				goto bad;
    106 			if(l > Runemax)
    107 				goto bad;
    108 			*rune = l;
    109 			return 4;
    110 		}
    111 	}
    112 
    113 	/*
    114 	 * bad decoding
    115 	 */
    116 bad:
    117 	*rune = Bad;
    118 	return 1;
    119 }
    120 
    121 int
    122 runetochar(char *str, Rune *rune)
    123 {
    124 	long c;
    125 
    126 	/*
    127 	 * one character sequence
    128 	 *	00000-0007F => 00-7F
    129 	 */
    130 	c = *rune;
    131 	if(c <= Rune1) {
    132 		str[0] = c;
    133 		return 1;
    134 	}
    135 
    136 	/*
    137 	 * two character sequence
    138 	 *	00080-007FF => T2 Tx
    139 	 */
    140 	if(c <= Rune2) {
    141 		str[0] = T2 | (c >> 1*Bitx);
    142 		str[1] = Tx | (c & Maskx);
    143 		return 2;
    144 	}
    145 
    146 	/*
    147 	 * three character sequence
    148 	 *	00800-0FFFF => T3 Tx Tx
    149 	 */
    150 	if(c > Runemax)
    151 		c = Runeerror;
    152 	if(c <= Rune3) {
    153 		str[0] = T3 |  (c >> 2*Bitx);
    154 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
    155 		str[2] = Tx |  (c & Maskx);
    156 		return 3;
    157 	}
    158 	
    159 	/*
    160 	 * four character sequence
    161 	 *	010000-1FFFFF => T4 Tx Tx Tx
    162 	 */
    163 	str[0] = T4 |  (c >> 3*Bitx);
    164 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
    165 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
    166 	str[3] = Tx |  (c & Maskx);
    167 	return 4;
    168 }
    169 
    170 int
    171 runelen(long c)
    172 {
    173 	Rune rune;
    174 	char str[10];
    175 
    176 	rune = c;
    177 	return runetochar(str, &rune);
    178 }
    179 
    180 int
    181 runenlen(Rune *r, int nrune)
    182 {
    183 	int nb, c;
    184 
    185 	nb = 0;
    186 	while(nrune--) {
    187 		c = *r++;
    188 		if(c <= Rune1)
    189 			nb++;
    190 		else
    191 		if(c <= Rune2)
    192 			nb += 2;
    193 		else
    194 		if(c <= Rune3 || c > Runemax)
    195 			nb += 3;
    196 		else
    197 			nb += 4;
    198 	}
    199 	return nb;
    200 }
    201 
    202 int
    203 fullrune(char *str, int n)
    204 {
    205 	int c;
    206 
    207 	if(n <= 0)
    208 		return 0;
    209 	c = *(uchar*)str;
    210 	if(c < Tx)
    211 		return 1;
    212 	if(c < T3)
    213 		return n >= 2;
    214 	if(UTFmax == 3 || c < T4)
    215 		return n >= 3;
    216 	return n >= 4;
    217 }