00001 /* 00002 ******************************************************************************* 00003 * 00004 * Copyright (C) 1999-2000, International Business Machines 00005 * Corporation and others. All Rights Reserved. 00006 * 00007 ******************************************************************************* 00008 * file name: utf8.h 00009 * encoding: US-ASCII 00010 * tab size: 8 (not used) 00011 * indentation:4 00012 * 00013 * created on: 1999sep13 00014 * created by: Markus W. Scherer 00015 * 00016 * This file defines macros to deal with UTF-8 code units and code points. 00017 * Signatures and semantics are the same as for the similarly named macros 00018 * in utf16.h. 00019 * utf8.h is included by utf.h after unicode/umachine.h 00020 * and some common definitions. 00021 */ 00022 00023 00024 /* utf.h must be included first. */ 00025 #ifndef __UTF_H__ 00026 # include "unicode/utf.h" 00027 #endif 00028 00029 #ifndef __UTF8_H__ 00030 #define __UTF8_H__ 00031 00032 /* internal definitions ----------------------------------------------------- */ 00033 00034 #ifdef U_UTF8_IMPL 00035 U_CAPI uint8_t U_EXPORT2 00036 utf8_countTrailBytes[256]; 00037 #else 00038 U_CFUNC uint8_t /* U_IMPORT2? */ U_IMPORT 00039 utf8_countTrailBytes[256]; 00040 #endif 00041 00042 /* 00043 * Count the trail bytes for a lead byte - 00044 * this macro should be used so that the assembler code 00045 * that is mentioned in utf_impl.c could be used here. 00046 */ 00047 #define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) 00048 00049 /* use a macro here, too - there may be a simpler way with some machines */ 00050 #define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 00051 00052 U_CAPI UChar32 U_EXPORT2 00053 utf8_nextCharSafeBody(const uint8_t *s, UTextOffset *pi, UTextOffset length, UChar32 c, UBool strict); 00054 00055 U_CAPI UTextOffset U_EXPORT2 00056 utf8_appendCharSafeBody(uint8_t *s, UTextOffset i, UTextOffset length, UChar32 c); 00057 00058 U_CAPI UChar32 U_EXPORT2 00059 utf8_prevCharSafeBody(const uint8_t *s, UTextOffset start, UTextOffset *pi, UChar32 c, UBool strict); 00060 00061 U_CAPI UTextOffset U_EXPORT2 00062 utf8_back1SafeBody(const uint8_t *s, UTextOffset start, UTextOffset i); 00063 00064 /* 00065 * For the semantics of all of these macros, see utf16.h. 00066 * The UTF-8 macros favor sequences more the shorter they are. 00067 * Sometimes, only the single-byte case is covered by a macro, 00068 * while longer sequences are handled by a function call. 00069 */ 00070 00071 /* single-code point definitions -------------------------------------------- */ 00072 00073 /* classes of code unit values */ 00074 #define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) 00075 #define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) 00076 #define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) 00077 00078 /* number of code units per code point */ 00079 #define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) 00080 00081 /* 00082 * ICU does not deal with code points >0x10ffff 00083 * unless necessary for advancing in the byte stream. 00084 * 00085 * These length macros take into account that for values >0x10ffff 00086 * the "safe" append macros would write the error code point 0xffff 00087 * with 3 bytes. 00088 * Code point comparisons need to be in uint32_t because UChar32 00089 * may be a signed type, and negative values must be recognized. 00090 */ 00091 #if 1 00092 # define UTF8_CHAR_LENGTH(c) \ 00093 ((uint32_t)(c)<=0x7f ? 1 : \ 00094 ((uint32_t)(c)<=0x7ff ? 2 : \ 00095 ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ 00096 ) \ 00097 ) 00098 #else 00099 # define UTF8_CHAR_LENGTH(c) \ 00100 ((uint32_t)(c)<=0x7f ? 1 : \ 00101 ((uint32_t)(c)<=0x7ff ? 2 : \ 00102 ((uint32_t)(c)<=0xffff ? 3 : \ 00103 ((uint32_t)(c)<=0x10ffff ? 4 : \ 00104 ((uint32_t)(c)<=0x3ffffff ? 5 : \ 00105 ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \ 00106 ) \ 00107 ) \ 00108 ) \ 00109 ) \ 00110 ) 00111 #endif 00112 00113 #define UTF8_MAX_CHAR_LENGTH 4 00114 00115 /* average number of code units compared to UTF-16 */ 00116 #define UTF8_ARRAY_SIZE(size) ((5*(size))/2) 00117 00118 #define UTF8_GET_CHAR_UNSAFE(s, i, c) { \ 00119 UTextOffset __I=(UTextOffset)(i); \ 00120 UTF8_SET_CHAR_START_UNSAFE(s, __I); \ 00121 UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \ 00122 } 00123 00124 #define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ 00125 UTextOffset __I=(UTextOffset)(i); \ 00126 UTF8_SET_CHAR_START_SAFE(s, start, __I); \ 00127 UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \ 00128 } 00129 00130 /* definitions with forward iteration --------------------------------------- */ 00131 00132 /* 00133 * Read a Unicode scalar value from an array of UTF-8 bytes. 00134 * Only values <=0x10ffff are accepted, and if an error occurs, 00135 * then c will be set such that UTF_IS_ERROR(c). 00136 * The _UNSAFE macro is fast and does not check for errors. 00137 * The _SAFE macro checks for errors and optionally for 00138 * irregular sequences, too, i.e., for sequences that 00139 * are longer than necessary, such as <c0 80> instead of <0>. 00140 * The strict checks also check for surrogates and 00141 * for 0xXXXXfffe and 0xXXXXffff. 00142 */ 00143 #define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \ 00144 (c)=(s)[(i)++]; \ 00145 if((uint8_t)((c)-0xc0)<0x35) { \ 00146 uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ 00147 UTF8_MASK_LEAD_BYTE(c, __count); \ 00148 switch(__count) { \ 00149 /* each following branch falls through to the next one */ \ 00150 case 3: \ 00151 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00152 case 2: \ 00153 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00154 case 1: \ 00155 (c)=((c)<<6)|((s)[(i)++]&0x3f); \ 00156 /* no other branches to optimize switch() */ \ 00157 break; \ 00158 } \ 00159 } \ 00160 } 00161 00162 #define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \ 00163 if((uint32_t)(c)<=0x7f) { \ 00164 (s)[(i)++]=(uint8_t)(c); \ 00165 } else { \ 00166 if((uint32_t)(c)<=0x7ff) { \ 00167 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 00168 } else { \ 00169 if((uint32_t)(c)<=0xffff) { \ 00170 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 00171 } else { \ 00172 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ 00173 (s)[(i)++]=(uint8_t)(((c)>>12)&0x3f|0x80); \ 00174 } \ 00175 (s)[(i)++]=(uint8_t)(((c)>>6)&0x3f|0x80); \ 00176 } \ 00177 (s)[(i)++]=(uint8_t)((c)&0x3f|0x80); \ 00178 } \ 00179 } 00180 00181 #define UTF8_FWD_1_UNSAFE(s, i) { \ 00182 (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ 00183 } 00184 00185 #define UTF8_FWD_N_UNSAFE(s, i, n) { \ 00186 UTextOffset __N=(n); \ 00187 while(__N>0) { \ 00188 UTF8_FWD_1_UNSAFE(s, i); \ 00189 --__N; \ 00190 } \ 00191 } 00192 00193 #define UTF8_SET_CHAR_START_UNSAFE(s, i) { \ 00194 while(UTF8_IS_TRAIL((s)[i])) { --(i); } \ 00195 } 00196 00197 #define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ 00198 (c)=(s)[(i)++]; \ 00199 if(UTF8_IS_LEAD(c)) { \ 00200 (c)=utf8_nextCharSafeBody(s, &(i), (UTextOffset)(length), c, strict); \ 00201 } \ 00202 } 00203 00204 #define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \ 00205 if((uint32_t)(c)<=0x7f) { \ 00206 (s)[(i)++]=(uint8_t)(c); \ 00207 } else { \ 00208 (i)=utf8_appendCharSafeBody(s, (UTextOffset)(i), (UTextOffset)(length), c); \ 00209 } \ 00210 } 00211 00212 #define UTF8_FWD_1_SAFE(s, i, length) { \ 00213 uint8_t __b=(s)[(i)++]; \ 00214 if(UTF8_IS_LEAD(__b)) { \ 00215 uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \ 00216 if((i)+__count>(length)) { \ 00217 __count=(length)-(i); \ 00218 } \ 00219 while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \ 00220 ++(i); \ 00221 --__count; \ 00222 } \ 00223 } \ 00224 } 00225 00226 #define UTF8_FWD_N_SAFE(s, i, length, n) { \ 00227 UTextOffset __N=(n); \ 00228 while(__N>0 && (i)<(length)) { \ 00229 UTF8_FWD_1_SAFE(s, i, length); \ 00230 --__N; \ 00231 } \ 00232 } 00233 00234 #define UTF8_SET_CHAR_START_SAFE(s, start, i) { \ 00235 if(UTF8_IS_TRAIL((s)[(i)])) { \ 00236 (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \ 00237 } \ 00238 } 00239 00240 /* definitions with backward iteration -------------------------------------- */ 00241 00242 #define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \ 00243 (c)=(s)[--(i)]; \ 00244 if(UTF8_IS_TRAIL(c)) { \ 00245 uint8_t __b, __count=1, __shift=6; \ 00246 \ 00247 /* c is a trail byte */ \ 00248 (c)&=0x3f; \ 00249 for(;;) { \ 00250 __b=(s)[--(i)]; \ 00251 if(__b>=0xc0) { \ 00252 UTF8_MASK_LEAD_BYTE(__b, __count); \ 00253 (c)|=(UChar32)__b<<__shift; \ 00254 break; \ 00255 } else { \ 00256 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 00257 ++__count; \ 00258 __shift+=6; \ 00259 } \ 00260 } \ 00261 } \ 00262 } 00263 00264 #define UTF8_BACK_1_UNSAFE(s, i) { \ 00265 while(UTF8_IS_TRAIL((s)[--(i)])) {} \ 00266 } 00267 00268 #define UTF8_BACK_N_UNSAFE(s, i, n) { \ 00269 UTextOffset __N=(n); \ 00270 while(__N>0) { \ 00271 UTF8_BACK_1_UNSAFE(s, i); \ 00272 --__N; \ 00273 } \ 00274 } 00275 00276 #define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \ 00277 UTF8_BACK_1_UNSAFE(s, i); \ 00278 UTF8_FWD_1_UNSAFE(s, i); \ 00279 } 00280 00281 #define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \ 00282 (c)=(s)[--(i)]; \ 00283 if(UTF8_IS_TRAIL((c))) { \ 00284 (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ 00285 } \ 00286 } 00287 00288 #define UTF8_BACK_1_SAFE(s, start, i) { \ 00289 if(UTF8_IS_TRAIL((s)[--(i)])) { \ 00290 (i)=utf8_back1SafeBody(s, start, (UTextOffset)(i)); \ 00291 } \ 00292 } 00293 00294 #define UTF8_BACK_N_SAFE(s, start, i, n) { \ 00295 UTextOffset __N=(n); \ 00296 while(__N>0 && (i)>(start)) { \ 00297 UTF8_BACK_1_SAFE(s, start, i); \ 00298 --__N; \ 00299 } \ 00300 } 00301 00302 #define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \ 00303 if((start)<(i) && (i)<(length)) { \ 00304 UTF8_BACK_1_SAFE(s, start, i); \ 00305 (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ 00306 if((i)>(length)) { \ 00307 (i)=(length); \ 00308 } \ 00309 } \ 00310 } 00311 00312 #endif