/*
*******************************************************************************
*
*   Copyright (C) 1999-2000, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utf16.h
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 1999sep09
*   created by: Markus W. Scherer
*
*   This file defines macros to deal with UTF-16 code units and code points.
*   "Safe" macros check for length overruns and illegal sequences, and
*   also for irregular sequences when the strict option is set.
*   "Unsafe" macros are designed for maximum speed.
*   utf16.h is included by utf.h after unicode/umachine.h
*   and some common definitions.
*
*   Notes for users of these macros:
*   - UTF_IS_SURROGATE(), UTF_ERROR_VALUE and UTextOffset are used below but
*     are not defined in this file; presumably they are among the "common
*     definitions" that utf.h provides before including this header
*     (NOTE(review): confirm against utf.h).
*   - The statement macros expand to a plain { ... } block and evaluate their
*     arguments more than once. Pass only side-effect-free expressions, and
*     do not use "MACRO(...);" as the unbraced body of an if that has an
*     else: the trailing semicolon becomes a separate empty statement.
*   - The temporaries __c2 and __N are declared inside those blocks, so
*     arguments must not refer to variables with these names.
*/

#ifndef __UTF16_H__
#define __UTF16_H__

/* single-code point definitions -------------------------------------------- */

/* handle surrogate pairs */

/* Is this code unit a first (lead) surrogate, 0xd800..0xdbff? */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)

/* Is this code unit a second (trail) surrogate, 0xdc00..0xdfff? */
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)

/*
 * For a value that is already known to be a surrogate:
 * bit 0x400 is clear for a first surrogate and set for a second one.
 */
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)

/*
 * get the UTF-32 value directly from the surrogate pseudo-characters
 *
 * UTF_SURROGATE_OFFSET is the constant that must be subtracted from
 * (first<<10)+second to get the code point: 0xd800/0xdc00 map to 0x10000.
 */
#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)

#define UTF16_GET_PAIR_VALUE(first, second) \
    (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)

/* classes of code unit values */
#define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar))
#define UTF16_IS_LEAD(uchar) (UTF_IS_FIRST_SURROGATE(uchar))
#define UTF16_IS_TRAIL(uchar) (UTF_IS_SECOND_SURROGATE(uchar))

/* number of code units per code point */
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
#define UTF16_MAX_CHAR_LENGTH 2

/* average number of code units compared to UTF-16 */
#define UTF16_ARRAY_SIZE(size) (size)

/*
 * Get a single code point from an offset that points to any
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * This could be used for iteration together with
 * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
 * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
 * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
 *
 * The unsafe version reads s[i+1] or s[i-1] without any bounds check
 * whenever s[i] is a surrogate. The offset i is not modified.
 */
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
        } else { \
            (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
        } \
    } \
}

/*
 * Safe version of UTF16_GET_CHAR_UNSAFE(): the neighboring code unit is
 * only read inside [start, length). For an unmatched surrogate, c is set to
 * UTF_ERROR_VALUE if strict is true; otherwise c keeps the value of the
 * single surrogate code unit. The offset i is not modified.
 */
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[i]; \
    if(UTF_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
                (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
            } else if(strict) {\
                /* unmatched first surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } else { \
            if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
                (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
            } else if(strict) {\
                /* unmatched second surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } \
    /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
    } \
}

/* definitions with forward iteration --------------------------------------- */

/*
 * all the macros that go forward assume that
 * the initial offset is 0<=i<length;
 * they update the offset
 */

/* fast versions, no error-checking */

/*
 * Get a single code point from an offset that points to the first
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 * The offset is advanced past the code point (by 1 or 2 code units).
 */
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
    } \
}

/*
 * Write the code point c at s[i] and advance i by the number of code units
 * written (1 for c<=0xffff, otherwise 2). There is no check of the buffer
 * capacity or of the range of c.
 *
 * 0xd7c0 is 0xd800-(0x10000>>10): adding it to c>>10 yields the first
 * surrogate. The casts cover the whole expression so that the stored value
 * is explicitly narrowed to 16 bits.
 */
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}

/* Advance i by one code point (1 or 2 code units), without checking. */
#define UTF16_FWD_1_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
        ++(i); \
    } \
}

/* Advance i by n code points, without checking. n is evaluated once. */
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
    UTextOffset __N=(n); \
    while(__N>0) { \
        UTF16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/*
 * Set a random-access offset and adjust it so that
 * it points to the beginning of a Unicode character.
 * The offset that is passed in points to
 * any code unit of a code point
 * and will point to the first code unit after
 * the macro invocation.
 * Never increments the offset.
 */
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
        --(i); \
    } \
}

/* safe versions with error-checking and optional regularity-checking */

/*
 * Like UTF16_NEXT_CHAR_UNSAFE(), but the second surrogate is only read
 * if i<length after the first one. For an unmatched surrogate, i advances
 * by one code unit; c is set to UTF_ERROR_VALUE if strict is true and
 * otherwise keeps the value of the single surrogate code unit.
 */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
        } else if(strict) {\
            /* unmatched first surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if(strict && UTF_IS_SECOND_SURROGATE(c)) { \
        /* unmatched second surrogate */ \
        (c)=UTF_ERROR_VALUE; \
    /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
    } \
}

/*
 * Like UTF16_APPEND_CHAR_UNSAFE(), but a surrogate pair is only written
 * if both code units fit (i+1<length), and only for c<=0x10ffff.
 * Otherwise a single UTF_ERROR_VALUE is written instead.
 * The caller must ensure i<length: the first code unit is always written
 * without a capacity check.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
}

/*
 * Advance i by one code point. An unmatched first surrogate, or one at the
 * end of the buffer, counts as a single code unit.
 */
#define UTF16_FWD_1_SAFE(s, i, length) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}

/* Advance i by up to n code points, stopping early when i reaches length. */
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
    UTextOffset __N=(n); \
    while(__N>0 && (i)<(length)) { \
        UTF16_FWD_1_SAFE(s, i, length); \
        --__N; \
    } \
}

/*
 * Like UTF16_SET_CHAR_START_UNSAFE(), but i is only decremented if i>start
 * and s[i-1] really is a first surrogate.
 */
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}

/* definitions with backward iteration -------------------------------------- */

/*
 * all the macros that go backward assume that
 * the valid buffer range starts at offset 0
 * and that the initial offset is 0<i<=length;
 * they update the offset
 */

/* fast versions, no error-checking */

/*
 * Get a single code point from an offset that points behind the last
 * of the code units that belong to that code point.
 * Assume 0<i<=length.
 * The offset is moved back to the first code unit of the code point.
 */
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
    } \
}

/* Move i back by one code point (1 or 2 code units), without checking. */
#define UTF16_BACK_1_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
        --(i); \
    } \
}

/* Move i back by n code points, without checking. n is evaluated once. */
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
    UTextOffset __N=(n); \
    while(__N>0) { \
        UTF16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
}

/*
 * Set a random-access offset and adjust it so that
 * it points after the end of a Unicode character.
 * The offset that is passed in points behind
 * any code unit of a code point
 * and will point behind the last code unit after
 * the macro invocation.
 * Never decrements the offset.
 */
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        ++(i); \
    } \
}

/* safe versions with error-checking and optional regularity-checking */

/*
 * Like UTF16_PREV_CHAR_UNSAFE(), but the first surrogate is only read
 * if i>start after stepping back over the second one. For an unmatched
 * surrogate, i moves back by one code unit; c is set to UTF_ERROR_VALUE if
 * strict is true and otherwise keeps the value of the single surrogate
 * code unit.
 */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t __c2; \
        if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
        } else if(strict) {\
            /* unmatched second surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if(strict && UTF_IS_FIRST_SURROGATE(c)) { \
        /* unmatched first surrogate */ \
        (c)=UTF_ERROR_VALUE; \
    /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
    } \
}

/*
 * Move i back by one code point. An unmatched second surrogate, or one at
 * the start of the buffer, counts as a single code unit.
 */
#define UTF16_BACK_1_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}

/* Move i back by up to n code points, stopping early when i reaches start. */
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
    UTextOffset __N=(n); \
    while(__N>0 && (i)>(start)) { \
        UTF16_BACK_1_SAFE(s, start, i); \
        --__N; \
    } \
}

/*
 * Like UTF16_SET_CHAR_LIMIT_UNSAFE(), but i is only incremented if it lies
 * strictly between start and length and splits a well-formed surrogate pair.
 */
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
        ++(i); \
    } \
}

#endif