Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

utf16.h

Go to the documentation of this file.
00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2001, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf16.h
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep09
00014 *   created by: Markus W. Scherer
00015 */
00016 
00032 #ifndef __UTF16_H__
00033 #define __UTF16_H__
00034 
00035 /* single-code point definitions -------------------------------------------- */
00036 
00037 /* handle surrogate pairs: classify a value by everything except its low 10 bits */
00038 #define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) /* lead (first) surrogate: 0xd800..0xdbff */
00039 #define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) /* trail (second) surrogate: 0xdc00..0xdfff */
00040 
00041 #define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) /* tests only bit 0x400; in this file it is applied to values that already passed UTF_IS_SURROGATE() */
00042 
00043 /* get the UTF-32 value directly from the surrogate pseudo-characters: constant subtracted by UTF16_GET_PAIR_VALUE() */
00044 #define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
00045 
00046 #define UTF16_GET_PAIR_VALUE(first, second) /* (first<<10)+second, rebased by UTF_SURROGATE_OFFSET; arguments are not validated */ \
00047     (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
00048 
00049 /* classes of code unit values */
00050 #define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar)) /* logical negation of UTF_IS_SURROGATE(); parenthesized so the expansion is always a single operand */
00051 #define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) /* alias of UTF_IS_FIRST_SURROGATE() */
00052 #define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) /* alias of UTF_IS_SECOND_SURROGATE() */
00053 
00054 /* number of code units per code point; c is compared as uint32_t and is not range-checked */
00055 #define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) /* true iff c is above 0xffff */
00056 #define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) /* 1 for c<=0xffff, otherwise 2 */
00057 #define UTF16_MAX_CHAR_LENGTH 2 /* largest value UTF16_CHAR_LENGTH() can yield */
00058 
00059 /* average number of code units compared to UTF-16: the size is returned unchanged */
00060 #define UTF16_ARRAY_SIZE(size) (size)
00061 
00062 /*
00063  * Get a single code point from an offset that points to any
00064  * of the code units that belong to that code point.
00065  * Assume 0<=i<length. The offset i is read but never modified.
00066  * No bounds or pairing checks: (s)[(i)+1] or (s)[(i)-1] is read unconditionally.
00067  * This could be used for iteration together with
00068  * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
00069  * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
00070  * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
00071  */
00072 #define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
00073     (c)=(s)[i]; \
00074     if(UTF_IS_SURROGATE(c)) { \
00075         if(UTF_IS_SURROGATE_FIRST(c)) { \
00076             (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); /* on a lead: combine with the following unit */ \
00077         } else { \
00078             (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); /* on a trail: combine with the preceding unit */ \
00079         } \
00080     } \
00081 }
00082 
00083 #define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { /* checked form of UTF16_GET_CHAR_UNSAFE: neighbours are read only inside [start, length) */ \
00084     (c)=(s)[i]; /* an unpaired surrogate stays in c as-is unless strict is nonzero */ \
00085     if(UTF_IS_SURROGATE(c)) { \
00086         uint16_t __c2; /* the neighbouring code unit, assigned inside the conditions below */ \
00087         if(UTF_IS_SURROGATE_FIRST(c)) { \
00088             if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
00089                 (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
00090                 /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00091             } else if(strict) {\
00092                 /* unmatched first surrogate */ \
00093                 (c)=UTF_ERROR_VALUE; \
00094             } \
00095         } else { \
00096             if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
00097                 (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
00098                 /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00099             } else if(strict) {\
00100                 /* unmatched second surrogate */ \
00101                 (c)=UTF_ERROR_VALUE; \
00102             } \
00103         } \
00104     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00105     } \
00106 }
00107 
00108 /* definitions with forward iteration --------------------------------------- */
00109 
00110 /*
00111  * all the macros that go forward assume that
00112  * the initial offset is 0<=i<length;
00113  * they update the offset
00114  */
00115 
00116 /* fast versions, no error-checking */
00117 
00118 /*
00119  * Get a single code point from an offset that points to the first
00120  * of the code units that belong to that code point.
00121  * Assume 0<=i<length. Advances i past every code unit that was read.
00122  */
00123 #define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
00124     (c)=(s)[(i)++]; \
00125     if(UTF_IS_FIRST_SURROGATE(c)) { \
00126         (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); /* the next unit is consumed as the trail without being checked */ \
00127     } \
00128 }
00129 
00130 #define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { /* write c at s[i] as one or two code units and advance i; no capacity or range checks */ \
00131     if((uint32_t)(c)<=0xffff) { \
00132         (s)[(i)++]=(uint16_t)(c); \
00133     } else { \
00134         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); /* lead unit; 0xd7c0 is 0xd800-(0x10000>>10) */ \
00135         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); /* trail unit: low 10 bits of c */ \
00136     } \
00137 }
00138 
00139 #define UTF16_FWD_1_UNSAFE(s, i) { /* advance i by one code point; no bounds or pairing checks */ \
00140     if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
00141         ++(i); /* after a lead, skip the next unit too without inspecting it */ \
00142     } \
00143 }
00144 
00145 #define UTF16_FWD_N_UNSAFE(s, i, n) { /* apply UTF16_FWD_1_UNSAFE n times; never compares i against a length */ \
00146     UTextOffset __N=(n); /* local copy, so n is evaluated exactly once */ \
00147     while(__N>0) { \
00148         UTF16_FWD_1_UNSAFE(s, i); \
00149         --__N; \
00150     } \
00151 }
00152 
00153 /*
00154  * Set a random-access offset and adjust it so that
00155  * it points to the beginning of a Unicode character.
00156  * The offset that is passed in points to
00157  * any code unit of a code point
00158  * and will point to the first code unit after
00159  * the macro invocation.
00160  * Never increments the offset.
00161  */
00162 #define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
00163     if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
00164         --(i); /* on a trail: step back one unit; the unit before it is not inspected */ \
00165     } \
00166 }
00167 
00168 /* safe versions with error-checking and optional regularity-checking */
00169 
00170 #define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { /* checked form of UTF16_NEXT_CHAR_UNSAFE: a trail is consumed only if i<length and it really is a trail */ \
00171     (c)=(s)[(i)++]; /* always consumes one unit; unpaired surrogates stay in c unless strict is nonzero */ \
00172     if(UTF_IS_FIRST_SURROGATE(c)) { \
00173         uint16_t __c2; /* candidate trail unit, assigned inside the condition below */ \
00174         if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
00175             ++(i); \
00176             (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
00177             /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00178         } else if(strict) {\
00179             /* unmatched first surrogate */ \
00180             (c)=UTF_ERROR_VALUE; \
00181         } \
00182     } else if(strict && UTF_IS_SECOND_SURROGATE(c)) { \
00183         /* unmatched second surrogate */ \
00184         (c)=UTF_ERROR_VALUE; \
00185     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00186     } \
00187 }
00188 
00189 #define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { /* write c at s[i] and advance i; UTF_ERROR_VALUE is written when c>0x10ffff or a pair does not fit */ \
00190     if((uint32_t)(c)<=0xffff) { \
00191         (s)[(i)++]=(uint16_t)(c); /* NOTE(review): this write does not compare i with length */ \
00192     } else if((uint32_t)(c)<=0x10ffff) { \
00193         if((i)+1<(length)) { \
00194             (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); /* lead unit; 0xd7c0 is 0xd800-(0x10000>>10) */ \
00195             (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); /* trail unit: low 10 bits of c */ \
00196         } else /* not enough space */ { \
00197             (s)[(i)++]=UTF_ERROR_VALUE; \
00198         } \
00199     } else /* c>0x10ffff, write error value */ { \
00200         (s)[(i)++]=UTF_ERROR_VALUE; \
00201     } \
00202 }
00203 
00204 #define UTF16_FWD_1_SAFE(s, i, length) { /* advance i by one code point; the second unit is skipped only if i<length and it is a trail */ \
00205     if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
00206         ++(i); \
00207     } \
00208 }
00209 
00210 #define UTF16_FWD_N_SAFE(s, i, length, n) { /* advance i by up to n code points, stopping early once i reaches length */ \
00211     UTextOffset __N=(n); /* local copy, so n is evaluated exactly once */ \
00212     while(__N>0 && (i)<(length)) { \
00213         UTF16_FWD_1_SAFE(s, i, length); \
00214         --__N; \
00215     } \
00216 }
00217 
00218 #define UTF16_SET_CHAR_START_SAFE(s, start, i) { /* step i back one unit only when s[i] is a trail, i>start, and s[i-1] is a lead */ \
00219     if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00220         --(i); \
00221     } \
00222 }
00223 
00224 /* definitions with backward iteration -------------------------------------- */
00225 
00226 /*
00227  * all the macros that go backward assume that
00228  * the valid buffer range starts at offset 0
00229  * and that the initial offset is 0<i<=length;
00230  * they update the offset
00231  */
00232 
00233 /* fast versions, no error-checking */
00234 
00235 /*
00236  * Get a single code point from an offset that points behind the last
00237  * of the code units that belong to that code point.
00238  * Assume 0<i<=length. Moves i back past every code unit that was read.
00239  */
00240 #define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
00241     (c)=(s)[--(i)]; \
00242     if(UTF_IS_SECOND_SURROGATE(c)) { \
00243         (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); /* the preceding unit is consumed as the lead without being checked */ \
00244     } \
00245 }
00246 
00247 #define UTF16_BACK_1_UNSAFE(s, i) { /* move i back by one code point; no bounds or pairing checks */ \
00248     if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
00249         --(i); /* after a trail, step over the preceding unit too without inspecting it */ \
00250     } \
00251 }
00252 
00253 #define UTF16_BACK_N_UNSAFE(s, i, n) { /* apply UTF16_BACK_1_UNSAFE n times; never compares i against a start offset */ \
00254     UTextOffset __N=(n); /* local copy, so n is evaluated exactly once */ \
00255     while(__N>0) { \
00256         UTF16_BACK_1_UNSAFE(s, i); \
00257         --__N; \
00258     } \
00259 }
00260 
00261 /*
00262  * Set a random-access offset and adjust it so that
00263  * it points after the end of a Unicode character.
00264  * The offset that is passed in points behind
00265  * any code unit of a code point
00266  * and will point behind the last code unit after
00267  * the macro invocation.
00268  * Never decrements the offset.
00269  */
00270 #define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
00271     if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00272         ++(i); /* just behind a lead: move past the next unit; s[i] itself is not inspected */ \
00273     } \
00274 }
00275 
00276 /* safe versions with error-checking and optional regularity-checking */
00277 
00278 #define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { /* checked form of UTF16_PREV_CHAR_UNSAFE: a lead is consumed only if i>start and it really is a lead */ \
00279     (c)=(s)[--(i)]; /* always consumes one unit; unpaired surrogates stay in c unless strict is nonzero */ \
00280     if(UTF_IS_SECOND_SURROGATE(c)) { \
00281         uint16_t __c2; /* candidate lead unit, assigned inside the condition below */ \
00282         if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
00283             --(i); \
00284             (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
00285             /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00286         } else if(strict) {\
00287             /* unmatched second surrogate */ \
00288             (c)=UTF_ERROR_VALUE; \
00289         } \
00290     } else if(strict && UTF_IS_FIRST_SURROGATE(c)) { \
00291         /* unmatched first surrogate */ \
00292         (c)=UTF_ERROR_VALUE; \
00293     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00294     } \
00295 }
00296 
00297 #define UTF16_BACK_1_SAFE(s, start, i) { /* move i back by one code point; the second step happens only if i>start and s[i-1] is a lead */ \
00298     if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00299         --(i); \
00300     } \
00301 }
00302 
00303 #define UTF16_BACK_N_SAFE(s, start, i, n) { /* move i back by up to n code points, stopping early once i reaches start */ \
00304     UTextOffset __N=(n); /* local copy, so n is evaluated exactly once */ \
00305     while(__N>0 && (i)>(start)) { \
00306         UTF16_BACK_1_SAFE(s, start, i); \
00307         --__N; \
00308     } \
00309 }
00310 
00311 #define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { /* advance i by one unit only when start<i<length and i sits between a lead (s[i-1]) and a trail (s[i]) */ \
00312     if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
00313         ++(i); \
00314     } \
00315 }
00316 
00317 #endif

Generated at Thu Mar 22 16:12:42 2001 for ICU 1.8 by doxygen 1.2.3 written by Dimitri van Heesch, © 1997-2000