Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

utf16.h

Go to the documentation of this file.
00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf16.h
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep09
00014 *   created by: Markus W. Scherer
00015 */
00016 
00029 #ifndef __UTF16_H__
00030 #define __UTF16_H__
00031 
00032 /* single-code point definitions -------------------------------------------- */
00033 
/*
 * Surrogate tests for UTF-16 code units.
 * UTF_IS_FIRST_SURROGATE() is true for values 0xd800..0xdbff,
 * UTF_IS_SECOND_SURROGATE() is true for values 0xdc00..0xdfff.
 * The mask 0xfffffc00 only discards the low 10 bits, so a value with
 * any of the bits 16..31 set never matches.
 */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)

/*
 * Looks only at bit 0x400 and is true when that bit is clear.
 * For a value that is already known to be a surrogate this tells
 * a first surrogate (bit clear) from a second surrogate (bit set);
 * for any other value the result carries no meaning.
 */
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
00039 
/*
 * Get the UTF-32 value directly from the surrogate pseudo-characters.
 *
 * UTF_SURROGATE_OFFSET is the constant (0x35fdc00) that has to be
 * subtracted from (first<<10)+second to arrive at the code point.
 */
#define UTF_SURROGATE_OFFSET ((0xd800<<10)+0xdc00-0x10000)

/*
 * Combine a first and a second surrogate into one code point.
 * Both operands are converted to int32_t before the arithmetic:
 * the type of a shift result is the promoted type of its left operand,
 * so a suffix on the shift count would not widen the computation.
 * No validation is done; each argument is evaluated exactly once.
 */
#define UTF16_GET_PAIR_VALUE(first, second) \
    (((int32_t)(first)<<10)+(int32_t)(second)-UTF_SURROGATE_OFFSET)
00045 
/*
 * Classes of code unit values.
 * UTF16_IS_SINGLE() is the negation of UTF_IS_SURROGATE(); the whole
 * replacement is parenthesized so that it stays one operand wherever
 * the macro is used.
 * UTF16_IS_LEAD() and UTF16_IS_TRAIL() are aliases for
 * UTF_IS_FIRST_SURROGATE() and UTF_IS_SECOND_SURROGATE().
 */
#define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar))
#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
00050 
/*
 * Number of code units per code point.
 * The code point is compared as uint32_t, so a negative value
 * compares greater than 0xffff and is counted as two code units.
 */
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)

/* largest value that UTF16_CHAR_LENGTH() can yield */
#define UTF16_MAX_CHAR_LENGTH 2
00055 
/*
 * Average number of code units compared to UTF-16:
 * the size is passed through unchanged.
 */
#define UTF16_ARRAY_SIZE(size) (size)
00058 
/*
 * Get a single code point from an offset that points to any
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * s is indexed with i, i is not modified, c receives the result.
 * No checking is done: for a surrogate at s[i] the neighboring unit
 * (s[i+1] after a first surrogate, s[i-1] before a second one)
 * is read without a bounds or pairing test.
 * The arguments are evaluated more than once.
 *
 * This could be used for iteration together with
 * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
 * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
 * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
 */
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)]; \
    if(UTF_IS_SURROGATE(c)) { \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            /* i is on the first surrogate, the second one follows */ \
            (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
        } else { \
            /* i is on the second surrogate, the first one precedes */ \
            (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
        } \
    } \
}
00079 
/*
 * Get a single code point from an offset that points to any
 * of the code units that belong to that code point, with checking.
 * Assume start<=i<length.
 *
 * s[i] is read into c. If it is a first surrogate and s[i+1] is inside
 * the buffer and a second surrogate, or if it is a second surrogate and
 * s[i-1] is not before start and a first surrogate, then c is set to the
 * combined code point. Otherwise an unmatched surrogate is replaced with
 * UTF_ERROR_VALUE if strict is nonzero and left in c as it is if not.
 * i is not modified. The arguments are evaluated more than once.
 */
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
    (c)=(s)[(i)]; \
    if(UTF_IS_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(utf16_c2_=(s)[(i)+1])) { \
                (c)=UTF16_GET_PAIR_VALUE((c), utf16_c2_); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
            } else if(strict) {\
                /* unmatched first surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } else { \
            if((i)>(start) && UTF_IS_FIRST_SURROGATE(utf16_c2_=(s)[(i)-1])) { \
                (c)=UTF16_GET_PAIR_VALUE(utf16_c2_, (c)); \
                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
            } else if(strict) {\
                /* unmatched second surrogate */ \
                (c)=UTF_ERROR_VALUE; \
            } \
        } \
    } \
    /* not a surrogate, strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
}
00104 
00105 /* definitions with forward iteration --------------------------------------- */
00106 
00107 /*
00108  * all the macros that go forward assume that
00109  * the initial offset is 0<=i<length;
00110  * they update the offset
00111  */
00112 
00113 /* fast versions, no error-checking */
00114 
/*
 * Get a single code point from an offset that points to the first
 * of the code units that belong to that code point.
 * Assume 0<=i<length.
 *
 * i is advanced behind the code unit(s) that were read.
 * After a first surrogate the next code unit is consumed
 * without checking that it exists or that it is a second surrogate.
 */
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
    } \
}
00126 
/*
 * Write the code point c to s starting at offset i and advance i
 * behind what was written: one code unit for c<=0xffff,
 * otherwise a surrogate pair.
 * Neither the available space nor the range of c is checked.
 * c is evaluated more than once.
 */
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        /* 0xd7c0 is 0xd800-(0x10000>>10) */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
00135 
/*
 * Advance i by one code point: by one code unit, or by two
 * if the code unit at the original offset is a first surrogate.
 * The unit after a first surrogate is skipped without looking at it.
 */
#define UTF16_FWD_1_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
        ++(i); \
    } \
}
00141 
/*
 * Advance i by n code points by applying UTF16_FWD_1_UNSAFE() n times.
 * n is evaluated once; nothing happens for n<=0.
 * There is no check against the end of the buffer.
 */
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
    UTextOffset utf16_n_=(n); \
    while(utf16_n_>0) { \
        UTF16_FWD_1_UNSAFE(s, i); \
        --utf16_n_; \
    } \
}
00149 
/*
 * Set a random-access offset and adjust it so that
 * it points to the beginning of a Unicode character.
 * The offset that is passed in points to
 * any code unit of a code point
 * and will point to the first code unit after
 * the macro invocation.
 * Never increments the offset.
 *
 * Only s[i] is examined: i is decremented whenever s[i] is a
 * second surrogate, without looking at the unit before it.
 */
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[(i)])) { \
        --(i); \
    } \
}
00164 
00165 /* safe versions with error-checking and optional regularity-checking */
00166 
/*
 * Get a single code point from an offset that points to the first
 * of the code units that belong to that code point, with checking.
 * Assume 0<=i<length.
 *
 * i is advanced behind the code unit(s) that were consumed.
 * A first surrogate is combined with the following code unit only if
 * that unit is inside the buffer and is a second surrogate.
 * An unmatched surrogate is replaced with UTF_ERROR_VALUE if strict
 * is nonzero and is left in c as it is if not.
 * strict is parenthesized where it is combined with && so that
 * an argument such as (a || b) is taken as one condition.
 */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
    (c)=(s)[(i)++]; \
    if(UTF_IS_FIRST_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if((i)<(length) && UTF_IS_SECOND_SURROGATE(utf16_c2_=(s)[(i)])) { \
            ++(i); \
            (c)=UTF16_GET_PAIR_VALUE((c), utf16_c2_); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
        } else if(strict) {\
            /* unmatched first surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && UTF_IS_SECOND_SURROGATE(c)) { \
        /* unmatched second surrogate */ \
        (c)=UTF_ERROR_VALUE; \
    /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
    } \
}
00185 
/*
 * Write the code point c to s starting at offset i and advance i
 * behind what was written.
 * - c<=0xffff: one code unit is written; this branch does not compare
 *   i with length, the caller must make sure that i<length.
 * - c<=0x10ffff: a surrogate pair is written if i+1<length,
 *   otherwise the single code unit UTF_ERROR_VALUE.
 * - larger c: the single code unit UTF_ERROR_VALUE is written.
 * c is evaluated more than once.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
}
00200 
/*
 * Advance i by one code point, with checking:
 * i always moves by one code unit, and by a second one only if the
 * first unit is a first surrogate, the new offset is still below
 * length, and the unit there is a second surrogate.
 */
#define UTF16_FWD_1_SAFE(s, i, length) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[(i)])) { \
        ++(i); \
    } \
}
00206 
/*
 * Advance i by up to n code points by applying UTF16_FWD_1_SAFE()
 * repeatedly; stops early once i reaches length.
 * n is evaluated once; nothing happens for n<=0.
 */
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
    UTextOffset utf16_n_=(n); \
    while(utf16_n_>0 && (i)<(length)) { \
        UTF16_FWD_1_SAFE(s, i, length); \
        --utf16_n_; \
    } \
}
00214 
/*
 * Adjust a random-access offset so that it points to the beginning
 * of a code point, with checking: i is decremented only if s[i] is a
 * second surrogate, i is greater than start, and s[i-1] is a
 * first surrogate. Never increments the offset.
 */
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}
00220 
00221 /* definitions with backward iteration -------------------------------------- */
00222 
00223 /*
00224  * all the macros that go backward assume that
00225  * the valid buffer range starts at offset 0
00226  * and that the initial offset is 0<i<=length;
00227  * they update the offset
00228  */
00229 
00230 /* fast versions, no error-checking */
00231 
/*
 * Get a single code point from an offset that points behind the last
 * of the code units that belong to that code point.
 * Assume 0<i<=length.
 *
 * i is moved back to the first code unit that was read.
 * Before a second surrogate the preceding code unit is consumed
 * without checking that it exists or that it is a first surrogate.
 */
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
    } \
}
00243 
/*
 * Move i back by one code point: by one code unit, or by two
 * if the code unit just before the original offset is a second
 * surrogate. The unit before a second surrogate is skipped
 * without looking at it.
 */
#define UTF16_BACK_1_UNSAFE(s, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
        --(i); \
    } \
}
00249 
/*
 * Move i back by n code points by applying UTF16_BACK_1_UNSAFE()
 * n times. n is evaluated once; nothing happens for n<=0.
 * There is no check against the start of the buffer.
 */
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
    UTextOffset utf16_n_=(n); \
    while(utf16_n_>0) { \
        UTF16_BACK_1_UNSAFE(s, i); \
        --utf16_n_; \
    } \
}
00257 
/*
 * Set a random-access offset and adjust it so that
 * it points after the end of a Unicode character.
 * The offset that is passed in points behind
 * any code unit of a code point
 * and will point behind the last code unit after
 * the macro invocation.
 * Never decrements the offset.
 *
 * Only s[i-1] is examined: i is incremented whenever that unit is a
 * first surrogate, without looking at s[i].
 */
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
    if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        ++(i); \
    } \
}
00272 
00273 /* safe versions with error-checking and optional regularity-checking */
00274 
/*
 * Get a single code point from an offset that points behind the last
 * of the code units that belong to that code point, with checking.
 * Assume start<i<=length.
 *
 * i is moved back to the first code unit that was consumed.
 * A second surrogate is combined with the preceding code unit only if
 * that unit is not before start and is a first surrogate.
 * An unmatched surrogate is replaced with UTF_ERROR_VALUE if strict
 * is nonzero and is left in c as it is if not.
 * strict is parenthesized where it is combined with && so that
 * an argument such as (a || b) is taken as one condition.
 */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
    (c)=(s)[--(i)]; \
    if(UTF_IS_SECOND_SURROGATE(c)) { \
        uint16_t utf16_c2_; \
        if((i)>(start) && UTF_IS_FIRST_SURROGATE(utf16_c2_=(s)[(i)-1])) { \
            --(i); \
            (c)=UTF16_GET_PAIR_VALUE(utf16_c2_, (c)); \
            /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
        } else if(strict) {\
            /* unmatched second surrogate */ \
            (c)=UTF_ERROR_VALUE; \
        } \
    } else if((strict) && UTF_IS_FIRST_SURROGATE(c)) { \
        /* unmatched first surrogate */ \
        (c)=UTF_ERROR_VALUE; \
    /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
    } \
}
00293 
/*
 * Move i back by one code point, with checking:
 * i always moves back by one code unit, and by a second one only if
 * the unit it lands on is a second surrogate, the new offset is still
 * greater than start, and the unit before it is a first surrogate.
 */
#define UTF16_BACK_1_SAFE(s, start, i) { \
    if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
        --(i); \
    } \
}
00299 
/*
 * Move i back by up to n code points by applying UTF16_BACK_1_SAFE()
 * repeatedly; stops early once i reaches start.
 * n is evaluated once; nothing happens for n<=0.
 */
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
    UTextOffset utf16_n_=(n); \
    while(utf16_n_>0 && (i)>(start)) { \
        UTF16_BACK_1_SAFE(s, start, i); \
        --utf16_n_; \
    } \
}
00307 
/*
 * Adjust a random-access offset so that it points behind the end
 * of a code point, with checking: i is incremented only if
 * start<i<length, s[i-1] is a first surrogate and s[i] is a
 * second surrogate. The range test comes first, so neither s[i-1]
 * nor s[i] is read when i is at start or at length.
 * Never decrements the offset.
 */
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
    if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[(i)])) { \
        ++(i); \
    } \
}
00313 
00314 #endif

Generated at Fri Dec 15 12:12:39 2000 for ICU 1.7 by doxygen 1.2.3, written by Dimitri van Heesch, © 1997-2000