Main Page   Class Hierarchy   Compound List   File List   Header Files   Sources   Compound Members   File Members  

utf16.h

00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf16.h
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep09
00014 *   created by: Markus W. Scherer
00015 *
00016 *   This file defines macros to deal with UTF-16 code units and code points.
00017 *   "Safe" macros check for length overruns and illegal sequences, and
00018 *   also for irregular sequences when the strict option is set.
00019 *   "Unsafe" macros are designed for maximum speed.
00020 *   utf16.h is included by utf.h after unicode/umachine.h
00021 *   and some common definitions.
00022 */
00023 
00024 #ifndef __UTF16_H__
00025 #define __UTF16_H__
00026 
00027 /* single-code point definitions -------------------------------------------- */
00028 
00029 /* handle surrogate pairs */
/*
 * Test whether a code unit is the first (0xd800..0xdbff) or the second
 * (0xdc00..0xdfff) half of a surrogate pair.
 * The low 10 bits are masked off and the rest is compared with the base value;
 * because the mask covers bits 10..31, a 32-bit value above 0xffff never matches.
 * Each macro evaluates its argument exactly once.
 */
00030 #define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
00031 #define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
00032 
/*
 * For a value that is already known to be a surrogate, tell whether it is
 * the first one of a pair: bit 0x400 is clear in 0xd800..0xdbff and set in
 * 0xdc00..0xdfff. The macro only looks at that one bit and does not check
 * that c is a surrogate at all.
 */
00033 #define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
00034 
00035 /* get the UTF-32 value directly from the surrogate pseudo-characters */
/*
 * Constant subtracted by UTF16_GET_PAIR_VALUE(): it removes the 0xd800 base
 * (shifted left by 10) and the 0xdc00 base from (first<<10)+second and, at the
 * same time, adds the 0x10000 start of the supplementary code point range.
 */
00036 #define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
00037 
/*
 * Combine a first and a second surrogate into the code point they encode,
 * e.g. (0xd800, 0xdc00) yields 0x10000.
 * Neither argument is checked for being a surrogate; each is evaluated once.
 */
00038 #define UTF16_GET_PAIR_VALUE(first, second) \
00039     (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
00040 
00041 /* classes of code unit values */
/*
 * True if the code unit is not a surrogate according to UTF_IS_SURROGATE()
 * (defined outside this file), i.e. it stands for a code point by itself.
 * The expansion is fully parenthesized, like the other expression macros here,
 * so that it stays a single operand wherever it is used.
 */
#define UTF16_IS_SINGLE(uchar) (!UTF_IS_SURROGATE(uchar))
/*
 * UTF-16 names for the surrogate tests: a lead unit is a first surrogate,
 * a trail unit is a second surrogate. Both are plain aliases.
 */
00043 #define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
00044 #define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
00045 
00046 /* number of code units per code point */
/*
 * UTF16_NEED_MULTIPLE_UCHAR(c): true if c, cast to uint32_t, is above 0xffff.
 * UTF16_CHAR_LENGTH(c): 1 for c<=0xffff, otherwise 2; c is not validated
 *     in any other way (values above 0x10ffff also give 2).
 * UTF16_MAX_CHAR_LENGTH: the largest value UTF16_CHAR_LENGTH() can yield.
 */
00047 #define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
00048 #define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
00049 #define UTF16_MAX_CHAR_LENGTH 2
00050 
00051 /* average number of code units compared to UTF-16 */
/* For UTF-16 this is the identity: the macro expands to its argument. */
00052 #define UTF16_ARRAY_SIZE(size) (size)
00053 
00054 /*
00055  * Get a single code point from an offset that points to any
00056  * of the code units that belong to that code point.
00057  * Assume 0<=i<length.
00058  *
00059  * This could be used for iteration together with
00060  * UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
00061  * but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
00062  * UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
00063  */
/*
 * UTF16_GET_CHAR_UNSAFE(s, i, c)
 *   s: array of UTF-16 code units
 *   i: offset of any code unit of the code point; not modified
 *   c: lvalue that receives the code point
 *
 * No bounds or pairing checks are made: if s[i] is a first surrogate then
 * s[i+1] is read, if it is a second surrogate then s[i-1] is read, and the
 * two units are combined unconditionally.
 * Relies on UTF_IS_SURROGATE(), which is defined outside this file.
 * Expands to a brace-enclosed block (a statement, not an expression);
 * s, i and c are evaluated more than once.
 */
00064 #define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
00065     (c)=(s)[i]; \
00066     if(UTF_IS_SURROGATE(c)) { \
00067         if(UTF_IS_SURROGATE_FIRST(c)) { \
00068             (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
00069         } else { \
00070             (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
00071         } \
00072     } \
00073 }
00074 
/*
 * UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
 *   s:      array of UTF-16 code units
 *   start:  lowest offset that may be read
 *   i:      offset of any code unit of the code point; not modified
 *   length: limit; only offsets below it are read
 *   c:      lvalue that receives the result
 *   strict: if nonzero, unpaired surrogates are reported as errors
 *
 * Bounds-checked variant of UTF16_GET_CHAR_UNSAFE(). A surrogate at s[i] is
 * combined with its neighbor only if that neighbor lies inside
 * [start, length) and is the matching kind of surrogate. For an unpaired
 * surrogate, c becomes UTF_ERROR_VALUE when strict is nonzero and is left as
 * the surrogate code unit itself otherwise.
 * UTF_IS_SURROGATE() and UTF_ERROR_VALUE are defined outside this file.
 * The expansion declares a block-local uint16_t named __c2.
 */
00075 #define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
00076     (c)=(s)[i]; \
00077     if(UTF_IS_SURROGATE(c)) { \
00078         uint16_t __c2; \
00079         if(UTF_IS_SURROGATE_FIRST(c)) { \
00080             if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
00081                 (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
00082                 /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00083             } else if(strict) {\
00084                 /* unmatched first surrogate */ \
00085                 (c)=UTF_ERROR_VALUE; \
00086             } \
00087         } else { \
00088             if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
00089                 (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
00090                 /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00091             } else if(strict) {\
00092                 /* unmatched second surrogate */ \
00093                 (c)=UTF_ERROR_VALUE; \
00094             } \
00095         } \
00096     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00097     } \
00098 }
00099 
00100 /* definitions with forward iteration --------------------------------------- */
00101 
00102 /*
00103  * all the macros that go forward assume that
00104  * the initial offset is 0<=i<length;
00105  * they update the offset
00106  */
00107 
00108 /* fast versions, no error-checking */
00109 
00110 /*
00111  * Get a single code point from an offset that points to the first
00112  * of the code units that belong to that code point.
00113  * Assume 0<=i<length.
00114  */
/*
 * UTF16_NEXT_CHAR_UNSAFE(s, i, c)
 *   s: array of UTF-16 code units
 *   i: offset lvalue; advanced past the code unit(s) that were read
 *   c: lvalue that receives the code point
 *
 * When s[i] is a first surrogate the following unit is consumed and combined
 * without checking that it is in bounds or that it is a second surrogate.
 * A second surrogate at s[i] is returned as is.
 */
00115 #define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
00116     (c)=(s)[(i)++]; \
00117     if(UTF_IS_FIRST_SURROGATE(c)) { \
00118         (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
00119     } \
00120 }
00121 
/*
 * UTF16_APPEND_CHAR_UNSAFE(s, i, c)
 *   s: destination array of UTF-16 code units
 *   i: offset lvalue; advanced by the number of code units written (1 or 2)
 *   c: code point to append
 *
 * A value up to 0xffff is stored as a single unit; anything larger is split
 * into a surrogate pair. There is no capacity check and c is not validated
 * (values above 0x10ffff are not rejected).
 * 0xd7c0 is 0xd800-(0x10000>>10), so ((c)>>10)+0xd7c0 is the first surrogate.
 * The second surrogate is written with explicit parentheses around the mask
 * so that the expression does not depend on the relative precedence of
 * '&' and '|'; the computed value is the same.
 */
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
}
00130 
/*
 * Advance offset i by one code point: by two code units if s[i] is a first
 * surrogate, by one otherwise. The unit that is skipped after a first
 * surrogate is neither bounds-checked nor inspected.
 */
00131 #define UTF16_FWD_1_UNSAFE(s, i) { \
00132     if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
00133         ++(i); \
00134     } \
00135 }
00136 
/*
 * Advance offset i by n code points by applying UTF16_FWD_1_UNSAFE() n times.
 * n is evaluated once and copied into a block-local counter __N of type
 * UTextOffset (declared outside this file); nothing happens if n<=0.
 * There is no check against the end of the buffer.
 */
00137 #define UTF16_FWD_N_UNSAFE(s, i, n) { \
00138     UTextOffset __N=(n); \
00139     while(__N>0) { \
00140         UTF16_FWD_1_UNSAFE(s, i); \
00141         --__N; \
00142     } \
00143 }
00144 
00145 /*
00146  * Set a random-access offset and adjust it so that
00147  * it points to the beginning of a Unicode character.
00148  * The offset that is passed in points to
00149  * any code unit of a code point
00150  * and will point to the first code unit after
00151  * the macro invocation.
00152  * Never increments the offset.
00153  */
/*
 * UTF16_SET_CHAR_START_UNSAFE(s, i)
 * Decrements i by one if s[i] is a second surrogate. It does not check that
 * i is above the start of the buffer or that s[i-1] is a first surrogate.
 */
00154 #define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
00155     if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
00156         --(i); \
00157     } \
00158 }
00159 
00160 /* safe versions with error-checking and optional regularity-checking */
00161 
/*
 * UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
 *   s:      array of UTF-16 code units
 *   i:      offset lvalue; advanced past the code unit(s) that were consumed
 *   length: limit; the second unit of a pair is read only if its offset is
 *           below it
 *   c:      lvalue that receives the result
 *   strict: if nonzero, unpaired surrogates are reported as errors
 *
 * Reads s[i] and increments i. A first surrogate is combined with the next
 * unit only when that unit is in bounds and is a second surrogate; in that
 * case i is incremented once more. For an unpaired first or second surrogate
 * i has advanced by exactly one, and c is UTF_ERROR_VALUE (defined outside
 * this file) when strict is nonzero or the surrogate code unit otherwise.
 * The expansion declares a block-local uint16_t named __c2.
 */
00162 #define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
00163     (c)=(s)[(i)++]; \
00164     if(UTF_IS_FIRST_SURROGATE(c)) { \
00165         uint16_t __c2; \
00166         if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
00167             ++(i); \
00168             (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
00169             /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00170         } else if(strict) {\
00171             /* unmatched first surrogate */ \
00172             (c)=UTF_ERROR_VALUE; \
00173         } \
00174     } else if(strict && UTF_IS_SECOND_SURROGATE(c)) { \
00175         /* unmatched second surrogate */ \
00176         (c)=UTF_ERROR_VALUE; \
00177     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00178     } \
00179 }
00180 
/*
 * UTF16_APPEND_CHAR_SAFE(s, i, length, c)
 *   s:      destination array of UTF-16 code units
 *   i:      offset lvalue; advanced by the number of code units written
 *   length: capacity limit, checked only before writing a surrogate pair
 *   c:      code point to append
 *
 * - c<=0xffff: written as one unit. No capacity check is made here; the
 *   forward macros assume i<length on entry.
 * - 0x10000..0x10ffff: written as a surrogate pair if i+1<length, otherwise
 *   a single UTF_ERROR_VALUE is written instead.
 * - c>0x10ffff: a single UTF_ERROR_VALUE is written.
 * UTF_ERROR_VALUE is defined outside this file.
 * The second surrogate is written with explicit parentheses around the mask
 * so that the expression does not depend on the relative precedence of
 * '&' and '|'; the computed value is the same.
 */
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff) { \
        if((i)+1<(length)) { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } else /* not enough space */ { \
            (s)[(i)++]=UTF_ERROR_VALUE; \
        } \
    } else /* c>0x10ffff, write error value */ { \
        (s)[(i)++]=UTF_ERROR_VALUE; \
    } \
}
00195 
/*
 * Advance offset i by one code point without running past length.
 * i is always incremented once; it is incremented a second time only if the
 * unit just passed was a first surrogate, the new i is still below length,
 * and s[i] is a second surrogate. An unpaired surrogate therefore counts as
 * one code point.
 */
00196 #define UTF16_FWD_1_SAFE(s, i, length) { \
00197     if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
00198         ++(i); \
00199     } \
00200 }
00201 
/*
 * Advance offset i by up to n code points using UTF16_FWD_1_SAFE().
 * The loop ends early once i reaches length. n is evaluated once and copied
 * into a block-local counter __N of type UTextOffset (declared outside this
 * file); nothing happens if n<=0.
 */
00202 #define UTF16_FWD_N_SAFE(s, i, length, n) { \
00203     UTextOffset __N=(n); \
00204     while(__N>0 && (i)<(length)) { \
00205         UTF16_FWD_1_SAFE(s, i, length); \
00206         --__N; \
00207     } \
00208 }
00209 
/*
 * Checked variant of UTF16_SET_CHAR_START_UNSAFE(): i is decremented by one
 * only if s[i] is a second surrogate, i is above start, and s[i-1] is a
 * first surrogate. Otherwise i is left unchanged.
 */
00210 #define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
00211     if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00212         --(i); \
00213     } \
00214 }
00215 
00216 /* definitions with backward iteration -------------------------------------- */
00217 
00218 /*
00219  * all the macros that go backward assume that
00220  * the valid buffer range starts at offset 0
00221  * and that the initial offset is 0<i<=length;
00222  * they update the offset
00223  */
00224 
00225 /* fast versions, no error-checking */
00226 
00227 /*
00228  * Get a single code point from an offset that points behind the last
00229  * of the code units that belong to that code point.
00230  * Assume 0<i<=length.
 *
 * i is decremented before each read, so on return it points to the first
 * code unit of the code point. When the unit before i is a second surrogate,
 * the unit before that one is consumed and combined without checking that
 * it is in bounds or that it is a first surrogate.
00231  */
00232 #define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
00233     (c)=(s)[--(i)]; \
00234     if(UTF_IS_SECOND_SURROGATE(c)) { \
00235         (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
00236     } \
00237 }
00238 
/*
 * Move offset i back by one code point: by two code units if the unit before
 * i is a second surrogate, by one otherwise. The extra unit that is skipped
 * is neither bounds-checked nor inspected.
 */
00239 #define UTF16_BACK_1_UNSAFE(s, i) { \
00240     if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
00241         --(i); \
00242     } \
00243 }
00244 
/*
 * Move offset i back by n code points by applying UTF16_BACK_1_UNSAFE()
 * n times. n is evaluated once and copied into a block-local counter __N of
 * type UTextOffset (declared outside this file); nothing happens if n<=0.
 * There is no check against the start of the buffer.
 */
00245 #define UTF16_BACK_N_UNSAFE(s, i, n) { \
00246     UTextOffset __N=(n); \
00247     while(__N>0) { \
00248         UTF16_BACK_1_UNSAFE(s, i); \
00249         --__N; \
00250     } \
00251 }
00252 
00253 /*
00254  * Set a random-access offset and adjust it so that
00255  * it points after the end of a Unicode character.
00256  * The offset that is passed in points behind
00257  * any code unit of a code point
00258  * and will point behind the last code unit after
00259  * the macro invocation.
00260  * Never decrements the offset.
00261  */
/*
 * UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
 * Increments i by one if s[i-1] is a first surrogate. It does not check that
 * i is above the start of the buffer, that the new offset is in bounds, or
 * that s[i] is a second surrogate.
 */
00262 #define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
00263     if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00264         ++(i); \
00265     } \
00266 }
00267 
00268 /* safe versions with error-checking and optional regularity-checking */
00269 
/*
 * UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
 *   s:      array of UTF-16 code units
 *   start:  lowest offset that may be read
 *   i:      offset lvalue pointing behind the code point; moved back over
 *           the code unit(s) that were consumed
 *   c:      lvalue that receives the result
 *   strict: if nonzero, unpaired surrogates are reported as errors
 *
 * Decrements i and reads s[i]. A second surrogate is combined with the unit
 * before it only when i is still above start and that unit is a first
 * surrogate; in that case i is decremented once more. For an unpaired first
 * or second surrogate i has moved back by exactly one, and c is
 * UTF_ERROR_VALUE (defined outside this file) when strict is nonzero or the
 * surrogate code unit otherwise.
 * The expansion declares a block-local uint16_t named __c2.
 */
00270 #define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
00271     (c)=(s)[--(i)]; \
00272     if(UTF_IS_SECOND_SURROGATE(c)) { \
00273         uint16_t __c2; \
00274         if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
00275             --(i); \
00276             (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
00277             /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() */ \
00278         } else if(strict) {\
00279             /* unmatched second surrogate */ \
00280             (c)=UTF_ERROR_VALUE; \
00281         } \
00282     } else if(strict && UTF_IS_FIRST_SURROGATE(c)) { \
00283         /* unmatched first surrogate */ \
00284         (c)=UTF_ERROR_VALUE; \
00285     /* else strict: (c)==0xfffe is caught by UTF_IS_ERROR() */ \
00286     } \
00287 }
00288 
/*
 * Move offset i back by one code point without going below start.
 * i is always decremented once; it is decremented a second time only if the
 * unit now at s[i] is a second surrogate, i is still above start, and
 * s[i-1] is a first surrogate. An unpaired surrogate therefore counts as
 * one code point.
 */
00289 #define UTF16_BACK_1_SAFE(s, start, i) { \
00290     if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
00291         --(i); \
00292     } \
00293 }
00294 
/*
 * Move offset i back by up to n code points using UTF16_BACK_1_SAFE().
 * The loop ends early once i reaches start. n is evaluated once and copied
 * into a block-local counter __N of type UTextOffset (declared outside this
 * file); nothing happens if n<=0.
 */
00295 #define UTF16_BACK_N_SAFE(s, start, i, n) { \
00296     UTextOffset __N=(n); \
00297     while(__N>0 && (i)>(start)) { \
00298         UTF16_BACK_1_SAFE(s, start, i); \
00299         --__N; \
00300     } \
00301 }
00302 
/*
 * Checked variant of UTF16_SET_CHAR_LIMIT_UNSAFE(): i is incremented by one
 * only if start<i<length, s[i-1] is a first surrogate and s[i] is a second
 * surrogate, i.e. only when i falls between the two halves of a pair.
 * Otherwise i is left unchanged.
 */
00303 #define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
00304     if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
00305         ++(i); \
00306     } \
00307 }
00308 
00309 #endif

Generated at Wed Aug 16 16:05:39 2000 for ICU1.6 by doxygen 1.0.0 written by Dimitri van Heesch, © 1997-1999