00001 /* 00002 ******************************************************************************* 00003 * 00004 * Copyright (C) 1999-2000, International Business Machines 00005 * Corporation and others. All Rights Reserved. 00006 * 00007 ******************************************************************************* 00008 * file name: utf.h 00009 * encoding: US-ASCII 00010 * tab size: 8 (not used) 00011 * indentation:4 00012 * 00013 * created on: 1999sep09 00014 * created by: Markus W. Scherer 00015 * 00016 * This file defines the UChar and UChar32 data types for Unicode code units 00017 * and code points, as well as macros for efficiently getting code points 00018 * in and out of a string. 00019 * utf.h is included by utypes.h and itself includes the utfXX.h after some 00020 * common definitions. Those files define the macros for each UTF-size. 00021 */ 00022 00023 #ifndef __UTF_H__ 00024 #define __UTF_H__ 00025 00026 /* 00027 * ANSI C headers: 00028 * stddef.h defines wchar_t 00029 * limits.h defines CHAR_MAX 00030 */ 00031 #include <stddef.h> 00032 #include <limits.h> 00033 #include "unicode/umachine.h" 00034 /* include the utfXX.h after the following definitions */ 00035 00036 /* If there is no compiler option for the preferred UTF size, then default to UTF-16. */ 00037 #ifndef UTF_SIZE 00038 # define UTF_SIZE 16 00039 #endif 00040 00041 #define U_SIZEOF_UCHAR (UTF_SIZE>>3) 00042 00043 /* Do we have wchar.h on this platform? It is there on most platforms. */ 00044 #ifndef U_HAVE_WCHAR_H 00045 # define U_HAVE_WCHAR_H 1 00046 #endif 00047 00048 /* U_SIZEOF_WCHAR_T==sizeof(wchar_t) (0 means it is not defined or autoconf could not set it) */ 00049 #if U_SIZEOF_WCHAR_T==0 00050 # undef U_SIZEOF_WCHAR_T 00051 # define U_SIZEOF_WCHAR_T 4 00052 #endif 00053 00054 /* Define UChar32 to be compatible with wchar_t if possible. */ 00055 #if U_SIZEOF_WCHAR_T==4 00056 typedef wchar_t UChar32; 00057 #else 00058 typedef uint32_t UChar32; 00059 #endif 00060 00061 /* Unicode string and array offset and index type */ 00062 typedef int32_t UTextOffset; 00063 00064 /* Specify which macro versions are the default ones - safe or fast. */ 00065 #if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE) 00066 # define UTF_SAFE 00067 #endif 00068 00069 /* internal definitions ----------------------------------------------------- */ 00070 00071 /* 00072 * Special error values for UTF-8, 00073 * which need 1 or 2 bytes in UTF-8: 00074 * U+0015 = NAK = Negative Acknowledge, C0 control character 00075 * U+009f = highest C1 control character 00076 * 00077 * These are used by ("safe") UTF-8 macros so that they can return an error value 00078 * that needs the same number of code units (bytes) as were seen by 00079 * a macro. 00080 */ 00081 #define UTF8_ERROR_VALUE_1 0x15 00082 #define UTF8_ERROR_VALUE_2 0x9f 00083 00084 /* error value for all UTFs */ 00085 #define UTF_ERROR_VALUE 0xffff 00086 00087 /* single-code point definitions -------------------------------------------- */ 00088 00089 /* is this code unit or code point a surrogate? */ 00090 #define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) 00091 00092 /* 00093 * Is a given 32-bit code point/Unicode scalar value 00094 * actually a valid Unicode (abstract) character? 00095 */ 00096 #define UTF_IS_UNICODE_CHAR(c) \ 00097 ((uint32_t)(c)<=0x10ffff && \ 00098 !UTF_IS_SURROGATE(c) && ((c)&0xfffe)!=0xfffe) 00099 00100 /* 00101 * Is a given 32-bit code an error value 00102 * as returned by one of the macros for any UTF? 00103 */ 00104 #define UTF_IS_ERROR(c) \ 00105 (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) 00106 00107 /* This is a combined macro: is c a valid Unicode value _and_ not an error code? */ 00108 #define UTF_IS_VALID(c) \ 00109 ((uint32_t)(c)<=0x10ffff && \ 00110 !UTF_IS_SURROGATE(c) && \ 00111 ((c)&0xfffe)!=0xfffe && \ 00112 (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) 00113 00114 /* include the utfXX.h ------------------------------------------------------ */ 00115 00116 #include "unicode/utf8.h" 00117 #include "unicode/utf16.h" 00118 #include "unicode/utf32.h" 00119 00120 /* Define types and macros according to the selected UTF size. -------------- */ 00121 00122 #if UTF_SIZE==8 00123 00124 # error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16 00125 00126 /* Define UChar to be compatible with char if possible. */ 00127 # if CHAR_MAX>=255 00128 typedef char UChar; 00129 # else 00130 typedef uint8_t UChar; 00131 # endif 00132 00133 #elif UTF_SIZE==16 00134 00135 /* Define UChar to be compatible with wchar_t if possible. */ 00136 # if U_SIZEOF_WCHAR_T==2 00137 typedef wchar_t UChar; 00138 # else 00139 typedef uint16_t UChar; 00140 # endif 00141 00142 # define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar) 00143 # define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar) 00144 # define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar) 00145 00146 # define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) 00147 # define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c) 00148 # define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH 00149 # define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) 00150 00151 # define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) 00152 # define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) 00153 00154 # define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) 00155 # define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) 00156 00157 # define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) 00158 # define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) 00159 00160 # define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) 00161 # define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) 00162 00163 # define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) 00164 # define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) 00165 00166 # define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) 00167 # define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) 00168 00169 # define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) 00170 # define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) 00171 00172 # define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) 00173 # define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) 00174 00175 # define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) 00176 # define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) 00177 00178 # define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) 00179 # define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) 00180 00181 #elif UTF_SIZE==32 00182 00183 # error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16 00184 00185 typedef UChar32 UChar; 00186 00187 #else 00188 # error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented 00189 #endif 00190 00191 /* Define the default macros for handling UTF characters. ------------------- */ 00192 00193 #ifdef UTF_SAFE 00194 00195 # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE) 00196 00197 # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE) 00198 # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) 00199 # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) 00200 # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) 00201 # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) 00202 00203 # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE) 00204 # define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) 00205 # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) 00206 # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) 00207 00208 #elif defined(UTF_STRICT) 00209 00210 # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE) 00211 00212 # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE) 00213 # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c) 00214 # define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length) 00215 # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n) 00216 # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i) 00217 00218 # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE) 00219 # define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i) 00220 # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n) 00221 # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) 00222 00223 #else /* UTF_UNSAFE */ 00224 00225 # define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c) 00226 00227 # define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c) 00228 # define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c) 00229 # define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i) 00230 # define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n) 00231 # define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i) 00232 00233 # define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c) 00234 # define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i) 00235 # define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n) 00236 # define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i) 00237 00238 #endif 00239 00240 #endif