Main Page   Class Hierarchy   Compound List   File List   Header Files   Sources   Compound Members   File Members  

utf.h

00001 /*
00002 *******************************************************************************
00003 *
00004 *   Copyright (C) 1999-2000, International Business Machines
00005 *   Corporation and others.  All Rights Reserved.
00006 *
00007 *******************************************************************************
00008 *   file name:  utf.h
00009 *   encoding:   US-ASCII
00010 *   tab size:   8 (not used)
00011 *   indentation:4
00012 *
00013 *   created on: 1999sep09
00014 *   created by: Markus W. Scherer
00015 *
00016 *   This file defines the UChar and UChar32 data types for Unicode code units
00017 *   and code points, as well as macros for efficiently getting code points
00018 *   in and out of a string.
00019 *   utf.h is included by utypes.h and itself includes the utfXX.h after some
00020 *   common definitions. Those files define the macros for each UTF-size.
00021 */
00022 
00023 #ifndef __UTF_H__
00024 #define __UTF_H__
00025 
00026 /*
00027  * ANSI C headers:
00028  * stddef.h defines wchar_t
00029  * limits.h defines CHAR_MAX
00030  */
00031 #include <stddef.h>
00032 #include <limits.h>
00033 #include "unicode/umachine.h"
00034 /* include the utfXX.h after the following definitions */
00035 
/* UTF-16 is the default when the build does not select a preferred UTF size. */
#if !defined(UTF_SIZE)
#   define UTF_SIZE 16
#endif

/* Size of one code unit in bytes: the UTF size in bits divided by 8. */
#define U_SIZEOF_UCHAR (UTF_SIZE/8)
00042 
/*
 * Is wchar.h available on this platform?
 * Most platforms have it, so assume "yes" (1) unless the build already decided.
 */
#if !defined(U_HAVE_WCHAR_H)
#   define U_HAVE_WCHAR_H 1
#endif
00047 
/*
 * U_SIZEOF_WCHAR_T==sizeof(wchar_t)
 * (0 means it is not defined or autoconf could not set it).
 *
 * When the size is unknown, derive it from WCHAR_MAX if an already-included
 * standard header supplies that macro. Only when WCHAR_MAX is unavailable
 * fall back to the historical guess of 4. Guessing 4 on a platform with a
 * narrower wchar_t would make a UChar32 that is typedef'ed from wchar_t too
 * small to hold every code point.
 * The byte counts below assume 8-bit bytes, as U_SIZEOF_UCHAR does.
 */
#if U_SIZEOF_WCHAR_T==0
#   undef U_SIZEOF_WCHAR_T
#   if !defined(WCHAR_MAX)
#       define U_SIZEOF_WCHAR_T 4
#   elif WCHAR_MAX<=0xff
#       define U_SIZEOF_WCHAR_T 1
#   elif WCHAR_MAX<=0xffff
#       define U_SIZEOF_WCHAR_T 2
#   else
#       define U_SIZEOF_WCHAR_T 4
#   endif
#endif
00053 
/*
 * UChar32 is the type for a single code point.
 * It is an alias for wchar_t when wchar_t is 4 bytes wide,
 * and for uint32_t otherwise.
 */
#if U_SIZEOF_WCHAR_T!=4
    typedef uint32_t UChar32;
#else
    typedef wchar_t UChar32;
#endif
00060 
/* Signed 32-bit type for offsets and indexes into Unicode strings and arrays. */
typedef int32_t UTextOffset;
00063 
/*
 * Choose the default flavor of the macros - safe or fast.
 * If none of UTF_SAFE, UTF_STRICT and UTF_UNSAFE was requested,
 * then the safe versions become the default.
 */
#if !(defined(UTF_SAFE) || defined(UTF_STRICT) || defined(UTF_UNSAFE))
#   define UTF_SAFE
#endif
00068 
00069 /* internal definitions ----------------------------------------------------- */
00070 
/*
 * UTF-8 specific error values. Each one is chosen for the number of
 * bytes that it occupies in UTF-8:
 *   U+0015 (NAK, Negative Acknowledge, a C0 control character) - 1 byte
 *   U+009f (the highest C1 control character)                  - 2 bytes
 *
 * The "safe" UTF-8 macros use them so that the error value that they return
 * needs the same number of code units (bytes) as the macro has seen.
 */
#define UTF8_ERROR_VALUE_1 0x15
#define UTF8_ERROR_VALUE_2 0x9f

/* The error value that is shared by all UTFs. */
#define UTF_ERROR_VALUE 0xffff
00086 
00087 /* single-code point definitions -------------------------------------------- */
00088 
/*
 * Is this code unit or code point a surrogate (U+d800..U+dfff)?
 * All bits above the low 11 must match those of 0xd800.
 */
#define UTF_IS_SURROGATE(uchar) (0xd800==((uchar)&0xfffff800))
00091 
/*
 * Is a given 32-bit code point/Unicode scalar value
 * actually a valid Unicode (abstract) character?
 *
 * Rejected are values above U+10ffff, surrogate code points, and all
 * noncharacters: U+fdd0..U+fdef as well as the last two code points of
 * every plane (U+fffe, U+ffff, U+1fffe, U+1ffff, ...).
 *
 * The argument is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define UTF_IS_UNICODE_CHAR(c) \
    ((uint32_t)(c)<=0x10ffff && \
     !UTF_IS_SURROGATE(c) && \
     ((c)&0xfffe)!=0xfffe && \
     !(0xfdd0<=(uint32_t)(c) && (uint32_t)(c)<=0xfdef))

/*
 * Is a given 32-bit code an error value
 * as returned by one of the macros for any UTF?
 * This covers every value whose low 16 bits are 0xfffe or 0xffff
 * (which includes UTF_ERROR_VALUE) and the two UTF-8 error values.
 *
 * The argument is evaluated more than once.
 */
#define UTF_IS_ERROR(c) \
    (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)

/*
 * This is a combined macro: is c a valid Unicode value _and_ not an error code?
 * It is built on UTF_IS_UNICODE_CHAR() so that both macros always agree on
 * what a valid character is. The additional tests only need to exclude the
 * two UTF-8 error values, because the other error values are noncharacters.
 *
 * The argument is evaluated more than once.
 */
#define UTF_IS_VALID(c) \
    (UTF_IS_UNICODE_CHAR(c) && \
     (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
00113 
00114 /* include the utfXX.h ------------------------------------------------------ */
00115 
00116 #include "unicode/utf8.h"
00117 #include "unicode/utf16.h"
00118 #include "unicode/utf32.h"
00119 
00120 /* Define types and macros according to the selected UTF size. -------------- */
00121 
00122 #if UTF_SIZE==8
00123 
00124 #   error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16
00125 
00126     /* Define UChar to be compatible with char if possible. */
00127 #   if CHAR_MAX>=255
00128         typedef char UChar;
00129 #   else
00130         typedef uint8_t UChar;
00131 #   endif
00132 
00133 #elif UTF_SIZE==16
00134 
00135     /* Define UChar to be compatible with wchar_t if possible. */
00136 #   if U_SIZEOF_WCHAR_T==2
00137         typedef wchar_t UChar;
00138 #   else
00139         typedef uint16_t UChar;
00140 #   endif
00141 
00142 #   define UTF_IS_SINGLE(uchar)                         UTF16_IS_SINGLE(uchar)
00143 #   define UTF_IS_LEAD(uchar)                           UTF16_IS_LEAD(uchar)
00144 #   define UTF_IS_TRAIL(uchar)                          UTF16_IS_TRAIL(uchar)
00145 
00146 #   define UTF_NEED_MULTIPLE_UCHAR(c)                   UTF16_NEED_MULTIPLE_UCHAR(c)
00147 #   define UTF_CHAR_LENGTH(c)                           UTF16_CHAR_LENGTH(c)
00148 #   define UTF_MAX_CHAR_LENGTH                          UTF16_MAX_CHAR_LENGTH
00149 #   define UTF_ARRAY_SIZE(size)                         UTF16_ARRAY_SIZE(size)
00150 
00151 #   define UTF_GET_CHAR_UNSAFE(s, i, c)                 UTF16_GET_CHAR_UNSAFE(s, i, c)
00152 #   define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
00153 
00154 #   define UTF_NEXT_CHAR_UNSAFE(s, i, c)                UTF16_NEXT_CHAR_UNSAFE(s, i, c)
00155 #   define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict)  UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
00156 
00157 #   define UTF_APPEND_CHAR_UNSAFE(s, i, c)              UTF16_APPEND_CHAR_UNSAFE(s, i, c)
00158 #   define UTF_APPEND_CHAR_SAFE(s, i, length, c)        UTF16_APPEND_CHAR_SAFE(s, i, length, c)
00159 
00160 #   define UTF_FWD_1_UNSAFE(s, i)                       UTF16_FWD_1_UNSAFE(s, i)
00161 #   define UTF_FWD_1_SAFE(s, i, length)                 UTF16_FWD_1_SAFE(s, i, length)
00162 
00163 #   define UTF_FWD_N_UNSAFE(s, i, n)                    UTF16_FWD_N_UNSAFE(s, i, n)
00164 #   define UTF_FWD_N_SAFE(s, i, length, n)              UTF16_FWD_N_SAFE(s, i, length, n)
00165 
00166 #   define UTF_SET_CHAR_START_UNSAFE(s, i)              UTF16_SET_CHAR_START_UNSAFE(s, i)
00167 #   define UTF_SET_CHAR_START_SAFE(s, start, i)         UTF16_SET_CHAR_START_SAFE(s, start, i)
00168 
00169 #   define UTF_PREV_CHAR_UNSAFE(s, i, c)                UTF16_PREV_CHAR_UNSAFE(s, i, c)
00170 #   define UTF_PREV_CHAR_SAFE(s, start, i, c, strict)   UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
00171 
00172 #   define UTF_BACK_1_UNSAFE(s, i)                      UTF16_BACK_1_UNSAFE(s, i)
00173 #   define UTF_BACK_1_SAFE(s, start, i)                 UTF16_BACK_1_SAFE(s, start, i)
00174 
00175 #   define UTF_BACK_N_UNSAFE(s, i, n)                   UTF16_BACK_N_UNSAFE(s, i, n)
00176 #   define UTF_BACK_N_SAFE(s, start, i, n)              UTF16_BACK_N_SAFE(s, start, i, n)
00177 
00178 #   define UTF_SET_CHAR_LIMIT_UNSAFE(s, i)              UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
00179 #   define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
00180 
00181 #elif UTF_SIZE==32
00182 
00183 #   error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16
00184 
00185     typedef UChar32 UChar;
00186 
00187 #else
00188 #   error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented
00189 #endif
00190 
/*
 * Define the default macros for handling UTF characters.
 *
 * UTF_SAFE and UTF_STRICT both select the _SAFE implementations. They differ
 * only in the "strict" argument that is passed to the macros that deliver a
 * code point: FALSE for UTF_SAFE, TRUE for UTF_STRICT. UTF_SAFE takes
 * precedence if both are defined.
 * Otherwise (UTF_UNSAFE) the _UNSAFE implementations are selected; they do not
 * take the start and length arguments, which are dropped here.
 */

#if defined(UTF_SAFE) || defined(UTF_STRICT)

    /* macros with a strictness argument */
#   ifdef UTF_SAFE
#       define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE)
#       define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE)
#       define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE)
#   else /* UTF_STRICT */
#       define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE)
#       define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE)
#       define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE)
#   endif

    /* macros that are the same for safe and strict */
#   define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c)
#   define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length)
#   define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n)
#   define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i)

#   define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i)
#   define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n)
#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)

#else /* UTF_UNSAFE */

#   define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c)

#   define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c)
#   define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c)
#   define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i)
#   define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n)
#   define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i)

#   define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c)
#   define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i)
#   define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n)
#   define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i)

#endif
00239 
00240 #endif

Generated at Wed Aug 16 16:05:39 2000 for ICU1.6 by doxygen 1.0.0 written by Dimitri van Heesch, © 1997-1999