Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members  

uchar.c

00001 /*
00002 ********************************************************************************
00003 *   Copyright (C) 1996-2000, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 ********************************************************************************
00006 *
00007 * File UCHAR.C
00008 *
00009 * Modification History:
00010 *
00011 *   Date        Name        Description
00012 *   04/02/97    aliu        Creation.
00013 *
00014 *   4/15/99     Madhu       Updated all the function definitions for C Implementation
00015 *   5/20/99     Madhu           Added the function u_getVersion()
00016 *   8/19/1999   srl         Upgraded scripts to Unicode3.0 
00017 *   11/11/1999  weiv        added u_isalnum(), cleaned comments
00018 *   01/11/2000  helena      Renamed u_getVersion to u_getUnicodeVersion.
00019 *   06/20/2000  helena      OS/400 port changes; mostly typecast.
00020 ********************************************************************************************
00021 */
00022 #include "unicode/utypes.h"
00023 #include "ucmp16.h"
00024 #include "ucmp8.h"
00025 #include "umutex.h"
00026 #include "unicode/uchar.h"
00027 #include "unicode/udata.h"
00028 #include "cmemory.h"
00029 #include "cstring.h"
00030 
00031 /* dynamically loaded Unicode character properties -------------------------- */
00032 
/*
 * Fallback properties for the code points 0x00..0x9f.
 * GET_PROPS() reads this table when the properties data cannot be loaded.
 * These values are printed by genprops in verbose mode.
 * The table is only ever read, so it is const.
 */
static const uint32_t staticProps32Table[0xa0]={
    /* 0x00 */ 0x48f,
    /* 0x01 */ 0x48f,
    /* 0x02 */ 0x48f,
    /* 0x03 */ 0x48f,
    /* 0x04 */ 0x48f,
    /* 0x05 */ 0x48f,
    /* 0x06 */ 0x48f,
    /* 0x07 */ 0x48f,
    /* 0x08 */ 0x48f,
    /* 0x09 */ 0x20c,
    /* 0x0a */ 0x1ce,
    /* 0x0b */ 0x20c,
    /* 0x0c */ 0x24d,
    /* 0x0d */ 0x1ce,
    /* 0x0e */ 0x48f,
    /* 0x0f */ 0x48f,
    /* 0x10 */ 0x48f,
    /* 0x11 */ 0x48f,
    /* 0x12 */ 0x48f,
    /* 0x13 */ 0x48f,
    /* 0x14 */ 0x48f,
    /* 0x15 */ 0x48f,
    /* 0x16 */ 0x48f,
    /* 0x17 */ 0x48f,
    /* 0x18 */ 0x48f,
    /* 0x19 */ 0x48f,
    /* 0x1a */ 0x48f,
    /* 0x1b */ 0x48f,
    /* 0x1c */ 0x1ce,
    /* 0x1d */ 0x1ce,
    /* 0x1e */ 0x1ce,
    /* 0x1f */ 0x20c,
    /* 0x20 */ 0x24c,
    /* 0x21 */ 0x297,
    /* 0x22 */ 0x297,
    /* 0x23 */ 0x117,
    /* 0x24 */ 0x119,
    /* 0x25 */ 0x117,
    /* 0x26 */ 0x297,
    /* 0x27 */ 0x297,
    /* 0x28 */ 0x100a94,
    /* 0x29 */ 0xfff00a95,
    /* 0x2a */ 0x297,
    /* 0x2b */ 0x118,
    /* 0x2c */ 0x197,
    /* 0x2d */ 0x113,
    /* 0x2e */ 0x197,
    /* 0x2f */ 0xd7,
    /* 0x30 */ 0x89,
    /* 0x31 */ 0x100089,
    /* 0x32 */ 0x200089,
    /* 0x33 */ 0x300089,
    /* 0x34 */ 0x400089,
    /* 0x35 */ 0x500089,
    /* 0x36 */ 0x600089,
    /* 0x37 */ 0x700089,
    /* 0x38 */ 0x800089,
    /* 0x39 */ 0x900089,
    /* 0x3a */ 0x197,
    /* 0x3b */ 0x297,
    /* 0x3c */ 0x200a98,
    /* 0x3d */ 0x298,
    /* 0x3e */ 0xffe00a98,
    /* 0x3f */ 0x297,
    /* 0x40 */ 0x297,
    /* 0x41 */ 0x2000001,
    /* 0x42 */ 0x2000001,
    /* 0x43 */ 0x2000001,
    /* 0x44 */ 0x2000001,
    /* 0x45 */ 0x2000001,
    /* 0x46 */ 0x2000001,
    /* 0x47 */ 0x2000001,
    /* 0x48 */ 0x2000001,
    /* 0x49 */ 0x2000001,
    /* 0x4a */ 0x2000001,
    /* 0x4b */ 0x2000001,
    /* 0x4c */ 0x2000001,
    /* 0x4d */ 0x2000001,
    /* 0x4e */ 0x2000001,
    /* 0x4f */ 0x2000001,
    /* 0x50 */ 0x2000001,
    /* 0x51 */ 0x2000001,
    /* 0x52 */ 0x2000001,
    /* 0x53 */ 0x2000001,
    /* 0x54 */ 0x2000001,
    /* 0x55 */ 0x2000001,
    /* 0x56 */ 0x2000001,
    /* 0x57 */ 0x2000001,
    /* 0x58 */ 0x2000001,
    /* 0x59 */ 0x2000001,
    /* 0x5a */ 0x2000001,
    /* 0x5b */ 0x200a94,
    /* 0x5c */ 0x297,
    /* 0x5d */ 0xffe00a95,
    /* 0x5e */ 0x29a,
    /* 0x5f */ 0x296,
    /* 0x60 */ 0x29a,
    /* 0x61 */ 0x2000002,
    /* 0x62 */ 0x2000002,
    /* 0x63 */ 0x2000002,
    /* 0x64 */ 0x2000002,
    /* 0x65 */ 0x2000002,
    /* 0x66 */ 0x2000002,
    /* 0x67 */ 0x2000002,
    /* 0x68 */ 0x2000002,
    /* 0x69 */ 0x2000002,
    /* 0x6a */ 0x2000002,
    /* 0x6b */ 0x2000002,
    /* 0x6c */ 0x2000002,
    /* 0x6d */ 0x2000002,
    /* 0x6e */ 0x2000002,
    /* 0x6f */ 0x2000002,
    /* 0x70 */ 0x2000002,
    /* 0x71 */ 0x2000002,
    /* 0x72 */ 0x2000002,
    /* 0x73 */ 0x2000002,
    /* 0x74 */ 0x2000002,
    /* 0x75 */ 0x2000002,
    /* 0x76 */ 0x2000002,
    /* 0x77 */ 0x2000002,
    /* 0x78 */ 0x2000002,
    /* 0x79 */ 0x2000002,
    /* 0x7a */ 0x2000002,
    /* 0x7b */ 0x200a94,
    /* 0x7c */ 0x298,
    /* 0x7d */ 0xffe00a95,
    /* 0x7e */ 0x298,
    /* 0x7f */ 0x48f,
    /* 0x80 */ 0x48f,
    /* 0x81 */ 0x48f,
    /* 0x82 */ 0x48f,
    /* 0x83 */ 0x48f,
    /* 0x84 */ 0x48f,
    /* 0x85 */ 0x1ce,
    /* 0x86 */ 0x48f,
    /* 0x87 */ 0x48f,
    /* 0x88 */ 0x48f,
    /* 0x89 */ 0x48f,
    /* 0x8a */ 0x48f,
    /* 0x8b */ 0x48f,
    /* 0x8c */ 0x48f,
    /* 0x8d */ 0x48f,
    /* 0x8e */ 0x48f,
    /* 0x8f */ 0x48f,
    /* 0x90 */ 0x48f,
    /* 0x91 */ 0x48f,
    /* 0x92 */ 0x48f,
    /* 0x93 */ 0x48f,
    /* 0x94 */ 0x48f,
    /* 0x95 */ 0x48f,
    /* 0x96 */ 0x48f,
    /* 0x97 */ 0x48f,
    /* 0x98 */ 0x48f,
    /* 0x99 */ 0x48f,
    /* 0x9a */ 0x48f,
    /* 0x9b */ 0x48f,
    /* 0x9c */ 0x48f,
    /* 0x9d */ 0x48f,
    /* 0x9e */ 0x48f,
    /* 0x9f */ 0x48f
};
00197 
00198 /*
00199  * Name and type of the data file that is loaded, uprops.dat -
00200  * for a description of the file format, see icu/source/tools/genprops/store.c
00201  */
00202 #define DATA_NAME "uprops"
00203 #define DATA_TYPE "dat"
00204 
00205 static UDataMemory *propsData=NULL; /* handle of the opened data; set once in loadPropsData() */
00206 
00207 static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; /* copied from the data header in isAcceptable() */
00208 static UVersionInfo dataVersion={ 3, 0, 0, 0 }; /* overwritten from the data header in isAcceptable() */
00209 
00210 static const uint16_t *propsTable=NULL; /* start of the loaded data, addressed in 16-bit units */
00211 #define props32Table ((uint32_t *)propsTable) /* the same memory addressed in 32-bit units */
00212 
00213 static int8_t havePropsData=0; /* 0: loading not tried yet, 1: data loaded, -1: loading failed */
00214 
00215 /* index values loaded from uprops.dat; loadPropsData() copies them from the first 16 bytes of the data */
00216 static uint16_t indexes[8];
00217 
00218 enum { /* positions of the values in indexes[] that this file uses */
00219     INDEX_STAGE_2_BITS,
00220     INDEX_STAGE_3_BITS,
00221     INDEX_EXCEPTIONS
00222 };
00223 
00224 /* access values calculated from indexes in loadPropsData() */
00225 static uint16_t stage23Bits, stage2Mask, stage3Mask;
00226 
00227 static UBool
00228 isAcceptable(void *context,
00229              const char *type, const char *name,
00230              const UDataInfo *pInfo) {
00231     if(
00232         pInfo->size>=20 &&
00233         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
00234         pInfo->charsetFamily==U_CHARSET_FAMILY &&
00235         pInfo->dataFormat[0]==0x55 &&   /* dataFormat="UPro" */
00236         pInfo->dataFormat[1]==0x50 &&
00237         pInfo->dataFormat[2]==0x72 &&
00238         pInfo->dataFormat[3]==0x6f &&
00239         pInfo->formatVersion[0]==1
00240     ) {
00241         uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
00242         uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
00243         return TRUE;
00244     } else {
00245         return FALSE;
00246     }
00247 }
00248 
00249 static int8_t
00250 loadPropsData() {
00251     /* load Unicode character properties data from file if necessary */
00252     if(havePropsData==0) {
00253         UErrorCode errorCode=U_ZERO_ERROR;
00254         UDataMemory *data;
00255         const uint16_t *p=NULL;
00256 
00257         /* open the data outside the mutex block */
00258         data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
00259         if(U_FAILURE(errorCode)) {
00260             return havePropsData=-1;
00261         }
00262 
00263         p=(const uint16_t *)udata_getMemory(data);
00264 
00265         /* in the mutex block, set the data for this process */
00266         umtx_lock(NULL);
00267         if(propsData==NULL) {
00268             propsData=data;
00269             data=NULL;
00270             propsTable=p;
00271             p=NULL;
00272         }
00273         umtx_unlock(NULL);
00274 
00275         /* initialize some variables */
00276         uprv_memcpy(indexes, propsTable, 16);
00277         stage23Bits=(uint16_t)(indexes[INDEX_STAGE_2_BITS]+indexes[INDEX_STAGE_3_BITS]);
00278         stage2Mask=(uint16_t)((1<<indexes[INDEX_STAGE_2_BITS])-1);
00279         stage3Mask=(uint16_t)((1<<indexes[INDEX_STAGE_3_BITS])-1);
00280         havePropsData=1;
00281 
00282         /* if a different thread set it first, then close the extra data */
00283         if(data!=NULL) {
00284             udata_close(data); /* NULL if it was set correctly */
00285         }
00286     }
00287 
00288     return havePropsData;
00289 }
00290 
/* constants and macros for access to the data */

/*
 * Bit numbers in the first word of a character's exception data;
 * a set bit means that the corresponding value is present
 * (see HAVE_EXCEPTION_VALUE()).
 */
enum {
    EXC_UPPERCASE=0,
    EXC_LOWERCASE=1,
    EXC_TITLECASE=2,
    EXC_DIGIT_VALUE=3,
    EXC_NUMERIC_VALUE=4,
    EXC_DENOMINATOR_VALUE=5,

    EXC_MIRROR_MAPPING=6
};

/*
 * Bit positions in a 32-bit properties word:
 * the general category is in bits 4..0, bit EXCEPTION_SHIFT is the
 * exception flag, a 5-bit value starts at BIDI_SHIFT,
 * bit MIRROR_SHIFT is the mirroring flag,
 * and the value occupies the bits from VALUE_SHIFT up to bit 31.
 */
enum {
    EXCEPTION_SHIFT=5,
    BIDI_SHIFT=EXCEPTION_SHIFT+1,
    MIRROR_SHIFT=BIDI_SHIFT+5,
    VALUE_SHIFT=20,

    VALUE_BITS=32-VALUE_SHIFT
};
00311 
/* getting a uint32_t properties word from the data */

/* true if the properties data is loaded; tries to load it on first use */
#define HAVE_DATA (havePropsData>0 || (havePropsData==0 && loadPropsData()>0))

/* true if c is a code point (0..0x10ffff) and the properties data is available */
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)

/*
 * Yields the 32-bit properties word for c:
 * - from the loaded data via a three-stage lookup in propsTable[],
 *   where the stage 1 entries start at index 8 (after the index values);
 * - from staticProps32Table[] for c<=0x9f if the data is not available;
 * - otherwise 0, also for c outside 0..0x10ffff.
 * c is evaluated several times and must not have side effects.
 * Every use of c is parenthesized so that an expression argument
 * (for example a|b) is looked up as a whole.
 */
#define GET_PROPS(c) \
    (((uint32_t)(c))<=0x10ffff ? \
        HAVE_DATA ? \
            props32Table[ \
                propsTable[ \
                    propsTable[ \
                        propsTable[8+((c)>>stage23Bits)]+ \
                        (((c)>>indexes[INDEX_STAGE_3_BITS])&stage2Mask)]+ \
                    ((c)&stage3Mask)] \
            ] \
        : (c)<=0x9f ? \
            staticProps32Table[(c)] \
        : 0 \
    : 0)

/* nonzero if the value field of props is an offset to exception data */
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&(1UL<<EXCEPTION_SHIFT))
/* general category, bits 4..0 */
#define GET_CATEGORY(props) ((props)&0x1f)
/* value field (bits 31..VALUE_SHIFT) without sign extension */
#define GET_UNSIGNED_VALUE(props) ((props)>>VALUE_SHIFT)
/* value field as a signed number; shifts the word as an int32_t */
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>VALUE_SHIFT)
/* pointer to the exception data for props: exceptions start + unsigned value */
#define GET_EXCEPTIONS(props) (props32Table+indexes[INDEX_EXCEPTIONS]+GET_UNSIGNED_VALUE(props))
00334 
/* finding an exception value */

/* nonzero if bit number index is set in flags */
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))

/* number of flag bits that one flagsOffset[] lookup covers */
#define EXC_GROUP 8

/* flagsOffset[b] is the number of bits that are set in the 8-bit value b */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/*
 * Adds to offset the number of flags that are set below bit number index,
 * counting at most two groups of EXC_GROUP bits.
 * flags and index must be modifiable lvalues: if index>=EXC_GROUP then
 * flags is shifted right by EXC_GROUP and index is reduced by EXC_GROUP.
 * Wrapped in do { } while(0) so that the macro followed by a semicolon is
 * one statement and can be used in an unbraced if/else.
 */
#define ADD_EXCEPTION_OFFSET(flags, index, offset) do { \
    if((index)>=EXC_GROUP) { \
        (offset)+=flagsOffset[(flags)&((1<<EXC_GROUP)-1)]; \
        (flags)>>=EXC_GROUP; \
        (index)-=EXC_GROUP; \
    } \
    (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
} while(0)
00367 
00368 /* API functions ------------------------------------------------------------ */
00369 
00370 /* Gets the Unicode character's general category.*/
00371 U_CAPI int8_t U_EXPORT2
00372 u_charType(UChar32 c) {
00373     return (int8_t)GET_CATEGORY(GET_PROPS(c));
00374 }
00375 
00376 /* Checks if ch is a lower case letter.*/
00377 U_CAPI UBool U_EXPORT2
00378 u_islower(UChar32 c) {
00379     return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_LOWERCASE_LETTER);
00380 }
00381 
00382 /* Checks if ch is an upper case letter.*/
00383 U_CAPI UBool U_EXPORT2
00384 u_isupper(UChar32 c) {
00385     return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_UPPERCASE_LETTER);
00386 }
00387 
00388 /* Checks if ch is a title case letter; usually upper case letters.*/
00389 U_CAPI UBool U_EXPORT2
00390 u_istitle(UChar32 c) {
00391     return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_TITLECASE_LETTER);
00392 }
00393 
00394 /* Checks if ch is a decimal digit. */
00395 U_CAPI UBool U_EXPORT2
00396 u_isdigit(UChar32 c) {
00397     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00398             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER)
00399            )!=0);
00400 }
00401 
00402 /* Checks if the Unicode character is a letter.*/
00403 U_CAPI UBool U_EXPORT2
00404 u_isalpha(UChar32 c) {
00405     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00406             (1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
00407            )!=0);
00408 }
00409 
00410 /* Checks if ch is a letter or a decimal digit */
00411 U_CAPI UBool U_EXPORT2
00412 u_isalnum(UChar32 c) {
00413     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00414             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
00415              1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
00416            )!=0);
00417 }
00418 
00419 /* Checks if ch is a unicode character with assigned character type.*/
00420 U_CAPI UBool U_EXPORT2
00421 u_isdefined(UChar32 c) {
00422     return (UBool)(GET_PROPS(c)!=0);
00423 }
00424 
00425 /* Checks if the Unicode character is a base form character that can take a diacritic.*/
00426 U_CAPI UBool U_EXPORT2
00427 u_isbase(UChar32 c) {
00428     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00429             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
00430              1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
00431              1UL<<U_NON_SPACING_MARK|1UL<<U_ENCLOSING_MARK|1UL<<U_COMBINING_SPACING_MARK)
00432            )!=0);
00433 }
00434 
00435 /* Checks if the Unicode character is a control character.*/
00436 U_CAPI UBool U_EXPORT2
00437 u_iscntrl(UChar32 c) {
00438     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00439             (1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
00440            )!=0);
00441 }
00442 
00443 /* Checks if the Unicode character is a space character.*/
00444 UBool
00445 u_isspace(UChar32 c) {
00446     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00447             (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
00448            )!=0);
00449 }
00450 
00451 /* Checks if the Unicode character is a whitespace character.*/
00452 U_CAPI UBool U_EXPORT2
00453 u_isWhitespace(UChar32 c) {
00454     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00455             (1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
00456            )!=0 &&
00457            c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
00458 }
00459 
00460 /* Checks if the Unicode character is printable.*/
00461 U_CAPI UBool U_EXPORT2
00462 u_isprint(UChar32 c) {    
00463     return (UBool)(
00464             ((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00465             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
00466              1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
00467              1UL<<U_NON_SPACING_MARK|1UL<<U_ENCLOSING_MARK|1UL<<U_COMBINING_SPACING_MARK|
00468              1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR|
00469              1UL<<U_DASH_PUNCTUATION|1UL<<U_START_PUNCTUATION|1UL<<U_END_PUNCTUATION|1UL<<U_CONNECTOR_PUNCTUATION|1UL<<U_OTHER_PUNCTUATION|
00470              1UL<<U_MATH_SYMBOL|1UL<<U_CURRENCY_SYMBOL|1UL<<U_MODIFIER_SYMBOL|1UL<<U_OTHER_SYMBOL)
00471            )!=0);
00472 }
00473 
00474 /* Checks if the Unicode character can start a Unicode identifier.*/
00475 U_CAPI UBool U_EXPORT2
00476 u_isIDStart(UChar32 c) {
00477     /* same as u_isalpha() */
00478     return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00479             (1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
00480            )!=0);
00481 }
00482 
00483 /* Checks if the Unicode character can be a Unicode identifier part other than starting the
00484  identifier.*/
00485 U_CAPI UBool U_EXPORT2
00486 u_isIDPart(UChar32 c) {
00487     return (UBool)(
00488            ((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00489             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
00490              1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
00491              1UL<<U_CONNECTOR_PUNCTUATION|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_NON_SPACING_MARK)
00492            )!=0 ||
00493            u_isIDIgnorable(c));
00494 }
00495 
00496 /*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/
00497 U_CAPI UBool U_EXPORT2
00498 u_isIDIgnorable(UChar32 c) {
00499     return (UBool)((uint32_t)c<=8 ||
00500            (uint32_t)(c-0xe)<=(0x1b-0xe) ||
00501            (uint32_t)(c-0x7f)<=(0x9f-0x7f) ||
00502            (uint32_t)(c-0x200a)<=(0x200f-0x200a) ||
00503            (uint32_t)(c-0x206a)<=(0x206f-0x206a) ||
00504            c==0xfeff);
00505 }
00506 
00507 /*Checks if the Unicode character can start a Java identifier.*/
00508 U_CAPI UBool U_EXPORT2
00509 u_isJavaIDStart(UChar32 c) {
00510     return (UBool)(
00511            ((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00512             (1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
00513              1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION)
00514            )!=0);
00515 }
00516 
00517 /*Checks if the Unicode character can be a Java identifier part other than starting the
00518  * identifier.
00519  */
00520 U_CAPI UBool U_EXPORT2
00521 u_isJavaIDPart(UChar32 c) {
00522     return (UBool)(
00523            ((1UL<<GET_CATEGORY(GET_PROPS(c)))&
00524             (1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
00525              1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
00526              1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION|
00527              1UL<<U_COMBINING_SPACING_MARK|1UL<<U_NON_SPACING_MARK)
00528            )!=0 ||
00529            u_isIDIgnorable(c));
00530 }
00531 
00532 /* Transforms the Unicode character to its lower case equivalent.*/
00533 U_CAPI UChar32 U_EXPORT2
00534 u_tolower(UChar32 c) {
00535     uint32_t props=GET_PROPS(c);
00536     if(!PROPS_VALUE_IS_EXCEPTION(props)) {
00537         if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
00538             return c+GET_SIGNED_VALUE(props);
00539         }
00540     } else {
00541         uint32_t *pe=GET_EXCEPTIONS(props);
00542         uint32_t firstExceptionValue=*pe;
00543         if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
00544             int i=EXC_LOWERCASE;
00545             ++pe;
00546             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00547             return (UChar32)*pe;
00548         }
00549     }
00550     return c; /* no mapping - return c itself */
00551 }
00552     
00553 /* Transforms the Unicode character to its upper case equivalent.*/
00554 U_CAPI UChar32 U_EXPORT2
00555 u_toupper(UChar32 c) {
00556     uint32_t props=GET_PROPS(c);
00557     if(!PROPS_VALUE_IS_EXCEPTION(props)) {
00558         if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
00559             return c-GET_SIGNED_VALUE(props);
00560         }
00561     } else {
00562         uint32_t *pe=GET_EXCEPTIONS(props);
00563         uint32_t firstExceptionValue=*pe;
00564         if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
00565             int i=EXC_UPPERCASE;
00566             ++pe;
00567             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00568             return (UChar32)*pe;
00569         }
00570     }
00571     return c; /* no mapping - return c itself */
00572 }
00573 
00574 /* Transforms the Unicode character to its title case equivalent.*/
00575 U_CAPI UChar32 U_EXPORT2
00576 u_totitle(UChar32 c) {
00577     uint32_t props=GET_PROPS(c);
00578     if(!PROPS_VALUE_IS_EXCEPTION(props)) {
00579         if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
00580             /* here, titlecase is same as uppercase */
00581             return c-GET_SIGNED_VALUE(props);
00582         }
00583     } else {
00584         uint32_t *pe=GET_EXCEPTIONS(props);
00585         uint32_t firstExceptionValue=*pe;
00586         if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_TITLECASE)) {
00587             int i=EXC_TITLECASE;
00588             ++pe;
00589             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00590             return (UChar32)*pe;
00591         } else if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
00592             /* here, titlecase is same as uppercase */
00593             int i=EXC_UPPERCASE;
00594             ++pe;
00595             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00596             return (UChar32)*pe;
00597         }
00598     }
00599     return c; /* no mapping - return c itself */
00600 }
00601 
00602 U_CAPI int32_t U_EXPORT2
00603 u_charDigitValue(UChar32 c) {
00604     uint32_t props=GET_PROPS(c);
00605     if(!PROPS_VALUE_IS_EXCEPTION(props)) {
00606         if(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER) {
00607             return GET_SIGNED_VALUE(props);
00608         }
00609     } else {
00610         uint32_t *pe=GET_EXCEPTIONS(props);
00611         uint32_t firstExceptionValue=*pe;
00612         if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DIGIT_VALUE)) {
00613             int32_t value;
00614             int i=EXC_DIGIT_VALUE;
00615             ++pe;
00616             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00617             value=(int32_t)(int16_t)*pe; /* the digit value is in bits 15..0 */
00618             if(value!=-1) {
00619                 return value;
00620             }
00621         }
00622     }
00623 
00624     /* if there is no value in the properties table, then check for some special characters */
00625     switch(c) {
00626     case 0x3007:    return 0; /* Han Zero*/
00627     case 0x4e00:    return 1; /* Han One*/
00628     case 0x4e8c:    return 2; /* Han Two*/
00629     case 0x4e09:    return 3; /* Han Three*/
00630     case 0x56d8:    return 4; /* Han Four*/
00631     case 0x4e94:    return 5; /* Han Five*/
00632     case 0x516d:    return 6; /* Han Six*/
00633     case 0x4e03:    return 7; /* Han Seven*/
00634     case 0x516b:    return 8; /* Han Eight*/
00635     case 0x4e5d:    return 9; /* Han Nine*/
00636     default:        return -1; /* no value */
00637     }
00638 }
00639 
00640 /* Gets the character's linguistic directionality.*/
00641 U_CAPI UCharDirection U_EXPORT2
00642 u_charDirection(UChar32 c) {   
00643     uint32_t props=GET_PROPS(c);
00644     if(props!=0) {
00645         return (UCharDirection)((props>>BIDI_SHIFT)&0x1f);
00646     } else {
00647         return U_BOUNDARY_NEUTRAL;
00648     }
00649 }
00650 
00651 U_CAPI UBool U_EXPORT2
00652 u_isMirrored(UChar32 c) {
00653     return (UBool)(GET_PROPS(c)&(1UL<<MIRROR_SHIFT) ? TRUE : FALSE);
00654 }
00655 
00656 U_CAPI UChar32 U_EXPORT2
00657 u_charMirror(UChar32 c) {
00658     uint32_t props=GET_PROPS(c);
00659     if((props&(1UL<<MIRROR_SHIFT))==0) {
00660         /* not mirrored - the value is not a mirror offset */
00661         return c;
00662     } else if(!PROPS_VALUE_IS_EXCEPTION(props)) {
00663         return c+GET_SIGNED_VALUE(props);
00664     } else {
00665         uint32_t *pe=GET_EXCEPTIONS(props);
00666         uint32_t firstExceptionValue=*pe;
00667         if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_MIRROR_MAPPING)) {
00668             int i=EXC_MIRROR_MAPPING;
00669             ++pe;
00670             ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
00671             return (UChar32)*pe;
00672         } else {
00673             return c;
00674         }
00675     }
00676 }
00677 
00678 /* static data tables ------------------------------------------------------- */
00679 
00680 struct BlockScriptMap {
00681     UChar        fFirstCode;
00682     UChar        fLastCode;
00683 };
00684 typedef struct BlockScriptMap BlockScriptMap;
00685 
00686 static const BlockScriptMap fScriptIndex[] = {
00687 /* Generated from the Unicode-3.0-beta blocks.txt file.
       * Each entry is the inclusive code range of one block.
       * u_charScript() searches the entries in order and uses the position of
       * the matching entry as the UCharScript value, so the order of the
       * entries must not change.
       * An entry with fFirstCode==0xFFFF ends the list.
       */
00688   { 0x0000, 0x007F }, /*BASIC_LATIN */
00689   { 0x0080, 0x00FF }, /*LATIN_1_SUPPLEMENT */
00690   { 0x0100, 0x017F }, /*LATIN_EXTENDED_A */
00691   { 0x0180, 0x024F }, /*LATIN_EXTENDED_B */
00692   { 0x0250, 0x02AF }, /*IPA_EXTENSIONS */
00693   { 0x02B0, 0x02FF }, /*SPACING_MODIFIER_LETTERS */
00694   { 0x0300, 0x036F }, /*COMBINING_DIACRITICAL_MARKS */
00695   { 0x0370, 0x03FF }, /*GREEK */
00696   { 0x0400, 0x04FF }, /*CYRILLIC */
00697   { 0x0530, 0x058F }, /*ARMENIAN */
00698   { 0x0590, 0x05FF }, /*HEBREW */
00699   { 0x0600, 0x06FF }, /*ARABIC */
00700   { 0x0700, 0x074F }, /*SYRIAC */
00701   { 0x0780, 0x07BF }, /*THAANA */
00702   { 0x0900, 0x097F }, /*DEVANAGARI */
00703   { 0x0980, 0x09FF }, /*BENGALI */
00704   { 0x0A00, 0x0A7F }, /*GURMUKHI */
00705   { 0x0A80, 0x0AFF }, /*GUJARATI */
00706   { 0x0B00, 0x0B7F }, /*ORIYA */
00707   { 0x0B80, 0x0BFF }, /*TAMIL */
00708   { 0x0C00, 0x0C7F }, /*TELUGU */
00709   { 0x0C80, 0x0CFF }, /*KANNADA */
00710   { 0x0D00, 0x0D7F }, /*MALAYALAM */
00711   { 0x0D80, 0x0DFF }, /*SINHALA */
00712   { 0x0E00, 0x0E7F }, /*THAI */
00713   { 0x0E80, 0x0EFF }, /*LAO */
00714   { 0x0F00, 0x0FFF }, /*TIBETAN */
00715   { 0x1000, 0x109F }, /*MYANMAR */
00716   { 0x10A0, 0x10FF }, /*GEORGIAN */
00717   { 0x1100, 0x11FF }, /*HANGUL_JAMO */
00718   { 0x1200, 0x137F }, /*ETHIOPIC */
00719   { 0x13A0, 0x13FF }, /*CHEROKEE */
00720   { 0x1400, 0x167F }, /*UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS */
00721   { 0x1680, 0x169F }, /*OGHAM */
00722   { 0x16A0, 0x16FF }, /*RUNIC */
00723   { 0x1780, 0x17FF }, /*KHMER */
00724   { 0x1800, 0x18AF }, /*MONGOLIAN */
00725   { 0x1E00, 0x1EFF }, /*LATIN_EXTENDED_ADDITIONAL */
00726   { 0x1F00, 0x1FFF }, /*GREEK_EXTENDED */
00727   { 0x2000, 0x206F }, /*GENERAL_PUNCTUATION */
00728   { 0x2070, 0x209F }, /*SUPERSCRIPTS_AND_SUBSCRIPTS */
00729   { 0x20A0, 0x20CF }, /*CURRENCY_SYMBOLS */
00730   { 0x20D0, 0x20FF }, /*COMBINING_MARKS_FOR_SYMBOLS */
00731   { 0x2100, 0x214F }, /*LETTERLIKE_SYMBOLS */
00732   { 0x2150, 0x218F }, /*NUMBER_FORMS */
00733   { 0x2190, 0x21FF }, /*ARROWS */
00734   { 0x2200, 0x22FF }, /*MATHEMATICAL_OPERATORS */
00735   { 0x2300, 0x23FF }, /*MISCELLANEOUS_TECHNICAL */
00736   { 0x2400, 0x243F }, /*CONTROL_PICTURES */
00737   { 0x2440, 0x245F }, /*OPTICAL_CHARACTER_RECOGNITION */
00738   { 0x2460, 0x24FF }, /*ENCLOSED_ALPHANUMERICS */
00739   { 0x2500, 0x257F }, /*BOX_DRAWING */
00740   { 0x2580, 0x259F }, /*BLOCK_ELEMENTS */
00741   { 0x25A0, 0x25FF }, /*GEOMETRIC_SHAPES */
00742   { 0x2600, 0x26FF }, /*MISCELLANEOUS_SYMBOLS */
00743   { 0x2700, 0x27BF }, /*DINGBATS */
00744   { 0x2800, 0x28FF }, /*BRAILLE_PATTERNS */
00745   { 0x2E80, 0x2EFF }, /*CJK_RADICALS_SUPPLEMENT */
00746   { 0x2F00, 0x2FDF }, /*KANGXI_RADICALS */
00747   { 0x2FF0, 0x2FFF }, /*IDEOGRAPHIC_DESCRIPTION_CHARACTERS */
00748   { 0x3000, 0x303F }, /*CJK_SYMBOLS_AND_PUNCTUATION */
00749   { 0x3040, 0x309F }, /*HIRAGANA */
00750   { 0x30A0, 0x30FF }, /*KATAKANA */
00751   { 0x3100, 0x312F }, /*BOPOMOFO */
00752   { 0x3130, 0x318F }, /*HANGUL_COMPATIBILITY_JAMO */
00753   { 0x3190, 0x319F }, /*KANBUN */
00754   { 0x31A0, 0x31BF }, /*BOPOMOFO_EXTENDED */
00755   { 0x3200, 0x32FF }, /*ENCLOSED_CJK_LETTERS_AND_MONTHS */
00756   { 0x3300, 0x33FF }, /*CJK_COMPATIBILITY */
00757   { 0x3400, 0x4DB5 }, /*CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A */
00758   { 0x4E00, 0x9FFF }, /*CJK_UNIFIED_IDEOGRAPHS */
00759   { 0xA000, 0xA48F }, /*YI_SYLLABLES */
00760   { 0xA490, 0xA4CF }, /*YI_RADICALS */
00761   { 0xAC00, 0xD7A3 }, /*HANGUL_SYLLABLES */
00762   { 0xD800, 0xDB7F }, /*HIGH_SURROGATES */
00763   { 0xDB80, 0xDBFF }, /*HIGH_PRIVATE_USE_SURROGATES */
00764   { 0xDC00, 0xDFFF }, /*LOW_SURROGATES */
00765   { 0xE000, 0xF8FF }, /*PRIVATE_USE */
00766   { 0xF900, 0xFAFF }, /*CJK_COMPATIBILITY_IDEOGRAPHS */
00767   { 0xFB00, 0xFB4F }, /*ALPHABETIC_PRESENTATION_FORMS */
00768   { 0xFB50, 0xFDFF }, /*ARABIC_PRESENTATION_FORMS_A */
00769   { 0xFE20, 0xFE2F }, /*COMBINING_HALF_MARKS */
00770   { 0xFE30, 0xFE4F }, /*CJK_COMPATIBILITY_FORMS */
00771   { 0xFE50, 0xFE6F }, /*SMALL_FORM_VARIANTS */
00772   { 0xFE70, 0xFEFE }, /*ARABIC_PRESENTATION_FORMS_B */
00773   { 0xFEFF, 0xFEFF }, /*U_SPECIALS */
00774   { 0xFF00, 0xFFEF }, /*HALFWIDTH_AND_FULLWIDTH_FORMS */
00775   { 0xFFF0, 0xFFFD }, /*SPECIALS_2 = "U_CHAR_SCRIPT_COUNT" (really specials); u_charScript() reports this entry as U_SPECIALS */
00776   { 0xFFFF, 0xFFFF } /* END: list terminator, never matched */
00777 };
00778 
00779 const UChar cellWidthRanges[] = /* first code of each cell width range, in ascending order; NOTE(review): presumably each range extends up to the next entry - confirm in u_charCellWidth() */
00780     {
00781         0x0000, /* general scripts area*/
00782         0x1100, /* combining Hangul choseong*/
00783         0x1160, /* combining Hangul jungseong and jongseong*/
00784         0x1e00, /* Latin Extended Additional, Greek Extended*/
00785         0x2000, /* symbols and punctuation*/
00786         0x3000, /* CJK phonetics & symbols, CJK ideographs, Hangul syllables*/
00787         0xd800, /* surrogates, private use*/
00788         0xf900, /* CJK compatibility ideographs*/
00789         0xfb00, /* alphabetic presentation forms, Arabic presentation forms A, combining half marks*/
00790         0xfe30, /* CJK compatibility forms, small form variants*/
00791         0xfe70, /* Arabic presentation forms B*/
00792         0xff00, /* fullwidth ASCII*/
00793         0xff60, /* halfwidth CJK punctuation, Katakana, Hangul Jamo*/
00794         0xffe0, /* fullwidth punctuation and currency signs*/
00795         0xffe8, /* halfwidth forms, arrows, and shapes*/
00796         0xfff0  /* specials*/
00797     };
00798 
00799 const UChar cellWidthValues[] = /* cell width per range; the entries are listed in the same order, with the same range descriptions, as cellWidthRanges[] */
00800     {
00801         U_HALF_WIDTH,    /* general scripts area*/
00802         U_FULL_WIDTH,    /* combining Hangul choseong*/
00803         U_ZERO_WIDTH,    /* combining Hangul jungseong and jongseong*/
00804         U_HALF_WIDTH,    /* Latin Extended Additional, Greek Extended*/
00805         U_NEUTRAL_WIDTH, /* symbols and punctuation*/
00806         U_FULL_WIDTH,    /* CJK phonetics & symbols, CJK ideographs, Hangul syllables*/
00807         U_NEUTRAL_WIDTH, /* surrogates, private use*/
00808         U_FULL_WIDTH,    /* CJK compatibility ideographs*/
00809         U_HALF_WIDTH,    /* alphabetic presentation forms, Arabic presentation forms A, combining half marks*/
00810         U_FULL_WIDTH,    /* CJK compatibility forms, small form variants*/
00811         U_HALF_WIDTH,    /* Arabic presentation forms B*/
00812         U_FULL_WIDTH,    /* fullwidth ASCII*/
00813         U_HALF_WIDTH,    /* halfwidth CJK punctuation, Katakana, Hangul Jamo*/
00814         U_FULL_WIDTH,    /* fullwidth punctuation and currency signs*/
00815         U_HALF_WIDTH,    /* halfwidth forms, arrows, and shapes*/
00816         U_ZERO_WIDTH     /* specials*/
00817     };
00818 
00819 const int16_t numCellWidthValues = 16;
00820 
00821 /* Get the script associated with the character*/
00822 UCharScript
00823 u_charScript(UChar32 ch)
00824 {
00825     int32_t i, j;
00826     UCharScript returnValue = U_NO_SCRIPT;
00827 
00828     /* surrogate support is still incomplete */
00829     if((uint32_t)ch>0xffff) {
00830         return U_NO_SCRIPT;
00831     }
00832 
00833     /* ### a binary search would be faster; maybe this should go into a data file, too */
00834     i = -1;
00835     for( j = 0; i == -1 && fScriptIndex[j].fFirstCode != 0xFFFF; ++j )
00836         if( fScriptIndex[j].fFirstCode <= ch && ch <= fScriptIndex[j].fLastCode ) {
00837             i = j;
00838             if(j == U_CHAR_SCRIPT_COUNT) /* "U_SPECIALS 2" */
00839               i = U_SPECIALS;
00840         }
00841     if(i >= U_CHAR_SCRIPT_COUNT) {
00842         returnValue = U_NO_SCRIPT;
00843     }
00844     else if( i != -1 ) {
00845         returnValue = (UCharScript)i;
00846     } 
00847 
00848     return returnValue;
00849 }
00850 
00851 /* Gets table cell width of the Unicode character.*/
00852 uint16_t
00853 u_charCellWidth(UChar32 ch)
00854 {
00855     int16_t i;
00856     int32_t type = u_charType(ch);
00857 
00858     /* surrogate support is still incomplete */
00859     if((uint32_t)ch>0xffff) {
00860         return U_ZERO_WIDTH;
00861     }
00862 
00863     /* these Unicode character types are scattered throughout the Unicode range, so
00864      special-case for them*/
00865     switch (type) {
00866         case U_UNASSIGNED:
00867         case U_NON_SPACING_MARK:
00868         case U_ENCLOSING_MARK:
00869         case U_LINE_SEPARATOR:
00870         case U_PARAGRAPH_SEPARATOR:
00871         case U_CONTROL_CHAR:
00872         case U_FORMAT_CHAR:
00873             return U_ZERO_WIDTH;
00874 
00875         default:
00876             /* for all remaining characters, find out which Unicode range they belong to using
00877                the table above, and then look up the appropriate return value in that table*/
00878             for (i = 0; i < numCellWidthValues; ++i)
00879                 if (ch < cellWidthRanges[i])
00880                     break;
00881             --i;
00882             return cellWidthValues[i];
00883     }
00884 }
00885 
00886 void u_getUnicodeVersion(UVersionInfo versionArray) {
00887     if(versionArray!=NULL) {
00888         uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH);
00889     }
00890 }
00891 
00892 /* string casing ------------------------------------------------------------ */
00893 
00894 U_CAPI int32_t U_EXPORT2
00895 u_strToUpper(const UChar *src, int32_t srcLength,
00896              UChar *dest, int32_t destCapacity,
00897              const char *locale,
00898              UErrorCode *pErrorCode) {
00899     /* ### TODO for ICU 1.8 */
00900     *pErrorCode=U_UNSUPPORTED_ERROR;
00901     return 0;
00902 }
00903 
00904 U_CAPI int32_t U_EXPORT2
00905 u_strToLower(const UChar *src, int32_t srcLength,
00906              UChar *dest, int32_t destCapacity,
00907              const char *locale,
00908              UErrorCode *pErrorCode) {
00909     /* ### TODO for ICU 1.8 */
00910     *pErrorCode=U_UNSUPPORTED_ERROR;
00911     return 0;
00912 }

Generated at Tue Dec 5 10:47:56 2000 for ICU by doxygen 1.2.3, written by Dimitri van Heesch, © 1997-2000