Main Page   Class Hierarchy   Compound List   File List   Header Files   Sources   Compound Members   File Members  

unicode.h

00001 /*
00002 *****************************************************************************************
00003 *   Copyright (C) 1996-1999, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *****************************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu          Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //********************************************************************************************
00024    
00025          
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00046 class U_COMMON_API Unicode
00047 {
00048 public:
00049     /*
00050      * In C++, static const members actually take up memory and need to be accessed.
00051      * enum values are more like C #define's.
00052      * The following is a collection of constants, not an enumeration type.
00053      */
00054     enum {
00056         MIN_VALUE=0,
00057 
00063         MAX_VALUE=0x10ffff,
00064 
00072         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00073 
00084         MIN_RADIX=2,
00085 
00096         MAX_RADIX=36
00097     };
00098 
00103     enum EUnicodeGeneralTypes
00104     {
00105         UNASSIGNED              = 0,
00106         UPPERCASE_LETTER        = 1,
00107         LOWERCASE_LETTER        = 2,
00108         TITLECASE_LETTER        = 3,
00109         MODIFIER_LETTER         = 4,
00110         OTHER_LETTER            = 5,
00111         NON_SPACING_MARK        = 6,
00112         ENCLOSING_MARK          = 7,
00113         COMBINING_SPACING_MARK  = 8,
00114         DECIMAL_DIGIT_NUMBER    = 9,
00115         LETTER_NUMBER           = 10,
00116         OTHER_NUMBER            = 11,
00117         SPACE_SEPARATOR         = 12,
00118         LINE_SEPARATOR          = 13,
00119         PARAGRAPH_SEPARATOR     = 14,
00120         CONTROL                 = 15,
00121         FORMAT                  = 16,
00122         PRIVATE_USE             = 17,
00123         SURROGATE               = 18,
00124         DASH_PUNCTUATION        = 19,
00125         START_PUNCTUATION       = 20,
00126         END_PUNCTUATION         = 21,
00127                 CONNECTOR_PUNCTUATION   = 22,
00128         OTHER_PUNCTUATION       = 23,
00129         MATH_SYMBOL             = 24,
00130         CURRENCY_SYMBOL         = 25,
00131         MODIFIER_SYMBOL         = 26,
00132         OTHER_SYMBOL            = 27,
00133                 INITIAL_PUNCTUATION     = 28,
00134                 FINAL_PUNCTUATION       = 29,
00135         GENERAL_TYPES_COUNT     = 30
00136     };
00137 
00138     enum EUnicodeScript 
00139     {
00140         kBasicLatin,
00141         kLatin1Supplement,
00142         kLatinExtendedA,
00143         kLatinExtendedB,
00144         kIPAExtension,
00145         kSpacingModifier,
00146         kCombiningDiacritical,
00147         kGreek,
00148         kCyrillic,
00149         kArmenian,
00150         kHebrew,
00151         kArabic,
00152         kDevanagari,
00153         kBengali,
00154         kGurmukhi,
00155         kGujarati,
00156         kOriya,
00157         kTamil,
00158         kTelugu,
00159         kKannada,
00160         kMalayalam,
00161         kThai,
00162         kLao,
00163         kTibetan,
00164         kGeorgian,
00165         kHangulJamo,
00166         kLatinExtendedAdditional,
00167         kGreekExtended,
00168         kGeneralPunctuation,
00169         kSuperSubScript,
00170         kCurrencySymbolScript,
00171         kSymbolCombiningMark,
00172         kLetterlikeSymbol,
00173         kNumberForm,
00174         kArrow,
00175         kMathOperator,
00176         kMiscTechnical,
00177         kControlPicture,
00178         kOpticalCharacter,
00179         kEnclosedAlphanumeric,
00180         kBoxDrawing,
00181         kBlockElement,
00182         kGeometricShape,
00183         kMiscSymbol,
00184         kDingbat,
00185         kCJKSymbolPunctuation,
00186         kHiragana,
00187         kKatakana,
00188         kBopomofo,
00189         kHangulCompatibilityJamo,
00190         kKanbun,
00191         kEnclosedCJKLetterMonth,
00192         kCJKCompatibility,
00193         kCJKUnifiedIdeograph,
00194         kHangulSyllable,
00195         kHighSurrogate,
00196         kHighPrivateUseSurrogate,
00197         kLowSurrogate,
00198         kPrivateUse,
00199         kCJKCompatibilityIdeograph,
00200         kAlphabeticPresentation,
00201         kArabicPresentationA,
00202         kCombiningHalfMark,
00203         kCJKCompatibilityForm,
00204         kSmallFormVariant,
00205         kArabicPresentationB,
00206         kNoScript,
00207         kHalfwidthFullwidthForm,
00208         kScriptCount
00209     };
00210 
00214     enum EDirectionProperty { 
00215         LEFT_TO_RIGHT               = 0, 
00216                 RIGHT_TO_LEFT               = 1, 
00217                 EUROPEAN_NUMBER             = 2,
00218                 EUROPEAN_NUMBER_SEPARATOR   = 3,
00219                 EUROPEAN_NUMBER_TERMINATOR  = 4,
00220                 ARABIC_NUMBER               = 5,
00221                 COMMON_NUMBER_SEPARATOR     = 6,
00222                 BLOCK_SEPARATOR             = 7,
00223                 SEGMENT_SEPARATOR           = 8,
00224                 WHITE_SPACE_NEUTRAL         = 9, 
00225                 OTHER_NEUTRAL               = 10, 
00226                 LEFT_TO_RIGHT_EMBEDDING     = 11,
00227                 LEFT_TO_RIGHT_OVERRIDE      = 12,
00228                 RIGHT_TO_LEFT_ARABIC        = 13,
00229                 RIGHT_TO_LEFT_EMBEDDING     = 14,
00230                 RIGHT_TO_LEFT_OVERRIDE      = 15,
00231                 POP_DIRECTIONAL_FORMAT      = 16,
00232                 DIR_NON_SPACING_MARK        = 17,
00233                 BOUNDARY_NEUTRAL            = 18
00234     };
00235     
00240     enum ECellWidths
00241     {
00242         ZERO_WIDTH              = 0,
00243         HALF_WIDTH              = 1,
00244         FULL_WIDTH              = 2,
00245         NEUTRAL                 = 3
00246     };
00247 
00257     static inline UBool isSingle(UChar c);
00258 
00266     static inline UBool isLead(UChar c);
00267 
00275     static inline UBool isTrail(UChar c);
00276 
00286     static inline UBool isSurrogate(UChar32 c);
00287 
00299     static inline UBool isUnicodeChar(UChar32 c);
00300 
00311     static inline UBool isError(UChar32 c);
00312 
00321     static inline UBool isValid(UChar32 c);
00322 
00333     static inline UBool needMultipleUChar(UChar32 c);
00334 
00342     static inline int32_t charLength(UChar32 c);
00343 
00356     static inline int32_t arraySize(int32_t size);
00357 
00370     static inline UBool isLowerCase(UChar32 ch);
00371 
00383     static inline UBool isUpperCase(UChar32 ch);
00384 
00396     static inline UBool isTitleCase(UChar32 ch);
00397 
00409     static inline UBool isDigit(UChar32 ch);
00410 
00426     static inline UBool isDefined(UChar32 ch);
00427 
00438     static inline UBool isControl(UChar32 ch);
00439 
00450     static inline UBool isPrintable(UChar32 ch);
00451 
00463      static inline UBool isBaseForm(UChar32 ch);
00464 
00480     static inline UBool isLetter(UChar32 ch);
00481 
00502     static inline UBool isJavaIdentifierStart(UChar32 ch);
00503 
00532     static inline UBool isJavaIdentifierPart(UChar32 ch);
00533 
00548     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00549 
00576     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00577 
00603     static inline UBool isIdentifierIgnorable(UChar32 ch);
00604 
00629    static inline UChar32 toLowerCase(UChar32 ch); 
00630 
00652     static inline UChar32 toUpperCase(UChar32 ch);
00653 
00671     static inline UChar32 toTitleCase(UChar32 ch);
00672 
00681     static inline UBool isSpaceChar(UChar32 ch);
00682 
00711     static inline UBool isWhitespace(UChar32 ch);
00712 
00747     static inline int8_t getType(UChar32 ch);
00748 
00758     static inline EDirectionProperty characterDirection(UChar32 ch);
00759 
00769     static inline UBool isMirrored(UChar32 c);
00770 
00786     static inline UChar32 charMirror(UChar32 c);
00787 
00793     static inline EUnicodeScript getScript(UChar32 ch);
00794 
00846     static inline uint16_t getCellWidth(UChar32 ch);
00847 
00875     static inline UTextOffset
00876     getCharName(uint32_t code,
00877                 char *buffer, UTextOffset bufferLength,
00878                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00879 
00890     static inline int32_t digitValue(UChar32 ch);     
00891 
00929     static inline int8_t digit(UChar32 ch, int8_t radix);
00930         
00958     static inline UChar32 forDigit(int32_t digit, int8_t radix);
00959 
00965         static void getUnicodeVersion(UVersionInfo info);
00966 
00967 protected:
00968     // These constructors, destructor, and assignment operator must
00969     // be protected (not private, as they semantically are) to make
00970     // various UNIX compilers happy. [LIU]
00971     // They should be private to prevent anyone from instantiating or
00972     // subclassing Unicode.
00973     Unicode();
00974     Unicode(const Unicode &other);
00975     ~Unicode();
00976     const Unicode &operator=(const Unicode &other);
00977 };
00978 
00979 /* inline implementations --------------------------------------------------- */
00980 
00981 inline UBool
00982 Unicode::isSingle(UChar c) {
00983     return UTF_IS_SINGLE(c);
00984 }
00985 
00986 inline UBool
00987 Unicode::isLead(UChar c) {
00988     return UTF_IS_LEAD(c);
00989 }
00990 
00991 inline UBool
00992 Unicode::isTrail(UChar c) {
00993     return UTF_IS_TRAIL(c);
00994 }
00995 
00996 inline UBool
00997 Unicode::isSurrogate(UChar32 c) {
00998     return UTF_IS_SURROGATE(c);
00999 }
01000 
01001 inline UBool
01002 Unicode::isUnicodeChar(UChar32 c) {
01003     return UTF_IS_UNICODE_CHAR(c);
01004 }
01005 
01006 inline UBool
01007 Unicode::isError(UChar32 c) {
01008     return UTF_IS_ERROR(c);
01009 }
01010 
01011 inline UBool
01012 Unicode::isValid(UChar32 c) {
01013     return UTF_IS_VALID(c);
01014 }
01015 
01016 inline UBool
01017 Unicode::needMultipleUChar(UChar32 c) {
01018     return UTF_NEED_MULTIPLE_UCHAR(c);
01019 }
01020 
01021 inline int32_t
01022 Unicode::charLength(UChar32 c) {
01023     return UTF_CHAR_LENGTH(c);
01024 }
01025 
01026 inline int32_t
01027 Unicode::arraySize(int32_t size) {
01028     return UTF_ARRAY_SIZE(size);
01029 }
01030 
01031 // Checks if ch is a lower case letter.
01032 inline UBool
01033 Unicode::isLowerCase(UChar32 ch) {
01034     return u_islower(ch);
01035 }
01036 
01037 // Checks if ch is a upper case letter.
01038 inline UBool
01039 Unicode::isUpperCase(UChar32 ch) {
01040     return u_isupper(ch);
01041 }
01042 
01043 // Checks if ch is a title case letter; usually upper case letters.
01044 inline UBool
01045 Unicode::isTitleCase(UChar32 ch) {
01046     return u_istitle(ch);
01047 }
01048 
01049 // Checks if ch is a decimal digit.
01050 inline UBool
01051 Unicode::isDigit(UChar32 ch) {
01052     return u_isdigit(ch);
01053 }
01054 
01055 // Checks if ch is a unicode character with assigned character type.
01056 inline UBool
01057 Unicode::isDefined(UChar32 ch) {
01058     return u_isdefined(ch);
01059 }
01060 
01061 // Checks if the Unicode character is a control character.
01062 inline UBool
01063 Unicode::isControl(UChar32 ch) {
01064     return u_iscntrl(ch);
01065 }
01066 
01067 // Checks if the Unicode character is printable.
01068 inline UBool
01069 Unicode::isPrintable(UChar32 ch) {
01070     return u_isprint(ch);
01071 }
01072 
01073 // Checks if the Unicode character is a base form character that can take a diacritic.
01074 inline UBool
01075 Unicode::isBaseForm(UChar32 ch) {
01076     return u_isbase(ch);
01077 }
01078 
01079 // Checks if the Unicode character is a letter.
01080 inline UBool
01081 Unicode::isLetter(UChar32 ch) {
01082     return u_isalpha(ch);
01083 }
01084 
01085 // Checks if the Unicode character can start a Java identifier.
01086 inline UBool
01087 Unicode::isJavaIdentifierStart(UChar32 ch) {
01088     return u_isJavaIDStart(ch);
01089 }
01090 
01091 // Checks if the Unicode character can be a Java identifier part other than starting the
01092 // identifier.
01093 inline UBool
01094 Unicode::isJavaIdentifierPart(UChar32 ch) {
01095     return u_isJavaIDPart(ch);
01096 }
01097 
01098 // Checks if the Unicode character can start a Unicode identifier.
01099 inline UBool
01100 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01101     return u_isIDStart(ch);
01102 }
01103 
01104 // Checks if the Unicode character can be a Unicode identifier part other than starting the
01105 // identifier.
01106 inline UBool
01107 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01108     return u_isIDPart(ch);
01109 }
01110 
01111 // Checks if the Unicode character can be ignorable in a Java or Unicode identifier.
01112 inline UBool
01113 Unicode::isIdentifierIgnorable(UChar32 ch) {
01114     return u_isIDIgnorable(ch);
01115 }
01116 
01117 // Transforms the Unicode character to its lower case equivalent.
01118 inline UChar32       
01119 Unicode::toLowerCase(UChar32 ch) {
01120     return u_tolower(ch);
01121 }
01122     
01123 // Transforms the Unicode character to its upper case equivalent.
01124 inline UChar32
01125 Unicode::toUpperCase(UChar32 ch) {
01126     return u_toupper(ch);
01127 }
01128 
01129 // Transforms the Unicode character to its title case equivalent.
01130 inline UChar32
01131 Unicode::toTitleCase(UChar32 ch) {
01132     return u_totitle(ch);
01133 }
01134 
01135 // Checks if the Unicode character is a space character.
01136 inline UBool
01137 Unicode::isSpaceChar(UChar32 ch) {
01138     return u_isspace(ch);
01139 }
01140 
01141 // Determines if the specified character is white space according to ICU.
01142 inline UBool
01143 Unicode::isWhitespace(UChar32 ch) {
01144     return u_isWhitespace(ch);
01145 }
01146 
01147 // Gets if the Unicode character's character property.
01148 inline int8_t
01149 Unicode::getType(UChar32 ch) {
01150     return u_charType(ch);
01151 }
01152 
01153 // Gets the character's linguistic directionality.
01154 inline Unicode::EDirectionProperty
01155 Unicode::characterDirection(UChar32 ch) {
01156     return (EDirectionProperty)u_charDirection(ch);
01157 }
01158 
01159 // Determines if the character has the "mirrored" property.
01160 inline UBool
01161 Unicode::isMirrored(UChar32 ch) {
01162     return u_isMirrored(ch);
01163 }
01164 
01165 // Maps the character to a "mirror-image" character, or to itself.
01166 inline UChar32
01167 Unicode::charMirror(UChar32 ch) {
01168     return u_charMirror(ch);
01169 }
01170 
01171 // Get the script associated with the character
01172 inline Unicode::EUnicodeScript
01173 Unicode::getScript(UChar32 ch) {
01174     return (EUnicodeScript) u_charScript(ch);
01175 }
01176 
01177 // Gets table cell width of the Unicode character.
01178 inline uint16_t
01179 Unicode::getCellWidth(UChar32 ch) {
01180     return u_charCellWidth(ch);
01181 }
01182 
01183 inline UTextOffset
01184 Unicode::getCharName(uint32_t code,
01185                      char *buffer, UTextOffset bufferLength,
01186                      UCharNameChoice nameChoice) {
01187     UErrorCode errorCode=U_ZERO_ERROR;
01188     UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01189     return U_SUCCESS(errorCode) ? length : 0;
01190 }
01191 
01192 inline int32_t            
01193 Unicode::digitValue(UChar32 ch) {
01194     return u_charDigitValue(ch);
01195 }
01196 
01197 inline int8_t
01198 Unicode::digit(UChar32 ch, int8_t radix) {
01199     // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here
01200     int8_t value;
01201     if((uint8_t)(radix-MIN_RADIX)<=(MAX_RADIX-MIN_RADIX)) {
01202         value=(int8_t)u_charDigitValue(ch);
01203         if(value<0) {
01204             // ch is not a decimal digit, try latin letters
01205             if ((uint32_t)(ch-0x41)<26) {
01206                 value=(int8_t)(ch-(0x41-10)); // A-Z, subtract A
01207             } else if ((uint32_t)(ch-0x61)<26) {
01208                 value=(int8_t)(ch-(0x61-10)); // a-z, subtract a
01209             } else {
01210                 return -1; // ch is not a digit character
01211             }
01212         }
01213     } else {
01214         return -1; // invalid radix
01215     }
01216     return (uint8_t)((value<radix) ? value : (uint8_t)(-1));
01217 }
01218 
01219 inline UChar32
01220 Unicode::forDigit(int32_t digit, int8_t radix) {
01221     // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here
01222     if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX) || (uint32_t)digit>=(uint32_t)radix) {
01223         return 0;
01224     } else if(digit<10) {
01225         return (UChar32)(0x30+digit);
01226     } else {
01227         return (UChar32)((0x61-10)+digit);
01228     }
01229 }
01230 
01231 inline void
01232 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01233         u_getUnicodeVersion(versionArray);
01234 }
01235 
01236 #endif

Generated at Wed Aug 16 16:05:37 2000 for ICU1.6 by doxygen 1.0.0 written by Dimitri van Heesch, © 1997-1999