Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

unicode.h

Go to the documentation of this file.
00001 /*
00002 *****************************************************************************************
00003 *   Copyright (C) 1996-1999, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 *****************************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu      Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //********************************************************************************************
00024 
00025 
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00045 /** Static-only C++ wrappers for the Unicode character functions and UTF macros; not meant to be instantiated or subclassed. */
00046 class U_COMMON_API Unicode
00047 {
00048 public:
00049     /*
00050      * In C++, static const members actually take up memory and need to be accessed.
00051      * enum values are more like C #define's.
00052      * The following is a collection of constants, not an enumeration type.
00053      */
00054     enum {
00055         /** Lower bound of the value range; presumably the smallest code point (TODO confirm). */
00056         MIN_VALUE=0,
00057 
00062         /** Upper bound of the value range; presumably the largest code point (TODO confirm). */
00063         MAX_VALUE=0x10ffff,
00064 
00071         /** Taken from UTF_MAX_CHAR_LENGTH; presumably the most UChar units one code point can need (TODO confirm). */
00072         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00073 
00083         /** Smallest radix accepted by digit() and forDigit(). */
00084         MIN_RADIX=2,
00085 
00095         /** Largest radix accepted by digit() and forDigit(). */
00096         MAX_RADIX=36
00097     };
00098 
00102     /** Character type constants; presumably the values getType() reports from u_charType() (TODO confirm). */
00103     enum EUnicodeGeneralTypes
00104     {
00105         UNASSIGNED              = 0,
00106         UPPERCASE_LETTER        = 1,
00107         LOWERCASE_LETTER        = 2,
00108         TITLECASE_LETTER        = 3,
00109         MODIFIER_LETTER         = 4,
00110         OTHER_LETTER            = 5,
00111         NON_SPACING_MARK        = 6,
00112         ENCLOSING_MARK          = 7,
00113         COMBINING_SPACING_MARK  = 8,
00114         DECIMAL_DIGIT_NUMBER    = 9,
00115         LETTER_NUMBER           = 10,
00116         OTHER_NUMBER            = 11,
00117         SPACE_SEPARATOR         = 12,
00118         LINE_SEPARATOR          = 13,
00119         PARAGRAPH_SEPARATOR     = 14,
00120         CONTROL                 = 15,
00121         FORMAT                  = 16,
00122         PRIVATE_USE             = 17,
00123         SURROGATE               = 18,
00124         DASH_PUNCTUATION        = 19,
00125         START_PUNCTUATION       = 20,
00126         END_PUNCTUATION         = 21,
00127         CONNECTOR_PUNCTUATION   = 22,
00128         OTHER_PUNCTUATION       = 23,
00129         MATH_SYMBOL             = 24,
00130         CURRENCY_SYMBOL         = 25,
00131         MODIFIER_SYMBOL         = 26,
00132         OTHER_SYMBOL            = 27,
00133         INITIAL_PUNCTUATION     = 28,
00134         FINAL_PUNCTUATION       = 29,
00135         GENERAL_TYPES_COUNT     = 30
00136     };
00137 
00138     /* Please keep these values in sync with UCharScript */
00141     /** Script constants returned by getScript(), which casts the u_charScript() result to this type. */
00142     enum EUnicodeScript
00143     {
00144         kBasicLatin,
00145         kLatin1Supplement,
00146         kLatinExtendedA,
00147         kLatinExtendedB,
00148         kIPAExtension,
00149         kSpacingModifier,
00150         kCombiningDiacritical,
00151         kGreek,
00152         kCyrillic,
00153         kArmenian,
00154         kHebrew,
00155         kArabic,
00156         kSyriac,
00157         kThaana,
00158         kDevanagari,
00159         kBengali,
00160         kGurmukhi,
00161         kGujarati,
00162         kOriya,
00163         kTamil,
00164         kTelugu,
00165         kKannada,
00166         kMalayalam,
00167         kSinhala,
00168         kThai,
00169         kLao,
00170         kTibetan,
00171         kMyanmar,
00172         kGeorgian,
00173         kHangulJamo,
00174         kEthiopic,
00175         kCherokee,
00176         kUnifiedCanadianAboriginalSyllabics,
00177         kogham, // NOTE(review): probably meant "kOgham"; spelling kept because renaming would break callers
00178         kRunic,
00179         kKhmer,
00180         kMongolian,
00181         kLatinExtendedAdditional,
00182         kGreekExtended,
00183         kGeneralPunctuation,
00184         kSuperSubScript,
00185         kCurrencySymbolScript,
00186         kSymbolCombiningMark,
00187         kLetterlikeSymbol,
00188         kNumberForm,
00189         kArrow,
00190         kMathOperator,
00191         kMiscTechnical,
00192         kControlPicture,
00193         kOpticalCharacter,
00194         kEnclosedAlphanumeric,
00195         kBoxDrawing,
00196         kBlockElement,
00197         kGeometricShape,
00198         kMiscSymbol,
00199         kDingbat,
00200         kBraillePatterns,
00201         kCJKRadicalsSupplement,
00202         kKangxiRadicals,
00203         kIdeographicDescriptionCharacters,
00204         kCJKSymbolPunctuation,
00205         kHiragana,
00206         kKatakana,
00207         kBopomofo,
00208         kHangulCompatibilityJamo,
00209         kKanbun,
00210         kBopomofoExtended,
00211         kEnclosedCJKLetterMonth,
00212         kCJKCompatibility,
00213         kCJKUnifiedIdeographExtensionA,
00214         kCJKUnifiedIdeograph,
00215         kYiSyllables,
00216         kYiRadicals,
00217         kHangulSyllable,
00218         kHighSurrogate,
00219         kHighPrivateUseSurrogate,
00220         kLowSurrogate,
00221         kPrivateUse,
00222         kCJKCompatibilityIdeograph,
00223         kAlphabeticPresentation,
00224         kArabicPresentationA,
00225         kCombiningHalfMark,
00226         kCJKCompatibilityForm,
00227         kSmallFormVariant,
00228         kArabicPresentationB,
00229         kNoScript,
00230         kHalfwidthFullwidthForm,
00231         kScriptCount
00232     };
00233 
00236     /** Directionality constants returned by characterDirection(), which casts the u_charDirection() result to this type. */
00237     enum EDirectionProperty {
00238         LEFT_TO_RIGHT               = 0,
00239         RIGHT_TO_LEFT               = 1,
00240         EUROPEAN_NUMBER             = 2,
00241         EUROPEAN_NUMBER_SEPARATOR   = 3,
00242         EUROPEAN_NUMBER_TERMINATOR  = 4,
00243         ARABIC_NUMBER               = 5,
00244         COMMON_NUMBER_SEPARATOR     = 6,
00245         BLOCK_SEPARATOR             = 7,
00246         SEGMENT_SEPARATOR           = 8,
00247         WHITE_SPACE_NEUTRAL         = 9,
00248         OTHER_NEUTRAL               = 10,
00249         LEFT_TO_RIGHT_EMBEDDING     = 11,
00250         LEFT_TO_RIGHT_OVERRIDE      = 12,
00251         RIGHT_TO_LEFT_ARABIC        = 13,
00252         RIGHT_TO_LEFT_EMBEDDING     = 14,
00253         RIGHT_TO_LEFT_OVERRIDE      = 15,
00254         POP_DIRECTIONAL_FORMAT      = 16,
00255         DIR_NON_SPACING_MARK        = 17,
00256         BOUNDARY_NEUTRAL            = 18
00257     };
00258 
00262     /** Cell width constants; presumably the values getCellWidth() reports (TODO confirm). */
00263     enum ECellWidths
00264     {
00265         ZERO_WIDTH              = 0,
00266         HALF_WIDTH              = 1,
00267         FULL_WIDTH              = 2,
00268         NEUTRAL                 = 3
00269     };
00270 
00279     /** Wraps the UTF_IS_SINGLE macro for one UChar code unit. */
00280     static inline UBool isSingle(UChar c);
00281 
00288     /** Wraps the UTF_IS_LEAD macro for one UChar code unit. */
00289     static inline UBool isLead(UChar c);
00290 
00297     /** Wraps the UTF_IS_TRAIL macro for one UChar code unit. */
00298     static inline UBool isTrail(UChar c);
00299 
00308     /** Wraps the UTF_IS_SURROGATE macro. */
00309     static inline UBool isSurrogate(UChar32 c);
00310 
00321     /** Wraps the UTF_IS_UNICODE_CHAR macro. */
00322     static inline UBool isUnicodeChar(UChar32 c);
00323 
00333     /** Wraps the UTF_IS_ERROR macro. */
00334     static inline UBool isError(UChar32 c);
00335 
00343     /** Wraps the UTF_IS_VALID macro. */
00344     static inline UBool isValid(UChar32 c);
00345 
00355     /** Wraps the UTF_NEED_MULTIPLE_UCHAR macro. */
00356     static inline UBool needMultipleUChar(UChar32 c);
00357 
00364     /** Wraps the UTF_CHAR_LENGTH macro. */
00365     static inline int32_t charLength(UChar32 c);
00366 
00378     /** Wraps the UTF_ARRAY_SIZE macro. */
00379     static inline int32_t arraySize(int32_t size);
00380 
00392     /** Checks if ch is a lower case letter (u_islower). */
00393     static inline UBool isLowerCase(UChar32 ch);
00394 
00405     /** Checks if ch is an upper case letter (u_isupper). */
00406     static inline UBool isUpperCase(UChar32 ch);
00407 
00418     /** Checks if ch is a title case letter (u_istitle). */
00419     static inline UBool isTitleCase(UChar32 ch);
00420 
00431     /** Checks if ch is a decimal digit (u_isdigit). */
00432     static inline UBool isDigit(UChar32 ch);
00433 
00448     /** Checks if ch is a Unicode character with an assigned character type (u_isdefined). */
00449     static inline UBool isDefined(UChar32 ch);
00450 
00460     /** Checks if ch is a control character (u_iscntrl). */
00461     static inline UBool isControl(UChar32 ch);
00462 
00472     /** Checks if ch is printable (u_isprint). */
00473     static inline UBool isPrintable(UChar32 ch);
00474 
00485     /** Checks if ch is a base form character that can take a diacritic (u_isbase). */
00486     static inline UBool isBaseForm(UChar32 ch);
00487 
00502     /** Checks if ch is a letter (u_isalpha). */
00503     static inline UBool isLetter(UChar32 ch);
00504 
00524     /** Checks if ch can start a Java identifier (u_isJavaIDStart). */
00525     static inline UBool isJavaIdentifierStart(UChar32 ch);
00526 
00554     /** Checks if ch can be a non-initial part of a Java identifier (u_isJavaIDPart). */
00555     static inline UBool isJavaIdentifierPart(UChar32 ch);
00556 
00570     /** Checks if ch can start a Unicode identifier (u_isIDStart). */
00571     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00572 
00598     /** Checks if ch can be a non-initial part of a Unicode identifier (u_isIDPart). */
00599     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00600 
00625     /** Checks if ch can be ignored in a Java or Unicode identifier (u_isIDIgnorable). */
00626     static inline UBool isIdentifierIgnorable(UChar32 ch);
00627 
00651     /** Maps ch to its lower case equivalent (u_tolower). */
00652     static inline UChar32 toLowerCase(UChar32 ch);
00653 
00674     /** Maps ch to its upper case equivalent (u_toupper). */
00675     static inline UChar32 toUpperCase(UChar32 ch);
00676 
00693     /** Maps ch to its title case equivalent (u_totitle). */
00694     static inline UChar32 toTitleCase(UChar32 ch);
00695 
00703     /** Checks if ch is a space character (u_isspace). */
00704     static inline UBool isSpaceChar(UChar32 ch);
00705 
00733     /** Checks if ch is white space according to ICU (u_isWhitespace). */
00734     static inline UBool isWhitespace(UChar32 ch);
00735 
00769     /** Returns the u_charType() result for ch as an int8_t. */
00770     static inline int8_t getType(UChar32 ch);
00771 
00780     /** Returns the directionality of ch: the u_charDirection() result cast to EDirectionProperty. */
00781     static inline EDirectionProperty characterDirection(UChar32 ch);
00782 
00791     /** Checks if c has the "mirrored" property (u_isMirrored). */
00792     static inline UBool isMirrored(UChar32 c);
00793 
00808     /** Maps c to a "mirror-image" character, or to itself (u_charMirror). */
00809     static inline UChar32 charMirror(UChar32 c);
00810 
00815     /** Returns the script of ch: the u_charScript() result cast to EUnicodeScript. */
00816     static inline EUnicodeScript getScript(UChar32 ch);
00817 
00868     /** Returns the table cell width of ch (u_charCellWidth). */
00869     static inline uint16_t getCellWidth(UChar32 ch);
00870 
00897     /** Looks up the name of code with u_charName(), passing buffer/bufferLength through; returns its result, or 0 on error. */
00898     static inline UTextOffset
00899     getCharName(uint32_t code,
00900                 char *buffer, UTextOffset bufferLength,
00901                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00902 
00912     /** Returns the u_charDigitValue() result for ch. */
00913     static inline int32_t digitValue(UChar32 ch);
00914 
00951     /** Value of ch as a digit in radix (decimal digits, then A-Z/a-z as 10..35); -1 if ch is not such a digit or radix is outside MIN_RADIX..MAX_RADIX. */
00952     static inline int8_t digit(UChar32 ch, int8_t radix);
00953 
00980     /** Character for digit in radix ('0'-'9', then 'a'-'z'); 0 if radix is outside MIN_RADIX..MAX_RADIX or digit is not in 0..radix-1. */
00981     static inline UChar32 forDigit(int32_t digit, int8_t radix);
00982 
00987     /** Forwards info to u_getUnicodeVersion(); declared inline to match its inline definition at the end of this header. */
00988     static inline void getUnicodeVersion(UVersionInfo info);
00989 
00990 protected:
00991     // These constructors, destructor, and assignment operator must
00992     // be protected (not private, as they semantically are) to make
00993     // various UNIX compilers happy. [LIU]
00994     // They should be private to prevent anyone from instantiating or
00995     // subclassing Unicode.
00996     Unicode();
00997     Unicode(const Unicode &other);
00998     ~Unicode();
00999     const Unicode &operator=(const Unicode &other);
01000 };
01001 
01002 /* inline implementations --------------------------------------------------- */
01003 // Wrapper for UTF_IS_SINGLE(c); presumably true when the code unit c is a complete character on its own -- confirm against the macro.
01004 inline UBool
01005 Unicode::isSingle(UChar c) {
01006     return UTF_IS_SINGLE(c);
01007 }
01008 // Wrapper for UTF_IS_LEAD(c); presumably true when c is the first code unit of a multi-unit character -- confirm against the macro.
01009 inline UBool
01010 Unicode::isLead(UChar c) {
01011     return UTF_IS_LEAD(c);
01012 }
01013 // Wrapper for UTF_IS_TRAIL(c); presumably true when c is a trailing code unit of a multi-unit character -- confirm against the macro.
01014 inline UBool
01015 Unicode::isTrail(UChar c) {
01016     return UTF_IS_TRAIL(c);
01017 }
01018 // Wrapper for UTF_IS_SURROGATE(c); takes a full UChar32, unlike isLead()/isTrail(), which take a UChar.
01019 inline UBool
01020 Unicode::isSurrogate(UChar32 c) {
01021     return UTF_IS_SURROGATE(c);
01022 }
01023 // Wrapper for UTF_IS_UNICODE_CHAR(c); the exact set of accepted code points is defined by that macro.
01024 inline UBool
01025 Unicode::isUnicodeChar(UChar32 c) {
01026     return UTF_IS_UNICODE_CHAR(c);
01027 }
01028 // Wrapper for UTF_IS_ERROR(c); presumably recognizes the error value produced by the UTF decoding macros -- confirm against the macro.
01029 inline UBool
01030 Unicode::isError(UChar32 c) {
01031     return UTF_IS_ERROR(c);
01032 }
01033 // Wrapper for UTF_IS_VALID(c); the validity criteria are defined by that macro.
01034 inline UBool
01035 Unicode::isValid(UChar32 c) {
01036     return UTF_IS_VALID(c);
01037 }
01038 // Wrapper for UTF_NEED_MULTIPLE_UCHAR(c); presumably true when c cannot be stored in a single UChar -- confirm against the macro.
01039 inline UBool
01040 Unicode::needMultipleUChar(UChar32 c) {
01041     return UTF_NEED_MULTIPLE_UCHAR(c);
01042 }
01043 // Wrapper for UTF_CHAR_LENGTH(c); presumably the number of UChar units needed to store c -- confirm against the macro.
01044 inline int32_t
01045 Unicode::charLength(UChar32 c) {
01046     return UTF_CHAR_LENGTH(c);
01047 }
01048 // Wrapper for UTF_ARRAY_SIZE(size); the conversion applied to size is defined entirely by that macro.
01049 inline int32_t
01050 Unicode::arraySize(int32_t size) {
01051     return UTF_ARRAY_SIZE(size);
01052 }
01053 
01054 // Checks if ch is a lower case letter; thin wrapper that returns u_islower(ch).
01055 inline UBool
01056 Unicode::isLowerCase(UChar32 ch) {
01057     return u_islower(ch);
01058 }
01059 
01060 // Checks if ch is an upper case letter; thin wrapper that returns u_isupper(ch).
01061 inline UBool
01062 Unicode::isUpperCase(UChar32 ch) {
01063     return u_isupper(ch);
01064 }
01065 
01066 // Checks if ch is a title case letter (usually an upper case letter); thin wrapper that returns u_istitle(ch).
01067 inline UBool
01068 Unicode::isTitleCase(UChar32 ch) {
01069     return u_istitle(ch);
01070 }
01071 
01072 // Checks if ch is a decimal digit; thin wrapper that returns u_isdigit(ch).
01073 inline UBool
01074 Unicode::isDigit(UChar32 ch) {
01075     return u_isdigit(ch);
01076 }
01077 
01078 // Checks if ch is a Unicode character with an assigned character type; thin wrapper that returns u_isdefined(ch).
01079 inline UBool
01080 Unicode::isDefined(UChar32 ch) {
01081     return u_isdefined(ch);
01082 }
01083 
01084 // Checks if the Unicode character is a control character; thin wrapper that returns u_iscntrl(ch).
01085 inline UBool
01086 Unicode::isControl(UChar32 ch) {
01087     return u_iscntrl(ch);
01088 }
01089 
01090 // Checks if the Unicode character is printable; thin wrapper that returns u_isprint(ch).
01091 inline UBool
01092 Unicode::isPrintable(UChar32 ch) {
01093     return u_isprint(ch);
01094 }
01095 
01096 // Checks if the Unicode character is a base form character that can take a diacritic; thin wrapper that returns u_isbase(ch).
01097 inline UBool
01098 Unicode::isBaseForm(UChar32 ch) {
01099     return u_isbase(ch);
01100 }
01101 
01102 // Checks if the Unicode character is a letter; thin wrapper that returns u_isalpha(ch).
01103 inline UBool
01104 Unicode::isLetter(UChar32 ch) {
01105     return u_isalpha(ch);
01106 }
01107 
01108 // Checks if the Unicode character can start a Java identifier; thin wrapper that returns u_isJavaIDStart(ch).
01109 inline UBool
01110 Unicode::isJavaIdentifierStart(UChar32 ch) {
01111     return u_isJavaIDStart(ch);
01112 }
01113 
01114 // Checks if the Unicode character can be a part of a Java identifier other than its
01115 // first character; thin wrapper that returns u_isJavaIDPart(ch).
01116 inline UBool
01117 Unicode::isJavaIdentifierPart(UChar32 ch) {
01118     return u_isJavaIDPart(ch);
01119 }
01120 
01121 // Checks if the Unicode character can start a Unicode identifier; thin wrapper that returns u_isIDStart(ch).
01122 inline UBool
01123 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01124     return u_isIDStart(ch);
01125 }
01126 
01127 // Checks if the Unicode character can be a part of a Unicode identifier other than its
01128 // first character; thin wrapper that returns u_isIDPart(ch).
01129 inline UBool
01130 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01131     return u_isIDPart(ch);
01132 }
01133 
01134 // Checks if the Unicode character can be ignored in a Java or Unicode identifier; thin wrapper that returns u_isIDIgnorable(ch).
01135 inline UBool
01136 Unicode::isIdentifierIgnorable(UChar32 ch) {
01137     return u_isIDIgnorable(ch);
01138 }
01139 
01140 // Transforms the Unicode character to its lower case equivalent; thin wrapper that returns u_tolower(ch).
01141 inline UChar32
01142 Unicode::toLowerCase(UChar32 ch) {
01143     return u_tolower(ch);
01144 }
01145     
01146 // Transforms the Unicode character to its upper case equivalent; thin wrapper that returns u_toupper(ch).
01147 inline UChar32
01148 Unicode::toUpperCase(UChar32 ch) {
01149     return u_toupper(ch);
01150 }
01151 
01152 // Transforms the Unicode character to its title case equivalent; thin wrapper that returns u_totitle(ch).
01153 inline UChar32
01154 Unicode::toTitleCase(UChar32 ch) {
01155     return u_totitle(ch);
01156 }
01157 
01158 // Checks if the Unicode character is a space character; thin wrapper that returns u_isspace(ch). Compare isWhitespace(), which calls u_isWhitespace() instead.
01159 inline UBool
01160 Unicode::isSpaceChar(UChar32 ch) {
01161     return u_isspace(ch);
01162 }
01163 
01164 // Determines if the specified character is white space according to ICU; thin wrapper that returns u_isWhitespace(ch).
01165 inline UBool
01166 Unicode::isWhitespace(UChar32 ch) {
01167     return u_isWhitespace(ch);
01168 }
01169 
01170 // Gets the Unicode character's type: the u_charType(ch) result returned as int8_t; presumably an EUnicodeGeneralTypes value -- TODO confirm.
01171 inline int8_t
01172 Unicode::getType(UChar32 ch) {
01173     return u_charType(ch);
01174 }
01175 
01176 // Gets the character's linguistic directionality: the u_charDirection(ch) result cast to EDirectionProperty (assumes both enums use the same values -- TODO confirm).
01177 inline Unicode::EDirectionProperty
01178 Unicode::characterDirection(UChar32 ch) {
01179     return (EDirectionProperty)u_charDirection(ch);
01180 }
01181 
01182 // Determines if the character has the "mirrored" property; thin wrapper that returns u_isMirrored(ch).
01183 inline UBool
01184 Unicode::isMirrored(UChar32 ch) {
01185     return u_isMirrored(ch);
01186 }
01187 
01188 // Maps the character to a "mirror-image" character, or to itself; thin wrapper that returns u_charMirror(ch).
01189 inline UChar32
01190 Unicode::charMirror(UChar32 ch) {
01191     return u_charMirror(ch);
01192 }
01193 
01194 // Gets the script associated with the character: the u_charScript(ch) result cast to EUnicodeScript, whose values are meant to be kept in sync with UCharScript.
01195 inline Unicode::EUnicodeScript
01196 Unicode::getScript(UChar32 ch) {
01197     return (EUnicodeScript) u_charScript(ch);
01198 }
01199 
01200 // Gets the table cell width of the Unicode character; thin wrapper that returns u_charCellWidth(ch), presumably an ECellWidths value -- TODO confirm.
01201 inline uint16_t
01202 Unicode::getCellWidth(UChar32 ch) {
01203     return u_charCellWidth(ch);
01204 }
01205 
01206 inline UTextOffset
01207 Unicode::getCharName(uint32_t code,
01208                      char *buffer, UTextOffset bufferLength,
01209                      UCharNameChoice nameChoice) {
01210     UErrorCode errorCode=U_ZERO_ERROR;
01211     UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01212     return U_SUCCESS(errorCode) ? length : 0;
01213 }
01214 // Thin wrapper that returns u_charDigitValue(ch); digit() treats a negative result as "not a decimal digit".
01215 inline int32_t
01216 Unicode::digitValue(UChar32 ch) {
01217     return u_charDigitValue(ch);
01218 }
01219 // Returns the value of ch as a digit in the given radix, or -1 if radix is outside MIN_RADIX..MAX_RADIX or ch is not a digit below radix.
01220 inline int8_t
01221 Unicode::digit(UChar32 ch, int8_t radix) {
01222     // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here
01223     // The unsigned cast folds "radix<MIN_RADIX" and "radix>MAX_RADIX" into a single comparison.
01224     if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX)) {
01225         return -1; // invalid radix
01226     }
01227     int8_t value=(int8_t)u_charDigitValue(ch);
01228     if(value<0) {
01229         // ch is not a decimal digit, try latin letters
01230         if((uint32_t)(ch-0x41)<26) {
01231             value=(int8_t)(ch-(0x41-10)); // A-Z map to 10..35
01232         } else if((uint32_t)(ch-0x61)<26) {
01233             value=(int8_t)(ch-(0x61-10)); // a-z map to 10..35
01234         } else {
01235             return -1; // ch is not a digit character
01236         }
01237     }
01238     // Return -1 directly instead of narrowing (uint8_t)255 to int8_t, which is implementation-defined before C++20.
01239     return (value<radix) ? value : (int8_t)-1;
01240 }
01241 // Returns the character for digit in the given radix, or 0 if radix is outside MIN_RADIX..MAX_RADIX or digit is not in 0..radix-1.
01242 inline UChar32
01243 Unicode::forDigit(int32_t digit, int8_t radix) {
01244     // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here
01245     const UBool radixOutOfRange=(uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX);
01246     // Comparing as unsigned also rejects negative digit values.
01247     if(radixOutOfRange || (uint32_t)digit>=(uint32_t)radix) {
01248         return 0;
01249     }
01250     // 0..9 are offset from 0x30 ('0'); 10 and above are offset from 0x61 ('a').
01251     return (UChar32)((digit<10) ? (0x30+digit) : ((0x61-10)+digit));
01252 }
01253 // Thin wrapper that passes versionArray to u_getUnicodeVersion(), which presumably fills it with the Unicode version -- TODO confirm.
01254 inline void
01255 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01256     u_getUnicodeVersion(versionArray);
01257 }
01258 
01259 #endif

Generated at Fri Dec 15 12:12:37 2000 for ICU 1.7 by doxygen 1.2.3, written by Dimitri van Heesch, © 1997-2000