Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

unicode.h

Go to the documentation of this file.
00001 /*
00002 ******************************************************************************
00003 *   Copyright (C) 1996-2001, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 ******************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu      Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //*****************************************************************************
00024 
00025 
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00046 class U_COMMON_API Unicode  // Static-only C++ wrapper around the C-level u_*() functions and UTF_* macros;
00047 {                           // clients cannot create instances (see the protected members at the end).
00048 public:
00049     /*
00050      * In C++, static const members actually take up memory and need to be accessed.
00051      * enum values are more like C #define's.
00052      * The following is a collection of constants, not an enumeration type.
00053      */
00054     enum {
00056         MIN_VALUE=0,  // lower bound of the code point range
00057         // Upper bound of the code point range (U+10FFFF).
00063         MAX_VALUE=0x10ffff,
00064         // Taken directly from the UTF_MAX_CHAR_LENGTH macro.
00072         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00073         // Smallest radix that digit() and forDigit() accept.
00084         MIN_RADIX=2,
00085         // Largest radix that digit() and forDigit() accept.
00096         MAX_RADIX=36
00097     };
00098     // General category codes. NOTE(review): presumably the values getType() returns via u_charType() - confirm in uchar.h.
00103     enum EUnicodeGeneralTypes
00104     {
00105         UNASSIGNED              = 0,
00106         UPPERCASE_LETTER        = 1,
00107         LOWERCASE_LETTER        = 2,
00108         TITLECASE_LETTER        = 3,
00109         MODIFIER_LETTER         = 4,
00110         OTHER_LETTER            = 5,
00111         NON_SPACING_MARK        = 6,
00112         ENCLOSING_MARK          = 7,
00113         COMBINING_SPACING_MARK  = 8,
00114         DECIMAL_DIGIT_NUMBER    = 9,
00115         LETTER_NUMBER           = 10,
00116         OTHER_NUMBER            = 11,
00117         SPACE_SEPARATOR         = 12,
00118         LINE_SEPARATOR          = 13,
00119         PARAGRAPH_SEPARATOR     = 14,
00120         CONTROL                 = 15,
00121         FORMAT                  = 16,
00122         PRIVATE_USE             = 17,
00123         SURROGATE               = 18,
00124         DASH_PUNCTUATION        = 19,
00125         START_PUNCTUATION       = 20,
00126         END_PUNCTUATION         = 21,
00127         CONNECTOR_PUNCTUATION   = 22,
00128         OTHER_PUNCTUATION       = 23,
00129         MATH_SYMBOL             = 24,
00130         CURRENCY_SYMBOL         = 25,
00131         MODIFIER_SYMBOL         = 26,
00132         OTHER_SYMBOL            = 27,
00133         INITIAL_PUNCTUATION     = 28,
00134         FINAL_PUNCTUATION       = 29,
00135         GENERAL_TYPES_COUNT     = 30  // one more than the largest value above
00136     };
00137     // Script values; getScript() casts the result of u_charScript() to this type, so the implicit numbering must stay stable.
00138     /* Please keep these values in sync with UCharScript */
00142     enum EUnicodeScript 
00143     {
00144         kBasicLatin,
00145         kLatin1Supplement,
00146         kLatinExtendedA,
00147         kLatinExtendedB,
00148         kIPAExtension,
00149         kSpacingModifier,
00150         kCombiningDiacritical,
00151         kGreek,
00152         kCyrillic,
00153         kArmenian,
00154         kHebrew,
00155         kArabic,
00156         kSyriac,
00157         kThaana,
00158         kDevanagari,
00159         kBengali,
00160         kGurmukhi,
00161         kGujarati,
00162         kOriya,
00163         kTamil,
00164         kTelugu,
00165         kKannada,
00166         kMalayalam,
00167         kSinhala,
00168         kThai,
00169         kLao,
00170         kTibetan,
00171         kMyanmar,
00172         kGeorgian,
00173         kHangulJamo,
00174         kEthiopic,
00175         kCherokee,
00176         kUnifiedCanadianAboriginalSyllabics,
00177         kogham,  // NOTE(review): lower-case 'o' looks like a typo for "kOgham"; the name is public, so confirm before renaming
00178         kRunic,
00179         kKhmer,
00180         kMongolian,
00181         kLatinExtendedAdditional,
00182         kGreekExtended,
00183         kGeneralPunctuation,
00184         kSuperSubScript,
00185         kCurrencySymbolScript,
00186         kSymbolCombiningMark,
00187         kLetterlikeSymbol,
00188         kNumberForm,
00189         kArrow,
00190         kMathOperator,
00191         kMiscTechnical,
00192         kControlPicture,
00193         kOpticalCharacter,
00194         kEnclosedAlphanumeric,
00195         kBoxDrawing,
00196         kBlockElement,
00197         kGeometricShape,
00198         kMiscSymbol,
00199         kDingbat,
00200         kBraillePatterns,
00201         kCJKRadicalsSupplement,
00202         kKangxiRadicals,
00203         kIdeographicDescriptionCharacters,
00204         kCJKSymbolPunctuation,
00205         kHiragana,
00206         kKatakana,
00207         kBopomofo,
00208         kHangulCompatibilityJamo,
00209         kKanbun,
00210         kBopomofoExtended,
00211         kEnclosedCJKLetterMonth,
00212         kCJKCompatibility,
00213         kCJKUnifiedIdeographExtensionA,
00214         kCJKUnifiedIdeograph,
00215         kYiSyllables,
00216         kYiRadicals,
00217         kHangulSyllable,
00218         kHighSurrogate,
00219         kHighPrivateUseSurrogate,
00220         kLowSurrogate,
00221         kPrivateUse,
00222         kCJKCompatibilityIdeograph,
00223         kAlphabeticPresentation,
00224         kArabicPresentationA,
00225         kCombiningHalfMark,
00226         kCJKCompatibilityForm,
00227         kSmallFormVariant,
00228         kArabicPresentationB,
00229         kNoScript,
00230         kHalfwidthFullwidthForm,
00231         kScriptCount  // implicit value: the number of enumerators that precede it
00232     };
00233     // Directional categories; characterDirection() casts the result of u_charDirection() to this type.
00237     enum EDirectionProperty { 
00238         LEFT_TO_RIGHT               = 0, 
00239         RIGHT_TO_LEFT               = 1, 
00240         EUROPEAN_NUMBER             = 2,
00241         EUROPEAN_NUMBER_SEPARATOR   = 3,
00242         EUROPEAN_NUMBER_TERMINATOR  = 4,
00243         ARABIC_NUMBER               = 5,
00244         COMMON_NUMBER_SEPARATOR     = 6,
00245         BLOCK_SEPARATOR             = 7,
00246         SEGMENT_SEPARATOR           = 8,
00247         WHITE_SPACE_NEUTRAL         = 9, 
00248         OTHER_NEUTRAL               = 10, 
00249         LEFT_TO_RIGHT_EMBEDDING     = 11,
00250         LEFT_TO_RIGHT_OVERRIDE      = 12,
00251         RIGHT_TO_LEFT_ARABIC        = 13,
00252         RIGHT_TO_LEFT_EMBEDDING     = 14,
00253         RIGHT_TO_LEFT_OVERRIDE      = 15,
00254         POP_DIRECTIONAL_FORMAT      = 16,
00255         DIR_NON_SPACING_MARK        = 17,
00256         BOUNDARY_NEUTRAL            = 18
00257     };
00258     // Cell width categories. NOTE(review): presumably the values getCellWidth() reports - confirm against u_charCellWidth().
00263     enum ECellWidths
00264     {
00265         ZERO_WIDTH              = 0,
00266         HALF_WIDTH              = 1,
00267         FULL_WIDTH              = 2,
00268         NEUTRAL                 = 3
00269     };
00270     // Result of the UTF_IS_SINGLE macro for the code unit c.
00280     static inline UBool isSingle(UChar c);
00281     // Result of the UTF_IS_LEAD macro for the code unit c.
00289     static inline UBool isLead(UChar c);
00290     // Result of the UTF_IS_TRAIL macro for the code unit c.
00298     static inline UBool isTrail(UChar c);
00299     // Result of the UTF_IS_SURROGATE macro for c.
00309     static inline UBool isSurrogate(UChar32 c);
00310     // Result of the UTF_IS_UNICODE_CHAR macro for c.
00322     static inline UBool isUnicodeChar(UChar32 c);
00323     // Result of the UTF_IS_ERROR macro for c.
00334     static inline UBool isError(UChar32 c);
00335     // Result of the UTF_IS_VALID macro for c.
00344     static inline UBool isValid(UChar32 c);
00345     // Result of the UTF_NEED_MULTIPLE_UCHAR macro for c.
00356     static inline UBool needMultipleUChar(UChar32 c);
00357     // Result of the UTF_CHAR_LENGTH macro for c.
00365     static inline int32_t charLength(UChar32 c);
00366     // Result of the UTF_ARRAY_SIZE macro for size.
00379     static inline int32_t arraySize(int32_t size);
00380     // Checks if ch is a lower case letter (u_islower).
00393     static inline UBool isLowerCase(UChar32 ch);
00394     // Checks if ch is an upper case letter (u_isupper).
00406     static inline UBool isUpperCase(UChar32 ch);
00407     // Checks if ch is a title case letter (u_istitle).
00419     static inline UBool isTitleCase(UChar32 ch);
00420     // Checks if ch is a decimal digit (u_isdigit).
00432     static inline UBool isDigit(UChar32 ch);
00433     // Checks if ch has an assigned character type (u_isdefined).
00449     static inline UBool isDefined(UChar32 ch);
00450     // Checks if ch is a control character (u_iscntrl).
00461     static inline UBool isControl(UChar32 ch);
00462     // Checks if ch is printable (u_isprint).
00473     static inline UBool isPrintable(UChar32 ch);
00474     // Checks if ch is a base form character that can take a diacritic (u_isbase).
00486      static inline UBool isBaseForm(UChar32 ch);
00487     // Checks if ch is a letter (u_isalpha).
00503     static inline UBool isLetter(UChar32 ch);
00504     // Checks if ch can start a Java identifier (u_isJavaIDStart).
00525     static inline UBool isJavaIdentifierStart(UChar32 ch);
00526     // Checks if ch can appear in a Java identifier after its first character (u_isJavaIDPart).
00555     static inline UBool isJavaIdentifierPart(UChar32 ch);
00556     // Checks if ch can start a Unicode identifier (u_isIDStart).
00571     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00572     // Checks if ch can appear in a Unicode identifier after its first character (u_isIDPart).
00599     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00600     // Checks if ch is ignorable in a Java or Unicode identifier (u_isIDIgnorable).
00626     static inline UBool isIdentifierIgnorable(UChar32 ch);
00627     // Lower case equivalent of ch (u_tolower).
00652    static inline UChar32 toLowerCase(UChar32 ch); 
00653     // Upper case equivalent of ch (u_toupper).
00675     static inline UChar32 toUpperCase(UChar32 ch);
00676     // Title case equivalent of ch (u_totitle).
00694     static inline UChar32 toTitleCase(UChar32 ch);
00695     // Case folded equivalent of c; options is handed unchanged to u_foldCase.
00709     static inline UChar32
00710     foldCase(UChar32 c, uint32_t options);
00711     // Checks if ch is a space character (u_isspace).
00720     static inline UBool isSpaceChar(UChar32 ch);
00721     // Checks if ch is white space according to ICU (u_isWhitespace).
00750     static inline UBool isWhitespace(UChar32 ch);
00751     // Character type of ch as returned by u_charType.
00786     static inline int8_t getType(UChar32 ch);
00787     // Combining class of c as returned by u_getCombiningClass.
00795     static inline uint8_t getCombiningClass(UChar32 c);
00796     // Directionality of ch: the result of u_charDirection cast to EDirectionProperty.
00806     static inline EDirectionProperty characterDirection(UChar32 ch);
00807     // Checks if c has the "mirrored" property (u_isMirrored).
00817     static inline UBool isMirrored(UChar32 c);
00818     // Maps c to a "mirror-image" character, or to itself (u_charMirror).
00834     static inline UChar32 charMirror(UChar32 c);
00835     // Script of ch: the result of u_charScript cast to EUnicodeScript.
00841     static inline EUnicodeScript getScript(UChar32 ch);
00842     // Table cell width of ch as returned by u_charCellWidth.
00894     static inline uint16_t getCellWidth(UChar32 ch);
00895     // Writes the name of code into buffer via u_charName; returns the length it reports, or 0 if it signals an error.
00923     static inline UTextOffset
00924     getCharName(uint32_t code,
00925                 char *buffer, UTextOffset bufferLength,
00926                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00927     // Digit value of ch as returned by u_charDigitValue.
00938     static inline int32_t digitValue(UChar32 ch);     
00939     // Value of ch as a digit in the given radix (also accepts ASCII A-Z / a-z as 10..35); -1 for a bad radix or a non-digit.
00977     static inline int8_t digit(UChar32 ch, int8_t radix);
00978     // Character for a digit value in the given radix ('0'-'9', then 'a'-'z'); 0 for a bad radix or an out-of-range digit.
01006     static inline UChar32 forDigit(int32_t digit, int8_t radix);
01007     // Passes info to u_getUnicodeVersion. Declared without "inline" here, unlike its siblings; its definition in this header is inline.
01013     static void getUnicodeVersion(UVersionInfo info);
01014 
01015 protected:
01016     // These constructors, destructor, and assignment operator must
01017     // be protected (not private, as they semantically are) to make
01018     // various UNIX compilers happy. [LIU]
01019     // They should be private to prevent anyone from instantiating or
01020     // subclassing Unicode.
01021     Unicode();  // the four members below are only declared here; this header contains no definitions for them
01022     Unicode(const Unicode &other);
01023     ~Unicode();
01024     const Unicode &operator=(const Unicode &other);
01025 };
01026 
01027 /* inline implementations --------------------------------------------------- */
01028 
01029 // Thin wrapper: true when the UTF_IS_SINGLE macro accepts the code unit c.
01030 inline UBool Unicode::isSingle(UChar c) {
01031     return UTF_IS_SINGLE(c);
01032 }
01033 
01034 // Thin wrapper: true when the UTF_IS_LEAD macro accepts the code unit c.
01035 inline UBool Unicode::isLead(UChar c) {
01036     return UTF_IS_LEAD(c);
01037 }
01038 
01039 // Thin wrapper: true when the UTF_IS_TRAIL macro accepts the code unit c.
01040 inline UBool Unicode::isTrail(UChar c) {
01041     return UTF_IS_TRAIL(c);
01042 }
01043 
01044 // Thin wrapper: true when the UTF_IS_SURROGATE macro accepts the code point c.
01045 inline UBool Unicode::isSurrogate(UChar32 c) {
01046     return UTF_IS_SURROGATE(c);
01047 }
01048 
01049 // Thin wrapper: true when the UTF_IS_UNICODE_CHAR macro accepts the code point c.
01050 inline UBool Unicode::isUnicodeChar(UChar32 c) {
01051     return UTF_IS_UNICODE_CHAR(c);
01052 }
01053 
01054 // Thin wrapper: true when the UTF_IS_ERROR macro flags the value c.
01055 inline UBool Unicode::isError(UChar32 c) {
01056     return UTF_IS_ERROR(c);
01057 }
01058 
01059 // Thin wrapper: true when the UTF_IS_VALID macro accepts the code point c.
01060 inline UBool Unicode::isValid(UChar32 c) {
01061     return UTF_IS_VALID(c);
01062 }
01063 
01064 // Thin wrapper: relays the verdict of the UTF_NEED_MULTIPLE_UCHAR macro for c.
01065 inline UBool Unicode::needMultipleUChar(UChar32 c) {
01066     return UTF_NEED_MULTIPLE_UCHAR(c);
01067 }
01068 
01069 // Thin wrapper: the length that the UTF_CHAR_LENGTH macro computes for c.
01070 inline int32_t Unicode::charLength(UChar32 c) {
01071     return UTF_CHAR_LENGTH(c);
01072 }
01073 
01074 // Thin wrapper: the array size that the UTF_ARRAY_SIZE macro computes for size.
01075 inline int32_t Unicode::arraySize(int32_t size) {
01076     return UTF_ARRAY_SIZE(size);
01077 }
01078 
01079 // Lower case letter test: hands ch to u_islower() and relays its answer.
01080 inline UBool Unicode::isLowerCase(UChar32 ch) {
01081     const UBool lower = u_islower(ch);
01082     return lower;
01083 }
01084 
01085 // Upper case letter test: hands ch to u_isupper() and relays its answer.
01086 inline UBool Unicode::isUpperCase(UChar32 ch) {
01087     const UBool upper = u_isupper(ch);
01088     return upper;
01089 }
01090 
01091 // Title case letter test (usually upper case letters): relays u_istitle(ch).
01092 inline UBool Unicode::isTitleCase(UChar32 ch) {
01093     const UBool title = u_istitle(ch);
01094     return title;
01095 }
01096 
01097 // Decimal digit test: hands ch to u_isdigit() and relays its answer.
01098 inline UBool Unicode::isDigit(UChar32 ch) {
01099     const UBool decimalDigit = u_isdigit(ch);
01100     return decimalDigit;
01101 }
01102 
01103 // True when ch is a Unicode character with an assigned character type, per u_isdefined().
01104 inline UBool Unicode::isDefined(UChar32 ch) {
01105     const UBool assigned = u_isdefined(ch);
01106     return assigned;
01107 }
01108 
01109 // Control character test: hands ch to u_iscntrl() and relays its answer.
01110 inline UBool Unicode::isControl(UChar32 ch) {
01111     const UBool control = u_iscntrl(ch);
01112     return control;
01113 }
01114 
01115 // Printable character test: hands ch to u_isprint() and relays its answer.
01116 inline UBool Unicode::isPrintable(UChar32 ch) {
01117     const UBool printable = u_isprint(ch);
01118     return printable;
01119 }
01120 
01121 // True when ch is a base form character, i.e. one that can take a diacritic, per u_isbase().
01122 inline UBool Unicode::isBaseForm(UChar32 ch) {
01123     const UBool baseForm = u_isbase(ch);
01124     return baseForm;
01125 }
01126 
01127 // Letter test: hands ch to u_isalpha() and relays its answer.
01128 inline UBool Unicode::isLetter(UChar32 ch) {
01129     const UBool letter = u_isalpha(ch);
01130     return letter;
01131 }
01132 
01133 // True when ch may begin a Java identifier, per u_isJavaIDStart().
01134 inline UBool Unicode::isJavaIdentifierStart(UChar32 ch) {
01135     const UBool canStart = u_isJavaIDStart(ch);
01136     return canStart;
01137 }
01138 
01139 // True when ch may appear inside a Java identifier in a position other
01140 // than the first, per u_isJavaIDPart().
01141 inline UBool Unicode::isJavaIdentifierPart(UChar32 ch) {
01142     const UBool canContinue = u_isJavaIDPart(ch);
01143     return canContinue;
01144 }
01145 
01146 // True when ch may begin a Unicode identifier, per u_isIDStart().
01147 inline UBool Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01148     const UBool canStart = u_isIDStart(ch);
01149     return canStart;
01150 }
01151 
01152 // True when ch may appear inside a Unicode identifier in a position other
01153 // than the first, per u_isIDPart().
01154 inline UBool Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01155     const UBool canContinue = u_isIDPart(ch);
01156     return canContinue;
01157 }
01158 
01159 // True when ch is ignorable inside a Java or Unicode identifier, per u_isIDIgnorable().
01160 inline UBool Unicode::isIdentifierIgnorable(UChar32 ch) {
01161     const UBool ignorable = u_isIDIgnorable(ch);
01162     return ignorable;
01163 }
01164 
01165 // Lower case mapping: returns whatever u_tolower() yields for ch.
01166 inline UChar32 Unicode::toLowerCase(UChar32 ch) {
01167     const UChar32 lowered = u_tolower(ch);
01168     return lowered;
01169 }
01170     
01171 // Upper case mapping: returns whatever u_toupper() yields for ch.
01172 inline UChar32 Unicode::toUpperCase(UChar32 ch) {
01173     const UChar32 raised = u_toupper(ch);
01174     return raised;
01175 }
01176 
01177 // Title case mapping: returns whatever u_totitle() yields for ch.
01178 inline UChar32 Unicode::toTitleCase(UChar32 ch) {
01179     const UChar32 titled = u_totitle(ch);
01180     return titled;
01181 }
01182 
01183 // Case folding: returns u_foldCase() of ch; options is passed through untouched.
01184 inline UChar32 Unicode::foldCase(UChar32 ch, uint32_t options) {
01185     const UChar32 folded = u_foldCase(ch, options);
01186     return folded;
01187 }
01188     
01189 // Space character test: hands ch to u_isspace() and relays its answer.
01190 inline UBool Unicode::isSpaceChar(UChar32 ch) {
01191     const UBool space = u_isspace(ch);
01192     return space;
01193 }
01194 
01195 // White space test using ICU's own definition: relays u_isWhitespace(ch).
01196 inline UBool Unicode::isWhitespace(UChar32 ch) {
01197     const UBool whitespace = u_isWhitespace(ch);
01198     return whitespace;
01199 }
01200 
01201 // Gets the character property (type) of ch as reported by u_charType().
01202 inline int8_t Unicode::getType(UChar32 ch) {
01203     const int8_t charType = u_charType(ch);
01204     return charType;
01205 }
01206 
01207 // Thin wrapper: the combining class that u_getCombiningClass() reports for c.
01208 inline uint8_t Unicode::getCombiningClass(UChar32 c) {
01209     return u_getCombiningClass(c);
01210 }
01211 
01212 // Linguistic directionality of ch: u_charDirection()'s result cast to EDirectionProperty.
01213 inline Unicode::EDirectionProperty Unicode::characterDirection(UChar32 ch) {
01214     const EDirectionProperty direction = (EDirectionProperty)u_charDirection(ch);
01215     return direction;
01216 }
01217 
01218 // True when ch carries the "mirrored" property, per u_isMirrored().
01219 inline UBool Unicode::isMirrored(UChar32 ch) {
01220     const UBool mirrored = u_isMirrored(ch);
01221     return mirrored;
01222 }
01223 
01224 // "Mirror-image" mapping: returns u_charMirror(ch), which is the mirror character or ch itself.
01225 inline UChar32 Unicode::charMirror(UChar32 ch) {
01226     const UChar32 mirror = u_charMirror(ch);
01227     return mirror;
01228 }
01229 
01230 // Script associated with ch: u_charScript()'s result cast to EUnicodeScript.
01231 inline Unicode::EUnicodeScript Unicode::getScript(UChar32 ch) {
01232     const EUnicodeScript script = (EUnicodeScript) u_charScript(ch);
01233     return script;
01234 }
01235 
01236 // Table cell width of ch as reported by u_charCellWidth().
01237 inline uint16_t Unicode::getCellWidth(UChar32 ch) {
01238     const uint16_t cellWidth = u_charCellWidth(ch);
01239     return cellWidth;
01240 }
01241 
01242 inline UTextOffset
01243 Unicode::getCharName(uint32_t code,
01244                      char *buffer, UTextOffset bufferLength,
01245                      UCharNameChoice nameChoice) {
01246     UErrorCode errorCode=U_ZERO_ERROR;
01247     UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01248     return U_SUCCESS(errorCode) ? length : 0;
01249 }
01250 
01251 // Thin wrapper: the digit value that u_charDigitValue() reports for ch.
01252 inline int32_t Unicode::digitValue(UChar32 ch) {
01253     return u_charDigitValue(ch);
01254 }
01255 
01256 // Value of ch as a digit in the given radix, or -1 when radix lies outside
01257 // [MIN_RADIX, MAX_RADIX] or ch is not a digit below radix. Besides whatever
01258 // u_charDigitValue() reports, only ASCII A-Z / a-z are accepted, as 10..35.
01259 inline int8_t Unicode::digit(UChar32 ch, int8_t radix) {
01260     // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here
01261     // one unsigned compare rejects both radix<MIN_RADIX (wraps to a large value) and radix>MAX_RADIX
01262     if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX)) {
01263         return -1; // invalid radix
01264     }
01265     int8_t value=(int8_t)u_charDigitValue(ch);
01266     if(value<0) { // ch is not a decimal digit, try latin letters
01267         if((uint32_t)(ch-0x41)<26) {
01268             value=(int8_t)(ch-(0x41-10)); // A-Z map to 10..35
01269         } else if((uint32_t)(ch-0x61)<26) {
01270             value=(int8_t)(ch-(0x61-10)); // a-z map to 10..35
01271         } else {
01272             return -1; // ch is not a digit character
01273         }
01274     }
01275     return (int8_t)((value<radix) ? value : -1); // keep the result signed; no detour through uint8_t
01276 }
01277 
01278 // Counterpart of digit(): the character for a digit value in the given radix,
01279 // or 0 when radix is outside [MIN_RADIX, MAX_RADIX] or digit is not in [0, radix).
01280 inline UChar32 Unicode::forDigit(int32_t digit, int8_t radix) {
01281     // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here
01282     const UBool badRadix=(uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX);
01283     // comparing as unsigned also rejects negative digit values
01284     if(badRadix || (uint32_t)digit>=(uint32_t)radix) {
01285         return 0;
01286     }
01287     return (UChar32)(digit<10 ? 0x30+digit : (0x61-10)+digit); // '0'-'9', then 'a'-'z'
01288 }
01289 
01290 // Thin wrapper: hands versionArray to u_getUnicodeVersion(); nothing is returned.
01291 inline void Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01292     u_getUnicodeVersion(versionArray);
01293 }
01294 
01295 #endif

Generated at Thu Mar 22 16:12:40 2001 for ICU 1.8 by doxygen1.2.3 written by Dimitri van Heesch, © 1997-2000