Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Search  

unicode.h

Go to the documentation of this file.
00001 /*
00002 ******************************************************************************
00003 *   Copyright (C) 1996-2001, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 ******************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu      Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //*****************************************************************************
00024 
00025 
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00033 U_NAMESPACE_BEGIN
00042 /**
00043  * Unicode groups static constants and static helper functions for querying
00044  * and converting Unicode characters ("C++ Wrappers for Unicode").
00045  * Every helper declared here has an inline definition further down in this
00046  * header that forwards to a u_*() function or a UTF_*() macro.
00047  *
00048  * The class is not meant to be instantiated or subclassed, which is why its
00049  * constructors, destructor and assignment operator are not public.
00050  *
00051  * NOTE(review): members tagged @deprecated point at this description; the
00052  * original deprecation text is not part of this listing. Presumably the C API
00053  * that the helpers forward to is the replacement -- confirm.
00054  */
00055 class U_COMMON_API Unicode
00056 {
00057 public:
00058     /*
00059      * In C++, static const members actually take up memory and need to be accessed.
00060      * enum values are more like C #define's.
00061      * The following is a collection of constants, not an enumeration type.
00062      *
00063      * @deprecated See the Unicode class description.
00064      */
00065     enum {
00066         /** Smallest code point value (0). */
00067         MIN_VALUE=0,
00068 
00073         /** Largest code point value (0x10ffff). */
00074         MAX_VALUE=0x10ffff,
00075 
00082         /** Same value as the UTF_MAX_CHAR_LENGTH macro. */
00083         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00084 
00094         /** Smallest radix constant (2); added together with digit() and forDigit(). */
00095         MIN_RADIX=2,
00096 
00106         /** Largest radix constant (36); added together with digit() and forDigit(). */
00107         MAX_RADIX=36
00108     };
00109 
00113     /**
00114      * Character type constants.
00115      * NOTE(review): presumably these mirror the values getType() returns -- confirm.
00115      */
00116     enum EUnicodeGeneralTypes
00117     {
00118         UNASSIGNED              = 0,
00119         UPPERCASE_LETTER        = 1,
00120         LOWERCASE_LETTER        = 2,
00121         TITLECASE_LETTER        = 3,
00122         MODIFIER_LETTER         = 4,
00123         OTHER_LETTER            = 5,
00124         NON_SPACING_MARK        = 6,
00125         ENCLOSING_MARK          = 7,
00126         COMBINING_SPACING_MARK  = 8,
00127         DECIMAL_DIGIT_NUMBER    = 9,
00128         LETTER_NUMBER           = 10,
00129         OTHER_NUMBER            = 11,
00130         SPACE_SEPARATOR         = 12,
00131         LINE_SEPARATOR          = 13,
00132         PARAGRAPH_SEPARATOR     = 14,
00133         CONTROL                 = 15,
00134         FORMAT                  = 16,
00135         PRIVATE_USE             = 17,
00136         SURROGATE               = 18,
00137         DASH_PUNCTUATION        = 19,
00138         START_PUNCTUATION       = 20,
00139         END_PUNCTUATION         = 21,
00140         CONNECTOR_PUNCTUATION   = 22,
00141         OTHER_PUNCTUATION       = 23,
00142         MATH_SYMBOL             = 24,
00143         CURRENCY_SYMBOL         = 25,
00144         MODIFIER_SYMBOL         = 26,
00145         OTHER_SYMBOL            = 27,
00146         INITIAL_PUNCTUATION     = 28,
00147         FINAL_PUNCTUATION       = 29,
00148         GENERAL_TYPES_COUNT     = 30
00149     };
00150 
00151     /* Please keep these values in sync with UCharScript */
00153     /**
00154      * Script constants: the first is pinned to UBLOCK_BASIC_LATIN, the others
00155      * count up from it. getScript() casts the u_charScript() result to this type.
00156      */
00157     enum EUnicodeScript
00158     {
00159         kBasicLatin=UBLOCK_BASIC_LATIN,
00160         kLatin1Supplement,
00161         kLatinExtendedA,
00162         kLatinExtendedB,
00163         kIPAExtension,
00164         kSpacingModifier,
00165         kCombiningDiacritical,
00166         kGreek,
00167         kCyrillic,
00168         kArmenian,
00169         kHebrew,
00170         kArabic,
00171         kSyriac,
00172         kThaana,
00173         kDevanagari,
00174         kBengali,
00175         kGurmukhi,
00176         kGujarati,
00177         kOriya,
00178         kTamil,
00179         kTelugu,
00180         kKannada,
00181         kMalayalam,
00182         kSinhala,
00183         kThai,
00184         kLao,
00185         kTibetan,
00186         kMyanmar,
00187         kGeorgian,
00188         kHangulJamo,
00189         kEthiopic,
00190         kCherokee,
00191         kUnifiedCanadianAboriginalSyllabics,
00192         kOgham, kogham=kOgham, /* kogham is the historical misspelling, kept as an equal-valued alias */
00193         kRunic,
00194         kKhmer,
00195         kMongolian,
00196         kLatinExtendedAdditional,
00197         kGreekExtended,
00198         kGeneralPunctuation,
00199         kSuperSubScript,
00200         kCurrencySymbolScript,
00201         kSymbolCombiningMark,
00202         kLetterlikeSymbol,
00203         kNumberForm,
00204         kArrow,
00205         kMathOperator,
00206         kMiscTechnical,
00207         kControlPicture,
00208         kOpticalCharacter,
00209         kEnclosedAlphanumeric,
00210         kBoxDrawing,
00211         kBlockElement,
00212         kGeometricShape,
00213         kMiscSymbol,
00214         kDingbat,
00215         kBraillePatterns,
00216         kCJKRadicalsSupplement,
00217         kKangxiRadicals,
00218         kIdeographicDescriptionCharacters,
00219         kCJKSymbolPunctuation,
00220         kHiragana,
00221         kKatakana,
00222         kBopomofo,
00223         kHangulCompatibilityJamo,
00224         kKanbun,
00225         kBopomofoExtended,
00226         kEnclosedCJKLetterMonth,
00227         kCJKCompatibility,
00228         kCJKUnifiedIdeographExtensionA,
00229         kCJKUnifiedIdeograph,
00230         kYiSyllables,
00231         kYiRadicals,
00232         kHangulSyllable,
00233         kHighSurrogate,
00234         kHighPrivateUseSurrogate,
00235         kLowSurrogate,
00236         kPrivateUse,
00237         kCJKCompatibilityIdeograph,
00238         kAlphabeticPresentation,
00239         kArabicPresentationA,
00240         kCombiningHalfMark,
00241         kCJKCompatibilityForm,
00242         kSmallFormVariant,
00243         kArabicPresentationB,
00244         kNoScript,
00245         kHalfwidthFullwidthForm,
00246         kScriptCount=UBLOCK_COUNT
00247     };
00248 
00252     /** Directional property constants; characterDirection() casts the
00253      *  u_charDirection() result to this type. */
00254     enum EDirectionProperty {
00255         LEFT_TO_RIGHT               = 0,
00256         RIGHT_TO_LEFT               = 1,
00257         EUROPEAN_NUMBER             = 2,
00258         EUROPEAN_NUMBER_SEPARATOR   = 3,
00259         EUROPEAN_NUMBER_TERMINATOR  = 4,
00260         ARABIC_NUMBER               = 5,
00261         COMMON_NUMBER_SEPARATOR     = 6,
00262         BLOCK_SEPARATOR             = 7,
00263         SEGMENT_SEPARATOR           = 8,
00264         WHITE_SPACE_NEUTRAL         = 9,
00265         OTHER_NEUTRAL               = 10,
00266         LEFT_TO_RIGHT_EMBEDDING     = 11,
00267         LEFT_TO_RIGHT_OVERRIDE      = 12,
00268         RIGHT_TO_LEFT_ARABIC        = 13,
00269         RIGHT_TO_LEFT_EMBEDDING     = 14,
00270         RIGHT_TO_LEFT_OVERRIDE      = 15,
00271         POP_DIRECTIONAL_FORMAT      = 16,
00272         DIR_NON_SPACING_MARK        = 17,
00273         BOUNDARY_NEUTRAL            = 18
00274     };
00275 
00280     /** Cell width constants.
00281      *  NOTE(review): presumably the values getCellWidth() returns -- confirm. */
00282     enum ECellWidths
00283     {
00284         ZERO_WIDTH              = 0,
00285         HALF_WIDTH              = 1,
00286         FULL_WIDTH              = 2,
00287         NEUTRAL                 = 3
00288     };
00289 
00300     /** Evaluates UTF_IS_SINGLE(c) for the code unit c. */
00301     static inline UBool isSingle(UChar c);
00302 
00311     /** Evaluates UTF_IS_LEAD(c) for the code unit c. */
00312     static inline UBool isLead(UChar c);
00313 
00322     /** Evaluates UTF_IS_TRAIL(c) for the code unit c. */
00323     static inline UBool isTrail(UChar c);
00324 
00335     /** Evaluates UTF_IS_SURROGATE(c) for the code point c. */
00336     static inline UBool isSurrogate(UChar32 c);
00337 
00350     /** Evaluates UTF_IS_UNICODE_CHAR(c) for the code point c. */
00351     static inline UBool isUnicodeChar(UChar32 c);
00352 
00364     /** Evaluates UTF_IS_ERROR(c) for the code point c. */
00365     static inline UBool isError(UChar32 c);
00366 
00376     /** Evaluates UTF_IS_VALID(c) for the code point c. */
00377     static inline UBool isValid(UChar32 c);
00378 
00390     /** Evaluates UTF_NEED_MULTIPLE_UCHAR(c) for the code point c. */
00391     static inline UBool needMultipleUChar(UChar32 c);
00392 
00401     /** Evaluates UTF_CHAR_LENGTH(c) and returns the result as int32_t. */
00402     static inline int32_t charLength(UChar32 c);
00403 
00417     /** Evaluates UTF_ARRAY_SIZE(size) and returns the result as int32_t. */
00418     static inline int32_t arraySize(int32_t size);
00419 
00432     /** Checks if ch is a lower case letter (forwards to u_islower). */
00433     static inline UBool isLowerCase(UChar32 ch);
00434 
00446     /** Checks if ch is an upper case letter (forwards to u_isupper). */
00447     static inline UBool isUpperCase(UChar32 ch);
00448 
00460     /** Checks if ch is a title case letter (forwards to u_istitle). */
00461     static inline UBool isTitleCase(UChar32 ch);
00462 
00474     /** Checks if ch is a decimal digit (forwards to u_isdigit). */
00475     static inline UBool isDigit(UChar32 ch);
00476 
00492     /** Checks if ch has an assigned character type (forwards to u_isdefined). */
00493     static inline UBool isDefined(UChar32 ch);
00494 
00505     /** Checks if ch is a control character (forwards to u_iscntrl). */
00506     static inline UBool isControl(UChar32 ch);
00507 
00518     /** Checks if ch is printable (forwards to u_isprint). */
00519     static inline UBool isPrintable(UChar32 ch);
00520 
00532     /** Checks if ch is a base form character that can take a diacritic (forwards to u_isbase). */
00533     static inline UBool isBaseForm(UChar32 ch);
00534 
00550     /** Checks if ch is a letter (forwards to u_isalpha). */
00551     static inline UBool isLetter(UChar32 ch);
00552 
00573     /** Checks if ch can start a Java identifier (forwards to u_isJavaIDStart). */
00574     static inline UBool isJavaIdentifierStart(UChar32 ch);
00575 
00604     /** Checks if ch can be a non-initial part of a Java identifier (forwards to u_isJavaIDPart). */
00605     static inline UBool isJavaIdentifierPart(UChar32 ch);
00606 
00621     /** Checks if ch can start a Unicode identifier (forwards to u_isIDStart). */
00622     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00623 
00650     /** Checks if ch can be a non-initial part of a Unicode identifier (forwards to u_isIDPart). */
00651     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00652 
00678     /** Checks if ch is ignorable in a Java or Unicode identifier (forwards to u_isIDIgnorable). */
00679     static inline UBool isIdentifierIgnorable(UChar32 ch);
00680 
00705     /** Maps ch to its lower case equivalent (forwards to u_tolower). */
00706     static inline UChar32 toLowerCase(UChar32 ch);
00707 
00729     /** Maps ch to its upper case equivalent (forwards to u_toupper). */
00730     static inline UChar32 toUpperCase(UChar32 ch);
00731 
00749     /** Maps ch to its title case equivalent (forwards to u_totitle). */
00750     static inline UChar32 toTitleCase(UChar32 ch);
00751 
00765     /** Maps c to its case folded equivalent; options is handed to u_foldCase unchanged. */
00766     static inline UChar32
00767     foldCase(UChar32 c, uint32_t options);
00768 
00777     /** Checks if ch is a space character (forwards to u_isspace). */
00778     static inline UBool isSpaceChar(UChar32 ch);
00779 
00808     /** Checks if ch is white space according to ICU (forwards to u_isWhitespace). */
00809     static inline UBool isWhitespace(UChar32 ch);
00810 
00845     /** Returns the character type that u_charType reports for ch. */
00846     static inline int8_t getType(UChar32 ch);
00847 
00855     /** Returns the combining class that u_getCombiningClass reports for c. */
00856     static inline uint8_t getCombiningClass(UChar32 c);
00857 
00869     /** Returns the linguistic directionality of ch (u_charDirection result cast to EDirectionProperty). */
00870     static inline EDirectionProperty characterDirection(UChar32 ch);
00871 
00882     /** Checks if c has the "mirrored" property (forwards to u_isMirrored). */
00883     static inline UBool isMirrored(UChar32 c);
00884 
00901     /** Maps c to a "mirror-image" character, or to itself (forwards to u_charMirror). */
00902     static inline UChar32 charMirror(UChar32 c);
00903 
00910     /** Returns the script associated with ch (u_charScript result cast to EUnicodeScript). */
00911     static inline EUnicodeScript getScript(UChar32 ch);
00912 
00965     /** Returns the table cell width that u_charCellWidth reports for ch. */
00966     static inline uint16_t getCellWidth(UChar32 ch);
00967 
00993     /**
00994      * Requests the name of code from u_charName, passing buffer and bufferLength along.
00995      * Returns the length u_charName reports, or 0 when u_charName sets an error code.
00995      */
00996     static inline int32_t
00997     getCharName(uint32_t code,
00998                 char *buffer, int32_t bufferLength,
00999                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
01000 
01011     /** Returns the value u_charDigitValue reports for ch. */
01012     static inline int32_t digitValue(UChar32 ch);
01013 
01051     /** Returns the value u_digit reports for ch in the given radix. */
01052     static inline int32_t digit(UChar32 ch, int8_t radix);
01053 
01081     /** Returns the character u_forDigit reports for digit in the given radix. */
01082     static inline UChar32 forDigit(int32_t digit, int8_t radix);
01083 
01088     /** Hands info to u_getUnicodeVersion. Declared inline to match its
01089      *  inline definition and every other helper in this class. */
01090     static inline void getUnicodeVersion(UVersionInfo info);
01091 
01092 protected:
01093     // The constructors, destructor and assignment operator below exist only
01094     // to prevent anyone from instantiating or subclassing Unicode.
01095     // Semantically they should be private; they are declared protected
01096     // instead to make various UNIX compilers happy. [LIU]
01098     Unicode();
01099     /* copy constructor
01100      * @param other The object to be copied
01101      */
01102     Unicode(const Unicode &other);
01103     ~Unicode();
01104     /* assignment operator
01105      * @param other The object to be copied
01106      * @return a const reference (presumably to this object; the definition is not in this header)
01107      */
01108     const Unicode &operator=(const Unicode &other);
01109 };
01110 
01111 /* inline implementations --------------------------------------------------- */
01112 
01113 // Evaluates the UTF_IS_SINGLE macro for the code unit c.
01114 inline UBool Unicode::isSingle(UChar c) {
01115     return UTF_IS_SINGLE(c);
01116 }
01117 
01118 // Evaluates the UTF_IS_LEAD macro for the code unit c.
01119 inline UBool Unicode::isLead(UChar c) {
01120     return UTF_IS_LEAD(c);
01121 }
01122 
01123 // Evaluates the UTF_IS_TRAIL macro for the code unit c.
01124 inline UBool Unicode::isTrail(UChar c) {
01125     return UTF_IS_TRAIL(c);
01126 }
01127 
01128 // Evaluates the UTF_IS_SURROGATE macro for the code point c.
01129 inline UBool Unicode::isSurrogate(UChar32 c) {
01130     return UTF_IS_SURROGATE(c);
01131 }
01132 
01133 // Evaluates the UTF_IS_UNICODE_CHAR macro for the code point c.
01134 inline UBool Unicode::isUnicodeChar(UChar32 c) {
01135     return UTF_IS_UNICODE_CHAR(c);
01136 }
01137 
01138 // Evaluates the UTF_IS_ERROR macro for the code point c.
01139 inline UBool Unicode::isError(UChar32 c) {
01140     return UTF_IS_ERROR(c);
01141 }
01142 
01143 // Evaluates the UTF_IS_VALID macro for the code point c.
01144 inline UBool Unicode::isValid(UChar32 c) {
01145     return UTF_IS_VALID(c);
01146 }
01147 
01148 // Evaluates the UTF_NEED_MULTIPLE_UCHAR macro for the code point c.
01149 inline UBool Unicode::needMultipleUChar(UChar32 c) {
01150     return UTF_NEED_MULTIPLE_UCHAR(c);
01151 }
01152 
01153 // Evaluates the UTF_CHAR_LENGTH macro for the code point c.
01154 inline int32_t Unicode::charLength(UChar32 c) {
01155     return UTF_CHAR_LENGTH(c);
01156 }
01157 
01158 // Evaluates the UTF_ARRAY_SIZE macro for the given size.
01159 inline int32_t Unicode::arraySize(int32_t size) {
01160     return UTF_ARRAY_SIZE(size);
01161 }
01162 
01163 // Reports whether ch is a lower case letter, as decided by u_islower().
01164 inline UBool Unicode::isLowerCase(UChar32 ch) {
01165     const UBool lowerCase = u_islower(ch);
01166     return lowerCase;
01167 }
01168 
01169 // Reports whether ch is an upper case letter, as decided by u_isupper().
01170 inline UBool Unicode::isUpperCase(UChar32 ch) {
01171     const UBool upperCase = u_isupper(ch);
01172     return upperCase;
01173 }
01174 
01175 // Reports whether ch is a title case letter, as decided by u_istitle().
01176 inline UBool Unicode::isTitleCase(UChar32 ch) {
01177     const UBool titleCase = u_istitle(ch);
01178     return titleCase;
01179 }
01180 
01181 // Reports whether ch is a decimal digit, as decided by u_isdigit().
01182 inline UBool Unicode::isDigit(UChar32 ch) {
01183     const UBool decimalDigit = u_isdigit(ch);
01184     return decimalDigit;
01185 }
01186 
01187 // Reports whether ch has an assigned character type, as decided by u_isdefined().
01188 inline UBool Unicode::isDefined(UChar32 ch) {
01189     const UBool assigned = u_isdefined(ch);
01190     return assigned;
01191 }
01192 
01193 // Reports whether ch is a control character, as decided by u_iscntrl().
01194 inline UBool Unicode::isControl(UChar32 ch) {
01195     const UBool control = u_iscntrl(ch);
01196     return control;
01197 }
01198 
01199 // Reports whether ch is printable, as decided by u_isprint().
01200 inline UBool Unicode::isPrintable(UChar32 ch) {
01201     const UBool printable = u_isprint(ch);
01202     return printable;
01203 }
01204 
01205 // Reports whether ch is a base form character that can take a diacritic (u_isbase()).
01206 inline UBool Unicode::isBaseForm(UChar32 ch) {
01207     const UBool baseForm = u_isbase(ch);
01208     return baseForm;
01209 }
01210 
01211 // Reports whether ch is a letter, as decided by u_isalpha().
01212 inline UBool Unicode::isLetter(UChar32 ch) {
01213     const UBool letter = u_isalpha(ch);
01214     return letter;
01215 }
01216 
01217 // Reports whether ch can start a Java identifier, as decided by u_isJavaIDStart().
01218 inline UBool Unicode::isJavaIdentifierStart(UChar32 ch) {
01219     const UBool startsIdentifier = u_isJavaIDStart(ch);
01220     return startsIdentifier;
01221 }
01222 
01223 // Reports whether ch can appear in a Java identifier somewhere other than
01224 // at its start, as decided by u_isJavaIDPart().
01225 inline UBool Unicode::isJavaIdentifierPart(UChar32 ch) {
01226     const UBool continuesIdentifier = u_isJavaIDPart(ch);
01227     return continuesIdentifier;
01228 }
01229 
01230 // Reports whether ch can start a Unicode identifier, as decided by u_isIDStart().
01231 inline UBool Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01232     const UBool startsIdentifier = u_isIDStart(ch);
01233     return startsIdentifier;
01234 }
01235 
01236 // Reports whether ch can appear in a Unicode identifier somewhere other than
01237 // at its start, as decided by u_isIDPart().
01238 inline UBool Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01239     const UBool continuesIdentifier = u_isIDPart(ch);
01240     return continuesIdentifier;
01241 }
01242 
01243 // Reports whether ch is ignorable in a Java or Unicode identifier (u_isIDIgnorable()).
01244 inline UBool Unicode::isIdentifierIgnorable(UChar32 ch) {
01245     const UBool ignorable = u_isIDIgnorable(ch);
01246     return ignorable;
01247 }
01248 
01249 // Maps ch to its lower case equivalent by way of u_tolower().
01250 inline UChar32 Unicode::toLowerCase(UChar32 ch) {
01251     const UChar32 lowered = u_tolower(ch);
01252     return lowered;
01253 }
01254     
01255 // Maps ch to its upper case equivalent by way of u_toupper().
01256 inline UChar32 Unicode::toUpperCase(UChar32 ch) {
01257     const UChar32 raised = u_toupper(ch);
01258     return raised;
01259 }
01260 
01261 // Maps ch to its title case equivalent by way of u_totitle().
01262 inline UChar32 Unicode::toTitleCase(UChar32 ch) {
01263     const UChar32 titled = u_totitle(ch);
01264     return titled;
01265 }
01266 
01267 // Maps ch to its case folded equivalent; options goes to u_foldCase() unchanged.
01268 inline UChar32 Unicode::foldCase(UChar32 ch, uint32_t options) {
01269     const UChar32 folded = u_foldCase(ch, options);
01270     return folded;
01271 }
01272     
01273 // Reports whether ch is a space character, as decided by u_isspace().
01274 inline UBool Unicode::isSpaceChar(UChar32 ch) {
01275     const UBool space = u_isspace(ch);
01276     return space;
01277 }
01278 
01279 // Reports whether ch is white space according to ICU, as decided by u_isWhitespace().
01280 inline UBool Unicode::isWhitespace(UChar32 ch) {
01281     const UBool whitespace = u_isWhitespace(ch);
01282     return whitespace;
01283 }
01284 
01285 // Returns the character type that u_charType() reports for ch.
01286 inline int8_t Unicode::getType(UChar32 ch) {
01287     const int8_t charType = u_charType(ch);
01288     return charType;
01289 }
01290 
01291 // Returns the combining class that u_getCombiningClass() reports for c.
01292 inline uint8_t Unicode::getCombiningClass(UChar32 c) {
01293     return u_getCombiningClass(c);
01294 }
01295 
01296 // Returns the linguistic directionality of ch: u_charDirection() cast to EDirectionProperty.
01297 inline Unicode::EDirectionProperty Unicode::characterDirection(UChar32 ch) {
01298     const EDirectionProperty direction = (EDirectionProperty)u_charDirection(ch);
01299     return direction;
01300 }
01301 
01302 // Reports whether ch has the "mirrored" property, as decided by u_isMirrored().
01303 inline UBool Unicode::isMirrored(UChar32 ch) {
01304     const UBool mirrored = u_isMirrored(ch);
01305     return mirrored;
01306 }
01307 
01308 // Maps ch to a "mirror-image" character, or to itself, by way of u_charMirror().
01309 inline UChar32 Unicode::charMirror(UChar32 ch) {
01310     const UChar32 mirrorImage = u_charMirror(ch);
01311     return mirrorImage;
01312 }
01313 
01314 // Returns the script associated with ch: u_charScript() cast to EUnicodeScript.
01315 inline Unicode::EUnicodeScript Unicode::getScript(UChar32 ch) {
01316     const EUnicodeScript script = (EUnicodeScript) u_charScript(ch);
01317     return script;
01318 }
01319 
01320 // Returns the table cell width that u_charCellWidth() reports for ch.
01321 inline uint16_t Unicode::getCellWidth(UChar32 ch) {
01322     const uint16_t cellWidth = u_charCellWidth(ch);
01323     return cellWidth;
01324 }
01325 
01326 inline int32_t
01327 Unicode::getCharName(uint32_t code,
01328                      char *buffer, int32_t bufferLength,
01329                      UCharNameChoice nameChoice) {
01330     UErrorCode errorCode=U_ZERO_ERROR;
01331     int32_t length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01332     return U_SUCCESS(errorCode) ? length : 0;
01333 }
01334 
01335 // Returns the value that u_charDigitValue() reports for ch.
01336 inline int32_t Unicode::digitValue(UChar32 ch) {
01337     return u_charDigitValue(ch);
01338 }
01339 
01340 // Returns the value that u_digit() reports for ch in the given radix.
01341 inline int32_t Unicode::digit(UChar32 ch, int8_t radix) {
01342     return u_digit(ch, radix);
01343 }
01344 
01345 // Returns the character that u_forDigit() reports for digit in the given radix.
01346 inline UChar32 Unicode::forDigit(int32_t digit, int8_t radix) {
01347     return u_forDigit(digit, radix);
01348 }
01349 
01350 // Passes versionArray straight to u_getUnicodeVersion().
01351 inline void Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01352     u_getUnicodeVersion(versionArray);
01353 }
01354 U_NAMESPACE_END
01355 
01356 #endif

Generated on Thu Aug 15 14:13:31 2002 for ICU 2.2 by doxygen 1.2.11.1, written by Dimitri van Heesch, © 1997-2001