Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Search  

unicode.h

Go to the documentation of this file.
00001 /*
00002 ******************************************************************************
00003 *   Copyright (C) 1996-2001, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 ******************************************************************************
00006 */
00007 //  FILE NAME : unicode.h
00008 //
00009 //  CREATED
00010 //      Wednesday, December 11, 1996
00011 //
00012 //  CREATED BY
00013 //      Helena Shih
00014 //
00015 //  CHANGES
00016 //      Thursday, April 15, 1999
00017 //      Modified the definitions of all the functions
00018 //      C++ Wrappers for Unicode
00019 //  CHANGES BY
00020 //      Madhu Katragadda
00021 //   5/20/99     Madhu      Added the function getVersion()
00022 //  11/22/99     aliu       Added MIN_RADIX, MAX_RADIX, digit, forDigit
00023 //*****************************************************************************
00024 
00025 
00026 
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029 
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032 
00033 #ifdef ICU_UNICODE_CLASS_USE_DEPRECATES
00034 U_NAMESPACE_BEGIN
00056 class U_COMMON_API Unicode // enums plus static wrapper functions only; the inline definitions follow the class
00057 {
00058 public:
00059     /*
00060      * In C++, static const members actually take up memory and need to be accessed.
00061      * enum values are more like C #define's.
00062      * The following is a collection of constants, not an enumeration type.
00063      *
00064      * @obsolete ICU 2.4. Use the parallel uchar.h/utf.h C API instead since this API will be removed in that release.
00065      */
00066     enum {
00068         MIN_VALUE=0,
00069 
00075         MAX_VALUE=0x10ffff,
00076 
00084         MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00085 
00096         MIN_RADIX=2,
00097 
00108         MAX_RADIX=36
00109     };
00110 
00117     enum EUnicodeGeneralTypes // explicit values 0..29; GENERAL_TYPES_COUNT (30) is the number of types
00118     {
00119         UNASSIGNED              = 0,
00120         UPPERCASE_LETTER        = 1,
00121         LOWERCASE_LETTER        = 2,
00122         TITLECASE_LETTER        = 3,
00123         MODIFIER_LETTER         = 4,
00124         OTHER_LETTER            = 5,
00125         NON_SPACING_MARK        = 6,
00126         ENCLOSING_MARK          = 7,
00127         COMBINING_SPACING_MARK  = 8,
00128         DECIMAL_DIGIT_NUMBER    = 9,
00129         LETTER_NUMBER           = 10,
00130         OTHER_NUMBER            = 11,
00131         SPACE_SEPARATOR         = 12,
00132         LINE_SEPARATOR          = 13,
00133         PARAGRAPH_SEPARATOR     = 14,
00134         CONTROL                 = 15,
00135         FORMAT                  = 16,
00136         PRIVATE_USE             = 17,
00137         SURROGATE               = 18,
00138         DASH_PUNCTUATION        = 19,
00139         START_PUNCTUATION       = 20,
00140         END_PUNCTUATION         = 21,
00141         CONNECTOR_PUNCTUATION   = 22,
00142         OTHER_PUNCTUATION       = 23,
00143         MATH_SYMBOL             = 24,
00144         CURRENCY_SYMBOL         = 25,
00145         MODIFIER_SYMBOL         = 26,
00146         OTHER_SYMBOL            = 27,
00147         INITIAL_PUNCTUATION     = 28,
00148         FINAL_PUNCTUATION       = 29,
00149         GENERAL_TYPES_COUNT     = 30
00150     };
00151 
00152     /* Please keep these values in sync with UCharScript (the enumerators are anchored to UBLOCK_BASIC_LATIN and UBLOCK_COUNT; the ones in between count up by one) */
00158     enum EUnicodeScript 
00159     {
00160         kBasicLatin=UBLOCK_BASIC_LATIN,
00161         kLatin1Supplement,
00162         kLatinExtendedA,
00163         kLatinExtendedB,
00164         kIPAExtension,
00165         kSpacingModifier,
00166         kCombiningDiacritical,
00167         kGreek,
00168         kCyrillic,
00169         kArmenian,
00170         kHebrew,
00171         kArabic,
00172         kSyriac,
00173         kThaana,
00174         kDevanagari,
00175         kBengali,
00176         kGurmukhi,
00177         kGujarati,
00178         kOriya,
00179         kTamil,
00180         kTelugu,
00181         kKannada,
00182         kMalayalam,
00183         kSinhala,
00184         kThai,
00185         kLao,
00186         kTibetan,
00187         kMyanmar,
00188         kGeorgian,
00189         kHangulJamo,
00190         kEthiopic,
00191         kCherokee,
00192         kUnifiedCanadianAboriginalSyllabics,
00193         kogham, // NOTE(review): lower-case 'o' unlike the other kXxx names; left as-is because it is a public enumerator
00194         kRunic,
00195         kKhmer,
00196         kMongolian,
00197         kLatinExtendedAdditional,
00198         kGreekExtended,
00199         kGeneralPunctuation,
00200         kSuperSubScript,
00201         kCurrencySymbolScript,
00202         kSymbolCombiningMark,
00203         kLetterlikeSymbol,
00204         kNumberForm,
00205         kArrow,
00206         kMathOperator,
00207         kMiscTechnical,
00208         kControlPicture,
00209         kOpticalCharacter,
00210         kEnclosedAlphanumeric,
00211         kBoxDrawing,
00212         kBlockElement,
00213         kGeometricShape,
00214         kMiscSymbol,
00215         kDingbat,
00216         kBraillePatterns,
00217         kCJKRadicalsSupplement,
00218         kKangxiRadicals,
00219         kIdeographicDescriptionCharacters,
00220         kCJKSymbolPunctuation,
00221         kHiragana,
00222         kKatakana,
00223         kBopomofo,
00224         kHangulCompatibilityJamo,
00225         kKanbun,
00226         kBopomofoExtended,
00227         kEnclosedCJKLetterMonth,
00228         kCJKCompatibility,
00229         kCJKUnifiedIdeographExtensionA,
00230         kCJKUnifiedIdeograph,
00231         kYiSyllables,
00232         kYiRadicals,
00233         kHangulSyllable,
00234         kHighSurrogate,
00235         kHighPrivateUseSurrogate,
00236         kLowSurrogate,
00237         kPrivateUse,
00238         kCJKCompatibilityIdeograph,
00239         kAlphabeticPresentation,
00240         kArabicPresentationA,
00241         kCombiningHalfMark,
00242         kCJKCompatibilityForm,
00243         kSmallFormVariant,
00244         kArabicPresentationB,
00245         kNoScript,
00246         kHalfwidthFullwidthForm,
00247         kScriptCount=UBLOCK_COUNT
00248     };
00249 
00255     enum EDirectionProperty { // explicit values 0..18; characterDirection() casts the u_charDirection() result to this type
00256         LEFT_TO_RIGHT               = 0, 
00257         RIGHT_TO_LEFT               = 1, 
00258         EUROPEAN_NUMBER             = 2,
00259         EUROPEAN_NUMBER_SEPARATOR   = 3,
00260         EUROPEAN_NUMBER_TERMINATOR  = 4,
00261         ARABIC_NUMBER               = 5,
00262         COMMON_NUMBER_SEPARATOR     = 6,
00263         BLOCK_SEPARATOR             = 7,
00264         SEGMENT_SEPARATOR           = 8,
00265         WHITE_SPACE_NEUTRAL         = 9, 
00266         OTHER_NEUTRAL               = 10, 
00267         LEFT_TO_RIGHT_EMBEDDING     = 11,
00268         LEFT_TO_RIGHT_OVERRIDE      = 12,
00269         RIGHT_TO_LEFT_ARABIC        = 13,
00270         RIGHT_TO_LEFT_EMBEDDING     = 14,
00271         RIGHT_TO_LEFT_OVERRIDE      = 15,
00272         POP_DIRECTIONAL_FORMAT      = 16,
00273         DIR_NON_SPACING_MARK        = 17,
00274         BOUNDARY_NEUTRAL            = 18
00275     };
00276 
00283     enum ECellWidths // explicit values 0..3
00284     {
00285         ZERO_WIDTH              = 0,
00286         HALF_WIDTH              = 1,
00287         FULL_WIDTH              = 2,
00288         NEUTRAL                 = 3
00289     };
00290 
00302     static inline UBool isSingle(UChar c);
00303 
00313     static inline UBool isLead(UChar c);
00314 
00324     static inline UBool isTrail(UChar c);
00325 
00337     static inline UBool isSurrogate(UChar32 c);
00338 
00352     static inline UBool isUnicodeChar(UChar32 c);
00353 
00366     static inline UBool isError(UChar32 c);
00367 
00378     static inline UBool isValid(UChar32 c);
00379 
00392     static inline UBool needMultipleUChar(UChar32 c);
00393 
00403     static inline int32_t charLength(UChar32 c);
00404 
00419     static inline int32_t arraySize(int32_t size);
00420 
00434     static inline UBool isLowerCase(UChar32 ch);
00435 
00448     static inline UBool isUpperCase(UChar32 ch);
00449 
00462     static inline UBool isTitleCase(UChar32 ch);
00463 
00476     static inline UBool isDigit(UChar32 ch);
00477 
00494     static inline UBool isDefined(UChar32 ch);
00495 
00507     static inline UBool isControl(UChar32 ch);
00508 
00520     static inline UBool isPrintable(UChar32 ch);
00521 
00534      static inline UBool isBaseForm(UChar32 ch);
00535 
00552     static inline UBool isLetter(UChar32 ch);
00553 
00575     static inline UBool isJavaIdentifierStart(UChar32 ch);
00576 
00606     static inline UBool isJavaIdentifierPart(UChar32 ch);
00607 
00623     static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00624 
00652     static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00653 
00680     static inline UBool isIdentifierIgnorable(UChar32 ch);
00681 
00707    static inline UChar32 toLowerCase(UChar32 ch); 
00708 
00731     static inline UChar32 toUpperCase(UChar32 ch);
00732 
00751     static inline UChar32 toTitleCase(UChar32 ch);
00752 
00767     static inline UChar32
00768     foldCase(UChar32 c, uint32_t options);
00769 
00779     static inline UBool isSpaceChar(UChar32 ch);
00780 
00810     static inline UBool isWhitespace(UChar32 ch);
00811 
00847     static inline int8_t getType(UChar32 ch);
00848 
00857     static inline uint8_t getCombiningClass(UChar32 c);
00858 
00871     static inline EDirectionProperty characterDirection(UChar32 ch);
00872 
00884     static inline UBool isMirrored(UChar32 c);
00885 
00903     static inline UChar32 charMirror(UChar32 c);
00904 
00913     static inline EUnicodeScript getScript(UChar32 ch);
00914 
00968     static inline uint16_t getCellWidth(UChar32 ch);
00969 
00998     static inline int32_t
00999     getCharName(uint32_t code,
01000                 char *buffer, int32_t bufferLength,
01001                 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
01002 
01014     static inline int32_t digitValue(UChar32 ch);     
01015 
01054     static inline int32_t digit(UChar32 ch, int8_t radix);
01055 
01084     static inline UChar32 forDigit(int32_t digit, int8_t radix);
01085 
01092     static void getUnicodeVersion(UVersionInfo info); // no 'inline' here, but the definition after the class is inline
01093 
01094 protected:
01095     // The constructors, destructor, and assignment operator below must
01096     // be protected (not private, as they semantically are) to make
01097     // various UNIX compilers happy. [LIU]
01098     // They should be private to prevent anyone from instantiating or
01099     // subclassing Unicode.
01100     Unicode();
01101     /* copy constructor
01102      * @param other The object to be copied
01103      */
01104     Unicode(const Unicode &other);
01105     ~Unicode();
01106     /* assignment operator
01107      * @param other The object to be copied
01108      * @return a const reference to a Unicode object (only the declaration is visible here)
01109      */
01110     const Unicode &operator=(const Unicode &other);
01111 };
01112 
01113 /* inline implementations --------------------------------------------------- */
01114 
01115 inline UBool
01116 Unicode::isSingle(UChar c) {
01117     return UTF_IS_SINGLE(c); // forwards to UTF_IS_SINGLE (not defined in this file); presumably true when c alone encodes a code point - confirm in utf.h
01118 }
01119 
01120 inline UBool
01121 Unicode::isLead(UChar c) {
01122     return UTF_IS_LEAD(c); // forwards to UTF_IS_LEAD (not defined in this file); presumably tests for the first unit of a multi-unit sequence - confirm in utf.h
01123 }
01124 
01125 inline UBool
01126 Unicode::isTrail(UChar c) {
01127     return UTF_IS_TRAIL(c); // forwards to UTF_IS_TRAIL (not defined in this file); presumably tests for a trailing unit of a multi-unit sequence - confirm in utf.h
01128 }
01129 
01130 inline UBool
01131 Unicode::isSurrogate(UChar32 c) {
01132     return UTF_IS_SURROGATE(c); // forwards to UTF_IS_SURROGATE (not defined in this file)
01133 }
01134 
01135 inline UBool
01136 Unicode::isUnicodeChar(UChar32 c) {
01137     return UTF_IS_UNICODE_CHAR(c); // forwards to UTF_IS_UNICODE_CHAR (not defined in this file)
01138 }
01139 
01140 inline UBool
01141 Unicode::isError(UChar32 c) {
01142     return UTF_IS_ERROR(c); // forwards to UTF_IS_ERROR (not defined in this file)
01143 }
01144 
01145 inline UBool
01146 Unicode::isValid(UChar32 c) {
01147     return UTF_IS_VALID(c); // forwards to UTF_IS_VALID (not defined in this file)
01148 }
01149 
01150 inline UBool
01151 Unicode::needMultipleUChar(UChar32 c) {
01152     return UTF_NEED_MULTIPLE_UCHAR(c); // forwards to UTF_NEED_MULTIPLE_UCHAR (not defined in this file); presumably true when c does not fit in one UChar - confirm in utf.h
01153 }
01154 
01155 inline int32_t
01156 Unicode::charLength(UChar32 c) {
01157     return UTF_CHAR_LENGTH(c); // forwards to UTF_CHAR_LENGTH (not defined in this file); presumably the number of UChars needed for c - confirm in utf.h
01158 }
01159 
01160 inline int32_t
01161 Unicode::arraySize(int32_t size) {
01162     return UTF_ARRAY_SIZE(size); // forwards to UTF_ARRAY_SIZE (not defined in this file); meaning of the result is not visible here - see utf.h
01163 }
01164 
01165 // Returns whether ch is a lower case letter; forwards to u_islower().
01166 inline UBool
01167 Unicode::isLowerCase(UChar32 ch) {
01168     return u_islower(ch);
01169 }
01170 
01171 // Returns whether ch is an upper case letter; forwards to u_isupper().
01172 inline UBool
01173 Unicode::isUpperCase(UChar32 ch) {
01174     return u_isupper(ch);
01175 }
01176 
01177 // Returns whether ch is a title case letter (usually these are upper case letters); forwards to u_istitle().
01178 inline UBool
01179 Unicode::isTitleCase(UChar32 ch) {
01180     return u_istitle(ch);
01181 }
01182 
01183 // Returns whether ch is a decimal digit; forwards to u_isdigit().
01184 inline UBool
01185 Unicode::isDigit(UChar32 ch) {
01186     return u_isdigit(ch);
01187 }
01188 
01189 // Returns whether ch is a Unicode character with an assigned character type; forwards to u_isdefined().
01190 inline UBool
01191 Unicode::isDefined(UChar32 ch) {
01192     return u_isdefined(ch);
01193 }
01194 
01195 // Returns whether the Unicode character is a control character; forwards to u_iscntrl().
01196 inline UBool
01197 Unicode::isControl(UChar32 ch) {
01198     return u_iscntrl(ch);
01199 }
01200 
01201 // Returns whether the Unicode character is printable; forwards to u_isprint().
01202 inline UBool
01203 Unicode::isPrintable(UChar32 ch) {
01204     return u_isprint(ch);
01205 }
01206 
01207 // Returns whether the Unicode character is a base form character that can take a diacritic; forwards to u_isbase().
01208 inline UBool
01209 Unicode::isBaseForm(UChar32 ch) {
01210     return u_isbase(ch);
01211 }
01212 
01213 // Returns whether the Unicode character is a letter; forwards to u_isalpha().
01214 inline UBool
01215 Unicode::isLetter(UChar32 ch) {
01216     return u_isalpha(ch);
01217 }
01218 
01219 // Returns whether the Unicode character can start a Java identifier; forwards to u_isJavaIDStart().
01220 inline UBool
01221 Unicode::isJavaIdentifierStart(UChar32 ch) {
01222     return u_isJavaIDStart(ch);
01223 }
01224 
01225 // Returns whether the Unicode character can be part of a Java identifier other than
01226 // its first character; forwards to u_isJavaIDPart().
01227 inline UBool
01228 Unicode::isJavaIdentifierPart(UChar32 ch) {
01229     return u_isJavaIDPart(ch);
01230 }
01231 
01232 // Returns whether the Unicode character can start a Unicode identifier; forwards to u_isIDStart().
01233 inline UBool
01234 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01235     return u_isIDStart(ch);
01236 }
01237 
01238 // Returns whether the Unicode character can be part of a Unicode identifier other than
01239 // its first character; forwards to u_isIDPart().
01240 inline UBool
01241 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01242     return u_isIDPart(ch);
01243 }
01244 
01245 // Returns whether the Unicode character is ignorable in a Java or Unicode identifier; forwards to u_isIDIgnorable().
01246 inline UBool
01247 Unicode::isIdentifierIgnorable(UChar32 ch) {
01248     return u_isIDIgnorable(ch);
01249 }
01250 
01251 // Maps the Unicode character to its lower case equivalent; forwards to u_tolower().
01252 inline UChar32       
01253 Unicode::toLowerCase(UChar32 ch) {
01254     return u_tolower(ch);
01255 }
01256     
01257 // Maps the Unicode character to its upper case equivalent; forwards to u_toupper().
01258 inline UChar32
01259 Unicode::toUpperCase(UChar32 ch) {
01260     return u_toupper(ch);
01261 }
01262 
01263 // Maps the Unicode character to its title case equivalent; forwards to u_totitle().
01264 inline UChar32
01265 Unicode::toTitleCase(UChar32 ch) {
01266     return u_totitle(ch);
01267 }
01268 
01269 // Maps the Unicode character to its case folded equivalent; forwards ch and options unchanged to u_foldCase().
01270 inline UChar32       
01271 Unicode::foldCase(UChar32 ch, uint32_t options) {
01272     return u_foldCase(ch, options);
01273 }
01274     
01275 // Returns whether the Unicode character is a space character; forwards to u_isspace().
01276 inline UBool
01277 Unicode::isSpaceChar(UChar32 ch) {
01278     return u_isspace(ch);
01279 }
01280 
01281 // Returns whether the specified character is white space according to ICU; forwards to u_isWhitespace().
01282 inline UBool
01283 Unicode::isWhitespace(UChar32 ch) {
01284     return u_isWhitespace(ch);
01285 }
01286 
01287 // Gets the Unicode character's type as reported by u_charType() (presumably one of the EUnicodeGeneralTypes values - confirm).
01288 inline int8_t
01289 Unicode::getType(UChar32 ch) {
01290     return u_charType(ch);
01291 }
01292 
01293 inline uint8_t
01294 Unicode::getCombiningClass(UChar32 c) {
01295     return u_getCombiningClass(c); // forwards to the C API u_getCombiningClass()
01296 }
01297 
01298 // Gets the character's linguistic directionality: the u_charDirection() result is cast to EDirectionProperty.
01299 inline Unicode::EDirectionProperty
01300 Unicode::characterDirection(UChar32 ch) {
01301     return (EDirectionProperty)u_charDirection(ch); // plain cast: assumes the C API values match EDirectionProperty - TODO confirm
01302 }
01303 
01304 // Returns whether the character has the "mirrored" property; forwards to u_isMirrored().
01305 inline UBool
01306 Unicode::isMirrored(UChar32 ch) {
01307     return u_isMirrored(ch);
01308 }
01309 
01310 // Maps the character to a "mirror-image" character, or to itself; forwards to u_charMirror().
01311 inline UChar32
01312 Unicode::charMirror(UChar32 ch) {
01313     return u_charMirror(ch);
01314 }
01315 
01316 // Gets the script associated with the character: the u_charScript() result is cast to EUnicodeScript.
01317 inline Unicode::EUnicodeScript
01318 Unicode::getScript(UChar32 ch) {
01319     return (EUnicodeScript) u_charScript(ch); // plain cast: relies on EUnicodeScript staying in sync with the C API values
01320 }
01321 
01322 // Gets the table cell width of the Unicode character from u_charCellWidth() (presumably one of the ECellWidths values - confirm).
01323 inline uint16_t
01324 Unicode::getCellWidth(UChar32 ch) {
01325     return u_charCellWidth(ch);
01326 }
01327 
01328 inline int32_t
01329 Unicode::getCharName(uint32_t code,
01330                      char *buffer, int32_t bufferLength,
01331                      UCharNameChoice nameChoice) {
01332     UErrorCode errorCode=U_ZERO_ERROR; // local status for u_charName(); it is not exposed to the caller
01333     int32_t length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); // note the argument order: nameChoice comes before buffer here
01334     return U_SUCCESS(errorCode) ? length : 0; // the u_charName() length on success, 0 when errorCode signals failure
01335 }
01336 
01337 inline int32_t            
01338 Unicode::digitValue(UChar32 ch) {
01339     return u_charDigitValue(ch); // forwards to the C API u_charDigitValue()
01340 }
01341 
01342 inline int32_t
01343 Unicode::digit(UChar32 ch, int8_t radix) {
01344     return u_digit(ch, radix); // forwards to the C API u_digit(); radix is presumably expected within MIN_RADIX..MAX_RADIX - confirm
01345 }
01346 
01347 inline UChar32
01348 Unicode::forDigit(int32_t digit, int8_t radix) {
01349     return u_forDigit(digit, radix); // forwards to the C API u_forDigit(); radix is presumably expected within MIN_RADIX..MAX_RADIX - confirm
01350 }
01351 
01352 inline void
01353 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01354     u_getUnicodeVersion(versionArray); // hands versionArray to the C API u_getUnicodeVersion(); nothing is returned from this wrapper
01355 }
01356 U_NAMESPACE_END
01357 #else
01358 
01359 #error "The unicode/unicode.h header is obsolete. Please use the Unicode C API in unicode/uchar.h instead."
01360 
01361 #endif /* ICU_UNICODE_CLASS_USE_DEPRECATES */
01362 
01363 #endif

Generated on Wed Dec 18 16:49:56 2002 for ICU 2.4 by doxygen 1.2.11.1 written by Dimitri van Heesch, © 1997-2001