00001 /* 00002 ***************************************************************************************** 00003 * Copyright (C) 1996-1999, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ***************************************************************************************** 00006 */ 00007 // FILE NAME : unicode.h 00008 // 00009 // CREATED 00010 // Wednesday, December 11, 1996 00011 // 00012 // CREATED BY 00013 // Helena Shih 00014 // 00015 // CHANGES 00016 // Thursday, April 15, 1999 00017 // Modified the definitions of all the functions 00018 // C++ Wrappers for Unicode 00019 // CHANGES BY 00020 // Madhu Katragadda 00021 // 5/20/99 Madhu Added the function getVersion() 00022 // 11/22/99 aliu Added MIN_RADIX, MAX_RADIX, digit, forDigit 00023 //******************************************************************************************** 00024 00025 00026 00027 #ifndef UNICODE_H 00028 #define UNICODE_H 00029 00030 #include "unicode/utypes.h" 00031 #include "unicode/uchar.h" 00032 00046 class U_COMMON_API Unicode 00047 { 00048 public: 00049 /* 00050 * In C++, static const members actually take up memory and need to be accessed. 00051 * enum values are more like C #define's. 00052 * The following is a collection of constants, not an enumeration type. 00053 */ 00054 enum { 00056 MIN_VALUE=0, 00057 00063 MAX_VALUE=0x10ffff, 00064 00072 MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH, 00073 00084 MIN_RADIX=2, 00085 00096 MAX_RADIX=36 00097 }; 00098 00103 enum EUnicodeGeneralTypes 00104 { 00105 UNASSIGNED = 0, 00106 UPPERCASE_LETTER = 1, 00107 LOWERCASE_LETTER = 2, 00108 TITLECASE_LETTER = 3, 00109 MODIFIER_LETTER = 4, 00110 OTHER_LETTER = 5, 00111 NON_SPACING_MARK = 6, 00112 ENCLOSING_MARK = 7, 00113 COMBINING_SPACING_MARK = 8, 00114 DECIMAL_DIGIT_NUMBER = 9, 00115 LETTER_NUMBER = 10, 00116 OTHER_NUMBER = 11, 00117 SPACE_SEPARATOR = 12, 00118 LINE_SEPARATOR = 13, 00119 PARAGRAPH_SEPARATOR = 14, 00120 CONTROL = 15, 00121 FORMAT = 16, 00122 PRIVATE_USE = 17, 00123 SURROGATE = 18, 00124 DASH_PUNCTUATION = 19, 00125 START_PUNCTUATION = 20, 00126 END_PUNCTUATION = 21, 00127 CONNECTOR_PUNCTUATION = 22, 00128 OTHER_PUNCTUATION = 23, 00129 MATH_SYMBOL = 24, 00130 CURRENCY_SYMBOL = 25, 00131 MODIFIER_SYMBOL = 26, 00132 OTHER_SYMBOL = 27, 00133 INITIAL_PUNCTUATION = 28, 00134 FINAL_PUNCTUATION = 29, 00135 GENERAL_TYPES_COUNT = 30 00136 }; 00137 00138 enum EUnicodeScript 00139 { 00140 kBasicLatin, 00141 kLatin1Supplement, 00142 kLatinExtendedA, 00143 kLatinExtendedB, 00144 kIPAExtension, 00145 kSpacingModifier, 00146 kCombiningDiacritical, 00147 kGreek, 00148 kCyrillic, 00149 kArmenian, 00150 kHebrew, 00151 kArabic, 00152 kDevanagari, 00153 kBengali, 00154 kGurmukhi, 00155 kGujarati, 00156 kOriya, 00157 kTamil, 00158 kTelugu, 00159 kKannada, 00160 kMalayalam, 00161 kThai, 00162 kLao, 00163 kTibetan, 00164 kGeorgian, 00165 kHangulJamo, 00166 kLatinExtendedAdditional, 00167 kGreekExtended, 00168 kGeneralPunctuation, 00169 kSuperSubScript, 00170 kCurrencySymbolScript, 00171 kSymbolCombiningMark, 00172 kLetterlikeSymbol, 00173 kNumberForm, 00174 kArrow, 00175 kMathOperator, 00176 kMiscTechnical, 00177 kControlPicture, 00178 kOpticalCharacter, 00179 kEnclosedAlphanumeric, 00180 kBoxDrawing, 00181 kBlockElement, 00182 kGeometricShape, 00183 kMiscSymbol, 00184 kDingbat, 00185 kCJKSymbolPunctuation, 00186 kHiragana, 00187 kKatakana, 00188 kBopomofo, 00189 kHangulCompatibilityJamo, 00190 kKanbun, 00191 kEnclosedCJKLetterMonth, 00192 kCJKCompatibility, 00193 kCJKUnifiedIdeograph, 00194 kHangulSyllable, 00195 kHighSurrogate, 00196 kHighPrivateUseSurrogate, 00197 kLowSurrogate, 00198 kPrivateUse, 00199 kCJKCompatibilityIdeograph, 00200 kAlphabeticPresentation, 00201 kArabicPresentationA, 00202 kCombiningHalfMark, 00203 kCJKCompatibilityForm, 00204 kSmallFormVariant, 00205 kArabicPresentationB, 00206 kNoScript, 00207 kHalfwidthFullwidthForm, 00208 kScriptCount 00209 }; 00210 00214 enum EDirectionProperty { 00215 LEFT_TO_RIGHT = 0, 00216 RIGHT_TO_LEFT = 1, 00217 EUROPEAN_NUMBER = 2, 00218 EUROPEAN_NUMBER_SEPARATOR = 3, 00219 EUROPEAN_NUMBER_TERMINATOR = 4, 00220 ARABIC_NUMBER = 5, 00221 COMMON_NUMBER_SEPARATOR = 6, 00222 BLOCK_SEPARATOR = 7, 00223 SEGMENT_SEPARATOR = 8, 00224 WHITE_SPACE_NEUTRAL = 9, 00225 OTHER_NEUTRAL = 10, 00226 LEFT_TO_RIGHT_EMBEDDING = 11, 00227 LEFT_TO_RIGHT_OVERRIDE = 12, 00228 RIGHT_TO_LEFT_ARABIC = 13, 00229 RIGHT_TO_LEFT_EMBEDDING = 14, 00230 RIGHT_TO_LEFT_OVERRIDE = 15, 00231 POP_DIRECTIONAL_FORMAT = 16, 00232 DIR_NON_SPACING_MARK = 17, 00233 BOUNDARY_NEUTRAL = 18 00234 }; 00235 00240 enum ECellWidths 00241 { 00242 ZERO_WIDTH = 0, 00243 HALF_WIDTH = 1, 00244 FULL_WIDTH = 2, 00245 NEUTRAL = 3 00246 }; 00247 00257 static inline UBool isSingle(UChar c); 00258 00266 static inline UBool isLead(UChar c); 00267 00275 static inline UBool isTrail(UChar c); 00276 00286 static inline UBool isSurrogate(UChar32 c); 00287 00299 static inline UBool isUnicodeChar(UChar32 c); 00300 00311 static inline UBool isError(UChar32 c); 00312 00321 static inline UBool isValid(UChar32 c); 00322 00333 static inline UBool needMultipleUChar(UChar32 c); 00334 00342 static inline int32_t charLength(UChar32 c); 00343 00356 static inline int32_t arraySize(int32_t size); 00357 00370 static inline UBool isLowerCase(UChar32 ch); 00371 00383 static inline UBool isUpperCase(UChar32 ch); 00384 00396 static inline UBool isTitleCase(UChar32 ch); 00397 00409 static inline UBool isDigit(UChar32 ch); 00410 00426 static inline UBool isDefined(UChar32 ch); 00427 00438 static inline UBool isControl(UChar32 ch); 00439 00450 static inline UBool isPrintable(UChar32 ch); 00451 00463 static inline UBool isBaseForm(UChar32 ch); 00464 00480 static inline UBool isLetter(UChar32 ch); 00481 00502 static inline UBool isJavaIdentifierStart(UChar32 ch); 00503 00532 static inline UBool isJavaIdentifierPart(UChar32 ch); 00533 00548 static inline UBool isUnicodeIdentifierStart(UChar32 ch); 00549 00576 static inline UBool isUnicodeIdentifierPart(UChar32 ch); 00577 00603 static inline UBool isIdentifierIgnorable(UChar32 ch); 00604 00629 static inline UChar32 toLowerCase(UChar32 ch); 00630 00652 static inline UChar32 toUpperCase(UChar32 ch); 00653 00671 static inline UChar32 toTitleCase(UChar32 ch); 00672 00681 static inline UBool isSpaceChar(UChar32 ch); 00682 00711 static inline UBool isWhitespace(UChar32 ch); 00712 00747 static inline int8_t getType(UChar32 ch); 00748 00758 static inline EDirectionProperty characterDirection(UChar32 ch); 00759 00769 static inline UBool isMirrored(UChar32 c); 00770 00786 static inline UChar32 charMirror(UChar32 c); 00787 00793 static inline EUnicodeScript getScript(UChar32 ch); 00794 00846 static inline uint16_t getCellWidth(UChar32 ch); 00847 00875 static inline UTextOffset 00876 getCharName(uint32_t code, 00877 char *buffer, UTextOffset bufferLength, 00878 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME); 00879 00890 static inline int32_t digitValue(UChar32 ch); 00891 00929 static inline int8_t digit(UChar32 ch, int8_t radix); 00930 00958 static inline UChar32 forDigit(int32_t digit, int8_t radix); 00959 00965 static void getUnicodeVersion(UVersionInfo info); 00966 00967 protected: 00968 // These constructors, destructor, and assignment operator must 00969 // be protected (not private, as they semantically are) to make 00970 // various UNIX compilers happy. [LIU] 00971 // They should be private to prevent anyone from instantiating or 00972 // subclassing Unicode. 00973 Unicode(); 00974 Unicode(const Unicode &other); 00975 ~Unicode(); 00976 const Unicode &operator=(const Unicode &other); 00977 }; 00978 00979 /* inline implementations --------------------------------------------------- */ 00980 00981 inline UBool 00982 Unicode::isSingle(UChar c) { 00983 return UTF_IS_SINGLE(c); 00984 } 00985 00986 inline UBool 00987 Unicode::isLead(UChar c) { 00988 return UTF_IS_LEAD(c); 00989 } 00990 00991 inline UBool 00992 Unicode::isTrail(UChar c) { 00993 return UTF_IS_TRAIL(c); 00994 } 00995 00996 inline UBool 00997 Unicode::isSurrogate(UChar32 c) { 00998 return UTF_IS_SURROGATE(c); 00999 } 01000 01001 inline UBool 01002 Unicode::isUnicodeChar(UChar32 c) { 01003 return UTF_IS_UNICODE_CHAR(c); 01004 } 01005 01006 inline UBool 01007 Unicode::isError(UChar32 c) { 01008 return UTF_IS_ERROR(c); 01009 } 01010 01011 inline UBool 01012 Unicode::isValid(UChar32 c) { 01013 return UTF_IS_VALID(c); 01014 } 01015 01016 inline UBool 01017 Unicode::needMultipleUChar(UChar32 c) { 01018 return UTF_NEED_MULTIPLE_UCHAR(c); 01019 } 01020 01021 inline int32_t 01022 Unicode::charLength(UChar32 c) { 01023 return UTF_CHAR_LENGTH(c); 01024 } 01025 01026 inline int32_t 01027 Unicode::arraySize(int32_t size) { 01028 return UTF_ARRAY_SIZE(size); 01029 } 01030 01031 // Checks if ch is a lower case letter. 01032 inline UBool 01033 Unicode::isLowerCase(UChar32 ch) { 01034 return u_islower(ch); 01035 } 01036 01037 // Checks if ch is a upper case letter. 01038 inline UBool 01039 Unicode::isUpperCase(UChar32 ch) { 01040 return u_isupper(ch); 01041 } 01042 01043 // Checks if ch is a title case letter; usually upper case letters. 01044 inline UBool 01045 Unicode::isTitleCase(UChar32 ch) { 01046 return u_istitle(ch); 01047 } 01048 01049 // Checks if ch is a decimal digit. 01050 inline UBool 01051 Unicode::isDigit(UChar32 ch) { 01052 return u_isdigit(ch); 01053 } 01054 01055 // Checks if ch is a unicode character with assigned character type. 01056 inline UBool 01057 Unicode::isDefined(UChar32 ch) { 01058 return u_isdefined(ch); 01059 } 01060 01061 // Checks if the Unicode character is a control character. 01062 inline UBool 01063 Unicode::isControl(UChar32 ch) { 01064 return u_iscntrl(ch); 01065 } 01066 01067 // Checks if the Unicode character is printable. 01068 inline UBool 01069 Unicode::isPrintable(UChar32 ch) { 01070 return u_isprint(ch); 01071 } 01072 01073 // Checks if the Unicode character is a base form character that can take a diacritic. 01074 inline UBool 01075 Unicode::isBaseForm(UChar32 ch) { 01076 return u_isbase(ch); 01077 } 01078 01079 // Checks if the Unicode character is a letter. 01080 inline UBool 01081 Unicode::isLetter(UChar32 ch) { 01082 return u_isalpha(ch); 01083 } 01084 01085 // Checks if the Unicode character can start a Java identifier. 01086 inline UBool 01087 Unicode::isJavaIdentifierStart(UChar32 ch) { 01088 return u_isJavaIDStart(ch); 01089 } 01090 01091 // Checks if the Unicode character can be a Java identifier part other than starting the 01092 // identifier. 01093 inline UBool 01094 Unicode::isJavaIdentifierPart(UChar32 ch) { 01095 return u_isJavaIDPart(ch); 01096 } 01097 01098 // Checks if the Unicode character can start a Unicode identifier. 01099 inline UBool 01100 Unicode::isUnicodeIdentifierStart(UChar32 ch) { 01101 return u_isIDStart(ch); 01102 } 01103 01104 // Checks if the Unicode character can be a Unicode identifier part other than starting the 01105 // identifier. 01106 inline UBool 01107 Unicode::isUnicodeIdentifierPart(UChar32 ch) { 01108 return u_isIDPart(ch); 01109 } 01110 01111 // Checks if the Unicode character can be ignorable in a Java or Unicode identifier. 01112 inline UBool 01113 Unicode::isIdentifierIgnorable(UChar32 ch) { 01114 return u_isIDIgnorable(ch); 01115 } 01116 01117 // Transforms the Unicode character to its lower case equivalent. 01118 inline UChar32 01119 Unicode::toLowerCase(UChar32 ch) { 01120 return u_tolower(ch); 01121 } 01122 01123 // Transforms the Unicode character to its upper case equivalent. 01124 inline UChar32 01125 Unicode::toUpperCase(UChar32 ch) { 01126 return u_toupper(ch); 01127 } 01128 01129 // Transforms the Unicode character to its title case equivalent. 01130 inline UChar32 01131 Unicode::toTitleCase(UChar32 ch) { 01132 return u_totitle(ch); 01133 } 01134 01135 // Checks if the Unicode character is a space character. 01136 inline UBool 01137 Unicode::isSpaceChar(UChar32 ch) { 01138 return u_isspace(ch); 01139 } 01140 01141 // Determines if the specified character is white space according to ICU. 01142 inline UBool 01143 Unicode::isWhitespace(UChar32 ch) { 01144 return u_isWhitespace(ch); 01145 } 01146 01147 // Gets if the Unicode character's character property. 01148 inline int8_t 01149 Unicode::getType(UChar32 ch) { 01150 return u_charType(ch); 01151 } 01152 01153 // Gets the character's linguistic directionality. 01154 inline Unicode::EDirectionProperty 01155 Unicode::characterDirection(UChar32 ch) { 01156 return (EDirectionProperty)u_charDirection(ch); 01157 } 01158 01159 // Determines if the character has the "mirrored" property. 01160 inline UBool 01161 Unicode::isMirrored(UChar32 ch) { 01162 return u_isMirrored(ch); 01163 } 01164 01165 // Maps the character to a "mirror-image" character, or to itself. 01166 inline UChar32 01167 Unicode::charMirror(UChar32 ch) { 01168 return u_charMirror(ch); 01169 } 01170 01171 // Get the script associated with the character 01172 inline Unicode::EUnicodeScript 01173 Unicode::getScript(UChar32 ch) { 01174 return (EUnicodeScript) u_charScript(ch); 01175 } 01176 01177 // Gets table cell width of the Unicode character. 01178 inline uint16_t 01179 Unicode::getCellWidth(UChar32 ch) { 01180 return u_charCellWidth(ch); 01181 } 01182 01183 inline UTextOffset 01184 Unicode::getCharName(uint32_t code, 01185 char *buffer, UTextOffset bufferLength, 01186 UCharNameChoice nameChoice) { 01187 UErrorCode errorCode=U_ZERO_ERROR; 01188 UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode); 01189 return U_SUCCESS(errorCode) ? length : 0; 01190 } 01191 01192 inline int32_t 01193 Unicode::digitValue(UChar32 ch) { 01194 return u_charDigitValue(ch); 01195 } 01196 01197 inline int8_t 01198 Unicode::digit(UChar32 ch, int8_t radix) { 01199 // ### TODO this should probably move to a C u_charDigitValueEx(ch, radix) and be called here 01200 int8_t value; 01201 if((uint8_t)(radix-MIN_RADIX)<=(MAX_RADIX-MIN_RADIX)) { 01202 value=(int8_t)u_charDigitValue(ch); 01203 if(value<0) { 01204 // ch is not a decimal digit, try latin letters 01205 if ((uint32_t)(ch-0x41)<26) { 01206 value=(int8_t)(ch-(0x41-10)); // A-Z, subtract A 01207 } else if ((uint32_t)(ch-0x61)<26) { 01208 value=(int8_t)(ch-(0x61-10)); // a-z, subtract a 01209 } else { 01210 return -1; // ch is not a digit character 01211 } 01212 } 01213 } else { 01214 return -1; // invalid radix 01215 } 01216 return (value<radix) ? value : -1; 01217 } 01218 01219 inline UChar32 01220 Unicode::forDigit(int32_t digit, int8_t radix) { 01221 // ### TODO this should probably move to a C u_forDigit(digit, radix) and be called here 01222 if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX) || (uint32_t)digit>=(uint32_t)radix) { 01223 return 0; 01224 } else if(digit<10) { 01225 return (UChar32)(0x30+digit); 01226 } else { 01227 return (UChar32)((0x61-10)+digit); 01228 } 01229 } 01230 01231 inline void 01232 Unicode::getUnicodeVersion(UVersionInfo versionArray) { 01233 u_getUnicodeVersion(versionArray); 01234 } 01235 01236 #endif