00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032
00046 class U_COMMON_API Unicode
00047 {
00048 public:
00049
00050
00051
00052
00053
00054 enum {
00056 MIN_VALUE=0,
00057
00063 MAX_VALUE=0x10ffff,
00064
00072 MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00073
00084 MIN_RADIX=2,
00085
00096 MAX_RADIX=36
00097 };
00098
00103 enum EUnicodeGeneralTypes
00104 {
00105 UNASSIGNED = 0,
00106 UPPERCASE_LETTER = 1,
00107 LOWERCASE_LETTER = 2,
00108 TITLECASE_LETTER = 3,
00109 MODIFIER_LETTER = 4,
00110 OTHER_LETTER = 5,
00111 NON_SPACING_MARK = 6,
00112 ENCLOSING_MARK = 7,
00113 COMBINING_SPACING_MARK = 8,
00114 DECIMAL_DIGIT_NUMBER = 9,
00115 LETTER_NUMBER = 10,
00116 OTHER_NUMBER = 11,
00117 SPACE_SEPARATOR = 12,
00118 LINE_SEPARATOR = 13,
00119 PARAGRAPH_SEPARATOR = 14,
00120 CONTROL = 15,
00121 FORMAT = 16,
00122 PRIVATE_USE = 17,
00123 SURROGATE = 18,
00124 DASH_PUNCTUATION = 19,
00125 START_PUNCTUATION = 20,
00126 END_PUNCTUATION = 21,
00127 CONNECTOR_PUNCTUATION = 22,
00128 OTHER_PUNCTUATION = 23,
00129 MATH_SYMBOL = 24,
00130 CURRENCY_SYMBOL = 25,
00131 MODIFIER_SYMBOL = 26,
00132 OTHER_SYMBOL = 27,
00133 INITIAL_PUNCTUATION = 28,
00134 FINAL_PUNCTUATION = 29,
00135 GENERAL_TYPES_COUNT = 30
00136 };
00137
00138
00142 enum EUnicodeScript
00143 {
00144 kBasicLatin,
00145 kLatin1Supplement,
00146 kLatinExtendedA,
00147 kLatinExtendedB,
00148 kIPAExtension,
00149 kSpacingModifier,
00150 kCombiningDiacritical,
00151 kGreek,
00152 kCyrillic,
00153 kArmenian,
00154 kHebrew,
00155 kArabic,
00156 kSyriac,
00157 kThaana,
00158 kDevanagari,
00159 kBengali,
00160 kGurmukhi,
00161 kGujarati,
00162 kOriya,
00163 kTamil,
00164 kTelugu,
00165 kKannada,
00166 kMalayalam,
00167 kSinhala,
00168 kThai,
00169 kLao,
00170 kTibetan,
00171 kMyanmar,
00172 kGeorgian,
00173 kHangulJamo,
00174 kEthiopic,
00175 kCherokee,
00176 kUnifiedCanadianAboriginalSyllabics,
00177 kogham,
00178 kRunic,
00179 kKhmer,
00180 kMongolian,
00181 kLatinExtendedAdditional,
00182 kGreekExtended,
00183 kGeneralPunctuation,
00184 kSuperSubScript,
00185 kCurrencySymbolScript,
00186 kSymbolCombiningMark,
00187 kLetterlikeSymbol,
00188 kNumberForm,
00189 kArrow,
00190 kMathOperator,
00191 kMiscTechnical,
00192 kControlPicture,
00193 kOpticalCharacter,
00194 kEnclosedAlphanumeric,
00195 kBoxDrawing,
00196 kBlockElement,
00197 kGeometricShape,
00198 kMiscSymbol,
00199 kDingbat,
00200 kBraillePatterns,
00201 kCJKRadicalsSupplement,
00202 kKangxiRadicals,
00203 kIdeographicDescriptionCharacters,
00204 kCJKSymbolPunctuation,
00205 kHiragana,
00206 kKatakana,
00207 kBopomofo,
00208 kHangulCompatibilityJamo,
00209 kKanbun,
00210 kBopomofoExtended,
00211 kEnclosedCJKLetterMonth,
00212 kCJKCompatibility,
00213 kCJKUnifiedIdeographExtensionA,
00214 kCJKUnifiedIdeograph,
00215 kYiSyllables,
00216 kYiRadicals,
00217 kHangulSyllable,
00218 kHighSurrogate,
00219 kHighPrivateUseSurrogate,
00220 kLowSurrogate,
00221 kPrivateUse,
00222 kCJKCompatibilityIdeograph,
00223 kAlphabeticPresentation,
00224 kArabicPresentationA,
00225 kCombiningHalfMark,
00226 kCJKCompatibilityForm,
00227 kSmallFormVariant,
00228 kArabicPresentationB,
00229 kNoScript,
00230 kHalfwidthFullwidthForm,
00231 kScriptCount
00232 };
00233
00237 enum EDirectionProperty {
00238 LEFT_TO_RIGHT = 0,
00239 RIGHT_TO_LEFT = 1,
00240 EUROPEAN_NUMBER = 2,
00241 EUROPEAN_NUMBER_SEPARATOR = 3,
00242 EUROPEAN_NUMBER_TERMINATOR = 4,
00243 ARABIC_NUMBER = 5,
00244 COMMON_NUMBER_SEPARATOR = 6,
00245 BLOCK_SEPARATOR = 7,
00246 SEGMENT_SEPARATOR = 8,
00247 WHITE_SPACE_NEUTRAL = 9,
00248 OTHER_NEUTRAL = 10,
00249 LEFT_TO_RIGHT_EMBEDDING = 11,
00250 LEFT_TO_RIGHT_OVERRIDE = 12,
00251 RIGHT_TO_LEFT_ARABIC = 13,
00252 RIGHT_TO_LEFT_EMBEDDING = 14,
00253 RIGHT_TO_LEFT_OVERRIDE = 15,
00254 POP_DIRECTIONAL_FORMAT = 16,
00255 DIR_NON_SPACING_MARK = 17,
00256 BOUNDARY_NEUTRAL = 18
00257 };
00258
00263 enum ECellWidths
00264 {
00265 ZERO_WIDTH = 0,
00266 HALF_WIDTH = 1,
00267 FULL_WIDTH = 2,
00268 NEUTRAL = 3
00269 };
00270
00280 static inline UBool isSingle(UChar c);
00281
00289 static inline UBool isLead(UChar c);
00290
00298 static inline UBool isTrail(UChar c);
00299
00309 static inline UBool isSurrogate(UChar32 c);
00310
00322 static inline UBool isUnicodeChar(UChar32 c);
00323
00334 static inline UBool isError(UChar32 c);
00335
00344 static inline UBool isValid(UChar32 c);
00345
00356 static inline UBool needMultipleUChar(UChar32 c);
00357
00365 static inline int32_t charLength(UChar32 c);
00366
00379 static inline int32_t arraySize(int32_t size);
00380
00393 static inline UBool isLowerCase(UChar32 ch);
00394
00406 static inline UBool isUpperCase(UChar32 ch);
00407
00419 static inline UBool isTitleCase(UChar32 ch);
00420
00432 static inline UBool isDigit(UChar32 ch);
00433
00449 static inline UBool isDefined(UChar32 ch);
00450
00461 static inline UBool isControl(UChar32 ch);
00462
00473 static inline UBool isPrintable(UChar32 ch);
00474
00486 static inline UBool isBaseForm(UChar32 ch);
00487
00503 static inline UBool isLetter(UChar32 ch);
00504
00525 static inline UBool isJavaIdentifierStart(UChar32 ch);
00526
00555 static inline UBool isJavaIdentifierPart(UChar32 ch);
00556
00571 static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00572
00599 static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00600
00626 static inline UBool isIdentifierIgnorable(UChar32 ch);
00627
00652 static inline UChar32 toLowerCase(UChar32 ch);
00653
00675 static inline UChar32 toUpperCase(UChar32 ch);
00676
00694 static inline UChar32 toTitleCase(UChar32 ch);
00695
00704 static inline UBool isSpaceChar(UChar32 ch);
00705
00734 static inline UBool isWhitespace(UChar32 ch);
00735
00770 static inline int8_t getType(UChar32 ch);
00771
00781 static inline EDirectionProperty characterDirection(UChar32 ch);
00782
00792 static inline UBool isMirrored(UChar32 c);
00793
00809 static inline UChar32 charMirror(UChar32 c);
00810
00816 static inline EUnicodeScript getScript(UChar32 ch);
00817
00869 static inline uint16_t getCellWidth(UChar32 ch);
00870
00898 static inline UTextOffset
00899 getCharName(uint32_t code,
00900 char *buffer, UTextOffset bufferLength,
00901 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00902
00913 static inline int32_t digitValue(UChar32 ch);
00914
00952 static inline int8_t digit(UChar32 ch, int8_t radix);
00953
00981 static inline UChar32 forDigit(int32_t digit, int8_t radix);
00982
00988 static void getUnicodeVersion(UVersionInfo info);
00989
00990 protected:
00991
00992
00993
00994
00995
00996 Unicode();
00997 Unicode(const Unicode &other);
00998 ~Unicode();
00999 const Unicode &operator=(const Unicode &other);
01000 };
01001
01002
01003
01004 inline UBool
01005 Unicode::isSingle(UChar c) {
01006 return UTF_IS_SINGLE(c);
01007 }
01008
01009 inline UBool
01010 Unicode::isLead(UChar c) {
01011 return UTF_IS_LEAD(c);
01012 }
01013
01014 inline UBool
01015 Unicode::isTrail(UChar c) {
01016 return UTF_IS_TRAIL(c);
01017 }
01018
01019 inline UBool
01020 Unicode::isSurrogate(UChar32 c) {
01021 return UTF_IS_SURROGATE(c);
01022 }
01023
01024 inline UBool
01025 Unicode::isUnicodeChar(UChar32 c) {
01026 return UTF_IS_UNICODE_CHAR(c);
01027 }
01028
01029 inline UBool
01030 Unicode::isError(UChar32 c) {
01031 return UTF_IS_ERROR(c);
01032 }
01033
01034 inline UBool
01035 Unicode::isValid(UChar32 c) {
01036 return UTF_IS_VALID(c);
01037 }
01038
01039 inline UBool
01040 Unicode::needMultipleUChar(UChar32 c) {
01041 return UTF_NEED_MULTIPLE_UCHAR(c);
01042 }
01043
01044 inline int32_t
01045 Unicode::charLength(UChar32 c) {
01046 return UTF_CHAR_LENGTH(c);
01047 }
01048
01049 inline int32_t
01050 Unicode::arraySize(int32_t size) {
01051 return UTF_ARRAY_SIZE(size);
01052 }
01053
01054
01055 inline UBool
01056 Unicode::isLowerCase(UChar32 ch) {
01057 return u_islower(ch);
01058 }
01059
01060
01061 inline UBool
01062 Unicode::isUpperCase(UChar32 ch) {
01063 return u_isupper(ch);
01064 }
01065
01066
01067 inline UBool
01068 Unicode::isTitleCase(UChar32 ch) {
01069 return u_istitle(ch);
01070 }
01071
01072
01073 inline UBool
01074 Unicode::isDigit(UChar32 ch) {
01075 return u_isdigit(ch);
01076 }
01077
01078
01079 inline UBool
01080 Unicode::isDefined(UChar32 ch) {
01081 return u_isdefined(ch);
01082 }
01083
01084
01085 inline UBool
01086 Unicode::isControl(UChar32 ch) {
01087 return u_iscntrl(ch);
01088 }
01089
01090
01091 inline UBool
01092 Unicode::isPrintable(UChar32 ch) {
01093 return u_isprint(ch);
01094 }
01095
01096
01097 inline UBool
01098 Unicode::isBaseForm(UChar32 ch) {
01099 return u_isbase(ch);
01100 }
01101
01102
01103 inline UBool
01104 Unicode::isLetter(UChar32 ch) {
01105 return u_isalpha(ch);
01106 }
01107
01108
01109 inline UBool
01110 Unicode::isJavaIdentifierStart(UChar32 ch) {
01111 return u_isJavaIDStart(ch);
01112 }
01113
01114
01115
01116 inline UBool
01117 Unicode::isJavaIdentifierPart(UChar32 ch) {
01118 return u_isJavaIDPart(ch);
01119 }
01120
01121
01122 inline UBool
01123 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01124 return u_isIDStart(ch);
01125 }
01126
01127
01128
01129 inline UBool
01130 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01131 return u_isIDPart(ch);
01132 }
01133
01134
01135 inline UBool
01136 Unicode::isIdentifierIgnorable(UChar32 ch) {
01137 return u_isIDIgnorable(ch);
01138 }
01139
01140
01141 inline UChar32
01142 Unicode::toLowerCase(UChar32 ch) {
01143 return u_tolower(ch);
01144 }
01145
01146
01147 inline UChar32
01148 Unicode::toUpperCase(UChar32 ch) {
01149 return u_toupper(ch);
01150 }
01151
01152
01153 inline UChar32
01154 Unicode::toTitleCase(UChar32 ch) {
01155 return u_totitle(ch);
01156 }
01157
01158
01159 inline UBool
01160 Unicode::isSpaceChar(UChar32 ch) {
01161 return u_isspace(ch);
01162 }
01163
01164
01165 inline UBool
01166 Unicode::isWhitespace(UChar32 ch) {
01167 return u_isWhitespace(ch);
01168 }
01169
01170
01171 inline int8_t
01172 Unicode::getType(UChar32 ch) {
01173 return u_charType(ch);
01174 }
01175
01176
01177 inline Unicode::EDirectionProperty
01178 Unicode::characterDirection(UChar32 ch) {
01179 return (EDirectionProperty)u_charDirection(ch);
01180 }
01181
01182
01183 inline UBool
01184 Unicode::isMirrored(UChar32 ch) {
01185 return u_isMirrored(ch);
01186 }
01187
01188
01189 inline UChar32
01190 Unicode::charMirror(UChar32 ch) {
01191 return u_charMirror(ch);
01192 }
01193
01194
01195 inline Unicode::EUnicodeScript
01196 Unicode::getScript(UChar32 ch) {
01197 return (EUnicodeScript) u_charScript(ch);
01198 }
01199
01200
01201 inline uint16_t
01202 Unicode::getCellWidth(UChar32 ch) {
01203 return u_charCellWidth(ch);
01204 }
01205
01206 inline UTextOffset
01207 Unicode::getCharName(uint32_t code,
01208 char *buffer, UTextOffset bufferLength,
01209 UCharNameChoice nameChoice) {
01210 UErrorCode errorCode=U_ZERO_ERROR;
01211 UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01212 return U_SUCCESS(errorCode) ? length : 0;
01213 }
01214
01215 inline int32_t
01216 Unicode::digitValue(UChar32 ch) {
01217 return u_charDigitValue(ch);
01218 }
01219
01220 inline int8_t
01221 Unicode::digit(UChar32 ch, int8_t radix) {
01222
01223 int8_t value;
01224 if((uint8_t)(radix-MIN_RADIX)<=(MAX_RADIX-MIN_RADIX)) {
01225 value=(int8_t)u_charDigitValue(ch);
01226 if(value<0) {
01227
01228 if ((uint32_t)(ch-0x41)<26) {
01229 value=(int8_t)(ch-(0x41-10));
01230 } else if ((uint32_t)(ch-0x61)<26) {
01231 value=(int8_t)(ch-(0x61-10));
01232 } else {
01233 return -1;
01234 }
01235 }
01236 } else {
01237 return -1;
01238 }
01239 return (uint8_t)((value<radix) ? value : (uint8_t)(-1));
01240 }
01241
01242 inline UChar32
01243 Unicode::forDigit(int32_t digit, int8_t radix) {
01244
01245 if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX) || (uint32_t)digit>=(uint32_t)radix) {
01246 return 0;
01247 } else if(digit<10) {
01248 return (UChar32)(0x30+digit);
01249 } else {
01250 return (UChar32)((0x61-10)+digit);
01251 }
01252 }
01253
01254 inline void
01255 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01256 u_getUnicodeVersion(versionArray);
01257 }
01258
01259 #endif