00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #ifndef UNICODE_H
00028 #define UNICODE_H
00029
00030 #include "unicode/utypes.h"
00031 #include "unicode/uchar.h"
00032
00046 class U_COMMON_API Unicode
00047 {
00048 public:
00049
00050
00051
00052
00053
00054 enum {
00056 MIN_VALUE=0,
00057
00063 MAX_VALUE=0x10ffff,
00064
00072 MAX_CHAR_LENGTH=UTF_MAX_CHAR_LENGTH,
00073
00084 MIN_RADIX=2,
00085
00096 MAX_RADIX=36
00097 };
00098
00103 enum EUnicodeGeneralTypes
00104 {
00105 UNASSIGNED = 0,
00106 UPPERCASE_LETTER = 1,
00107 LOWERCASE_LETTER = 2,
00108 TITLECASE_LETTER = 3,
00109 MODIFIER_LETTER = 4,
00110 OTHER_LETTER = 5,
00111 NON_SPACING_MARK = 6,
00112 ENCLOSING_MARK = 7,
00113 COMBINING_SPACING_MARK = 8,
00114 DECIMAL_DIGIT_NUMBER = 9,
00115 LETTER_NUMBER = 10,
00116 OTHER_NUMBER = 11,
00117 SPACE_SEPARATOR = 12,
00118 LINE_SEPARATOR = 13,
00119 PARAGRAPH_SEPARATOR = 14,
00120 CONTROL = 15,
00121 FORMAT = 16,
00122 PRIVATE_USE = 17,
00123 SURROGATE = 18,
00124 DASH_PUNCTUATION = 19,
00125 START_PUNCTUATION = 20,
00126 END_PUNCTUATION = 21,
00127 CONNECTOR_PUNCTUATION = 22,
00128 OTHER_PUNCTUATION = 23,
00129 MATH_SYMBOL = 24,
00130 CURRENCY_SYMBOL = 25,
00131 MODIFIER_SYMBOL = 26,
00132 OTHER_SYMBOL = 27,
00133 INITIAL_PUNCTUATION = 28,
00134 FINAL_PUNCTUATION = 29,
00135 GENERAL_TYPES_COUNT = 30
00136 };
00137
00138
00142 enum EUnicodeScript
00143 {
00144 kBasicLatin,
00145 kLatin1Supplement,
00146 kLatinExtendedA,
00147 kLatinExtendedB,
00148 kIPAExtension,
00149 kSpacingModifier,
00150 kCombiningDiacritical,
00151 kGreek,
00152 kCyrillic,
00153 kArmenian,
00154 kHebrew,
00155 kArabic,
00156 kSyriac,
00157 kThaana,
00158 kDevanagari,
00159 kBengali,
00160 kGurmukhi,
00161 kGujarati,
00162 kOriya,
00163 kTamil,
00164 kTelugu,
00165 kKannada,
00166 kMalayalam,
00167 kSinhala,
00168 kThai,
00169 kLao,
00170 kTibetan,
00171 kMyanmar,
00172 kGeorgian,
00173 kHangulJamo,
00174 kEthiopic,
00175 kCherokee,
00176 kUnifiedCanadianAboriginalSyllabics,
00177 kogham,
00178 kRunic,
00179 kKhmer,
00180 kMongolian,
00181 kLatinExtendedAdditional,
00182 kGreekExtended,
00183 kGeneralPunctuation,
00184 kSuperSubScript,
00185 kCurrencySymbolScript,
00186 kSymbolCombiningMark,
00187 kLetterlikeSymbol,
00188 kNumberForm,
00189 kArrow,
00190 kMathOperator,
00191 kMiscTechnical,
00192 kControlPicture,
00193 kOpticalCharacter,
00194 kEnclosedAlphanumeric,
00195 kBoxDrawing,
00196 kBlockElement,
00197 kGeometricShape,
00198 kMiscSymbol,
00199 kDingbat,
00200 kBraillePatterns,
00201 kCJKRadicalsSupplement,
00202 kKangxiRadicals,
00203 kIdeographicDescriptionCharacters,
00204 kCJKSymbolPunctuation,
00205 kHiragana,
00206 kKatakana,
00207 kBopomofo,
00208 kHangulCompatibilityJamo,
00209 kKanbun,
00210 kBopomofoExtended,
00211 kEnclosedCJKLetterMonth,
00212 kCJKCompatibility,
00213 kCJKUnifiedIdeographExtensionA,
00214 kCJKUnifiedIdeograph,
00215 kYiSyllables,
00216 kYiRadicals,
00217 kHangulSyllable,
00218 kHighSurrogate,
00219 kHighPrivateUseSurrogate,
00220 kLowSurrogate,
00221 kPrivateUse,
00222 kCJKCompatibilityIdeograph,
00223 kAlphabeticPresentation,
00224 kArabicPresentationA,
00225 kCombiningHalfMark,
00226 kCJKCompatibilityForm,
00227 kSmallFormVariant,
00228 kArabicPresentationB,
00229 kNoScript,
00230 kHalfwidthFullwidthForm,
00231 kScriptCount
00232 };
00233
00237 enum EDirectionProperty {
00238 LEFT_TO_RIGHT = 0,
00239 RIGHT_TO_LEFT = 1,
00240 EUROPEAN_NUMBER = 2,
00241 EUROPEAN_NUMBER_SEPARATOR = 3,
00242 EUROPEAN_NUMBER_TERMINATOR = 4,
00243 ARABIC_NUMBER = 5,
00244 COMMON_NUMBER_SEPARATOR = 6,
00245 BLOCK_SEPARATOR = 7,
00246 SEGMENT_SEPARATOR = 8,
00247 WHITE_SPACE_NEUTRAL = 9,
00248 OTHER_NEUTRAL = 10,
00249 LEFT_TO_RIGHT_EMBEDDING = 11,
00250 LEFT_TO_RIGHT_OVERRIDE = 12,
00251 RIGHT_TO_LEFT_ARABIC = 13,
00252 RIGHT_TO_LEFT_EMBEDDING = 14,
00253 RIGHT_TO_LEFT_OVERRIDE = 15,
00254 POP_DIRECTIONAL_FORMAT = 16,
00255 DIR_NON_SPACING_MARK = 17,
00256 BOUNDARY_NEUTRAL = 18
00257 };
00258
00263 enum ECellWidths
00264 {
00265 ZERO_WIDTH = 0,
00266 HALF_WIDTH = 1,
00267 FULL_WIDTH = 2,
00268 NEUTRAL = 3
00269 };
00270
00280 static inline UBool isSingle(UChar c);
00281
00289 static inline UBool isLead(UChar c);
00290
00298 static inline UBool isTrail(UChar c);
00299
00309 static inline UBool isSurrogate(UChar32 c);
00310
00322 static inline UBool isUnicodeChar(UChar32 c);
00323
00334 static inline UBool isError(UChar32 c);
00335
00344 static inline UBool isValid(UChar32 c);
00345
00356 static inline UBool needMultipleUChar(UChar32 c);
00357
00365 static inline int32_t charLength(UChar32 c);
00366
00379 static inline int32_t arraySize(int32_t size);
00380
00393 static inline UBool isLowerCase(UChar32 ch);
00394
00406 static inline UBool isUpperCase(UChar32 ch);
00407
00419 static inline UBool isTitleCase(UChar32 ch);
00420
00432 static inline UBool isDigit(UChar32 ch);
00433
00449 static inline UBool isDefined(UChar32 ch);
00450
00461 static inline UBool isControl(UChar32 ch);
00462
00473 static inline UBool isPrintable(UChar32 ch);
00474
00486 static inline UBool isBaseForm(UChar32 ch);
00487
00503 static inline UBool isLetter(UChar32 ch);
00504
00525 static inline UBool isJavaIdentifierStart(UChar32 ch);
00526
00555 static inline UBool isJavaIdentifierPart(UChar32 ch);
00556
00571 static inline UBool isUnicodeIdentifierStart(UChar32 ch);
00572
00599 static inline UBool isUnicodeIdentifierPart(UChar32 ch);
00600
00626 static inline UBool isIdentifierIgnorable(UChar32 ch);
00627
00652 static inline UChar32 toLowerCase(UChar32 ch);
00653
00675 static inline UChar32 toUpperCase(UChar32 ch);
00676
00694 static inline UChar32 toTitleCase(UChar32 ch);
00695
00709 static inline UChar32
00710 foldCase(UChar32 c, uint32_t options);
00711
00720 static inline UBool isSpaceChar(UChar32 ch);
00721
00750 static inline UBool isWhitespace(UChar32 ch);
00751
00786 static inline int8_t getType(UChar32 ch);
00787
00795 static inline uint8_t getCombiningClass(UChar32 c);
00796
00806 static inline EDirectionProperty characterDirection(UChar32 ch);
00807
00817 static inline UBool isMirrored(UChar32 c);
00818
00834 static inline UChar32 charMirror(UChar32 c);
00835
00841 static inline EUnicodeScript getScript(UChar32 ch);
00842
00894 static inline uint16_t getCellWidth(UChar32 ch);
00895
00923 static inline UTextOffset
00924 getCharName(uint32_t code,
00925 char *buffer, UTextOffset bufferLength,
00926 UCharNameChoice nameChoice=U_UNICODE_CHAR_NAME);
00927
00938 static inline int32_t digitValue(UChar32 ch);
00939
00977 static inline int8_t digit(UChar32 ch, int8_t radix);
00978
01006 static inline UChar32 forDigit(int32_t digit, int8_t radix);
01007
01013 static void getUnicodeVersion(UVersionInfo info);
01014
01015 protected:
01016
01017
01018
01019
01020
01021 Unicode();
01022 Unicode(const Unicode &other);
01023 ~Unicode();
01024 const Unicode &operator=(const Unicode &other);
01025 };
01026
01027
01028
01029 inline UBool
01030 Unicode::isSingle(UChar c) {
01031 return UTF_IS_SINGLE(c);
01032 }
01033
01034 inline UBool
01035 Unicode::isLead(UChar c) {
01036 return UTF_IS_LEAD(c);
01037 }
01038
01039 inline UBool
01040 Unicode::isTrail(UChar c) {
01041 return UTF_IS_TRAIL(c);
01042 }
01043
01044 inline UBool
01045 Unicode::isSurrogate(UChar32 c) {
01046 return UTF_IS_SURROGATE(c);
01047 }
01048
01049 inline UBool
01050 Unicode::isUnicodeChar(UChar32 c) {
01051 return UTF_IS_UNICODE_CHAR(c);
01052 }
01053
01054 inline UBool
01055 Unicode::isError(UChar32 c) {
01056 return UTF_IS_ERROR(c);
01057 }
01058
01059 inline UBool
01060 Unicode::isValid(UChar32 c) {
01061 return UTF_IS_VALID(c);
01062 }
01063
01064 inline UBool
01065 Unicode::needMultipleUChar(UChar32 c) {
01066 return UTF_NEED_MULTIPLE_UCHAR(c);
01067 }
01068
01069 inline int32_t
01070 Unicode::charLength(UChar32 c) {
01071 return UTF_CHAR_LENGTH(c);
01072 }
01073
01074 inline int32_t
01075 Unicode::arraySize(int32_t size) {
01076 return UTF_ARRAY_SIZE(size);
01077 }
01078
01079
01080 inline UBool
01081 Unicode::isLowerCase(UChar32 ch) {
01082 return u_islower(ch);
01083 }
01084
01085
01086 inline UBool
01087 Unicode::isUpperCase(UChar32 ch) {
01088 return u_isupper(ch);
01089 }
01090
01091
01092 inline UBool
01093 Unicode::isTitleCase(UChar32 ch) {
01094 return u_istitle(ch);
01095 }
01096
01097
01098 inline UBool
01099 Unicode::isDigit(UChar32 ch) {
01100 return u_isdigit(ch);
01101 }
01102
01103
01104 inline UBool
01105 Unicode::isDefined(UChar32 ch) {
01106 return u_isdefined(ch);
01107 }
01108
01109
01110 inline UBool
01111 Unicode::isControl(UChar32 ch) {
01112 return u_iscntrl(ch);
01113 }
01114
01115
01116 inline UBool
01117 Unicode::isPrintable(UChar32 ch) {
01118 return u_isprint(ch);
01119 }
01120
01121
01122 inline UBool
01123 Unicode::isBaseForm(UChar32 ch) {
01124 return u_isbase(ch);
01125 }
01126
01127
01128 inline UBool
01129 Unicode::isLetter(UChar32 ch) {
01130 return u_isalpha(ch);
01131 }
01132
01133
01134 inline UBool
01135 Unicode::isJavaIdentifierStart(UChar32 ch) {
01136 return u_isJavaIDStart(ch);
01137 }
01138
01139
01140
01141 inline UBool
01142 Unicode::isJavaIdentifierPart(UChar32 ch) {
01143 return u_isJavaIDPart(ch);
01144 }
01145
01146
01147 inline UBool
01148 Unicode::isUnicodeIdentifierStart(UChar32 ch) {
01149 return u_isIDStart(ch);
01150 }
01151
01152
01153
01154 inline UBool
01155 Unicode::isUnicodeIdentifierPart(UChar32 ch) {
01156 return u_isIDPart(ch);
01157 }
01158
01159
01160 inline UBool
01161 Unicode::isIdentifierIgnorable(UChar32 ch) {
01162 return u_isIDIgnorable(ch);
01163 }
01164
01165
01166 inline UChar32
01167 Unicode::toLowerCase(UChar32 ch) {
01168 return u_tolower(ch);
01169 }
01170
01171
01172 inline UChar32
01173 Unicode::toUpperCase(UChar32 ch) {
01174 return u_toupper(ch);
01175 }
01176
01177
01178 inline UChar32
01179 Unicode::toTitleCase(UChar32 ch) {
01180 return u_totitle(ch);
01181 }
01182
01183
01184 inline UChar32
01185 Unicode::foldCase(UChar32 ch, uint32_t options) {
01186 return u_foldCase(ch, options);
01187 }
01188
01189
01190 inline UBool
01191 Unicode::isSpaceChar(UChar32 ch) {
01192 return u_isspace(ch);
01193 }
01194
01195
01196 inline UBool
01197 Unicode::isWhitespace(UChar32 ch) {
01198 return u_isWhitespace(ch);
01199 }
01200
01201
01202 inline int8_t
01203 Unicode::getType(UChar32 ch) {
01204 return u_charType(ch);
01205 }
01206
01207 inline uint8_t
01208 Unicode::getCombiningClass(UChar32 c) {
01209 return u_getCombiningClass(c);
01210 }
01211
01212
01213 inline Unicode::EDirectionProperty
01214 Unicode::characterDirection(UChar32 ch) {
01215 return (EDirectionProperty)u_charDirection(ch);
01216 }
01217
01218
01219 inline UBool
01220 Unicode::isMirrored(UChar32 ch) {
01221 return u_isMirrored(ch);
01222 }
01223
01224
01225 inline UChar32
01226 Unicode::charMirror(UChar32 ch) {
01227 return u_charMirror(ch);
01228 }
01229
01230
01231 inline Unicode::EUnicodeScript
01232 Unicode::getScript(UChar32 ch) {
01233 return (EUnicodeScript) u_charScript(ch);
01234 }
01235
01236
01237 inline uint16_t
01238 Unicode::getCellWidth(UChar32 ch) {
01239 return u_charCellWidth(ch);
01240 }
01241
01242 inline UTextOffset
01243 Unicode::getCharName(uint32_t code,
01244 char *buffer, UTextOffset bufferLength,
01245 UCharNameChoice nameChoice) {
01246 UErrorCode errorCode=U_ZERO_ERROR;
01247 UTextOffset length=u_charName(code, nameChoice, buffer, bufferLength, &errorCode);
01248 return U_SUCCESS(errorCode) ? length : 0;
01249 }
01250
01251 inline int32_t
01252 Unicode::digitValue(UChar32 ch) {
01253 return u_charDigitValue(ch);
01254 }
01255
01256 inline int8_t
01257 Unicode::digit(UChar32 ch, int8_t radix) {
01258
01259 int8_t value;
01260 if ((uint8_t)(radix-MIN_RADIX) <= (MAX_RADIX-MIN_RADIX)) {
01261 value=(int8_t)u_charDigitValue(ch);
01262 if (value < 0) {
01263
01264 if (ch >= 0x61 && ch <= 0x7A) {
01265 value = (int8_t)(ch - 0x57);
01266 }
01267 else if (ch >= 0x41 && ch <= 0x5A) {
01268 value = (int8_t)(ch - 0x37);
01269 }
01270 }
01271 } else {
01272 value = -1;
01273 }
01274 return (int8_t)((value < radix) ? value : -1);
01275 }
01276
01277 inline UChar32
01278 Unicode::forDigit(int32_t digit, int8_t radix) {
01279
01280 if((uint8_t)(radix-MIN_RADIX)>(MAX_RADIX-MIN_RADIX) || (uint32_t)digit>=(uint32_t)radix) {
01281 return 0;
01282 } else if(digit<10) {
01283 return (UChar32)(0x30+digit);
01284 } else {
01285 return (UChar32)((0x61-10)+digit);
01286 }
01287 }
01288
01289 inline void
01290 Unicode::getUnicodeVersion(UVersionInfo versionArray) {
01291 u_getUnicodeVersion(versionArray);
01292 }
01293
01294 #endif