00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011
00012 #include "unicode/utypes.h"
00013 #include "unicode/uobject.h"
00014 #include "unicode/unistr.h"
00015 #include "unicode/chariter.h"
00016 #include "unicode/unorm.h"
00017
// UCharIterator is only used through a pointer in this header
// (Normalizer::text), so an incomplete type is sufficient here. The
// elaborated type specifier inside the typedef both declares the struct at
// global scope and introduces the alias of the same name.
typedef struct UCharIterator UCharIterator;
00020
00021 U_NAMESPACE_BEGIN
00112 class U_COMMON_API Normalizer : public UObject {
00113 public:
00119 enum {
00120 DONE=0xffff
00121 };
00122
00123
00124
00135 Normalizer(const UnicodeString& str, UNormalizationMode mode);
00136
00148 Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
00149
00160 Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
00161
00167 Normalizer(const Normalizer& copy);
00168
00173 ~Normalizer();
00174
00175
00176
00177
00178
00179
00197 static void normalize(const UnicodeString& source,
00198 UNormalizationMode mode, int32_t options,
00199 UnicodeString& result,
00200 UErrorCode &status);
00201
00223 static void compose(const UnicodeString& source,
00224 UBool compat, int32_t options,
00225 UnicodeString& result,
00226 UErrorCode &status);
00227
00250 static void decompose(const UnicodeString& source,
00251 UBool compat, int32_t options,
00252 UnicodeString& result,
00253 UErrorCode &status);
00254
00275 static inline UNormalizationCheckResult
00276 quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
00277
00298 static inline UBool
00299 isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
00300
00301
00302
00303
00304
00305
00306
00307
00308
00309
00310
00311
00312
00313
00314
00315
00316
00317
00318
00319
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330 static UnicodeString &
00331 concatenate(UnicodeString &left, UnicodeString &right,
00332 UnicodeString &result,
00333 UNormalizationMode mode, int32_t options,
00334 UErrorCode &errorCode);
00335
00398 static inline int32_t
00399 compare(const UnicodeString &s1, const UnicodeString &s2,
00400 uint32_t options,
00401 UErrorCode &errorCode);
00402
00403
00404
00405
00406
00415 UChar32 current(void);
00416
00425 UChar32 first(void);
00426
00435 UChar32 last(void);
00436
00445 UChar32 next(void);
00446
00455 UChar32 previous(void);
00456
00476 UChar32 setIndex(int32_t index);
00477
00487 void setIndexOnly(int32_t index);
00488
00494 void reset(void);
00495
00510 int32_t getIndex(void) const;
00511
00520 int32_t startIndex(void) const;
00521
00532 int32_t endIndex(void) const;
00533
00542 UBool operator==(const Normalizer& that) const;
00543
00552 inline UBool operator!=(const Normalizer& that) const;
00553
00560 Normalizer* clone(void) const;
00561
00568 int32_t hashCode(void) const;
00569
00570
00571
00572
00573
00589 void setMode(UNormalizationMode newMode);
00590
00601 UNormalizationMode getUMode(void) const;
00602
00619 void setOption(int32_t option,
00620 UBool value);
00621
00632 UBool getOption(int32_t option) const;
00633
00642 void setText(const UnicodeString& newText,
00643 UErrorCode &status);
00644
00653 void setText(const CharacterIterator& newText,
00654 UErrorCode &status);
00655
00665 void setText(const UChar* newText,
00666 int32_t length,
00667 UErrorCode &status);
00674 void getText(UnicodeString& result);
00675
00681 virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
00682
00688 static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
00689
00690
00691
00692
00693
00698 enum {
00699 COMPAT_BIT = 1,
00700 DECOMP_BIT = 2,
00701 COMPOSE_BIT = 4,
00702 FCD_BIT = 8
00703 };
00704
00709 enum EMode {
00723 NO_OP = 0,
00724
00740 COMPOSE = COMPOSE_BIT,
00741
00757 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00758
00774 DECOMP = DECOMP_BIT,
00775
00791 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT,
00792
00796 FCD = FCD_BIT
00797 };
00798
00800 enum {
00819 IGNORE_HANGUL = 0x001
00820 };
00821
00832 Normalizer(const UnicodeString& str,
00833 EMode mode);
00834
00853 Normalizer(const UnicodeString& str,
00854 EMode mode,
00855 int32_t opt);
00856
00868 Normalizer(const UChar* str,
00869 int32_t length,
00870 EMode mode);
00871
00887 Normalizer(const UChar* str,
00888 int32_t length,
00889 EMode mode,
00890 int32_t option);
00891
00902 Normalizer(const CharacterIterator& iter,
00903 EMode mode);
00904
00920 Normalizer(const CharacterIterator& iter,
00921 EMode mode,
00922 int32_t opt);
00923
00944 inline static void
00945 normalize(const UnicodeString& source,
00946 EMode mode,
00947 int32_t options,
00948 UnicodeString& result,
00949 UErrorCode &status);
00950
00967 inline static UNormalizationCheckResult
00968 quickCheck(const UnicodeString& source,
00969 EMode mode,
00970 UErrorCode& status);
00971
00979 inline static UNormalizationMode getUNormalizationMode(EMode mode,
00980 UErrorCode& status);
00981
00989 inline static EMode getNormalizerEMode(UNormalizationMode mode,
00990 UErrorCode& status);
00991
01018 inline void setMode(EMode newMode);
01019
01026 inline EMode getMode(void) const;
01027
01028 private:
01029
01030
01031
01032
01033
01034
01035 UBool nextNormalize();
01036 UBool previousNormalize();
01037
01038 void init(CharacterIterator *iter);
01039 void clearBuffer(void);
01040
01041
01042
01043 inline static UNormalizationMode getUMode(EMode mode);
01044
01045
01046
01047
01048
01049 UNormalizationMode fUMode;
01050 int32_t fOptions;
01051
01052
01053 UCharIterator *text;
01054
01055
01056
01057 int32_t currentIndex, nextIndex;
01058
01059
01060 UnicodeString buffer;
01061 int32_t bufferPos;
01062
01067 static const char fgClassID;
01068 };
01069
01070
01071
01072
01073
01074 inline UBool
01075 Normalizer::operator!= (const Normalizer& other) const
01076 { return ! operator==(other); }
01077
01078 inline void
01079 Normalizer::normalize(const UnicodeString& source,
01080 EMode mode, int32_t options,
01081 UnicodeString& result,
01082 UErrorCode &status) {
01083 normalize(source, getUNormalizationMode(mode, status), options, result, status);
01084 }
01085
01086 inline UNormalizationCheckResult
01087 Normalizer::quickCheck(const UnicodeString& source,
01088 EMode mode,
01089 UErrorCode &status) {
01090 return quickCheck(source, getUNormalizationMode(mode, status), status);
01091 }
01092
01093 inline UNormalizationCheckResult
01094 Normalizer::quickCheck(const UnicodeString& source,
01095 UNormalizationMode mode,
01096 UErrorCode &status) {
01097 if(U_FAILURE(status)) {
01098 return UNORM_MAYBE;
01099 }
01100
01101 return unorm_quickCheck(source.getBuffer(), source.length(),
01102 mode, &status);
01103 }
01104
01105 inline UBool
01106 Normalizer::isNormalized(const UnicodeString& source,
01107 UNormalizationMode mode,
01108 UErrorCode &status) {
01109 if(U_FAILURE(status)) {
01110 return FALSE;
01111 }
01112
01113 return unorm_isNormalized(source.getBuffer(), source.length(),
01114 mode, &status);
01115 }
01116
01117 inline int32_t
01118 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
01119 uint32_t options,
01120 UErrorCode &errorCode) {
01121
01122 return unorm_compare(s1.getBuffer(), s1.length(),
01123 s2.getBuffer(), s2.length(),
01124 options,
01125 &errorCode);
01126 }
01127
01128 inline void
01129 Normalizer::setMode(EMode newMode) {
01130 UErrorCode status = U_ZERO_ERROR;
01131 fUMode = getUNormalizationMode(newMode, status);
01132 }
01133
01134 inline Normalizer::EMode
01135 Normalizer::getMode() const {
01136 UErrorCode status = U_ZERO_ERROR;
01137 return getNormalizerEMode(fUMode, status);
01138 }
01139
01140 inline UNormalizationMode Normalizer::getUNormalizationMode(
01141 Normalizer::EMode mode, UErrorCode &status)
01142 {
01143 if (U_SUCCESS(status))
01144 {
01145 switch (mode)
01146 {
01147 case Normalizer::NO_OP :
01148 return UNORM_NONE;
01149 case Normalizer::COMPOSE :
01150 return UNORM_NFC;
01151 case Normalizer::COMPOSE_COMPAT :
01152 return UNORM_NFKC;
01153 case Normalizer::DECOMP :
01154 return UNORM_NFD;
01155 case Normalizer::DECOMP_COMPAT :
01156 return UNORM_NFKD;
01157 case Normalizer::FCD:
01158 return UNORM_FCD;
01159 default :
01160 status = U_ILLEGAL_ARGUMENT_ERROR;
01161 }
01162 }
01163 return UNORM_DEFAULT;
01164 }
01165
01166 inline UNormalizationMode
01167 Normalizer::getUMode(Normalizer::EMode mode) {
01168 switch(mode) {
01169 case Normalizer::NO_OP :
01170 return UNORM_NONE;
01171 case Normalizer::COMPOSE :
01172 return UNORM_NFC;
01173 case Normalizer::COMPOSE_COMPAT :
01174 return UNORM_NFKC;
01175 case Normalizer::DECOMP :
01176 return UNORM_NFD;
01177 case Normalizer::DECOMP_COMPAT :
01178 return UNORM_NFKD;
01179 case Normalizer::FCD:
01180 return UNORM_FCD;
01181 default :
01182 return UNORM_DEFAULT;
01183 }
01184 }
01185
01186 inline Normalizer::EMode Normalizer::getNormalizerEMode(
01187 UNormalizationMode mode, UErrorCode &status)
01188 {
01189 if (U_SUCCESS(status))
01190 {
01191 switch (mode)
01192 {
01193 case UNORM_NONE :
01194 return Normalizer::NO_OP;
01195 case UNORM_NFD :
01196 return Normalizer::DECOMP;
01197 case UNORM_NFKD :
01198 return Normalizer::DECOMP_COMPAT;
01199 case UNORM_NFC :
01200 return Normalizer::COMPOSE;
01201 case UNORM_NFKC :
01202 return Normalizer::COMPOSE_COMPAT;
01203 case UNORM_FCD:
01204 return Normalizer::FCD;
01205 default :
01206 status = U_ILLEGAL_ARGUMENT_ERROR;
01207 }
01208 }
01209 return Normalizer::DECOMP_COMPAT;
01210 }
01211
01212 U_NAMESPACE_END
#endif // NORMLZR_H