00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef NORMLZR_H
00023 #define NORMLZR_H
00024
00025 #include "unicode/utypes.h"
00026 #include "unicode/unistr.h"
00027 #include "unicode/chariter.h"
00028 #include "unicode/unorm.h"
00029
00030
00031 class ComposedCharIter;
00032
00128 class U_COMMON_API Normalizer
00129 {
00130
00131 public:
00132
00133 enum {
00134 COMPAT_BIT = 1,
00135 DECOMP_BIT = 2,
00136 COMPOSE_BIT = 4
00137 };
00138
00139
00140
00142 enum {
00143 DONE=0xffff
00144 };
00145
00147 enum EMode {
00148
00161 NO_OP = 0,
00162
00177 COMPOSE = COMPOSE_BIT,
00178
00193 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00194
00209 DECOMP = DECOMP_BIT,
00210
00225 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT
00226 };
00227
00229 enum {
00230
00248 IGNORE_HANGUL = 0x001
00249 };
00250
00251
00252
00263 Normalizer(const UnicodeString& str,
00264 EMode mode);
00265
00284 Normalizer(const UnicodeString& str,
00285 EMode mode,
00286 int32_t opt);
00287
00300 Normalizer(const UChar* str,
00301 int32_t length,
00302 EMode mode);
00303
00320 Normalizer(const UChar* str,
00321 int32_t length,
00322 EMode mode,
00323 int32_t option);
00324
00336 Normalizer(const CharacterIterator& iter,
00337 EMode mode);
00338
00354 Normalizer(const CharacterIterator& iter,
00355 EMode mode,
00356 int32_t opt);
00357
00362 Normalizer(const Normalizer& copy);
00363
00368 ~Normalizer();
00369
00370
00371
00372
00373
00374
00395 static void normalize(const UnicodeString& source,
00396 EMode mode,
00397 int32_t options,
00398 UnicodeString& result,
00399 UErrorCode &status);
00400
00424 static void compose(const UnicodeString& source,
00425 UBool compat,
00426 int32_t options,
00427 UnicodeString& result,
00428 UErrorCode &status);
00429
00456 static void decompose(const UnicodeString& source,
00457 UBool compat,
00458 int32_t options,
00459 UnicodeString& result,
00460 UErrorCode &status);
00461
00468 inline static UNormalizationMode getUNormalizationMode(EMode mode,
00469 UErrorCode& status);
00470
00477 inline static EMode getNormalizerEMode(UNormalizationMode mode,
00478 UErrorCode& status);
00479
00495 static UNormalizationCheckResult
00496 quickCheck(const UnicodeString& source,
00497 EMode mode,
00498 UErrorCode& status);
00499
00500
00501
00502
00503
00508 UChar32 current(void) const;
00509
00515 UChar32 first(void);
00516
00523 UChar32 last(void);
00524
00531 UChar32 next(void);
00532
00539 UChar32 previous(void);
00540
00558 UChar32 setIndex(UTextOffset index);
00559
00568 void reset(void);
00569
00584 UTextOffset getIndex(void) const;
00585
00592 UTextOffset startIndex(void) const;
00593
00600 UTextOffset endIndex(void) const;
00601
00602
00608
00609 UBool operator==(const Normalizer& that) const;
00610 inline UBool operator!=(const Normalizer& that) const;
00611
00617 Normalizer* clone(void) const;
00618
00623 int32_t hashCode(void) const;
00624
00625
00626
00627
00628
00655 void setMode(EMode newMode);
00656
00663 EMode getMode(void) const;
00664
00688 void setOption(int32_t option,
00689 UBool value);
00690
00697 UBool getOption(int32_t option) const;
00698
00704 void setText(const UnicodeString& newText,
00705 UErrorCode &status);
00706
00712 void setText(const CharacterIterator& newText,
00713 UErrorCode &status);
00714
00720 void setText(const UChar* newText,
00721 int32_t length,
00722 UErrorCode &status);
00729 void getText(UnicodeString& result);
00730
00736 const UChar* getText(int32_t& count);
00737
00738 private:
00739
00740
00741 UChar nextCompose(void);
00742 UChar prevCompose(void);
00743 UChar nextDecomp(void);
00744 UChar prevDecomp(void);
00745
00746 UChar curForward(void);
00747 UChar curBackward(void);
00748
00749 void init(CharacterIterator* iter,
00750 EMode mode,
00751 int32_t option);
00752 void initBuffer(void);
00753 void clearBuffer(void);
00754
00755
00756 static void bubbleAppend(UnicodeString& target,
00757 UChar ch,
00758 uint32_t cclass);
00759 static uint32_t getComposeClass(UChar ch);
00760 static uint16_t composeLookup(UChar ch);
00761 static uint16_t composeAction(uint16_t baseIndex,
00762 uint16_t comIndex);
00763 static void explode(UnicodeString& target,
00764 uint16_t index);
00765 static UChar pairExplode(UnicodeString& target,
00766 uint16_t action);
00767
00768
00769 static void fixCanonical(UnicodeString& result);
00770 static uint8_t getClass(UChar ch);
00771
00772
00773 static void doAppend(const UChar source[],
00774 uint16_t offset,
00775 UnicodeString& dest);
00776 static void doInsert(const UChar source[],
00777 uint16_t offset,
00778 UnicodeString& dest,
00779 UTextOffset pos);
00780 static uint16_t doReplace(const UChar source[],
00781 uint16_t offset,
00782 UnicodeString& dest,
00783 UTextOffset pos);
00784
00785 static void hangulToJamo(UChar ch,
00786 UnicodeString& result,
00787 uint16_t decompLimit);
00788 static void jamoAppend(UChar ch,
00789 uint16_t decompLimit,
00790 UnicodeString& dest);
00791 static void jamoToHangul(UnicodeString& buffer,
00792 UTextOffset start);
00793
00794
00795
00796
00797
00798 EMode fMode;
00799 int32_t fOptions;
00800 int16_t minDecomp;
00801
00802
00803 CharacterIterator* text;
00804
00805
00806 UnicodeString buffer;
00807 UTextOffset bufferPos;
00808 UTextOffset bufferLimit;
00809 UChar currentChar;
00810
00811
00812 UnicodeString explodeBuf;
00813
00814 enum {
00815 EMPTY = -1,
00816 STR_INDEX_SHIFT = 2,
00817 STR_LENGTH_MASK = 0x0003
00818 };
00819
00820 enum {
00821 HANGUL_BASE = 0xac00,
00822 HANGUL_LIMIT = 0xd7a4,
00823 JAMO_LBASE = 0x1100,
00824 JAMO_VBASE = 0x1161,
00825 JAMO_TBASE = 0x11a7,
00826 JAMO_LCOUNT = 19,
00827 JAMO_VCOUNT = 21,
00828 JAMO_TCOUNT = 28,
00829 JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
00830 };
00831
00832 friend class ComposedCharIter;
00833 };
00834
00835 inline UBool
00836 Normalizer::operator!= (const Normalizer& other) const
00837 { return ! operator==(other); }
00838
00839 inline UNormalizationMode Normalizer::getUNormalizationMode(
00840 Normalizer::EMode mode, UErrorCode &status)
00841 {
00842 if (U_SUCCESS(status))
00843 {
00844 switch (mode)
00845 {
00846 case Normalizer::NO_OP :
00847 return UNORM_NONE;
00848 case Normalizer::COMPOSE :
00849 return UNORM_NFC;
00850 case Normalizer::COMPOSE_COMPAT :
00851 return UNORM_NFKC;
00852 case Normalizer::DECOMP :
00853 return UNORM_NFD;
00854 case Normalizer::DECOMP_COMPAT :
00855 return UNORM_NFKD;
00856 default :
00857 status = U_ILLEGAL_ARGUMENT_ERROR;
00858 }
00859 }
00860 return UNORM_DEFAULT;
00861 }
00862
00863 inline Normalizer::EMode Normalizer::getNormalizerEMode(
00864 UNormalizationMode mode, UErrorCode &status)
00865 {
00866 if (U_SUCCESS(status))
00867 {
00868 switch (mode)
00869 {
00870 case UNORM_NONE :
00871 return Normalizer::NO_OP;
00872 case UNORM_NFD :
00873 return Normalizer::DECOMP;
00874 case UNORM_NFKD :
00875 return Normalizer::DECOMP_COMPAT;
00876 case UNORM_NFC :
00877 return Normalizer::COMPOSE;
00878 case UNORM_NFKC :
00879 return Normalizer::COMPOSE_COMPAT;
00880 default :
00881 status = U_ILLEGAL_ARGUMENT_ERROR;
00882 }
00883 }
00884 return Normalizer::DECOMP_COMPAT;
00885 }
00886
00887 #endif // NORMLZR_H