00001
00002
00003
00004
00005
00006
00007
00008
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011
00012 #include "unicode/utypes.h"
00013 #include "unicode/unistr.h"
00014 #include "unicode/chariter.h"
00015
00016
00017 class ComposedCharIter;
00018
00114 class U_COMMON_API Normalizer
00115 {
00116
00117 public:
00118
00119 enum {
00120 COMPAT_BIT = 1,
00121 DECOMP_BIT = 2,
00122 COMPOSE_BIT = 4
00123 };
00124
00125
00126
00128 enum {
00129 DONE=0xffff
00130 };
00131
00133 enum EMode {
00134
00147 NO_OP = 0,
00148
00163 COMPOSE = COMPOSE_BIT,
00164
00179 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT,
00180
00195 DECOMP = DECOMP_BIT,
00196
00211 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT
00212 };
00213
00215 enum {
00216
00234 IGNORE_HANGUL = 0x001
00235 };
00236
00237
00238
00249 Normalizer(const UnicodeString& str,
00250 EMode mode);
00251
00270 Normalizer(const UnicodeString& str,
00271 EMode mode,
00272 int32_t opt);
00273
00286 Normalizer(const UChar* str,
00287 int32_t length,
00288 EMode mode);
00289
00306 Normalizer(const UChar* str,
00307 int32_t length,
00308 EMode mode,
00309 int32_t option);
00310
00322 Normalizer(const CharacterIterator& iter,
00323 EMode mode);
00324
00340 Normalizer(const CharacterIterator& iter,
00341 EMode mode,
00342 int32_t opt);
00343
00348 Normalizer(const Normalizer& copy);
00349
00354 ~Normalizer();
00355
00356
00357
00358
00359
00360
00381 static void normalize(const UnicodeString& source,
00382 EMode mode,
00383 int32_t options,
00384 UnicodeString& result,
00385 UErrorCode &status);
00386
00410 static void compose(const UnicodeString& source,
00411 UBool compat,
00412 int32_t options,
00413 UnicodeString& result,
00414 UErrorCode &status);
00415
00442 static void decompose(const UnicodeString& source,
00443 UBool compat,
00444 int32_t options,
00445 UnicodeString& result,
00446 UErrorCode &status);
00447
00448
00449
00450
00451
00452
00457 UChar32 current(void) const;
00458
00464 UChar32 first(void);
00465
00472 UChar32 last(void);
00473
00480 UChar32 next(void);
00481
00488 UChar32 previous(void);
00489
00507 UChar32 setIndex(UTextOffset index);
00508
00517 void reset(void);
00518
00533 UTextOffset getIndex(void) const;
00534
00541 UTextOffset startIndex(void) const;
00542
00549 UTextOffset endIndex(void) const;
00550
00551
00557
00558 UBool operator==(const Normalizer& that) const;
00559 inline UBool operator!=(const Normalizer& that) const;
00560
00566 Normalizer* clone(void) const;
00567
00572 int32_t hashCode(void) const;
00573
00574
00575
00576
00577
00604 void setMode(EMode newMode);
00605
00612 EMode getMode(void) const;
00613
00637 void setOption(int32_t option,
00638 UBool value);
00639
00646 UBool getOption(int32_t option) const;
00647
00653 void setText(const UnicodeString& newText,
00654 UErrorCode &status);
00655
00661 void setText(const CharacterIterator& newText,
00662 UErrorCode &status);
00663
00669 void setText(const UChar* newText,
00670 int32_t length,
00671 UErrorCode &status);
00678 void getText(UnicodeString& result);
00679
00685 const UChar* getText(int32_t& count);
00686
00687 private:
00688
00689
00690 UChar nextCompose(void);
00691 UChar prevCompose(void);
00692 UChar nextDecomp(void);
00693 UChar prevDecomp(void);
00694
00695 UChar curForward(void);
00696 UChar curBackward(void);
00697
00698 void init(CharacterIterator* iter,
00699 EMode mode,
00700 int32_t option);
00701 void initBuffer(void);
00702 void clearBuffer(void);
00703
00704
00705 static void bubbleAppend(UnicodeString& target,
00706 UChar ch,
00707 uint32_t cclass);
00708 static uint32_t getComposeClass(UChar ch);
00709 static uint16_t composeLookup(UChar ch);
00710 static uint16_t composeAction(uint16_t baseIndex,
00711 uint16_t comIndex);
00712 static void explode(UnicodeString& target,
00713 uint16_t index);
00714 static UChar pairExplode(UnicodeString& target,
00715 uint16_t action);
00716
00717
00718 static void fixCanonical(UnicodeString& result);
00719 static uint8_t getClass(UChar ch);
00720
00721
00722 static void doAppend(const UChar source[],
00723 uint16_t offset,
00724 UnicodeString& dest);
00725 static void doInsert(const UChar source[],
00726 uint16_t offset,
00727 UnicodeString& dest,
00728 UTextOffset pos);
00729 static uint16_t doReplace(const UChar source[],
00730 uint16_t offset,
00731 UnicodeString& dest,
00732 UTextOffset pos);
00733
00734 static void hangulToJamo(UChar ch,
00735 UnicodeString& result,
00736 uint16_t decompLimit);
00737 static void jamoAppend(UChar ch,
00738 uint16_t decompLimit,
00739 UnicodeString& dest);
00740 static void jamoToHangul(UnicodeString& buffer,
00741 UTextOffset start);
00742
00743
00744
00745
00746
00747 EMode fMode;
00748 int32_t fOptions;
00749 int16_t minDecomp;
00750
00751
00752 CharacterIterator* text;
00753
00754
00755 UnicodeString buffer;
00756 UTextOffset bufferPos;
00757 UTextOffset bufferLimit;
00758 UChar currentChar;
00759
00760
00761 UnicodeString explodeBuf;
00762
00763 enum {
00764 EMPTY = -1,
00765 STR_INDEX_SHIFT = 2,
00766 STR_LENGTH_MASK = 0x0003
00767 };
00768
00769 enum {
00770 HANGUL_BASE = 0xac00,
00771 HANGUL_LIMIT = 0xd7a4,
00772 JAMO_LBASE = 0x1100,
00773 JAMO_VBASE = 0x1161,
00774 JAMO_TBASE = 0x11a7,
00775 JAMO_LCOUNT = 19,
00776 JAMO_VCOUNT = 21,
00777 JAMO_TCOUNT = 28,
00778 JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
00779 };
00780
00781 friend class ComposedCharIter;
00782 };
00783
00784 inline UBool
00785 Normalizer::operator!= (const Normalizer& other) const
00786 { return ! operator==(other); }
00787
00788 #endif // NORMLZR_H