00001 /* 00002 ******************************************************************** 00003 * COPYRIGHT: 00004 * Copyright (c) 1996-1999, International Business Machines Corporation and 00005 * others. All Rights Reserved. 00006 ******************************************************************** 00007 */ 00008 00009 #ifndef NORMLZR_H 00010 #define NORMLZR_H 00011 00012 #include "unicode/utypes.h" 00013 #include "unicode/unistr.h" 00014 #include "unicode/chariter.h" 00015 00016 /* forward declaration */ 00017 class ComposedCharIter; 00018 00106 class U_COMMON_API Normalizer 00107 { 00108 00109 public: 00110 // This tells us what the bits in the "mode" mean. 00111 enum { 00112 COMPAT_BIT = 1, 00113 DECOMP_BIT = 2, 00114 COMPOSE_BIT = 4 00115 }; 00116 00117 00118 00120 enum { 00121 DONE=0xffff 00122 }; 00123 00125 enum EMode { 00126 00139 NO_OP = 0, 00140 00155 COMPOSE = COMPOSE_BIT, 00156 00171 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT, 00172 00187 DECOMP = DECOMP_BIT, 00188 00203 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT 00204 }; 00205 00207 enum { 00208 00226 IGNORE_HANGUL = 0x001 00227 }; 00228 00229 // Constructors 00230 00241 Normalizer(const UnicodeString& str, 00242 EMode mode); 00243 00262 Normalizer(const UnicodeString& str, 00263 EMode mode, 00264 int32_t opt); 00265 00278 Normalizer(const UChar* str, 00279 int32_t length, 00280 EMode mode); 00281 00298 Normalizer(const UChar* str, 00299 int32_t length, 00300 EMode mode, 00301 int32_t option); 00302 00314 Normalizer(const CharacterIterator& iter, 00315 EMode mode); 00316 00332 Normalizer(const CharacterIterator& iter, 00333 EMode mode, 00334 int32_t opt); 00335 00340 Normalizer(const Normalizer& copy); 00341 00346 ~Normalizer(); 00347 00348 00349 //------------------------------------------------------------------------- 00350 // Static utility methods 00351 //------------------------------------------------------------------------- 00352 00373 static void normalize(const UnicodeString& source, 00374 EMode mode, 00375 int32_t options, 00376 UnicodeString& result, 00377 UErrorCode &status); 00378 00402 static void compose(const UnicodeString& source, 00403 UBool compat, 00404 int32_t options, 00405 UnicodeString& result, 00406 UErrorCode &status); 00407 00434 static void decompose(const UnicodeString& source, 00435 UBool compat, 00436 int32_t options, 00437 UnicodeString& result, 00438 UErrorCode &status); 00439 00440 00441 //------------------------------------------------------------------------- 00442 // CharacterIterator overrides 00443 //------------------------------------------------------------------------- 00444 00449 UChar32 current(void) const; 00450 00456 UChar32 first(void); 00457 00464 UChar32 last(void); 00465 00472 UChar32 next(void); 00473 00480 UChar32 previous(void); 00481 00499 UChar32 setIndex(UTextOffset index); 00500 00509 void reset(void); 00510 00525 UTextOffset getIndex(void) const; 00526 00533 UTextOffset startIndex(void) const; 00534 00541 UTextOffset endIndex(void) const; 00542 00543 00549 // virtual UBool operator==(const CharacterIterator& that) const; 00550 UBool operator==(const Normalizer& that) const; 00551 inline UBool operator!=(const Normalizer& that) const; 00552 00558 Normalizer* clone(void) const; 00559 00564 int32_t hashCode(void) const; 00565 00566 //------------------------------------------------------------------------- 00567 // Property access methods 00568 //------------------------------------------------------------------------- 00569 00596 void setMode(EMode newMode); 00597 00604 EMode getMode(void) const; 00605 00629 void setOption(int32_t option, 00630 UBool value); 00631 00638 UBool getOption(int32_t option) const; 00639 00645 void setText(const UnicodeString& newText, 00646 UErrorCode &status); 00647 00653 void setText(const CharacterIterator& newText, 00654 UErrorCode &status); 00655 00661 void setText(const UChar* newText, 00662 int32_t length, 00663 UErrorCode &status); 00670 void getText(UnicodeString& result); 00671 00677 const UChar* getText(int32_t& count); 00678 00679 private: 00680 // Private utility methods for iteration 00681 // For documentation, see the source code 00682 UChar nextCompose(void); 00683 UChar prevCompose(void); 00684 UChar nextDecomp(void); 00685 UChar prevDecomp(void); 00686 00687 UChar curForward(void); 00688 UChar curBackward(void); 00689 00690 void init(CharacterIterator* iter, 00691 EMode mode, 00692 int32_t option); 00693 void initBuffer(void); 00694 void clearBuffer(void); 00695 00696 // Utilities used by Compose 00697 static void bubbleAppend(UnicodeString& target, 00698 UChar ch, 00699 uint32_t cclass); 00700 static uint32_t getComposeClass(UChar ch); 00701 static uint16_t composeLookup(UChar ch); 00702 static uint16_t composeAction(uint16_t baseIndex, 00703 uint16_t comIndex); 00704 static void explode(UnicodeString& target, 00705 uint16_t index); 00706 static UChar pairExplode(UnicodeString& target, 00707 uint16_t action); 00708 00709 // Utilities used by Decompose 00710 static void fixCanonical(UnicodeString& result); // Reorders combining marks 00711 static uint8_t getClass(UChar ch); // Gets char's combining class 00712 00713 // Other static utility methods 00714 static void doAppend(const UChar source[], 00715 uint16_t offset, 00716 UnicodeString& dest); 00717 static void doInsert(const UChar source[], 00718 uint16_t offset, 00719 UnicodeString& dest, 00720 UTextOffset pos); 00721 static uint16_t doReplace(const UChar source[], 00722 uint16_t offset, 00723 UnicodeString& dest, 00724 UTextOffset pos); 00725 00726 static void hangulToJamo(UChar ch, 00727 UnicodeString& result, 00728 uint16_t decompLimit); 00729 static void jamoAppend(UChar ch, 00730 uint16_t decompLimit, 00731 UnicodeString& dest); 00732 static void jamoToHangul(UnicodeString& buffer, 00733 UTextOffset start); 00734 00735 //------------------------------------------------------------------------- 00736 // Private data 00737 //------------------------------------------------------------------------- 00738 00739 EMode fMode; 00740 int32_t fOptions; 00741 int16_t minDecomp; 00742 00743 // The input text and our position in it 00744 CharacterIterator* text; 00745 00746 // A buffer for holding intermediate results 00747 UnicodeString buffer; 00748 UTextOffset bufferPos; 00749 UTextOffset bufferLimit; 00750 UChar currentChar; 00751 00752 // Another buffer for use during iterative composition 00753 UnicodeString explodeBuf; 00754 00755 enum { 00756 EMPTY = -1, 00757 STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder 00758 STR_LENGTH_MASK = 0x0003 00759 }; 00760 00761 enum { 00762 HANGUL_BASE = 0xac00, 00763 HANGUL_LIMIT = 0xd7a4, 00764 JAMO_LBASE = 0x1100, 00765 JAMO_VBASE = 0x1161, 00766 JAMO_TBASE = 0x11a7, 00767 JAMO_LCOUNT = 19, 00768 JAMO_VCOUNT = 21, 00769 JAMO_TCOUNT = 28, 00770 JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT 00771 }; 00772 00773 friend class ComposedCharIter; 00774 }; 00775 00776 inline UBool 00777 Normalizer::operator!= (const Normalizer& other) const 00778 { return ! operator==(other); } 00779 00780 #endif // _NORMLZR