00001 /* 00002 ******************************************************************** 00003 * COPYRIGHT: 00004 * Copyright (c) 1996-1999, International Business Machines Corporation and 00005 * others. All Rights Reserved. 00006 ******************************************************************** 00007 */ 00008 00009 #ifndef NORMLZR_H 00010 #define NORMLZR_H 00011 00012 #include "unicode/utypes.h" 00013 #include "unicode/unistr.h" 00014 #include "unicode/chariter.h" 00015 00016 /* forward declaration */ 00017 class ComposedCharIter; 00018 00106 class U_COMMON_API Normalizer 00107 { 00108 00109 public: 00110 // This tells us what the bits in the "mode" mean. 00111 enum { 00112 COMPAT_BIT = 1, 00113 DECOMP_BIT = 2, 00114 COMPOSE_BIT = 4 00115 }; 00116 00117 00118 00120 enum { 00121 DONE=0xffff 00122 }; 00123 00125 enum EMode { 00126 00139 NO_OP = 0, 00140 00155 COMPOSE = COMPOSE_BIT, 00156 00171 COMPOSE_COMPAT = COMPOSE_BIT | COMPAT_BIT, 00172 00187 DECOMP = DECOMP_BIT, 00188 00203 DECOMP_COMPAT = DECOMP_BIT | COMPAT_BIT 00204 }; 00205 00207 enum { 00208 00226 IGNORE_HANGUL = 0x001 00227 }; 00228 00229 // Constructors 00230 00241 Normalizer(const UnicodeString& str, 00242 EMode mode); 00243 00262 Normalizer(const UnicodeString& str, 00263 EMode mode, 00264 int32_t opt); 00265 00278 Normalizer(const UChar* str, 00279 int32_t length, 00280 EMode mode); 00281 00298 Normalizer(const UChar* str, 00299 int32_t length, 00300 EMode mode, 00301 int32_t option); 00302 00314 Normalizer(const CharacterIterator& iter, 00315 EMode mode); 00316 00332 Normalizer(const CharacterIterator& iter, 00333 EMode mode, 00334 int32_t opt); 00335 00340 Normalizer(const Normalizer& copy); 00341 00346 ~Normalizer(); 00347 00348 00349 //------------------------------------------------------------------------- 00350 // Static utility methods 00351 //------------------------------------------------------------------------- 00352 00373 static void normalize(const UnicodeString& source, 00374 EMode mode, 00375 int32_t options, 00376 UnicodeString& result, 00377 UErrorCode &status); 00378 00402 static void compose(const UnicodeString& source, 00403 UBool compat, 00404 int32_t options, 00405 UnicodeString& result, 00406 UErrorCode &status); 00407 00434 static void decompose(const UnicodeString& source, 00435 UBool compat, 00436 int32_t options, 00437 UnicodeString& result, 00438 UErrorCode &status); 00439 00440 00441 //------------------------------------------------------------------------- 00442 // CharacterIterator overrides 00443 //------------------------------------------------------------------------- 00444 00449 UChar32 current(void) const; 00450 00456 UChar32 first(void); 00457 00464 UChar32 last(void); 00465 00472 UChar32 next(void); 00473 00480 UChar32 previous(void); 00481 00499 UChar32 setIndex(UTextOffset index); 00500 00509 void reset(void); 00510 00525 UTextOffset getIndex(void) const; 00526 00533 UTextOffset startIndex(void) const; 00534 00541 UTextOffset endIndex(void) const; 00542 00543 00549 // virtual UBool operator==(const CharacterIterator& that) const; 00550 UBool operator==(const Normalizer& that) const; 00551 inline UBool operator!=(const Normalizer& that) const; 00552 00558 Normalizer* clone(void) const; 00559 00564 int32_t hashCode(void) const; 00565 00566 //------------------------------------------------------------------------- 00567 // Property access methods 00568 //------------------------------------------------------------------------- 00569 00596 void setMode(EMode newMode); 00597 00604 EMode getMode(void) const; 00605 00629 void setOption(int32_t option, 00630 UBool value); 00631 00638 UBool getOption(int32_t option) const; 00639 00645 void setText(const UnicodeString& newText, 00646 UErrorCode &status); 00647 00653 void setText(const CharacterIterator& newText, 00654 UErrorCode &status); 00655 00661 void setText(const UChar* newText, 00662 int32_t length, 00663 UErrorCode &status); 00670 void getText(UnicodeString& result); 00671 00677 const UChar* getText(int32_t& count); 00678 00679 private: 00680 // Private utility methods for iteration 00681 // For documentation, see the source code 00682 UChar nextCompose(void); 00683 UChar prevCompose(void); 00684 UChar nextDecomp(void); 00685 UChar prevDecomp(void); 00686 00687 UChar curForward(void); 00688 UChar curBackward(void); 00689 00690 void init(CharacterIterator* iter, 00691 EMode mode, 00692 int32_t option); 00693 void initBuffer(void); 00694 void clearBuffer(void); 00695 00696 // Utilities used by Compose 00697 static void bubbleAppend(UnicodeString& target, 00698 UChar ch, 00699 uint32_t cclass); 00700 static uint32_t getComposeClass(UChar ch); 00701 static uint16_t composeLookup(UChar ch); 00702 static uint16_t composeAction(uint16_t baseIndex, 00703 uint16_t comIndex); 00704 static void explode(UnicodeString& target, 00705 uint16_t index); 00706 static UChar pairExplode(UnicodeString& target, 00707 uint16_t action); 00708 00709 // Utilities used by Decompose 00710 static void fixCanonical(UnicodeString& result); // Reorders combining marks 00711 static uint8_t getClass(UChar ch); // Gets char's combining class 00712 00713 // Other static utility methods 00714 static void doAppend(const UChar source[], 00715 uint16_t offset, 00716 UnicodeString& dest); 00717 static void doInsert(const UChar source[], 00718 uint16_t offset, 00719 UnicodeString& dest, 00720 UTextOffset pos); 00721 00722 static void hangulToJamo(UChar ch, 00723 UnicodeString& result, 00724 uint16_t decompLimit); 00725 static void jamoAppend(UChar ch, 00726 uint16_t decompLimit, 00727 UnicodeString& dest); 00728 static void jamoToHangul(UnicodeString& buffer, 00729 UTextOffset start); 00730 00731 //------------------------------------------------------------------------- 00732 // Private data 00733 //------------------------------------------------------------------------- 00734 00735 EMode fMode; 00736 int32_t fOptions; 00737 int16_t minDecomp; 00738 00739 // The input text and our position in it 00740 CharacterIterator* text; 00741 00742 // A buffer for holding intermediate results 00743 UnicodeString buffer; 00744 UTextOffset bufferPos; 00745 UTextOffset bufferLimit; 00746 UChar currentChar; 00747 00748 // Another buffer for use during iterative composition 00749 UnicodeString explodeBuf; 00750 00751 enum { 00752 EMPTY = -1, 00753 STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder 00754 STR_LENGTH_MASK = 0x0003 00755 }; 00756 00757 enum { 00758 HANGUL_BASE = 0xac00, 00759 HANGUL_LIMIT = 0xd7a4, 00760 JAMO_LBASE = 0x1100, 00761 JAMO_VBASE = 0x1161, 00762 JAMO_TBASE = 0x11a7, 00763 JAMO_LCOUNT = 19, 00764 JAMO_VCOUNT = 21, 00765 JAMO_TCOUNT = 28, 00766 JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT 00767 }; 00768 00769 friend class ComposedCharIter; 00770 }; 00771 00772 inline UBool 00773 Normalizer::operator!= (const Normalizer& other) const 00774 { return ! operator==(other); } 00775 00776 #endif // _NORMLZR