Main Page   Class Hierarchy   Compound List   File List   Header Files   Sources   Compound Members   File Members  

normlzr.h

00001 /*
00002  ********************************************************************
00003  * COPYRIGHT: 
00004  * Copyright (c) 1996-1999, International Business Machines Corporation and
00005  * others. All Rights Reserved.
00006  ********************************************************************
00007  */
00008 
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011 
00012 #include "unicode/utypes.h"
00013 #include "unicode/unistr.h"
00014 #include "unicode/chariter.h"
00015 
00016 /* forward declaration */
00017 class ComposedCharIter;
00018 
00106 class U_COMMON_API Normalizer
00107 {
        // Normalizer declares: mode/option constants, constructors over three
        // kinds of text source (UnicodeString, UChar* + length, CharacterIterator),
        // static one-shot normalize/compose/decompose helpers, an iterator-style
        // API (current/first/last/next/previous/setIndex/...), and property
        // accessors. Only declarations are visible in this header; notes marked
        // NOTE(review) are assumptions to confirm against the implementation.
        //
        // NOTE(review): a copy constructor and a destructor are declared but no
        // operator=, while the private member `text` is a raw CharacterIterator*.
        // Confirm that copy-assignment is either safe or never used.
00108 
00109  public:
00110   // This tells us what the bits in the "mode" mean.
        // The EMode enumerators declared further down are built by OR-ing these.
00111   enum {
00112     COMPAT_BIT         = 1,
00113     DECOMP_BIT         = 2,
00114     COMPOSE_BIT     = 4
00115   };
00116 
00117 
00118 
        // NOTE(review): presumably the sentinel the iteration methods return when
        // they run off either end of the text -- confirm in the implementation.
00120   enum {
00121       DONE=0xffff
00122   };
00123 
        // Normalization mode; every enumerator is a combination of the mode bits
        // declared at the top of the class.
00125   enum EMode {
00126 
          // No mode bits set.
00139     NO_OP         = 0,
00140     
          // COMPOSE_BIT only.
          // NOTE(review): presumably canonical composition (Unicode form C).
00155     COMPOSE         = COMPOSE_BIT,
00156 
          // COMPOSE_BIT together with COMPAT_BIT.
          // NOTE(review): presumably compatibility composition (Unicode form KC).
00171     COMPOSE_COMPAT     = COMPOSE_BIT | COMPAT_BIT,
00172 
          // DECOMP_BIT only.
          // NOTE(review): presumably canonical decomposition (Unicode form D).
00187     DECOMP         = DECOMP_BIT,
00188 
          // DECOMP_BIT together with COMPAT_BIT.
          // NOTE(review): presumably compatibility decomposition (Unicode form KD).
00203     DECOMP_COMPAT     = DECOMP_BIT | COMPAT_BIT
00204   };
00205 
        // Option flag bits.
        // NOTE(review): presumably the values accepted by the int32_t
        // opt/option/options parameters and by setOption/getOption -- confirm.
00207   enum {
00208 
00226     IGNORE_HANGUL     = 0x001
00227   };
00228 
00229   // Constructors
        // Each text source (UnicodeString, UChar* + length, CharacterIterator)
        // has two overloads: one taking only a mode, one taking a mode plus an
        // int32_t of option bits.
        // NOTE(review): the mode-only overloads presumably behave as if the
        // option argument were 0 -- confirm in the implementation.
00230 
00241   Normalizer(const UnicodeString& str, 
00242          EMode mode);
00243     
00262   Normalizer(const UnicodeString& str, 
00263          EMode mode, 
00264          int32_t opt);
00265 
        // `length` accompanies the raw UChar* text.
        // NOTE(review): presumably a count of UChar units; whether the text is
        // copied or merely referenced is not visible here -- confirm.
00278   Normalizer(const UChar* str,
00279          int32_t length,
00280          EMode mode);
00281 
00298   Normalizer(const UChar* str,
00299          int32_t length,
00300          EMode mode,
00301          int32_t option);
00302 
        // The iterator is taken by const reference.
        // NOTE(review): presumably cloned internally, since the private member
        // `text` is a non-const CharacterIterator* -- confirm.
00314   Normalizer(const CharacterIterator& iter, 
00315          EMode mode);
00316 
00332   Normalizer(const CharacterIterator& iter, 
00333          EMode mode, 
00334          int32_t opt);
00335 
        // Copy constructor.
00340   Normalizer(const Normalizer& copy);
00341 
        // Destructor (non-virtual).
00346   ~Normalizer();
00347 
00348 
00349   //-------------------------------------------------------------------------
00350   // Static utility methods
00351   //-------------------------------------------------------------------------
00352 
        // One-shot form: `source` is read-only; `result` and `status` are
        // non-const references, i.e. outputs of the call.
00373   static void normalize(const UnicodeString& source, 
00374             EMode mode, 
00375             int32_t options,
00376             UnicodeString& result, 
00377             UErrorCode &status);
00378 
        // Same shape as normalize(), but with a UBool `compat` in place of an EMode.
        // NOTE(review): `compat` presumably selects the COMPAT_BIT variant.
00402   static void compose(const UnicodeString& source, 
00403               UBool compat,
00404               int32_t options,
00405               UnicodeString& result, 
00406               UErrorCode &status);
00407 
        // Decomposition counterpart of compose(); identical parameter list.
00434   static void decompose(const UnicodeString& source, 
00435             UBool compat,
00436             int32_t options,
00437             UnicodeString& result, 
00438             UErrorCode &status);
00439 
00440 
00441   //-------------------------------------------------------------------------
00442   // CharacterIterator overrides
00443   //-------------------------------------------------------------------------
00444   
        // NOTE(review): despite the banner above, this class declares no base
        // class and none of these methods is virtual.
        // NOTE(review): these return UChar32, while the private iteration helpers
        // return UChar and DONE is 0xffff -- confirm the unit of iteration.
        // Only current() is const; the other character accessors may move the
        // iteration position.
00449   UChar32              current(void) const;
00450 
00456   UChar32              first(void);
00457 
00464   UChar32              last(void);
00465 
00472   UChar32              next(void);
00473 
00480   UChar32              previous(void);
00481 
00499   UChar32              setIndex(UTextOffset index);
00500 
00509   void                reset(void);
00510 
        // Position queries; all const.
        // NOTE(review): whether these offsets refer to the input text or to the
        // normalized output is not visible in this header -- confirm.
00525   UTextOffset            getIndex(void) const;
00526 
00533   UTextOffset            startIndex(void) const;
00534 
00541   UTextOffset            endIndex(void) const;
00542 
00543 
        // Equality is declared against Normalizer only; the CharacterIterator
        // overload is commented out below.
00549   //  virtual UBool    operator==(const CharacterIterator& that) const;
00550   UBool        operator==(const Normalizer& that) const;
00551   inline UBool        operator!=(const Normalizer& that) const;
00552 
        // Returns a pointer to a Normalizer.
        // NOTE(review): presumably a heap-allocated copy owned by the caller.
00558   Normalizer*        clone(void) const;
00559 
00564   int32_t                hashCode(void) const;
00565 
00566   //-------------------------------------------------------------------------
00567   // Property access methods
00568   //-------------------------------------------------------------------------
00569 
00596   void setMode(EMode newMode);
00597 
00604   EMode getMode(void) const;
00605 
        // Sets or queries a single option by its int32_t value.
        // NOTE(review): presumably a flag bit such as IGNORE_HANGUL, with `value`
        // turning it on or off -- confirm.
00629   void setOption(int32_t option, 
00630          UBool value);
00631 
00638   UBool getOption(int32_t option) const;
00639 
        // setText overloads mirror the three constructor text sources; each one
        // takes a UErrorCode& through which it can report status.
00645   void setText(const UnicodeString& newText, 
00646            UErrorCode &status);
00647 
00653   void setText(const CharacterIterator& newText, 
00654            UErrorCode &status);
00655 
00661   void setText(const UChar* newText,
00662                     int32_t length,
00663             UErrorCode &status);
        // Writes into the caller-supplied UnicodeString. Declared non-const.
00670   void            getText(UnicodeString&  result);
00671 
        // Returns a const UChar* and reports a count through `count`.
        // NOTE(review): ownership and lifetime of the returned pointer are not
        // visible in this header -- confirm before holding on to it.
00677   const UChar*     getText(int32_t&  count);
00678 
00679 private:
00680   // Private utility methods for iteration
00681   // For documentation, see the source code
00682   UChar nextCompose(void);
00683   UChar prevCompose(void);
00684   UChar nextDecomp(void);
00685   UChar prevDecomp(void);
00686 
00687   UChar curForward(void);
00688   UChar curBackward(void);
00689 
        // NOTE(review): init() takes a non-const CharacterIterator*; presumably
        // the object takes ownership of it (see `text` below) -- confirm.
00690   void    init(CharacterIterator* iter, 
00691          EMode mode, 
00692          int32_t option);
00693   void    initBuffer(void);
00694   void    clearBuffer(void);
00695 
00696   // Utilities used by Compose
00697   static void        bubbleAppend(UnicodeString& target, 
00698                      UChar ch, 
00699                      uint32_t cclass);
00700   static uint32_t     getComposeClass(UChar ch);
00701   static uint16_t    composeLookup(UChar ch);
00702   static uint16_t    composeAction(uint16_t baseIndex, 
00703                       uint16_t comIndex);
00704   static void        explode(UnicodeString& target, 
00705                 uint16_t index);
00706   static UChar    pairExplode(UnicodeString& target, 
00707                     uint16_t action);
00708 
00709   // Utilities used by Decompose
00710   static void        fixCanonical(UnicodeString& result);    // Reorders combining marks
00711   static uint8_t    getClass(UChar ch);                    // Gets char's combining class
00712 
00713   // Other static utility methods
00714   static void doAppend(const UChar source[], 
00715                uint16_t offset, 
00716                UnicodeString& dest);
00717   static void doInsert(const UChar source[], 
00718                uint16_t offset, 
00719                UnicodeString& dest, 
00720                UTextOffset pos);
00721   static uint16_t doReplace(const UChar source[], 
00722                uint16_t offset, 
00723                UnicodeString& dest, 
00724                UTextOffset pos);
00725 
00726   static void hangulToJamo(UChar ch, 
00727                UnicodeString& result, 
00728                uint16_t decompLimit);
00729   static void jamoAppend(UChar ch, 
00730              uint16_t decompLimit, 
00731              UnicodeString& dest);
00732   static void jamoToHangul(UnicodeString& buffer, 
00733                UTextOffset start);
00734 
00735   //-------------------------------------------------------------------------
00736   // Private data
00737   //-------------------------------------------------------------------------
00738 
        // Current mode and option bits.
        // NOTE(review): presumably what getMode()/getOption() report -- confirm.
00739   EMode         fMode;
00740   int32_t       fOptions;
00741   int16_t    minDecomp;
00742 
00743   // The input text and our position in it
00744   CharacterIterator*  text;
00745 
00746   // A buffer for holding intermediate results
00747   UnicodeString       buffer;
00748   UTextOffset          bufferPos;
00749   UTextOffset          bufferLimit;
00750   UChar             currentChar;
00751 
00752   // Another buffer for use during iterative composition
00753   UnicodeString       explodeBuf;
00754 
00755   enum {
00756     EMPTY = -1,
00757     STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder
00758     STR_LENGTH_MASK = 0x0003
00759   };
00760 
        // Hangul/jamo constants.
        // JAMO_NCOUNT = 21 * 28 = 588, and 19 * 588 = 11172 = 0x2ba4, so
        // HANGUL_LIMIT (0xd7a4) equals HANGUL_BASE + JAMO_LCOUNT * JAMO_NCOUNT.
        // NOTE(review): presumably HANGUL_LIMIT is an exclusive upper bound and
        // these feed hangulToJamo/jamoAppend/jamoToHangul -- confirm.
00761   enum {
00762     HANGUL_BASE = 0xac00,
00763     HANGUL_LIMIT = 0xd7a4,
00764     JAMO_LBASE = 0x1100,
00765     JAMO_VBASE = 0x1161,
00766     JAMO_TBASE = 0x11a7,
00767     JAMO_LCOUNT = 19,
00768     JAMO_VCOUNT = 21,
00769     JAMO_TCOUNT = 28,
00770     JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT
00771   };
00772 
        // ComposedCharIter is granted access to the private members above.
00773   friend class ComposedCharIter;
00774 };
00775 
00776 // Inequality: the logical negation of the member operator== for the same argument.
00777 inline UBool Normalizer::operator!=(const Normalizer& other) const
00778 { const UBool isEqual = this->operator==(other); return !isEqual; }
00779 
00780 #endif // NORMLZR_H

Generated at Wed Aug 16 16:05:34 2000 for ICU1.6 by doxygen 1.0.0 written by Dimitri van Heesch, © 1997-1999