Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members   Search  

normlzr.h

Go to the documentation of this file.
00001 /*
00002  ********************************************************************
00003  * COPYRIGHT: 
00004  * Copyright (c) 1996-2001, International Business Machines Corporation and
00005  * others. All Rights Reserved.
00006  ********************************************************************
00007  */
00008 
00009 #ifndef NORMLZR_H
00010 #define NORMLZR_H
00011 
00012 #include "unicode/utypes.h"
00013 #include "unicode/uobject.h"
00014 #include "unicode/unistr.h"
00015 #include "unicode/chariter.h"
00016 #include "unicode/unorm.h"
00017 
// Forward declaration of the C iterator struct so that the Normalizer class
// below can declare a UCharIterator* member (its "text" field).
00018 struct UCharIterator;
00019 typedef struct UCharIterator UCharIterator;
00020 
00021 U_NAMESPACE_BEGIN
/**
 * Normalizer declares two groups of functionality:
 *  - static utility methods (normalize, compose, decompose, quickCheck,
 *    isNormalized, concatenate, compare), and
 *  - an iteration API (current/first/last/next/previous, setIndex, ...) on an
 *    object constructed from a UnicodeString, a UChar array or a
 *    CharacterIterator together with a normalization mode.
 *
 * The current API is expressed in terms of UNormalizationMode. The overloads
 * that take the nested EMode enum are grouped under "Deprecated APIs" below.
 *
 * NOTE(review): the notes added in this class describe only what the
 * declarations and the inline implementations at the end of this header show.
 */
00112 class U_COMMON_API Normalizer : public UObject {
00113 public:
        // NOTE(review): presumably the value returned by the iteration methods
        // when there is no further character -- confirm against the implementation.
00119   enum {
00120       DONE=0xffff
00121   };
00122 
00123   // Constructors
00124 
00135   Normalizer(const UnicodeString& str, UNormalizationMode mode);
00136     
00148   Normalizer(const UChar* str, int32_t length, UNormalizationMode mode);
00149 
00160   Normalizer(const CharacterIterator& iter, UNormalizationMode mode);
00161 
00167   Normalizer(const Normalizer& copy);
00168 
        // NOTE(review): a copy constructor and a destructor are declared, but no
        // copy-assignment operator, while the class holds a raw UCharIterator*
        // ("text"). Confirm that implicit member-wise assignment is safe, or
        // declare operator= private and leave it unimplemented.
00173   ~Normalizer();
00174 
00175 
00176   //-------------------------------------------------------------------------
00177   // Static utility methods
00178   //-------------------------------------------------------------------------
00179 
00197   static void normalize(const UnicodeString& source,
00198                         UNormalizationMode mode, int32_t options,
00199                         UnicodeString& result,
00200                         UErrorCode &status);
00201 
00223   static void compose(const UnicodeString& source,
00224                       UBool compat, int32_t options,
00225                       UnicodeString& result,
00226                       UErrorCode &status);
00227 
00250   static void decompose(const UnicodeString& source,
00251                         UBool compat, int32_t options,
00252                         UnicodeString& result,
00253                         UErrorCode &status);
00254 
        // Defined inline at the end of this header: returns UNORM_MAYBE if status
        // already indicates failure, otherwise forwards to unorm_quickCheck().
00275   static inline UNormalizationCheckResult
00276   quickCheck(const UnicodeString &source, UNormalizationMode mode, UErrorCode &status);
00277 
        // Defined inline at the end of this header: returns FALSE if the error
        // code already indicates failure, otherwise forwards to unorm_isNormalized().
00298   static inline UBool
00299   isNormalized(const UnicodeString &src, UNormalizationMode mode, UErrorCode &errorCode);
00300 
00301   /**
00302    * Concatenate normalized strings, making sure that the result is normalized as well.
00303    *
00304    * If both the left and the right strings are in
00305    * the normalization form according to "mode",
00306    * then the result will be
00307    *
00308    * \code
00309    *     dest=normalize(left+right, mode)
00310    * \endcode
00311    *
00312    * For details see unorm_concatenate in unorm.h.
00313    *
         * NOTE(review): left and right are declared as non-const references
         * although they are described as source strings -- confirm whether
         * const references were intended.
         *
00314    * @param left Left source string.
00315    * @param right Right source string.
00316    * @param result The output string.
00317    * @param mode The normalization mode.
00318    * @param options A bit set of normalization options.
00319    * @param errorCode ICU error code in/out parameter.
00320    *                  Must fulfill U_SUCCESS before the function call.
00321    * @return result
00322    *
00323    * @see unorm_concatenate
00324    * @see normalize
00325    * @see unorm_next
00326    * @see unorm_previous
00327    *
00328    * @draft ICU 2.1
00329    */
00330   static UnicodeString &
00331   concatenate(UnicodeString &left, UnicodeString &right,
00332               UnicodeString &result,
00333               UNormalizationMode mode, int32_t options,
00334               UErrorCode &errorCode);
00335 
        // Defined inline at the end of this header as a direct call to
        // unorm_compare(), which does all argument checking.
00398   static inline int32_t
00399   compare(const UnicodeString &s1, const UnicodeString &s2,
00400           uint32_t options,
00401           UErrorCode &errorCode);
00402 
00403   //-------------------------------------------------------------------------
00404   // Iteration API
00405   //-------------------------------------------------------------------------
00406   
00415   UChar32              current(void);
00416 
00425   UChar32              first(void);
00426 
00435   UChar32              last(void);
00436 
00445   UChar32              next(void);
00446 
00455   UChar32              previous(void);
00456 
00476   UChar32              setIndex(int32_t index);
00477 
00487   void                 setIndexOnly(int32_t index);
00488 
00494   void                reset(void);
00495 
00510   int32_t            getIndex(void) const;
00511 
00520   int32_t            startIndex(void) const;
00521 
00532   int32_t            endIndex(void) const;
00533 
00542   UBool        operator==(const Normalizer& that) const;
00543 
        // Defined inline at the end of this header as the negation of operator==.
00552   inline UBool        operator!=(const Normalizer& that) const;
00553 
00560   Normalizer*        clone(void) const;
00561 
00568   int32_t                hashCode(void) const;
00569 
00570   //-------------------------------------------------------------------------
00571   // Property access methods
00572   //-------------------------------------------------------------------------
00573 
00589   void setMode(UNormalizationMode newMode);
00590 
00601   UNormalizationMode getUMode(void) const;
00602 
00619   void setOption(int32_t option, 
00620          UBool value);
00621 
00632   UBool getOption(int32_t option) const;
00633 
00642   void setText(const UnicodeString& newText, 
00643            UErrorCode &status);
00644 
00653   void setText(const CharacterIterator& newText, 
00654            UErrorCode &status);
00655 
00665   void setText(const UChar* newText,
00666                     int32_t length,
00667             UErrorCode &status);
00674   void            getText(UnicodeString&  result);
00675 
        // Returns the same value as getStaticClassID().
00681   virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
00682 
        // Returns the address of the static member fgClassID, cast to UClassID.
00688   static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
00689 
00690   //-------------------------------------------------------------------------
00691   // Deprecated APIs
00692   //-------------------------------------------------------------------------
00693 
        // Bit flags from which the EMode values below are composed.
00698   enum {
00699     COMPAT_BIT         = 1,
00700     DECOMP_BIT         = 2,
00701     COMPOSE_BIT        = 4,
00702     FCD_BIT            = 8
00703   };
00704 
        // Mode enum of the deprecated API. getUNormalizationMode() and
        // getNormalizerEMode() (inline, end of this header) convert between
        // these values and UNormalizationMode:
        //   NO_OP <-> UNORM_NONE, COMPOSE <-> UNORM_NFC,
        //   COMPOSE_COMPAT <-> UNORM_NFKC, DECOMP <-> UNORM_NFD,
        //   DECOMP_COMPAT <-> UNORM_NFKD, FCD <-> UNORM_FCD.
00709   enum EMode {
00723     NO_OP         = 0,
00724     
00740     COMPOSE         = COMPOSE_BIT,
00741 
00757     COMPOSE_COMPAT     = COMPOSE_BIT | COMPAT_BIT,
00758 
00774     DECOMP         = DECOMP_BIT,
00775 
00791     DECOMP_COMPAT     = DECOMP_BIT | COMPAT_BIT,
00792 
00796     FCD = FCD_BIT
00797   };
00798 
00800   enum {
00819     IGNORE_HANGUL     = 0x001
00820   };
00821 
00832   Normalizer(const UnicodeString& str, 
00833          EMode mode);
00834     
00853   Normalizer(const UnicodeString& str, 
00854          EMode mode, 
00855          int32_t opt);
00856 
00868   Normalizer(const UChar* str,
00869          int32_t length,
00870          EMode mode);
00871 
00887   Normalizer(const UChar* str,
00888          int32_t length,
00889          EMode mode,
00890          int32_t option);
00891 
00902   Normalizer(const CharacterIterator& iter, 
00903          EMode mode);
00904 
00920   Normalizer(const CharacterIterator& iter, 
00921          EMode mode, 
00922          int32_t opt);
00923 
        // Inline (end of this header): converts mode with getUNormalizationMode()
        // and forwards to the UNormalizationMode overload of normalize().
00944   inline static void
00945   normalize(const UnicodeString& source, 
00946             EMode mode, 
00947             int32_t options,
00948             UnicodeString& result, 
00949             UErrorCode &status);
00950 
        // Inline (end of this header): converts mode with getUNormalizationMode()
        // and forwards to the UNormalizationMode overload of quickCheck().
00967   inline static UNormalizationCheckResult
00968   quickCheck(const UnicodeString& source,
00969              EMode                mode, 
00970              UErrorCode&          status);
00971 
        // EMode -> UNormalizationMode. For an unrecognized mode the inline
        // implementation sets status to U_ILLEGAL_ARGUMENT_ERROR and returns
        // UNORM_DEFAULT.
00979   inline static UNormalizationMode getUNormalizationMode(EMode mode, 
00980                                                   UErrorCode& status);
00981 
        // UNormalizationMode -> EMode. For an unrecognized mode the inline
        // implementation sets status to U_ILLEGAL_ARGUMENT_ERROR and returns
        // DECOMP_COMPAT.
00989   inline static EMode getNormalizerEMode(UNormalizationMode mode, 
00990                                          UErrorCode& status);
00991 
        // Inline (end of this header): stores the converted mode in fUMode; the
        // conversion's error code is a local variable and is not reported.
01018   inline void setMode(EMode newMode);
01019 
        // Inline (end of this header): converts fUMode back to an EMode; the
        // conversion's error code is a local variable and is not reported.
01026   inline EMode getMode(void) const;
01027 
01028 private:
01029   //-------------------------------------------------------------------------
01030   // Private functions
01031   //-------------------------------------------------------------------------
01032 
01033   // Private utility methods for iteration
01034   // For documentation, see the source code
01035   UBool nextNormalize();
01036   UBool previousNormalize();
01037 
01038   void    init(CharacterIterator *iter);
01039   void    clearBuffer(void);
01040 
01041   // Helper, without UErrorCode, for easier transitional code
01042   // remove after 2002-sep-30 with EMode etc.
01043   inline static UNormalizationMode getUMode(EMode mode);
01044 
01045   //-------------------------------------------------------------------------
01046   // Private data
01047   //-------------------------------------------------------------------------
01048 
01049   UNormalizationMode  fUMode;
01050   int32_t             fOptions;
01051 
01052   // The input text and our position in it
01053   UCharIterator       *text;
01054 
01055   // The normalization buffer is the result of normalization
01056   // of the source in [currentIndex..nextIndex[ .
01057   int32_t         currentIndex, nextIndex;
01058 
01059   // A buffer for holding intermediate results
01060   UnicodeString       buffer;
01061   int32_t         bufferPos;
01062 
        // Its address is what getStaticClassID() returns as the class ID.
01067   static const char fgClassID;
01068 };
01069 
01070 //-------------------------------------------------------------------------
01071 // Inline implementations
01072 //-------------------------------------------------------------------------
01073 
// Inequality is defined purely as the logical negation of operator==.
01074 inline UBool Normalizer::operator!=(const Normalizer& other) const {
01075   return !operator==(other);
01076 }
01077 
// EMode overload of normalize() (declared under "Deprecated APIs" in the class).
// Converts the EMode with getUNormalizationMode() and forwards everything to
// the UNormalizationMode overload. A conversion error is written to status,
// and the same status object is then handed on to that overload.
01078 inline void
01079 Normalizer::normalize(const UnicodeString& source, EMode mode, int32_t options,
01080                       UnicodeString& result, UErrorCode &status)
01081 {
01082   const UNormalizationMode umode = getUNormalizationMode(mode, status);
01083   normalize(source, umode, options, result, status);
01084 }
01085 
// EMode overload of quickCheck() (declared under "Deprecated APIs" in the class).
// Converts the EMode with getUNormalizationMode() and returns whatever the
// UNormalizationMode overload returns for the converted mode.
01086 inline UNormalizationCheckResult
01087 Normalizer::quickCheck(const UnicodeString& source, EMode mode, UErrorCode &status)
01088 {
01089   const UNormalizationMode umode = getUNormalizationMode(mode, status);
01090   return quickCheck(source, umode, status);
01091 }
01092 
// Quick check of source against the given normalization mode.
// If status already indicates failure on entry, unorm_quickCheck() is not
// called and UNORM_MAYBE is returned; otherwise the string's buffer and length
// are passed to unorm_quickCheck() and its result is returned unchanged.
01093 inline UNormalizationCheckResult
01094 Normalizer::quickCheck(const UnicodeString& source,
01095                        UNormalizationMode mode,
01096                        UErrorCode &status)
01097 {
01098   return U_FAILURE(status)
01099       ? UNORM_MAYBE
01100       : unorm_quickCheck(source.getBuffer(), source.length(), mode, &status);
01101 }
01104 
// Tests whether source is normalized according to mode.
// If status already indicates failure on entry, FALSE is returned without
// calling unorm_isNormalized(); otherwise its result is returned unchanged.
01105 inline UBool
01106 Normalizer::isNormalized(const UnicodeString& source,
01107                          UNormalizationMode mode,
01108                          UErrorCode &status)
01109 {
01110   if(!U_FAILURE(status)) {
01111     return unorm_isNormalized(source.getBuffer(), source.length(), mode, &status);
01112   }
01113   return FALSE;
01114 }
01116 
// Compares s1 and s2 by handing their buffers and lengths, together with
// options and the error code, to unorm_compare() and returning its result.
01117 inline int32_t
01118 Normalizer::compare(const UnicodeString &s1, const UnicodeString &s2,
01119                     uint32_t options, UErrorCode &errorCode)
01120 {
01121   // unorm_compare does all argument checking, so nothing is validated here
01122   const int32_t order = unorm_compare(s1.getBuffer(), s1.length(),
01123                                       s2.getBuffer(), s2.length(),
01124                                       options, &errorCode);
01125   return order;
01126 }
01127 
// EMode overload of setMode(): stores the converted mode in fUMode.
// The conversion's error code lives in a local variable and is discarded, so
// for an unrecognized EMode fUMode becomes UNORM_DEFAULT (the value
// getUNormalizationMode() returns in that case) without any error report.
01128 inline void Normalizer::setMode(EMode newMode)
01129 {
01130   UErrorCode ignoredStatus = U_ZERO_ERROR;
01131   fUMode = getUNormalizationMode(newMode, ignoredStatus);
01132 }
01133 
// Returns the current mode (fUMode) expressed as an EMode.
// The conversion's error code lives in a local variable and is discarded, so
// if fUMode has no EMode counterpart the caller receives DECOMP_COMPAT (the
// value getNormalizerEMode() returns in that case) without any error report.
01134 inline Normalizer::EMode Normalizer::getMode() const
01135 {
01136   UErrorCode ignoredStatus = U_ZERO_ERROR;
01137   return getNormalizerEMode(fUMode, ignoredStatus);
01138 }
01139 
// Maps an EMode value to the corresponding UNormalizationMode.
//  - status not successful on entry: status is left untouched and
//    UNORM_DEFAULT is returned.
//  - mode is none of the six EMode constants: status is set to
//    U_ILLEGAL_ARGUMENT_ERROR and UNORM_DEFAULT is returned.
01140 inline UNormalizationMode
01141 Normalizer::getUNormalizationMode(Normalizer::EMode mode, UErrorCode &status)
01142 {
01143   if (!U_SUCCESS(status)) {
01144     return UNORM_DEFAULT;
01145   }
01146   switch (mode) {
01147   case Normalizer::NO_OP:          return UNORM_NONE;
01148   case Normalizer::COMPOSE:        return UNORM_NFC;
01149   case Normalizer::COMPOSE_COMPAT: return UNORM_NFKC;
01150   case Normalizer::DECOMP:         return UNORM_NFD;
01151   case Normalizer::DECOMP_COMPAT:  return UNORM_NFKD;
01152   case Normalizer::FCD:            return UNORM_FCD;
01153   default:
01154     break;
01155   }
      // Only reached for a value outside the EMode constants handled above.
01156   status = U_ILLEGAL_ARGUMENT_ERROR;
01157   return UNORM_DEFAULT;
01158 }
01165 
// Private EMode -> UNormalizationMode helper without a UErrorCode parameter.
// Uses the same mapping as getUNormalizationMode(); any value that is not one
// of the six EMode constants yields UNORM_DEFAULT and no error is reported.
01166 inline UNormalizationMode
01167 Normalizer::getUMode(Normalizer::EMode mode)
01168 {
01169   UNormalizationMode umode = UNORM_DEFAULT;
01170   switch(mode) {
01171   case Normalizer::NO_OP:          umode = UNORM_NONE; break;
01172   case Normalizer::COMPOSE:        umode = UNORM_NFC;  break;
01173   case Normalizer::COMPOSE_COMPAT: umode = UNORM_NFKC; break;
01174   case Normalizer::DECOMP:         umode = UNORM_NFD;  break;
01175   case Normalizer::DECOMP_COMPAT:  umode = UNORM_NFKD; break;
01176   case Normalizer::FCD:            umode = UNORM_FCD;  break;
01177   default:                         break;  // keep UNORM_DEFAULT
01178   }
01179   return umode;
01180 }
01185 
// Maps a UNormalizationMode value back to the corresponding EMode.
//  - status not successful on entry: status is left untouched and
//    DECOMP_COMPAT is returned.
//  - mode is none of the six values handled below: status is set to
//    U_ILLEGAL_ARGUMENT_ERROR and DECOMP_COMPAT is returned.
01186 inline Normalizer::EMode
01187 Normalizer::getNormalizerEMode(UNormalizationMode mode, UErrorCode &status)
01188 {
01189   Normalizer::EMode emode = Normalizer::DECOMP_COMPAT;
01190   if (U_SUCCESS(status)) {
01191     switch (mode) {
01192     case UNORM_NONE: emode = Normalizer::NO_OP;          break;
01193     case UNORM_NFD:  emode = Normalizer::DECOMP;         break;
01194     case UNORM_NFKD: emode = Normalizer::DECOMP_COMPAT;  break;
01195     case UNORM_NFC:  emode = Normalizer::COMPOSE;        break;
01196     case UNORM_NFKC: emode = Normalizer::COMPOSE_COMPAT; break;
01197     case UNORM_FCD:  emode = Normalizer::FCD;            break;
01198     default:         status = U_ILLEGAL_ARGUMENT_ERROR;  break;
01199     }
01200   }
01201   return emode;
01202 }
01211 
01212 U_NAMESPACE_END
01213 #endif // NORMLZR_H

Generated on Thu Aug 15 14:13:26 2002 for ICU 2.2 by doxygen 1.2.11.1, written by Dimitri van Heesch, © 1997-2001