Main Page   Class Hierarchy   Alphabetical List   Compound List   File List   Compound Members   File Members  

txtbdat.h

Go to the documentation of this file.
00001 /*
00002 * Copyright © {1997-1999}, International Business Machines Corporation and others. All Rights Reserved.
00003 *****************************************************************************************
00004 *
00005 * File TXTBDAT.H
00006 *
00007 * Modification History:
00008 *
00009 *   Date        Name        Description
00010 *   02/18/97    aliu        Converted from OpenClass.
00011 *                           Made static data members const where appropriate.
00012 *   03/25/97    aliu        Removed subclasses, and merged their static data into this
00013 *                           class.  Instantiated four static instances for character,
00014 *                           word, sentence, and line.  Made forward(), backward(), and
00015 *                           map() methods inline.
00016 *   04/15/97    aliu        Worked around bug in AIX xlC compiler which occurs if static
00017 *                           arrays contain const elements.
00018 *   05/06/97    aliu        Made kSI, kStop, and kSI_Stop into #defines to help out
00019 *                           non-compliant compilers.
00020 *****************************************************************************************
00021 */
00022 
00023 #ifndef TXTBDAT_H
00024 #define TXTBDAT_H
00025 
00026 #include "unicode/utypes.h"
00027 class WordBreakTable;
00028 class UnicodeClassMapping;
00029 class SpecialMapping;
00030 
00036 class TextBoundaryData { // Bundles a forward table, a backward table and a class map; also declares the static per-break-kind data
00037 public:
00038     ~TextBoundaryData() {} // Do not subclass (destructor is non-virtual and frees nothing)
00039 
00040     // Fast inline accessors
00041     const WordBreakTable* forward(void) const;   // returns fForward
00042     const WordBreakTable* backward(void) const;  // returns fBackward
00043     const UnicodeClassMapping* map(void) const;  // returns fMap
00044 
00045     static const TextBoundaryData kCharacterBreakData; // Four shared const instances, one per break kind (character, word, line, sentence)
00046     static const TextBoundaryData kWordBreakData;
00047     static const TextBoundaryData kLineBreakData;
00048     static const TextBoundaryData kSentenceBreakData;
00049 
00050     typedef uint8_t Node; // element type of the k*ForwardData / k*BackwardData arrays
00051     typedef uint8_t Type; // element type of the k*RawMapping / k*AsciiValues arrays
00052 
00053 private:
00054     static const UChar ASCII_END_OF_TEXT; // Named UChar constants follow; they are only declared here, their values are not given in this header
00055     static const UChar ASCII_HORIZONTAL_TABULATION;
00056     static const UChar ASCII_LINEFEED;
00057     static const UChar ASCII_VERTICAL_TABULATION;
00058     static const UChar ASCII_FORM_FEED;
00059     static const UChar ASCII_CARRIAGE_RETURN;
00060     static const UChar ASCII_SPACE;
00061     static const UChar ASCII_EXCLAMATION_MARK;
00062     static const UChar ASCII_QUOTATION_MARK;
00063     static const UChar ASCII_NUMBER_SIGN;
00064     static const UChar ASCII_DOLLAR_SIGN;
00065     static const UChar ASCII_PERCENT;
00066     static const UChar ASCII_AMPERSAND;
00067     static const UChar ASCII_APOSTROPHE;
00068     static const UChar ASCII_COMMA;
00069     static const UChar ASCII_FULL_STOP;
00070     static const UChar ASCII_COLON;
00071     static const UChar ASCII_SEMICOLON;
00072     static const UChar ASCII_QUESTION_MARK;
00073     static const UChar ASCII_NONBREAKING_SPACE;
00074     static const UChar ASCII_CENT_SIGN;
00075     static const UChar ASCII_POUND_SIGN;
00076     static const UChar ASCII_YEN_SIGN;
00077     static const UChar LATIN1_SOFTHYPHEN;
00078     static const UChar LATIN1_DEGREE_SIGN;
00079     static const UChar ARABIC_PERCENT_SIGN;
00080     static const UChar ARABIC_DECIMAL_SEPARATOR;
00081     static const UChar HANGUL_CHOSEONG_LOW;
00082     static const UChar HANGUL_CHOSEONG_HIGH;
00083     static const UChar HANGUL_JUNGSEONG_LOW;
00084     static const UChar HANGUL_JUNGSEONG_HIGH;
00085     static const UChar HANGUL_JONGSEONG_LOW;
00086     static const UChar HANGUL_JONGSEONG_HIGH;
00087     static const UChar FIGURE_SPACE;
00088     static const UChar NONBREAKING_HYPHEN;
00089     static const UChar PUNCTUATION_HYPHENATION_POINT;
00090     static const UChar PUNCTUATION_LINE_SEPARATOR;
00091     static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR;
00092     static const UChar PER_MILLE_SIGN;
00093     static const UChar PER_TEN_THOUSAND_SIGN;
00094     static const UChar PRIME;
00095     static const UChar DOUBLE_PRIME;
00096     static const UChar TRIPLE_PRIME;
00097     static const UChar DEGREE_CELSIUS;
00098     static const UChar DEGREE_FAHRENHEIT;
00099     static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA;
00100     static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP; 
00101     static const UChar IDEOGRAPHIC_ITERATION_MARK;
00102     static const UChar HIRAGANA_LETTER_SMALL_A;
00103     static const UChar HIRAGANA_LETTER_A;
00104     static const UChar HIRAGANA_LETTER_SMALL_I;
00105     static const UChar HIRAGANA_LETTER_I;
00106     static const UChar HIRAGANA_LETTER_SMALL_U;
00107     static const UChar HIRAGANA_LETTER_U;
00108     static const UChar HIRAGANA_LETTER_SMALL_E;
00109     static const UChar HIRAGANA_LETTER_E;
00110     static const UChar HIRAGANA_LETTER_SMALL_O;
00111     static const UChar HIRAGANA_LETTER_O;
00112     static const UChar HIRAGANA_LETTER_DI;
00113     static const UChar HIRAGANA_LETTER_SMALL_TU;
00114     static const UChar HIRAGANA_LETTER_TU;
00115     static const UChar HIRAGANA_LETTER_MO;
00116     static const UChar HIRAGANA_LETTER_SMALL_YA;
00117     static const UChar HIRAGANA_LETTER_YA;
00118     static const UChar HIRAGANA_LETTER_SMALL_YU;
00119     static const UChar HIRAGANA_LETTER_YU;
00120     static const UChar HIRAGANA_LETTER_SMALL_YO;
00121     static const UChar HIRAGANA_LETTER_YO;
00122     static const UChar HIRAGANA_LETTER_RO;
00123     static const UChar HIRAGANA_LETTER_SMALL_WA;
00124     static const UChar HIRAGANA_LETTER_WA;
00125     static const UChar HIRAGANA_LETTER_VU;
00126     static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK;
00127     static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK;
00128     static const UChar HIRAGANA_ITERATION_MARK;
00129     static const UChar HIRAGANA_VOICED_ITERATION_MARK;
00130     static const UChar KATAKANA_LETTER_SMALL_A;
00131     static const UChar KATAKANA_LETTER_A;
00132     static const UChar KATAKANA_LETTER_SMALL_I;
00133     static const UChar KATAKANA_LETTER_I;
00134     static const UChar KATAKANA_LETTER_SMALL_U;
00135     static const UChar KATAKANA_LETTER_U;
00136     static const UChar KATAKANA_LETTER_SMALL_E;
00137     static const UChar KATAKANA_LETTER_E;
00138     static const UChar KATAKANA_LETTER_SMALL_O;
00139     static const UChar KATAKANA_LETTER_O;
00140     static const UChar KATAKANA_LETTER_DI;
00141     static const UChar KATAKANA_LETTER_SMALL_TU;
00142     static const UChar KATAKANA_LETTER_TU;
00143     static const UChar KATAKANA_LETTER_MO;
00144     static const UChar KATAKANA_LETTER_SMALL_YA;
00145     static const UChar KATAKANA_LETTER_YA;
00146     static const UChar KATAKANA_LETTER_SMALL_YU;
00147     static const UChar KATAKANA_LETTER_YU;
00148     static const UChar KATAKANA_LETTER_SMALL_YO;
00149     static const UChar KATAKANA_LETTER_YO;
00150     static const UChar KATAKANA_LETTER_RO;
00151     static const UChar KATAKANA_LETTER_SMALL_WA;
00152     static const UChar KATAKANA_LETTER_WA;
00153     static const UChar KATAKANA_LETTER_VU;
00154     static const UChar KATAKANA_LETTER_SMALL_KA;
00155     static const UChar KATAKANA_LETTER_SMALL_KE;
00156     static const UChar KATAKANA_LETTER_VA;
00157     static const UChar KATAKANA_LETTER_VO;
00158     static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK;
00159     static const UChar KATAKANA_ITERATION_MARK;
00160     static const UChar KATAKANA_VOICED_ITERATION_MARK;
00161     static const UChar UNICODE_LOW_BOUND_HAN;
00162     static const UChar UNICODE_HIGH_BOUND_HAN;
00163     static const UChar HANGUL_SYL_LOW;
00164     static const UChar HANGUL_SYL_HIGH;
00165     static const UChar CJK_COMPATIBILITY_F900;
00166     static const UChar CJK_COMPATIBILITY_FA2D;
00167     static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE;
00168     static const UChar FULLWIDTH_EXCLAMATION_MARK;
00169     static const UChar FULLWIDTH_FULL_STOP;
00170     static const UChar FULLWIDTH_QUESTION_MARK;
00171     static const UChar END_OF_STRING;
00172 
00173 private:
00174     // Character data
00175     enum CharacterMapping
00176     {
00177         // These enum values must occur in this order; do not
00178         // modify unless you know what you are doing!  The forward
00179         // and backward data tables are indexed by these enums.
00180         kAccent_diacritic   = 0,
00181         kBaseForm           = 1,
00182         kBaseCR             = 2,
00183         kBaseLF             = 3,
00184         kChoseong           = 4,   // Korean initial consonant
00185         kJungseong          = 5,  // Korean vowel
00186         kJongseong          = 6,  // Korean final consonant
00187         kEOS                = 7,
00188         kCharacterCol_count = 8   // number of enumerators above
00189     };
00190 
00191     static Node                     kCharacterForwardData[];   // Character-break static data; array bounds and contents are not given in this header
00192     static const int32_t            kCharacterForwardData_length;
00193     static WordBreakTable*          kCharacterForward;
00194     static Node                     kCharacterBackwardData[];
00195     static const int32_t            kCharacterBackwardData_length;
00196     static WordBreakTable*          kCharacterBackward;
00197     static Type                     kCharacterRawMapping[];
00198     static const int32_t            kCharacterRawMapping_length;
00199     static SpecialMapping           kCharacterExceptionChar[];
00200     static const int32_t            kCharacterExceptionChar_length;
00201     static const UBool             kCharacterExceptionFlags[];
00202     static UnicodeClassMapping*     kCharacterMap;
00203     static Type                     kCharacterAsciiValues[];
00204 
00205 private:
00206     // Word data
00207     enum WordMapping
00208     {
00209         // These enum values must occur in this order; do not
00210         // modify unless you know what you are doing!  The forward
00211         // and backward data tables are indexed by these enums.
00212         kBreak          = 0,
00213         kLetter         = 1,
00214         kNumber         = 2,
00215         kMidLetter      = 3,
00216         kMidLetNum      = 4,
00217         kPreNum         = 5,
00218         kPostNum        = 6,
00219         kMidNum         = 7,
00220         kPreMidNum      = 8,
00221         kBlank          = 9,
00222         kLF             = 10,
00223         kKata           = 11,
00224         kHira           = 12,
00225         kKanji          = 13,
00226         kDiacrit        = 14,
00227         kCR             = 15,
00228         kNsm            = 16,
00229         kwEOS           = 17,
00230         kWordCol_count  = 18   // number of enumerators above
00231     };
00232 
00233     static Node                     kWordForwardData[];   // Word-break static data; array bounds and contents are not given in this header
00234     static const int32_t            kWordForwardData_length;
00235     static WordBreakTable*          kWordForward;
00236     static Node                     kWordBackwardData[];
00237     static const int32_t            kWordBackwardData_length;
00238     static WordBreakTable*          kWordBackward;
00239     static Type                     kWordRawMapping[];
00240     static const int32_t            kWordRawMapping_length;
00241     static SpecialMapping           kWordExceptionChar[];
00242     static const int32_t            kWordExceptionChar_length;
00243     static UnicodeClassMapping*     kWordMap;
00244     static Type                     kWordAsciiValues[];
00245     static const UBool             kWordExceptionFlags[];
00246 
00247 private:
00248     // Sentence data
00249     enum SentenceMapping
00250     {
00251         // These enum values must occur in this order; do not
00252         // modify unless you know what you are doing!  The forward
00253         // and backward data tables are indexed by these enums.
00254         kOther              = 0,
00255         kSpace              = 1,
00256         kTerminator         = 2,
00257         kAmbiguousTerm      = 3,
00258         kOpenBracket        = 4,
00259         kCloseBracket       = 5,
00260         kCJK                = 6,
00261         kParagraphBreak     = 7,
00262         kLowerCase          = 8,
00263         kUpperCase          = 9,
00264         ksNumber            = 10,
00265         kQuote              = 11,
00266         //ksCR,
00267         ksNsm               = 12,
00268         ksEOS               = 13,
00269         kSentenceCol_count  = 14   // number of enumerators above
00270     };
00271 
00272     static Node                     kSentenceForwardData[];   // Sentence-break static data; array bounds and contents are not given in this header
00273     static const int32_t            kSentenceForwardData_length;
00274     static WordBreakTable*          kSentenceForward;
00275     static Node                     kSentenceBackwardData[];
00276     static const int32_t            kSentenceBackwardData_length;
00277     static WordBreakTable*          kSentenceBackward;
00278     static Type                     kSentenceRawMapping[];
00279     static const int32_t            kSentenceRawMapping_length;
00280     static SpecialMapping           kSentenceExceptionChar[];
00281     static const int32_t            kSentenceExceptionChar_length;
00282     static UnicodeClassMapping*     kSentenceMap;
00283     static Type                     kSentenceAsciiValues[];
00284     static const UBool             kSentenceExceptionFlags[];
00285 
00286 private:
00287     // Line data
00288     enum LineMapping   // no explicit values here: enumerators count up from 0, so kLineCol_count == 14
00289     {
00290         // These enum values must occur in this order; do not
00291         // modify unless you know what you are doing!  The forward
00292         // and backward data tables are indexed by these enums.
00293         kLineBreak,
00294         //always breaks (must be present as first item)
00295         kLineBlank,
00296         //spaces, tabs, nulls.
00297         kLineCR,
00298         //carriage return
00299         kLineNonBlank,
00300         //everything not included elsewhere
00301         kLineOp,
00302         //hyphens....
00303         kLineJwrd,
00304         //hiragana, katakana, and kanji
00305         kLinePreJwrd,
00306         //characters that bind to the beginning of a Japanese word
00307         kLinePostJwrd,
00308         //characters that bind to the end of a Japanese word
00309         kLineDigit,
00310         //digits
00311         kLineNumPunct,
00312         //punctuation that can appear within a number
00313         kLineCurrency,
00314         //currency symbols that can precede a number
00315         kLineNsm,
00316         // non-spacing marks
00317         kLineNbsp,
00318         // non-breaking characters
00319         kLineEOS,
00320         kLineCol_count
00321     };
00322 
00323     static Node                     kLineForwardData[];   // Line-break static data; array bounds and contents are not given in this header
00324     static const int32_t            kLineForwardData_length;
00325     static WordBreakTable*          kLineForward;
00326     static Node                     kLineBackwardData[];
00327     static const int32_t            kLineBackwardData_length;
00328     static WordBreakTable*          kLineBackward;
00329     static Type                     kLineRawMapping[];
00330     static const int32_t            kLineRawMapping_length;
00331     static SpecialMapping           kLineExceptionChar[];
00332     static const int32_t            kLineExceptionChar_length;
00333     static const UBool             kLineExceptionFlags[];
00334     static UnicodeClassMapping*     kLineMap;
00335     static Type                     kLineAsciiValues[];
00336 
00337 protected:
00342     TextBoundaryData(const TextBoundaryData&) : fForward(0), fBackward(0), fMap(0) {} // Copies nothing from the source; members are nulled so the accessors never return an indeterminate pointer
00343     TextBoundaryData& operator=(const TextBoundaryData&) { return *this; } // Assignment is a no-op: the target keeps its own pointers
00344     TextBoundaryData() : fForward(0), fBackward(0), fMap(0) {} // Do not subclass; all three pointers start out null
00345     TextBoundaryData(const WordBreakTable* forward,   // Stores the three pointers as given; the empty destructor never frees them
00346                      const WordBreakTable* backward,
00347                      const UnicodeClassMapping* map)
00348                      : fForward(forward), fBackward(backward), fMap(map) {}
00349         
00350 private:
00351     const WordBreakTable*       fForward;   // returned by forward()
00352     const WordBreakTable*       fBackward;  // returned by backward()
00353     const UnicodeClassMapping*  fMap;       // returned by map()
00354 };
00355 
00356 inline const WordBreakTable* TextBoundaryData::forward() const // Accessor: hands back the stored fForward pointer unchanged
00357 {
00358     return fForward;
00359 }
00360 
00361 inline const WordBreakTable* TextBoundaryData::backward() const // Accessor: hands back the stored fBackward pointer unchanged
00362 {
00363     return fBackward;
00364 }
00365 
00366 inline const UnicodeClassMapping* TextBoundaryData::map() const // Accessor: hands back the stored fMap pointer unchanged
00367 {
00368     return fMap;
00369 }
00370 
00371 // These used to be static consts in the class, but some compilers didn't like that.
00372 #define kStop       (0)            // value 0
00373 #define kSI         (0x80)         // value 0x80, i.e. the top bit of an 8-bit quantity; NOTE(review): presumably a flag bit in Node (uint8_t) table entries - confirm against the data tables
00374 #define kSI_Stop    (kSI+kStop)    // evaluates to 0x80, the same value as kSI
00375 
00376 #define kSI_1       (kSI+1)        // kSI_n evaluates to 0x80 + n, for n = 1..14
00377 #define kSI_2       (kSI+2)
00378 #define kSI_3       (kSI+3)
00379 #define kSI_4       (kSI+4)
00380 #define kSI_5       (kSI+5)
00381 #define kSI_6       (kSI+6)
00382 #define kSI_7       (kSI+7)
00383 #define kSI_8       (kSI+8)
00384 #define kSI_9       (kSI+9)
00385 #define kSI_10      (kSI+10)
00386 #define kSI_11      (kSI+11)
00387 #define kSI_12      (kSI+12)
00388 #define kSI_13      (kSI+13)
00389 #define kSI_14      (kSI+14)
00390 
00391 #endif // TXTBDAT_H
00392 //eof

Generated at Tue Dec 5 17:55:33 2000 for ICU by doxygen 1.2.3 written by Dimitri van Heesch, © 1997-2000