00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef TXTBDAT_H
00024 #define TXTBDAT_H
00025
00026 #include "unicode/utypes.h"
00027 class WordBreakTable;
00028 class UnicodeClassMapping;
00029 class SpecialMapping;
00030
00036 class TextBoundaryData {
00037 public:
00038 ~TextBoundaryData() {}
00039
00040
00041 const WordBreakTable* forward(void) const;
00042 const WordBreakTable* backward(void) const;
00043 const UnicodeClassMapping* map(void) const;
00044
00045 static const TextBoundaryData kCharacterBreakData;
00046 static const TextBoundaryData kWordBreakData;
00047 static const TextBoundaryData kLineBreakData;
00048 static const TextBoundaryData kSentenceBreakData;
00049
00050 typedef uint8_t Node;
00051 typedef uint8_t Type;
00052
00053 private:
00054 static const UChar ASCII_END_OF_TEXT;
00055 static const UChar ASCII_HORIZONTAL_TABULATION;
00056 static const UChar ASCII_LINEFEED;
00057 static const UChar ASCII_VERTICAL_TABULATION;
00058 static const UChar ASCII_FORM_FEED;
00059 static const UChar ASCII_CARRIAGE_RETURN;
00060 static const UChar ASCII_SPACE;
00061 static const UChar ASCII_EXCLAMATION_MARK;
00062 static const UChar ASCII_QUOTATION_MARK;
00063 static const UChar ASCII_NUMBER_SIGN;
00064 static const UChar ASCII_DOLLAR_SIGN;
00065 static const UChar ASCII_PERCENT;
00066 static const UChar ASCII_AMPERSAND;
00067 static const UChar ASCII_APOSTROPHE;
00068 static const UChar ASCII_COMMA;
00069 static const UChar ASCII_FULL_STOP;
00070 static const UChar ASCII_COLON;
00071 static const UChar ASCII_SEMICOLON;
00072 static const UChar ASCII_QUESTION_MARK;
00073 static const UChar ASCII_NONBREAKING_SPACE;
00074 static const UChar ASCII_CENT_SIGN;
00075 static const UChar ASCII_POUND_SIGN;
00076 static const UChar ASCII_YEN_SIGN;
00077 static const UChar LATIN1_SOFTHYPHEN;
00078 static const UChar LATIN1_DEGREE_SIGN;
00079 static const UChar ARABIC_PERCENT_SIGN;
00080 static const UChar ARABIC_DECIMAL_SEPARATOR;
00081 static const UChar HANGUL_CHOSEONG_LOW;
00082 static const UChar HANGUL_CHOSEONG_HIGH;
00083 static const UChar HANGUL_JUNGSEONG_LOW;
00084 static const UChar HANGUL_JUNGSEONG_HIGH;
00085 static const UChar HANGUL_JONGSEONG_LOW;
00086 static const UChar HANGUL_JONGSEONG_HIGH;
00087 static const UChar FIGURE_SPACE;
00088 static const UChar NONBREAKING_HYPHEN;
00089 static const UChar PUNCTUATION_HYPHENATION_POINT;
00090 static const UChar PUNCTUATION_LINE_SEPARATOR;
00091 static const UChar PUNCTUATION_PARAGRAPH_SEPARATOR;
00092 static const UChar PER_MILLE_SIGN;
00093 static const UChar PER_TEN_THOUSAND_SIGN;
00094 static const UChar PRIME;
00095 static const UChar DOUBLE_PRIME;
00096 static const UChar TRIPLE_PRIME;
00097 static const UChar DEGREE_CELSIUS;
00098 static const UChar DEGREE_FAHRENHEIT;
00099 static const UChar PUNCTUATION_IDEOGRAPHIC_COMMA;
00100 static const UChar PUNCTUATION_IDEOGRAPHIC_FULL_STOP;
00101 static const UChar IDEOGRAPHIC_ITERATION_MARK;
00102 static const UChar HIRAGANA_LETTER_SMALL_A;
00103 static const UChar HIRAGANA_LETTER_A;
00104 static const UChar HIRAGANA_LETTER_SMALL_I;
00105 static const UChar HIRAGANA_LETTER_I;
00106 static const UChar HIRAGANA_LETTER_SMALL_U;
00107 static const UChar HIRAGANA_LETTER_U;
00108 static const UChar HIRAGANA_LETTER_SMALL_E;
00109 static const UChar HIRAGANA_LETTER_E;
00110 static const UChar HIRAGANA_LETTER_SMALL_O;
00111 static const UChar HIRAGANA_LETTER_O;
00112 static const UChar HIRAGANA_LETTER_DI;
00113 static const UChar HIRAGANA_LETTER_SMALL_TU;
00114 static const UChar HIRAGANA_LETTER_TU;
00115 static const UChar HIRAGANA_LETTER_MO;
00116 static const UChar HIRAGANA_LETTER_SMALL_YA;
00117 static const UChar HIRAGANA_LETTER_YA;
00118 static const UChar HIRAGANA_LETTER_SMALL_YU;
00119 static const UChar HIRAGANA_LETTER_YU;
00120 static const UChar HIRAGANA_LETTER_SMALL_YO;
00121 static const UChar HIRAGANA_LETTER_YO;
00122 static const UChar HIRAGANA_LETTER_RO;
00123 static const UChar HIRAGANA_LETTER_SMALL_WA;
00124 static const UChar HIRAGANA_LETTER_WA;
00125 static const UChar HIRAGANA_LETTER_VU;
00126 static const UChar COMBINING_KATAKANA_HIRAGANA_VOICED_SOUND_MARK;
00127 static const UChar HIRAGANA_SEMIVOICED_SOUND_MARK;
00128 static const UChar HIRAGANA_ITERATION_MARK;
00129 static const UChar HIRAGANA_VOICED_ITERATION_MARK;
00130 static const UChar KATAKANA_LETTER_SMALL_A;
00131 static const UChar KATAKANA_LETTER_A;
00132 static const UChar KATAKANA_LETTER_SMALL_I;
00133 static const UChar KATAKANA_LETTER_I;
00134 static const UChar KATAKANA_LETTER_SMALL_U;
00135 static const UChar KATAKANA_LETTER_U;
00136 static const UChar KATAKANA_LETTER_SMALL_E;
00137 static const UChar KATAKANA_LETTER_E;
00138 static const UChar KATAKANA_LETTER_SMALL_O;
00139 static const UChar KATAKANA_LETTER_O;
00140 static const UChar KATAKANA_LETTER_DI;
00141 static const UChar KATAKANA_LETTER_SMALL_TU;
00142 static const UChar KATAKANA_LETTER_TU;
00143 static const UChar KATAKANA_LETTER_MO;
00144 static const UChar KATAKANA_LETTER_SMALL_YA;
00145 static const UChar KATAKANA_LETTER_YA;
00146 static const UChar KATAKANA_LETTER_SMALL_YU;
00147 static const UChar KATAKANA_LETTER_YU;
00148 static const UChar KATAKANA_LETTER_SMALL_YO;
00149 static const UChar KATAKANA_LETTER_YO;
00150 static const UChar KATAKANA_LETTER_RO;
00151 static const UChar KATAKANA_LETTER_SMALL_WA;
00152 static const UChar KATAKANA_LETTER_WA;
00153 static const UChar KATAKANA_LETTER_VU;
00154 static const UChar KATAKANA_LETTER_SMALL_KA;
00155 static const UChar KATAKANA_LETTER_SMALL_KE;
00156 static const UChar KATAKANA_LETTER_VA;
00157 static const UChar KATAKANA_LETTER_VO;
00158 static const UChar KATAKANA_HIRAGANA_PROLONGED_SOUND_MARK;
00159 static const UChar KATAKANA_ITERATION_MARK;
00160 static const UChar KATAKANA_VOICED_ITERATION_MARK;
00161 static const UChar UNICODE_LOW_BOUND_HAN;
00162 static const UChar UNICODE_HIGH_BOUND_HAN;
00163 static const UChar HANGUL_SYL_LOW;
00164 static const UChar HANGUL_SYL_HIGH;
00165 static const UChar CJK_COMPATIBILITY_F900;
00166 static const UChar CJK_COMPATIBILITY_FA2D;
00167 static const UChar UNICODE_ZERO_WIDTH_NON_BREAKING_SPACE;
00168 static const UChar FULLWIDTH_EXCLAMATION_MARK;
00169 static const UChar FULLWIDTH_FULL_STOP;
00170 static const UChar FULLWIDTH_QUESTION_MARK;
00171 static const UChar END_OF_STRING;
00172
00173 private:
00174
00175 enum CharacterMapping
00176 {
00177
00178
00179
00180 kAccent_diacritic = 0,
00181 kBaseForm = 1,
00182 kBaseCR = 2,
00183 kBaseLF = 3,
00184 kChoseong = 4,
00185 kJungseong = 5,
00186 kJongseong = 6,
00187 kEOS = 7,
00188 kCharacterCol_count = 8
00189 };
00190
00191 static Node kCharacterForwardData[];
00192 static const int32_t kCharacterForwardData_length;
00193 static WordBreakTable* kCharacterForward;
00194 static Node kCharacterBackwardData[];
00195 static const int32_t kCharacterBackwardData_length;
00196 static WordBreakTable* kCharacterBackward;
00197 static Type kCharacterRawMapping[];
00198 static const int32_t kCharacterRawMapping_length;
00199 static SpecialMapping kCharacterExceptionChar[];
00200 static const int32_t kCharacterExceptionChar_length;
00201 static const UBool kCharacterExceptionFlags[];
00202 static UnicodeClassMapping* kCharacterMap;
00203 static Type kCharacterAsciiValues[];
00204
00205 private:
00206
00207 enum WordMapping
00208 {
00209
00210
00211
00212 kBreak = 0,
00213 kLetter = 1,
00214 kNumber = 2,
00215 kMidLetter = 3,
00216 kMidLetNum = 4,
00217 kPreNum = 5,
00218 kPostNum = 6,
00219 kMidNum = 7,
00220 kPreMidNum = 8,
00221 kBlank = 9,
00222 kLF = 10,
00223 kKata = 11,
00224 kHira = 12,
00225 kKanji = 13,
00226 kDiacrit = 14,
00227 kCR = 15,
00228 kNsm = 16,
00229 kwEOS = 17,
00230 kWordCol_count = 18
00231 };
00232
00233 static Node kWordForwardData[];
00234 static const int32_t kWordForwardData_length;
00235 static WordBreakTable* kWordForward;
00236 static Node kWordBackwardData[];
00237 static const int32_t kWordBackwardData_length;
00238 static WordBreakTable* kWordBackward;
00239 static Type kWordRawMapping[];
00240 static const int32_t kWordRawMapping_length;
00241 static SpecialMapping kWordExceptionChar[];
00242 static const int32_t kWordExceptionChar_length;
00243 static UnicodeClassMapping* kWordMap;
00244 static Type kWordAsciiValues[];
00245 static const UBool kWordExceptionFlags[];
00246
00247 private:
00248
00249 enum SentenceMapping
00250 {
00251
00252
00253
00254 kOther = 0,
00255 kSpace = 1,
00256 kTerminator = 2,
00257 kAmbiguousTerm = 3,
00258 kOpenBracket = 4,
00259 kCloseBracket = 5,
00260 kCJK = 6,
00261 kParagraphBreak = 7,
00262 kLowerCase = 8,
00263 kUpperCase = 9,
00264 ksNumber = 10,
00265 kQuote = 11,
00266
00267 ksNsm = 12,
00268 ksEOS = 13,
00269 kSentenceCol_count = 14
00270 };
00271
00272 static Node kSentenceForwardData[];
00273 static const int32_t kSentenceForwardData_length;
00274 static WordBreakTable* kSentenceForward;
00275 static Node kSentenceBackwardData[];
00276 static const int32_t kSentenceBackwardData_length;
00277 static WordBreakTable* kSentenceBackward;
00278 static Type kSentenceRawMapping[];
00279 static const int32_t kSentenceRawMapping_length;
00280 static SpecialMapping kSentenceExceptionChar[];
00281 static const int32_t kSentenceExceptionChar_length;
00282 static UnicodeClassMapping* kSentenceMap;
00283 static Type kSentenceAsciiValues[];
00284 static const UBool kSentenceExceptionFlags[];
00285
00286 private:
00287
00288 enum LineMapping
00289 {
00290
00291
00292
00293 kLineBreak,
00294
00295 kLineBlank,
00296
00297 kLineCR,
00298
00299 kLineNonBlank,
00300
00301 kLineOp,
00302
00303 kLineJwrd,
00304
00305 kLinePreJwrd,
00306
00307 kLinePostJwrd,
00308
00309 kLineDigit,
00310
00311 kLineNumPunct,
00312
00313 kLineCurrency,
00314
00315 kLineNsm,
00316
00317 kLineNbsp,
00318
00319 kLineEOS,
00320 kLineCol_count
00321 };
00322
00323 static Node kLineForwardData[];
00324 static const int32_t kLineForwardData_length;
00325 static WordBreakTable* kLineForward;
00326 static Node kLineBackwardData[];
00327 static const int32_t kLineBackwardData_length;
00328 static WordBreakTable* kLineBackward;
00329 static Type kLineRawMapping[];
00330 static const int32_t kLineRawMapping_length;
00331 static SpecialMapping kLineExceptionChar[];
00332 static const int32_t kLineExceptionChar_length;
00333 static const UBool kLineExceptionFlags[];
00334 static UnicodeClassMapping* kLineMap;
00335 static Type kLineAsciiValues[];
00336
00337 protected:
00342 TextBoundaryData(const TextBoundaryData&) {}
00343 TextBoundaryData& operator=(const TextBoundaryData&) { return *this; }
00344 TextBoundaryData() {}
00345 TextBoundaryData(const WordBreakTable* forward,
00346 const WordBreakTable* backward,
00347 const UnicodeClassMapping* map)
00348 : fForward(forward), fBackward(backward), fMap(map) {}
00349
00350 private:
00351 const WordBreakTable* fForward;
00352 const WordBreakTable* fBackward;
00353 const UnicodeClassMapping* fMap;
00354 };
00355
00356 inline const WordBreakTable* TextBoundaryData::forward() const
00357 {
00358 return fForward;
00359 }
00360
00361 inline const WordBreakTable* TextBoundaryData::backward() const
00362 {
00363 return fBackward;
00364 }
00365
00366 inline const UnicodeClassMapping* TextBoundaryData::map() const
00367 {
00368 return fMap;
00369 }
00370
00371
/*
 * Shorthand values for table entries.
 *
 *   kStop     the value 0
 *   kSI       the 0x80 flag value
 *   kSI_Stop  kSI combined with kStop (numerically equal to kSI)
 *   kSI_n     kSI plus n, for n = 1..14
 *
 * NOTE(review): how these values are interpreted (what "SI" stands for and
 * which tables use them) is not visible in this header -- confirm against
 * the table definitions.
 *
 * Every expansion is parenthesised so the macros are safe inside larger
 * expressions.
 */
#define kStop     (0)
#define kSI       (0x80)
#define kSI_Stop  (kSI + kStop)

#define kSI_1     (kSI + 1)
#define kSI_2     (kSI + 2)
#define kSI_3     (kSI + 3)
#define kSI_4     (kSI + 4)
#define kSI_5     (kSI + 5)
#define kSI_6     (kSI + 6)
#define kSI_7     (kSI + 7)
#define kSI_8     (kSI + 8)
#define kSI_9     (kSI + 9)
#define kSI_10    (kSI + 10)
#define kSI_11    (kSI + 11)
#define kSI_12    (kSI + 12)
#define kSI_13    (kSI + 13)
#define kSI_14    (kSI + 14)
00390
00391 #endif // TXTBDAT_H
00392