// Out-of-class definition of the WERD_CHOICE::kBadRating constant (100000.0).
// NOTE(review): presumably a sentinel for an unusably bad rating -- confirm
// against the places that read it.
29 const
float WERD_CHOICE::kBadRating = 100000.0;
// File-local display strings, one per permuter type. They are gathered into
// the kPermuterTypeNames table that follows, which is indexed by permuter_
// when a permuter name is returned.
31 static const
char kPermuterTypeNoPerm[] = "None";
32 static const
char kPermuterTypePuncPerm[] = "Punctuation";
33 static const
char kPermuterTypeTopPerm[] = "Top Choice";
34 static const
char kPermuterTypeLowerPerm[] = "Top Lower Case";
35 static const
char kPermuterTypeUpperPerm[] = "Top Upper Case";
36 static const
char kPermuterTypeNgramPerm[] = "Ngram";
37 static const
char kPermuterTypeNumberPerm[] = "Number";
38 static const
char kPermuterTypeUserPatPerm[] = "User Pattern";
39 static const
char kPermuterTypeSysDawgPerm[] = "System Dictionary";
40 static const
char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
41 static const
char kPermuterTypeUserDawgPerm[] = "User Dictionary";
42 static const
char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
43 static const
char kPermuterTypeCompoundPerm[] = "Compound";
// Lookup table of permuter display strings; it is indexed directly by
// permuter_ (see `return kPermuterTypeNames[permuter_]`), so the order of the
// entries must match the numeric values of the permuter type.
// NOTE(review): kPermuterTypeNoPerm and kPermuterTypeTopPerm are defined above
// but do not appear in the initializer as shown here, and no closing `};` is
// visible -- lines seem to have been dropped from this listing. Confirm
// against the full file before relying on index positions.
45 static const
char * const kPermuterTypeNames[] = {
47 kPermuterTypePuncPerm,
49 kPermuterTypeLowerPerm,
50 kPermuterTypeUpperPerm,
51 kPermuterTypeNgramPerm,
52 kPermuterTypeNumberPerm,
53 kPermuterTypeUserPatPerm,
54 kPermuterTypeSysDawgPerm,
55 kPermuterTypeDocDawgPerm,
56 kPermuterTypeUserDawgPerm,
57 kPermuterTypeFreqDawgPerm,
58 kPermuterTypeCompoundPerm
69 inT16 src_fontinfo_id,
70 inT16 src_fontinfo_id2,
76 unichar_id_ = src_unichar_id;
78 certainty_ = src_cert;
79 fontinfo_id_ = src_fontinfo_id;
80 fontinfo_id2_ = src_fontinfo_id2;
81 script_id_ = src_script_id;
82 language_model_state_ =
NULL;
100 language_model_state_ =
NULL;
101 min_xheight_ = other.min_xheight_;
102 max_xheight_ = other.max_xheight_;
103 adapted_ = other.adapted_;
114 : unicharset_(&unicharset){
116 const char *ptr = src_string;
117 const char *end = src_string + strlen(src_string);
118 int step = unicharset.
step(ptr);
119 for (; ptr < end && step > 0;
120 step = unicharset.
step(ptr), src_lengths += step, ptr += step);
121 if (step != 0 && ptr == end) {
141 const char *src_lengths,
144 uinT8 src_permuter) {
145 int src_string_len = strlen(src_string);
146 if (src_string_len == 0) {
149 this->
init(src_lengths ? strlen(src_lengths): src_string_len);
152 for (
int i = 0; i < length_; ++i) {
153 int unichar_length = src_lengths ? src_lengths[i] : 1;
155 unicharset_->
unichar_to_id(src_string+offset, unichar_length);
156 fragment_lengths_[i] = 1;
157 offset += unichar_length;
160 rating_ = src_rating;
161 certainty_ = src_certainty;
162 permuter_ = src_permuter;
169 delete[] unichar_ids_;
170 delete[] fragment_lengths_;
171 delete_blob_choices();
175 return kPermuterTypeNames[permuter_];
185 if (blob_choices_ != blob_choices) {
186 delete_blob_choices();
198 for (
int i = 0; i < length_; ++i) {
199 if (unichar_ids_[i] == unichar_id) {
215 for (
int i = start; i+num < length_; ++i) {
216 unichar_ids_[i] = unichar_ids_[i+num];
217 fragment_lengths_[i] = fragment_lengths_[i+num];
228 for (
int i = 0; i < length_/2; ++i) {
230 unichar_ids_[i] = unicharset_->
get_mirror(unichar_ids_[length_-1-i]);
231 unichar_ids_[length_-1-i] = unicharset_->
get_mirror(tmp_id);
233 if (length_ % 2 != 0) {
234 unichar_ids_[length_/2] = unicharset_->
get_mirror(unichar_ids_[length_/2]);
248 while (*start <
length() &&
262 if (end < start) { end = start; }
264 for (
int i = start; i < end; i++) {
266 unichar_ids_[i], fragment_lengths_[i], 0.0
f, 0.0
f);
278 for (i = 0; i < length_; ++i) {
295 STRING *word_lengths_str)
const {
297 if (word_lengths_str !=
NULL) *word_lengths_str =
"";
298 for (
int i = 0; i < length_; ++i) {
301 if (word_lengths_str !=
NULL) {
302 *word_lengths_str += strlen(ch);
315 float rating,
float certainty) {
316 if (length_ == reserved_) {
340 while (reserved_ < length_ + second.
length()) {
345 for (
int i = 0; i < second.
length(); ++i) {
346 unichar_ids_[length_ + i] = other_unichar_ids[i];
347 fragment_lengths_[length_ + i] = other_fragment_lengths[i];
349 length_ += second.
length();
350 rating_ += second.
rating();
353 if (permuter_ == NO_PERM) {
355 }
else if (second.
permuter() != NO_PERM &&
357 permuter_ = COMPOUND_PERM;
361 if (second.blob_choices_ !=
NULL) {
362 if (this->blob_choices_ ==
NULL)
363 this->blob_choices_ =
new BLOB_CHOICE_LIST_CLIST;
365 BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
366 BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
368 this_blob_choices_it.set_to_list(this->blob_choices_);
369 this_blob_choices_it.move_to_last();
371 second_blob_choices_it.set_to_list(second.blob_choices_);
373 for (second_blob_choices_it.mark_cycle_pt();
374 !second_blob_choices_it.cycled_list();
375 second_blob_choices_it.forward()) {
377 BLOB_CHOICE_LIST* blob_choices_copy =
new BLOB_CHOICE_LIST();
378 blob_choices_copy->deep_copy(second_blob_choices_it.data(),
381 this_blob_choices_it.add_after_then_move(blob_choices_copy);
395 while (reserved_ < source.
length()) {
399 unicharset_ = source.unicharset_;
402 for (
int i = 0; i < source.
length(); ++i) {
403 unichar_ids_[i] = other_unichar_ids[i];
404 fragment_lengths_[i] = other_fragment_lengths[i];
406 length_ = source.
length();
407 rating_ = source.
rating();
413 this->delete_blob_choices();
416 if (source.blob_choices_ !=
NULL) {
417 BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
418 BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
420 this->blob_choices_ =
new BLOB_CHOICE_LIST_CLIST();
422 this_blob_choices_it.set_to_list(this->blob_choices_);
423 source_blob_choices_it.set_to_list(source.blob_choices_);
425 for (source_blob_choices_it.mark_cycle_pt();
426 !source_blob_choices_it.cycled_list();
427 source_blob_choices_it.forward()) {
429 BLOB_CHOICE_LIST* blob_choices_copy =
new BLOB_CHOICE_LIST();
430 blob_choices_copy->deep_copy(source_blob_choices_it.data(),
433 this_blob_choices_it.add_after_then_move(blob_choices_copy);
// Releases the container pointed to by blob_choices_, if there is one:
// deep_clear() is called on it, the container itself is deleted, and the
// member is reset to NULL. When blob_choices_ is already NULL nothing happens.
444 void WERD_CHOICE::delete_blob_choices() {
445 if (blob_choices_ !=
NULL) {
446 blob_choices_->deep_clear();
447 delete blob_choices_;
// Reset the pointer so the NULL guard above turns a repeated call into a
// no-op instead of a second delete.
448 blob_choices_ =
NULL;
458 tprintf(
"%s WERD_CHOICE:\n", msg);
459 tprintf(
"length_ %d reserved_ %d permuter_ %d\n",
460 length_, reserved_, permuter_);
461 tprintf(
"rating_ %.4f certainty_ %.4f", rating_, certainty_);
462 if (fragment_mark_) {
463 tprintf(
" fragment_mark_ true");
466 if (unichar_string_.
length() > 0) {
467 tprintf(
"unichar_string_ %s unichar_lengths_ %s\n",
472 for (i = 0; i < length_; ++i) {
473 tprintf(
"%d ", unichar_ids_[i]);
475 tprintf(
"\nfragment_lengths_: ");
476 for (i = 0; i < length_; ++i) {
477 tprintf(
"%d ", fragment_lengths_[i]);
486 if (word2.
unicharset() != uchset)
return false;
491 if (w1end - w1start != w2end - w2start)
return false;
492 for (
int i = 0; i < w1end - w1start; i++) {
512 BLOB_CHOICE_LIST *ratings,
514 if (ratings->length() == 0) {
522 c_it.set_to_list(ratings);
// Walk the whole ratings list and print every entry, handing the address of
// current_unicharset to each entry's print(). A newline is emitted between
// consecutive entries but not after the last one.
// The print() argument previously read `¤t_unicharset`: the text
// `&current_unicharset` had been mis-decoded (`&curren` taken as the HTML
// entity for the currency sign), which is not valid C++.
523 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
524 c_it.data()->print(&current_unicharset);
525 if (!c_it.at_last())
tprintf(
"\n");
537 if (ratings->length() == 0) {
545 c_it.set_to_list(ratings);
546 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
547 c_it.data()->print(
NULL);
548 if (!c_it.at_last())
tprintf(
"\n");
565 BLOB_CHOICE_LIST *ratings,
568 const char* first_char =
NULL;
571 const char* sec_char =
NULL;
574 BLOB_CHOICE_IT c_it = ratings;
576 index = ratings->length();
578 first_char = current_unicharset.
id_to_unichar(c_it.data()->unichar_id());
579 first_rat = c_it.data()->rating();
580 first_cert = -c_it.data()->certainty();
583 c_it.data_relative(1)->unichar_id());
584 sec_rat = c_it.data_relative(1)->rating();
585 sec_cert = -c_it.data_relative(1)->certainty();
596 if (first_char !=
NULL && (*first_char ==
'\0' || *first_char ==
' '))
598 if (sec_char !=
NULL && (*sec_char ==
'\0' || *sec_char ==
' '))
602 first_char !=
NULL ? first_char :
"~",
603 first_rat, first_cert, sec_char !=
NULL ? sec_char :
"~",
614 if (*msg !=
'\0')
tprintf(
"%s\n", msg);
615 for (
int x = 0; x < char_choices.
length(); ++x) {
617 c_it.set_to_list(char_choices.
get(x));
619 current_unicharset.
debug_str( c_it.data()->unichar_id()).
string());
631 if (!word || !alternates)
return;
634 for (
int i = 0; i < alternates->
size(); i++) {
635 if (i > 0) alternates_str +=
"\", \"";
638 tprintf(
"Alternates for \"%s\": {\"%s\"}\n",