52 #include "config_auto.h"
55 #define MIN_FONT_ROW_COUNT 8
56 #define MAX_XHEIGHT_DIFF 3
73 TBOX &selection_box) {
79 pseudo_block, pseudo_row);
102 block, row, word_res);
106 (
"\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
129 const TBOX& target_word_box,
130 const char* word_config,
132 if (word_config !=
NULL) {
134 if (backup_config_file_ ==
NULL) {
136 FILE* config_fp = fopen(backup_config_file_,
"wb");
144 if (backup_config_file_ !=
NULL) {
148 backup_config_file_ =
NULL;
151 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
180 const TBOX* target_word_box,
181 const char* word_config,
200 if (dopasses==0 || dopasses==1) {
214 for (
int i = 0; i < sub_langs_.
size(); ++i) {
220 if (monitor !=
NULL) {
222 while (page_res_it.
word() !=
NULL) {
240 most_recently_used_ =
this;
241 while (page_res_it.
word() !=
NULL) {
244 if (monitor !=
NULL) {
252 if (target_word_box &&
254 *target_word_box, word_config, 1)) {
308 if (dopasses == 1)
return true;
313 most_recently_used_ =
this;
317 if (monitor !=
NULL) {
328 if (target_word_box &&
330 *target_word_box, word_config, 2)) {
392 while (page_res_it.
word() !=
NULL) {
413 if (monitor !=
NULL) {
430 if (!word_it.
word())
break;
437 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
444 tprintf(
"Alt choices not set up for word choice: %s\n",
451 tprintf(
"Alt choices not set up for word choice: %s\n",
480 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n",
486 tprintf(
"Examining alt choices for \"%s %s\".\n",
497 float best_rating = 0.0;
521 if (overrides_word1.
size() == 1 ||
524 best_idx = overrides_word1.
size() - 1;
529 if (overrides_word1.
size() >= 1) {
532 *overrides_word1[best_idx]) &&
534 *overrides_word2[best_idx])) {
536 tprintf(
"Top choice \"%s %s\" verified (sans case) by bigram "
537 "model.\n", orig_w1_str.
string(), orig_w2_str.
string());
541 STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
542 STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
543 if (new_w1_str != orig_w1_str) {
545 *overrides_word1_state[best_idx]);
547 if (new_w2_str != orig_w2_str) {
549 *overrides_word2_state[best_idx]);
552 STRING choices_description;
553 int num_bigram_choices
554 = overrides_word1.
size() * overrides_word2.
size();
555 if (num_bigram_choices == 1) {
556 choices_description =
"This was the unique bigram choice.";
560 const int kMaxChoicesToPrint = 20;
561 for (
int i = 0; i < overrides_word1.
size() &&
562 i < kMaxChoicesToPrint; i++) {
563 if (i > 0) { bigrams_list +=
", "; }
567 if (i == kMaxChoicesToPrint) {
568 bigrams_list +=
" ...";
571 choices_description =
"There were many choices: {";
572 choices_description += bigrams_list;
573 choices_description +=
"}";
575 choices_description.
add_str_int(
"There were ", num_bigram_choices);
576 choices_description +=
" compatible bigrams.";
579 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
582 choices_description.
string());
590 const TBOX* target_word_box,
591 const char* word_config) {
600 if (monitor !=
NULL) {
613 if (target_word_box &&
615 *target_word_box, word_config, 4)) {
629 inT16 all_char_quality;
630 inT16 accepted_all_char_quality;
632 &all_char_quality, &accepted_all_char_quality);
635 if ((permuter_type == SYSTEM_DAWG_PERM) ||
636 (permuter_type == FREQ_DAWG_PERM) ||
637 (permuter_type == USER_DAWG_PERM)) {
643 (blob_quality == 0) && (outline_errs >= chars_in_word))
651 (
"QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
652 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
666 BOOL8 good_quality_doc =
705 STRING debug =
"Choice is incorrect after recognition";
722 static_cast<IncorrectResultReason>(bl)),
759 tprintf(
"Retrying word using lang %s, oem %d\n",
769 (this->*recognizer)(block, row, &lang_word);
770 bool new_is_better = NewWordBetter(*word, lang_word);
773 tprintf(
"New result %s better:%s\n",
774 new_is_better ?
"IS" :
"NOT");
776 tprintf(
"New result %s better:%s, r=%g, c=%g\n",
777 new_is_better ?
"IS" :
"NOT",
786 return new_is_better;
800 tprintf(
"Processing word with lang %s at:",
804 const char* result_type =
"Initial";
806 if (initially_done) {
811 result_type =
"Already done";
813 (most_recently_used_->*recognizer)(block, row, word);
815 result_type =
"Accepted";
817 tprintf(
"%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
826 Tesseract* previous_used = most_recently_used_;
827 if (most_recently_used_ !=
this) {
832 most_recently_used_ =
this;
838 for (
int i = 0; i < sub_langs_.
size(); ++i) {
839 if (sub_langs_[i] != previous_used) {
841 tprintf(
"Retrying with sub-Tesseract[%d] lang: %s\n",
845 most_recently_used_ = sub_langs_[i];
867 BLOB_CHOICE_LIST_CLIST *blob_choices =
new BLOB_CHOICE_LIST_CLIST();
917 if (adapt_ok || word->
reject_map[index].accepted())
957 tprintf(
"New XHT Match:%s = %s ",
968 new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
969 accept_new_word ?
"ACCEPTED" :
"");
977 bool accept_new_x_ht =
false;
979 if (original_misfits == 0)
982 if (new_x_ht > 0.0
f) {
994 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n",
996 new_misfits, new_x_ht);
997 tprintf(
"Old rating= %f, certainty=%f, new=%f, %f\n",
1003 accept_new_x_ht = new_misfits < original_misfits &&
1012 if (accept_new_x_ht) {
1032 bool done_this_pass =
false;
1040 done_this_pass =
TRUE;
1045 bool accept_new_xht =
false;
1051 done_this_pass =
true;
1055 double small_cap_delta = (block->
x_height() - small_cap_xheight) / 2.0;
1057 small_cap_xheight - small_cap_delta <= word->
x_height &&
1058 word->
x_height <= small_cap_xheight + small_cap_delta) {
1068 if (num_upper > 0 && num_lower == 0)
1075 #ifndef GRAPHICS_DISABLED
1101 BLOB_CHOICE_LIST_CLIST *blob_choices =
new BLOB_CHOICE_LIST_CLIST();
1117 tprintf(
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1118 " #Blobs=%d; #Choices=%d\n",
1141 BLOB_CHOICE_LIST* bc_list) {
1143 BLOB_CHOICE_IT choice_it(bc_list);
1144 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1145 choice_it.forward()) {
1161 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1162 BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
1163 if (choice !=
NULL) {
1165 best_choice = choice;
1174 static void CorrectRepcharChoices(
BLOB_CHOICE* blob_choice,
1178 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
1181 if (choice ==
NULL) {
1182 BLOB_CHOICE_IT choice_it(bc_it.data());
1183 choice_it.add_before_stay_put(
new BLOB_CHOICE(*blob_choice));
1187 for (
int i = 0; i < word->
length(); ++i) {
1207 for (
int i = 0; i < word.
length(); ++i) {
1214 int max_count = rep_ch.MaxCount(&maxch_id);
1216 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1217 if (best_choice ==
NULL) {
1218 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1229 C_BLOB* prev_blob = blob_it.data();
1230 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1231 C_BLOB* blob = blob_it.data();
1233 gap -= prev_blob->bounding_box().right();
1243 CorrectRepcharChoices(best_choice, word_res);
1260 for (; !blob_it.empty(); blob_it.forward()) {
1261 bool first_blob = blob_it.at_first();
1262 bool last_blob = blob_it.at_last();
1285 const UNICHARSET& char_set,
const char *s,
const char *lengths) {
1288 int leading_punct_count;
1289 int upper_count = 0;
1290 int hyphen_pos = -1;
1293 if (strlen (lengths) > 20)
1299 offset += lengths[i++];
1300 leading_punct_count = i;
1303 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[i])) {
1304 offset += lengths[i++];
1307 if (upper_count > 1) {
1311 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[i])) {
1312 offset += lengths[i++];
1320 if (lengths[i] == 1 && s[offset] ==
'-') {
1322 offset += lengths[i++];
1323 if (s[offset] !=
'\0') {
1324 while ((s[offset] !=
'\0') &&
1326 offset += lengths[i++];
1328 if (i < hyphen_pos + 3)
1333 if (lengths[i] == 1 && (s[offset] ==
'\'') &&
1334 lengths[i + 1] == 1 && (s[offset + lengths[i]] ==
's')) {
1335 offset += lengths[i++];
1336 offset += lengths[i++];
1339 if (upper_count > 0)
1346 if (lengths[i] == 1 && s[offset] !=
'\0' &&
1348 offset += lengths[i++];
1349 if (lengths[i] == 1 && s[offset] !=
'\0' && i > 0 &&
1350 s[offset - lengths[i - 1]] != s[offset] &&
1352 offset += lengths[i++];
1354 if (s[offset] !=
'\0')
1363 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1365 while (s[offset] !=
'\0' &&
1367 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1368 offset += lengths[i++];
1369 offset += lengths[i++];
1372 else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1374 while (s[offset] !=
'\0' &&
1376 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1377 offset += lengths[i++];
1378 offset += lengths[i++];
1381 if (s[offset] !=
'\0')
1392 #ifndef SECURE_NAMES
1407 tprintf (
"classify_word_pass1 start\n");
1411 tprintf (
"make_reject_map: initial map");
1414 tprintf (
"make_reject_map: after NN");
1417 tprintf (
"classify_word_pass2 - START");
1420 tprintf (
"classify_word_pass2 - Pre Xht");
1423 tprintf (
"classify_word_pass2 - END");
1424 show_map_detail =
TRUE;
1436 tprintf (
"After Poor quality rejection");
1439 tprintf (
"unrej_good_quality_words - START");
1442 tprintf (
"unrej_good_quality_words - END");
1445 tprintf (
"Write results pass");
1446 show_map_detail =
TRUE;
1453 if (show_map_detail) {
1461 tprintf (
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
1474 static void find_modal_font(
1487 fonts->
add (font, -*font_count);
1501 BLOB_CHOICE_LIST_CLIST *blob_choices) {
1502 if (blob_choices ==
NULL)
return;
1509 BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
1510 BLOB_CHOICE_IT choice_it;
1513 if (fontinfo_size == 0 || fontset_size == 0)
return;
1514 STATS fonts(0, fontinfo_size);
1522 for (char_it.mark_cycle_pt(), index = 0;
1523 !char_it.cycled_list(); ++index, char_it.forward()) {
1525 choice_it.set_to_list(char_it.data());
1527 tprintf(
"Examining fonts in %s\n",
1530 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
1531 choice_it.forward()) {
1532 UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id();
1533 if (blob_ch_id == word_ch_id) {
1535 tprintf(
"%s font %s (%d) font2 %s (%d)\n",
1537 choice_it.data()->fontinfo_id() < 0 ?
"unknown" :
1539 choice_it.data()->fontinfo_id(),
1540 choice_it.data()->fontinfo_id2() < 0 ?
"unknown" :
1542 choice_it.data()->fontinfo_id2());
1545 if (choice_it.data()->fontinfo_id() >= 0) {
1546 fonts.
add(choice_it.data()->fontinfo_id(), 2);
1548 if (choice_it.data()->fontinfo_id2() >= 0) {
1549 fonts.
add(choice_it.data()->fontinfo_id2(), 1);
1555 inT16 font_id1, font_id2;
1568 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1573 tprintf(
"Word modal font=%s, score=%d. No 2nd choice\n",
1593 STATS doc_fonts(0, font_table_size_);
1598 word = page_res_it.
word();
1607 inT8 doc_font_count;
1608 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
1609 if (doc_font_count == 0)
1615 word = page_res_it.
word();
1630 word = page_res_it.
word();
1635 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {