22 #pragma warning(disable:4244) // Conversion warnings
31 #include "allheaders.h"
// Clears the text of every word reachable from block_list: walks each block,
// each row of that block, and each word of that row, and sets the word's
// text to the empty string.
76 static void clear_any_old_text(BLOCK_LIST *block_list) {
77 BLOCK_IT block_it(block_list);
// Outer pass: one full cycle over the blocks in the list.
78 for (block_it.mark_cycle_pt();
79 !block_it.cycled_list(); block_it.forward()) {
80 ROW_IT row_it(block_it.data()->row_list());
// Middle pass: every row of the current block.
81 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82 WERD_IT word_it(row_it.data()->word_list());
// Inner pass: every word of the current row gets its text reset to "".
83 for (word_it.mark_cycle_pt();
84 !word_it.cycled_list(); word_it.forward()) {
85 word_it.data()->set_text(
"");
112 bool find_segmentation,
113 BLOCK_LIST *block_list) {
115 int box_failures = 0;
122 bool found_box =
true;
140 PAGE_RES* page_res = find_segmentation ?
142 clear_any_old_text(block_list);
144 for (
int i = 0; i < boxes.
size() - 1; i++) {
145 bool foundit =
false;
146 if (page_res !=
NULL) {
149 full_texts[i].
string());
152 boxes[i + 1], full_texts[i].
string());
161 "FAILURE! Couldn't find a matching blob");
165 if (page_res ==
NULL) {
173 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
174 if (box_failures > 0)
175 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
// Returns the median of the xheights STATS accumulator built while walking
// the blocks and rows of block_list.
// NOTE(review): the statement that feeds values into xheights is not visible
// in this view; presumably each row's x-height is added inside the inner
// loop -- confirm against the full file.
182 static double MedianXHeight(BLOCK_LIST *block_list) {
183 BLOCK_IT block_it(block_list);
// The STATS range runs from 0 to the bounding-box height of the block the
// iterator points at right after construction.
// NOTE(review): data() is read here before any visible emptiness check --
// confirm that callers never pass an empty block list.
184 STATS xheights(0, block_it.data()->bounding_box().height());
// One full cycle over the blocks, and for each block over its rows.
185 for (block_it.mark_cycle_pt();
186 !block_it.cycled_list(); block_it.forward()) {
187 ROW_IT row_it(block_it.data()->row_list());
188 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
192 return xheights.median();
198 BLOCK_LIST *block_list) {
199 double median_xheight = MedianXHeight(block_list);
202 BLOCK_IT b_it(block_list);
203 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
204 BLOCK* block = b_it.data();
206 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
207 ROW* row = r_it.data();
208 float diff = fabs(row->
x_height() - median_xheight);
209 if (diff > max_deviation) {
211 tprintf(
"row xheight=%g, but median xheight = %g\n",
217 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
218 WERD* word = w_it.data();
220 delete w_it.extract();
231 while ((word_res = pr_it.
word()) !=
NULL) {
233 pr_it.
row()->
row, word_res);
246 for (
int i = 0; i < char_choices.
size(); ++i) {
247 BLOB_CHOICE_IT it(char_choices[i]);
267 tprintf(
"Maximally chopping word at:");
271 BLOB_CHOICE_LIST *match_result;
274 float rating =
static_cast<float>(
MAX_INT8);
288 *char_choices += match_result;
292 int right_chop_index = 0;
303 MakeWordChoice(*char_choices, unicharset, word_res->
best_choice);
304 MakeWordChoice(*char_choices, unicharset, word_res->
raw_choice);
307 if (char_choices !=
NULL) {
// Combines the areas of box1 and box2 with overlap_area into a single value.
// The visible steps compute
//   ((area1 - overlap_area) / area1) * ((area2 - overlap_area) / area2),
// which is 0 when overlap_area equals the area of either box.
// NOTE(review): the definition of overlap_area and the return statement are
// not visible in this view; presumably overlap_area is the intersection area
// of the two boxes and miss_metric is returned -- confirm.
// NOTE(review): both divisions use a box area with no visible guard against
// a zero-area box -- confirm that inputs always have non-zero area.
324 static double BoxMissMetric(
const TBOX& box1,
const TBOX& box2) {
// Fraction of box1 that lies outside the overlap.
326 double miss_metric = box1.
area()- overlap_area;
327 miss_metric /= box1.
area();
// Scaled by the fraction of box2 that lies outside the overlap.
328 miss_metric *= box2.
area() - overlap_area;
329 miss_metric /= box2.
area();
// Fragment of the routine that identifies itself as ResegmentCharBox() in its
// own debug message; the return type, name and leading parameters are outside
// this view. Visible parameters are box and next_box (TBOX references) and
// correct_text, which in the visible lines is only printed.
342 const TBOX& box,
const TBOX& next_box,
343 const char* correct_text) {
345 tprintf(
"\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
// Walk the words supplied by page_res_it until word()/forward() yields NULL.
349 for (word_res = page_res_it.
word(); word_res !=
NULL;
350 word_res = page_res_it.
forward()) {
// Candidate start index i within the word (word_len is set outside this view).
358 for (
int i = 0; i < word_len; ++i) {
// Grow a run of blob_count consecutive positions starting at i, bounded by
// word_len.
361 for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
// Score the blob's box against the current box and against the next box.
367 double current_box_miss_metric = BoxMissMetric(blob_box, box);
368 double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
372 tprintf(
"Current miss metric = %g, next = %g\n",
373 current_box_miss_metric, next_box_miss_metric);
// NOTE(review): the statement guarded by this comparison (original line 376)
// is not visible; presumably it stops growing the run once the next box fits
// the blob better than the current one -- confirm.
375 if (current_box_miss_metric > next_box_miss_metric)
// Merge the blob's box into the accumulated char_box.
377 char_box += blob_box;
// At least one position was accepted for this start index.
379 if (blob_count > 0) {
381 tprintf(
"Index [%d, %d) seem good.\n", i, i + blob_count);
// NOTE(review): tail of a condition that starts outside this view. It tests
// whether box's x_gap to next_box, or prev_box's x_gap to box (when prev_box
// is non-NULL), is below -3; the reason for the -3 threshold is not
// documented in the visible code.
384 (box.
x_gap(next_box) < -3 ||
385 (prev_box !=
NULL && prev_box->
x_gap(box) < -3))) {
396 tprintf(
"%d Blobs match: blob box:", blob_count);
// Visits indices 1 .. blob_count-1; the loop body is not visible here.
405 for (
int j = 1; j < blob_count; ++j) {
// Fragment of the routine that identifies itself as ResegmentWordBox() in its
// own debug message; the return type, name and leading parameters are outside
// this view. The visible code walks blocks, rows, words and blobs, scores
// blobs against box and next_box, moves blobs into new_word, and returns
// whether new_word is non-NULL.
440 const TBOX& box,
const TBOX& next_box,
441 const char* correct_text) {
443 tprintf(
"\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
446 BLOCK_IT b_it(block_list);
// One full cycle over the blocks.
447 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
448 BLOCK* block = b_it.data();
// Every row (r_it is constructed outside this view).
452 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
453 ROW* row = r_it.data();
// Every word (w_it is constructed outside this view).
457 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
458 WERD* word = w_it.data();
// Every blob (blob_it is constructed outside this view; the rest of this
// loop header is also not visible).
468 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
470 C_BLOB* blob = blob_it.data();
// Score the blob's box against the current box and against the next box.
474 double current_box_miss_metric = BoxMissMetric(blob_box, box);
475 double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
479 tprintf(
"Current miss metric = %g, next = %g\n",
480 current_box_miss_metric, next_box_miss_metric);
// NOTE(review): the statement guarded by this comparison is not visible;
// presumably the blob is skipped when the next box fits it better -- confirm.
482 if (current_box_miss_metric > next_box_miss_metric)
// First accepted blob: new_word is still NULL. The lines that create it are
// not visible; the visible line appends it to the end of w_it's list.
492 if (new_word ==
NULL) {
496 w_it.add_to_end(new_word);
// Detach the blob from the list blob_it iterates and append it to new_word's
// blob list.
498 C_BLOB_IT new_blob_it(new_word->
cblob_list());
499 new_blob_it.add_to_end(blob_it.extract());
// True exactly when new_word is non-NULL at this point.
505 return new_word !=
NULL;
520 tprintf(
"APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
526 tprintf(
"APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
538 for (
int step = 0; *utf8 !=
'\0'; utf8 += step) {
539 const char* next_space = strchr(utf8,
' ');
540 if (next_space ==
NULL)
541 next_space = utf8 + strlen(utf8);
542 step = next_space - utf8;
544 if (class_id == INVALID_UNICHAR_ID) {
547 while (utf8[step] ==
' ')
567 for (
int i = 0; i < word_length; ++i) {
568 for (
int j = 1; j <=
kMaxGroupSize && i + j <= word_length; ++j) {
584 float best_rating = 0.0f;
586 &search_segmentation, &best_rating, &word_res->
best_state);
588 for (
int i = 0; i < word_length; ++i)
589 choices[i].delete_data_pointers();
612 for (
int i = 0; i < target_text.
size(); ++i) {
// Fragment of a recursive search. The call to SearchForText further down
// passes arguments matching the names used here, so this is presumably the
// body of SearchForText itself -- confirm; the function name and the other
// parameters are outside this view.
626 int choices_pos,
int choices_length,
// Try every length from 1 up to choices[choices_pos].size(); entry
// [length - 1] of choices[choices_pos] is the choice list examined for that
// length.
633 for (
int length = 1; length <= choices[choices_pos].
size(); ++length) {
635 float choice_rating = 0.0f;
637 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
// Scan the choices in that list.
638 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
639 choice_it.forward()) {
// Remember the rating of the choice being examined.
641 choice_rating = choice->
rating();
// Direct hit: class_id equals the target id at text_index. The guarded
// statement is not visible here.
643 if (class_id == target_text[text_index]) {
// Look up table[class_id], guarded by a bounds check and a NULL check, and
// iterate its AmbigSpec entries.
647 if (class_id < table.
size() && table[class_id] !=
NULL) {
648 AmbigSpec_IT spec_it(table[class_id]);
649 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
651 const AmbigSpec *ambig_spec = spec_it.data();
// First half of a condition: the spec's wrong_ngram has INVALID_UNICHAR_ID in
// slot 1. The rest of the condition is not visible.
653 if (ambig_spec->
wrong_ngram[1] == INVALID_UNICHAR_ID &&
// spec_it not having cycled means the spec loop was left early.
// NOTE(review): the guarded statement is not visible -- confirm its effect.
657 if (!spec_it.cycled_list())
// choice_it having cycled means the choice loop ran through the whole list.
// NOTE(review): presumably no choice matched, and this length is skipped --
// the guarded statement is not visible; confirm.
661 if (choice_it.cycled_list())
// Complete match: this length reaches choices_length and text_index is the
// last position of target_text.
664 if (choices_pos + length == choices_length &&
665 text_index + 1 == target_text.
size()) {
668 tprintf(
"Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
669 rating + choice_rating, *best_rating, segmentation->
size(),
670 best_segmentation->
size());
// Keep this segmentation when none is stored yet, or when its total rating is
// lower than the best recorded so far.
672 if (best_segmentation->
empty() || rating + choice_rating < *best_rating) {
673 *best_segmentation = *segmentation;
674 *best_rating = rating + choice_rating;
676 }
// Partial match: both choices and target text remain after this step.
else if (choices_pos + length < choices_length &&
677 text_index + 1 < target_text.
size()) {
679 tprintf(
"Match found for %d=%s:%s, at %d+%d, recursing...\n",
680 target_text[text_index],
682 choice_it.data()->unichar_id() == target_text[text_index]
684 choices_pos, length);
// Recurse with the choice position advanced by length, the text index
// advanced by one, and choice_rating added to the running rating.
686 SearchForText(choices, choices_pos + length, choices_length, target_text,
687 text_index + 1, rating + choice_rating, segmentation,
688 best_rating, best_segmentation);
690 tprintf(
"End recursion for %d=%s\n", target_text[text_index],
703 int ok_blob_count = 0;
704 int bad_blob_count = 0;
705 int ok_word_count = 0;
706 int unlabelled_words = 0;
722 if (ok_in_word > 0) {
723 ok_blob_count += ok_in_word;
725 MakeWordChoice(char_choices, unicharset, word_res->
best_choice);
729 tprintf(
"APPLY_BOXES: Unlabelled word at :");
745 tprintf(
" Found %d good blobs.\n", ok_blob_count);
746 if (bad_blob_count > 0) {
747 tprintf(
" Leaving %d unlabelled blobs in %d words.\n",
748 bad_blob_count, ok_word_count);
750 if (unlabelled_words > 0)
751 tprintf(
" %d remaining unlabelled words deleted.\n", unlabelled_words);
757 const char *box_ch,
const char *err_msg) {
758 tprintf(
"APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
759 boxfile_lineno, box_ch,
769 word_res->correct_text.size());
770 for (
int i = 0; i < word_res->correct_text.size(); ++i) {
774 word_res->correct_text[i].split(
' ', &tokens);
778 if (word_res->best_choice !=
NULL)
779 delete word_res->best_choice;
780 word_res->best_choice = choice;
794 tprintf(
"Generated training data for %d words\n", word_count);