Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
chopper.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: chopper.c (Formerly chopper.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  **************************************************************************/
25 
26 /*----------------------------------------------------------------------
27  I n c l u d e s
28 ----------------------------------------------------------------------*/
29 
30 #include <math.h>
31 
32 #include "chopper.h"
33 
34 #include "assert.h"
35 #include "associate.h"
36 #include "callcpp.h"
37 #include "const.h"
38 #include "findseam.h"
39 #include "freelist.h"
40 #include "globals.h"
41 #include "makechop.h"
42 #include "render.h"
43 #include "pageres.h"
44 #include "permute.h"
45 #include "seam.h"
46 #include "stopper.h"
47 #include "structures.h"
48 #include "unicharset.h"
49 #include "wordclass.h"
50 #include "wordrec.h"
51 
52 // Include automatically generated configuration file if running autoconf.
53 #ifdef HAVE_CONFIG_H
54 #include "config_auto.h"
55 #endif
56 
57 /*----------------------------------------------------------------------
58  F u n c t i o n s
59 ----------------------------------------------------------------------*/
65 void preserve_outline(EDGEPT *start) {
66  EDGEPT *srcpt;
67 
68  if (start == NULL)
69  return;
70  srcpt = start;
71  do {
72  srcpt->flags[1] = 1;
73  srcpt = srcpt->next;
74  }
75  while (srcpt != start);
76  srcpt->flags[1] = 2;
77 }
78 
79 
80 /**************************************************************************/
82  TESSLINE *outline;
83 
84  for (outline = srcline; outline != NULL; outline = outline->next) {
85  preserve_outline (outline->loop);
86  }
87 }
88 
89 
96  EDGEPT *srcpt;
97  EDGEPT *real_start;
98  EDGEPT *deadpt;
99 
100  if (start == NULL)
101  return NULL;
102  srcpt = start;
103  do {
104  if (srcpt->flags[1] == 2)
105  break;
106  srcpt = srcpt->next;
107  }
108  while (srcpt != start);
109  real_start = srcpt;
110  do {
111  if (srcpt->flags[1] == 0) {
112  deadpt = srcpt;
113  srcpt = srcpt->next;
114  srcpt->prev = deadpt->prev;
115  deadpt->prev->next = srcpt;
116  deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
117  deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
118  delete deadpt;
119  }
120  else
121  srcpt = srcpt->next;
122  }
123  while (srcpt != real_start);
124  return real_start;
125 }
126 
127 
128 /******************************************************************************/
130  TESSLINE *outline;
131 
132  for (outline = srcline; outline != NULL; outline = outline->next) {
133  outline->loop = restore_outline (outline->loop);
134  outline->start = outline->loop->pos;
135  }
136 }
137 
138 
145 namespace tesseract {
// Tries to split `blob` into two pieces along a seam.
//
// A fresh TBLOB (`other_blob`) is linked into the blob list directly after
// `blob` before a seam is looked for. Seam candidates are tried in this order:
//   1. divisible_blob(), only when `prioritize_division` is set;
//   2. pick_good_seam();
//   3. divisible_blob() again, only when `word->latin_script` is set.
// A seam that was found is applied with apply_seam(), and the result is then
// run through the rejection checks below.
//
// Returns the accepted seam, or NULL when no seam was found or the chop was
// rejected. On rejection `blob->next` is restored to its original value, and
// either the seam is undone and deleted or the unused `other_blob` is deleted.
//
// NOTE(review): the listing's embedded line numbers jump 150 -> 154 and
// 209 -> 213, so statements may be missing from this view — confirm against
// the original file before relying on this description.
146 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
147  bool italic_blob, SEAMS seam_list) {
148  TBLOB *next_blob = blob->next;  // remembered so the link can be rolled back
149  TBLOB *other_blob;
150  SEAM *seam;
151 
154  other_blob = new TBLOB; /* Make new blob */
155  other_blob->next = blob->next;
156  other_blob->outlines = NULL;
157  blob->next = other_blob;
158 
159  seam = NULL;
160  if (prioritize_division) {
161  TPOINT location;
162  if (divisible_blob(blob, italic_blob, &location)) {
163  seam = new_seam(0.0f, location, NULL, NULL, NULL);
164  }
165  }
166  if (seam == NULL)
167  seam = pick_good_seam(blob);
168  if (seam == NULL && word->latin_script) {
169  // If the blob can simply be divided into outlines, then do that.
170  TPOINT location;
171  if (divisible_blob(blob, italic_blob, &location)) {
172  seam = new_seam(0.0f, location, NULL, NULL, NULL);
173  }
174  }
175  if (chop_debug) {
176  if (seam != NULL) {
177  print_seam ("Good seam picked=", seam);
178  }
179  else
180  cprintf ("\n** no seam picked *** \n");
181  }
182  if (seam) {
183  apply_seam(blob, other_blob, italic_blob, seam);
184  }
185 
      // Reject the chop if any check fails. The checks short-circuit left to
      // right, so `seam` is known to be non-NULL by the time it is passed to
      // the seam-based checks further down the chain.
186  if ((seam == NULL) ||
187  (blob->outlines == NULL) ||
188  (other_blob->outlines == NULL) ||
189  total_containment (blob, other_blob) ||
190  check_blob (other_blob) ||
191  !(check_seam_order (blob, seam) &&
192  check_seam_order (other_blob, seam)) ||
193  any_shared_split_points (seam_list, seam) ||
194  !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
195 
196  blob->next = next_blob;  // unlink other_blob from the list again
197  if (seam) {
198  undo_seam(blob, other_blob, seam);
199  delete_seam(seam);
200 #ifndef GRAPHICS_DISABLED
201  if (chop_debug) {
202  if (chop_debug >2)
203  display_blob(blob, Red);
204  cprintf ("\n** seam being removed ** \n");
205  }
206 #endif
207  } else {
      // No seam was ever applied, so other_blob is deleted directly here.
208  delete other_blob;
209  }
210 
213  return (NULL);
214  }
215  return (seam);
216 }
217 
218 
220  bool italic_blob, SEAMS seam_list) {
221  TBLOB *blob;
222  inT16 x;
223 
224  blob = word->blobs;
225  for (x = 0; x < blob_number; x++)
226  blob = blob->next;
227 
228  return attempt_blob_chop(word, blob, blob_number,
229  italic_blob, seam_list);
230 }
231 
232 
234  WERD_RES *word_res, inT32 *blob_number,
235  bool italic_blob, SEAMS seam_list) {
236  TWERD *word = word_res->chopped_word;
237  TBLOB *blob;
238 
239  *blob_number = 0;
240  blob = word->blobs;
241  while (blob != NULL) {
242  TPOINT topleft, botright;
243  topleft.x = blob->bounding_box().left();
244  topleft.y = blob->bounding_box().top();
245  botright.x = blob->bounding_box().right();
246  botright.y = blob->bounding_box().bottom();
247 
248  TPOINT original_topleft, original_botright;
249  word_res->denorm.DenormTransform(topleft, &original_topleft);
250  word_res->denorm.DenormTransform(botright, &original_botright);
251 
252  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
253  original_botright.x, original_topleft.y);
254 
255  bool almost_equal_box = false;
256  int num_overlap = 0;
257  for (int i = 0; i < boxes.size(); i++) {
258  if (original_box.overlap_fraction(boxes[i]) > 0.125)
259  num_overlap++;
260  if (original_box.almost_equal(boxes[i], 3))
261  almost_equal_box = true;
262  }
263 
264  TPOINT location;
265  if (divisible_blob(blob, italic_blob, &location) ||
266  (!almost_equal_box && num_overlap > 1)) {
267  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
268  italic_blob, seam_list);
269  if (seam != NULL)
270  return seam;
271  }
272 
273  *blob_number = *blob_number + 1;
274  blob = blob->next;
275  }
276 
277  *blob_number = -1;
278  return NULL;
279 }
280 
281 } // namespace tesseract
282 
283 
289 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
290  int length;
291  int index;
292 
293  length = array_count (seam_list);
294  for (index = 0; index < length; index++)
295  if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
296  return TRUE;
297  return FALSE;
298 }
299 
300 
306 int check_blob(TBLOB *blob) {
307  TESSLINE *outline;
308  EDGEPT *edgept;
309 
310  for (outline = blob->outlines; outline != NULL; outline = outline->next) {
311  edgept = outline->loop;
312  do {
313  if (edgept == NULL)
314  break;
315  edgept = edgept->next;
316  }
317  while (edgept != outline->loop);
318  if (edgept == NULL)
319  return 1;
320  }
321  return 0;
322 }
323 
324 
325 namespace tesseract {
333  BLOB_CHOICE_LIST_VECTOR *char_choices,
334  inT32 *blob_number,
335  SEAMS *seam_list,
336  DANGERR *fixpt,
337  bool split_next_to_fragment,
338  BlamerBundle *blamer_bundle) {
339  TWERD* word = word_res->chopped_word;
340  TBLOB *blob;
341  inT16 x = 0;
342  float rating_ceiling = MAX_FLOAT32;
343  BLOB_CHOICE_LIST *answer;
344  BLOB_CHOICE_IT answer_it;
345  SEAM *seam;
346 
347  do {
348  *blob_number = select_blob_to_split_from_fixpt(fixpt);
349  bool split_point_from_dict = (*blob_number != -1);
350  if (split_point_from_dict) {
351  fixpt->clear();
352  } else {
353  *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
354  split_next_to_fragment);
355  }
356  if (chop_debug)
357  cprintf("blob_number = %d\n", *blob_number);
358  if (*blob_number == -1)
359  return false;
360 
361  // TODO(rays) it may eventually help to allow italic_blob to be true,
362  seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
363  if (seam != NULL)
364  break;
365  /* Must split null blobs */
366  answer = char_choices->get(*blob_number);
367  if (answer == NULL)
368  return false;
369  answer_it.set_to_list(answer);
370  if (!split_point_from_dict) {
371  // We chopped the worst rated blob, try something else next time.
372  rating_ceiling = answer_it.data()->rating();
373  }
374  } while (true);
375  /* Split OK */
376  for (blob = word->blobs; x < *blob_number; x++) {
377  blob = blob->next;
378  }
379 
380  *seam_list =
381  insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
382 
383  delete char_choices->get(*blob_number);
384 
385  answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
386  blamer_bundle);
387  char_choices->insert(answer, *blob_number);
388 
389  answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
390  blamer_bundle);
391  char_choices->set(answer, *blob_number + 1);
392 
393  return true;
394 }
395 
403 void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer,
404  int chop_index) {
405  char chop_index_string[2];
406  if (chop_index <= 9) {
407  snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
408  } else {
409  chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
410  chop_index_string[1] = '\0';
411  }
412  UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
413  if (unichar_id == INVALID_UNICHAR_ID) {
414  // If the word is very long, we might exhaust the possibilities.
415  unichar_id = 1;
416  }
417  BLOB_CHOICE_IT answer_it(answer);
418  BLOB_CHOICE *modified_blob =
419  new BLOB_CHOICE(unichar_id,
420  answer_it.data()->rating(),
421  answer_it.data()->certainty(),
422  answer_it.data()->fontinfo_id(),
423  answer_it.data()->fontinfo_id2(),
424  answer_it.data()->script_id(),
425  answer_it.data()->min_xheight(),
426  answer_it.data()->max_xheight(),
427  answer_it.data()->adapted());
428  answer->clear();
429  answer_it.set_to_list(answer);
430  answer_it.add_after_then_move(modified_blob);
431 }
432 
433 
442  BLOB_CHOICE_LIST_VECTOR *char_choices,
443  inT32 *blob_number,
444  SEAMS *seam_list,
445  int *right_chop_index) {
446  TBLOB *blob;
447  inT16 x = 0;
448  float rating_ceiling = MAX_FLOAT32;
449  BLOB_CHOICE_LIST *answer;
450  BLOB_CHOICE_IT answer_it;
451  SEAM *seam;
452  UNICHAR_ID unichar_id = 0;
453  int left_chop_index = 0;
454 
455  do {
456  *blob_number = select_blob_to_split(*char_choices, rating_ceiling, false);
457  if (chop_debug)
458  cprintf("blob_number = %d\n", *blob_number);
459  if (*blob_number == -1)
460  return false;
461  seam = chop_numbered_blob(word, *blob_number, true, *seam_list);
462  if (seam != NULL)
463  break;
464  /* Must split null blobs */
465  answer = char_choices->get(*blob_number);
466  if (answer == NULL)
467  return false;
468  answer_it.set_to_list(answer);
469  rating_ceiling = answer_it.data()->rating(); // try a different blob
470  } while (true);
471  /* Split OK */
472  for (blob = word->blobs; x < *blob_number; x++) {
473  blob = blob->next;
474  }
475  if (chop_debug) {
476  tprintf("Chop made blob1:");
477  blob->bounding_box().print();
478  tprintf("and blob2:");
479  blob->next->bounding_box().print();
480  }
481  *seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
482 
483  answer = char_choices->get(*blob_number);
484  answer_it.set_to_list(answer);
485  unichar_id = answer_it.data()->unichar_id();
486  float rating = answer_it.data()->rating() / exp(1.0);
487  left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
488 
489  delete char_choices->get(*blob_number);
490  // combine confidence w/ serial #
491  answer = fake_classify_blob(0, rating, -rating);
492  modify_blob_choice(answer, left_chop_index);
493  char_choices->insert(answer, *blob_number);
494 
495  answer = fake_classify_blob(0, rating - 0.125f, -rating);
496  modify_blob_choice(answer, ++*right_chop_index);
497  char_choices->set(answer, *blob_number + 1);
498  return true;
499 }
500 
501 
503  WERD_RES *word_res,
504  SEAMS *seam_list) {
505  inT32 blob_number;
506  inT16 x = 0;
507  TBLOB *blob;
508  SEAM *seam;
509 
510  seam = chop_overlapping_blob(boxes, word_res, &blob_number,
511  true, *seam_list);
512  if (seam == NULL)
513  return false;
514 
515  /* Split OK */
516  for (blob = word_res->chopped_word->blobs; x < blob_number; x++) {
517  blob = blob->next;
518  }
519  if (chop_debug) {
520  tprintf("Chop made blob1:");
521  blob->bounding_box().print();
522  tprintf("and blob2:");
523  blob->next->bounding_box().print();
524  }
525  *seam_list = insert_seam(*seam_list, blob_number, seam, blob,
526  word_res->chopped_word->blobs);
527  return true;
528 }
529 } // namespace tesseract
530 
// NOTE(review): the signature line of this function is missing from the
// listing. The body reads parameters named `blob` and `seam` and is presumably
// check_seam_order() — confirm against the original file.
//
// Visible behaviour: returns TRUE immediately when seam->split1 is NULL or
// blob is NULL. Otherwise every outline of the blob is scanned, and each of
// the three splits counts as "found" when it is NULL or when
// is_split_outline() accepts it for some outline. The result is TRUE only if
// all three splits were found.
540  TESSLINE *outline;
     // NOTE(review): last_outline is assigned in the loop below but never read
     // in the visible code.
541  TESSLINE *last_outline;
542  inT8 found_em[3];
543 
     // NOTE(review): seam->split1 is tested twice in this condition; the second
     // test is redundant as written and may have been meant for another member
     // — confirm the intent.
544  if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
545  return (TRUE);
546 
547  found_em[0] = found_em[1] = found_em[2] = FALSE;
548 
549  for (outline = blob->outlines; outline; outline = outline->next) {
550  if (!found_em[0] &&
551  ((seam->split1 == NULL) ||
552  is_split_outline (outline, seam->split1))) {
553  found_em[0] = TRUE;
554  }
555  if (!found_em[1] &&
556  ((seam->split2 == NULL) ||
557  is_split_outline (outline, seam->split2))) {
558  found_em[1] = TRUE;
559  }
560  if (!found_em[2] &&
561  ((seam->split3 == NULL) ||
562  is_split_outline (outline, seam->split3))) {
563  found_em[2] = TRUE;
564  }
565  last_outline = outline;
566  }
567 
     // All three splits must have been accounted for.
568  if (!found_em[0] || !found_em[1] || !found_em[2])
569  return (FALSE);
570  else
571  return (TRUE);
572 }
573 
574 namespace tesseract {
584  TBLOB *blob;
585  int index;
586  int did_chopping;
587  STATE state;
588  BLOB_CHOICE_LIST *match_result;
589  MATRIX *ratings = NULL;
590  DANGERR fixpt; /*dangerous ambig */
591  inT32 bit_count; //no of bits
592 
593  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
594  BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();
595 
596  did_chopping = 0;
597  for (blob = word->chopped_word->blobs, index = 0;
598  blob != NULL; blob = blob->next, index++) {
599  match_result = classify_blob(blob, word->denorm, "chop_word:", Green,
600  word->blamer_bundle);
601  if (match_result == NULL)
602  cprintf("Null classifier output!\n");
603  *char_choices += match_result;
604  }
605  bit_count = index - 1;
606  set_n_ones(&state, char_choices->length() - 1);
607  bool acceptable = false;
608  bool replaced = false;
609  bool best_choice_updated =
610  getDict().permute_characters(*char_choices, word->best_choice,
611  word->raw_choice);
612  if (best_choice_updated &&
613  getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
614  CHOPPER_CALLER, &replaced)) {
615  acceptable = true;
616  }
617  if (replaced)
618  update_blob_classifications(word->chopped_word, *char_choices);
619  CopyCharChoices(*char_choices, best_char_choices);
620  if (!acceptable) { // do more work to find a better choice
621  did_chopping = 1;
622 
623  bool best_choice_acceptable = false;
624  if (chop_enable)
625  improve_by_chopping(word,
626  char_choices,
627  &state,
628  best_char_choices,
629  &fixpt,
630  &best_choice_acceptable);
631  if (chop_debug)
632  print_seams ("Final seam list:", word->seam_array);
633 
634  if (word->blamer_bundle != NULL &&
635  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
636  word->blamer_bundle->truth_text)) {
637  set_chopper_blame(word);
638  }
639 
640  // The force_word_assoc is almost redundant to enable_assoc. However,
641  // it is not conditioned on the dict behavior. For CJK, we need to force
642  // the associator to be invoked. When we figure out the exact behavior
643  // of dict on CJK, we can remove the flag if it turns out to be redundant.
644  if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
645  ratings = word_associator(false, word, &state, best_char_choices,
646  &fixpt, &state);
647  }
648  }
649  best_char_choices = rebuild_current_state(word, &state, best_char_choices,
650  ratings);
651 
652  // If after running only the chopper best_choice is incorrect and no blame
653  // has been yet set, blame the classifier if best_choice is classifier's
654  // top choice and is a dictionary word (i.e. language model could not have
655  // helped). Otherwise blame the tradeoff between the classifier and
656  // the old language model (permuters).
657  if (word->blamer_bundle != NULL &&
659  ratings == NULL && // only the chopper was run
660  !ChoiceIsCorrect(*word->uch_set, word->best_choice,
661  word->blamer_bundle->truth_text)) {
662  if (word->best_choice != NULL &&
664  // Find out whether best choice is a top choice.
666  for (int i = 0; i < word->best_choice->length(); ++i) {
667  BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i));
668  ASSERT_HOST(!blob_choice_it.empty());
669  BLOB_CHOICE *first_choice = NULL;
670  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
671  blob_choice_it.forward()) { // find first non-fragment choice
672  if (!(getDict().getUnicharset().get_fragment(
673  blob_choice_it.data()->unichar_id()))) {
674  first_choice = blob_choice_it.data();
675  break;
676  }
677  }
678  ASSERT_HOST(first_choice != NULL);
679  if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
681  break;
682  }
683  }
684  }
685  STRING debug;
687  debug = "Best choice is: incorrect, top choice, dictionary word";
688  debug += " with permuter ";
689  debug += word->best_choice->permuter_name();
690  } else {
691  debug = "Classifier/Old LM tradeoff is to blame";
692  }
693  word->blamer_bundle->SetBlame(
696  debug, word->best_choice, wordrec_debug_blamer);
697  }
698 
699  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
700  if (ratings == NULL) {
701  ratings = word_associator(true, word, NULL, NULL, NULL, NULL);
702  }
703  CallFillLattice(*ratings, getDict().getBestChoices(),
704  *word->uch_set, word->blamer_bundle);
705  }
706  if (ratings != NULL) {
707  if (wordrec_debug_level > 0) {
708  tprintf("Final Ratings Matrix:\n");
709  ratings->print(getDict().getUnicharset());
710  }
711  ratings->delete_matrix_pointers();
712  delete ratings;
713  }
715  // TODO(antonova, eger): check that FilterWordChoices() does not filter
716  // out anything useful for word bigram or phrase search.
717  // TODO(antonova, eger): when implementing word bigram and phrase search
718  // we will need to think carefully about how to replace a word with its
719  // alternative choice.
720  // In particular it might be required to save the segmentation state
721  // associated with the word, so that best_char_choices could be updated
722  // by rebuild_current_state() correctly.
723  if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word);
724  char_choices->delete_data_pointers();
725  delete char_choices;
726 
727  return best_char_choices;
728 }
729 
730 
731 
742  BLOB_CHOICE_LIST_VECTOR *char_choices,
743  STATE *best_state,
744  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
745  DANGERR *fixpt,
746  bool *best_choice_acceptable) {
747  inT32 blob_number;
748  float old_best;
749  bool updated_best_choice = false;
750 
751  while (1) { // improvement loop
752  old_best = word->best_choice->rating();
753  if (improve_one_blob(word, char_choices,
754  &blob_number, &word->seam_array,
755  fixpt, (fragments_guide_chopper &&
756  word->best_choice->fragment_mark()),
757  word->blamer_bundle)) {
758  getDict().LogNewSplit(blob_number);
759  updated_best_choice =
760  getDict().permute_characters(*char_choices, word->best_choice,
761  word->raw_choice);
762 
763  if (old_best > word->best_choice->rating()) {
764  set_n_ones(best_state, char_choices->length() - 1);
765  } else {
766  insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
767  fixpt->clear();
768  }
769 
770  if (chop_debug)
771  print_state("best state = ",
772  best_state, count_blobs(word->chopped_word->blobs) - 1);
773  } else {
774  break;
775  }
776 
777  // Check if we should break from the loop.
778  bool done = false;
779  bool replaced = false;
780  if ((updated_best_choice &&
781  (*best_choice_acceptable =
782  getDict().AcceptableChoice(char_choices, word->best_choice,
783  fixpt, CHOPPER_CALLER, &replaced))) ||
784  char_choices->length() >= MAX_NUM_CHUNKS) {
785  done = true;
786  }
787  if (replaced) update_blob_classifications(word->chopped_word,
788  *char_choices);
789  if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
790  if (done) break;
791  }
792 }
793 
794 
795 /**********************************************************************
796  * select_blob_to_split
797  *
798  * These are the results of the last classification. Find a likely
799  * place to apply splits. If none, return -1.
800  **********************************************************************/
802  float rating_ceiling,
803  bool split_next_to_fragment) {
804  BLOB_CHOICE_IT blob_choice_it;
805  BLOB_CHOICE *blob_choice;
806  BLOB_CHOICE_IT temp_it;
807  int x;
808  float worst = -MAX_FLOAT32;
809  int worst_index = -1;
810  float worst_near_fragment = -MAX_FLOAT32;
811  int worst_index_near_fragment = -1;
812  const CHAR_FRAGMENT **fragments = NULL;
813 
814  if (chop_debug) {
815  if (rating_ceiling < MAX_FLOAT32)
816  cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
817  else
818  cprintf("rating_ceiling = No Limit\n");
819  }
820 
821  if (split_next_to_fragment && char_choices.length() > 0) {
822  fragments = new const CHAR_FRAGMENT *[char_choices.length()];
823  if (char_choices.get(0) != NULL) {
824  temp_it.set_to_list(char_choices.get(0));
825  fragments[0] = getDict().getUnicharset().get_fragment(
826  temp_it.data()->unichar_id());
827  } else {
828  fragments[0] = NULL;
829  }
830  }
831 
832  for (x = 0; x < char_choices.length(); ++x) {
833  if (char_choices.get(x) == NULL) {
834  if (fragments != NULL) {
835  delete[] fragments;
836  }
837  return x;
838  } else {
839  blob_choice_it.set_to_list(char_choices.get(x));
840  blob_choice = blob_choice_it.data();
841  // Populate fragments for the following position.
842  if (split_next_to_fragment && x+1 < char_choices.length()) {
843  if (char_choices.get(x+1) != NULL) {
844  temp_it.set_to_list(char_choices.get(x+1));
845  fragments[x+1] = getDict().getUnicharset().get_fragment(
846  temp_it.data()->unichar_id());
847  } else {
848  fragments[x+1] = NULL;
849  }
850  }
851  if (blob_choice->rating() < rating_ceiling &&
852  blob_choice->certainty() < tessedit_certainty_threshold) {
853  // Update worst and worst_index.
854  if (blob_choice->rating() > worst) {
855  worst_index = x;
856  worst = blob_choice->rating();
857  }
858  if (split_next_to_fragment) {
859  // Update worst_near_fragment and worst_index_near_fragment.
860  bool expand_following_fragment =
861  (x + 1 < char_choices.length() &&
862  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
863  bool expand_preceding_fragment =
864  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
865  if ((expand_following_fragment || expand_preceding_fragment) &&
866  blob_choice->rating() > worst_near_fragment) {
867  worst_index_near_fragment = x;
868  worst_near_fragment = blob_choice->rating();
869  if (chop_debug) {
870  cprintf("worst_index_near_fragment=%d"
871  " expand_following_fragment=%d"
872  " expand_preceding_fragment=%d\n",
873  worst_index_near_fragment,
874  expand_following_fragment,
875  expand_preceding_fragment);
876  }
877  }
878  }
879  }
880  }
881  }
882  if (fragments != NULL) {
883  delete[] fragments;
884  }
885  // TODO(daria): maybe a threshold of badness for
886  // worst_near_fragment would be useful.
887  return worst_index_near_fragment != -1 ?
888  worst_index_near_fragment : worst_index;
889 }
890 
891 /**********************************************************************
892  * select_blob_to_split_from_fixpt
893  *
894  * Given the fix point from a dictionary search, if there is a single
895  * dangerous blob that maps to multiple characters, return that blob
896  * index as a place we need to split. If none, return -1.
897  **********************************************************************/
899  if (!fixpt)
900  return -1;
901  for (int i = 0; i < fixpt->size(); i++) {
902  if ((*fixpt)[i].begin == (*fixpt)[i].end &&
903  (*fixpt)[i].dangerous &&
904  (*fixpt)[i].correct_is_ngram) {
905  return (*fixpt)[i].begin;
906  }
907  }
908  return -1;
909 }
910 
911 /**********************************************************************
912  * set_chopper_blame
913  *
914  * Check whether chops were made at all the character bounding box boundaries
915  * in word->truth_word. If not - blame the chopper for an incorrect answer.
916  **********************************************************************/
918  BlamerBundle *blamer_bundle = word->blamer_bundle;
919  assert(blamer_bundle != NULL);
920  if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
921  word->chopped_word->blobs == NULL) {
922  return;
923  }
924  STRING debug;
925  bool missing_chop = false;
926  TBLOB * curr_blob = word->chopped_word->blobs;
927  int b = 0;
928  inT16 truth_x;
929  while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
930  truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
931  if (curr_blob->bounding_box().right() <
932  (truth_x - blamer_bundle->norm_box_tolerance)) {
933  curr_blob = curr_blob->next;
934  continue; // encountered an extra chop, keep looking
935  } else if (curr_blob->bounding_box().right() >
936  (truth_x + blamer_bundle->norm_box_tolerance)) {
937  missing_chop = true;
938  break;
939  } else {
940  curr_blob = curr_blob->next;
941  ++b;
942  }
943  }
944  if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
945  STRING debug;
946  char debug_buffer[256];
947  if (missing_chop) {
948  sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
949  blamer_bundle->norm_box_tolerance);
950  debug += debug_buffer;
951  curr_blob->bounding_box().append_debug(&debug);
952  debug.add_str_int("\nNo chop for truth at x=", truth_x);
953  } else {
954  debug.add_str_int("Missing chops for last ",
955  blamer_bundle->norm_truth_word.length()-b);
956  debug += " truth box(es)";
957  }
958  debug += "\nMaximally chopped word boxes:\n";
959  for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
960  curr_blob = curr_blob->next) {
961  const TBOX &tbox = curr_blob->bounding_box();
962  sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
963  tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
964  debug += debug_buffer;
965  }
966  debug += "Truth bounding boxes:\n";
967  for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
968  const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
969  sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
970  tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
971  debug += debug_buffer;
972  }
973  blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
975  }
976 }
977 
978 /**********************************************************************
979  * word_associator
980  *
981  * Reassociate and classify the blobs in a word. Continue this process
982  * until a good answer is found or all the possibilities have been tried.
983  **********************************************************************/
984 MATRIX *Wordrec::word_associator(bool only_create_ratings_matrix,
985  WERD_RES *word,
986  STATE *state,
987  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
988  DANGERR *fixpt,
989  STATE *best_state) {
990  CHUNKS_RECORD chunks_record;
991  BLOB_WEIGHTS blob_weights;
992  int x;
993  int num_chunks;
994  BLOB_CHOICE_IT blob_choice_it;
995 
996  num_chunks = array_count(word->seam_array) + 1;
997 
998  TBLOB* blobs = word->chopped_word->blobs;
999  chunks_record.ratings = record_piece_ratings(blobs);
1000  chunks_record.chunks = blobs;
1001  chunks_record.word_res = word;
1002  chunks_record.splits = word->seam_array;
1003  chunks_record.chunk_widths = blobs_widths(blobs);
1004  chunks_record.char_widths = blobs_widths(blobs);
1005  /* Save chunk weights */
1006  for (x = 0; x < num_chunks; x++) {
1007  BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings, blobs,
1008  chunks_record.word_res->denorm,
1009  word->seam_array, x, x,
1010  word->blamer_bundle);
1011  blob_choice_it.set_to_list(choices);
1012  //This is done by Jetsoft. Divide by zero is possible.
1013  if (blob_choice_it.data()->certainty() == 0) {
1014  blob_weights[x]=0;
1015  } else {
1016  blob_weights[x] =
1017  -(inT16) (10 * blob_choice_it.data()->rating() /
1018  blob_choice_it.data()->certainty());
1019  }
1020  }
1021  chunks_record.weights = blob_weights;
1022 
1023  if (chop_debug)
1024  chunks_record.ratings->print(getDict().getUnicharset());
1025 
1026  if (!only_create_ratings_matrix) {
1027  if (enable_new_segsearch) {
1028  SegSearch(&chunks_record, word->best_choice,
1029  best_char_choices, word->raw_choice,
1030  state, word->blamer_bundle);
1031  } else {
1032  best_first_search(&chunks_record, best_char_choices, word,
1033  state, fixpt, best_state);
1034  }
1035  }
1036 
1037  free_widths(chunks_record.chunk_widths);
1038  free_widths(chunks_record.char_widths);
1039  return chunks_record.ratings;
1040 }
1041 } // namespace tesseract
1042 
1043 
1044 /**********************************************************************
1045  * total_containment
1046  *
1047  * Check to see if one of these outlines is totally contained within
1048  * the bounding box of the other.
1049  **********************************************************************/
1051  TBOX box1 = blob1->bounding_box();
1052  TBOX box2 = blob2->bounding_box();
1053  return box1.contains(box2) || box2.contains(box1);
1054 }