Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wordclass.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: wordclass.c (Formerly wordclass.c)
5  * Description: Word classifier
6  * Author: Mark Seaman, OCR Technology
7  * Created: Tue Jan 30 14:03:25 1990
8  * Modified: Fri Jul 12 16:03:06 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1990, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I N C L U D E S
27 ----------------------------------------------------------------------*/
28 #include <stdio.h>
29 #ifdef __UNIX__
30 #include <assert.h>
31 #endif
32 
33 #include "wordclass.h"
34 #include "associate.h"
35 #include "render.h"
36 #include "matchtab.h"
37 #include "permute.h"
38 #include "callcpp.h"
39 #include <assert.h>
40 #include "wordrec.h"
41 
42 // Include automatically generated configuration file if running autoconf.
43 #ifdef HAVE_CONFIG_H
44 #include "config_auto.h"
45 #endif
46 
47 /*----------------------------------------------------------------------
48  F u n c t i o n s
49 ----------------------------------------------------------------------*/
50 namespace tesseract {
// Classifies one blob and returns its list of choices.
//
// The blob is first looked up in blob_match_table. Only when that lookup
// yields NULL is call_matcher() invoked, and its result is then stored in
// the table with put_match(). The blamer bookkeeping below lives inside
// that same "miss" branch, so it is not executed when the table already
// held a result for the blob.
//
// Parameters:
//  blob          - blob to classify; also the key used for the match table.
//  denorm        - forwarded by address to call_matcher().
//  string        - label handed to print_ratings_list(); when NULL the
//                  ratings are not printed.
//  color         - forwarded to display_blob().
//  blamer_bundle - may be NULL, in which case no blame is recorded.
//
// Returns the choices list, either the one found in blob_match_table or
// the one freshly produced by call_matcher().
//
// The display and debug-print sections are compiled out when
// GRAPHICS_DISABLED is defined.
//
// NOTE(review): the embedded listing numbers jump 67->69, 108->110,
// 116->118 and 128->130, so this listing appears to be missing four source
// lines. The affected spots are flagged individually below; confirm against
// the real file before relying on the code exactly as shown.
62 BLOB_CHOICE_LIST *Wordrec::classify_blob(TBLOB *blob, const DENORM& denorm,
63  const char *string, C_COL color,
64  BlamerBundle *blamer_bundle) {
65  fflush(stdout);
66  BLOB_CHOICE_LIST *choices = NULL;
67 #ifndef GRAPHICS_DISABLED
 // NOTE(review): listing line 68 is absent; display_blob() may be guarded
 // by a condition that is not visible here - TODO confirm.
69  display_blob(blob, color);
70 #endif
 // Reuse a previously stored classification for this blob if there is one.
71  choices = blob_match_table.get_match(blob);
72  if (choices == NULL) {
73  choices = call_matcher(&denorm, blob);
74  blob_match_table.put_match(blob, choices);
75  // If a blob with the same bounding box as one of the truth character
76  // bounding boxes is not classified as the corresponding truth character
77  // blame character classifier for incorrect answer.
 // Blame is only assigned while the bundle is still marked IRR_CORRECT.
78  if (blamer_bundle != NULL && blamer_bundle->truth_has_char_boxes &&
79  blamer_bundle->incorrect_result_reason == IRR_CORRECT) {
80  for (int b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
81  const TBOX &truth_box = blamer_bundle->norm_truth_word.BlobBox(b);
82  const TBOX &blob_box = blob->bounding_box();
83  // Note that we are more strict on the bounding box boundaries here
84  // than in other places (chopper, segmentation search), since we do
85  // not have the ability to check the previous and next bounding box.
86  if (blob_box.x_almost_equal(truth_box,
87  blamer_bundle->norm_box_tolerance/2)) {
88  BLOB_CHOICE_IT choices_it(choices);
89  bool found = false;
90  bool incorrect_adapted = false;
91  UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
92  const char *truth_str = blamer_bundle->truth_text[b].string();
 // Walk the choices in list order. Stop at the first one whose normed
 // unichar equals the truth string; every adapted choice visited before
 // that point is remembered (the last such id wins).
93  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
94  choices_it.forward()) {
95  if (strcmp(truth_str, getDict().getUnicharset().get_normed_unichar(
96  choices_it.data()->unichar_id())) == 0) {
97  found = true;
98  break;
99  } else if (choices_it.data()->adapted()) {
100  incorrect_adapted = true;
101  incorrect_adapted_id = choices_it.data()->unichar_id();
102  }
103  } // end choices_it for loop
104  if (!found) {
 // Truth character is absent from the list: blame the classifier.
105  STRING debug = "unichar ";
106  debug += truth_str;
107  debug += " not found in classification list";
 // NOTE(review): listing line 109 is absent, so the remaining
 // SetBlame() arguments and the closing ");" are not visible here.
108  blamer_bundle->SetBlame(IRR_CLASSIFIER, debug,
110  } else if (incorrect_adapted) {
 // Truth character was found, but an adapted choice came before it in
 // the list: blame adaption.
111  STRING debug = "better rating for adapted ";
112  debug += getDict().getUnicharset().id_to_unichar(
113  incorrect_adapted_id);
114  debug += " than for correct ";
115  debug += truth_str;
 // NOTE(review): listing line 117 is absent, so the remaining
 // SetBlame() arguments and the closing ");" are not visible here.
116  blamer_bundle->SetBlame(IRR_ADAPTION, debug,
118  }
 // Only the first truth box whose x-extent matches the blob is examined.
119  break;
120  }
121  } // end iterating over blamer_bundle->norm_truth_word
122  }
123  }
124 #ifndef GRAPHICS_DISABLED
 // Debug print of the ratings; skipped when no label string was supplied.
125  if (classify_debug_level && string)
126  print_ratings_list(string, choices, getDict().getUnicharset());
127 
 // NOTE(review): listing line 129 is absent; the statement controlled by
 // this "if" is not visible here - TODO confirm against the real file.
128  if (wordrec_blob_pause)
130 #endif
131 
132  return (choices);
133 }
134 
135 // Returns a valid BLOB_CHOICE_LIST representing the given result.
136 BLOB_CHOICE_LIST *Wordrec::fake_classify_blob(UNICHAR_ID class_id,
137  float rating, float certainty) {
138  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
139  BLOB_CHOICE *choice =
140  new BLOB_CHOICE(class_id, rating, certainty, -1, -1, 0, 0, 0, false);
141  BLOB_CHOICE_IT temp_it(ratings);
142  temp_it.add_after_stay_put(choice);
143  return ratings;
144 }
145 
// NOTE(review): the listing numbers jump from 145 to 153, so the line(s)
// holding this function's return type and name (and any header comment)
// are not visible here; only the parameter list and body follow.
//
// Visible behaviour: walks the blobs of `word` (linked through ->next) in
// step with an index into `choices`, and for each pair calls
// blob_match_table.add_to_match(blob, choices.get(index)). Iteration stops
// as soon as either the blob chain or the choices vector is exhausted, so
// surplus entries on the longer side are ignored.
153  TWERD *word, const BLOB_CHOICE_LIST_VECTOR &choices) {
154  TBLOB *tblob = word->blobs;
155  int index = 0;
156  for (; tblob != NULL && index < choices.length();
157  tblob = tblob->next, index++) {
158  blob_match_table.add_to_match(tblob, choices.get(index));
159  }
160 }
161 
162 } // namespace tesseract;