Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wordrec.cpp
Go to the documentation of this file.
1 
2 // File: wordrec.cpp
3 // Description: wordrec class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "wordrec.h"
20 
21 #include "language_model.h"
22 #include "params.h"
23 
24 
25 namespace tesseract {
27  // control parameters
28  BOOL_MEMBER(merge_fragments_in_matrix, TRUE,
29  "Merge the fragments in the ratings matrix and delete them"
30  " after merging", params()),
31  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
32  params()),
33  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
34  params()),
35  BOOL_MEMBER(force_word_assoc, FALSE,
36  "force associator to run regardless of what enable_assoc is."
37  "This is used for CJK where component grouping is necessary.",
38  CCUtil::params()),
39  INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states",
40  CCUtil::params()),
41  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
42  params()),
43  BOOL_MEMBER(fragments_guide_chopper, FALSE,
44  "Use information from fragments to guide chopping process",
45  params()),
46  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
47  params()),
48  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
49  params()),
50  INT_MEMBER(chop_debug, 0, "Chop debug",
51  params()),
52  BOOL_MEMBER(chop_enable, 1, "Chop enable",
53  params()),
54  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
55  params()),
56  INT_MEMBER(chop_split_length, 10000, "Split Length",
57  params()),
58  INT_MEMBER(chop_same_distance, 2, "Same distance",
59  params()),
60  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
61  params()),
62  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
63  params()),
64  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
65  params()),
66  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
67  params()),
68  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
69  params()),
70  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
71  params()),
72  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
73  params()),
74  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
75  params()),
76  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
77  params()),
78  double_MEMBER(chop_good_split, 50.0, "Good split limit",
79  params()),
80  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
81  params()),
82  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
83  params()),
84  BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE,
85  "include fixed-pitch heuristics in char segmentation",
86  params()),
87  BOOL_MEMBER(use_new_state_cost, FALSE,
88  "use new state cost heuristics for segmentation state evaluation",
89  params()),
90  double_MEMBER(heuristic_segcost_rating_base, 1.25,
91  "base factor for adding segmentation cost into word rating."
92  "It's a multiplying factor, the larger the value above 1, "
93  "the bigger the effect of segmentation cost.",
94  params()),
95  double_MEMBER(heuristic_weight_rating, 1.0,
96  "weight associated with char rating in combined cost of state",
97  params()),
98  double_MEMBER(heuristic_weight_width, 1000.0,
99  "weight associated with width evidence in combined cost of"
100  " state", params()),
101  double_MEMBER(heuristic_weight_seamcut, 0.0,
102  "weight associated with seam cut in combined cost of state",
103  params()),
104  double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
105  "max char width-to-height ratio allowed in segmentation",
106  params()),
107  INT_MEMBER(wordrec_debug_level, 0,
108  "Debug level for wordrec", params()),
109  BOOL_MEMBER(wordrec_debug_blamer, false,
110  "Print blamer debug messages", params()),
111  BOOL_MEMBER(wordrec_run_blamer, false,
112  "Try to set the blame for errors", params()),
113  BOOL_MEMBER(enable_new_segsearch, true,
114  "Enable new segmentation search path.", params()),
115  INT_MEMBER(segsearch_debug_level, 0,
116  "SegSearch debug level", params()),
117  INT_MEMBER(segsearch_max_pain_points, 2000,
118  "Maximum number of pain points stored in the queue",
119  params()),
120  INT_MEMBER(segsearch_max_futile_classifications, 10,
121  "Maximum number of pain point classifications per word that"
122  "did not result in finding a better word choice.",
123  params()),
124  double_MEMBER(segsearch_max_char_wh_ratio, 2.0,
125  "Maximum character width-to-height ratio", params()),
126  double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
127  "Maximum character width-to-height ratio for"
128  " fixed-pitch fonts",
129  params()),
130  BOOL_MEMBER(save_alt_choices, false,
131  "Save alternative paths found during chopping"
132  " and segmentation search",
133  params()) {
136  &(getDict()));
137  pass2_seg_states = 0;
138  num_joints = 0;
139  num_pushed = 0;
140  num_popped = 0;
142 }
143 
// Body fragment that frees the object held in language_model_.
// NOTE(review): the signature line (listing line 144) is absent from this
// listing; presumably this is the Wordrec destructor - confirm.
145  delete language_model_;
146 }
147 
// Replaces the contents of `to` with deep copies of the lists in `from`.
// NOTE(review): the signature (listing lines 148-149) is absent from this
// listing, so the exact types of `from` and `to` are not visible here; the
// block name used for this fragment is an assumption - confirm.
 // Discard whatever `to` currently holds before refilling it.
150  to->delete_data_pointers();
151  to->clear();
152  for (int i = 0; i < from.size(); ++i) {
 // Each source list is copied into a newly allocated BLOB_CHOICE_LIST using
 // BLOB_CHOICE::deep_copy, and the new list is appended to `to`.
153  BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
154  cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy);
155  to->push_back(cc_list);
156  }
157 }
158 
// Returns true when the normalized text of `choice` equals the concatenation
// of the strings in `truth_text`; returns false when `choice` is NULL.
// NOTE(review): the first signature line (listing line 159) is absent from
// this listing; `uni_set`, used below, is presumably declared there - confirm.
// The block name used for this fragment is likewise an assumption.
160  const WERD_CHOICE *choice,
161  const GenericVector<STRING> &truth_text) {
162  if (choice == NULL) return false;
163  int i;
 // Join all truth fragments into a single string.
164  STRING truth_str;
165  for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i];
 // Build the comparison string from the normed unichar of every id in choice.
166  STRING normed_choice_str;
167  for (i = 0; i < choice->length(); ++i) {
168  normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i));
169  }
170  return (truth_str == normed_choice_str);
171 }
172 
// Converts every entry of best_choices into a newly allocated WERD_CHOICE and
// appends it to word->alt_choices, recording per-character NumChunks values
// in word->alt_states.
//   best_choices: list whose nodes are cast to VIABLE_CHOICE.
//   word: destination; its alt_choices and alt_states must both be empty on
//         entry (enforced with ASSERT_HOST).
// NOTE(review): listing lines 182 and 185 are missing from this listing, so
// two statements below are incomplete as shown - confirm against the
// original file.
173 void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) {
174  ASSERT_HOST(word->alt_choices.empty());
175  ASSERT_HOST(word->alt_states.empty());
176  LIST list_it;
177  iterate_list(list_it, best_choices) {
178  VIABLE_CHOICE choice =
179  reinterpret_cast<VIABLE_CHOICE>(first_node(list_it));
 // Start at the first per-character entry; the loop below advances it.
180  CHAR_CHOICE *char_choice = &(choice->Blob[0]);
181  WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length);
 // NOTE(review): listing line 182 is missing; back() presupposes that an
 // element was appended to word->alt_states just before this point - confirm.
183  GenericVector<int> &alt_state = word->alt_states.back();
184  for (int i = 0; i < choice->Length; char_choice++, i++) {
 // NOTE(review): listing line 185 is missing; the next line is the argument
 // tail of a call whose callee is not visible here.
186  char_choice->Class, 1, 0, 0);
187  alt_state.push_back(char_choice->NumChunks);
188  }
189  alt_choice->set_rating(choice->Rating);
190  alt_choice->set_certainty(choice->Certainty);
191 
 // Hand the blob_choices pointer to alt_choice and clear it on choice, so
 // choice no longer refers to it (presumably an ownership transfer - confirm).
192  ASSERT_HOST(choice->blob_choices != NULL);
193  alt_choice->set_blob_choices(choice->blob_choices);
194  choice->blob_choices = NULL;
195 
196  word->alt_choices.push_back(alt_choice);
 // Optional trace of each saved alternative and its rating.
197  if (wordrec_debug_level > 0) {
198  tprintf("SaveAltChoices: %s %g\n",
199  alt_choice->unichar_string().string(), alt_choice->rating());
200  }
201  }
202 }
203 
204 } // namespace tesseract