Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mftraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: mftraining.c
3 ** Purpose: Separates training pages into files for each character.
4 ** Strips from files only the features and their parameters of
5  the feature type mf.
6 ** Author: Dan Johnson
7 ** Revisment: Christy Russon
8 ** Environment: HPUX 6.5
9 ** Library: HPUX 6.5
10 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
11 ** 5/25/90, DSJ, Adapted to multiple feature types.
12 ** Tuesday, May 17, 1998 Changes made to make feature specific and
13 ** simplify structures. First step in simplifying training process.
14 **
15  ** (c) Copyright Hewlett-Packard Company, 1988.
16  ** Licensed under the Apache License, Version 2.0 (the "License");
17  ** you may not use this file except in compliance with the License.
18  ** You may obtain a copy of the License at
19  ** http://www.apache.org/licenses/LICENSE-2.0
20  ** Unless required by applicable law or agreed to in writing, software
21  ** distributed under the License is distributed on an "AS IS" BASIS,
22  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
23  ** See the License for the specific language governing permissions and
24  ** limitations under the License.
25 ******************************************************************************/
29 #include <string.h>
30 #include <stdio.h>
31 #define _USE_MATH_DEFINES
32 #include <math.h>
33 #ifdef _WIN32
34 #ifndef M_PI
35 #define M_PI 3.14159265358979323846
36 #endif
37 #endif
38 
39 // Include automatically generated configuration file if running autoconf.
40 #ifdef HAVE_CONFIG_H
41 #include "config_auto.h"
42 #endif
43 
44 #include "classify.h"
45 #include "cluster.h"
46 #include "clusttool.h"
47 #include "commontraining.h"
48 #include "danerror.h"
49 #include "efio.h"
50 #include "emalloc.h"
51 #include "featdefs.h"
52 #include "fontinfo.h"
53 #include "genericvector.h"
54 #include "indexmapbidi.h"
55 #include "intproto.h"
56 #include "mastertrainer.h"
57 #include "mergenf.h"
58 #include "mf.h"
59 #include "ndminx.h"
60 #include "ocrfeatures.h"
61 #include "oldlist.h"
62 #include "protos.h"
63 #include "shapetable.h"
64 #include "tessopt.h"
65 #include "tprintf.h"
66 #include "unicity_table.h"
67 
73 using tesseract::Shape;
75 
76 #define PROGRAM_FEATURE_TYPE "mf"
77 
78 // Max length of a fake shape label.
79 const int kMaxShapeLabelLength = 10;
80 
82 
// Prototype for the program entry point; the definition follows the helper
// functions below.
int main(int argc, char **argv);
89 
90 
91 /*----------------------------------------------------------------------------
92  Public Code
93 -----------------------------------------------------------------------------*/
94 #ifndef GRAPHICS_DISABLED
95 static void DisplayProtoList(const char* ch, LIST protolist) {
96  void* window = c_create_window("Char samples", 50, 200,
97  520, 520, -130.0, 130.0, -130.0, 130.0);
98  LIST proto = protolist;
99  iterate(proto) {
100  PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE *>(first_node(proto));
101  if (prototype->Significant)
102  c_line_color_index(window, Green);
103  else if (prototype->NumSamples == 0)
104  c_line_color_index(window, Blue);
105  else if (prototype->Merged)
106  c_line_color_index(window, Magenta);
107  else
108  c_line_color_index(window, Red);
109  float x = CenterX(prototype->Mean);
110  float y = CenterY(prototype->Mean);
111  double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
112  float dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
113  float dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
114  c_move(window, (x - dx) * 256, (y - dy) * 256);
115  c_draw(window, (x + dx) * 256, (y + dy) * 256);
116  if (prototype->Significant)
117  tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n",
118  x, y, dx, dy, prototype->NumSamples);
119  else if (prototype->NumSamples > 0 && !prototype->Merged)
120  tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n",
121  x, y, dx, dy, prototype->NumSamples);
122  }
123  c_make_current(window);
124 }
125 #endif // GRAPHICS_DISABLED
126 
127 // Helper to run clustering on a single config.
128 // Mostly copied from the old mftraining, but with renamed variables.
129 static LIST ClusterOneConfig(int shape_id, const char* class_label,
130  LIST mf_classes,
131  const ShapeTable& shape_table,
132  MasterTrainer* trainer) {
133  int num_samples;
134  CLUSTERER *clusterer = trainer->SetupForClustering(shape_table,
135  feature_defs,
136  shape_id,
137  &num_samples);
138  Config.MagicSamples = num_samples;
139  LIST proto_list = ClusterSamples(clusterer, &Config);
140  CleanUpUnusedData(proto_list);
141 
142  // Merge protos where reasonable to make more of them significant by
143  // representing almost all samples of the class/font.
144  MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
145  #ifndef GRAPHICS_DISABLED
146  if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0)
147  DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
148  #endif // GRAPHICS_DISABLED
149  // Delete the protos that will not be used in the inttemp output file.
150  proto_list = RemoveInsignificantProtos(proto_list, true,
151  false,
152  clusterer->SampleSize);
153  FreeClusterer(clusterer);
154  MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
155  if (merge_class == NULL) {
156  merge_class = NewLabeledClass(class_label);
157  mf_classes = push(mf_classes, merge_class);
158  }
159  int config_id = AddConfigToClass(merge_class->Class);
160  merge_class->Class->font_set.push_back(shape_id);
161  LIST proto_it = proto_list;
162  iterate(proto_it) {
163  PROTOTYPE* prototype = reinterpret_cast<PROTOTYPE*>(first_node(proto_it));
164  // See if proto can be approximated by existing proto.
165  int p_id = FindClosestExistingProto(merge_class->Class,
166  merge_class->NumMerged, prototype);
167  if (p_id == NO_PROTO) {
168  // Need to make a new proto, as it doesn't match anything.
169  p_id = AddProtoToClass(merge_class->Class);
170  MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
171  merge_class->NumMerged[p_id] = 1;
172  } else {
173  PROTO_STRUCT dummy_proto;
174  MakeNewFromOld(&dummy_proto, prototype);
175  // Merge with the similar proto.
176  ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
177  static_cast<FLOAT32>(merge_class->NumMerged[p_id]),
178  1.0,
179  ProtoIn(merge_class->Class, p_id));
180  merge_class->NumMerged[p_id]++;
181  }
182  AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
183  }
184  FreeProtoList(&proto_list);
185  return mf_classes;
186 }
187 
188 // Helper to setup the config map.
189 // Setup an index mapping from the shapes in the shape table to the classes
190 // that will be trained. In keeping with the original design, each shape
191 // with the same list of unichars becomes a different class and the configs
192 // represent the different combinations of fonts.
193 static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) {
194  int num_configs = shape_table->NumShapes();
195  config_map->Init(num_configs, true);
196  config_map->Setup();
197  for (int c1 = 0; c1 < num_configs; ++c1) {
198  // Only process ids that are not already merged.
199  if (config_map->SparseToCompact(c1) == c1) {
200  Shape* shape1 = shape_table->MutableShape(c1);
201  // Find all the subsequent shapes that are equal.
202  for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
203  if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
204  config_map->Merge(c1, c2);
205  }
206  }
207  }
208  }
209  config_map->CompleteMerges();
210 }
211 
212 /*---------------------------------------------------------------------------*/
213 int main (int argc, char **argv) {
214 /*
215 ** Parameters:
216 ** argc number of command line arguments
217 ** argv array of command line arguments
218 ** Globals: none
219 ** Operation:
220 ** This program reads in a text file consisting of feature
221 ** samples from a training page in the following format:
222 **
223 ** FontName UTF8-char-str xmin ymin xmax ymax page-number
224 ** NumberOfFeatureTypes(N)
225 ** FeatureTypeName1 NumberOfFeatures(M)
226 ** Feature1
227 ** ...
228 ** FeatureM
229 ** FeatureTypeName2 NumberOfFeatures(M)
230 ** Feature1
231 ** ...
232 ** FeatureM
233 ** ...
234 ** FeatureTypeNameN NumberOfFeatures(M)
235 ** Feature1
236 ** ...
237 ** FeatureM
238 ** FontName CharName ...
239 **
240 ** The result of this program is a binary inttemp file used by
241 ** the OCR engine.
242 ** Return: none
243 ** Exceptions: none
244 ** History: Fri Aug 18 08:56:17 1989, DSJ, Created.
245 ** Mon May 18 1998, Christy Russson, Revistion started.
246 */
247  ParseArguments(&argc, &argv);
248 
249  ShapeTable* shape_table = NULL;
250  STRING file_prefix;
251  // Load the training data.
252  MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv,
253  false,
254  &shape_table,
255  &file_prefix);
256  if (trainer == NULL)
257  return 1; // Failed.
258 
259  // Setup an index mapping from the shapes in the shape table to the classes
260  // that will be trained. In keeping with the original design, each shape
261  // with the same list of unichars becomes a different class and the configs
262  // represent the different combinations of fonts.
263  IndexMapBiDi config_map;
264  SetupConfigMap(shape_table, &config_map);
265 
266  WriteShapeTable(file_prefix, *shape_table);
267  // If the shape_table is flat, then either we didn't run shape clustering, or
268  // it did nothing, so we just output the trainer's unicharset.
269  // Otherwise shape_set will hold a fake unicharset with an entry for each
270  // shape in the shape table, and we will output that instead.
271  UNICHARSET shape_set;
272  const UNICHARSET* unicharset = &trainer->unicharset();
273  // If we ran shapeclustering (and it worked) then at least one shape will
274  // have multiple unichars, so we have to build a fake unicharset.
275  if (shape_table->AnyMultipleUnichars()) {
276  unicharset = &shape_set;
277  // Now build a fake unicharset for the compact shape space to keep the
278  // output modules happy that we are doing things correctly.
279  int num_shapes = config_map.CompactSize();
280  for (int s = 0; s < num_shapes; ++s) {
281  char shape_label[kMaxShapeLabelLength + 1];
282  snprintf(shape_label, kMaxShapeLabelLength, "sh%04d", s);
283  shape_set.unichar_insert(shape_label);
284  }
285  }
286 
287  // Now train each config separately.
288  int num_configs = shape_table->NumShapes();
289  LIST mf_classes = NIL_LIST;
290  for (int s = 0; s < num_configs; ++s) {
291  int unichar_id, font_id;
292  if (unicharset == &shape_set) {
293  // Using fake unichar_ids from the config_map/shape_set.
294  unichar_id = config_map.SparseToCompact(s);
295  } else {
296  // Get the real unichar_id from the shape table/unicharset.
297  shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
298  }
299  const char* class_label = unicharset->id_to_unichar(unichar_id);
300  mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table,
301  trainer);
302  }
303  STRING inttemp_file = file_prefix;
304  inttemp_file += "inttemp";
305  STRING pffmtable_file = file_prefix;
306  pffmtable_file += "pffmtable";
307  CLASS_STRUCT* float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
308  // Now write the inttemp and pffmtable.
309  trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset,
310  *shape_table, float_classes,
311  inttemp_file.string(),
312  pffmtable_file.string());
313  delete [] float_classes;
314  FreeLabeledClassList(mf_classes);
315  delete trainer;
316  delete shape_table;
317  printf("Done!\n");
318  if (!FLAGS_test_ch.empty()) {
319  // If we are displaying debug window(s), wait for the user to look at them.
320  printf("Hit return to exit...\n");
321  while (getchar() != '\n');
322  }
323  return 0;
324 } /* main */