Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
trainingsampleset.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #include "trainingsampleset.h"
17 #include "allheaders.h"
18 #include "boxread.h"
19 #include "fontinfo.h"
20 #include "indexmapbidi.h"
21 #include "intfeaturedist.h"
22 #include "intfeaturemap.h"
23 #include "intfeaturespace.h"
24 #include "shapetable.h"
25 #include "trainingsample.h"
26 #include "unicity_table.h"
27 
28 namespace tesseract {
29 
30 const int kTestChar = -1; // 37;
31 // Max number of distances to compute the squared way
32 const int kSquareLimit = 25;
33 // Prime numbers for subsampling distances.
34 const int kPrime1 = 17;
35 const int kPrime2 = 13;
36 // Min samples from which to start discarding outliers.
37 const int kMinOutlierSamples = 5;
38 
39 TrainingSampleSet::FontClassInfo::FontClassInfo()
40  : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
41 }
42 
43 // Writes to the given file. Returns false in case of error.
44 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
45  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
46  return false;
47  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
48  return false;
49  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
50  if (!samples.Serialize(fp)) return false;
51  return true;
52 }
53 // Reads from the given file. Returns false in case of error.
54 // If swap is true, assumes a big/little-endian swap is needed.
55 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
56  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
57  return false;
58  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
59  return false;
60  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
61  if (!samples.DeSerialize(swap, fp)) return false;
62  if (swap) {
63  ReverseN(&num_raw_samples, sizeof(num_raw_samples));
64  ReverseN(&canonical_sample, sizeof(canonical_sample));
65  ReverseN(&canonical_dist, sizeof(canonical_dist));
66  }
67  return true;
68 }
69 
70 TrainingSampleSet::TrainingSampleSet(const UnicityTable<FontInfo>& font_table)
71  : num_raw_samples_(0), unicharset_size_(0),
72  font_class_array_(NULL), fontinfo_table_(font_table) {
73 }
74 
76  delete font_class_array_;
77 }
78 
79 // Writes to the given file. Returns false in case of error.
80 bool TrainingSampleSet::Serialize(FILE* fp) const {
81  if (!samples_.Serialize(fp)) return false;
82  if (!unicharset_.save_to_file(fp)) return false;
83  if (!font_id_map_.Serialize(fp)) return false;
84  inT8 not_null = font_class_array_ != NULL;
85  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
86  if (not_null) {
87  if (!font_class_array_->SerializeClasses(fp)) return false;
88  }
89  return true;
90 }
91 
92 // Reads from the given file. Returns false in case of error.
93 // If swap is true, assumes a big/little-endian swap is needed.
94 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
95  if (!samples_.DeSerialize(swap, fp)) return false;
96  num_raw_samples_ = samples_.size();
97  if (!unicharset_.load_from_file(fp)) return false;
98  if (!font_id_map_.DeSerialize(swap, fp)) return false;
99  if (font_class_array_ != NULL) {
100  delete font_class_array_;
101  font_class_array_ = NULL;
102  }
103  inT8 not_null;
104  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
105  if (not_null) {
106  FontClassInfo empty;
107  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
108  if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
109  }
110  unicharset_size_ = unicharset_.size();
111  return true;
112 }
113 
114 // Load an initial unicharset, or set one up if the file cannot be read.
116  if (!unicharset_.load_from_file(filename)) {
117  tprintf("Failed to load unicharset from file %s\n"
118  "Building unicharset for boosting from scratch...\n",
119  filename);
120  unicharset_.clear();
121  // Space character needed to represent NIL_LIST classification.
122  unicharset_.unichar_insert(" ");
123  }
124  unicharset_size_ = unicharset_.size();
125 }
126 
127 // Adds a character sample to this sample set.
128 // If the unichar is not already in the local unicharset, it is added.
129 // Returns the unichar_id of the added sample, from the local unicharset.
131  if (!unicharset_.contains_unichar(unichar)) {
132  unicharset_.unichar_insert(unichar);
133  if (unicharset_.size() > MAX_NUM_CLASSES) {
134  tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
135  "greater than MAX_NUM_CLASSES\n");
136  return -1;
137  }
138  }
139  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
140  AddSample(char_id, sample);
141  return char_id;
142 }
143 
144 // Adds a character sample to this sample set with the given unichar_id,
145 // which must correspond to the local unicharset (in this).
147  sample->set_class_id(unichar_id);
148  samples_.push_back(sample);
149  num_raw_samples_ = samples_.size();
150  unicharset_size_ = unicharset_.size();
151 }
152 
153 // Returns the number of samples for the given font,class pair.
154 // If randomize is true, returns the number of samples accessible
155 // with randomizing on. (Increases the number of samples if small.)
156 // OrganizeByFontAndClass must have been already called.
157 int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
158  bool randomize) const {
159  ASSERT_HOST(font_class_array_ != NULL);
160  if (font_id < 0 || class_id < 0 ||
161  font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
162  // There are no samples because the font or class doesn't exist.
163  return 0;
164  }
165  int font_index = font_id_map_.SparseToCompact(font_id);
166  if (font_index < 0)
167  return 0; // The font has no samples.
168  if (randomize)
169  return (*font_class_array_)(font_index, class_id).samples.size();
170  else
171  return (*font_class_array_)(font_index, class_id).num_raw_samples;
172 }
173 
174 // Gets a sample by its index.
176  return samples_[index];
177 }
178 
179 // Gets a sample by its font, class, index.
180 // OrganizeByFontAndClass must have been already called.
181 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
182  int index) const {
183  ASSERT_HOST(font_class_array_ != NULL);
184  int font_index = font_id_map_.SparseToCompact(font_id);
185  if (font_index < 0) return NULL;
186  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
187  return samples_[sample_index];
188 }
189 
190 // Get a sample by its font, class, index. Does not randomize.
191 // OrganizeByFontAndClass must have been already called.
193  int index) {
194  ASSERT_HOST(font_class_array_ != NULL);
195  int font_index = font_id_map_.SparseToCompact(font_id);
196  if (font_index < 0) return NULL;
197  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
198  return samples_[sample_index];
199 }
200 
201 // Returns a string debug representation of the given sample:
202 // font, unichar_str, bounding box, page.
204  STRING boxfile_str;
205  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
206  sample.bounding_box(), sample.page_num(), &boxfile_str);
207  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
208 }
209 
210 // Gets the combined set of features used by all the samples of the given
211 // font/class combination.
213  int font_id, int class_id) const {
214  int font_index = font_id_map_.SparseToCompact(font_id);
215  ASSERT_HOST(font_index >= 0);
216  return (*font_class_array_)(font_index, class_id).cloud_features;
217 }
218 // Gets the indexed features of the canonical sample of the given
219 // font/class combination.
221  int font_id, int class_id) const {
222  int font_index = font_id_map_.SparseToCompact(font_id);
223  ASSERT_HOST(font_index >= 0);
224  return (*font_class_array_)(font_index, class_id).canonical_features;
225 }
226 
227 // Returns the distance between the given UniCharAndFonts pair.
228 // If matched_fonts, only matching fonts, are considered, unless that yields
229 // the empty set.
230 // OrganizeByFontAndClass must have been already called.
232  const UnicharAndFonts& uf2,
233  bool matched_fonts,
234  const IntFeatureMap& feature_map) {
235  int num_fonts1 = uf1.font_ids.size();
236  int c1 = uf1.unichar_id;
237  int num_fonts2 = uf2.font_ids.size();
238  int c2 = uf2.unichar_id;
239  double dist_sum = 0.0;
240  int dist_count = 0;
241  bool debug = false;
242  if (matched_fonts) {
243  // Compute distances only where fonts match.
244  for (int i = 0; i < num_fonts1; ++i) {
245  int f1 = uf1.font_ids[i];
246  for (int j = 0; j < num_fonts2; ++j) {
247  int f2 = uf2.font_ids[j];
248  if (f1 == f2) {
249  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
250  ++dist_count;
251  }
252  }
253  }
254  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
255  // Small enough sets to compute all the distances.
256  for (int i = 0; i < num_fonts1; ++i) {
257  int f1 = uf1.font_ids[i];
258  for (int j = 0; j < num_fonts2; ++j) {
259  int f2 = uf2.font_ids[j];
260  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
261  if (debug) {
262  tprintf("Cluster dist %d %d %d %d = %g\n",
263  f1, c1, f2, c2,
264  ClusterDistance(f1, c1, f2, c2, feature_map));
265  }
266  ++dist_count;
267  }
268  }
269  } else {
270  // Subsample distances, using the largest set once, and stepping through
271  // the smaller set so as to ensure that all the pairs are different.
272  int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
273  int index = 0;
274  int num_samples = MAX(num_fonts1, num_fonts2);
275  for (int i = 0; i < num_samples; ++i, index += increment) {
276  int f1 = uf1.font_ids[i % num_fonts1];
277  int f2 = uf2.font_ids[index % num_fonts2];
278  if (debug) {
279  tprintf("Cluster dist %d %d %d %d = %g\n",
280  f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
281  }
282  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
283  ++dist_count;
284  }
285  }
286  if (dist_count == 0) {
287  if (matched_fonts)
288  return UnicharDistance(uf1, uf2, false, feature_map);
289  return 0.0f;
290  }
291  return dist_sum / dist_count;
292 }
293 
294 // Returns the distance between the given pair of font/class pairs.
295 // Finds in cache or computes and caches.
296 // OrganizeByFontAndClass must have been already called.
297 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
298  int font_id2, int class_id2,
299  const IntFeatureMap& feature_map) {
300  ASSERT_HOST(font_class_array_ != NULL);
301  int font_index1 = font_id_map_.SparseToCompact(font_id1);
302  int font_index2 = font_id_map_.SparseToCompact(font_id2);
303  if (font_index1 < 0 || font_index2 < 0)
304  return 0.0f;
305  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
306  if (font_id1 == font_id2) {
307  // Special case cache for speed.
308  if (fc_info.unichar_distance_cache.size() == 0)
309  fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
310  if (fc_info.unichar_distance_cache[class_id2] < 0) {
311  // Distance has to be calculated.
312  float result = ComputeClusterDistance(font_id1, class_id1,
313  font_id2, class_id2,
314  feature_map);
315  fc_info.unichar_distance_cache[class_id2] = result;
316  // Copy to the symmetric cache entry.
317  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
318  if (fc_info2.unichar_distance_cache.size() == 0)
319  fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
320  fc_info2.unichar_distance_cache[class_id1] = result;
321  }
322  return fc_info.unichar_distance_cache[class_id2];
323  } else if (class_id1 == class_id2) {
324  // Another special-case cache for equal class-id.
325  if (fc_info.font_distance_cache.size() == 0)
326  fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
327  -1.0f);
328  if (fc_info.font_distance_cache[font_index2] < 0) {
329  // Distance has to be calculated.
330  float result = ComputeClusterDistance(font_id1, class_id1,
331  font_id2, class_id2,
332  feature_map);
333  fc_info.font_distance_cache[font_index2] = result;
334  // Copy to the symmetric cache entry.
335  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
336  if (fc_info2.font_distance_cache.size() == 0)
337  fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
338  -1.0f);
339  fc_info2.font_distance_cache[font_index1] = result;
340  }
341  return fc_info.font_distance_cache[font_index2];
342  }
343  // Both font and class are different. Linear search for class_id2/font_id2
344  // in what is a hopefully short list of distances.
345  int cache_index = 0;
346  while (cache_index < fc_info.distance_cache.size() &&
347  (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
348  fc_info.distance_cache[cache_index].font_id != font_id2))
349  ++cache_index;
350  if (cache_index == fc_info.distance_cache.size()) {
351  // Distance has to be calculated.
352  float result = ComputeClusterDistance(font_id1, class_id1,
353  font_id2, class_id2,
354  feature_map);
355  FontClassDistance fc_dist = { class_id2, font_id2, result };
356  fc_info.distance_cache.push_back(fc_dist);
357  // Copy to the symmetric cache entry. We know it isn't there already, as
358  // we always copy to the symmetric entry.
359  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
360  fc_dist.unichar_id = class_id1;
361  fc_dist.font_id = font_id1;
362  fc_info2.distance_cache.push_back(fc_dist);
363  }
364  return fc_info.distance_cache[cache_index].distance;
365 }
366 
367 // Computes the distance between the given pair of font/class pairs.
369  int font_id1, int class_id1, int font_id2, int class_id2,
370  const IntFeatureMap& feature_map) const {
371  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
372  feature_map, false);
373  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
374  feature_map, false);
375  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
376  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
377  return static_cast<float>(dist) / denominator;
378 }
379 
380 // Helper to add a feature and its near neighbors to the good_features.
381 // levels indicates how many times to compute the offset features of what is
382 // already there. This is done by iteration rather than recursion.
383 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
384  GenericVector<int>* good_features) {
385  int prev_num_features = 0;
386  good_features->push_back(f);
387  int num_features = 1;
388  for (int level = 0; level < levels; ++level) {
389  for (int i = prev_num_features; i < num_features; ++i) {
390  int feature = (*good_features)[i];
391  for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
392  if (dir == 0) continue;
393  int f1 = feature_map.OffsetFeature(feature, dir);
394  if (f1 >= 0) {
395  good_features->push_back(f1);
396  }
397  }
398  }
399  prev_num_features = num_features;
400  num_features = good_features->size();
401  }
402 }
403 
404 // Returns the number of canonical features of font/class 2 for which
405 // neither the feature nor any of its near neighbors occurs in the cloud
406 // of font/class 1. Each such feature is a reliable separation between
407 // the classes, ASSUMING that the canonical sample is sufficiently
408 // representative that every sample has a feature near that particular
409 // feature. To check that this is so on the fly would be prohibitively
410 // expensive, but it might be possible to pre-qualify the canonical features
411 // to include only those for which this assumption is true.
412 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
413 // first, or the results will be nonsense.
414 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
415  int font_id2, int class_id2,
416  const IntFeatureMap& feature_map,
417  bool thorough) const {
418  int result = 0;
419  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
420  if (sample2 == NULL)
421  return 0; // There are no canonical features.
422  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
423  class_id2);
424  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
425  if (cloud1.size() == 0)
426  return canonical2.size(); // There are no cloud features.
427 
428  // Find a canonical2 feature that is not in cloud1.
429  for (int f = 0; f < canonical2.size(); ++f) {
430  int feature = canonical2[f];
431  if (cloud1[feature])
432  continue;
433  // Gather the near neighbours of f.
434  GenericVector<int> good_features;
435  AddNearFeatures(feature_map, feature, 1, &good_features);
436  // Check that none of the good_features are in the cloud.
437  int i;
438  for (i = 0; i < good_features.size(); ++i) {
439  int good_f = good_features[i];
440  if (cloud1[good_f]) {
441  break;
442  }
443  }
444  if (i < good_features.size())
445  continue; // Found one in the cloud.
446  ++result;
447  }
448  return result;
449 }
450 
451 // Returns the total index of the requested sample.
452 // OrganizeByFontAndClass must have been already called.
453 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
454  int index) const {
455  ASSERT_HOST(font_class_array_ != NULL);
456  int font_index = font_id_map_.SparseToCompact(font_id);
457  if (font_index < 0) return -1;
458  return (*font_class_array_)(font_index, class_id).samples[index];
459 }
460 
461 // Gets the canonical sample for the given font, class pair.
462 // ComputeCanonicalSamples must have been called first.
464  int font_id, int class_id) const {
465  ASSERT_HOST(font_class_array_ != NULL);
466  int font_index = font_id_map_.SparseToCompact(font_id);
467  if (font_index < 0) return NULL;
468  int sample_index = (*font_class_array_)(font_index,
469  class_id).canonical_sample;
470  return sample_index >= 0 ? samples_[sample_index] : NULL;
471 }
472 
473 // Gets the max distance for the given canonical sample.
474 // ComputeCanonicalSamples must have been called first.
475 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
476  ASSERT_HOST(font_class_array_ != NULL);
477  int font_index = font_id_map_.SparseToCompact(font_id);
478  if (font_index < 0) return 0.0f;
479  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
480  return (*font_class_array_)(font_index, class_id).canonical_dist;
481  else
482  return 0.0f;
483 }
484 
485 // Generates indexed features for all samples with the supplied feature_space.
487  for (int s = 0; s < samples_.size(); ++s)
488  samples_[s]->IndexFeatures(feature_space);
489 }
490 
491 // Delete outlier samples with few features that are shared with others.
492 // IndexFeatures must have been called already.
494  bool debug) {
495  if (font_class_array_ == NULL)
497  Pixa* pixa = NULL;
498  if (debug)
499  pixa = pixaCreate(0);
500  GenericVector<int> feature_counts;
501  int fs_size = feature_space.Size();
502  int font_size = font_id_map_.CompactSize();
503  for (int font_index = 0; font_index < font_size; ++font_index) {
504  for (int c = 0; c < unicharset_size_; ++c) {
505  // Create a histogram of the features used by all samples of this
506  // font/class combination.
507  feature_counts.init_to_size(fs_size, 0);
508  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
509  int sample_count = fcinfo.samples.size();
510  if (sample_count < kMinOutlierSamples)
511  continue;
512  for (int i = 0; i < sample_count; ++i) {
513  int s = fcinfo.samples[i];
514  const GenericVector<int>& features = samples_[s]->indexed_features();
515  for (int f = 0; f < features.size(); ++f) {
516  ++feature_counts[features[f]];
517  }
518  }
519  for (int i = 0; i < sample_count; ++i) {
520  int s = fcinfo.samples[i];
521  const TrainingSample& sample = *samples_[s];
522  const GenericVector<int>& features = sample.indexed_features();
523  // A feature that has a histogram count of 1 is only used by this
524  // sample, making it 'bad'. All others are 'good'.
525  int good_features = 0;
526  int bad_features = 0;
527  for (int f = 0; f < features.size(); ++f) {
528  if (feature_counts[features[f]] > 1)
529  ++good_features;
530  else
531  ++bad_features;
532  }
533  // If more than 1/3 features are bad, then this is an outlier.
534  if (bad_features * 2 > good_features) {
535  tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
536  SampleToString(sample).string(),
537  good_features, bad_features);
538  if (debug) {
539  pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
540  // Add the previous sample as well, so it is easier to see in
541  // the output what is wrong with this sample.
542  int t;
543  if (i == 0)
544  t = fcinfo.samples[1];
545  else
546  t = fcinfo.samples[i - 1];
547  const TrainingSample &csample = *samples_[t];
548  pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
549  }
550  // Mark the sample for deletion.
551  KillSample(samples_[s]);
552  }
553  }
554  }
555  }
556  // Truly delete all bad samples and renumber everything.
558  if (pixa != NULL) {
559  Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
560  pixaDestroy(&pixa);
561  pixWrite("outliers.png", pix, IFF_PNG);
562  pixDestroy(&pix);
563  }
564 }
565 
566 // Marks the given sample index for deletion.
567 // Deletion is actually completed by DeleteDeadSamples.
569  sample->set_sample_index(-1);
570 }
571 
572 // Deletes all samples with zero features marked by KillSample.
574  samples_.compact(
576  num_raw_samples_ = samples_.size();
577  // Samples must be re-organized now we have deleted a few.
578 }
579 
580 // Callback function returns true if the given sample is to be deleted, due
581 // to having a negative classid.
583  return sample == NULL || sample->class_id() < 0;
584 }
585 
586 static Pix* DebugSample(const UNICHARSET& unicharset,
588  tprintf("\nOriginal features:\n");
589  for (int i = 0; i < sample->num_features(); ++i) {
590  sample->features()[i].print();
591  }
592  if (sample->features_are_mapped()) {
593  tprintf("\nMapped features:\n");
594  for (int i = 0; i < sample->mapped_features().size(); ++i) {
595  tprintf("%d ", sample->mapped_features()[i]);
596  }
597  tprintf("\n");
598  }
599  return sample->RenderToPix(&unicharset);
600 }
601 
602 // Construct an array to access the samples by font,class pair.
604  // Font indexes are sparse, so we used a map to compact them, so we can
605  // have an efficient 2-d array of fonts and character classes.
606  SetupFontIdMap();
607  int compact_font_size = font_id_map_.CompactSize();
608  // Get a 2-d array of generic vectors.
609  if (font_class_array_ != NULL)
610  delete font_class_array_;
611  FontClassInfo empty;
612  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
613  compact_font_size, unicharset_size_, empty);
614  for (int s = 0; s < samples_.size(); ++s) {
615  int font_id = samples_[s]->font_id();
616  int class_id = samples_[s]->class_id();
617  if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
618  tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
619  font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
620  s);
621  }
622  ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
623  ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
624  int font_index = font_id_map_.SparseToCompact(font_id);
625  (*font_class_array_)(font_index, class_id).samples.push_back(s);
626  }
627  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
628  // between the raw samples and the replicated ones.
629  for (int f = 0; f < compact_font_size; ++f) {
630  for (int c = 0; c < unicharset_size_; ++c)
631  (*font_class_array_)(f, c).num_raw_samples =
632  (*font_class_array_)(f, c).samples.size();
633  }
634  // This is the global number of samples and also marks the boundary between
635  // real and replicated samples.
636  num_raw_samples_ = samples_.size();
637 }
638 
639 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
640 // index for the font_class_array_.
642  // Number of samples for each font_id.
643  GenericVector<int> font_counts;
644  for (int s = 0; s < samples_.size(); ++s) {
645  int font_id = samples_[s]->font_id();
646  while (font_id >= font_counts.size())
647  font_counts.push_back(0);
648  ++font_counts[font_id];
649  }
650  font_id_map_.Init(font_counts.size(), false);
651  for (int f = 0; f < font_counts.size(); ++f) {
652  font_id_map_.SetMap(f, font_counts[f] > 0);
653  }
654  font_id_map_.Setup();
655 }
656 
657 
658 // Finds the sample for each font, class pair that has least maximum
659 // distance to all the other samples of the same font, class.
660 // OrganizeByFontAndClass must have been already called.
662  bool debug) {
663  ASSERT_HOST(font_class_array_ != NULL);
664  IntFeatureDist f_table;
665  if (debug) tprintf("feature table size %d\n", map.sparse_size());
666  f_table.Init(&map);
667  int worst_s1 = 0;
668  int worst_s2 = 0;
669  double global_worst_dist = 0.0;
670  // Compute distances independently for each font and char index.
671  int font_size = font_id_map_.CompactSize();
672  for (int font_index = 0; font_index < font_size; ++font_index) {
673  int font_id = font_id_map_.CompactToSparse(font_index);
674  for (int c = 0; c < unicharset_size_; ++c) {
675  int samples_found = 0;
676  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
677  if (fcinfo.samples.size() == 0 ||
678  (kTestChar >= 0 && c != kTestChar)) {
679  fcinfo.canonical_sample = -1;
680  fcinfo.canonical_dist = 0.0f;
681  if (debug) tprintf("Skipping class %d\n", c);
682  continue;
683  }
684  // The canonical sample will be the one with the min_max_dist, which
685  // is the sample with the lowest maximum distance to all other samples.
686  double min_max_dist = 2.0;
687  // We keep track of the farthest apart pair (max_s1, max_s2) which
688  // are max_max_dist apart, so we can see how bad the variability is.
689  double max_max_dist = 0.0;
690  int max_s1 = 0;
691  int max_s2 = 0;
692  fcinfo.canonical_sample = fcinfo.samples[0];
693  fcinfo.canonical_dist = 0.0f;
694  for (int i = 0; i < fcinfo.samples.size(); ++i) {
695  int s1 = fcinfo.samples[i];
696  const GenericVector<int>& features1 = samples_[s1]->indexed_features();
697  f_table.Set(features1, features1.size(), true);
698  double max_dist = 0.0;
699  // Run the full squared-order search for similar samples. It is still
700  // reasonably fast because f_table.FeatureDistance is fast, but we
701  // may have to reconsider if we start playing with too many samples
702  // of a single char/font.
703  for (int j = 0; j < fcinfo.samples.size(); ++j) {
704  int s2 = fcinfo.samples[j];
705  if (samples_[s2]->class_id() != c ||
706  samples_[s2]->font_id() != font_id ||
707  s2 == s1)
708  continue;
709  GenericVector<int> features2 = samples_[s2]->indexed_features();
710  double dist = f_table.FeatureDistance(features2);
711  int height = samples_[s2]->geo_feature(GeoTop) -
712  samples_[s2]->geo_feature(GeoBottom);
713  if (dist == 1.0 && height > 64) {
714  // TODO(rays) rethink this when the polygonal approximation goes.
715  // Currently it is possible for dots and other small characters
716  // to be completely different, even within the same class.
717  f_table.DebugFeatureDistance(features2);
718  }
719  if (dist > max_dist) {
720  max_dist = dist;
721  if (dist > max_max_dist) {
722  max_s1 = s1;
723  max_s2 = s2;
724  }
725  }
726  }
727  // Using Set(..., false) is far faster than re initializing, due to
728  // the sparseness of the feature space.
729  f_table.Set(features1, features1.size(), false);
730  samples_[s1]->set_max_dist(max_dist);
731  ++samples_found;
732  if (max_dist < min_max_dist) {
733  fcinfo.canonical_sample = s1;
734  fcinfo.canonical_dist = max_dist;
735  }
736  UpdateRange(max_dist, &min_max_dist, &max_max_dist);
737  }
738  if (max_max_dist > global_worst_dist) {
739  // Keep a record of the worst pair over all characters/fonts too.
740  global_worst_dist = max_max_dist;
741  worst_s1 = max_s1;
742  worst_s2 = max_s2;
743  }
744  if (debug) {
745  tprintf("Found %d samples of class %d=%s, font %d, "
746  "dist range [%g, %g], worst pair= %s, %s\n",
747  samples_found, c, unicharset_.debug_str(c).string(),
748  font_index, min_max_dist, max_max_dist,
749  SampleToString(*samples_[max_s1]).string(),
750  SampleToString(*samples_[max_s2]).string());
751  }
752  }
753  }
754  if (debug) {
755  tprintf("Global worst dist = %g, between sample %d and %d\n",
756  global_worst_dist, worst_s1, worst_s2);
757  Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
758  Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
759  pixOr(pix1, pix1, pix2);
760  pixWrite("worstpair.png", pix1, IFF_PNG);
761  pixDestroy(&pix1);
762  pixDestroy(&pix2);
763  }
764 }
765 
766 // Replicates the samples to a minimum frequency defined by
767 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
768 // After replication, the replicated samples are perturbed slightly, but
769 // in a predictable and repeatable way.
770 // Use after OrganizeByFontAndClass().
772  ASSERT_HOST(font_class_array_ != NULL);
773  int font_size = font_id_map_.CompactSize();
774  for (int font_index = 0; font_index < font_size; ++font_index) {
775  for (int c = 0; c < unicharset_size_; ++c) {
776  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
777  int sample_count = fcinfo.samples.size();
778  int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
779  if (sample_count > 0 && sample_count < min_samples) {
780  int base_count = sample_count;
781  for (int base_index = 0; sample_count < min_samples; ++sample_count) {
782  int src_index = fcinfo.samples[base_index++];
783  if (base_index >= base_count) base_index = 0;
784  TrainingSample* sample = samples_[src_index]->RandomizedCopy(
785  sample_count % kSampleRandomSize);
786  int sample_index = samples_.size();
787  sample->set_sample_index(sample_index);
788  samples_.push_back(sample);
789  fcinfo.samples.push_back(sample_index);
790  }
791  }
792  }
793  }
794 }
795 
796 // Caches the indexed features of the canonical samples.
797 // ComputeCanonicalSamples must have been already called.
798 // TODO(rays) see note on ReliablySeparable and try restricting the
799 // canonical features to those that truly represent all samples.
801  ASSERT_HOST(font_class_array_ != NULL);
802  int font_size = font_id_map_.CompactSize();
803  for (int font_index = 0; font_index < font_size; ++font_index) {
804  int font_id = font_id_map_.CompactToSparse(font_index);
805  for (int c = 0; c < unicharset_size_; ++c) {
806  int num_samples = NumClassSamples(font_id, c, false);
807  if (num_samples == 0)
808  continue;
809  const TrainingSample* sample = GetCanonicalSample(font_id, c);
810  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
811  fcinfo.canonical_features = sample->indexed_features();
812  }
813  }
814 }
815 
816 // Computes the combined set of features used by all the samples of each
817 // font/class combination. Use after ReplicateAndRandomizeSamples.
818 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
819  ASSERT_HOST(font_class_array_ != NULL);
820  int font_size = font_id_map_.CompactSize();
821  for (int font_index = 0; font_index < font_size; ++font_index) {
822  int font_id = font_id_map_.CompactToSparse(font_index);
823  for (int c = 0; c < unicharset_size_; ++c) {
824  int num_samples = NumClassSamples(font_id, c, false);
825  if (num_samples == 0)
826  continue;
827  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
828  fcinfo.cloud_features.Init(feature_space_size);
829  for (int s = 0; s < num_samples; ++s) {
830  const TrainingSample* sample = GetSample(font_id, c, s);
831  const GenericVector<int>& sample_features = sample->indexed_features();
832  for (int i = 0; i < sample_features.size(); ++i)
833  fcinfo.cloud_features.SetBit(sample_features[i]);
834  }
835  }
836  }
837 }
838 
839 // Adds all fonts of the given class to the shape.
840 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
841  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
842  int font_id = font_id_map_.CompactToSparse(f);
843  shape->AddToShape(class_id, font_id);
844  }
845 }
846 
847 // Display the samples with the given indexed feature that also match
848 // the given shape.
850  const Shape& shape,
851  const IntFeatureSpace& space,
852  ScrollView::Color color,
853  ScrollView* window) const {
854  for (int s = 0; s < num_raw_samples(); ++s) {
855  const TrainingSample* sample = GetSample(s);
856  if (shape.ContainsUnichar(sample->class_id())) {
857  GenericVector<int> indexed_features;
858  space.IndexAndSortFeatures(sample->features(), sample->num_features(),
859  &indexed_features);
860  for (int f = 0; f < indexed_features.size(); ++f) {
861  if (indexed_features[f] == f_index) {
862  sample->DisplayFeatures(color, window);
863  }
864  }
865  }
866  }
867 }
868 
869 
870 } // namespace tesseract.