Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
trainingsampleset.h
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__
17 #define TESSERACT_TRAINING_TRAININGSAMPLESET_H__
18 
19 #include "bitvector.h"
20 #include "genericvector.h"
21 #include "indexmapbidi.h"
22 #include "matrix.h"
23 #include "shapetable.h"
24 #include "trainingsample.h"
25 
26 class UNICHARSET;
27 template <typename T> class UnicityTable;
28 
29 namespace tesseract {
30 
31 struct FontInfo;
32 class IntFeatureMap;
33 class IntFeatureSpace;
34 class TrainingSample;
35 class UnicharAndFonts;
36 
37 // Collection of TrainingSample used for training or testing a classifier.
38 // Provides several useful methods to operate on the collection as a whole,
39 // including outlier detection and deletion, providing access by font and
40 // class, finding the canonical sample, finding the "cloud" features (OR of
41 // all features in all samples), replication of samples, caching of distance
42 // metrics.
44  public:
45  explicit TrainingSampleSet(const UnicityTable<FontInfo>& fontinfo_table);
47 
48  // Writes to the given file. Returns false in case of error.
49  bool Serialize(FILE* fp) const;
50  // Reads from the given file. Returns false in case of error.
51  // If swap is true, assumes a big/little-endian swap is needed.
52  bool DeSerialize(bool swap, FILE* fp);
53 
54  // Accessors
// Returns the number of entries currently held in samples_.
55  int num_samples() const {
56  return samples_.size();
57  }
// Returns num_raw_samples_, the number of samples before
// replication/randomization.
58  int num_raw_samples() const {
59  return num_raw_samples_;
60  }
// Returns the size of the sparse side of font_id_map_. The sparse space of
// that map is the real font_id used in samples_.
61  int NumFonts() const {
62  return font_id_map_.SparseSize();
63  }
// Returns a read-only reference to unicharset_, the character set being
// trained for.
64  const UNICHARSET& unicharset() const {
65  return unicharset_;
66  }
// Returns the stored unicharset_size_, i.e. the character set size that the
// 2-d font/class arrays refer to.
// NOTE(review): this is a separately stored value, not a live query of
// unicharset_; presumably the two can differ if characters are added after
// the arrays are built - confirm against the implementation.
67  int charsetsize() const {
68  return unicharset_size_;
69  }
70 
71  // Loads an initial unicharset, or sets one up if the file cannot be read.
72  void LoadUnicharset(const char* filename);
73 
74  // Adds a character sample to this sample set.
75  // If the unichar is not already in the local unicharset, it is added.
76  // Returns the unichar_id of the added sample, from the local unicharset.
77  int AddSample(const char* unichar, TrainingSample* sample);
78  // Adds a character sample to this sample set with the given unichar_id,
79  // which must correspond to the local unicharset (in this).
80  void AddSample(int unichar_id, TrainingSample* sample);
81 
82  // Returns the number of samples for the given font,class pair.
83  // If randomize is true, returns the number of samples accessible
84  // with randomizing on. (Increases the number of samples if small.)
85  // OrganizeByFontAndClass must have been already called.
86  int NumClassSamples(int font_id, int class_id, bool randomize) const;
87 
88  // Gets a sample by its index.
89  const TrainingSample* GetSample(int index) const;
90 
91  // Gets a sample by its font, class, index.
92  // OrganizeByFontAndClass must have been already called.
93  const TrainingSample* GetSample(int font_id, int class_id, int index) const;
94 
95  // Gets a sample by its font, class, index. Does not randomize.
96  // OrganizeByFontAndClass must have been already called.
97  TrainingSample* MutableSample(int font_id, int class_id, int index);
98 
99  // Returns a string debug representation of the given sample:
100  // font, unichar_str, bounding box, page.
102 
103  // Gets the combined set of features used by all the samples of the given
104  // font/class combination.
105  const BitVector& GetCloudFeatures(int font_id, int class_id) const;
106  // Gets the indexed features of the canonical sample of the given
107  // font/class combination.
108  const GenericVector<int>& GetCanonicalFeatures(int font_id,
109  int class_id) const;
110 
111  // Returns the distance between the given pair of UnicharAndFonts.
112  // If matched_fonts is true, only matching fonts are considered, unless that
113  // yields the empty set.
114  // OrganizeByFontAndClass must have been already called.
115  float UnicharDistance(const UnicharAndFonts& uf1, const UnicharAndFonts& uf2,
116  bool matched_fonts, const IntFeatureMap& feature_map);
117 
118  // Returns the distance between the given pair of font/class pairs.
119  // Finds in cache or computes and caches.
120  // OrganizeByFontAndClass must have been already called.
121  float ClusterDistance(int font_id1, int class_id1,
122  int font_id2, int class_id2,
123  const IntFeatureMap& feature_map);
124 
125  // Computes the distance between the given pair of font/class pairs.
126  float ComputeClusterDistance(int font_id1, int class_id1,
127  int font_id2, int class_id2,
128  const IntFeatureMap& feature_map) const;
129 
130  // Returns the number of canonical features of font/class 2 for which
131  // neither the feature nor any of its near neighbors occurs in the cloud
132  // of font/class 1. Each such feature is a reliable separation between
133  // the classes, ASSUMING that the canonical sample is sufficiently
134  // representative that every sample has a feature near that particular
135  // feature. To check that this is so on the fly would be prohibitively
136  // expensive, but it might be possible to pre-qualify the canonical features
137  // to include only those for which this assumption is true.
138  // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
139  // first, or the results will be nonsense.
140  int ReliablySeparable(int font_id1, int class_id1,
141  int font_id2, int class_id2,
142  const IntFeatureMap& feature_map,
143  bool thorough) const;
144 
145 
146  // Returns the total index of the requested sample.
147  // OrganizeByFontAndClass must have been already called.
148  int GlobalSampleIndex(int font_id, int class_id, int index) const;
149 
150  // Gets the canonical sample for the given font, class pair.
151  // ComputeCanonicalSamples must have been called first.
152  const TrainingSample* GetCanonicalSample(int font_id, int class_id) const;
153  // Gets the max distance for the given canonical sample.
154  // ComputeCanonicalSamples must have been called first.
155  float GetCanonicalDist(int font_id, int class_id) const;
156 
157  // Returns a mutable pointer to the sample with the given index.
159  return samples_[index];
160  }
161  // Gets ownership of the sample with the given index, removing it from this.
163  TrainingSample* sample = samples_[index];
164  samples_[index] = NULL;
165  return sample;
166  }
167 
168  // Generates indexed features for all samples with the supplied feature_space.
169  void IndexFeatures(const IntFeatureSpace& feature_space);
170 
171  // Delete outlier samples with few features that are shared with others.
172  // IndexFeatures must have been called already.
173  void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug);
174 
175  // Marks the given sample for deletion.
176  // Deletion is actually completed by DeleteDeadSamples.
178 
179  // Deletes all samples with a negative sample index marked by KillSample.
180  // Must be called before OrganizeByFontAndClass, and OrganizeByFontAndClass
181  // must be called after as the samples have been renumbered.
182  void DeleteDeadSamples();
183 
184  // Callback function returns true if the given sample is to be deleted, due
185  // to having a negative classid.
187 
188  // Construct an array to access the samples by font,class pair.
189  void OrganizeByFontAndClass();
190 
191  // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
192  // index for the font_class_array_.
193  void SetupFontIdMap();
194 
195  // Finds the sample for each font, class pair that has least maximum
196  // distance to all the other samples of the same font, class.
197  // OrganizeByFontAndClass must have been already called.
198  void ComputeCanonicalSamples(const IntFeatureMap& map, bool debug);
199 
200  // Replicates the samples to a minimum frequency defined by
201  // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
202  // After replication, the replicated samples are perturbed slightly, but
203  // in a predictable and repeatable way.
204  // Use after OrganizeByFontAndClass().
206 
207  // Caches the indexed features of the canonical samples.
208  // ComputeCanonicalSamples must have been already called.
210  // Computes the combined set of features used by all the samples of each
211  // font/class combination. Use after ReplicateAndRandomizeSamples.
212  void ComputeCloudFeatures(int feature_space_size);
213 
214  // Adds all fonts of the given class to the shape.
215  void AddAllFontsForClass(int class_id, Shape* shape) const;
216 
217  // Display the samples with the given indexed feature that also match
218  // the given shape.
219  void DisplaySamplesWithFeature(int f_index, const Shape& shape,
220  const IntFeatureSpace& feature_space,
221  ScrollView::Color color,
222  ScrollView* window) const;
223 
224  private:
225  // Struct to store a triplet of unichar, font, distance in the distance cache.
226  struct FontClassDistance {
// Element type of FontClassInfo::distance_cache: one cached distance keyed
// by a (unichar_id, font_id) pair.
// Unichar id half of the key.
// NOTE(review): presumably the id of the *other* class being compared
// against the owning FontClassInfo - confirm in the cache lookup code.
227  int unichar_id;
228  int font_id; // Real font id.
// The cached distance value for this (unichar_id, font_id) key.
229  float distance;
230  };
231  // Simple struct to store information related to each font/class combination.
232  struct FontClassInfo {
// Per-(font, class) record stored in font_class_array_. Holds serialized
// sample bookkeeping plus non-serialized feature and distance caches.
233  FontClassInfo();
234 
235  // Writes to the given file. Returns false in case of error.
236  bool Serialize(FILE* fp) const;
237  // Reads from the given file. Returns false in case of error.
238  // If swap is true, assumes a big/little-endian swap is needed.
239  bool DeSerialize(bool swap, FILE* fp);
240 
241  // Number of raw samples.
// NOTE(review): the member this comment describes is not present in this
// listing (listing line 242 is missing) - restore it from the real header.
243  // Index of the canonical sample.
244  inT32 canonical_sample;
245  // Max distance of the canonical sample from any other.
246  float canonical_dist;
247  // Sample indices for the samples, including replicated.
248  GenericVector<inT32> samples;
249 
250  // Non-serialized cache data.
251  // Indexed features of the canonical sample.
252  GenericVector<int> canonical_features;
253  // The mapped features of all the samples.
254  BitVector cloud_features;
255 
256  // Caches for ClusterDistance.
257  // Caches for other fonts but matching this unichar. -1 indicates not set.
258  // Indexed by compact font index from font_id_map_.
259  GenericVector<float> font_distance_cache;
260  // Caches for other unichars but matching this font. -1 indicates not set.
// NOTE(review): presumably indexed by unichar id - confirm.
261  GenericVector<float> unichar_distance_cache;
262  // Cache for the rest (non matching font and unichar.)
263  // A cache of distances computed by ReliablySeparable.
// NOTE(review): the line above conflicts with the "Caches for
// ClusterDistance." heading; ReliablySeparable is declared const and
// returns an int count, whereas ClusterDistance is documented as "finds in
// cache or computes and caches". Presumably this cache also serves
// ClusterDistance - confirm and correct the comment.
264  GenericVector<FontClassDistance> distance_cache;
265  };
266 
267  PointerVector<TrainingSample> samples_;
268  // Number of samples before replication/randomization.
269  int num_raw_samples_;
270  // Character set we are training for.
271  UNICHARSET unicharset_;
272  // Character set size to which the 2-d arrays below refer.
273  int unicharset_size_;
274  // Map to allow the font_class_array_ below to be compact.
275  // The sparse space is the real font_id, used in samples_ .
276  // The compact space is an index to font_class_array_
277  IndexMapBiDi font_id_map_;
278  // A 2-d array of FontClassInfo holding information related to each
279  // (font_id, class_id) pair.
280  GENERIC_2D_ARRAY<FontClassInfo>* font_class_array_;
281 
282  // Reference to the fontinfo_table_ in MasterTrainer. Provides names
283  // for font_ids in the samples. Not serialized!
284  const UnicityTable<FontInfo>& fontinfo_table_;
285 };
286 
287 } // namespace tesseract.
288 
289 
290 #endif // TESSERACT_TRAINING_TRAININGSAMPLESET_H__