Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
errorcounter.h
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
17 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
18 
19 #include "genericvector.h"
20 #include "matrix.h"
21 
22 struct Pix;
23 template <typename T> class UnicityTable;
24 
25 namespace tesseract {
26 
27 struct FontInfo;
28 class SampleIterator;
29 class ShapeClassifier;
30 class ShapeRating;
31 class ShapeTable;
32 class TrainingSample;
33 
34 // Enumeration of the different types of error count.
35 // Error counts work as follows:
36 //
37 // Ground truth is a valid unichar-id / font-id pair:
38 //        Number of classifier answers?
39 //          0                       >0
40 //     CT_REJECT       BOTH unichar-id and font-id match top shape?
41 //     __________             yes!                      no
42 //                   CT_SHAPE_TOP_CORRECT      CT_SHAPE_TOP_ERR
43 //                             |         Font attributes match?
44 //                             |           yes!          no
45 //                             |             |       CT_FONT_ATTR_ERR
46 //                             |       Top unichar-id matches?
47 //                             |         yes!         no
48 //                  Top shape-id has multiple unichars?  CT_UNICHAR_TOP1_ERR
49 //                    yes!              no        2nd shape unichar id matches?
50 //         CT_OK_MULTI_UNICHAR     ________         yes!       no
51 //         ___________________     _____                 CT_UNICHAR_TOP2_ERR
52 //                                       Any unichar-id matches?
53 //                                         yes!     no
54 //                                        ______    CT_UNICHAR_TOPN_ERR
55 //                                                  _________________
56 // Note that multiple counts may be activated for a single sample!
57 //
58 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
59 // This is called junk and is expected to be rejected:
60 //        Number of classifier answers?
61 //          0                       >0
62 //     CT_REJECTED_JUNK        CT_ACCEPTED_JUNK
63 //
64 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
65 // the mean rank of the correct result, counting from 0, and with an error
66 // receiving the number of answers as the correct rank.
67 //
68 // Keep in sync with the ReportString function.
69 enum CountTypes {
70  CT_SHAPE_TOP_CORRECT, // Top shape matches both unichar-id and font-id.
71  CT_SHAPE_TOP_ERR, // Top shape fails to match unichar-id and/or font-id.
72  CT_FONT_ATTR_ERR, // Font attributes incorrect, ignoring unichar.
73  CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id.
74  CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id.
75  CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id.
76  CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others.
77  CT_REJECT, // Classifier produced no answers for a valid sample.
78  CT_NUM_RESULTS, // Number of answers produced (reported as a mean).
79  CT_RANK, // Rank of correct answer, from 0; an error ranks as the answer count.
80  CT_REJECTED_JUNK, // Junk that was correctly rejected (no answers).
81  CT_ACCEPTED_JUNK, // Junk that incorrectly received one or more answers.
82 
83  CT_SIZE // Number of types for array sizing; must remain the last entry.
84 };
85 
86 // Class to encapsulate all the functionality and sub-structures required
87 // to count errors for an isolated character classifier (ShapeClassifier).
88 class ErrorCounter {
89  public:
90  // Computes and returns the unweighted boosting_mode error rate of the given
91  // classifier. Can be used for testing, or inside an iterative training
92  // system, including one that uses boosting.
93  // report_levels:
94  // 0 = no output.
95  // 1 = bottom-line error rate.
96  // 2 = bottom-line error rate + time.
97  // 3 = font-level error rate + time.
98  // 4 = list of all errors + short classifier debug output on 16 errors.
99  // 5 = list of all errors + short classifier debug output on 25 errors.
100  // * The boosting_mode determines which error type is used for computing the
101  // scaled_error output, and setting the is_error flag in the samples.
102  // * The fontinfo_table is used to get string font names for the debug
103  // output, and also to count font attributes errors.
104  // * The page_images vector may contain a Pix* (which may be NULL) for each
105  // page index assigned to the samples.
106  // * The it provides encapsulated iteration over some sample set.
107  // * The outputs unichar_error, scaled_error and totals_report are all
108  // optional.
109  // * If not NULL, unichar error gets the top1 unichar error rate.
110  // * Scaled_error gets the error chosen by boosting_mode weighted by the
111  // weights on the samples.
112  // * Fonts_report gets a string summarizing the error rates for each font in
113  // both human-readable form and as a tab-separated list of error counts.
114  // The human-readable form is all before the first tab.
115  // * The return value is the un-weighted version of the scaled_error.
116  static double ComputeErrorRate(ShapeClassifier* classifier,
117  int report_level, CountTypes boosting_mode,
118  const UnicityTable<FontInfo>& fontinfo_table,
119  const GenericVector<Pix*>& page_images,
120  SampleIterator* it,
121  double* unichar_error,
122  double* scaled_error,
123  STRING* fonts_report);
124 
125  private:
126  // Simple struct to hold an array of counts.
127  struct Counts {
128  Counts();
129  // Adds other into this for computing totals.
130  void operator+=(const Counts& other);
131 
132  int n[CT_SIZE];
133  };
134 
135  // Constructor is private. Only anticipated use of ErrorCounter is via
136  // the static ComputeErrorRate.
137  ErrorCounter(int charsetsize, int shapesize, int fontsize);
138  ~ErrorCounter();
139 
140  // Accumulates the errors from the classifier results on a single sample.
141  // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
142  // boosting_mode selects the type of error to be used for boosting and the
143  // is_error_ member of sample is set according to whether the required type
144  // of error occurred. The font_table provides access to font properties
145  // for error counting and shape_table is used to understand the relationship
146  // between unichar_ids and shape_ids in the results
147  bool AccumulateErrors(bool debug, CountTypes boosting_mode,
148  const UnicityTable<FontInfo>& font_table,
149  const ShapeTable& shape_table,
150  const GenericVector<ShapeRating>& results,
152 
153  // Accumulates counts for junk. Counts only whether the junk was correctly
154  // rejected or not.
155  void AccumulateJunk(const ShapeTable& shape_table,
156  const GenericVector<ShapeRating>& results,
157  TrainingSample* sample);
158 
159  // Creates a report of the error rate. The report_level controls the detail
160  // that is reported to stderr via tprintf:
161  // 0 -> no output.
162  // >=1 -> bottom-line error rate.
163  // >=3 -> font-level error rate.
164  // boosting_mode determines the return value. It selects which (un-weighted)
165  // error rate to return.
166  // The fontinfo_table from MasterTrainer provides the names of fonts.
167  // The it determines the current subset of the training samples.
168  // If not NULL, the top-choice unichar error rate is saved in unichar_error.
169  // If not NULL, the report string is saved in fonts_report.
170  // (Ignoring report_level).
171  double ReportErrors(int report_level, CountTypes boosting_mode,
172  const UnicityTable<FontInfo>& fontinfo_table,
173  const SampleIterator& it,
174  double* unichar_error,
175  STRING* fonts_report);
176 
177  // Sets the report string to a combined human and machine-readable report
178  // string of the error rates.
179  // Returns false if there is no data, leaving report unchanged.
180  static bool ReportString(const Counts& counts, STRING* report);
181 
182  // Computes the error rates and returns in rates which is an array of size
183  // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
184  static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);
185 
186 
187  // Total scaled error used by boosting algorithms.
188  double scaled_error_;
189  // Vector indexed by font_id from the samples of error accumulators.
190  GenericVector<Counts> font_counts_;
191  // Counts of the results that map each unichar_id (from samples) to an
192  // incorrect shape_id.
193  GENERIC_2D_ARRAY<int> unichar_counts_;
194 };
195 
196 } // namespace tesseract.
197 
198 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */