Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data as formatted for training,
16 // but doesn't have to be the same as the training data.
17 // Author: Ray Smith
18 
19 #ifndef USE_STD_NAMESPACE
20 #include "base/commandlineflags.h"
21 #endif
22 #include "baseapi.h"
23 #include "commontraining.h"
24 #include "cubeclassifier.h"
25 #include "mastertrainer.h"
26 #include "params.h"
27 #include "strngs.h"
28 #include "tessclassifier.h"
29 
// Command-line flags consumed by main() below:
//   FLAGS_classifier   - compared against the entries of names[] to select
//                        which classifier to test (default "" matches none).
//   FLAGS_lang         - language string passed to api.Init().
//   FLAGS_tessdata_dir - data path string passed to api.Init().
30 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
31 STRING_PARAM_FLAG(lang, "eng", "Language to test");
32 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
33 
40 };
41 
// Accepted spellings of the -classifier flag. main() scans indices
// 0..CN_COUNT-1 of this array and casts the matching index to ClassifierName,
// so the order of the strings must line up with the enumerators of that enum.
// NOTE(review): the enum definition is not visible in this listing; confirm
// that CN_COUNT equals the number of non-NULL entries, otherwise the lookup
// loop would hand the NULL terminator to strcmp().
42 const char* names[] = {"pruner", "full", "cube", "cubetess", NULL };
43 
44 // This program has complex setup requirements, so here is some help:
45 // Two different modes, tr files and serialized mastertrainer.
46 // From tr files:
47 // classifier_tester -U unicharset -F font_properties -X xheights
48 // -classifier x -lang lang [-output_trainer trainer] *.tr
49 // From a serialized trainer:
50 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
51 //
52 // In the first case, the unicharset must be the unicharset from within
53 // the classifier under test, and the font_properties and xheights files must
54 // match the files used during training.
55 // In the second case, the trainer file must have been prepared from
56 // some previous run of shapeclustering, mftraining, or classifier_tester
57 // using the same conditions as above, i.e. matching unicharset/font_properties.
58 //
59 // Available values of classifier (x above) are:
60 // pruner : Tesseract class pruner only.
61 // full : Tesseract full classifier.
62 // cube : Cube classifier. (Not possible with an input trainer.)
63 // cubetess : Tesseract class pruner with rescoring by Cube. (Not possible
64 // with an input trainer.)
// Entry point: selects a classifier by name, loads the training samples,
// initializes Tesseract, and runs the trainer's classifier test over them.
// Returns 1 if the classifier name is not recognized, if Tesseract fails to
// initialize, or if no tester exists for the chosen classifier; returns 0
// otherwise.
// NOTE(review): this listing is missing several original lines (81, 86,
// 89-90, 92 and 99 by the listing's own numbering), so some statements below
// appear truncated. The gaps are flagged where they occur.
65 int main(int argc, char **argv) {
66  ParseArguments(&argc, &argv);
67  // Decode the classifier string.
// CN_COUNT doubles as the "no match" sentinel: it is the initial value and
// is only overwritten when one of names[0..CN_COUNT-1] equals the flag.
68  ClassifierName classifier = CN_COUNT;
69  for (int c = 0; c < CN_COUNT; ++c) {
70  if (strcmp(FLAGS_classifier.c_str(), names[c]) == 0) {
71  classifier = static_cast<ClassifierName>(c);
72  break;
73  }
74  }
75  if (classifier == CN_COUNT) {
76  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
77  return 1;
78  }
79 
80  STRING file_prefix;
// NOTE(review): listing line 81 is missing here. The argument list on the
// next line presumably completes the call that declares and assigns
// `trainer` (used below) -- confirm against the original file.
82  argc, argv, true, NULL, &file_prefix);
// NOTE(review): `trainer` is dereferenced immediately with no null check
// visible in this listing; verify that the loader cannot return NULL.
83  // We want to test junk as well if it is available.
84  trainer->IncludeJunk();
85  // We want to test with replicated samples too.
// NOTE(review): listing line 86 (presumably the call the comment above
// describes) is missing from this extract.
87 
88  // We need to initialize tesseract to test.
// NOTE(review): listing lines 89-90 are missing; `api` and `engine_mode`
// used below are presumably declared there. Line 92, the body of the
// following `if`, is also missing -- as shown, the `if` on line 91 would
// govern the `if` on line 93, which is unlikely to be the original intent.
91  if (classifier == CN_CUBE || classifier == CN_CUBETESS)
93  if (api.Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
94  engine_mode) < 0) {
95  fprintf(stderr, "Tesseract initialization failed!\n");
// NOTE(review): this return skips the `delete trainer` at the end of main.
96  return 1;
97  }
98  tesseract::ShapeClassifier* shape_classifier = NULL;
// NOTE(review): listing line 99 is missing; the const_cast below presumably
// initializes the `tesseract` pointer that is used on the following lines.
100  const_cast<tesseract::Tesseract*>(api.tesseract());
// NOTE(review): reinterpret_cast between class pointer types. If Tesseract
// derives from Classify, a static_cast (or implicit conversion) would be the
// safer spelling -- confirm the class hierarchy.
101  tesseract::Classify* classify =
102  reinterpret_cast<tesseract::Classify*>(tesseract);
103  // Copy the shape_table from the classifier and add the space character if
104  // not already present to count junk.
// NOTE(review): in the lines visible here, the local `shape_table` is filled
// in but never passed to anything afterwards; check whether it is dead code.
105  tesseract::ShapeTable shape_table;
106  shape_table.set_unicharset(classify->shape_table()->unicharset());
107  shape_table.AppendMasterShapes(*classify->shape_table());
108  if (shape_table.FindShape(0, -1) < 0)
109  shape_table.AddShape(0, 0);
// Build the classifier under test. The two TessClassifier variants differ
// only in the boolean constructor argument (true for "pruner", false for
// "full"); the cube variants are constructed from the Tesseract pointer.
110  if (classifier == CN_PRUNER) {
111  shape_classifier = new tesseract::TessClassifier(true, classify);
112  } else if (classifier == CN_FULL) {
113  shape_classifier = new tesseract::TessClassifier(false, classify);
114  } else if (classifier == CN_CUBE) {
115  shape_classifier = new tesseract::CubeClassifier(tesseract);
116  } else if (classifier == CN_CUBETESS) {
117  shape_classifier = new tesseract::CubeTessClassifier(tesseract);
118  } else {
119  fprintf(stderr, "%s tester not yet implemented\n",
120  FLAGS_classifier.c_str());
// NOTE(review): this return also skips `delete trainer`.
121  return 1;
122  }
123  tprintf("Testing classifier %s:\n", FLAGS_classifier.c_str());
// First pass: second argument false. Going by the comment on line 126, that
// argument presumably selects replicated samples -- confirm against
// MasterTrainer::TestClassifierOnSamples.
124  trainer->TestClassifierOnSamples(3, false, shape_classifier, NULL);
// The second pass is skipped for both cube-based classifiers.
125  if (classifier != CN_CUBE && classifier != CN_CUBETESS) {
126  // Test with replicated samples as well.
127  trainer->TestClassifierOnSamples(3, true, shape_classifier, NULL);
128  }
129  delete shape_classifier;
130  delete trainer;
131 
132  return 0;
133 } /* main */
134 
135 
136 
137 
138 
139