Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
params_training_featdef.h
Go to the documentation of this file.
1 
2 // File: params_training_featdef.h
3 // Description: Feature definitions for params training.
4 // Author: Rika Antonova
5 // Created: Mon Nov 28 11:26:42 PDT 2011
6 //
7 // (C) Copyright 2011, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
21 #define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
22 
23 #include "genericvector.h"
24 #include "strngs.h"
25 
26 namespace tesseract {
27 
28 // Raw features extracted from a single OCR hypothesis.
29 // The features are non-normalized real-valued quantities with
30 // unbounded range and unknown distribution.
31 // Normalization / binarization of these features is done at a later stage.
32 // Note: when adding new fields to this enum make sure to modify the
33 // kParamsTrainingRawFeatureTypeName array accordingly.
35  // What dictionary (if any) was this hypothesis found in.
36  // See PermuterType enum in ccstruct/ratngs.h for interpretation.
38  // Boolean indicator of whether this hypothesis is ambiguous to a known
39  // dictionary word (or a valid number pattern).
41  // Shape cost of the segmentation path for this hypothesis.
43  // Character ngram probability of the string of unichars of this hypothesis.
45  // Number of bad/inconsistent spots in this hypothesis.
52  // Classifier-related features.
55  // Number of classifier results that came from adapted templates.
57  // Features potentially useful for normalization.
60 
62 };
63 
// Printable names of the raw feature types, one string per entry.
// The number in each trailing comment is the entry's index in this array.
// NOTE(review): presumably indexed by the raw feature type enum above (its
// enumerators are not visible here), so the order must stay in sync with it.
64 static const char * const kParamsTrainingRawFeatureTypeName[] = {
65  "DICT_MATCH_TYPE", // 0
66  "UNAMBIG_DICT_MATCH", // 1
67  "SHAPE_COST", // 2
68  "NGRAM_PROB", // 3
69  "NUM_BAD_PUNC", // 4
70  "NUM_BAD_CASE", // 5
71  "NUM_BAD_CHAR_TYPE", // 6
72  "NUM_BAD_SPACING", // 7
73  "NUM_BAD_SCRIPT", // 8
74  "NUM_BAD_FONT", // 9
75  "WORST_CERT", // 10
76  "RATING", // 11
77  "ADAPTED", // 12
78  "NUM_UNICHARS", // 13
79  "OUTLINE_LEN", // 14
80 };
81 
82 // Entry with features extracted from a single OCR hypothesis for a word.
85  for (int i = 0; i < PTRAIN_NUM_RAW_FEATURE_TYPES; ++i) features[i] = 0.0;
86  }
88  STRING str; // string corresponding to word hypothesis (for debugging)
89 };
90 
91 // A list of hypotheses explored during one run of segmentation search.
93 
94 // A bundle that accumulates all of the hypothesis lists explored during all
95 // of the runs of segmentation search on a word (e.g. a list of hypotheses
96 // explored on PASS1, PASS2, fix xheight pass, etc).
98  public:
100  // Starts a new hypothesis list.
101  // Should be called at the beginning of a new run of the segmentation search.
104  }
105  // Adds a new ParamsTrainingHypothesis to the current hypothesis list
106  // and returns the reference to the newly added entry.
110  return hyp_list_vec.back().back();
111  }
112 
114 };
115 
116 } // namespace tesseract
117 
118 #endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_