Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
classify.h
Go to the documentation of this file.
1
2
// File: classify.h
3
// Description: classify class.
4
// Author: Samuel Charron
5
//
6
// (C) Copyright 2006, Google Inc.
7
// Licensed under the Apache License, Version 2.0 (the "License");
8
// you may not use this file except in compliance with the License.
9
// You may obtain a copy of the License at
10
// http://www.apache.org/licenses/LICENSE-2.0
11
// Unless required by applicable law or agreed to in writing, software
12
// distributed under the License is distributed on an "AS IS" BASIS,
13
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
// See the License for the specific language governing permissions and
15
// limitations under the License.
16
//
18
19
#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
20
#define TESSERACT_CLASSIFY_CLASSIFY_H__
21
22
#include "
adaptive.h
"
23
#include "
ccstruct.h
"
24
#include "
classify.h
"
25
#include "
dict.h
"
26
#include "
featdefs.h
"
27
#include "
fontinfo.h
"
28
#include "
intfx.h
"
29
#include "
intmatcher.h
"
30
#include "
normalis.h
"
31
#include "
ratngs.h
"
32
#include "
ocrfeatures.h
"
33
#include "
unicity_table.h
"
34
35
class
ScrollView
;
36
class
WERD_CHOICE
;
37
class
WERD_RES
;
38
struct
ADAPT_RESULTS
;
39
struct
NORM_PROTOS
;
40
41
// Negative sentinel values used in place of a real font-info id.
// NOTE(review): the names suggest "font not known" and "blank / no font";
// the code that consumes them is outside this header - confirm there.
static const int kUnknownFontinfoId = -1;
static const int kBlankFontinfoId = -2;
43
44
namespace
tesseract
{
45
46
struct
ShapeRating;
47
class
ShapeTable;
48
49
// How segmented is a blob. In this enum, character refers to a classifiable
// unit, but that is too long and character is usually easier to understand.
// Enumerators keep their implicit values 0..3 in declaration order.
enum CharSegmentationType {
  CST_FRAGMENT,  // A partial character.
  CST_WHOLE,     // A correctly segmented character.
  CST_IMPROPER,  // More than one but less than 2 characters.
  CST_NGRAM      // Multiple characters.
};
57
58
class
Classify
:
public
CCStruct
{
59
public
:
60
Classify
();
61
virtual
~Classify
();
62
Dict
&
getDict
() {
63
return
dict_;
64
}
65
66
const
ShapeTable
*
shape_table
()
const
{
67
return
shape_table_
;
68
}
69
70
/* adaptive.cpp ************************************************************/
71
ADAPT_TEMPLATES
NewAdaptedTemplates
(
bool
InitFromUnicharset);
72
int
GetFontinfoId
(
ADAPT_CLASS
Class,
uinT8
ConfigId);
73
// Runs the class pruner from int_templates on the given features, returning
74
// the number of classes output in results.
75
// int_templates Class pruner tables
76
// num_features Number of features in blob
77
// features Array of features
78
// normalization_factors (input) Array of int_templates->NumClasses fudge
79
// factors from blob normalization process.
80
// (Indexed by CLASS_INDEX)
81
// expected_num_features (input) Array of int_templates->NumClasses
82
// expected number of features for each class.
83
// (Indexed by CLASS_INDEX)
84
// results (output) Sorted Array of pruned classes.
85
// Array must be sized to take the maximum possible
86
// number of outputs : int_templates->NumClasses.
87
int
PruneClasses
(
const
INT_TEMPLATES_STRUCT
* int_templates,
88
int
num_features,
89
const
INT_FEATURE_STRUCT
* features,
90
const
uinT8
* normalization_factors,
91
const
uinT16
* expected_num_features,
92
CP_RESULT_STRUCT
* results);
93
void
ReadNewCutoffs
(FILE *CutoffFile,
bool
swap,
inT64
end_offset,
94
CLASS_CUTOFF_ARRAY
Cutoffs);
95
void
PrintAdaptedTemplates
(FILE *File,
ADAPT_TEMPLATES
Templates);
96
void
WriteAdaptedTemplates
(FILE *File,
ADAPT_TEMPLATES
Templates);
97
ADAPT_TEMPLATES
ReadAdaptedTemplates
(FILE *File);
98
/* normmatch.cpp ************************************************************/
99
FLOAT32
ComputeNormMatch
(
CLASS_ID
ClassId,
100
const
FEATURE_STRUCT
& feature,
BOOL8
DebugMatch);
101
void
FreeNormProtos
();
102
NORM_PROTOS
*
ReadNormProtos
(FILE *File,
inT64
end_offset);
103
/* protos.cpp ***************************************************************/
104
void
ReadClassFile
();
105
void
ConvertProto
(
PROTO
Proto,
int
ProtoId,
INT_CLASS
Class);
106
INT_TEMPLATES
CreateIntTemplates
(
CLASSES
FloatProtos,
107
const
UNICHARSET
& target_unicharset);
108
/* adaptmatch.cpp ***********************************************************/
109
110
// Learn the given word using its chopped_word, seam_array, denorm,
111
// box_word, best_state, and correct_text to learn both correctly and
112
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
113
// is called and the data will be written to a file for static training.
114
// Otherwise AdaptToBlob is called for adaption within a document.
115
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
116
// be learned, otherwise all chars with good correct_text are learned.
117
void
LearnWord
(
const
char
*
filename
,
const
char
*rejmap,
WERD_RES
*word);
118
119
// Builds a blob of length fragments, from the word, starting at start,
120
// and then learn it, as having the given correct_text.
121
// If filename is not NULL, then LearnBlob
122
// is called and the data will be written to a file for static training.
123
// Otherwise AdaptToBlob is called for adaption within a document.
124
// threshold is a magic number required by AdaptToChar and generated by
125
// GetAdaptThresholds.
126
// Although it can be partly inferred from the string, segmentation is
127
// provided to explicitly clarify the character segmentation.
128
void
LearnPieces
(
const
char
*
filename
,
int
start,
int
length,
129
float
threshold,
CharSegmentationType
segmentation,
130
const
char
* correct_text,
WERD_RES
*word);
131
void
InitAdaptiveClassifier
(
bool
load_pre_trained_templates);
132
void
InitAdaptedClass
(
TBLOB
*Blob,
133
const
DENORM
& denorm,
134
CLASS_ID
ClassId,
135
int
FontinfoId,
136
ADAPT_CLASS
Class,
137
ADAPT_TEMPLATES
Templates);
138
void
AdaptToPunc
(
TBLOB
*Blob,
139
const
DENORM
& denorm,
140
CLASS_ID
ClassId,
141
int
FontinfoId,
142
FLOAT32
Threshold);
143
void
AmbigClassifier
(
TBLOB
*Blob,
144
const
DENORM
& denorm,
145
INT_TEMPLATES
Templates,
146
ADAPT_CLASS
*Classes,
147
UNICHAR_ID
*Ambiguities,
148
ADAPT_RESULTS
*Results);
149
void
MasterMatcher
(
INT_TEMPLATES
templates,
150
inT16
num_features,
151
const
INT_FEATURE_STRUCT
* features,
152
const
uinT8
* norm_factors,
153
ADAPT_CLASS
* classes,
154
int
debug,
155
int
num_classes,
156
const
TBOX
& blob_box,
157
CLASS_PRUNER_RESULTS
results,
158
ADAPT_RESULTS
* final_results);
159
// Converts configs to fonts, and if the result is not adapted, and a
160
// shape_table_ is present, the shape is expanded to include all
161
// unichar_ids represented, before applying a set of corrections to the
162
// distance rating in int_result, (see ComputeCorrectedRating.)
163
// The results are added to the final_results output.
164
void
ExpandShapesAndApplyCorrections
(
ADAPT_CLASS
* classes,
165
bool
debug,
166
int
class_id,
167
int
bottom,
int
top,
168
float
cp_rating,
169
int
blob_length,
170
const
uinT8
* cn_factors,
171
INT_RESULT_STRUCT
& int_result,
172
ADAPT_RESULTS
* final_results);
173
// Applies a set of corrections to the distance im_rating,
174
// including the cn_correction, miss penalty and additional penalty
175
// for non-alnums being vertical misfits. Returns the corrected distance.
176
double
ComputeCorrectedRating
(
bool
debug,
int
unichar_id,
double
cp_rating,
177
double
im_rating,
int
feature_misses,
178
int
bottom,
int
top,
179
int
blob_length,
const
uinT8
* cn_factors);
180
void
ConvertMatchesToChoices
(
const
DENORM
& denorm,
const
TBOX
& box,
181
ADAPT_RESULTS
*Results,
182
BLOB_CHOICE_LIST *Choices);
183
void
AddNewResult
(
ADAPT_RESULTS
*results,
184
CLASS_ID
class_id,
185
int
shape_id,
186
FLOAT32
rating,
187
bool
adapted,
188
int
config,
189
int
fontinfo_id,
190
int
fontinfo_id2);
191
int
GetAdaptiveFeatures
(
TBLOB
*Blob,
192
INT_FEATURE_ARRAY
IntFeatures,
193
FEATURE_SET
*FloatFeatures);
194
195
#ifndef GRAPHICS_DISABLED
196
void
DebugAdaptiveClassifier
(
TBLOB
*Blob,
197
const
DENORM
& denorm,
198
ADAPT_RESULTS
*Results);
199
#endif
200
void
GetAdaptThresholds
(
TWERD
* Word,
201
const
DENORM
& denorm,
202
const
WERD_CHOICE
& BestChoice,
203
const
WERD_CHOICE
& BestRawChoice,
204
FLOAT32
Thresholds[]);
205
206
PROTO_ID
MakeNewTempProtos
(
FEATURE_SET
Features,
207
int
NumBadFeat,
208
FEATURE_ID
BadFeat[],
209
INT_CLASS
IClass,
210
ADAPT_CLASS
Class,
211
BIT_VECTOR
TempProtoMask
);
212
int
MakeNewTemporaryConfig
(
ADAPT_TEMPLATES
Templates,
213
CLASS_ID
ClassId,
214
int
FontinfoId,
215
int
NumFeatures,
216
INT_FEATURE_ARRAY
Features,
217
FEATURE_SET
FloatFeatures);
218
void
MakePermanent
(
ADAPT_TEMPLATES
Templates,
219
CLASS_ID
ClassId,
220
int
ConfigId,
221
const
DENORM
& denorm,
222
TBLOB
*Blob);
223
void
PrintAdaptiveMatchResults
(FILE *File,
ADAPT_RESULTS
*Results);
224
void
RemoveExtraPuncs
(
ADAPT_RESULTS
*Results);
225
void
RemoveBadMatches
(
ADAPT_RESULTS
*Results);
226
void
SetAdaptiveThreshold
(
FLOAT32
Threshold);
227
void
ShowBestMatchFor
(
TBLOB
*Blob,
228
const
DENORM
& denorm,
229
CLASS_ID
ClassId,
230
int
shape_id,
231
BOOL8
AdaptiveOn,
232
BOOL8
PreTrainedOn,
233
ADAPT_RESULTS
*Results);
234
// Returns a string for the classifier class_id: either the corresponding
235
// unicharset debug_str or the shape_table_ debug str.
236
STRING
ClassIDToDebugStr
(
const
INT_TEMPLATES_STRUCT
* templates,
237
int
class_id,
int
config_id)
const
;
238
// Converts a classifier class_id index with a config ID to:
239
// shape_table_ present: a shape_table_ index OR
240
// No shape_table_: a font ID.
241
// Without shape training, each class_id, config pair represents a single
242
// unichar id/font combination, so this function looks up the corresponding
243
// font id.
244
// With shape training, each class_id, config pair represents a single
245
// shape table index, so the fontset_table stores the shape table index,
246
// and the shape_table_ must be consulted to obtain the actual unichar_id/
247
// font combinations that the shape represents.
248
int
ClassAndConfigIDToFontOrShapeID
(
int
class_id,
249
int
int_result_config)
const
;
250
// Converts a shape_table_ index to a classifier class_id index (not a
251
// unichar-id!). Uses a search, so not fast.
252
int
ShapeIDToClassID
(
int
shape_id)
const
;
253
UNICHAR_ID
*
BaselineClassifier
(
TBLOB
*Blob,
254
const
DENORM
& denorm,
255
ADAPT_TEMPLATES
Templates,
256
ADAPT_RESULTS
*Results);
257
int
CharNormClassifier
(
TBLOB
*Blob,
258
const
DENORM
& denorm,
259
INT_TEMPLATES
Templates,
260
ADAPT_RESULTS
*Results);
261
262
// As CharNormClassifier, but operates on a TrainingSample and outputs to
263
// a GenericVector of ShapeRating without conversion to classes.
264
int
CharNormTrainingSample
(
bool
pruner_only,
const
TrainingSample
&
sample
,
265
GenericVector<ShapeRating>
* results);
266
UNICHAR_ID
*
GetAmbiguities
(
TBLOB
*Blob,
267
const
DENORM
& denorm,
268
CLASS_ID
CorrectClass);
269
void
DoAdaptiveMatch
(
TBLOB
*Blob,
270
const
DENORM
& denorm,
271
ADAPT_RESULTS
*Results);
272
void
AdaptToChar
(
TBLOB
*Blob,
273
const
DENORM
& denorm,
274
CLASS_ID
ClassId,
275
int
FontinfoId,
276
FLOAT32
Threshold);
277
void
DisplayAdaptedChar
(
TBLOB
* blob,
const
DENORM
& denorm,
278
INT_CLASS_STRUCT
* int_class);
279
int
AdaptableWord
(
TWERD
*Word,
280
const
WERD_CHOICE
&BestChoiceWord,
281
const
WERD_CHOICE
&RawChoiceWord);
282
void
EndAdaptiveClassifier
();
283
void
PrintAdaptiveStatistics
(FILE *File);
284
void
SettupPass1
();
285
void
SettupPass2
();
286
void
AdaptiveClassifier
(
TBLOB
*Blob,
287
const
DENORM
& denorm,
288
BLOB_CHOICE_LIST *Choices,
289
CLASS_PRUNER_RESULTS
cp_results);
290
void
ClassifyAsNoise
(
ADAPT_RESULTS
*Results);
291
void
ResetAdaptiveClassifierInternal
();
292
293
int
GetBaselineFeatures
(
TBLOB
*Blob,
294
const
DENORM
& denorm,
295
INT_TEMPLATES
Templates,
296
INT_FEATURE_ARRAY
IntFeatures,
297
uinT8
* CharNormArray,
298
inT32
*BlobLength);
299
int
GetCharNormFeatures
(
TBLOB
*Blob,
300
const
DENORM
& denorm,
301
INT_TEMPLATES
Templates,
302
INT_FEATURE_ARRAY
IntFeatures,
303
uinT8
* PrunerNormArray,
304
uinT8
* CharNormArray,
305
inT32
*BlobLength,
306
inT32
*FeatureOutlineIndex);
307
// Computes the char_norm_array for the unicharset and, if not NULL, the
308
// pruner_array as appropriate according to the existence of the shape_table.
309
// The norm_feature is deleted as it is almost certainly no longer needed.
310
void
ComputeCharNormArrays
(
FEATURE_STRUCT
* norm_feature,
311
INT_TEMPLATES_STRUCT
* templates,
312
uinT8
* char_norm_array,
313
uinT8
* pruner_array);
314
315
bool
TempConfigReliable
(
CLASS_ID
class_id,
const
TEMP_CONFIG
&config);
316
void
UpdateAmbigsGroup
(
CLASS_ID
class_id,
const
DENORM
& denorm,
TBLOB
*Blob);
317
318
void
ResetFeaturesHaveBeenExtracted
();
319
bool
AdaptiveClassifierIsFull
() {
return
NumAdaptationsFailed > 0; }
320
bool
LooksLikeGarbage
(
const
DENORM
& denorm,
TBLOB
*blob);
321
void
RefreshDebugWindow
(
ScrollView
**win,
const
char
*msg,
322
int
y_offset,
const
TBOX
&wbox);
323
/* float2int.cpp ************************************************************/
324
void
ClearCharNormArray
(
uinT8
* char_norm_array);
325
void
ComputeIntCharNormArray
(
const
FEATURE_STRUCT
& norm_feature,
326
uinT8
* char_norm_array);
327
void
ComputeIntFeatures
(
FEATURE_SET
Features,
INT_FEATURE_ARRAY
IntFeatures);
328
/* intproto.cpp *************************************************************/
329
INT_TEMPLATES
ReadIntTemplates
(FILE *File);
330
void
WriteIntTemplates
(FILE *File,
INT_TEMPLATES
Templates,
331
const
UNICHARSET
& target_unicharset);
332
CLASS_ID
GetClassToDebug
(
const
char
*Prompt,
bool
* adaptive_on,
333
bool
* pretrained_on,
int
* shape_id);
334
void
ShowMatchDisplay
();
335
/* font detection ***********************************************************/
336
UnicityTable<FontInfo>
&
get_fontinfo_table
() {
337
return
fontinfo_table_
;
338
}
339
UnicityTable<FontSet>
&
get_fontset_table
() {
340
return
fontset_table_
;
341
}
342
/* mfoutline.cpp ***********************************************************/
343
void
NormalizeOutlines
(
LIST
Outlines,
FLOAT32
*XScale,
FLOAT32
*YScale);
344
/* outfeat.cpp ***********************************************************/
345
FEATURE_SET
ExtractOutlineFeatures
(
TBLOB
*Blob);
346
/* picofeat.cpp ***********************************************************/
347
FEATURE_SET
ExtractPicoFeatures
(
TBLOB
*Blob);
348
349
350
// Member variables.
351
352
// Parameters.
353
BOOL_VAR_H
(
prioritize_division
,
FALSE
,
354
"Prioritize blob division over chopping"
);
355
INT_VAR_H
(
tessedit_single_match
,
FALSE
,
"Top choice only from CP"
);
356
BOOL_VAR_H
(
classify_enable_learning
,
true
,
"Enable adaptive classifier"
);
357
INT_VAR_H
(
classify_debug_level
, 0,
"Classify debug level"
);
358
359
/* mfoutline.cpp ***********************************************************/
360
/* control knobs used to control normalization of outlines */
361
INT_VAR_H
(
classify_norm_method
,
character
,
"Normalization Method ..."
);
362
double_VAR_H
(
classify_char_norm_range
, 0.2,
363
"Character Normalization Range ..."
);
364
double_VAR_H
(
classify_min_norm_scale_x
, 0.0,
"Min char x-norm scale ..."
);
365
double_VAR_H
(
classify_max_norm_scale_x
, 0.325,
"Max char x-norm scale ..."
);
366
double_VAR_H
(
classify_min_norm_scale_y
, 0.0,
"Min char y-norm scale ..."
);
367
double_VAR_H
(
classify_max_norm_scale_y
, 0.325,
"Max char y-norm scale ..."
);
368
369
/* adaptmatch.cpp ***********************************************************/
370
BOOL_VAR_H
(
tess_cn_matching
, 0,
"Character Normalized Matching"
);
371
BOOL_VAR_H
(
tess_bn_matching
, 0,
"Baseline Normalized Matching"
);
372
BOOL_VAR_H
(
classify_enable_adaptive_matcher
, 1,
"Enable adaptive classifier"
);
373
BOOL_VAR_H
(
classify_use_pre_adapted_templates
, 0,
374
"Use pre-adapted classifier templates"
);
375
BOOL_VAR_H
(
classify_save_adapted_templates
, 0,
376
"Save adapted templates to a file"
);
377
BOOL_VAR_H
(
classify_enable_adaptive_debugger
, 0,
"Enable match debugger"
);
378
INT_VAR_H
(
matcher_debug_level
, 0,
"Matcher Debug Level"
);
379
INT_VAR_H
(
matcher_debug_flags
, 0,
"Matcher Debug Flags"
);
380
INT_VAR_H
(
classify_learning_debug_level
, 0,
"Learning Debug Level: "
);
381
double_VAR_H
(
matcher_good_threshold
, 0.125,
"Good Match (0-1)"
);
382
double_VAR_H
(
matcher_great_threshold
, 0.0,
"Great Match (0-1)"
);
383
double_VAR_H
(
matcher_perfect_threshold
, 0.02,
"Perfect Match (0-1)"
);
384
double_VAR_H
(
matcher_bad_match_pad
, 0.15,
"Bad Match Pad (0-1)"
);
385
double_VAR_H
(
matcher_rating_margin
, 0.1,
"New template margin (0-1)"
);
386
double_VAR_H
(
matcher_avg_noise_size
, 12.0,
"Avg. noise blob length: "
);
387
INT_VAR_H
(
matcher_permanent_classes_min
, 1,
"Min # of permanent classes"
);
388
INT_VAR_H
(
matcher_min_examples_for_prototyping
, 3,
389
"Reliable Config Threshold"
);
390
INT_VAR_H
(
matcher_sufficient_examples_for_prototyping
, 5,
391
"Enable adaption even if the ambiguities have not been seen"
);
392
double_VAR_H
(
matcher_clustering_max_angle_delta
, 0.015,
393
"Maximum angle delta for prototype clustering"
);
394
double_VAR_H
(
classify_misfit_junk_penalty
, 0.0,
395
"Penalty to apply when a non-alnum is vertically out of "
396
"its expected textline position"
);
397
double_VAR_H
(
rating_scale
, 1.5,
"Rating scaling factor"
);
398
double_VAR_H
(
certainty_scale
, 20.0,
"Certainty scaling factor"
);
399
double_VAR_H
(
tessedit_class_miss_scale
, 0.00390625,
400
"Scale factor for features not used"
);
401
INT_VAR_H
(
classify_adapt_proto_threshold
, 230,
402
"Threshold for good protos during adaptive 0-255"
);
403
INT_VAR_H
(
classify_adapt_feature_threshold
, 230,
404
"Threshold for good features during adaptive 0-255"
);
405
BOOL_VAR_H
(
disable_character_fragments
,
TRUE
,
406
"Do not include character fragments in the"
407
" results of the classifier"
);
408
double_VAR_H
(
classify_character_fragments_garbage_certainty_threshold
, -3.0,
409
"Exclude fragments that do not match any whole character"
410
" with at least this certainty"
);
411
BOOL_VAR_H
(
classify_debug_character_fragments
,
FALSE
,
412
"Bring up graphical debugging windows for fragments training"
);
413
BOOL_VAR_H
(
matcher_debug_separate_windows
,
FALSE
,
414
"Use two different windows for debugging the matching: "
415
"One for the protos and one for the features."
);
416
STRING_VAR_H
(
classify_learn_debug_str
,
""
,
"Class str to debug learning"
);
417
418
/* intmatcher.cpp **********************************************************/
419
INT_VAR_H
(
classify_class_pruner_threshold
, 229,
420
"Class Pruner Threshold 0-255"
);
421
INT_VAR_H
(
classify_class_pruner_multiplier
, 30,
422
"Class Pruner Multiplier 0-255: "
);
423
INT_VAR_H
(
classify_cp_cutoff_strength
, 7,
424
"Class Pruner CutoffStrength: "
);
425
INT_VAR_H
(
classify_integer_matcher_multiplier
, 14,
426
"Integer Matcher Multiplier 0-255: "
);
427
428
// Use class variables to hold onto built-in templates and adapted templates.
429
INT_TEMPLATES
PreTrainedTemplates
;
430
ADAPT_TEMPLATES
AdaptedTemplates
;
431
432
// Create dummy proto and config masks for use with the built-in templates.
433
BIT_VECTOR
AllProtosOn
;
434
BIT_VECTOR
PrunedProtos
;
435
BIT_VECTOR
AllConfigsOn
;
436
BIT_VECTOR
AllProtosOff
;
437
BIT_VECTOR
AllConfigsOff
;
438
BIT_VECTOR
TempProtoMask
;
439
bool
EnableLearning
;
440
/* normmatch.cpp */
441
NORM_PROTOS
*
NormProtos
;
442
/* font detection ***********************************************************/
443
UnicityTable<FontInfo>
fontinfo_table_
;
444
// Without shape training, each class_id, config pair represents a single
445
// unichar id/font combination, so each fontset_table_ entry holds font ids
446
// for each config in the class.
447
// With shape training, each class_id, config pair represents a single
448
// shape_table_ index, so the fontset_table_ stores the shape_table_ index,
449
// and the shape_table_ must be consulted to obtain the actual unichar_id/
450
// font combinations that the shape represents.
451
UnicityTable<FontSet>
fontset_table_
;
452
453
INT_VAR_H
(
il1_adaption_test
, 0,
"Dont adapt to i/I at beginning of word"
);
454
BOOL_VAR_H
(
classify_bln_numeric_mode
, 0,
455
"Assume the input is numbers [0-9]."
);
456
457
protected
:
458
IntegerMatcher
im_
;
459
FEATURE_DEFS_STRUCT
feature_defs_
;
460
// If a shape_table_ is present, it is used to remap classifier output in
461
// ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
462
// mean an index to the shape_table_ and the choices returned are *all* the
463
// shape_table_ entries at that index.
464
ShapeTable
*
shape_table_
;
465
466
private
:
467
468
Dict
dict_;
469
470
/* variables used to hold performance statistics */
471
int
AdaptiveMatcherCalls;
472
int
BaselineClassifierCalls;
473
int
CharNormClassifierCalls;
474
int
AmbigClassifierCalls;
475
int
NumWordsAdaptedTo;
476
int
NumCharsAdaptedTo;
477
int
NumBaselineClassesTried;
478
int
NumCharNormClassesTried;
479
int
NumAmbigClassesTried;
480
int
NumClassesOutput;
481
int
NumAdaptationsFailed;
482
483
/* variables used to hold onto extracted features. This is used
484
to map from the old scheme in which baseline features and char norm
485
features are extracted separately, to the new scheme in which they
486
are extracted at the same time. */
487
bool
FeaturesHaveBeenExtracted;
488
bool
FeaturesOK;
489
INT_FEATURE_ARRAY
BaselineFeatures;
490
INT_FEATURE_ARRAY
CharNormFeatures;
491
INT_FX_RESULT_STRUCT
FXInfo;
492
493
// Expected number of features in the class pruner, used to penalize
494
// unknowns that have too few features (like a c being classified as e) so
495
// it doesn't recognize everything as '@' or '#'.
496
// CharNormCutoffs is for the static classifier (with no shapetable).
497
// BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
498
// value in the adaptive classifier. Both are indexed by unichar_id.
499
// shapetable_cutoffs_ provides a similar value for each shape in the
500
// shape_table_
501
uinT16
* CharNormCutoffs;
502
uinT16
* BaselineCutoffs;
503
GenericVector<uinT16>
shapetable_cutoffs_;
504
ScrollView
* learn_debug_win_;
505
ScrollView
* learn_fragmented_word_debug_win_;
506
ScrollView
* learn_fragments_debug_win_;
507
};
508
}
// namespace tesseract
509
510
#endif // TESSERACT_CLASSIFY_CLASSIFY_H__
mnt
data
src
tesseract-ocr
classify
classify.h
Generated on Thu Nov 1 2012 20:19:46 for Tesseract by Doxygen 1.8.1