Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"

Functions

 DECLARE_STRING_PARAM_FLAG (D)
int main (int argc, char **argv)
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
int main (int argc, char *argv[])

Variables

CLUSTERCONFIG CNConfig

Macro Definition Documentation

#define PROGRAM_FEATURE_TYPE   "cn"

Include Files and Type Defines

Definition at line 41 of file cntraining.cpp.


Function Documentation

DECLARE_STRING_PARAM_FLAG ( D )
int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 50 of file tesseractmain.cpp.

{
// Command-line driver body, as listed for tesseractmain.cpp. In order it:
//   1. answers "-v" / "--version" and "--list-langs" and exits,
//   2. picks imagename, outputbase, -l and -psm out of argv,
//   3. initializes the API, checks that the image can be opened and read,
//   4. runs api.ProcessPages() and writes the text to outputbase plus
//      ".html", ".box" or ".txt".
// Exit codes used below: 0 success or info request, 1 initialization,
// usage or output-file failure, 2 input file cannot be opened,
// 3 image cannot be read by pixRead().
// NOTE(review): api, languages, pagesegmode and text_out are used below but
// never declared in this listing; presumably their declarations were lost
// when the page was extracted -- confirm against the real source file.
#ifdef USING_GETTEXT
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
#endif
// Version request: print the three version strings to stderr and exit(0).
// Each string returned by the version helpers is released with lept_free().
if ((argc == 2 && strcmp(argv[1], "-v") == 0) ||
(argc == 2 && strcmp(argv[1], "--version") == 0)) {
char *versionStrP;
fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
versionStrP = getLeptonicaVersion();
fprintf(stderr, " %s\n", versionStrP);
lept_free(versionStrP);
versionStrP = getImagelibVersions();
fprintf(stderr, " %s\n", versionStrP);
lept_free(versionStrP);
exit(0);
}
// tessdata_dir is filled in by truncate_path() from argv[0]
// (presumably the directory part of the program path -- TODO confirm).
STRING tessdata_dir;
truncate_path(argv[0], &tessdata_dir);
// First initialization, with a NULL language. A non-zero rc is a failure.
int rc = api.Init(tessdata_dir.string(), NULL);
if (rc) {
fprintf(stderr, _("Could not initialize tesseract.\n"));
exit(1);
}
// Language listing: one entry of `languages` per line on stderr, then exit(0).
// NOTE(review): nothing in this listing fills `languages`; the call that
// populates it appears to be missing here.
if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) {
fprintf(stderr, _("List of available languages (%d):\n"), languages.size());
for (int index = 0; index < languages.size(); ++index) {
STRING& string = languages[index];
fprintf(stderr, "%s\n", string.string());
}
api.Clear();
exit(0);
}
// The first initialization is ended here; Init is called again further down
// with the language taken from the command line.
api.End();
// Make the order of args a bit more forgiving than it used to be.
const char* lang = "eng";
const char* image = NULL;
const char* output = NULL;
int arg = 1;
// Arguments are consumed until an output name has been seen; after that the
// loop only continues while the next argument starts with '-'.
// "-l" and "-psm" also consume the argument that follows them (extra ++arg).
// The first other argument becomes `image`, the second becomes `output`.
// Once both are set, a '-' argument other than -l/-psm matches no branch
// and is skipped.
while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
lang = argv[arg + 1];
++arg;
} else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) {
pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1]));
++arg;
} else if (image == NULL) {
image = argv[arg];
} else if (output == NULL) {
output = argv[arg];
}
++arg;
}
// No output name means too few arguments: print the usage text and exit(1).
if (output == NULL) {
fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
"[-psm pagesegmode] [configfile...]\n\n"), argv[0]);
fprintf(stderr,
_("pagesegmode values are:\n"
"0 = Orientation and script detection (OSD) only.\n"
"1 = Automatic page segmentation with OSD.\n"
"2 = Automatic page segmentation, but no OSD, or OCR\n"
"3 = Fully automatic page segmentation, but no OSD. (Default)\n"
"4 = Assume a single column of text of variable sizes.\n"
"5 = Assume a single uniform block of vertically aligned text.\n"
"6 = Assume a single uniform block of text.\n"
"7 = Treat the image as a single text line.\n"
"8 = Treat the image as a single word.\n"
"9 = Treat the image as a single word in a circle.\n"
"10 = Treat the image as a single character.\n"));
// NOTE(review): the two adjacent literals below join as "anyconfigfile";
// there is no space between "any" and "configfile".
fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
"configfile.\n\n"));
fprintf(stderr, _("Single options:\n"));
fprintf(stderr, _(" -v --version: version info\n"));
fprintf(stderr, _(" --list-langs: list available languages for tesseract "
"engine\n"));
exit(1);
}
api.SetOutputName(output);
// Second initialization with the chosen language. The arguments not consumed
// by the loop above (argv[arg] onwards, argc - arg of them) are handed to
// Init; per the usage text these are the config files.
rc = api.Init(tessdata_dir.string(), lang, tesseract::OEM_DEFAULT,
&(argv[arg]), argc - arg, NULL, NULL, false);
if (rc) {
fprintf(stderr, _("Could not initialize tesseract.\n"));
exit(1);
}
// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatibility reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default
// can take priority over the tesseract default, so we use the
// value from the command line only if the retrieved mode
// is still tesseract::PSM_SINGLE_BLOCK, indicating no change
// in any config file. Therefore the only way to force
// tesseract::PSM_SINGLE_BLOCK is from the command line.
// It would be simpler if we could set the value before Init,
// but that doesn't work.
// NOTE(review): the conditional described above is not visible in this
// listing; only the unconditional SetPageSegMode call below is present.
api.SetPageSegMode(pagesegmode);
// NOTE(review): the tprintf call on the next line is never closed in this
// listing; its remaining arguments and ");" appear to have been dropped.
tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
// The image is opened and closed again only to check that it can be opened;
// failure exits with code 2.
FILE* fin = fopen(image, "rb");
if (fin == NULL) {
fprintf(stderr, _("Cannot open input file: %s\n"), image);
exit(2);
}
fclose(fin);
// pixRead() is used here only as a check: the PIX is destroyed right away
// and a NULL result exits with code 3.
PIX *pixs;
if ((pixs = pixRead(image)) == NULL) {
fprintf(stderr, _("Unsupported image type.\n"));
exit(3);
}
pixDestroy(&pixs);
// A ProcessPages failure is reported but does not stop the program;
// whatever is in text_out is still written below.
if (!api.ProcessPages(image, NULL, 0, &text_out)) {
fprintf(stderr, _("Error during processing.\n"));
}
// The output extension follows two boolean variables read back from the API:
// hOCR takes precedence over box file, otherwise plain ".txt".
bool output_hocr = false;
api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
bool output_box = false;
api.GetBoolVariable("tessedit_create_boxfile", &output_box);
STRING outfile = output;
outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
FILE* fout = fopen(outfile.string(), "wb");
if (fout == NULL) {
fprintf(stderr, _("Cannot create output file %s\n"), outfile.string());
exit(1);
}
// text_out is written as raw bytes (length() of them) in binary mode.
fwrite(text_out.string(), 1, text_out.length(), fout);
fclose(fout);
return 0; // Normal exit
}
int main ( int  argc,
char *  argv[] 
)

Public Code

Definition at line 89 of file cntraining.cpp.

{
// Entry point body of cntraining: reads every training page named on the
// command line into CharList, clusters each entry of CharList, collects the
// resulting prototype lists in NormProtoList keyed by the entry's Label, and
// hands the result to WriteNormProtos() with FLAGS_D as the directory.
// Returns 1 if no clusterer was ever created, 0 otherwise.
// NOTE(review): the statement described by the next comment is not present
// in this listing; it appears to have been dropped during extraction.
// Set the global Config parameters before parsing the command line.
const char *PageName;
FILE *TrainingPage;
LIST CharList = NIL_LIST;
CLUSTERER *Clusterer = NULL;
LIST ProtoList = NIL_LIST;
LIST NormProtoList = NIL_LIST;
LIST pCharList;
LABELEDLIST CharSample;
FEATURE_DEFS_STRUCT FeatureDefs;
InitFeatureDefs(&FeatureDefs);
ParseArguments(&argc, &argv);
// num_fonts counts the files read; in this listing it is referenced only by
// the commented-out MinSamples adjustment further down.
int num_fonts = 0;
while ((PageName = GetNextFilename(argc, argv)) != NULL) {
printf("Reading %s ...\n", PageName);
TrainingPage = Efopen(PageName, "rb");
// NOTE(review): the next line is only the tail of a call; the beginning of
// the statement (the function name and first arguments) is missing from this
// listing. It receives TrainingPage and &CharList.
100, NULL, TrainingPage, &CharList);
fclose(TrainingPage);
++num_fonts;
}
printf("Clustering ...\n");
// To allow an individual font to form a separate cluster,
// reduce the min samples:
// Config.MinSamples = 0.5 / num_fonts;
// The walk below uses the copy pCharList rather than CharList itself.
pCharList = CharList;
iterate(pCharList) {
//Cluster
CharSample = (LABELEDLIST)first_node(pCharList);
Clusterer =
SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
// Config.MinSamples is lowered temporarily in the retry loop below and
// restored from this copy afterwards.
float SavedMinSamples = Config.MinSamples;
// To disable the tendency to produce a single cluster for all fonts,
// make MagicSamples an impossible to achieve number:
// Config.MagicSamples = CharSample->SampleCount * 10;
// Retry clustering, shrinking MinSamples by 5% each time, until
// NumberOfProtos(ProtoList, 1, 0) is positive or MinSamples is no longer
// above 0.001.
while (Config.MinSamples > 0.001) {
ProtoList = ClusterSamples(Clusterer, &Config);
if (NumberOfProtos(ProtoList, 1, 0) > 0) {
break;
} else {
Config.MinSamples *= 0.95;
printf("0 significant protos for %s."
" Retrying clustering with MinSamples = %f%%\n",
CharSample->Label, Config.MinSamples);
}
}
Config.MinSamples = SavedMinSamples;
AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
}
// Clusterer is assigned only inside the loop above, so it is still NULL when
// the loop body never ran (no entries in CharList).
if (Clusterer == NULL) { // To avoid a SIGSEGV
fprintf(stderr, "Error: NULL clusterer!\n");
return 1;
}
WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
// NOTE(review): Clusterer and ProtoList are overwritten on every pass of the
// loop, so only the values from the last pass reach FreeProtoList() and
// FreeClusterer() here. Whether the earlier ones are released elsewhere
// (e.g. through NormProtoList) is not visible in this listing -- confirm.
FreeNormProtoList(NormProtoList);
FreeProtoList(&ProtoList);
FreeClusterer(Clusterer);
printf ("\n");
return 0;
} // main
void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
CLUSTERER *  Clusterer 
)

Private Function Prototypes


Private Code

Definition at line 215 of file cntraining.cpp.

{
  // Writes the file "normproto", placed under Directory when Directory is
  // neither NULL nor empty. The file starts with Clusterer->SampleSize on a
  // line of its own, followed by the output of WriteParamDesc(); after that
  // comes one section per entry of LabeledProtoList: a blank line, then
  // "<Label> <count>", then the prototypes written by WriteProtos() with
  // (true, false) as its two selection flags.
  // <count> is NumberOfProtos(List, true, false); the error message below
  // refers to that (1, 0) combination as "significant protos". An entry whose
  // count is below 1 stops the whole program with exit(1).
  STRING OutPath = "";
  const bool HaveDirectory = Directory != NULL && Directory[0] != '\0';
  if (HaveDirectory) {
    OutPath += Directory;
    OutPath += "/";
  }
  OutPath += "normproto";

  printf ("\nWriting %s ...", OutPath.string());
  FILE *Out = Efopen (OutPath.string(), "wb");

  // File header.
  fprintf(Out,"%0d\n",Clusterer->SampleSize);
  WriteParamDesc(Out,Clusterer->SampleSize,Clusterer->ParamDesc);

  // One section per labeled prototype list.
  iterate(LabeledProtoList) {
    LABELEDLIST Entry = (LABELEDLIST) first_node (LabeledProtoList);
    int Count = NumberOfProtos(Entry->List, true, false);
    if (Count >= 1) {
      fprintf(Out, "\n%s %d\n", Entry->Label, Count);
      WriteProtos(Out, Clusterer->SampleSize, Entry->List, true, false);
    } else {
      printf ("\nError! Not enough protos for %s: %d protos"
              " (%d significant protos"
              ", %d insignificant protos)\n",
              Entry->Label, Count,
              NumberOfProtos(Entry->List, 1, 0),
              NumberOfProtos(Entry->List, 0, 1));
      exit(1);
    }
  }

  fclose (Out);
} // WriteNormProtos
void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 270 of file cntraining.cpp.

{
  // Walks ProtoList and passes every selected prototype to WritePrototype()
  // together with File and N. A prototype whose Significant field is set is
  // selected when WriteSigProtos is set; any other prototype is selected when
  // WriteInsigProtos is set.
  iterate(ProtoList) {
    PROTOTYPE *Current = (PROTOTYPE *) first_node ( ProtoList );
    const bool Selected =
        Current->Significant ? WriteSigProtos : WriteInsigProtos;
    if (Selected) {
      WritePrototype( File, N, Current );
    }
  }
} // WriteProtos

Variable Documentation

CLUSTERCONFIG CNConfig
Initial value:
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Global Data Definitions and Declarations

Definition at line 79 of file cntraining.cpp.