Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicharset_extractor.cpp File Reference
#include <stdio.h>
#include <locale.h>
#include "boxread.h"
#include "rect.h"
#include "strngs.h"
#include "tessopt.h"
#include "unichar.h"
#include "unicharset.h"

Go to the source code of this file.

Functions

UNICHAR_ID wc_to_unichar_id (const UNICHARSET &unicharset, int wc)
void set_properties (UNICHARSET *unicharset, const char *const c_string)
int main (int argc, char **argv)

Function Documentation

int main ( int  argc,
char **  argv 
)

Public Function Prototypes

Definition at line 102 of file unicharset_extractor.cpp.

{
  // Command-line entry point: collects every unichar string found in the box
  // files named on the command line into one UNICHARSET and saves it as
  // <output_directory>/<kUnicharsetFileName>.
  //
  // Usage: PROGRAM [-D DIRECTORY] FILE...
  // Returns 0 on success and -1 when a box file cannot be opened or the
  // unicharset cannot be saved; exits with status 1 when run without
  // arguments.
  int option;
  const char* output_directory = ".";
  STRING unicharset_file_name;
  UNICHARSET unicharset;

  // Adopt the locale of the environment; the wide-character classification
  // calls made from set_properties() are locale dependent.
  setlocale(LC_ALL, "");

  // Space character needed to represent NIL classification
  unicharset.unichar_insert(" ");

  // Print usage
  if (argc <= 1) {
    printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
    exit(1);
  }

  // Parse arguments: -D overrides the default output directory (".").
  while ((option = tessopt(argc, argv, "D" )) != EOF) {
    switch (option) {
      case 'D':
        output_directory = tessoptarg;
        break;
    }
  }

  // Save file name
  unicharset_file_name = output_directory;
  unicharset_file_name += "/";
  unicharset_file_name += kUnicharsetFileName;

  // Load box files: every argument left after option parsing is a box file.
  for (; tessoptind < argc; ++tessoptind) {
    printf("Extracting unicharset from %s\n", argv[tessoptind]);

    FILE* box_file = fopen(argv[tessoptind], "rb");
    if (box_file == NULL) {
      printf("Cannot open box file %s\n", argv[tessoptind]);
      return -1;
    }

    TBOX box;
    STRING unichar_string;
    int line_number = 0;
    // Insert each string delivered by ReadNextBox and record its properties.
    while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
      unicharset.unichar_insert(unichar_string.string());
      set_properties(&unicharset, unichar_string.string());
    }

    // Close the file before moving on; otherwise one FILE handle would be
    // leaked for every box file given on the command line.
    fclose(box_file);
  }

  // Write unicharset file
  if (unicharset.save_to_file(unicharset_file_name.string())) {
    printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
  }
  else {
    printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
    return -1;
  }
  return 0;
}
void set_properties ( UNICHARSET *  unicharset,
const char *const  c_string 
)

Definition at line 61 of file unicharset_extractor.cpp.

{
  // Sets the character-class properties (alpha, lower, upper, digit,
  // punctuation) and the other-case mapping of the unicharset entry for
  // c_string, based on the first Unicode code point of that string.
  //
  // unicharset: character set whose entry for c_string is updated.
  // c_string:   UTF-8 string identifying the entry.
  //
  // The whole body is compiled only when USING_WCTYPE is defined; otherwise
  // the function does nothing.
#ifdef USING_WCTYPE
  // Convert the string to a unichar id.
  const UNICHAR_ID id = unicharset->unichar_to_id(c_string);

  // Set the other_case property to be this unichar id by default.
  unicharset->set_other_case(id, id);

  int step = UNICHAR::utf8_step(c_string);
  if (step == 0)
    return; // Invalid utf-8.

  // Get the next Unicode code point in the string.
  UNICHAR ch(c_string, step);
  const int wc = ch.first_uni();

  /* Copy the properties. */
  if (iswalpha(wc)) {
    unicharset->set_isalpha(id, 1);
    if (iswlower(wc)) {
      unicharset->set_islower(id, 1);
      // The other case of a lower-case letter is its upper-case form.
      unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
                                                      towupper(wc)));
    }
    if (iswupper(wc)) {
      unicharset->set_isupper(id, 1);
      // The other case of an upper-case letter is its lower-case form.
      unicharset->set_other_case(id, wc_to_unichar_id(*unicharset,
                                                      towlower(wc)));
    }
  }
  if (iswdigit(wc))
    unicharset->set_isdigit(id, 1);
  if (iswpunct(wc))
    unicharset->set_ispunctuation(id, 1);
#endif
}
UNICHAR_ID wc_to_unichar_id ( const UNICHARSET &  unicharset,
int  wc 
)

Definition at line 49 of file unicharset_extractor.cpp.

{
  // Returns the id that unicharset assigns to the single code point wc:
  // the code point is rendered as a UTF-8 string, which is then looked up.
  UNICHAR code_point(wc);
  char* utf8 = code_point.utf8_str();
  const UNICHAR_ID result = unicharset.unichar_to_id(utf8);
  // The buffer handed back by utf8_str() is released here, once the lookup
  // no longer needs it.
  delete[] utf8;
  return result;
}