24 #include "config_auto.h"
30 #include "allheaders.h"
53 bool replicate_samples,
55 : norm_mode_(norm_mode), samples_(fontinfo_table_),
56 junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
58 enable_shape_anaylsis_(shape_analysis),
59 enable_replication_(replicate_samples),
60 fragments_(
NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
69 for (
int p = 0; p < page_images_.
size(); ++p)
70 pixDestroy(&page_images_[p]);
77 if (fwrite(&norm_mode_,
sizeof(norm_mode_), 1, fp) != 1)
return false;
79 if (!feature_space_.
Serialize(fp))
return false;
80 if (!samples_.
Serialize(fp))
return false;
81 if (!junk_samples_.
Serialize(fp))
return false;
82 if (!verify_samples_.
Serialize(fp))
return false;
83 if (!master_shapes_.
Serialize(fp))
return false;
84 if (!flat_shapes_.
Serialize(fp))
return false;
89 if (!xheights_.
Serialize(fp))
return false;
96 if (fread(&norm_mode_,
sizeof(norm_mode_), 1, fp) != 1)
return false;
98 ReverseN(&norm_mode_,
sizeof(norm_mode_));
101 charsetsize_ = unicharset_.
size();
102 if (!feature_space_.
DeSerialize(swap, fp))
return false;
103 feature_map_.
Init(feature_space_);
105 if (!junk_samples_.
DeSerialize(swap, fp))
return false;
106 if (!verify_samples_.
DeSerialize(swap, fp))
return false;
107 if (!master_shapes_.
DeSerialize(swap, fp))
return false;
108 if (!flat_shapes_.
DeSerialize(swap, fp))
return false;
114 if (!xheights_.
DeSerialize(swap, fp))
return false;
121 tprintf(
"Failed to load unicharset from file %s\n"
122 "Building unicharset for training from scratch...\n",
128 charsetsize_ = unicharset_.
size();
129 delete [] fragments_;
130 fragments_ =
new int[charsetsize_];
131 memset(fragments_, 0,
sizeof(*fragments_) * charsetsize_);
151 while (fgets(buffer,
sizeof(buffer), fp) !=
NULL) {
152 if (buffer[0] ==
'\n')
155 char* space = strchr(buffer,
' ');
157 tprintf(
"Bad format in tr file, reading fontname, unichar\n");
166 tprintf(
"Bad format in tr file, reading box coords\n");
175 cn_feature_type, geo_feature_type, char_desc);
179 charsetsize_ = unicharset_.
size();
187 verify_samples_.
AddSample(unichar, sample);
188 prev_unichar_id_ = -1;
190 if (prev_unichar_id_ >= 0)
191 fragments_[prev_unichar_id_] = -1;
192 prev_unichar_id_ = samples_.
AddSample(unichar, sample);
196 int junk_id = junk_samples_.
AddSample(unichar, sample);
197 if (prev_unichar_id_ >= 0) {
200 if (fragments_[prev_unichar_id_] == 0)
201 fragments_[prev_unichar_id_] = junk_id;
202 else if (fragments_[prev_unichar_id_] != junk_id)
203 fragments_[prev_unichar_id_] = -1;
207 prev_unichar_id_ = -1;
217 for (page = 0; (pix = pixReadTiff(filename, page)) !=
NULL; ++page) {
220 tprintf(
"Loaded %d page images from %s\n", page, filename);
229 if (debug_level_ > 0)
230 tprintf(
"PostLoadCleanup...\n");
231 if (enable_shape_anaylsis_)
232 ReplaceFragmentedSamples();
243 if (debug_level_ > 0)
244 tprintf(
"ComputeCanonicalSamples...\n");
252 if (debug_level_ > 0)
253 tprintf(
"PreTrainingSetup...\n");
256 if (debug_level_ > 0)
257 tprintf(
"ComputeCloudFeatures...\n");
264 tprintf(
"Building master shape table\n");
265 int num_fonts = samples_.
NumFonts();
272 for (
int f = 0;
f < num_fonts; ++
f) {
280 if (fragment ==
NULL)
281 char_shapes.AppendMasterShapes(shapes);
283 char_shapes_begin_fragment.AppendMasterShapes(shapes);
285 char_shapes_end_fragment.AppendMasterShapes(shapes);
287 char_shapes.AppendMasterShapes(shapes);
291 char_shapes.AppendMasterShapes(char_shapes_begin_fragment);
294 char_shapes.AppendMasterShapes(char_shapes_end_fragment);
317 tprintf(
"Moving %d junk samples to master sample set.\n", num_junks);
318 for (
int s = 0; s < num_junks; ++s) {
323 if (sample_id == INVALID_UNICHAR_ID)
339 if (enable_replication_) {
340 if (debug_level_ > 0)
341 tprintf(
"ReplicateAndRandomize...\n");
351 FILE* fp = fopen(filename,
"rb");
353 fprintf(stderr,
"Failed to load font_properties from %s\n", filename);
356 int italic, bold, fixed, serif, fraktur;
359 char* font_name =
new char[1024];
360 fontinfo.
name = font_name;
363 if (fscanf(fp,
"%1024s %i %i %i %i %i\n", font_name,
364 &italic, &bold, &fixed, &serif, &fraktur) != 6)
372 if (!fontinfo_table_.
contains(fontinfo)) {
383 tprintf(
"fontinfo table is of size %d\n", fontinfo_table_.
size());
385 if (filename ==
NULL)
return true;
386 FILE *
f = fopen(filename,
"rb");
388 fprintf(stderr,
"Failed to load font xheights from %s\n", filename);
391 tprintf(
"Reading x-heights from %s ...\n", filename);
397 int total_xheight = 0;
398 int xheight_count = 0;
400 if (fscanf(f,
"%1024s %d\n", buffer, &xht) != 2)
402 fontinfo.
name = buffer;
403 if (!fontinfo_table_.
contains(fontinfo))
continue;
404 int fontinfo_id = fontinfo_table_.
get_id(fontinfo);
405 xheights_[fontinfo_id] = xht;
406 total_xheight += xht;
409 if (xheight_count == 0) {
410 fprintf(stderr,
"No valid xheights in %s!\n", filename);
413 int mean_xheight =
DivRounded(total_xheight, xheight_count);
414 for (
int i = 0; i < fontinfo_table_.
size(); ++i) {
415 if (xheights_[i] < 0)
416 xheights_[i] = mean_xheight;
423 FILE* fontinfo_file = fopen(filename,
"rb");
424 if (fontinfo_file ==
NULL)
428 if (fontinfo_id < 0) {
429 tprintf(
"No font found matching fontinfo filename %s\n", filename);
430 fclose(fontinfo_file);
433 tprintf(
"Reading spacing from %s for font %d...\n", filename, fontinfo_id);
440 int x_gap, x_gap_before, x_gap_after, num_kerned;
441 ASSERT_HOST(fscanf(fontinfo_file,
"%d\n", &num_unichars) == 1);
445 for (
int l = 0; l < num_unichars; ++l) {
446 if (fscanf(fontinfo_file,
"%s %d %d %d",
447 uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
448 tprintf(
"Bad format of font spacing file %s\n", filename);
449 fclose(fontinfo_file);
458 for (
int k = 0; k < num_kerned; ++k) {
459 if (fscanf(fontinfo_file,
"%s %d", kerned_uch, &x_gap) != 2) {
460 tprintf(
"Bad format of font spacing file %s\n", filename);
461 fclose(fontinfo_file);
471 fclose(fontinfo_file);
480 fontinfo.
name =
const_cast<char*
>(font_name);
483 if (!fontinfo_table_.
contains(fontinfo)) {
486 return fontinfo_table_.
get_id(fontinfo);
493 int fontinfo_id = -1;
495 for (
int f = 0;
f < fontinfo_table_.
size(); ++
f) {
496 if (strstr(filename, fontinfo_table_.
get(
f).name) !=
NULL) {
497 int len = strlen(fontinfo_table_.
get(
f).name);
499 if (len > best_len) {
515 int num_shapes = flat_shapes_.
NumShapes();
516 for (
int s = 0; s < num_shapes; ++s) {
517 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
519 for (f = 0; f < active_fonts.
size(); ++
f) {
520 if (active_fonts[f] == font)
523 if (f == active_fonts.
size())
527 int num_fonts = active_fonts.
size();
528 for (
int f = 0;
f < num_fonts; ++
f) {
529 for (
int s = num_shapes - 1; s >= 0; --s) {
530 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
531 if (font == active_fonts[
f]) {
555 shape_map.
SetMap(shape_id,
true);
560 it.
Init(&shape_map, &shape_table,
false, &samples_);
565 for (
int i = sample_ptrs.
size() - 1; i >= 0; --i) {
568 for (
int f = 0;
f < num_features; ++
f)
572 *num_samples = sample_id;
584 const char* inttemp_file,
585 const char* pffmtable_file) {
591 FILE* fp = fopen(inttemp_file,
"wb");
601 for (
int c = 0; c < unicharset.
size(); ++c)
604 for (
int i = 0; i < int_templates->
NumClasses; ++i) {
609 for (
int config_id = 0; config_id < Class->
NumConfigs; config_id++) {
613 if (length > max_length)
615 int shape_id = float_classes[i].
font_set.
get(config_id);
617 for (
int c = 0; c < shape.
size(); ++c) {
618 int unichar_id = shape[c].unichar_id;
619 if (length > unichar_cutoffs[unichar_id])
620 unichar_cutoffs[unichar_id] = length;
623 shapetable_cutoffs.
push_back(max_length);
625 fp = fopen(pffmtable_file,
"wb");
627 for (
int c = 0; c < unicharset.
size(); ++c) {
629 if (strcmp(unichar,
" ") == 0) {
632 fprintf(fp,
"%s %d\n", unichar, unichar_cutoffs[c]);
641 const char* unichar_str2) {
644 if (class_id2 == INVALID_UNICHAR_ID)
645 class_id2 = class_id1;
646 if (class_id1 == INVALID_UNICHAR_ID) {
647 tprintf(
"No unicharset entry found for %s\n", unichar_str1);
650 tprintf(
"Font ambiguities for unichar %d = %s and %d = %s\n",
651 class_id1, unichar_str1, class_id2, unichar_str2);
653 int num_fonts = samples_.
NumFonts();
658 for (
int f = 0;
f < num_fonts; ++
f) {
664 for (
int f1 = 0; f1 < num_fonts; ++f1) {
669 for (
int f2 = 0; f2 < num_fonts; ++f2) {
680 for (
int f = 0;
f < num_fonts; ++
f) {
683 if (class_id1 != class_id2 &&
689 #ifndef GRAPHICS_DISABLED
701 const char* unichar_str2,
702 int canonical_font) {
709 if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
717 if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
719 for (
int f = 0;
f < cloud.
size(); ++
f) {
737 if (feature_index >= 0) {
751 #endif // GRAPHICS_DISABLED
756 bool replicate_samples,
760 test_classifier, report_string);
776 bool replicate_samples,
783 if (report_level > 0) {
787 tprintf(
"Iterator has charset size of %d/%d, %d shapes, %d samples\n",
790 tprintf(
"Testing %sREPLICATED:\n", replicate_samples ?
"" :
"NON-");
792 double unichar_error = 0.0;
795 page_images_, &sample_it, &unichar_error,
796 NULL, report_string);
797 return unichar_error;
806 int num_chars1 = shape1.
size();
807 int num_chars2 = shape2.
size();
808 float dist_sum = 0.0f;
810 if (num_chars1 > 1 || num_chars2 > 1) {
813 for (
int c1 = 0; c1 < num_chars1; ++c1) {
814 for (
int c2 = 0; c2 < num_chars2; ++c2) {
827 return dist_sum / dist_count;
832 void MasterTrainer::ReplaceFragmentedSamples() {
833 if (fragments_ ==
NULL)
return;
837 for (
int s = 0; s < num_samples; ++s) {
839 if (fragments_[sample->
class_id()] > 0)
851 bool* good_junk =
new bool[frag_set.
size()];
852 memset(good_junk, 0,
sizeof(*good_junk) * frag_set.
size());
853 for (
int dead_ch = 1; dead_ch < unicharset_.
size(); ++dead_ch) {
854 int frag_ch = fragments_[dead_ch];
855 if (frag_ch <= 0)
continue;
859 for (
int part = 0; part < frag->
get_total(); ++part) {
862 if (good_ch != INVALID_UNICHAR_ID)
863 good_junk[good_ch] =
true;
870 for (
int s = 0; s < num_junks; ++s) {
887 delete [] fragments_;
897 void MasterTrainer::ClusterShapes(
int min_shapes,
int max_shape_unichars,
900 int max_merges = num_shapes - min_shapes;
906 tprintf(
"Computing shape distances...");
907 for (
int s1 = 0; s1 < num_shapes; ++s1) {
908 for (
int s2 = s1 + 1; s2 < num_shapes; ++s2) {
921 while (num_merged < max_merges && min_dist < max_dist) {
922 tprintf(
"Distance = %f: ", min_dist);
924 shape_dists[min_s1][min_s2 - min_s1 - 1].distance =
kInfiniteDist;
925 if (num_unichars > max_shape_unichars) {
926 tprintf(
"Merge of %d and %d with %d would exceed max of %d unichars\n",
927 min_s1, min_s2, num_unichars, max_shape_unichars);
930 shape_dists[min_s2].
clear();
933 for (
int s = 0; s < min_s1; ++s) {
934 if (!shape_dists[s].empty()) {
935 shape_dists[s][min_s1 - s - 1].distance =
940 for (
int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
941 if (shape_dists[min_s1][s2 - min_s1 - 1].distance <
kInfiniteDist)
942 shape_dists[min_s1][s2 - min_s1 - 1].distance =
945 for (
int s = min_s1 + 1; s < min_s2; ++s) {
946 if (!shape_dists[s].empty()) {
952 for (
int s1 = 0; s1 < num_shapes; ++s1) {
953 for (
int i = 0; i < shape_dists[s1].
size(); ++i) {
954 if (shape_dists[s1][i].distance < min_dist) {
955 min_dist = shape_dists[s1][i].distance;
962 tprintf(
"Stopped with %d merged, min dist %f\n", num_merged, min_dist);
963 delete [] shape_dists;
964 if (debug_level_ > 1) {
965 for (
int s1 = 0; s1 < num_shapes; ++s1) {