Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
associate.cpp
Go to the documentation of this file.
1 
2 // File: associate.cpp
3 // Description: Functions for scoring segmentation paths according to
4 // their character widths, gap widths and seam cuts.
5 // Author: Daria Antonova
6 // Created: Mon Mar 8 11:26:43 PDT 2010
7 //
8 // (C) Copyright 2010, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 
22 #include <stdio.h>
23 #ifdef __UNIX__
24 #include <assert.h>
25 #endif
26 #include <math.h>
27 
28 #include "associate.h"
29 #include "baseline.h"
30 #include "normalis.h"
31 
32 namespace tesseract {
33 
// Smallest acceptable gap next to a blob in fixed-pitch mode, expressed as
// a fraction of the normalizing height. ComputeStats() compares the
// normalized left and right gaps against this value and flags the blob as
// bad_shape when a checked gap is narrower.
35 const float AssociateUtils::kMinGap = 0.03f;
36 
37 void AssociateUtils::ComputeStats(int col, int row,
38  const AssociateStats *parent_stats,
39  int parent_path_length,
40  bool fixed_pitch,
41  float max_char_wh_ratio,
42  const DENORM *denorm,
43  CHUNKS_RECORD *chunks_record,
44  int debug_level,
45  AssociateStats *stats) {
46  stats->Clear();
47 
48  if (debug_level > 0) {
49  tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n",
50  col, row, fixed_pitch ? " (fixed pitch)" : "");
51  }
52  float normalizing_height = BASELINE_SCALE;
53  // TODO(rays/daria) Can unicharset.script_has_xheight be useful here?
54  if (fixed_pitch && denorm != NULL && denorm->row() != NULL) {
55  // For fixed pitch language like CJK, we use the full text height
56  // as the normalizing factor so we are not dependent on xheight
57  // calculation.
58  if (denorm->row()->body_size() > 0.0f) {
59  normalizing_height = denorm->y_scale() * denorm->row()->body_size();
60  } else {
61  normalizing_height = denorm->y_scale() *
62  (denorm->row()->x_height() + denorm->row()->ascenders());
63  }
64  if (debug_level > 0) {
65  tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n",
66  normalizing_height, denorm->y_scale(), denorm->row()->x_height(),
67  denorm->row()->ascenders());
68  }
69  }
70  float wh_ratio =
71  GetChunksWidth(chunks_record->chunk_widths, col, row) / normalizing_height;
72  if (debug_level) tprintf("wh_ratio %g\n", wh_ratio);
73  if (wh_ratio > max_char_wh_ratio) stats->bad_shape = true;
74  if (fixed_pitch) {
75  bool end_row = (row == (chunks_record->ratings->dimension() - 1));
76 
77  // Ensure that the blob has gaps on the left and the right sides
78  // (except for beginning and ending punctuation) and that there is
79  // no cutting through ink at the blob boundaries.
80  if (col > 0) {
81  float left_gap =
82  GetChunksGap(chunks_record->chunk_widths, col-1) / normalizing_height;
83  SEAM *left_seam =
84  static_cast<SEAM *>(array_value(chunks_record->splits, col-1));
85  if (debug_level) {
86  tprintf("left_gap %g, left_seam %g\n", left_gap, left_seam->priority);
87  }
88  if ((!end_row && left_gap < kMinGap) || left_seam->priority > 0.0f) {
89  stats->bad_shape = true;
90  }
91  }
92  float right_gap = 0.0f;
93  if (!end_row) {
94  right_gap =
95  GetChunksGap(chunks_record->chunk_widths, row) / normalizing_height;
96  SEAM *right_seam =
97  static_cast<SEAM *>(array_value(chunks_record->splits, row));
98  if (debug_level) {
99  tprintf("right_gap %g right_seam %g\n",
100  right_gap, right_seam->priority);
101  }
102  if (right_gap < kMinGap || right_seam->priority > 0.0f) {
103  stats->bad_shape = true;
104  if (right_gap < kMinGap) stats->bad_fixed_pitch_right_gap = true;
105  }
106  }
107 
108  // Impose additional segmentation penalties if blob widths or gaps
109  // distribution don't fit a fixed-pitch model.
110  // Since we only know the widths and gaps of the path explored so far,
111  // the means and variances are computed for the path so far (not
112  // considering characters to the right of the last character on the path).
113  stats->full_wh_ratio = wh_ratio + right_gap;
114  if (parent_stats != NULL) {
115  stats->full_wh_ratio_total =
116  (parent_stats->full_wh_ratio_total + stats->full_wh_ratio);
117  float mean =
118  stats->full_wh_ratio_total / static_cast<float>(parent_path_length+1);
119  stats->full_wh_ratio_var =
120  parent_stats->full_wh_ratio_var + pow(mean-stats->full_wh_ratio, 2);
121  } else {
122  stats->full_wh_ratio_total = stats->full_wh_ratio;
123  }
124  if (debug_level) {
125  tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n",
126  stats->full_wh_ratio, stats->full_wh_ratio_total,
127  stats->full_wh_ratio_var);
128  }
129 
130  stats->shape_cost =
131  FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio);
132 
133  // For some reason Tesseract prefers to treat the whole CJ words
134  // as one blob when the initial segmentation is particularly bad.
135  // This hack is to avoid favoring such states.
136  if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) {
137  stats->shape_cost += 10;
138  }
139  stats->shape_cost += stats->full_wh_ratio_var;
140  if (debug_level) tprintf("shape_cost %g\n", stats->shape_cost);
141  }
142 }
143 
145  int start_blob, int last_blob) {
146  int result = 0;
147  for (int x = start_blob * 2; x <= last_blob * 2; x++)
148  result += width_record->widths[x];
149  return result;
150 }
151 
152 float AssociateUtils::FixedPitchWidthCost(float norm_width,
153  float right_gap,
154  bool end_pos,
155  float max_char_wh_ratio) {
156  float cost = 0.0f;
157  if (norm_width > max_char_wh_ratio) cost += norm_width;
158  if (norm_width > kMaxFixedPitchCharAspectRatio)
159  cost += norm_width * norm_width; // extra penalty for merging CJK chars
160  // Penalize skinny blobs, except for punctuation in the last position.
161  if (norm_width+right_gap < 0.5f && !end_pos) {
162  cost += 1.0f - (norm_width + right_gap);
163  }
164  return cost;
165 }
166 
167 } // namespace tesseract