Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
fixxht.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: fixxht.cpp (Formerly fixxht.c)
3  * Description: Improve x_ht and look out for case inconsistencies
4  * Author: Phil Cheatle
5  * Created: Thu Aug 5 14:11:08 BST 1993
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include <string.h>
22 #include <ctype.h>
23 #include "params.h"
24 #include "float2int.h"
25 #include "tesseractclass.h"
26 
27 namespace tesseract {
28 
29 // Fixxht overview.
30 // Premise: Initial estimate of x-height is adequate most of the time, but
31 // occasionally it is incorrect. Most notable causes of failure are:
32 // 1. Small caps, where the top of the caps is the same as the body text
33 // xheight. For small caps words the xheight needs to be reduced to correctly
34 // recognize the caps in the small caps word.
35 // 2. All xheight lines, such as summer. Here the initial estimate will have
36 // guessed that the blob tops are caps and will have placed the xheight too low.
37 // 3. Noise/logos beside words, or changes in font size on a line. Such
38 // things can blow the statistics and cause an incorrect estimate.
39 //
40 // Algorithm.
41 // Compare the vertical position (top only) of alphanumerics in a word with
42 // the range of positions in training data (in the unicharset).
43 // See CountMisfitTops. If any characters disagree sufficiently with the
44 // initial xheight estimate, then recalculate the xheight, re-run OCR on
45 // the word, and if the number of vertical misfits goes down, along with
46 // either the word rating or certainty, then keep the new xheight.
47 // The new xheight is calculated as follows (see ComputeCompatibleXHeight):
48 // For each alphanumeric character that has a vertically misplaced top
49 // (a misfit), yet its bottom is within the acceptable range (ie it is not
50 // likely a sub- or super-script) calculate the range of acceptable xheight
51 // positions from its range of tops, and give each value in the range a
52 // number of votes equal to the distance of its top from its acceptance range.
53 // The x-height position with the median of the votes becomes the new
54 // x-height. This assumes that most characters will be correctly recognized
55 // even if the x-height is incorrect. This is not a terrible assumption, but
56 // it is not great. An improvement would be to use a classifier that does
57 // not care about vertical position or scaling at all.
58 
59 // If (max_top - min_top) of a unicharset char is bigger than kMaxCharTopRange
60 // then the char top cannot be used to judge misfits or suggest a new top.
// Both blob loops in this file skip a class when its top range exceeds this
// value, so such classes neither count as misfits nor vote for a new x-height.
61 const int kMaxCharTopRange = 48;
62 
63 // Returns the number of misfit blob tops in this word.
// A blob is a misfit when its (clipped) bounding-box top lies outside the
// [min_top - x_ht_acceptance_tolerance, max_top + x_ht_acceptance_tolerance]
// band that the unicharset reports for the blob's class in best_choice.
// Only alphabetic and digit classes are examined.
// NOTE(review): the signature line (listing line 64) is missing from this
// extract; the body reads word_res, unicharset, x_ht_acceptance_tolerance and
// debug_x_ht_level from the enclosing scope - confirm against the original.
65  int bad_blobs = 0;
66  TBLOB* blob = word_res->rebuild_word->blobs;
67  int blob_id = 0;
// Walk the blob list in step with the index into best_choice.
// NOTE(review): no bounds check is visible here; presumably best_choice has
// one unichar per rebuilt blob - verify against callers.
68  for (; blob != NULL; blob = blob->next, ++blob_id) {
69  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
70  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
71  int top = blob->bounding_box().top();
// Clip the top so it never exceeds INT_FEAT_RANGE - 1.
72  if (top >= INT_FEAT_RANGE)
73  top = INT_FEAT_RANGE - 1;
74  int min_bottom, max_bottom, min_top, max_top;
75  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
76  &min_top, &max_top);
// Classes with too wide a top range are skipped entirely: they are neither
// counted nor reported in the debug output below.
77  if (max_top - min_top > kMaxCharTopRange)
78  continue;
79  bool bad = top < min_top - x_ht_acceptance_tolerance ||
80  top > max_top + x_ht_acceptance_tolerance;
81  if (bad)
82  ++bad_blobs;
// Per-blob verdict is printed for every examined blob, misfit or not.
83  if (debug_x_ht_level >= 1) {
84  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
85  unicharset.id_to_unichar(class_id),
86  bad ? "Misfit" : "OK", top, min_top, max_top,
87  static_cast<int>(x_ht_acceptance_tolerance));
88  }
89  }
90  }
91  return bad_blobs;
92 }
93 
94 // Returns a new x-height maximally compatible with the result in word_res.
95 // See comment above for overall algorithm.
// Returns 0.0f when no blob casts a vote, or when the median vote differs
// from kBlnXHeight by less than x_ht_min_change; otherwise returns the median
// divided by word_res->denorm.y_scale().
// NOTE(review): the signature line (listing line 96) is missing from this
// extract; the body reads word_res, unicharset, x_ht_acceptance_tolerance,
// x_ht_min_change and debug_x_ht_level from the enclosing scope - confirm
// against the original source.
// Accumulates weighted votes for candidate x-height values.
97  STATS top_stats(0, MAX_UINT8);
98  TBLOB* blob = word_res->rebuild_word->blobs;
99  int blob_id = 0;
// Walk the blob list in step with the index into best_choice.
// NOTE(review): no bounds check is visible here; presumably best_choice has
// one unichar per rebuilt blob - verify against callers.
100  for (; blob != NULL; blob = blob->next, ++blob_id) {
101  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
102  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
103  int top = blob->bounding_box().top();
104  // Clip the top to the limit of normalized feature space.
105  if (top >= INT_FEAT_RANGE)
106  top = INT_FEAT_RANGE - 1;
107  int bottom = blob->bounding_box().bottom();
108  int min_bottom, max_bottom, min_top, max_top;
109  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
110  &min_top, &max_top);
111  // Chars with a wild top range would mess up the result so ignore them.
112  if (max_top - min_top > kMaxCharTopRange)
113  continue;
// misfit_dist is how far the top lies outside the tolerance band; it is
// positive only when the top is below (min_top - tolerance) or above
// (max_top + tolerance). It doubles as the vote weight further down.
114  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
115  top - (max_top + x_ht_acceptance_tolerance));
// Height of the blob top measured from kBlnBaselineOffset.
116  int height = top - kBlnBaselineOffset;
117  if (debug_x_ht_level >= 20) {
118  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
119  unicharset.id_to_unichar(class_id),
120  height, min_bottom, max_bottom, min_top, max_top,
121  bottom, top);
122  }
123  // Use only chars that fit in the expected bottom range, and where
124  // the range of tops is sensibly near the xheight.
// The min_top > kBlnBaselineOffset test also keeps the divisor used for
// max_xht strictly positive; misfit_dist > 0 restricts voting to misfits.
125  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
126  bottom - x_ht_acceptance_tolerance <= max_bottom &&
127  min_top > kBlnBaselineOffset &&
128  max_top - kBlnBaselineOffset >= kBlnXHeight &&
129  misfit_dist > 0) {
130  // Compute the x-height position using proportionality between the
131  // actual height and expected height.
// Dividing by the tallest expected top gives the smallest candidate x-height;
// dividing by the shortest expected top gives the largest.
132  int min_xht = DivRounded(height * kBlnXHeight,
133  max_top - kBlnBaselineOffset);
134  int max_xht = DivRounded(height * kBlnXHeight,
135  min_top - kBlnBaselineOffset);
136  if (debug_x_ht_level >= 20) {
137  tprintf(" xht range min=%d, max=%d\n",
138  min_xht, max_xht);
139  }
140  // The range of expected heights gets a vote equal to the distance
141  // of the actual top from the expected top.
142  for (int y = min_xht; y <= max_xht; ++y)
143  top_stats.add(y, misfit_dist);
// Note: this branch is taken for any blob failing the combined condition
// above (e.g. bottom out of range), not only for blobs whose top fits.
144  } else if (debug_x_ht_level >= 20) {
145  tprintf(" already OK\n");
146  }
147  }
148  }
// No votes were cast, so there is nothing to suggest.
149  if (top_stats.get_total() == 0)
150  return 0.0f;
151  // The new xheight is just the median vote, which is then scaled out
152  // of BLN space back to pixel space to get the x-height in pixel space.
153  float new_xht = top_stats.median();
154  if (debug_x_ht_level >= 20) {
155  tprintf("Median xht=%f\n", new_xht);
156  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
157  new_xht, new_xht / word_res->denorm.y_scale());
158  }
159  // The xheight must change by at least x_ht_min_change to be used.
// NOTE(review): fabs is used but only <string.h> and <ctype.h> are included
// directly in this file; presumably <math.h> arrives via a project header.
160  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
161  return new_xht / word_res->denorm.y_scale();
162  else
163  return 0.0f;
164 }
165 
166 } // namespace tesseract