Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
gap_map.cpp
Go to the documentation of this file.
1 #include "mfcpch.h"
2 #include "statistc.h"
3 #include "gap_map.h"
4 
5 #define EXTERN
6 EXTERN BOOL_VAR (gapmap_debug, FALSE, "Say which blocks have tables");
8 "Use large space at start and end of rows");
10 "Ensure gaps not less than 2quanta wide");
11 EXTERN double_VAR (gapmap_big_gaps, 1.75, "xht multiplier");
12 
13 /*************************************************************************
14  * A block gap map is a quantised histogram of whitespace regions in the
15  * block. It is a vertical projection of wide gaps WITHIN lines
16  *
17  * The map is held as an array of counts of rows which have a wide gap
18  * covering that region of the row. Each bucket in the map represents a width
19  * of about half an xheight - (The median of the xhts in the rows is used.)
20  *
21  * The block is considered RECTANGULAR - delimited by the left and right
22  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
23  * counted.
24  *
25  *************************************************************************/
26 
27 GAPMAP::GAPMAP( //Constructor
28  TO_BLOCK *block //block
29  ) {
30  TO_ROW_IT row_it; //row iterator
31  TO_ROW *row; //current row
32  BLOBNBOX_IT blob_it; //iterator
33  TBOX blob_box;
34  TBOX prev_blob_box;
35  inT16 gap_width;
36  inT16 start_of_row;
37  inT16 end_of_row;
38  STATS xht_stats (0, 128);
39  inT16 min_quantum;
40  inT16 max_quantum;
41  inT16 i;
42 
43  row_it.set_to_list (block->get_rows ());
44  /*
45  Find left and right extremes and bucket size
46  */
47  map = NULL;
48  min_left = MAX_INT16;
49  max_right = -MAX_INT16;
50  total_rows = 0;
51  any_tabs = FALSE;
52  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
53  row = row_it.data ();
54  if (!row->blob_list ()->empty ()) {
55  total_rows++;
56  xht_stats.add ((inT16) floor (row->xheight + 0.5), 1);
57  blob_it.set_to_list (row->blob_list ());
58  start_of_row = blob_it.data ()->bounding_box ().left ();
59  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
60  if (min_left > start_of_row)
61  min_left = start_of_row;
62  if (max_right < end_of_row)
63  max_right = end_of_row;
64  }
65  }
66  if ((total_rows < 3) || (min_left >= max_right)) {
67  total_rows = 0;
68  min_left = max_right = 0;
69  return;
70  }
71  bucket_size = (inT16) floor (xht_stats.median () + 0.5) / 2;
72  map_max = (max_right - min_left) / bucket_size;
73  map = (inT16 *) alloc_mem ((map_max + 1) * sizeof (inT16));
74  for (i = 0; i <= map_max; i++)
75  map[i] = 0;
76 
77  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
78  row = row_it.data ();
79  if (!row->blob_list ()->empty ()) {
80  blob_it.set_to_list (row->blob_list ());
81  blob_it.mark_cycle_pt ();
82  blob_box = box_next (&blob_it);
83  prev_blob_box = blob_box;
84  if (gapmap_use_ends) {
85  /* Leading space */
86  gap_width = blob_box.left () - min_left;
87  if ((gap_width > gapmap_big_gaps * row->xheight)
88  && gap_width > 2) {
89  max_quantum = (blob_box.left () - min_left) / bucket_size;
90  for (i = 0; i <= max_quantum; i++)
91  map[i]++;
92  }
93  }
94  while (!blob_it.cycled_list ()) {
95  blob_box = box_next (&blob_it);
96  gap_width = blob_box.left () - prev_blob_box.right ();
97  if ((gap_width > gapmap_big_gaps * row->xheight)
98  && gap_width > 2) {
99  min_quantum =
100  (prev_blob_box.right () - min_left) / bucket_size;
101  max_quantum = (blob_box.left () - min_left) / bucket_size;
102  for (i = min_quantum; i <= max_quantum; i++)
103  map[i]++;
104  }
105  prev_blob_box = blob_box;
106  }
107  if (gapmap_use_ends) {
108  /* Trailing space */
109  gap_width = max_right - prev_blob_box.right ();
110  if ((gap_width > gapmap_big_gaps * row->xheight)
111  && gap_width > 2) {
112  min_quantum =
113  (prev_blob_box.right () - min_left) / bucket_size;
114  for (i = min_quantum; i <= map_max; i++)
115  map[i]++;
116  }
117  }
118  }
119  }
120  for (i = 0; i <= map_max; i++) {
121  if (map[i] > total_rows / 2) {
123  (((i == 0) &&
124  (map[i + 1] <= total_rows / 2)) ||
125  ((i == map_max) &&
126  (map[i - 1] <= total_rows / 2)) ||
127  ((i > 0) &&
128  (i < map_max) &&
129  (map[i - 1] <= total_rows / 2) &&
130  (map[i + 1] <= total_rows / 2)))) {
131  map[i] = 0; //prevent isolated quantum
132  }
133  else
134  any_tabs = TRUE;
135  }
136  }
137  if (gapmap_debug && any_tabs)
138  tprintf ("Table found\n");
139 }
140 
141 
142 /*************************************************************************
143  * GAPMAP::table_gap()
144  * Is there a bucket in the specified range where more than half the rows in the
145  * block have a wide gap?
146  *************************************************************************/
147 
148 BOOL8 GAPMAP::table_gap( //Is gap a table?
149  inT16 left, //From here
150  inT16 right //To here
151  ) {
152  inT16 min_quantum;
153  inT16 max_quantum;
154  inT16 i;
155  BOOL8 tab_found = FALSE;
156 
157  if (!any_tabs)
158  return FALSE;
159 
160  min_quantum = (left - min_left) / bucket_size;
161  max_quantum = (right - min_left) / bucket_size;
162  for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++)
163  if (map[i] > total_rows / 2)
164  tab_found = TRUE;
165  return tab_found;
166 }