Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tablefind.cpp
Go to the documentation of this file.
1 
2 // File: tablefind.cpp
3 // Description: Helper classes to find tables from ColPartitions.
4 // Author: Faisal Shafait (faisal.shafait@dfki.de)
5 // Created: Tue Jan 06 11:13:01 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tablefind.h"
25 #include <math.h>
26 #ifdef HAVE_CONFIG_H
27 #include "config_auto.h"
28 #endif
29 
30 #include "allheaders.h"
31 
32 #include "colpartitionset.h"
33 #include "tablerecog.h"
34 
35 namespace tesseract {
36 
37 // These numbers are used to calculate the global median stats.
38 // They just set an upper bound on the stats objects.
39 // Maximum vertical spacing between neighbor partitions.
40 const int kMaxVerticalSpacing = 500;
41 // Maximum width of a blob in a partition.
42 const int kMaxBlobWidth = 500;
43 
44 // Minimum whitespace size to split a partition (measured as a multiple
45 // of a partition's median width).
46 const double kSplitPartitionSize = 2.0;
47 // To insert text, the partition must satisfy these size constraints
48 // in AllowTextPartition(). The idea is to filter noise partitions
49 // determined by the size compared to the global medians.
50 // TODO(nbeato): Need to find good numbers again.
51 const double kAllowTextHeight = 0.5;
52 const double kAllowTextWidth = 0.6;
53 const double kAllowTextArea = 0.8;
54 // The same thing applies to blobs (to filter noise).
55 // TODO(nbeato): These numbers are a shot in the dark...
56 // height and width are 0.5 * gridsize() in colfind.cpp
57 // area is a rough guess for the size of a period.
58 const double kAllowBlobHeight = 0.3;
59 const double kAllowBlobWidth = 0.4;
60 const double kAllowBlobArea = 0.05;
61 
62 // Minimum number of components in a text partition. A partition having fewer
63 // components than that is more likely a data partition and is a candidate
64 // table cell.
65 const int kMinBoxesInTextPartition = 10;
66 
67 // Maximum number of components that a data partition can have
68 const int kMaxBoxesInDataPartition = 20;
69 
70 // Maximum allowed gap in a text partitions as a multiple of its median size.
71 const double kMaxGapInTextPartition = 4.0;
72 
73 // Minimum value that the maximum gap in a text partition should have as a
74 // factor of its median size.
75 const double kMinMaxGapInTextPartition = 0.5;
76 
77 // The amount of overlap that is "normal" for adjacent blobs in a text
78 // partition. This is used to calculate gap between overlapping blobs.
79 const double kMaxBlobOverlapFactor = 4.0;
80 
81 // Maximum x-height a table partition can have as a multiple of global
82 // median x-height
83 const double kMaxTableCellXheight = 2.0;
84 
85 // Maximum line spacing between a table column header and column contents
86 // for merging the two (as a multiple of the partition's median_size).
88 
89 // Minimum ratio of num_table_partitions to num_text_partitions in a column
90 // block to be called it a table column
91 const double kTableColumnThreshold = 3.0;
92 
93 // Search for horizontal ruling lines within the vertical margin as a
94 // multiple of grid size
95 const int kRulingVerticalMargin = 3;
96 
97 // Minimum overlap that a colpartition must have with a table region
98 // to become part of that table
99 const double kMinOverlapWithTable = 0.6;
100 
101 // Maximum side space (distance from column boundary) that a typical
102 // text-line in flowing text should have as a multiple of its x-height
103 // (Median size).
104 const int kSideSpaceMargin = 10;
105 
106 // Fraction of the peak of x-projection of a table region to set the
107 // threshold for the x-projection histogram
108 const double kSmallTableProjectionThreshold = 0.35;
109 const double kLargeTableProjectionThreshold = 0.45;
110 // Minimum number of rows required to look for more rows in the projection.
111 const int kLargeTableRowCount = 6;
112 
113 // Minimum number of rows in a table
114 const int kMinRowsInTable = 3;
115 
116 // The number of "whitespace blobs" that should appear between the
117 // ColPartition's bounding box and the column tab stops to the left/right
118 // when looking for center justified tab stops.
119 const double kRequiredFullJustifiedSpacing = 4.0;
120 
121 // The amount of padding (multiplied by global_median_xheight_ during use)
122 // that is vertically added to the search adjacent leader search during
123 // ColPartition marking.
125 
126 // Used when filtering false positives. When finding the last line
127 // of a paragraph (typically left-aligned), the previous line should have
128 // its center to the right of the last line by this scaled amount.
130 
131 // The maximum amount of whitespace allowed left of a paragraph ending.
132 // Do not filter a ColPartition with more than this space left of it.
134 
135 // Used when filtering false positives. The last line of a paragraph
136 // should be preceded by a line that is predominantly text. This is the
137 // ratio of text to whitespace (to the right of the text) that is required
138 // for the previous line to be a text.
140 
141 // When counting table columns, this is the required gap between two columns
142 // (it is multiplied by global_median_xheight_).
143 const double kMaxXProjectionGapFactor = 2.0;
144 
145 // Used for similarity in partitions using stroke width. Values copied
146 // from ColFind.cpp in Ray's CL.
148 const double kStrokeWidthConstantTolerance = 2.0;
149 
150 BOOL_VAR(textord_dump_table_images, false, "Paint table detection output");
151 BOOL_VAR(textord_show_tables, false, "Show table regions");
153  "Debug table marking steps in detail");
155  "Show page stats used in table finding");
157  "Enables the table recognizer for table layout and filtering.");
158 
161 
162 // Templated helper function used to create destructor callbacks for the
163 // BBGrid::ClearGridData() method.
164 template <typename T> void DeleteObject(T *object) {
165  delete object;
166 }
167 
169  : resolution_(0),
170  global_median_xheight_(0),
171  global_median_blob_width_(0),
172  global_median_ledding_(0),
173  left_to_right_language_(true) {
174 }
175 
177  // ColPartitions and ColSegments created by this class for storage in grids
178  // need to be deleted explicitly.
179  clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
180  leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
181  fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
182  col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
183  table_grid_.ClearGridData(&DeleteObject<ColSegment>);
184 }
185 
187  left_to_right_language_ = order;
188 }
189 
190 void TableFinder::Init(int grid_size, const ICOORD& bottom_left,
191  const ICOORD& top_right) {
192  // Initialize clean partitions list and grid
193  clean_part_grid_.Init(grid_size, bottom_left, top_right);
194  leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
195  fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
196  col_seg_grid_.Init(grid_size, bottom_left, top_right);
197  table_grid_.Init(grid_size, bottom_left, top_right);
198 }
199 
200 // Copy cleaned partitions from part_grid_ to clean_part_grid_ and
201 // insert leaders and rulers into the leader_and_ruling_grid_
203  TO_BLOCK* block) {
204  // Calculate stats. This lets us filter partitions in AllowTextPartition()
205  // and filter blobs in AllowBlob().
206  SetGlobalSpacings(grid);
207 
208  // Iterate the ColPartitions in the grid.
209  ColPartitionGridSearch gsearch(grid);
210  gsearch.SetUniqueMode(true);
211  gsearch.StartFullSearch();
212  ColPartition* part = NULL;
213  while ((part = gsearch.NextFullSearch()) != NULL) {
214  // Reject partitions with nothing useful inside of them.
215  if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0)
216  continue;
217  ColPartition* clean_part = part->ShallowCopy();
218  ColPartition* leader_part = NULL;
219  if (part->IsLineType()) {
220  InsertRulingPartition(clean_part);
221  continue;
222  }
223  // Insert all non-text partitions to clean_parts
224  if (!part->IsTextType()) {
225  InsertImagePartition(clean_part);
226  continue;
227  }
228  // Insert text colpartitions after removing noisy components from them
229  // The leaders are split into a separate grid.
230  BLOBNBOX_CLIST* part_boxes = part->boxes();
231  BLOBNBOX_C_IT pit(part_boxes);
232  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
233  BLOBNBOX *pblob = pit.data();
234  // Bad blobs... happens in UNLV set.
235  // news.3G1, page 17 (around x=6)
236  if (!AllowBlob(*pblob))
237  continue;
238  if (pblob->flow() == BTFT_LEADER) {
239  if (leader_part == NULL) {
240  leader_part = part->ShallowCopy();
241  leader_part->set_flow(BTFT_LEADER);
242  }
243  leader_part->AddBox(pblob);
244  } else if (pblob->region_type() != BRT_NOISE) {
245  clean_part->AddBox(pblob);
246  }
247  }
248  clean_part->ComputeLimits();
249  ColPartition* fragmented = clean_part->CopyButDontOwnBlobs();
250  InsertTextPartition(clean_part);
252  if (leader_part != NULL) {
253  // TODO(nbeato): Note that ComputeLimits does not update the column
254  // information. So the leader may appear to span more columns than it
255  // really does later on when IsInSameColumnAs gets called to test
256  // for adjacent leaders.
257  leader_part->ComputeLimits();
258  InsertLeaderPartition(leader_part);
259  }
260  }
261 
262  // Make the partition partners better for upper and lower neighbors.
265 }
266 
267 // High level function to perform table detection
269  ColPartitionSet** all_columns,
270  WidthCallback* width_cb,
271  const FCOORD& reskew) {
272  // initialize spacing, neighbors, and columns
273  InitializePartitions(all_columns);
274  if (textord_show_tables) {
275  ScrollView* table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
281 
282  table_win = MakeWindow(100, 300, "Fragmented Text");
284  }
285  // mark, filter, and smooth candidate table partitions
287 
288  // Make single-column blocks from good_columns_ partitions. col_segments are
289  // moved to a grid later which takes the ownership
290  ColSegment_LIST column_blocks;
291  GetColumnBlocks(all_columns, &column_blocks);
292  // Set the ratio of candidate table partitions in each column
293  SetColumnsType(&column_blocks);
294 
295  // Move column segments to col_seg_grid_
296  MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
297 
298  // Detect split in column layout that might have occurred due to the
299  // presence of a table. In such a case, merge the corresponding columns.
301 
302  // Group horizontally overlapping table partitions into table columns.
303  // table_columns created here get deleted at the end of this method.
304  ColSegment_LIST table_columns;
305  GetTableColumns(&table_columns);
306 
307  // Within each column, mark the range table regions occupy based on the
308  // table columns detected. table_regions are moved to a grid later which
309  // takes the ownership
310  ColSegment_LIST table_regions;
311  GetTableRegions(&table_columns, &table_regions);
312 
314  ScrollView* table_win = MakeWindow(1200, 300, "Table Columns and Regions");
315  DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
316  DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
317  }
318 
319  // Merge table regions across columns for tables spanning multiple
320  // columns
321  MoveColSegmentsToGrid(&table_regions, &table_grid_);
323 
324  // Adjust table boundaries by including nearby horizontal lines and left
325  // out column headers
328 
330  // Remove false alarms consiting of a single column
332 
333  if (textord_show_tables) {
334  ScrollView* table_win = MakeWindow(1200, 300, "Detected Table Locations");
336  DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
337  table_grid_.DisplayBoxes(table_win);
338  }
339 
340  // Find table grid structure and reject tables that are malformed.
341  RecognizeTables();
343  RecognizeTables();
344 
345  if (textord_show_tables) {
346  ScrollView* table_win = MakeWindow(1400, 600, "Recognized Tables");
349  table_grid_.DisplayBoxes(table_win);
350  }
351  } else {
352  // Remove false alarms consiting of a single column
353  // TODO(nbeato): verify this is a NOP after structured table rejection.
354  // Right now it isn't. If the recognize function is doing what it is
355  // supposed to do, this function is obsolete.
357 
358  if (textord_show_tables) {
359  ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables");
362  table_grid_.DisplayBoxes(table_win);
363  }
364  }
365 
367  WriteToPix(reskew);
368 
369  // Merge all colpartitions in table regions to make them a single
370  // colpartition and revert types of isolated table cells not
371  // assigned to any table to their original types.
372  MakeTableBlocks(grid, all_columns, width_cb);
373 }
374 // All grids have the same dimensions. The clean_part_grid_ sizes are set from
375 // the part_grid_ that is passed to InsertCleanPartitions, which was the same as
376 // the grid that is the base of ColumnFinder. Just return the clean_part_grid_
377 // dimensions instead of duplicated memory.
379  return clean_part_grid_.gridsize();
380 }
382  return clean_part_grid_.gridwidth();
383 }
385  return clean_part_grid_.gridheight();
386 }
387 const ICOORD& TableFinder::bleft() const {
388  return clean_part_grid_.bleft();
389 }
390 const ICOORD& TableFinder::tright() const {
391  return clean_part_grid_.tright();
392 }
393 
395  ASSERT_HOST(part != NULL);
396  if (AllowTextPartition(*part)) {
397  clean_part_grid_.InsertBBox(true, true, part);
398  } else {
399  delete part;
400  }
401 }
403  ASSERT_HOST(part != NULL);
404  if (AllowTextPartition(*part)) {
405  fragmented_text_grid_.InsertBBox(true, true, part);
406  } else {
407  delete part;
408  }
409 }
411  ASSERT_HOST(part != NULL);
412  if (!part->IsEmpty() && part->bounding_box().area() > 0) {
413  leader_and_ruling_grid_.InsertBBox(true, true, part);
414  } else {
415  delete part;
416  }
417 }
419  leader_and_ruling_grid_.InsertBBox(true, true, part);
420 }
422  // NOTE: If images are placed into a different grid in the future,
423  // the function SetPartitionSpacings needs to be updated. It should
424  // be the only thing that cares about image partitions.
425  clean_part_grid_.InsertBBox(true, true, part);
426 }
427 
428 // Splits a partition into its "words". The splits happen
429 // at locations with wide inter-blob spacing. This is useful
430 // because it allows the table recognize to "cut through" the
431 // text lines on the page. The assumption is that a table
432 // will have several lines with similar overlapping whitespace
433 // whereas text will not have this type of property.
434 // Note: The code Assumes that blobs are sorted by the left side x!
435 // This will not work (as well) if the blobs are sorted by center/right.
437  ASSERT_HOST(part != NULL);
438  // Bye bye empty partitions!
439  if (part->boxes()->empty()) {
440  delete part;
441  return;
442  }
443 
444  // The AllowBlob function prevents this.
445  ASSERT_HOST(part->median_width() > 0);
446  const double kThreshold = part->median_width() * kSplitPartitionSize;
447 
448  ColPartition* right_part = part;
449  bool found_split = true;
450  while (found_split) {
451  found_split = false;
452  BLOBNBOX_C_IT box_it(right_part->boxes());
453  // Blobs are sorted left side first. If blobs overlap,
454  // the previous blob may have a "more right" right side.
455  // Account for this by always keeping the largest "right"
456  // so far.
457  int previous_right = MIN_INT32;
458 
459  // Look for the next split in the partition.
460  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461  const TBOX& box = box_it.data()->bounding_box();
462  if (previous_right != MIN_INT32 &&
463  box.left() - previous_right > kThreshold) {
464  // We have a split position. Split the partition in two pieces.
465  // Insert the left piece in the grid and keep processing the right.
466  int mid_x = (box.left() + previous_right) / 2;
467  ColPartition* left_part = right_part;
468  right_part = left_part->SplitAt(mid_x);
469 
471  found_split = true;
472  break;
473  }
474 
475  // The right side of the previous blobs.
476  previous_right = MAX(previous_right, box.right());
477  }
478  }
479  // When a split is not found, the right part is minimized
480  // as much as possible, so process it.
481  InsertFragmentedTextPartition(right_part);
482 }
483 
484 // Some simple criteria to filter out now. We want to make sure the
485 // average blob size in the partition is consistent with the
486 // global page stats.
487 // The area metric will almost always pass for multi-blob partitions.
488 // It is useful when filtering out noise caused by an isolated blob.
490  const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
491  const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
492  const int median_area = global_median_xheight_ * global_median_blob_width_;
493  const double kAreaPerBlobRequired = median_area * kAllowTextArea;
494  // Keep comparisons strictly greater to disallow 0!
495  return part.median_size() > kHeightRequired &&
496  part.median_width() > kWidthRequired &&
497  part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
498 }
499 
500 // Same as above, applied to blobs. Keep in mind that
501 // leaders, commas, and periods are important in tables.
502 bool TableFinder::AllowBlob(const BLOBNBOX& blob) const {
503  const TBOX& box = blob.bounding_box();
504  const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
505  const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
506  const int median_area = global_median_xheight_ * global_median_blob_width_;
507  const double kAreaRequired = median_area * kAllowBlobArea;
508  // Keep comparisons strictly greater to disallow 0!
509  return box.height() > kHeightRequired &&
510  box.width() > kWidthRequired &&
511  box.area() > kAreaRequired;
512 }
513 
514 // TODO(nbeato): The grid that makes the window doesn't seem to matter.
515 // The only downside is that window messages will be caught by
516 // clean_part_grid_ instead of a useful object. This is a temporary solution
517 // for the debug windows created by the TableFinder.
518 ScrollView* TableFinder::MakeWindow(int x, int y, const char* window_name) {
519  return clean_part_grid_.MakeWindow(x, y, window_name);
520 }
521 
522 // Make single-column blocks from good_columns_ partitions.
524  ColSegment_LIST* column_blocks) {
525  for (int i = 0; i < gridheight(); ++i) {
526  ColPartitionSet* columns = all_columns[i];
527  if (columns != NULL) {
528  ColSegment_LIST new_blocks;
529  // Get boxes from the current vertical position on the grid
530  columns->GetColumnBoxes(i * gridsize(), (i+1) * gridsize(), &new_blocks);
531  // Merge the new_blocks boxes into column_blocks if they are well-aligned
532  GroupColumnBlocks(&new_blocks, column_blocks);
533  }
534  }
535 }
536 
537 // Merge column segments into the current list if they are well aligned.
538 void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks,
539  ColSegment_LIST* column_blocks) {
540  ColSegment_IT src_it(new_blocks);
541  ColSegment_IT dest_it(column_blocks);
542  // iterate through the source list
543  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
544  ColSegment* src_seg = src_it.data();
545  TBOX src_box = src_seg->bounding_box();
546  bool match_found = false;
547  // iterate through the destination list to find a matching column block
548  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
549  ColSegment* dest_seg = dest_it.data();
550  TBOX dest_box = dest_seg->bounding_box();
551  if (ConsecutiveBoxes(src_box, dest_box)) {
552  // If matching block is found, insert the current block into it
553  // and delete the soure block
554  dest_seg->InsertBox(src_box);
555  match_found = true;
556  delete src_it.extract();
557  break;
558  }
559  }
560  // If no match is found, just append the source block to column_blocks
561  if (!match_found) {
562  dest_it.add_after_then_move(src_it.extract());
563  }
564  }
565 }
566 
567 // are the two boxes immediate neighbors along the vertical direction
568 bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
569  int x_margin = 20;
570  int y_margin = 5;
571  return (abs(b1.left() - b2.left()) < x_margin) &&
572  (abs(b1.right() - b2.right()) < x_margin) &&
573  (abs(b1.top()-b2.bottom()) < y_margin ||
574  abs(b2.top()-b1.bottom()) < y_margin);
575 }
576 
577 // Set up info for clean_part_grid_ partitions to be valid during detection
578 // code.
580  FindNeighbors();
581  SetPartitionSpacings(&clean_part_grid_, all_columns);
583 }
584 
585 // Set left, right and top, bottom spacings of each colpartition.
587  ColPartitionSet** all_columns) {
588  // Iterate the ColPartitions in the grid.
589  ColPartitionGridSearch gsearch(grid);
590  gsearch.StartFullSearch();
591  ColPartition* part = NULL;
592  while ((part = gsearch.NextFullSearch()) != NULL) {
593  ColPartitionSet* columns = all_columns[gsearch.GridY()];
594  TBOX box = part->bounding_box();
595  int y = part->MidY();
596  ColPartition* left_column = columns->ColumnContaining(box.left(), y);
597  ColPartition* right_column = columns->ColumnContaining(box.right(), y);
598  // set distance from left column as space to the left
599  if (left_column) {
600  int left_space = MAX(0, box.left() - left_column->LeftAtY(y));
601  part->set_space_to_left(left_space);
602  }
603  // set distance from right column as space to the right
604  if (right_column) {
605  int right_space = MAX(0, right_column->RightAtY(y) - box.right());
606  part->set_space_to_right(right_space);
607  }
608 
609  // Look for images that may be closer.
610  // NOTE: used to be part_grid_, might cause issues now
611  ColPartitionGridSearch hsearch(grid);
612  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
613  ColPartition* neighbor = NULL;
614  while ((neighbor = hsearch.NextSideSearch(true)) != NULL) {
615  if (neighbor->type() == PT_PULLOUT_IMAGE ||
616  neighbor->type() == PT_FLOWING_IMAGE ||
617  neighbor->type() == PT_HEADING_IMAGE) {
618  int right = neighbor->bounding_box().right();
619  if (right < box.left()) {
620  int space = MIN(box.left() - right, part->space_to_left());
621  part->set_space_to_left(space);
622  }
623  }
624  }
625  hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
626  neighbor = NULL;
627  while ((neighbor = hsearch.NextSideSearch(false)) != NULL) {
628  if (neighbor->type() == PT_PULLOUT_IMAGE ||
629  neighbor->type() == PT_FLOWING_IMAGE ||
630  neighbor->type() == PT_HEADING_IMAGE) {
631  int left = neighbor->bounding_box().left();
632  if (left > box.right()) {
633  int space = MIN(left - box.right(), part->space_to_right());
634  part->set_space_to_right(space);
635  }
636  }
637  }
638 
639  ColPartition* upper_part = part->SingletonPartner(true);
640  if (upper_part) {
641  int space = MAX(0, upper_part->bounding_box().bottom() -
642  part->bounding_box().bottom());
643  part->set_space_above(space);
644  } else {
645  // TODO(nbeato): What constitutes a good value?
646  // 0 is the default value when not set, explicitly noting it needs to
647  // be something else.
648  part->set_space_above(MAX_INT32);
649  }
650 
651  ColPartition* lower_part = part->SingletonPartner(false);
652  if (lower_part) {
653  int space = MAX(0, part->bounding_box().bottom() -
654  lower_part->bounding_box().bottom());
655  part->set_space_below(space);
656  } else {
657  // TODO(nbeato): What constitutes a good value?
658  // 0 is the default value when not set, explicitly noting it needs to
659  // be something else.
660  part->set_space_below(MAX_INT32);
661  }
662  }
663 }
664 
665 // Set spacing and closest neighbors above and below a given colpartition.
667  TBOX box = part->bounding_box();
668  int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y());
669  int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y());
670  box.set_top(top_range);
671  box.set_bottom(bottom_range);
672 
673  TBOX part_box = part->bounding_box();
674  // Start a rect search
676  rectsearch(&clean_part_grid_);
677  rectsearch.StartRectSearch(box);
678  ColPartition* neighbor;
679  int min_space_above = kMaxVerticalSpacing;
680  int min_space_below = kMaxVerticalSpacing;
681  ColPartition* above_neighbor = NULL;
682  ColPartition* below_neighbor = NULL;
683  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
684  if (neighbor == part)
685  continue;
686  TBOX neighbor_box = neighbor->bounding_box();
687  if (neighbor_box.major_x_overlap(part_box)) {
688  int gap = abs(part->median_bottom() - neighbor->median_bottom());
689  // If neighbor is below current partition
690  if (neighbor_box.top() < part_box.bottom() &&
691  gap < min_space_below) {
692  min_space_below = gap;
693  below_neighbor = neighbor;
694  } // If neighbor is above current partition
695  else if (part_box.top() < neighbor_box.bottom() &&
696  gap < min_space_above) {
697  min_space_above = gap;
698  above_neighbor = neighbor;
699  }
700  }
701  }
702  part->set_space_above(min_space_above);
703  part->set_space_below(min_space_below);
704  part->set_nearest_neighbor_above(above_neighbor);
705  part->set_nearest_neighbor_below(below_neighbor);
706 }
707 
708 // Set global spacing and x-height estimates
710  STATS xheight_stats(0, kMaxVerticalSpacing + 1);
711  STATS width_stats(0, kMaxBlobWidth + 1);
712  STATS ledding_stats(0, kMaxVerticalSpacing + 1);
713  // Iterate the ColPartitions in the grid.
714  ColPartitionGridSearch gsearch(grid);
715  gsearch.SetUniqueMode(true);
716  gsearch.StartFullSearch();
717  ColPartition* part = NULL;
718  while ((part = gsearch.NextFullSearch()) != NULL) {
719  // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
720  // ComputeLimits needs to get called somewhere outside of TableFinder
721  // to make sure the partitions are properly initialized.
722  // When this is called, SmoothPartitionPartners dies in an assert after
723  // table find runs. Alternative solution.
724  // part->ComputeLimits();
725  if (part->IsTextType()) {
726  // xheight_stats.add(part->median_size(), part->boxes_count());
727  // width_stats.add(part->median_width(), part->boxes_count());
728 
729  // This loop can be removed when above issues are fixed.
730  // Replace it with the 2 lines commented out above.
731  BLOBNBOX_C_IT it(part->boxes());
732  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733  xheight_stats.add(it.data()->bounding_box().height(), 1);
734  width_stats.add(it.data()->bounding_box().width(), 1);
735  }
736 
737  ledding_stats.add(part->space_above(), 1);
738  ledding_stats.add(part->space_below(), 1);
739  }
740  }
741  // Set estimates based on median of statistics obtained
742  set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
743  set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
744  set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
745  #ifndef GRAPHICS_DISABLED
747  const char* kWindowName = "X-height (R), X-width (G), and ledding (B)";
748  ScrollView* stats_win = MakeWindow(500, 10, kWindowName);
749  xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
750  width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
751  ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
752  }
753  #endif // GRAPHICS_DISABLED
754 }
755 
757  global_median_xheight_ = xheight;
758 }
761 }
763  global_median_ledding_ = ledding;
764 }
765 
768  gsearch.StartFullSearch();
769  ColPartition* part = NULL;
770  while ((part = gsearch.NextFullSearch()) != NULL) {
771  // TODO(nbeato): Rename this function, meaning is different now.
772  // IT is finding nearest neighbors its own way
773  //SetVerticalSpacing(part);
774 
775  ColPartition* upper = part->SingletonPartner(true);
776  if (upper)
777  part->set_nearest_neighbor_above(upper);
778 
779  ColPartition* lower = part->SingletonPartner(false);
780  if (lower)
781  part->set_nearest_neighbor_below(lower);
782  }
783 }
784 
785 // High level interface. Input is an unmarked ColPartitionGrid
786 // (namely, clean_part_grid_). Partitions are identified using local
787 // information and filter/smoothed. The function exit should contain
788 // a good sampling of the table partitions.
792  ScrollView* table_win = MakeWindow(300, 300, "Initial Table Partitions");
796  }
799  ScrollView* table_win = MakeWindow(600, 300, "Filtered Table Partitions");
803  }
806  ScrollView* table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
810  }
813  ScrollView* table_win = MakeWindow(900, 300, "Final Table Partitions");
817  }
818 }
819 
820 // These types of partitions are marked as table partitions:
821 // 1- Partitions that have at lease one large gap between words
822 // 2- Partitions that consist of only one word (no significant gap
823 // between components)
824 // 3- Partitions that vertically overlap with other partitions within the
825 // same column.
826 // 4- Partitions with leaders before/after them.
828  // Iterate the ColPartitions in the grid.
830  gsearch(&clean_part_grid_);
831  gsearch.StartFullSearch();
832  ColPartition* part = NULL;
833  while ((part = gsearch.NextFullSearch()) != NULL) {
834  if (!part->IsTextType()) // Only consider text partitions
835  continue;
836  // Only consider partitions in dominant font size or smaller
838  continue;
839  // Mark partitions with a large gap, or no significant gap as
840  // table partitions.
841  // Comments: It produces several false alarms at:
842  // - last line of a paragraph (fixed)
843  // - single word section headings
844  // - page headers and footers
845  // - numbered equations
846  // - line drawing regions
847  // TODO(faisal): detect and fix above-mentioned cases
848  if (HasWideOrNoInterWordGap(part) ||
849  HasLeaderAdjacent(*part)) {
850  part->set_table_type();
851  }
852  }
853 }
854 
855 // Check if the partition has at least one large gap between words or no
856 // significant gap at all
858  // Should only get text partitions.
859  ASSERT_HOST(part->IsTextType());
860  // Blob access
861  BLOBNBOX_CLIST* part_boxes = part->boxes();
862  BLOBNBOX_C_IT it(part_boxes);
863  // Check if this is a relatively small partition (such as a single word)
864  if (part->bounding_box().width() <
866  part_boxes->length() < kMinBoxesInTextPartition)
867  return true;
868 
869  // Variables used to compute inter-blob spacing.
870  int current_x0 = -1;
871  int current_x1 = -1;
872  int previous_x1 = -1;
873  // Stores the maximum gap detected.
874  int largest_partition_gap_found = -1;
875  // Text partition gap limits. If this is text (and not a table),
876  // there should be at least one gap larger than min_gap and no gap
877  // larger than max_gap.
878  const double max_gap = kMaxGapInTextPartition * part->median_size();
879  const double min_gap = kMinMaxGapInTextPartition * part->median_size();
880 
881  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
882  BLOBNBOX* blob = it.data();
883  current_x0 = blob->bounding_box().left();
884  current_x1 = blob->bounding_box().right();
885  if (previous_x1 != -1) {
886  int gap = current_x0 - previous_x1;
887 
888  // TODO(nbeato): Boxes may overlap? Huh?
889  // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
890  // on the top right of the page are filtered out with this line.
891  // Note 2: Iterating over blobs in a partition, so we are looking for
892  // spacing between the words.
893  if (gap < 0) {
894  // More likely case, the blobs slightly overlap. This can happen
895  // with diacritics (accents) or broken alphabet symbols (characters).
896  // Merge boxes together by taking max of right sides.
897  if (-gap < part->median_size() * kMaxBlobOverlapFactor) {
898  previous_x1 = MAX(previous_x1, current_x1);
899  continue;
900  }
901  // Extreme case, blobs overlap significantly in the same partition...
902  // This should not happen often (if at all), but it does.
903  // TODO(nbeato): investigate cases when this happens.
904  else {
905  // The behavior before was to completely ignore this case.
906  }
907  }
908 
909  // If a large enough gap is found, mark it as a table cell (return true)
910  if (gap > max_gap)
911  return true;
912  if (gap > largest_partition_gap_found)
913  largest_partition_gap_found = gap;
914  }
915  previous_x1 = current_x1;
916  }
917  // Since no large gap was found, return false if the partition is too
918  // long to be a data cell
919  if (part->bounding_box().width() >
921  part_boxes->length() > kMaxBoxesInDataPartition)
922  return false;
923 
924  // A partition may be a single blob. In this case, it's an isolated symbol
925  // or non-text (such as a ruling or image).
926  // Detect these as table partitions? Shouldn't this be case by case?
927  // The behavior before was to ignore this, making max_partition_gap < 0
928  // and implicitly return true. Just making it explicit.
929  if (largest_partition_gap_found == -1)
930  return true;
931 
932  // return true if the maximum gap found is smaller than the minimum allowed
933  // max_gap in a text partition. This indicates that there is no signficant
934  // space in the partition, hence it is likely a single word.
935  return largest_partition_gap_found < min_gap;
936 }
937 
938 // A criteria for possible tables is that a table may have leaders
939 // between data cells. An aggressive solution to find such tables is to
940 // explicitly mark partitions that have adjacent leaders.
941 // Note that this includes overlapping leaders. However, it does not
942 // include leaders in different columns on the page.
943 // Possible false-positive will include lists, such as a table of contents.
944 // As these arise, the agressive nature of this search may need to be
945 // trimmed down.
947  if (part.flow() == BTFT_LEADER)
948  return true;
949  // Search range is left and right bounded by an offset of the
950  // median xheight. This offset is to allow some tolerance to the
951  // the leaders on the page in the event that the alignment is still
952  // a bit off.
953  const TBOX& box = part.bounding_box();
954  const int search_size = kAdjacentLeaderSearchPadding * global_median_xheight_;
955  const int top = box.top() + search_size;
956  const int bottom = box.bottom() - search_size;
958  for (int direction = 0; direction < 2; ++direction) {
959  bool right_to_left = (direction == 0);
960  int x = right_to_left ? box.right() : box.left();
961  hsearch.StartSideSearch(x, bottom, top);
962  ColPartition* leader = NULL;
963  while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
964  // This should not happen, they are in different grids.
965  ASSERT_HOST(&part != leader);
966  // The leader could be a horizontal ruling in the grid.
967  // Make sure it is actually a leader.
968  if (leader->flow() != BTFT_LEADER)
969  continue;
970  // Make sure the leader shares a page column with the partition,
971  // otherwise we are spreading across columns.
972  if (!part.IsInSameColumnAs(*leader))
973  break;
974  // There should be a significant vertical overlap
975  if (!leader->VSignificantCoreOverlap(part))
976  continue;
977  // Leader passed all tests, so it is adjacent.
978  return true;
979  }
980  }
981  // No leaders are adjacent to the given partition.
982  return false;
983 }
984 
985 // Filter individual text partitions marked as table partitions
986 // consisting of paragraph endings, small section headings, and
987 // headers and footers.
991  // TODO(nbeato): Fully justified text as non-table?
992 }
993 
995  // Detect last line of paragraph
996  // Iterate the ColPartitions in the grid.
998  gsearch.StartFullSearch();
999  ColPartition* part = NULL;
1000  while ((part = gsearch.NextFullSearch()) != NULL) {
1001  if (part->type() != PT_TABLE)
1002  continue; // Consider only table partitions
1003 
1004  // Paragraph ending should have flowing text above it.
1005  ColPartition* upper_part = part->nearest_neighbor_above();
1006  if (!upper_part)
1007  continue;
1008  if (upper_part->type() != PT_FLOWING_TEXT)
1009  continue;
1010  if (upper_part->bounding_box().width() <
1011  2 * part->bounding_box().width())
1012  continue;
1013  // Check if its the last line of a paragraph.
1014  // In most cases, a paragraph ending should be left-aligned to text line
1015  // above it. Sometimes, it could be a 2 line paragraph, in which case
1016  // the line above it is indented.
1017  // To account for that, check if the partition center is to
1018  // the left of the one above it.
1019  int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1020  int upper_mid = (upper_part->bounding_box().left() +
1021  upper_part->bounding_box().right()) / 2;
1022  int current_spacing = 0; // spacing of the current line to margin
1023  int upper_spacing = 0; // spacing of the previous line to the margin
1025  // Left to right languages, use mid - left to figure out the distance
1026  // the middle is from the left margin.
1027  int left = MIN(part->bounding_box().left(),
1028  upper_part->bounding_box().left());
1029  current_spacing = mid - left;
1030  upper_spacing = upper_mid - left;
1031  } else {
1032  // Right to left languages, use right - mid to figure out the distance
1033  // the middle is from the right margin.
1034  int right = MAX(part->bounding_box().right(),
1035  upper_part->bounding_box().right());
1036  current_spacing = right - mid;
1037  upper_spacing = right - upper_mid;
1038  }
1039  if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing)
1040  continue;
1041 
1042  // Paragraphs should have similar fonts.
1043  if (!part->MatchingSizes(*upper_part) ||
1046  continue;
1047  }
1048 
1049  // The last line of a paragraph should be left aligned.
1050  // TODO(nbeato): This would be untrue if the text was right aligned.
1051  // How often is that?
1052  if (part->space_to_left() >
1054  continue;
1055  // The line above it should be right aligned (assuming justified format).
1056  // Since we can't assume justified text, we compare whitespace to text.
1057  // The above line should have majority spanning text (or the current
1058  // line could have fit on the previous line). So compare
1059  // whitespace to text.
1060  if (upper_part->bounding_box().width() <
1062  continue;
1063 
1064  // Ledding above the line should be less than ledding below
1065  if (part->space_above() >= part->space_below() ||
1066  part->space_above() > 2 * global_median_ledding_)
1067  continue;
1068 
1069  // If all checks failed, it is probably text.
1070  part->clear_table_type();
1071  }
1072 }
1073 
1075  // Consider top-most text colpartition as header and bottom most as footer
1076  ColPartition* header = NULL;
1077  ColPartition* footer = NULL;
1078  int max_top = MIN_INT32;
1079  int min_bottom = MAX_INT32;
1081  gsearch.StartFullSearch();
1082  ColPartition* part = NULL;
1083  while ((part = gsearch.NextFullSearch()) != NULL) {
1084  if (!part->IsTextType())
1085  continue; // Consider only text partitions
1086  int top = part->bounding_box().top();
1087  int bottom = part->bounding_box().bottom();
1088  if (top > max_top) {
1089  max_top = top;
1090  header = part;
1091  }
1092  if (bottom < min_bottom) {
1093  min_bottom = bottom;
1094  footer = part;
1095  }
1096  }
1097  if (header)
1098  header->clear_table_type();
1099  if (footer)
1100  footer->clear_table_type();
1101 }
1102 
1103 // Mark all ColPartitions as table cells that have a table cell above
1104 // and below them
1105 // TODO(faisal): This is too aggressive at the moment. The method needs to
1106 // consider spacing and alignment as well. Detection of false alarm table cells
1107 // should also be done as part of it.
1109  // Iterate the ColPartitions in the grid.
1111  gsearch.StartFullSearch();
1112  ColPartition* part = NULL;
1113  while ((part = gsearch.NextFullSearch()) != NULL) {
1114  if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN)
1115  continue; // Consider only text partitions
1116  ColPartition* upper_part = part->nearest_neighbor_above();
1117  ColPartition* lower_part = part->nearest_neighbor_below();
1118  if (!upper_part || !lower_part)
1119  continue;
1120  if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE)
1121  part->set_table_type();
1122  }
1123 
1124  // Pass 2, do the opposite. If both the upper and lower neighbors
1125  // exist and are not tables, this probably shouldn't be a table.
1126  gsearch.StartFullSearch();
1127  part = NULL;
1128  while ((part = gsearch.NextFullSearch()) != NULL) {
1129  if (part->type() != PT_TABLE)
1130  continue; // Consider only text partitions
1131  ColPartition* upper_part = part->nearest_neighbor_above();
1132  ColPartition* lower_part = part->nearest_neighbor_below();
1133 
1134  // table can't be by itself
1135  if ((upper_part && upper_part->type() != PT_TABLE) &&
1136  (lower_part && lower_part->type() != PT_TABLE)) {
1137  part->clear_table_type();
1138  }
1139  }
1140 }
1141 
1142 // Set the type of a column segment based on the ratio of table to text cells
1143 void TableFinder::SetColumnsType(ColSegment_LIST* column_blocks) {
1144  ColSegment_IT it(column_blocks);
1145  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1146  ColSegment* seg = it.data();
1147  TBOX box = seg->bounding_box();
1148  int num_table_cells = 0;
1149  int num_text_cells = 0;
1151  rsearch(&clean_part_grid_);
1152  rsearch.SetUniqueMode(true);
1153  rsearch.StartRectSearch(box);
1154  ColPartition* part = NULL;
1155  while ((part = rsearch.NextRectSearch()) != NULL) {
1156  if (part->type() == PT_TABLE) {
1157  num_table_cells++;
1158  } else if (part->type() == PT_FLOWING_TEXT) {
1159  num_text_cells++;
1160  }
1161  }
1162  // If a column block has no text or table partition in it, it is not needed
1163  // for table detection.
1164  if (!num_table_cells && !num_text_cells) {
1165  delete it.extract();
1166  } else {
1167  seg->set_num_table_cells(num_table_cells);
1168  seg->set_num_text_cells(num_text_cells);
1169  // set column type based on the ratio of table to text cells
1170  seg->set_type();
1171  }
1172  }
1173 }
1174 
1175 // Move column blocks to grid
1176 void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1177  ColSegmentGrid *col_seg_grid) {
1178  ColSegment_IT it(segments);
1179  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180  ColSegment* seg = it.extract();
1181  col_seg_grid->InsertBBox(true, true, seg);
1182  }
1183 }
1184 
1185 // Merge column blocks if a split is detected due to the presence of a
1186 // table. A text block is considered split if it has multiple
1187 // neighboring blocks above/below it, and at least one of the
1188 // neighboring blocks is of table type (has a high density of table
1189 // partitions). In this case neighboring blocks in the direction
1190 // (above/below) of the table block are merged with the text block.
1191 
1192 // Comment: This method does not handle split due to a full page table
1193 // since table columns in this case do not have a text column on which
1194 // split decision can be based.
1196  int margin = gridsize();
1197 
1198  // Iterate the Column Blocks in the grid.
1200  gsearch(&col_seg_grid_);
1201  gsearch.StartFullSearch();
1202  ColSegment* seg;
1203  while ((seg = gsearch.NextFullSearch()) != NULL) {
1204  if (seg->type() != COL_TEXT)
1205  continue; // only consider text blocks for split detection
1206  bool neighbor_found = false;
1207  bool modified = false; // Modified at least once
1208  // keep expanding current box as long as neighboring table columns
1209  // are found above or below it.
1210  do {
1211  TBOX box = seg->bounding_box();
1212  // slightly expand the search region vertically
1213  int top_range = MIN(box.top() + margin, tright().y());
1214  int bottom_range = MAX(box.bottom() - margin, bleft().y());
1215  box.set_top(top_range);
1216  box.set_bottom(bottom_range);
1217  neighbor_found = false;
1219  rectsearch(&col_seg_grid_);
1220  rectsearch.StartRectSearch(box);
1221  ColSegment* neighbor = NULL;
1222  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1223  if (neighbor == seg)
1224  continue;
1225  const TBOX& neighbor_box = neighbor->bounding_box();
1226  // If the neighbor box significantly overlaps with the current
1227  // box (due to the expansion of the current box in the
1228  // previous iteration of this loop), remove the neighbor box
1229  // and expand the current box to include it.
1230  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1231  seg->InsertBox(neighbor_box);
1232  modified = true;
1233  rectsearch.RemoveBBox();
1234  gsearch.RepositionIterator();
1235  delete neighbor;
1236  continue;
1237  }
1238  // Only expand if the neighbor box is of table type
1239  if (neighbor->type() != COL_TABLE)
1240  continue;
1241  // Insert the neighbor box into the current column block
1242  if (neighbor_box.major_x_overlap(box) &&
1243  !box.contains(neighbor_box)) {
1244  seg->InsertBox(neighbor_box);
1245  neighbor_found = true;
1246  modified = true;
1247  rectsearch.RemoveBBox();
1248  gsearch.RepositionIterator();
1249  delete neighbor;
1250  }
1251  }
1252  } while (neighbor_found);
1253  if (modified) {
1254  // Because the box has changed, it has to be removed first.
1255  gsearch.RemoveBBox();
1256  col_seg_grid_.InsertBBox(true, true, seg);
1257  gsearch.RepositionIterator();
1258  }
1259  }
1260 }
1261 
1262 // Group horizontally overlapping table partitions into table columns.
1263 // TODO(faisal): This is too aggressive at the moment. The method should
1264 // consider more attributes to group table partitions together. Some common
1265 // errors are:
1266 // 1- page number is merged with a table column above it even
1267 // if there is a large vertical gap between them.
1268 // 2- column headers go on to catch one of the columns arbitrarily
1269 // 3- an isolated noise blob near page top or bottom merges with the table
1270 // column below/above it
1271 // 4- cells from two vertically adjacent tables merge together to make a
1272 // single column resulting in merging of the two tables
1273 void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1274  ColSegment_IT it(table_columns);
1275  // Iterate the ColPartitions in the grid.
1277  gsearch(&clean_part_grid_);
1278  gsearch.StartFullSearch();
1279  ColPartition* part;
1280  while ((part = gsearch.NextFullSearch()) != NULL) {
1281  if (part->inside_table_column() || part->type() != PT_TABLE)
1282  continue; // prevent a partition to be assigned to multiple columns
1283  const TBOX& box = part->bounding_box();
1284  ColSegment* col = new ColSegment();
1285  col->InsertBox(box);
1286  part->set_inside_table_column(true);
1287  // Start a search below the current cell to find bottom neighbours
1288  // Note: a full search will always process things above it first, so
1289  // this should be starting at the highest cell and working its way down.
1291  vsearch(&clean_part_grid_);
1292  vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1293  ColPartition* neighbor = NULL;
1294  bool found_neighbours = false;
1295  while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) {
1296  // only consider neighbors not assigned to any column yet
1297  if (neighbor->inside_table_column())
1298  continue;
1299  // Horizontal lines should not break the flow
1300  if (neighbor->IsHorizontalLine())
1301  continue;
1302  // presence of a non-table neighbor marks the end of current
1303  // table column
1304  if (neighbor->type() != PT_TABLE)
1305  break;
1306  // add the neighbor partition to the table column
1307  const TBOX& neighbor_box = neighbor->bounding_box();
1308  col->InsertBox(neighbor_box);
1309  neighbor->set_inside_table_column(true);
1310  found_neighbours = true;
1311  }
1312  if (found_neighbours) {
1313  it.add_after_then_move(col);
1314  } else {
1315  part->set_inside_table_column(false);
1316  delete col;
1317  }
1318  }
1319 }
1320 
1321 // Mark regions in a column that are x-bounded by the column boundaries and
1322 // y-bounded by the table columns' projection on the y-axis as table regions
1323 void TableFinder::GetTableRegions(ColSegment_LIST* table_columns,
1324  ColSegment_LIST* table_regions) {
1325  ColSegment_IT cit(table_columns);
1326  ColSegment_IT rit(table_regions);
1327  // Iterate through column blocks
1329  gsearch(&col_seg_grid_);
1330  gsearch.StartFullSearch();
1331  ColSegment* part;
1332  int page_height = tright().y() - bleft().y();
1333  ASSERT_HOST(page_height > 0);
1334  // create a bool array to hold projection on y-axis
1335  bool* table_region = new bool[page_height];
1336  while ((part = gsearch.NextFullSearch()) != NULL) {
1337  TBOX part_box = part->bounding_box();
1338  // reset the projection array
1339  for (int i = 0; i < page_height; i++) {
1340  table_region[i] = false;
1341  }
1342  // iterate through all table columns to find regions in the current
1343  // page column block
1344  cit.move_to_first();
1345  for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346  TBOX col_box = cit.data()->bounding_box();
1347  // find intersection region of table column and page column
1348  TBOX intersection_box = col_box.intersection(part_box);
1349  // project table column on the y-axis
1350  for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1351  table_region[i - bleft().y()] = true;
1352  }
1353  }
1354  // set x-limits of table regions to page column width
1355  TBOX current_table_box;
1356  current_table_box.set_left(part_box.left());
1357  current_table_box.set_right(part_box.right());
1358  // go through the y-axis projection to find runs of table
1359  // regions. Each run makes one table region.
1360  for (int i = 1; i < page_height; i++) {
1361  // detect start of a table region
1362  if (!table_region[i - 1] && table_region[i]) {
1363  current_table_box.set_bottom(i + bleft().y());
1364  }
1365  // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1366  // detect end of a table region
1367  if (table_region[i - 1] && !table_region[i]) {
1368  current_table_box.set_top(i + bleft().y());
1369  if (!current_table_box.null_box()) {
1370  ColSegment* seg = new ColSegment();
1371  seg->InsertBox(current_table_box);
1372  rit.add_after_then_move(seg);
1373  }
1374  }
1375  }
1376  }
1377  delete[] table_region;
1378 }
1379 
1380 // Merge table regions corresponding to tables spanning multiple columns if
1381 // there is a colpartition (horizontal ruling line or normal text) that
1382 // touches both regions.
1383 // TODO(faisal): A rare error occurs if there are two horizontally adjacent
1384 // tables with aligned ruling lines. In this case, line finder returns a
1385 // single line and hence the tables get merged together
1387  // Iterate the table regions in the grid.
1389  gsearch(&table_grid_);
1390  gsearch.StartFullSearch();
1391  ColSegment* seg = NULL;
1392  while ((seg = gsearch.NextFullSearch()) != NULL) {
1393  bool neighbor_found = false;
1394  bool modified = false; // Modified at least once
1395  do {
1396  // Start a rectangle search x-bounded by the image and y by the table
1397  const TBOX& box = seg->bounding_box();
1398  TBOX search_region(box);
1399  search_region.set_left(bleft().x());
1400  search_region.set_right(tright().x());
1401  neighbor_found = false;
1403  rectsearch(&table_grid_);
1404  rectsearch.StartRectSearch(search_region);
1405  ColSegment* neighbor = NULL;
1406  while ((neighbor = rectsearch.NextRectSearch()) != NULL) {
1407  if (neighbor == seg)
1408  continue;
1409  const TBOX& neighbor_box = neighbor->bounding_box();
1410  // Check if a neighbor box has a large overlap with the table
1411  // region. This may happen as a result of merging two table
1412  // regions in the previous iteration.
1413  if (neighbor_box.overlap_fraction(box) >= 0.9) {
1414  seg->InsertBox(neighbor_box);
1415  rectsearch.RemoveBBox();
1416  gsearch.RepositionIterator();
1417  delete neighbor;
1418  modified = true;
1419  continue;
1420  }
1421  // Check if two table regions belong together based on a common
1422  // horizontal ruling line
1423  if (BelongToOneTable(box, neighbor_box)) {
1424  seg->InsertBox(neighbor_box);
1425  neighbor_found = true;
1426  modified = true;
1427  rectsearch.RemoveBBox();
1428  gsearch.RepositionIterator();
1429  delete neighbor;
1430  }
1431  }
1432  } while (neighbor_found);
1433  if (modified) {
1434  // Because the box has changed, it has to be removed first.
1435  gsearch.RemoveBBox();
1436  table_grid_.InsertBBox(true, true, seg);
1437  gsearch.RepositionIterator();
1438  }
1439  }
1440 }
1441 
1442 // Decide if two table regions belong to one table based on a common
1443 // horizontal ruling line or another colpartition
1444 bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1445  // Check the obvious case. Most likely not true because overlapping boxes
1446  // should already be merged, but seems like a good thing to do in case things
1447  // change.
1448  if (box1.overlap(box2))
1449  return true;
1450  // Check for ColPartitions spanning both table regions
1451  TBOX bbox = box1.bounding_union(box2);
1452  // Start a rect search on bbox
1454  rectsearch(&clean_part_grid_);
1455  rectsearch.StartRectSearch(bbox);
1456  ColPartition* part = NULL;
1457  while ((part = rectsearch.NextRectSearch()) != NULL) {
1458  const TBOX& part_box = part->bounding_box();
1459  // return true if a colpartition spanning both table regions is found
1460  if (part_box.overlap(box1) && part_box.overlap(box2))
1461  return true;
1462  }
1463  return false;
1464 }
1465 
1466 // Adjust table boundaries by:
1467 // - building a tight bounding box around all ColPartitions contained in it.
1468 // - expanding table boundaries to include all colpartitions that overlap the
1469 // table by more than half of their area
1470 // - expanding table boundaries to include nearby horizontal rule lines
1471 // - expanding table vertically to include left out column headers
1472 // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1473 // makes following errors:
1474 // 1- horizontal lines consisting of underlines are included in the table if
1475 // they are close enough
1476 // 2- horizontal lines originating from noise tend to get merged with a table
1477 // near the top of the page
1478 // 3- the criteria for including horizontal lines is very generous. Many times
1479 // horizontal lines separating headers and footers get merged with a
1480 // single-column table in a multi-column page thereby including text
1481 // from the neighboring column inside the table
1482 // 4- the criteria for including left out column headers also tends to
1483 // occasionally include text-lines above the tables, typically from
1484 // table caption
1486  // Iterate the table regions in the grid
1487  ColSegment_CLIST adjusted_tables;
1488  ColSegment_C_IT it(&adjusted_tables);
1490  gsearch.StartFullSearch();
1491  ColSegment* table = NULL;
1492  while ((table = gsearch.NextFullSearch()) != NULL) {
1493  const TBOX& table_box = table->bounding_box();
1494  TBOX grown_box = table_box;
1495  GrowTableBox(table_box, &grown_box);
1496  // To prevent a table from expanding again, do not insert the
1497  // modified box back to the grid. Instead move it to a list and
1498  // and remove it from the grid. The list is moved later back to the grid.
1499  if (!grown_box.null_box()) {
1500  ColSegment* col = new ColSegment();
1501  col->InsertBox(grown_box);
1502  it.add_after_then_move(col);
1503  }
1504  gsearch.RemoveBBox();
1505  delete table;
1506  }
1507  // clear table grid to move final tables in it
1508  // TODO(nbeato): table_grid_ should already be empty. The above loop
1509  // removed everything. Maybe just assert it is empty?
1510  table_grid_.Clear();
1511  it.move_to_first();
1512  // move back final tables to table_grid_
1513  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1514  ColSegment* seg = it.extract();
1515  table_grid_.InsertBBox(true, true, seg);
1516  }
1517 }
1518 
1519 void TableFinder::GrowTableBox(const TBOX& table_box, TBOX* result_box) {
1520  // TODO(nbeato): The growing code is a bit excessive right now.
1521  // By removing these lines, the partitions considered need
1522  // to have some overlap or be special cases. These lines could
1523  // be added again once a check is put in place to make sure that
1524  // growing tables don't stomp on a lot of non-table partitions.
1525 
1526  // search for horizontal ruling lines within the vertical margin
1527  // int vertical_margin = kRulingVerticalMargin * gridsize();
1528  TBOX search_box = table_box;
1529  // int top = MIN(search_box.top() + vertical_margin, tright().y());
1530  // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1531  // search_box.set_top(top);
1532  // search_box.set_bottom(bottom);
1533 
1534  GrowTableToIncludePartials(table_box, search_box, result_box);
1535  GrowTableToIncludeLines(table_box, search_box, result_box);
1536  IncludeLeftOutColumnHeaders(result_box);
1537 }
1538 
1539 // Grow a table by increasing the size of the box to include
1540 // partitions with significant overlap with the table.
1542  const TBOX& search_range,
1543  TBOX* result_box) {
1544  // Rulings are in a different grid, so search 2 grids for rulings, text,
1545  // and table partitions that are not entirely within the new box.
1546  for (int i = 0; i < 2; ++i) {
1547  ColPartitionGrid* grid = (i == 0) ? &fragmented_text_grid_ :
1549  ColPartitionGridSearch rectsearch(grid);
1550  rectsearch.StartRectSearch(search_range);
1551  ColPartition* part = NULL;
1552  while ((part = rectsearch.NextRectSearch()) != NULL) {
1553  // Only include text and table types.
1554  if (part->IsImageType())
1555  continue;
1556  const TBOX& part_box = part->bounding_box();
1557  // Include partition in the table if more than half of it
1558  // is covered by the table
1559  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1560  *result_box = result_box->bounding_union(part_box);
1561  continue;
1562  }
1563  }
1564  }
1565 }
1566 
1567 // Grow a table by expanding to the extents of significantly
1568 // overlapping lines.
1570  const TBOX& search_range,
1571  TBOX* result_box) {
1573  rsearch.SetUniqueMode(true);
1574  rsearch.StartRectSearch(search_range);
1575  ColPartition* part = NULL;
1576  while ((part = rsearch.NextRectSearch()) != NULL) {
1577  // TODO(nbeato) This should also do vertical, but column
1578  // boundaries are breaking things. This function needs to be
1579  // updated to allow vertical lines as well.
1580  if (!part->IsLineType())
1581  continue;
1582  // Avoid the following function call if the result of the
1583  // function is irrelevant.
1584  const TBOX& part_box = part->bounding_box();
1585  if (result_box->contains(part_box))
1586  continue;
1587  // Include a partially overlapping horizontal line only if the
1588  // extra ColPartitions that will be included due to expansion
1589  // have large side spacing w.r.t. columns containing them.
1590  if (HLineBelongsToTable(*part, table_box))
1591  *result_box = result_box->bounding_union(part_box);
1592  // TODO(nbeato): Vertical
1593  }
1594 }
1595 
1596 // Checks whether the horizontal line belong to the table by looking at the
1597 // side spacing of extra ColParitions that will be included in the table
1598 // due to expansion
1600  const TBOX& table_box) {
1601  if (!part.IsHorizontalLine())
1602  return false;
1603  const TBOX& part_box = part.bounding_box();
1604  if (!part_box.major_x_overlap(table_box))
1605  return false;
1606  // Do not consider top-most horizontal line since it usually
1607  // originates from noise.
1608  // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1609  // have neighbors solved.
1610  // if (!part.nearest_neighbor_above())
1611  // return false;
1612  const TBOX bbox = part_box.bounding_union(table_box);
1613  // In the "unioned table" box (the table extents expanded by the line),
1614  // keep track of how many partitions have significant padding to the left
1615  // and right. If more than half of the partitions covered by the new table
1616  // have significant spacing, the line belongs to the table and the table
1617  // grows to include all of the partitions.
1618  int num_extra_partitions = 0;
1619  int extra_space_to_right = 0;
1620  int extra_space_to_left = 0;
1621  // Rulings are in a different grid, so search 2 grids for rulings, text,
1622  // and table partitions that are introduced by the new box.
1623  for (int i = 0; i < 2; ++i) {
1624  ColPartitionGrid* grid = (i == 0) ? &clean_part_grid_ :
1626  // Start a rect search on bbox
1627  ColPartitionGridSearch rectsearch(grid);
1628  rectsearch.SetUniqueMode(true);
1629  rectsearch.StartRectSearch(bbox);
1630  ColPartition* extra_part = NULL;
1631  while ((extra_part = rectsearch.NextRectSearch()) != NULL) {
1632  // ColPartition already in table
1633  const TBOX& extra_part_box = extra_part->bounding_box();
1634  if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable)
1635  continue;
1636  // Non-text ColPartitions do not contribute
1637  if (extra_part->IsImageType())
1638  continue;
1639  // Consider this partition.
1640  num_extra_partitions++;
1641  // presence of a table cell is a strong hint, so just increment the scores
1642  // without looking at the spacing.
1643  if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1644  extra_space_to_right++;
1645  extra_space_to_left++;
1646  continue;
1647  }
1648  int space_threshold = kSideSpaceMargin * part.median_size();
1649  if (extra_part->space_to_right() > space_threshold)
1650  extra_space_to_right++;
1651  if (extra_part->space_to_left() > space_threshold)
1652  extra_space_to_left++;
1653  }
1654  }
1655  // tprintf("%d %d %d\n",
1656  // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1657  return (extra_space_to_right > num_extra_partitions / 2) ||
1658  (extra_space_to_left > num_extra_partitions / 2);
1659 }
1660 
1661 // Look for isolated column headers above the given table box and
1662 // include them in the table
1664  // Start a search above the current table to look for column headers
1666  vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1667  table_box->top());
1668  ColPartition* neighbor = NULL;
1669  ColPartition* previous_neighbor = NULL;
1670  while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) {
1671  // Max distance to find a table heading.
1672  const int max_distance = kMaxColumnHeaderDistance *
1673  neighbor->median_size();
1674  int table_top = table_box->top();
1675  const TBOX& box = neighbor->bounding_box();
1676  // Do not continue if the next box is way above
1677  if (box.bottom() - table_top > max_distance)
1678  break;
1679  // Unconditionally include partitions of type TABLE or LINE
1680  // TODO(faisal): add some reasonable conditions here
1681  if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1682  table_box->set_top(box.top());
1683  previous_neighbor = NULL;
1684  continue;
1685  }
1686  // If there are two text partitions, one above the other, without a table
1687  // cell on their left or right side, consider them a barrier and quit
1688  if (previous_neighbor == NULL) {
1689  previous_neighbor = neighbor;
1690  } else {
1691  const TBOX& previous_box = previous_neighbor->bounding_box();
1692  if (!box.major_y_overlap(previous_box))
1693  break;
1694  }
1695  }
1696 }
1697 
1698 // Remove false alarms consiting of a single column based on their
1699 // projection on the x-axis. Projection of a real table on the x-axis
1700 // should have at least one zero-valley larger than the global median
1701 // x-height of the page.
1703  int page_width = tright().x() - bleft().x();
1704  ASSERT_HOST(page_width > 0);
1705  // create an integer array to hold projection on x-axis
1706  int* table_xprojection = new int[page_width];
1707  // Iterate through all tables in the table grid
1709  table_search(&table_grid_);
1710  table_search.StartFullSearch();
1711  ColSegment* table;
1712  while ((table = table_search.NextFullSearch()) != NULL) {
1713  TBOX table_box = table->bounding_box();
1714  // reset the projection array
1715  for (int i = 0; i < page_width; i++) {
1716  table_xprojection[i] = 0;
1717  }
1718  // Start a rect search on table_box
1720  rectsearch(&clean_part_grid_);
1721  rectsearch.SetUniqueMode(true);
1722  rectsearch.StartRectSearch(table_box);
1723  ColPartition* part;
1724  while ((part = rectsearch.NextRectSearch()) != NULL) {
1725  if (!part->IsTextType())
1726  continue; // Do not consider non-text partitions
1727  if (part->flow() == BTFT_LEADER)
1728  continue; // Assume leaders are in tables
1729  TBOX part_box = part->bounding_box();
1730  // Do not consider partitions partially covered by the table
1731  if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable)
1732  continue;
1733  BLOBNBOX_CLIST* part_boxes = part->boxes();
1734  BLOBNBOX_C_IT pit(part_boxes);
1735 
1736  // Make sure overlapping blobs don't artificially inflate the number
1737  // of rows in the table. This happens frequently with things such as
1738  // decimals and split characters. Do this by assuming the column
1739  // partition is sorted mostly left to right and just clip
1740  // bounding boxes by the previous box's extent.
1741  int next_position_to_write = 0;
1742 
1743  for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1744  BLOBNBOX *pblob = pit.data();
1745  // ignore blob height for the purpose of projection since we
1746  // are only interested in finding valleys
1747  int xstart = pblob->bounding_box().left();
1748  int xend = pblob->bounding_box().right();
1749 
1750  xstart = MAX(xstart, next_position_to_write);
1751  for (int i = xstart; i < xend; i++)
1752  table_xprojection[i - bleft().x()]++;
1753  next_position_to_write = xend;
1754  }
1755  }
1756  // Find largest valley between two reasonable peaks in the table
1757  if (!GapInXProjection(table_xprojection, page_width)) {
1758  table_search.RemoveBBox();
1759  delete table;
1760  }
1761  }
1762  delete[] table_xprojection;
1763 }
1764 
1765 // Return true if at least one gap larger than the global x-height
1766 // exists in the horizontal projection
1767 bool TableFinder::GapInXProjection(int* xprojection, int length) {
1768  // Find peak value of the histogram
1769  int peak_value = 0;
1770  for (int i = 0; i < length; i++) {
1771  if (xprojection[i] > peak_value) {
1772  peak_value = xprojection[i];
1773  }
1774  }
1775  // Peak value represents the maximum number of horizontally
1776  // overlapping colpartitions, so this can be considered as the
1777  // number of rows in the table
1778  if (peak_value < kMinRowsInTable)
1779  return false;
1780  double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1781  if (peak_value >= kLargeTableRowCount)
1782  projection_threshold = kLargeTableProjectionThreshold * peak_value;
1783  // Threshold the histogram
1784  for (int i = 0; i < length; i++) {
1785  xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1786  }
1787  // Find the largest run of zeros between two ones
1788  int largest_gap = 0;
1789  int run_start = -1;
1790  for (int i = 1; i < length; i++) {
1791  // detect start of a run of zeros
1792  if (xprojection[i - 1] && !xprojection[i]) {
1793  run_start = i;
1794  }
1795  // detect end of a run of zeros and update the value of largest gap
1796  if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1797  int gap = i - run_start;
1798  if (gap > largest_gap)
1799  largest_gap = gap;
1800  run_start = -1;
1801  }
1802  }
1803  return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1804 }
1805 
1806 // Given the location of a table "guess", try to overlay a cellular
1807 // grid in the location, adjusting the boundaries.
1808 // TODO(nbeato): Falsely introduces:
1809 // -headers/footers (not any worse, too much overlap destroys cells)
1810 // -page numbers (not worse, included because maximize margins)
1811 // -equations (nicely fit into a celluar grid, but more sparsely)
1812 // -figures (random text box, also sparse)
1813 // -small left-aligned text areas with overlapping positioned whitespace
1814 // (rejected before)
1815 // Overall, this just needs some more work.
1817  ScrollView* table_win = NULL;
1818  if (textord_show_tables) {
1819  table_win = MakeWindow(0, 0, "Table Structure");
1822  // table_grid_.DisplayBoxes(table_win);
1823  }
1824 
1825 
1826  TableRecognizer recognizer;
1827  recognizer.Init();
1829  recognizer.set_text_grid(&fragmented_text_grid_);
1830  recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1831  recognizer.set_min_height(1.5 * gridheight());
1832  // Loop over all of the tables and try to fit them.
1833  // Store the good tables here.
1834  ColSegment_CLIST good_tables;
1835  ColSegment_C_IT good_it(&good_tables);
1836 
1838  gsearch.StartFullSearch();
1839  ColSegment* found_table = NULL;
1840  while ((found_table = gsearch.NextFullSearch()) != NULL) {
1841  gsearch.RemoveBBox();
1842 
1843  // The goal is to make the tables persistent in a list.
1844  // When that happens, this will move into the search loop.
1845  const TBOX& found_box = found_table->bounding_box();
1846  StructuredTable* table_structure = recognizer.RecognizeTable(found_box);
1847 
1848  // Process a table. Good tables are inserted into the grid again later on
1849  // We can't change boxes in the grid while it is running a search.
1850  if (table_structure != NULL) {
1851  if (textord_show_tables) {
1852  table_structure->Display(table_win, ScrollView::LIME_GREEN);
1853  }
1854  found_table->set_bounding_box(table_structure->bounding_box());
1855  delete table_structure;
1856  good_it.add_after_then_move(found_table);
1857  } else {
1858  delete found_table;
1859  }
1860  }
1861  // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1862 
1863  // At this point, the grid is empty. We can safely insert the good tables
1864  // back into grid.
1865  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1866  table_grid_.InsertBBox(true, true, good_it.extract());
1867 }
1868 
1869 // Displays the column segments in some window.
1871  ColSegment_LIST *segments,
1872  ScrollView::Color color) {
1873 #ifndef GRAPHICS_DISABLED
1874  win->Pen(color);
1875  win->Brush(ScrollView::NONE);
1876  ColSegment_IT it(segments);
1877  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1878  ColSegment* col = it.data();
1879  const TBOX& box = col->bounding_box();
1880  int left_x = box.left();
1881  int right_x = box.right();
1882  int top_y = box.top();
1883  int bottom_y = box.bottom();
1884  win->Rectangle(left_x, bottom_y, right_x, top_y);
1885  }
1886  win->UpdateWindow();
1887 #endif
1888 }
1889 
1891  ScrollView::Color color) {
1892 #ifndef GRAPHICS_DISABLED
1893  // Iterate the ColPartitions in the grid.
1895  gsearch(grid);
1896  gsearch.StartFullSearch();
1897  ColSegment* seg = NULL;
1898  while ((seg = gsearch.NextFullSearch()) != NULL) {
1899  const TBOX& box = seg->bounding_box();
1900  int left_x = box.left();
1901  int right_x = box.right();
1902  int top_y = box.top();
1903  int bottom_y = box.bottom();
1904  win->Brush(ScrollView::NONE);
1905  win->Pen(color);
1906  win->Rectangle(left_x, bottom_y, right_x, top_y);
1907  }
1908  win->UpdateWindow();
1909 #endif
1910 }
1911 
1912 // Displays the colpartitions using a new coloring on an existing window.
1913 // Note: This method is only for debug purpose during development and
1914 // would not be part of checked in code
1916  ColPartitionGrid* grid,
1917  ScrollView::Color default_color,
1918  ScrollView::Color table_color) {
1919 #ifndef GRAPHICS_DISABLED
1920  ScrollView::Color color = default_color;
1921  // Iterate the ColPartitions in the grid.
1923  gsearch(grid);
1924  gsearch.StartFullSearch();
1925  ColPartition* part = NULL;
1926  while ((part = gsearch.NextFullSearch()) != NULL) {
1927  color = default_color;
1928  if (part->type() == PT_TABLE)
1929  color = table_color;
1930 
1931  const TBOX& box = part->bounding_box();
1932  int left_x = box.left();
1933  int right_x = box.right();
1934  int top_y = box.top();
1935  int bottom_y = box.bottom();
1936  win->Brush(ScrollView::NONE);
1937  win->Pen(color);
1938  win->Rectangle(left_x, bottom_y, right_x, top_y);
1939  }
1940  win->UpdateWindow();
1941 #endif
1942 }
1944  ColPartitionGrid* grid,
1945  ScrollView::Color default_color) {
1946  DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1947 }
1948 
1950  ScrollView* win,
1951  ColPartitionGrid* grid,
1952  ScrollView::Color color) {
1953 #ifndef GRAPHICS_DISABLED
1954  // Iterate the ColPartitions in the grid.
1956  gsearch(grid);
1957  gsearch.StartFullSearch();
1958  ColPartition* part = NULL;
1959  while ((part = gsearch.NextFullSearch()) != NULL) {
1960  const TBOX& box = part->bounding_box();
1961  int left_x = box.left();
1962  int right_x = box.right();
1963  int top_y = box.top();
1964  int bottom_y = box.bottom();
1965 
1966  ColPartition* upper_part = part->nearest_neighbor_above();
1967  if (upper_part) {
1968  TBOX upper_box = upper_part->bounding_box();
1969  int mid_x = (left_x + right_x) / 2;
1970  int mid_y = (top_y + bottom_y) / 2;
1971  int other_x = (upper_box.left() + upper_box.right()) / 2;
1972  int other_y = (upper_box.top() + upper_box.bottom()) / 2;
1973  win->Brush(ScrollView::NONE);
1974  win->Pen(color);
1975  win->Line(mid_x, mid_y, other_x, other_y);
1976  }
1977  ColPartition* lower_part = part->nearest_neighbor_below();
1978  if (lower_part) {
1979  TBOX lower_box = lower_part->bounding_box();
1980  int mid_x = (left_x + right_x) / 2;
1981  int mid_y = (top_y + bottom_y) / 2;
1982  int other_x = (lower_box.left() + lower_box.right()) / 2;
1983  int other_y = (lower_box.top() + lower_box.bottom()) / 2;
1984  win->Brush(ScrollView::NONE);
1985  win->Pen(color);
1986  win->Line(mid_x, mid_y, other_x, other_y);
1987  }
1988  }
1989  win->UpdateWindow();
1990 #endif
1991 }
1992 
1993 
1994 // Write debug image and text file.
1995 // Note: This method is only for debug purpose during development and
1996 // would not be part of checked in code
1997 void TableFinder::WriteToPix(const FCOORD& reskew) {
1998  // Input file must be named test1.tif
1999  PIX* pix = pixRead("test1.tif");
2000  if (!pix) {
2001  tprintf("Input file test1.tif not found.\n");
2002  return;
2003  }
2004  int img_height = pixGetHeight(pix);
2005  int img_width = pixGetWidth(pix);
2006  // Maximum number of text or table partitions
2007  int num_boxes = 10;
2008  BOXA* text_box_array = boxaCreate(num_boxes);
2009  BOXA* table_box_array = boxaCreate(num_boxes);
2011  gsearch(&clean_part_grid_);
2012  gsearch.StartFullSearch();
2013  ColPartition* part;
2014  // load colpartitions into text_box_array and table_box_array
2015  while ((part = gsearch.NextFullSearch()) != NULL) {
2016  TBOX box = part->bounding_box();
2017  box.rotate_large(reskew);
2018  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2019  box.right() - box.left(),
2020  box.top() - box.bottom());
2021  if (part->type() == PT_TABLE)
2022  boxaAddBox(table_box_array, lept_box, L_INSERT);
2023  else
2024  boxaAddBox(text_box_array, lept_box, L_INSERT);
2025  }
2026  // draw colpartitions on the output image
2027  PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
2028  out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
2029 
2030  BOXA* table_array = boxaCreate(num_boxes);
2031  // text file containing detected table bounding boxes
2032  FILE* fptr = fopen("tess-table.txt", "wb");
2034  table_search(&table_grid_);
2035  table_search.StartFullSearch();
2036  ColSegment* table;
2037  // load table boxes to table_array and write them to text file as well
2038  while ((table = table_search.NextFullSearch()) != NULL) {
2039  TBOX box = table->bounding_box();
2040  box.rotate_large(reskew);
2041  // Since deskewing introduces negative coordinates, reskewing
2042  // might not completely recover from that since both steps enlarge
2043  // the actual box. Hence a box that undergoes deskewing/reskewing
2044  // may go out of image boundaries. Crop a table box if needed to
2045  // contain it inside the image dimensions.
2046  box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1));
2047  BOX* lept_box = boxCreate(box.left(), img_height - box.top(),
2048  box.right() - box.left(),
2049  box.top() - box.bottom());
2050  boxaAddBox(table_array, lept_box, L_INSERT);
2051  fprintf(fptr, "%d %d %d %d TABLE\n", box.left(),
2052  img_height - box.top(), box.right(), img_height - box.bottom());
2053  }
2054  fclose(fptr);
2055  // paint table boxes on the debug image
2056  out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
2057 
2058  pixWrite("out.png", out, IFF_PNG);
2059  // memory cleanup
2060  boxaDestroy(&text_box_array);
2061  boxaDestroy(&table_box_array);
2062  boxaDestroy(&table_array);
2063  pixDestroy(&pix);
2064  pixDestroy(&out);
2065 }
2066 
2067 // Merge all colpartitions in table regions to make them a single
2068 // colpartition and revert types of isolated table cells not
2069 // assigned to any table to their original types.
2071  ColPartitionSet** all_columns,
2072  WidthCallback* width_cb) {
2073  // Since we have table blocks already, remove table tags from all
2074  // colpartitions
2076  gsearch(grid);
2077  gsearch.StartFullSearch();
2078  ColPartition* part = NULL;
2079 
2080  while ((part = gsearch.NextFullSearch()) != NULL) {
2081  if (part->type() == PT_TABLE) {
2082  part->clear_table_type();
2083  }
2084  }
2085  // Now make a single colpartition out of each table block and remove
2086  // all colpartitions contained within a table
2088  table_search(&table_grid_);
2089  table_search.StartFullSearch();
2090  ColSegment* table;
2091  while ((table = table_search.NextFullSearch()) != NULL) {
2092  TBOX table_box = table->bounding_box();
2093  // Start a rect search on table_box
2095  rectsearch(grid);
2096  rectsearch.StartRectSearch(table_box);
2097  ColPartition* part;
2098  ColPartition* table_partition = NULL;
2099  while ((part = rectsearch.NextRectSearch()) != NULL) {
2100  // Do not consider image partitions
2101  if (!part->IsTextType())
2102  continue;
2103  TBOX part_box = part->bounding_box();
2104  // Include partition in the table if more than half of it
2105  // is covered by the table
2106  if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2107  rectsearch.RemoveBBox();
2108  if (table_partition) {
2109  table_partition->Absorb(part, width_cb);
2110  } else {
2111  table_partition = part;
2112  }
2113  }
2114  }
2115  // Insert table colpartition back to part_grid_
2116  if (table_partition) {
2117  // To match the columns used when transforming to blocks, the new table
2118  // partition must have its first and last column set at the grid y that
2119  // corresponds to its bottom.
2120  const TBOX& table_box = table_partition->bounding_box();
2121  int grid_x, grid_y;
2122  grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2123  table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2124  table_partition->set_table_type();
2125  table_partition->set_blob_type(BRT_TEXT);
2126  table_partition->set_flow(BTFT_CHAIN);
2127  table_partition->SetBlobTypes();
2128  grid->InsertBBox(true, true, table_partition);
2129  }
2130  }
2131 }
2132 
2136  : ELIST_LINK(),
2137  num_table_cells_(0),
2138  num_text_cells_(0),
2139  type_(COL_UNKNOWN) {
2140 }
2142 }
2143 
2144 // Provides a color for BBGrid to draw the rectangle.
2146  const ScrollView::Color kBoxColors[PT_COUNT] = {
2151  };
2152  return kBoxColors[type_];
2153 }
2154 
2155 // Insert a box into this column segment
2156 void ColSegment::InsertBox(const TBOX& other) {
2157  bounding_box_ = bounding_box_.bounding_union(other);
2158 }
2159 
2160 // Set column segment type based on the ratio of text and table partitions
2161 // in it.
2163  if (num_table_cells_ > kTableColumnThreshold * num_text_cells_)
2164  type_ = COL_TABLE;
2165  else if (num_text_cells_ > num_table_cells_)
2166  type_ = COL_TEXT;
2167  else
2168  type_ = COL_MIXED;
2169 }
2170 
2171 } // namespace tesseract.