Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::StructuredTable Class Reference

#include <tablerecog.h>

List of all members.

Public Member Functions

 StructuredTable ()
 ~StructuredTable ()
void Init ()
void set_text_grid (ColPartitionGrid *text)
void set_line_grid (ColPartitionGrid *lines)
void set_max_text_height (int height)
bool is_lined () const
int row_count () const
int column_count () const
int cell_count () const
void set_bounding_box (const TBOX &box)
const TBOX & bounding_box () const
int median_cell_height ()
int median_cell_width ()
int row_height (int row) const
int column_width (int column) const
int space_above () const
int space_below () const
bool FindLinedStructure ()
bool FindWhitespacedStructure ()
bool DoesPartitionFit (const ColPartition &part) const
int CountFilledCells ()
int CountFilledCellsInRow (int row)
int CountFilledCellsInColumn (int column)
int CountFilledCells (int row_start, int row_end, int column_start, int column_end)
bool VerifyRowFilled (int row)
double CalculateCellFilledPercentage (int row, int column)
void Display (ScrollView *window, ScrollView::Color color)

Protected Member Functions

void ClearStructure ()
bool VerifyLinedTableCells ()
bool VerifyWhitespacedTable ()
void FindWhitespacedColumns ()
void FindWhitespacedRows ()
void CalculateMargins ()
void UpdateMargins (ColPartitionGrid *grid)
int FindVerticalMargin (ColPartitionGrid *grid, int start_x, bool decrease) const
int FindHorizontalMargin (ColPartitionGrid *grid, int start_y, bool decrease) const
void CalculateStats ()
void AbsorbNearbyLines ()
int CountVerticalIntersections (int x)
int CountHorizontalIntersections (int y)
int CountPartitions (const TBOX &box)

Static Protected Member Functions

static void FindCellSplitLocations (const GenericVector< int > &min_list, const GenericVector< int > &max_list, int max_merged, GenericVector< int > *locations)

Protected Attributes

ColPartitionGrid * text_grid_
ColPartitionGrid * line_grid_
TBOX bounding_box_
GenericVectorEqEq< int > cell_x_
GenericVectorEqEq< int > cell_y_
bool is_lined_
int space_above_
int space_below_
int space_left_
int space_right_
int median_cell_height_
int median_cell_width_
int max_text_height_

Detailed Description

Definition at line 72 of file tablerecog.h.


Constructor & Destructor Documentation

tesseract::StructuredTable::StructuredTable ( )
tesseract::StructuredTable::~StructuredTable ( )

Definition at line 76 of file tablerecog.cpp.

{
// NOTE(review): the body is empty in this listing; it is unclear whether any
// statements were dropped — confirm against tablerecog.cpp.
}

Member Function Documentation

void tesseract::StructuredTable::AbsorbNearbyLines ( )
protected

Definition at line 531 of file tablerecog.cpp.

{
// Moves the top and bottom of bounding_box_ out to nearby horizontal lines,
// so that border lines lying just outside the current box become its edges.
// NOTE(review): this listing is incomplete — the declaration of gsearch, the
// tails of both StartVerticalSearch calls and the tail of the second TBOX
// constructor are not shown; confirm against tablerecog.cpp.
gsearch.SetUniqueMode(true);
// Is the closest line above good? Loop multiple times for tables with
// multi-line (sometimes 2) borders. Limit the number of lines by
// making sure they stay within a table cell or so.
ColPartition* line = NULL;
gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
while ((line = gsearch.NextVerticalSearch(false)) != NULL) {
// Stop at the first partition that is not a horizontal line.
if (!line->IsHorizontalLine())
break;
// The strip between the current top edge and the candidate line.
TBOX text_search(bounding_box_.left(), bounding_box_.top() + 1,
bounding_box_.right(), line->MidY());
// Too far away: more than two median cell heights.
if (text_search.height() > median_cell_height_ * 2)
break;
// Text lies between the table and the line, so the line is not a border.
if (CountPartitions(text_search) > 0)
break;
bounding_box_.set_top(line->MidY());
}
// As above, is the closest line below good?
line = NULL;
gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
while ((line = gsearch.NextVerticalSearch(true)) != NULL) {
if (!line->IsHorizontalLine())
break;
TBOX text_search(bounding_box_.left(), line->MidY(),
if (text_search.height() > median_cell_height_ * 2)
break;
if (CountPartitions(text_search) > 0)
break;
bounding_box_.set_bottom(line->MidY());
}
// TODO(nbeato): vertical lines
}
const TBOX & tesseract::StructuredTable::bounding_box ( ) const

Definition at line 106 of file tablerecog.cpp.

{
// Read-only access to the table's current bounding box.
return bounding_box_;
}
double tesseract::StructuredTable::CalculateCellFilledPercentage ( int  row,
int  column 
)

Definition at line 263 of file tablerecog.cpp.

{
  // Returns the fraction (capped at 1.0) of the area of cell (row, column)
  // that is covered by text partitions.
  //
  // The cell spans cell_x_[column]..cell_x_[column + 1] and
  // cell_y_[row]..cell_y_[row + 1], so the largest valid index is count - 1.
  // The bounds are strict, as in row_height() and column_width(); allowing
  // row == row_count() would read one element past the end of cell_y_.
  ASSERT_HOST(0 <= row && row < row_count());
  ASSERT_HOST(0 <= column && column < column_count());
  const TBOX kCellBox(cell_x_[column], cell_y_[row],
                      cell_x_[column + 1], cell_y_[row + 1]);
  // The cell area is the divisor below, so the box must not be degenerate.
  ASSERT_HOST(!kCellBox.null_box());
  // NOTE(review): the declaration of gsearch is not part of this listing;
  // confirm against tablerecog.cpp.
  gsearch.SetUniqueMode(true);
  gsearch.StartRectSearch(kCellBox);
  double area_covered = 0;
  ColPartition* text = NULL;
  while ((text = gsearch.NextRectSearch()) != NULL) {
    // Only the part of a text partition that lies inside the cell counts.
    if (text->IsTextType())
      area_covered += text->bounding_box().intersection(kCellBox).area();
  }
  // Overlapping partitions can sum to more than the cell area; cap at 100%.
  return MIN(1.0, area_covered / kCellBox.area());
}
void tesseract::StructuredTable::CalculateMargins ( )
protected
void tesseract::StructuredTable::CalculateStats ( )
protected

Definition at line 511 of file tablerecog.cpp.

{
  // Recomputes median_cell_height_ and median_cell_width_ from the current
  // row and column boundaries.
  const int kHeightLimit = 1000;
  const int kWidthLimit = 1000;
  STATS heights(0, kHeightLimit + 1);
  STATS widths(0, kWidthLimit + 1);

  const int rows = row_count();
  const int columns = column_count();
  // Every row height is added with the column count as add()'s second
  // argument, and every column width with the row count.
  for (int r = 0; r < rows; ++r)
    heights.add(row_height(r), columns);
  for (int c = 0; c < columns; ++c)
    widths.add(column_width(c), rows);

  // Round the medians to the nearest integer.
  median_cell_height_ = static_cast<int>(heights.median() + 0.5);
  median_cell_width_ = static_cast<int>(widths.median() + 0.5);
}
int tesseract::StructuredTable::cell_count ( ) const

Definition at line 100 of file tablerecog.cpp.

{
  // Total number of cells in the grid: rows times columns.
  const int rows = row_count();
  const int columns = column_count();
  return rows * columns;
}
void tesseract::StructuredTable::ClearStructure ( )
protected

Definition at line 301 of file tablerecog.cpp.

int tesseract::StructuredTable::column_count ( ) const

Definition at line 97 of file tablerecog.cpp.

{
  // N column boundaries in cell_x_ delimit N - 1 columns; with no
  // boundaries there are no columns.
  const int boundaries = cell_x_.length();
  if (boundaries == 0)
    return 0;
  return boundaries - 1;
}
int tesseract::StructuredTable::column_width ( int  column) const

Definition at line 119 of file tablerecog.cpp.

{
  // Width of the given column: the distance between its two boundaries.
  ASSERT_HOST(0 <= column && column < column_count());
  const int left_edge = cell_x_[column];
  const int right_edge = cell_x_[column + 1];
  return right_edge - left_edge;
}
int tesseract::StructuredTable::CountFilledCells ( )

Definition at line 220 of file tablerecog.cpp.

{
  // Counts filled cells over the whole grid by passing the full inclusive
  // row and column ranges to the four-argument overload.
  const int last_row = row_count() - 1;
  const int last_column = column_count() - 1;
  return CountFilledCells(0, last_row, 0, last_column);
}
int tesseract::StructuredTable::CountFilledCells ( int  row_start,
int  row_end,
int  column_start,
int  column_end 
)

Definition at line 229 of file tablerecog.cpp.

{
  // Counts the cells in the inclusive range [row_start, row_end] x
  // [column_start, column_end] for which CountPartitions() reports at least
  // one partition.
  ASSERT_HOST(0 <= row_start && row_start <= row_end && row_end < row_count());
  ASSERT_HOST(0 <= column_start && column_start <= column_end &&
              column_end < column_count());
  int cell_count = 0;
  TBOX cell_box;
  for (int row = row_start; row <= row_end; ++row) {
    // The vertical extent is shared by every cell in this row.
    cell_box.set_bottom(cell_y_[row]);
    cell_box.set_top(cell_y_[row + 1]);
    for (int col = column_start; col <= column_end; ++col) {
      cell_box.set_left(cell_x_[col]);
      cell_box.set_right(cell_x_[col + 1]);
      // NOTE(review): the statement guarded by this test was missing from the
      // listing, leaving the counter at zero; the increment is restored —
      // confirm against tablerecog.cpp.
      if (CountPartitions(cell_box) > 0)
        ++cell_count;
    }
  }
  return cell_count;
}
int tesseract::StructuredTable::CountFilledCellsInColumn ( int  column)

Definition at line 226 of file tablerecog.cpp.

{
  // Counts filled cells in a single column, over all rows.
  const int last_row = row_count() - 1;
  return CountFilledCells(0, last_row, column, column);
}
int tesseract::StructuredTable::CountFilledCellsInRow ( int  row)

Definition at line 223 of file tablerecog.cpp.

{
  // Counts filled cells in a single row, over all columns.
  const int last_column = column_count() - 1;
  return CountFilledCells(row, row, 0, last_column);
}
int tesseract::StructuredTable::CountHorizontalIntersections ( int  y)
protected

Definition at line 655 of file tablerecog.cpp.

{
// Counts the text partitions whose bounding box is strictly crossed by the
// horizontal line at height y.
// NOTE(review): the declaration of gsearch is not part of this listing;
// confirm against tablerecog.cpp.
int count = 0;
// Make a small box to keep the search time down.
const int kGridSize = text_grid_->gridsize();
TBOX horizontal_box = bounding_box_;
horizontal_box.set_bottom(y - kGridSize);
horizontal_box.set_top(y + kGridSize);
gsearch.SetUniqueMode(true);
gsearch.StartRectSearch(horizontal_box);
ColPartition* text = NULL;
while ((text = gsearch.NextRectSearch()) != NULL) {
// Only text partitions are counted.
if (!text->IsTextType())
continue;
const TBOX& box = text->bounding_box();
// Strict comparison: a partition that merely touches y does not count.
if (box.bottom() < y && y < box.top())
++count;
}
return count;
}
int tesseract::StructuredTable::CountPartitions ( const TBOX & box)
protected

Definition at line 681 of file tablerecog.cpp.

{
// Counts the text partitions returned by a rectangle search over box.
// NOTE(review): the declaration of gsearch is not part of this listing;
// confirm against tablerecog.cpp.
gsearch.SetUniqueMode(true);
gsearch.StartRectSearch(box);
int count = 0;
ColPartition* text = NULL;
while ((text = gsearch.NextRectSearch()) != NULL) {
// Non-text partitions are ignored.
if (text->IsTextType())
++count;
}
return count;
}
int tesseract::StructuredTable::CountVerticalIntersections ( int  x)
protected

Definition at line 631 of file tablerecog.cpp.

{
// Counts the text partitions whose bounding box is strictly crossed by the
// vertical line at position x.
// NOTE(review): the declaration of gsearch is not part of this listing;
// confirm against tablerecog.cpp.
int count = 0;
// Make a small box to keep the search time down.
const int kGridSize = text_grid_->gridsize();
TBOX vertical_box = bounding_box_;
vertical_box.set_left(x - kGridSize);
vertical_box.set_right(x + kGridSize);
gsearch.SetUniqueMode(true);
gsearch.StartRectSearch(vertical_box);
ColPartition* text = NULL;
while ((text = gsearch.NextRectSearch()) != NULL) {
// Only text partitions are counted.
if (!text->IsTextType())
continue;
const TBOX& box = text->bounding_box();
// Strict comparison: a partition that merely touches x does not count.
if (box.left() < x && x < box.right())
++count;
}
return count;
}
void tesseract::StructuredTable::Display ( ScrollView * window,
ScrollView::Color  color 
)

Definition at line 282 of file tablerecog.cpp.

{
// Draws the table structure into the given window using the given pen color.
// The whole body is compiled out when GRAPHICS_DISABLED is defined.
// NOTE(review): this listing is incomplete — the body of the first loop and
// the tail of the Line call in the second loop are not shown; confirm against
// tablerecog.cpp.
#ifndef GRAPHICS_DISABLED
window->Pen(color);
// One iteration per column boundary.
for (int i = 0; i < cell_x_.length(); i++) {
}
// One iteration per row boundary.
for (int i = 0; i < cell_y_.length(); i++) {
window->Line(bounding_box_.left(), cell_y_[i],
}
window->UpdateWindow();
#endif
}
bool tesseract::StructuredTable::DoesPartitionFit ( const ColPartition & part) const

Definition at line 208 of file tablerecog.cpp.

{
  // A partition fits when no column or row boundary passes strictly through
  // the interior of its bounding box; touching a boundary is allowed.
  const TBOX& part_box = part.bounding_box();

  for (int col = 0; col < cell_x_.length(); ++col) {
    const int x = cell_x_[col];
    if (part_box.left() < x && x < part_box.right())
      return false;
  }

  for (int row = 0; row < cell_y_.length(); ++row) {
    const int y = cell_y_[row];
    if (part_box.bottom() < y && y < part_box.top())
      return false;
  }

  return true;
}
void tesseract::StructuredTable::FindCellSplitLocations ( const GenericVector< int > &  min_list,
const GenericVector< int > &  max_list,
int  max_merged,
GenericVector< int > *  locations 
)
static protected

Definition at line 585 of file tablerecog.cpp.

{
  // Fills *locations with split positions derived from a set of intervals
  // whose low ends are in min_list and whose high ends are in max_list.
  // The output always starts with the first low end and finishes with the
  // last high end. In between, a split is placed at the midpoint of each gap
  // in which the number of simultaneously open intervals fell to max_merged
  // or below and then rose above it again.
  // Presumably both lists are sorted ascending (the callers visible in this
  // class sort before use) — verify.
  locations->clear();
  ASSERT_HOST(min_list.length() == max_list.length());
  if (min_list.length() == 0)
    return;
  ASSERT_HOST(min_list.get(0) < max_list.get(0));
  ASSERT_HOST(min_list.get(min_list.length() - 1) <
              max_list.get(max_list.length() - 1));

  locations->push_back(min_list.get(0));

  int open_idx = 0;                  // next low end to process
  int close_idx = 0;                 // next high end to process
  int depth = 0;                     // intervals currently open
  int gap_start = MAX_INT32;         // MAX_INT32 means "no gap pending"

  // The high ends run out after the low ends, but the depth can no longer
  // grow once the low ends are exhausted, so stop there.
  while (open_idx < min_list.length()) {
    if (min_list[open_idx] >= max_list[close_idx]) {
      // An interval closes before the next one opens.
      --depth;
      if (depth <= max_merged && gap_start == MAX_INT32)
        gap_start = max_list[close_idx];
      ++close_idx;
      continue;
    }

    // An interval opens.
    ++depth;
    if (depth > max_merged && gap_start != MAX_INT32) {
      // The pending gap has ended; split it down the middle.
      locations->push_back((gap_start + min_list[open_idx]) / 2);
      gap_start = MAX_INT32;
    }
    ++open_idx;
  }

  locations->push_back(max_list.get(max_list.length() - 1));
}
int tesseract::StructuredTable::FindHorizontalMargin ( ColPartitionGrid * grid,
int  start_y,
bool  decrease 
) const
protected

Definition at line 494 of file tablerecog.cpp.

{
  // Returns the horizontal distance from the table border at start_y to the
  // first text partition or vertical line found by a side search of grid in
  // the direction given by decrease, or MAX_INT32 if there is none.
  //
  // Despite its name, start_y is used as an x coordinate: it is compared with
  // the left/right edges of the partitions found. The body previously used an
  // undeclared name, border, for this value.
  ColPartitionGridSearch gsearch(grid);
  gsearch.SetUniqueMode(true);
  gsearch.StartSideSearch(start_y, bounding_box_.bottom(),
                          bounding_box_.top());
  ColPartition* part = NULL;
  while ((part = gsearch.NextSideSearch(decrease)) != NULL) {
    // Only text and vertical lines bound the margin.
    if (!part->IsTextType() && !part->IsVerticalLine())
      continue;
    int distance = decrease ? start_y - part->bounding_box().right()
                            : part->bounding_box().left() - start_y;
    // A negative distance means the partition overlaps the border; skip it.
    if (distance >= 0)
      return distance;
  }
  return MAX_INT32;
}
bool tesseract::StructuredTable::FindLinedStructure ( )

Definition at line 137 of file tablerecog.cpp.

{
// Builds the cell boundaries (cell_x_, cell_y_) from the line partitions found
// inside bounding_box_ and returns is_lined_.
// NOTE(review): this listing is incomplete — the declaration of box_search and
// the statements that belong under the last three comments are not shown, and
// neither is any assignment to is_lined_; confirm against tablerecog.cpp.
// Search for all of the lines in the current box.
// Update the cellular structure with the exact lines.
box_search.SetUniqueMode(true);
box_search.StartRectSearch(bounding_box_);
ColPartition* line = NULL;
while ((line = box_search.NextRectSearch()) != NULL) {
// A horizontal line contributes a row boundary at its vertical midpoint.
if (line->IsHorizontalLine())
cell_y_.push_back(line->MidY());
// A vertical line contributes a column boundary at its horizontal midpoint.
if (line->IsVerticalLine())
cell_x_.push_back(line->MidX());
}
// HasSignificantLines should guarantee cells.
// Because that code is a different class, just gracefully
// return false. This could be an assert.
if (cell_x_.length() < 3 || cell_y_.length() < 3)
return false;
// Remove duplicates that may have occurred due to split lines.
// The border should be the extents of line boxes, not middle.
// Remove duplicates that may have occurred due to moving the borders.
return is_lined_;
}
int tesseract::StructuredTable::FindVerticalMargin ( ColPartitionGrid * grid,
int  start_x,
bool  decrease 
) const
protected

Definition at line 477 of file tablerecog.cpp.

{
  // Returns the vertical distance from the table border at start_x to the
  // first text partition or horizontal line found by a vertical search of
  // grid in the direction given by decrease, or MAX_INT32 if there is none.
  //
  // Despite its name, start_x is used as a y coordinate: it is compared with
  // the top/bottom edges of the partitions found. The body previously used an
  // undeclared name, border, for this value.
  ColPartitionGridSearch gsearch(grid);
  gsearch.SetUniqueMode(true);
  gsearch.StartVerticalSearch(bounding_box_.left(), bounding_box_.right(),
                              start_x);
  ColPartition* part = NULL;
  while ((part = gsearch.NextVerticalSearch(decrease)) != NULL) {
    // Only text and horizontal lines bound the margin.
    if (!part->IsTextType() && !part->IsHorizontalLine())
      continue;
    int distance = decrease ? start_x - part->bounding_box().top()
                            : part->bounding_box().bottom() - start_x;
    // A negative distance means the partition overlaps the border; skip it.
    if (distance >= 0)
      return distance;
  }
  return MAX_INT32;
}
void tesseract::StructuredTable::FindWhitespacedColumns ( )
protected

Definition at line 347 of file tablerecog.cpp.

{
// Collects the padded left and right sides of every text partition inside
// bounding_box_ and sorts them, as input for locating the column splits.
// NOTE(review): this listing is incomplete — the declarations of left_sides,
// right_sides and gsearch, and the call that the final comment refers to, are
// not shown; confirm against tablerecog.cpp.
// Set of the extents of all partitions on the page.
// Look at each text partition. We want to find the partitions
// that have extremal left/right sides. These will give us a basis
// for the table columns.
gsearch.SetUniqueMode(true);
gsearch.StartRectSearch(bounding_box_);
ColPartition* text = NULL;
while ((text = gsearch.NextRectSearch()) != NULL) {
if (!text->IsTextType())
continue;
ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
// Half of the scaled median width, rounded, is added on each side.
int spacing = static_cast<int>(text->median_width() *
kHorizontalSpacing / 2.0 + 0.5);
left_sides.push_back(text->bounding_box().left() - spacing);
right_sides.push_back(text->bounding_box().right() + spacing);
}
// It causes disaster below, so avoid it!
if (left_sides.length() == 0 || right_sides.length() == 0)
return;
// Since data may be inserted in grid order, we sort the left/right sides.
left_sides.sort();
right_sides.sort();
// At this point, in the "merged list", we expect to have a left side,
// followed by either more left sides or a right side. The last number
// should be a right side. We find places where the splits occur by looking
// for "valleys". If we want to force gap sizes or allow overlap, change
// the spacing above. If you want to let lines "slice" partitions as long
// as it is infrequent, change the following function.
}
void tesseract::StructuredTable::FindWhitespacedRows ( )
protected

Definition at line 392 of file tablerecog.cpp.

{
// Collects the adjusted bottom and top sides of the text partitions inside
// bounding_box_, sorts them, and finally pins the first and last row
// boundaries to the true text extents.
// NOTE(review): this listing is incomplete — the declarations of top_sides and
// gsearch, and the call that fills cell_y_ before it is indexed at the end,
// are not shown; confirm against tablerecog.cpp.
// Set of the extents of all partitions on the page.
GenericVectorEqEq<int> bottom_sides;
// We will be "shrinking" partitions, so keep the min/max around to
// make sure the bottom/top lines do not intersect text.
int min_bottom = MAX_INT32;
int max_top = MIN_INT32;
// Look at each text partition. We want to find the partitions
// that have extremal bottom/top sides. These will give us a basis
// for the table rows. Because the textlines can be skewed and close due
// to warping, the height of the partitions is toned down a little bit.
gsearch.SetUniqueMode(true);
gsearch.StartRectSearch(bounding_box_);
ColPartition* text = NULL;
while ((text = gsearch.NextRectSearch()) != NULL) {
if (!text->IsTextType())
continue;
ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
// The extremes are tracked before the tall-text filter below, so tall
// partitions still contribute to min_bottom and max_top.
min_bottom = MIN(min_bottom, text->bounding_box().bottom());
max_top = MAX(max_top, text->bounding_box().top());
// Ignore "tall" text partitions, as these are usually false positive
// vertical text or multiple lines pulled together.
if (text->bounding_box().height() > max_text_height_)
continue;
int spacing = static_cast<int>(text->bounding_box().height() *
kVerticalSpacing / 2.0 + 0.5);
int bottom = text->bounding_box().bottom() - spacing;
int top = text->bounding_box().top() + spacing;
// For horizontal text, the factor can be negative. This should
// probably cause a warning or failure. I haven't actually checked if
// it happens.
if (bottom >= top)
continue;
bottom_sides.push_back(bottom);
top_sides.push_back(top);
}
// It causes disaster below, so avoid it!
if (bottom_sides.length() == 0 || top_sides.length() == 0)
return;
// Since data may be inserted in grid order, we sort the bottom/top sides.
bottom_sides.sort();
top_sides.sort();
// At this point, in the "merged list", we expect to have a bottom side,
// followed by either more bottom sides or a top side. The last number
// should be a top side. We find places where the splits occur by looking
// for "valleys". If we want to force gap sizes or allow overlap, change
// the spacing above. If you want to let lines "slice" partitions as long
// as it is infrequent, change the following function.
// Recover the min/max correctly since it was shifted.
cell_y_[0] = min_bottom;
cell_y_[cell_y_.length() - 1] = max_top;
}
bool tesseract::StructuredTable::FindWhitespacedStructure ( )
void tesseract::StructuredTable::Init ( )

Definition at line 79 of file tablerecog.cpp.

{
// NOTE(review): the body is empty in this listing; it is unclear whether any
// statements were dropped — confirm against tablerecog.cpp.
}
bool tesseract::StructuredTable::is_lined ( ) const

Definition at line 91 of file tablerecog.cpp.

{
// Returns the stored is_lined_ flag.
return is_lined_;
}
int tesseract::StructuredTable::median_cell_height ( )

Definition at line 109 of file tablerecog.cpp.

{
  // Returns the stored median cell height.
  // NOTE(review): the listing showed an empty body for this int-returning
  // accessor; the return of the matching member is restored — confirm against
  // tablerecog.cpp.
  return median_cell_height_;
}
int tesseract::StructuredTable::median_cell_width ( )

Definition at line 112 of file tablerecog.cpp.

{
  // Returns the stored median cell width.
  // NOTE(review): the listing showed an empty body for this int-returning
  // accessor; the return of the matching member is restored — confirm against
  // tablerecog.cpp.
  return median_cell_width_;
}
int tesseract::StructuredTable::row_count ( ) const

Definition at line 94 of file tablerecog.cpp.

{
  // N row boundaries in cell_y_ delimit N - 1 rows; with no boundaries there
  // are no rows.
  const int boundaries = cell_y_.length();
  if (boundaries == 0)
    return 0;
  return boundaries - 1;
}
int tesseract::StructuredTable::row_height ( int  row) const

Definition at line 115 of file tablerecog.cpp.

{
  // Height of the given row: the distance between its two boundaries.
  ASSERT_HOST(0 <= row && row < row_count());
  const int lower_edge = cell_y_[row];
  const int upper_edge = cell_y_[row + 1];
  return upper_edge - lower_edge;
}
void tesseract::StructuredTable::set_bounding_box ( const TBOX & box)

Definition at line 103 of file tablerecog.cpp.

{
  // Stores box as the table's bounding box.
  // NOTE(review): the listing showed an empty body for this setter; the
  // assignment is restored to mirror the bounding_box() getter — confirm
  // against tablerecog.cpp.
  bounding_box_ = box;
}
void tesseract::StructuredTable::set_line_grid ( ColPartitionGrid * lines)

Definition at line 85 of file tablerecog.cpp.

{
  // Stores the pointer to the grid of line partitions; the pointer is only
  // copied here.
  // The declared parameter is lines; the body previously referred to an
  // undeclared name, line_grid.
  line_grid_ = lines;
}
void tesseract::StructuredTable::set_max_text_height ( int  height)

Definition at line 88 of file tablerecog.cpp.

{
// Stores the height limit that FindWhitespacedRows() compares partition
// heights against.
max_text_height_ = height;
}
void tesseract::StructuredTable::set_text_grid ( ColPartitionGrid * text)

Definition at line 82 of file tablerecog.cpp.

{
  // Stores the pointer to the grid of text partitions; the pointer is only
  // copied here.
  // The declared parameter is text; the body previously referred to an
  // undeclared name, text_grid.
  text_grid_ = text;
}
int tesseract::StructuredTable::space_above ( ) const

Definition at line 123 of file tablerecog.cpp.

{
// Returns the stored margin above the table (maintained by UpdateMargins()).
return space_above_;
}
int tesseract::StructuredTable::space_below ( ) const

Definition at line 126 of file tablerecog.cpp.

{
// Returns the stored margin below the table (maintained by UpdateMargins()).
return space_below_;
}
void tesseract::StructuredTable::UpdateMargins ( ColPartitionGrid * grid)
protected

Definition at line 467 of file tablerecog.cpp.

{
  // Shrinks each stored margin to the distance found in grid on that side of
  // the table, if that distance is smaller than the value already stored.
  const int gap_below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
  if (gap_below < space_below_)
    space_below_ = gap_below;

  const int gap_above = FindVerticalMargin(grid, bounding_box_.top(), false);
  if (gap_above < space_above_)
    space_above_ = gap_above;

  const int gap_left = FindHorizontalMargin(grid, bounding_box_.left(), true);
  if (gap_left < space_left_)
    space_left_ = gap_left;

  const int gap_right =
      FindHorizontalMargin(grid, bounding_box_.right(), false);
  if (gap_right < space_right_)
    space_right_ = gap_right;
}
bool tesseract::StructuredTable::VerifyLinedTableCells ( )
protected

Definition at line 315 of file tablerecog.cpp.

{
  // Function only called when lines exist.
  // Rejects the table if any row or column boundary cuts through a text
  // partition.
  // NOTE(review): the guards on the two early returns were absent from the
  // listing, which made the function return false for every non-empty table;
  // they are reconstructed from the class's CountHorizontalIntersections() and
  // CountVerticalIntersections() helpers — confirm against tablerecog.cpp.
  for (int i = 0; i < cell_y_.length(); ++i) {
    if (CountHorizontalIntersections(cell_y_[i]) > 0)
      return false;
  }
  for (int i = 0; i < cell_x_.length(); ++i) {
    if (CountVerticalIntersections(cell_x_[i]) > 0)
      return false;
  }
  return true;
}
bool tesseract::StructuredTable::VerifyRowFilled ( int  row)

Definition at line 252 of file tablerecog.cpp.

{
  // A row counts as filled as soon as one of its cells has a filled fraction
  // of at least kMinFilledArea.
  int col = 0;
  while (col < column_count()) {
    const double filled_fraction = CalculateCellFilledPercentage(row, col);
    if (filled_fraction >= kMinFilledArea)
      return true;
    ++col;
  }
  return false;
}
bool tesseract::StructuredTable::VerifyWhitespacedTable ( )
protected

Definition at line 337 of file tablerecog.cpp.

{
  // Minimum size for a table: at least two rows, at least two columns and at
  // least six cells, i.e. 2x3 or 3x2.
  if (row_count() < 2)
    return false;
  if (column_count() < 2)
    return false;
  return cell_count() >= 6;
}

Member Data Documentation

TBOX tesseract::StructuredTable::bounding_box_
protected

Definition at line 242 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_x_
protected

Definition at line 243 of file tablerecog.h.

GenericVectorEqEq<int> tesseract::StructuredTable::cell_y_
protected

Definition at line 244 of file tablerecog.h.

bool tesseract::StructuredTable::is_lined_
protected

Definition at line 245 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::line_grid_
protected

Definition at line 238 of file tablerecog.h.

int tesseract::StructuredTable::max_text_height_
protected

Definition at line 254 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_height_
protected

Definition at line 251 of file tablerecog.h.

int tesseract::StructuredTable::median_cell_width_
protected

Definition at line 252 of file tablerecog.h.

int tesseract::StructuredTable::space_above_
protected

Definition at line 247 of file tablerecog.h.

int tesseract::StructuredTable::space_below_
protected

Definition at line 248 of file tablerecog.h.

int tesseract::StructuredTable::space_left_
protected

Definition at line 249 of file tablerecog.h.

int tesseract::StructuredTable::space_right_
protected

Definition at line 250 of file tablerecog.h.

ColPartitionGrid* tesseract::StructuredTable::text_grid_
protected

Definition at line 237 of file tablerecog.h.


The documentation for this class was generated from the following files: