Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TableRecognizer Class Reference

#include <tablerecog.h>

List of all members.

Public Member Functions

 TableRecognizer ()
 ~TableRecognizer ()
void Init ()
void set_text_grid (ColPartitionGrid *text)
void set_line_grid (ColPartitionGrid *lines)
void set_min_height (int height)
void set_min_width (int width)
void set_max_text_height (int height)
StructuredTable * RecognizeTable (const TBOX &guess_box)

Protected Member Functions

bool RecognizeLinedTable (const TBOX &guess_box, StructuredTable *table)
bool HasSignificantLines (const TBOX &guess)
bool FindLinesBoundingBox (TBOX *bounding_box)
bool FindLinesBoundingBoxIteration (TBOX *bounding_box)
bool RecognizeWhitespacedTable (const TBOX &guess_box, StructuredTable *table)
int NextHorizontalSplit (int left, int right, int y, bool top_to_bottom)

Static Protected Member Functions

static bool IsWeakTableRow (StructuredTable *table, int row)

Protected Attributes

ColPartitionGrid * text_grid_
ColPartitionGrid * line_grid_
int min_height_
int min_width_
int max_text_height_

Detailed Description

Definition at line 257 of file tablerecog.h.


Constructor & Destructor Documentation

tesseract::TableRecognizer::TableRecognizer ( )
tesseract::TableRecognizer::~TableRecognizer ( )

Definition at line 706 of file tablerecog.cpp.

{
// Empty body: this listing shows no statements here.
}

Member Function Documentation

bool tesseract::TableRecognizer::FindLinesBoundingBox ( TBOX * bounding_box)
protected

Definition at line 807 of file tablerecog.cpp.

{
// Repeatedly calls FindLinesBoundingBoxIteration on *bounding_box, updating
// it in place, until a pass no longer increases its area.
// Returns false if the very first pass finds no lines, true otherwise.
//
// The first iteration will tell us if there are lines
// present and shrink the box to a minimal iterative size.
if (!FindLinesBoundingBoxIteration(bounding_box))
return false;
// Keep growing until the area of the table stabilizes.
// The box can only get bigger, increasing area.
bool changed = true;
while (changed) {
changed = false;
int old_area = bounding_box->area();
bool check = FindLinesBoundingBoxIteration(bounding_box);
// At this point, the function will return true.
ASSERT_HOST(check);
// A pass must never shrink the box; the loop stops on the first pass
// that leaves the area unchanged.
ASSERT_HOST(bounding_box->area() >= old_area);
changed = (bounding_box->area() > old_area);
}
return true;
}
bool tesseract::TableRecognizer::FindLinesBoundingBoxIteration ( TBOX * bounding_box)
protected

Definition at line 829 of file tablerecog.cpp.

{
// One search pass over the current *bounding_box. The box is overwritten
// with the bounding box of the first line-type partition found and then
// extended (operator+=) with each further one. If no line-type partition
// is found the box is left unchanged.
// Returns true if at least one line-type partition was found.
//
// NOTE(review): box_search is not declared anywhere in this listing;
// presumably it is a grid search over line_grid_ -- confirm against
// tablerecog.cpp.
//
// Search for all of the lines in the current box, keeping track of extents.
box_search.SetUniqueMode(true);
box_search.StartRectSearch(*bounding_box);
ColPartition* line = NULL;
bool first_line = true;
while ((line = box_search.NextRectSearch()) != NULL) {
if (line->IsLineType()) {
if (first_line) {
// The first iteration can shrink the box.
*bounding_box = line->bounding_box();
first_line = false;
} else {
*bounding_box += line->bounding_box();
}
}
}
// first_line is still true only when no line-type partition was seen.
return !first_line;
}
bool tesseract::TableRecognizer::HasSignificantLines ( const TBOX & guess)
protected

Definition at line 768 of file tablerecog.cpp.

{
// Counts the horizontal and vertical lines returned by a rectangle search
// over guess. Returns true only when both counts reach their minimums
// (kLinedTableMinVerticalLines and kLinedTableMinHorizontalLines).
//
// NOTE(review): box_search is not declared anywhere in this listing;
// presumably it is a grid search over line_grid_ -- confirm against
// tablerecog.cpp.
box_search.SetUniqueMode(true);
box_search.StartRectSearch(guess);
ColPartition* line = NULL;
int vertical_count = 0;
int horizontal_count = 0;
while ((line = box_search.NextRectSearch()) != NULL) {
// The two checks are independent ifs, not if/else.
if (line->IsHorizontalLine())
++horizontal_count;
if (line->IsVerticalLine())
++vertical_count;
}
return vertical_count >= kLinedTableMinVerticalLines &&
horizontal_count >= kLinedTableMinHorizontalLines;
}
void tesseract::TableRecognizer::Init ( )

Definition at line 709 of file tablerecog.cpp.

{
// Empty body: Init() performs no work in this listing.
}
bool tesseract::TableRecognizer::IsWeakTableRow ( StructuredTable * table,
int  row 
)
static protected

Definition at line 1045 of file tablerecog.cpp.

{
// Returns true when the row passes table->VerifyRowFilled(row) but its
// number of filled cells is below a threshold chosen from the table's
// column count. Returns false when VerifyRowFilled(row) fails.
if (!table->VerifyRowFilled(row))
return false;
double threshold = 0.0;
// Wide tables use a threshold proportional to the column count; narrower
// tables look the threshold up in kGoodRowNumberOfColumnsSmall.
// NOTE(review): column_count() can equal kGoodRowNumberOfColumnsSmallSize
// on the lookup path, so the array must have at least that many + 1
// entries -- confirm against its definition.
if (table->column_count() > kGoodRowNumberOfColumnsSmallSize)
threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
else
threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
return table->CountFilledCellsInRow(row) < threshold;
}
int tesseract::TableRecognizer::NextHorizontalSplit ( int  left,
int  right,
int  y,
bool  top_to_bottom 
)
protected

Definition at line 1011 of file tablerecog.cpp.

{
// Walks text partitions between left and right, starting at y and moving
// downward (top_to_bottom == true) or upward (false). It tracks a running
// y extent in last_y and returns it at the first partition that no longer
// touches that extent. If the search runs out of partitions, the extent
// reached so far is returned.
//
// NOTE(review): gsearch is not declared anywhere in this listing;
// presumably it is a grid search over text_grid_ -- confirm against
// tablerecog.cpp.
gsearch.SetUniqueMode(true);
gsearch.StartVerticalSearch(left, right, y);
ColPartition* text = NULL;
int last_y = y;
while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != NULL) {
// Only horizontal text partitions no taller than max_text_height_ count.
if (!text->IsTextType() || !text->IsHorizontalType())
continue;
if (text->bounding_box().height() > max_text_height_)
continue;
const TBOX& text_box = text->bounding_box();
// Downward: while last_y has not yet dropped below y, or this partition's
// top reaches last_y, absorb the partition by lowering last_y to its bottom.
if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
last_y = MIN(last_y, text_box.bottom());
continue;
}
// Upward: the mirror image, raising last_y to the partition's top.
if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
last_y = MAX(last_y, text_box.top());
continue;
}
// This partition does not touch the running extent: last_y is the split.
return last_y;
}
// If none is found, we at least want to preserve the min/max,
// which defines the overlap of y with the last partition in the grid.
return last_y;
}
bool tesseract::TableRecognizer::RecognizeLinedTable ( const TBOX & guess_box,
StructuredTable * table 
)
protected

Definition at line 751 of file tablerecog.cpp.

{
// Tries to recognize a table delimited by ruling lines inside guess_box.
// Returns false if HasSignificantLines(guess_box) fails or no line
// bounding box can be found. Otherwise it sets the table's bounding box
// to the lines' bounding box and returns table->FindLinedStructure().
if (!HasSignificantLines(guess_box))
return false;
// Work on a copy so guess_box itself is not modified.
TBOX line_bound = guess_box;
if (!FindLinesBoundingBox(&line_bound))
return false;
table->set_bounding_box(line_bound);
return table->FindLinedStructure();
}
StructuredTable * tesseract::TableRecognizer::RecognizeTable ( const TBOX & guess_box)

Definition at line 728 of file tablerecog.cpp.

{
// Allocates a StructuredTable with new, configures it from this
// recognizer's grids and max text height, and tries the lined recognizer
// first, then the whitespace recognizer. Returns the table on the first
// success; otherwise deletes it and returns NULL.
// NOTE(review): a returned table is not freed here, so presumably the
// caller owns it -- confirm against the callers.
// NOTE(review): the body refers to the parameter as `guess`, while the
// signature shown on this page names it `guess_box`; presumably the
// declaration and the definition use different parameter names.
StructuredTable* table = new StructuredTable();
table->Init();
table->set_text_grid(text_grid_);
table->set_line_grid(line_grid_);
table->set_max_text_height(max_text_height_);
// Try to solve the simple case, a table with *both*
// vertical and horizontal lines.
if (RecognizeLinedTable(guess, table))
return table;
// Fallback to whitespace if that failed.
// TODO(nbeato): Break this apart to take advantage of horizontal
// lines or vertical lines when present.
if (RecognizeWhitespacedTable(guess, table))
return table;
// No table found...
delete table;
return NULL;
}
bool tesseract::TableRecognizer::RecognizeWhitespacedTable ( const TBOX & guess_box,
StructuredTable * table 
)
protected

Definition at line 867 of file tablerecog.cpp.

{
// Fits a whitespace-delimited table inside guess_box. The left and right
// edges of guess_box are kept; the bottom border is searched for first,
// then the top border. Returns false if either search finds no acceptable
// border or the resulting box is null. Otherwise the table's bounding box
// is set to the best box and the result of FindWhitespacedStructure() is
// returned.
TBOX best_box = guess_box; // Best borders known.
int best_below = 0; // Margin size below best table.
int best_above = 0; // Margin size above best table.
TBOX adjusted = guess_box; // The search box.
// We assume that the guess box is somewhat accurate, so we don't allow
// the adjusted border to pass half of the guessed area. This prevents
// "negative" tables from forming.
const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
// Keeps track of the most columns in an accepted table. The resulting table
// may be less than the max, but we don't want to stray too far.
int best_cols = 0;
// Make sure we find a good border.
bool found_good_border = false;
// Find the bottom of the table by trying a few different locations. For
// each location, the top, left, and right are fixed. We start the search
// in a smaller table to favor best_cols getting a good estimate sooner.
int last_bottom = MAX_INT32;
int bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
kMidGuessY - min_height_ / 2, true);
int top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
kMidGuessY + min_height_ / 2, false);
adjusted.set_top(top);
// Headers/footers can be spaced far from everything.
// Make sure that the space below is greater than the space above
// the lowest row.
int previous_below = 0;
// chances counts down on each failed fit and is reset to kMaxChances on
// each fit that passes the structure and column checks.
const int kMaxChances = 10;
int chances = kMaxChances;
// The loop ends when NextHorizontalSplit returns the same y it was given,
// i.e. no further split is found, or when chances runs out.
while (bottom != last_bottom) {
adjusted.set_bottom(bottom);
// Boxes shorter than min_height_ are skipped without costing a chance.
if (adjusted.height() >= min_height_) {
// Try to fit the grid on the current box. We give it a chance
// if the number of columns didn't significantly drop.
table->set_bounding_box(adjusted);
if (table->FindWhitespacedStructure() &&
table->column_count() >= best_cols * kRequiredColumns) {
// `false &&` makes this branch unreachable; the weak-row check is off.
if (false && IsWeakTableRow(table, 0)) {
// Currently buggy, but was looking promising so disabled.
--chances;
} else {
// We favor 2 things,
// 1- Adding rows that have partitioned data.
// 2- Better margins (to find header/footer).
// For better tables, we just look for multiple cells in the
// bottom row with data in them.
// For margins, the space below the last row should
// be better than a table with the last row removed.
chances = kMaxChances;
double max_row_height = kMaxRowSize * table->median_cell_height();
if ((table->space_below() * kMarginFactor >= best_below &&
table->space_below() >= previous_below) ||
(table->CountFilledCellsInRow(0) > 1 &&
table->row_height(0) < max_row_height)) {
best_box.set_bottom(bottom);
best_below = table->space_below();
best_cols = MAX(table->column_count(), best_cols);
found_good_border = true;
}
}
previous_below = table->space_below();
} else {
--chances;
}
}
if (chances <= 0)
break;
last_bottom = bottom;
bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(),
last_bottom, true);
}
if (!found_good_border)
return false;
// TODO(nbeato) comments: follow modified code above... put it in a function!
// Second pass: the bottom is fixed at the best value found above, and the
// same search is repeated upward for the top border. It mirrors the first
// loop, using the last row index and space_above() in place of row 0 and
// space_below().
found_good_border = false;
int last_top = MIN_INT32;
top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
kMidGuessY + min_height_ / 2, false);
int previous_above = 0;
chances = kMaxChances;
adjusted.set_bottom(best_box.bottom());
while (last_top != top) {
adjusted.set_top(top);
if (adjusted.height() >= min_height_) {
table->set_bounding_box(adjusted);
if (table->FindWhitespacedStructure() &&
table->column_count() >= best_cols * kRequiredColumns) {
int last_row = table->row_count() - 1;
// `false &&` makes this branch unreachable; the weak-row check is off.
if (false && IsWeakTableRow(table, last_row)) {
// Currently buggy, but was looking promising so disabled.
--chances;
} else {
chances = kMaxChances;
double max_row_height = kMaxRowSize * table->median_cell_height();
if ((table->space_above() * kMarginFactor >= best_above &&
table->space_above() >= previous_above) ||
(table->CountFilledCellsInRow(last_row) > 1 &&
table->row_height(last_row) < max_row_height)) {
best_box.set_top(top);
best_above = table->space_above();
best_cols = MAX(table->column_count(), best_cols);
found_good_border = true;
}
}
previous_above = table->space_above();
} else {
--chances;
}
}
if (chances <= 0)
break;
last_top = top;
top = NextHorizontalSplit(guess_box.left(), guess_box.right(),
last_top, false);
}
if (!found_good_border)
return false;
// If we get here, this shouldn't happen. It can be an assert, but
// I haven't tested it enough to make it crash things.
if (best_box.null_box())
return false;
// Given the best locations, fit the box to those locations.
table->set_bounding_box(best_box);
return table->FindWhitespacedStructure();
}
void tesseract::TableRecognizer::set_line_grid ( ColPartitionGrid * lines)

Definition at line 715 of file tablerecog.cpp.

{
// Stores the grid pointer in line_grid_; RecognizeTable later passes it on
// to the table via set_line_grid().
// NOTE(review): the body names the parameter `line_grid`, while the
// signature shown on this page names it `lines`; presumably the
// declaration and the definition use different parameter names.
line_grid_ = line_grid;
}
void tesseract::TableRecognizer::set_max_text_height ( int  height)

Definition at line 724 of file tablerecog.cpp.

{
// Stores the limit in max_text_height_. NextHorizontalSplit skips text
// partitions taller than this, and RecognizeTable passes it to the table.
max_text_height_ = height;
}
void tesseract::TableRecognizer::set_min_height ( int  height)

Definition at line 718 of file tablerecog.cpp.

{
// Stores the value in min_height_. RecognizeWhitespacedTable only tries to
// fit candidate boxes at least this tall.
min_height_ = height;
}
void tesseract::TableRecognizer::set_min_width ( int  width)

Definition at line 721 of file tablerecog.cpp.

{
// Stores the value in min_width_. None of the function bodies shown on
// this page read it.
min_width_ = width;
}
void tesseract::TableRecognizer::set_text_grid ( ColPartitionGrid * text)

Definition at line 712 of file tablerecog.cpp.

{
// Stores the grid pointer in text_grid_; RecognizeTable later passes it on
// to the table via set_text_grid().
// NOTE(review): the body names the parameter `text_grid`, while the
// signature shown on this page names it `text`; presumably the
// declaration and the definition use different parameter names.
text_grid_ = text_grid;
}

Member Data Documentation

ColPartitionGrid* tesseract::TableRecognizer::line_grid_
protected

Definition at line 368 of file tablerecog.h.

int tesseract::TableRecognizer::max_text_height_
protected

Definition at line 373 of file tablerecog.h.

int tesseract::TableRecognizer::min_height_
protected

Definition at line 370 of file tablerecog.h.

int tesseract::TableRecognizer::min_width_
protected

Definition at line 371 of file tablerecog.h.

ColPartitionGrid* tesseract::TableRecognizer::text_grid_
protected

Definition at line 367 of file tablerecog.h.


The documentation for this class was generated from the following files: