Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
tablerecog.h
Go to the documentation of this file.
1
2
// File: tablerecog.h
3
// Description: Functions to detect structure of tables.
4
// Author: Nicholas Beato
5
// Created: Aug 17, 2010
6
//
7
// (C) Copyright 2010, Google Inc.
8
// Licensed under the Apache License, Version 2.0 (the "License");
9
// you may not use this file except in compliance with the License.
10
// You may obtain a copy of the License at
11
// http://www.apache.org/licenses/LICENSE-2.0
12
// Unless required by applicable law or agreed to in writing, software
13
// distributed under the License is distributed on an "AS IS" BASIS,
14
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
// See the License for the specific language governing permissions and
16
// limitations under the License.
17
//
19
20
#ifndef TABLERECOG_H_
21
#define TABLERECOG_H_
22
23
#include "
colpartitiongrid.h
"
24
#include "
genericvector.h
"
25
26
namespace
tesseract
{
27
28
// There are 2 classes in this file. They have 2 different purposes.
29
// - StructuredTable contains the methods to find the structure given
30
// a specific bounding box and grow that structure.
31
// - TableRecognizer contains the methods to adjust the possible positions
32
// of a table without worrying about structure.
33
//
34
// To use these classes, the assumption is that the TableFinder will
35
// have a guess of the location of a table (or possibly over/undersegmented
36
// tables). The TableRecognizer is responsible for finding the table boundaries
37
// at a high level. The StructuredTable class is responsible for determining
38
// the structure of the table and trying to maximize its bounds while retaining
39
// the structure.
40
// (The latter part is not implemented yet, but that was the goal).
41
//
42
// While on the boundary discussion, keep in mind that this is a first pass.
43
// There should eventually be some things like internal structure checks,
44
// and, more importantly, surrounding text flow checks.
45
//
46
47
// Usage:
48
// The StructuredTable class contains methods to query a potential table.
49
// It has functions to find structure, count rows, find ColPartitions that
50
// intersect gridlines, etc. It is not meant to blindly find a table. It
51
// is meant to start with a known table location and enhance it.
52
// Usage:
53
// ColPartitionGrid text_grid, line_grid; // init
54
// TBOX table_box; // known location of table location
55
//
56
// StructuredTable table;
57
// table.Init(); // construction code
58
// table.set_text_grid(/* text */); // These 2 grids can be the same!
59
// table.set_line_grid(/* lines */);
60
// table.set_min_text_height(10); // Filter vertical and tall text.
61
// // IMPORTANT! The table needs to be told where it is!
62
// table.set_bounding_box(table_box); // Set initial table location.
63
// if (table.FindWhitespacedStructure()) {
64
// // process table
65
// table.column_count(); // number of columns
66
// table.row_count(); // number of rows
67
// table.cells_count(); // number of cells
68
// table.bounding_box(); // updated bounding box
69
// // etc.
70
// }
71
//
72
class
StructuredTable
{
73
public
:
74
StructuredTable
();
75
~StructuredTable
();
76
77
// Initialization code. Must be called after the constructor.
78
void
Init
();
79
80
// Sets the grids used by the table. These can be changed between
81
// calls to Recognize. They are treated as read-only data.
82
void
set_text_grid
(
ColPartitionGrid
* text);
83
void
set_line_grid
(
ColPartitionGrid
* lines);
84
// Filters text partitions that are ridiculously tall to prevent
85
// merging rows.
86
void
set_max_text_height
(
int
height);
87
88
// Basic accessors. Some are treated as attributes despite having indirect
89
// representation.
90
bool
is_lined
()
const
;
91
int
row_count
()
const
;
92
int
column_count
()
const
;
93
int
cell_count
()
const
;
94
void
set_bounding_box
(
const
TBOX
& box);
95
const
TBOX
&
bounding_box
()
const
;
96
int
median_cell_height
();
97
int
median_cell_width
();
98
int
row_height
(
int
row)
const
;
99
int
column_width
(
int
column)
const
;
100
int
space_above
()
const
;
101
int
space_below
()
const
;
102
103
// Given enough horizontal and vertical lines in a region, create this table
104
// based on the structure given by the lines. Return true if it worked out.
105
// Code assumes the lines exist. It is the caller's responsibility to check
106
// for lines and find an appropriate bounding box.
107
bool
FindLinedStructure
();
108
109
// The main subroutine for finding generic table structure. The function
110
// finds the grid structure in the given box. Returns true if a good grid
111
// exists, implying that "this" table is valid.
112
bool
FindWhitespacedStructure
();
113
117
118
// Returns true if inserting part into the table does not cause any
119
// cell merges.
120
bool
DoesPartitionFit
(
const
ColPartition
& part)
const
;
121
// Checks if a sub-table has multiple data cells filled.
122
int
CountFilledCells
();
123
int
CountFilledCellsInRow
(
int
row);
124
int
CountFilledCellsInColumn
(
int
column);
125
int
CountFilledCells
(
int
row_start,
int
row_end,
126
int
column_start,
int
column_end);
127
128
// Makes sure that at least one cell in a row has substantial area filled.
129
// This can filter out large whitespace caused by growing tables too far
130
// and page numbers.
131
// (currently bugged for some reason).
132
bool
VerifyRowFilled
(
int
row);
133
// Finds the filled area in a cell.
134
double
CalculateCellFilledPercentage
(
int
row,
int
column);
135
136
// Debug display, draws the table in the given color. If the table is not
137
// valid, the table and "best" grid lines are still drawn in the given color.
138
void
Display
(
ScrollView
* window,
ScrollView::Color
color);
139
140
protected
:
141
// Clear the structure information.
142
void
ClearStructure
();
143
147
148
// Verifies the lines do not intersect partitions. This happens when
149
// the lines are in column boundaries and extend the full page. As a result,
150
// the grid lines go through column text. The condition is detectable.
151
bool
VerifyLinedTableCells
();
152
156
157
// This is the function to change if you want to filter resulting tables
158
// better. Right now it just checks for a minimum cell count and such.
159
// You could add things like maximum number of ColPartitions per cell or
160
// similar.
161
bool
VerifyWhitespacedTable
();
162
// Find the columns of a table using whitespace.
163
void
FindWhitespacedColumns
();
164
// Find the rows of a table using whitespace.
165
void
FindWhitespacedRows
();
166
170
171
// Calculates the whitespace around the table using the table boundary and
172
// the supplied grids (set_text_grid and set_line_grid).
173
void
CalculateMargins
();
174
// Update the table margins with the supplied grid. This is
175
// only called by calculate margins to use multiple grid sources.
176
void
UpdateMargins
(
ColPartitionGrid
* grid);
177
int
FindVerticalMargin
(
ColPartitionGrid
* grid,
int
start_x,
178
bool
decrease)
const
;
179
int
FindHorizontalMargin
(
ColPartitionGrid
* grid,
int
start_y,
180
bool
decrease)
const
;
181
// Calculates stats on the table, namely the median cell height and width.
182
void
CalculateStats
();
183
187
188
// Given a whitespaced table, this looks for bordering lines that might
189
// be page layout boxes around the table. It is necessary to get the margins
190
// correct on the table. If the lines are not joined, the margins will be
191
// the distance to the line, which is not right.
192
void
AbsorbNearbyLines
();
193
194
// Nice utility function for finding partition gaps. You feed it a sorted
195
// list of all of the mins/maxes of the partitions in the table, and it gives
196
// you the gaps (middle). This works for both vertical and horizontal
197
// gaps.
198
//
199
// If you want to allow slight overlap in the division and the partitions,
200
// just scale down the partitions before inserting them in the list.
201
// Likewise, you can force at least some space between partitions.
202
// This trick is how the horizontal partitions are done (since the page
203
// skew could make it hard to find splits in the text).
204
//
205
// As a result, "0 distance" between closest partitions causes a gap.
206
// This is not a programmatic assumption. It is intentional and simplifies
207
// things.
208
//
209
// "max_merged" indicates both the minimum number of stacked partitions
210
// to cause a cell (add 1 to it), and the maximum number of partitions that
211
// a grid line can intersect. For example, if max_merged is 0, then lines
212
// are inserted wherever space exists between partitions. If it is 2,
213
// lines may intersect 2 partitions at most, but you also need at least
214
// 2 partitions to generate a line.
215
static
void
FindCellSplitLocations
(
const
GenericVector<int>
& min_list,
216
const
GenericVector<int>
& max_list,
217
int
max_merged,
218
GenericVector<int>
* locations);
219
223
224
// Counts the number of ColPartitions that intersect vertical cell
225
// division at this x value. Used by VerifyLinedTable.
226
int
CountVerticalIntersections
(
int
x);
227
int
CountHorizontalIntersections
(
int
y);
228
229
// Counts how many text partitions are in this box.
230
int
CountPartitions
(
const
TBOX
& box);
231
235
236
// Input data, used as read only data to make decisions.
237
ColPartitionGrid
*
text_grid_
;
// Text ColPartitions
238
ColPartitionGrid
*
line_grid_
;
// Line ColPartitions
239
// Table structure.
240
// bounding box is a convenient external representation.
241
// cell_x_ and cell_y_ indicate the grid lines.
242
TBOX
bounding_box_
;
// Bounding box
243
GenericVectorEqEq<int>
cell_x_
;
// Locations of vertical divisions (sorted)
244
GenericVectorEqEq<int>
cell_y_
;
// Locations of horizontal divisions (sorted)
245
bool
is_lined_
;
// Is the table backed up by a line structure
246
// Table margins, set via CalculateMargins
247
int
space_above_
;
248
int
space_below_
;
249
int
space_left_
;
250
int
space_right_
;
251
int
median_cell_height_
;
252
int
median_cell_width_
;
253
// Filters, used to prevent awkward partitions from destroying structure.
254
int
max_text_height_
;
255
};
256
257
class
TableRecognizer
{
258
public
:
259
TableRecognizer
();
260
~TableRecognizer
();
261
262
// Initialization code. Must be called after the constructor.
263
void
Init
();
264
268
269
// Sets the grids used by the table. These can be changed between
270
// calls to Recognize. They are treated as read-only data.
271
void
set_text_grid
(
ColPartitionGrid
* text);
272
void
set_line_grid
(
ColPartitionGrid
* lines);
273
// Sets some additional constraints on the table.
274
void
set_min_height
(
int
height);
275
void
set_min_width
(
int
width);
276
// Filters text partitions that are ridiculously tall to prevent
277
// merging rows. Note that "filters" refers to allowing horizontal
278
// cells to slice through them on the premise that they were
279
// merged text rows during previous layout.
280
void
set_max_text_height
(
int
height);
281
282
// Given a guess location, the RecognizeTable function will try to find a
283
// structured grid in the area. On success, it will return a new
284
// StructuredTable (and assumes you will delete it). Otherwise,
285
// NULL is returned.
286
//
287
// Keep in mind, this may "overgrow" or "undergrow" the size of guess.
288
// Ideally, there is a either a one-to-one correspondence between
289
// the guess and table or no table at all. This is not the best of
290
// assumptions right now, but was made to try to keep things simple in
291
// the first pass.
292
//
293
// If a line structure is available on the page in the given region,
294
// the table will use the linear structure as it is.
295
// Otherwise, it will try to maximize the whitespace around it while keeping
296
// a grid structure. This is somewhat working.
297
//
298
// Since the combination of adjustments can get high, effort was
299
// originally made to keep the number of adjustments linear in the number
300
// of partitions. The underlying structure finding code used to be
301
// much more complex. I don't know how necessary this constraint is anymore.
302
// The evaluation of a possible table is kept within O(nlogn) in the size of
303
// the table (where size is the number of partitions in the table).
304
// As a result, the algorithm is capable of O(n^2 log n). Depending
305
// on the grid search size, it may be higher.
306
//
307
// Last note: it is possible to just try all partition boundaries at a high
308
// level O(n^4) and do a verification scheme (at least O(nlogn)). If there
309
// area 200 partitions on a page, this could be too costly. Effort could go
310
// into pruning the search, but I opted for something quicker. I'm confident
311
// that the independent adjustments can get similar results and keep the
312
// complextiy down. However, the other approach could work without using
313
// TableFinder at all if it is fast enough. It comes down to properly
314
// deciding what is a table. The code currently relies on TableFinder's
315
// guess to the location of a table for that.
316
StructuredTable
*
RecognizeTable
(
const
TBOX
& guess_box);
317
318
protected
:
322
323
// Returns true if the given box has a lined table within it. The
324
// table argument will be updated with the table if the table exists.
325
bool
RecognizeLinedTable
(
const
TBOX
& guess_box,
StructuredTable
* table);
326
// Returns true if the given box has a large number of horizontal and
327
// vertical lines present. If so, we assume the extent of these lines
328
// uniquely defines a table and find that table via SolveLinedTable.
329
bool
HasSignificantLines
(
const
TBOX
& guess);
330
331
// Given enough horizontal and vertical lines in a region, find a bounding
332
// box that encloses all of them (as well as newly introduced lines).
333
// The bounding box is the smallest box that encloses the lines in guess
334
// without having any lines sticking out of it.
335
// bounding_box is an in/out parameter.
336
// On input, it in the extents of the box to search.
337
// On output, it is the resulting bounding box.
338
bool
FindLinesBoundingBox
(
TBOX
* bounding_box);
339
// Iteration in above search.
340
// bounding_box is an in/out parameter.
341
// On input, it in the extents of the box to search.
342
// On output, it is the resulting bounding box.
343
bool
FindLinesBoundingBoxIteration
(
TBOX
* bounding_box);
344
348
349
// Returns true if the given box has a whitespaced table within it. The
350
// table argument will be updated if the table exists. Also note
351
// that this method will fail if the guess_box center is not
352
// mostly within the table.
353
bool
RecognizeWhitespacedTable
(
const
TBOX
& guess_box,
StructuredTable
* table);
354
355
// Finds the location of a horizontal split relative to y.
356
// This function is mostly unused now. If the SolveWhitespacedTable
357
// changes much, it can be removed. Note, it isn't really as reliable
358
// as I thought. I went with alternatives for most of the other uses.
359
int
NextHorizontalSplit
(
int
left,
int
right,
int
y,
bool
top_to_bottom);
360
361
// Indicates that a table row is weak. This means that it has
362
// many missing data cells or very large cell heights compared.
363
// to the rest of the table.
364
static
bool
IsWeakTableRow
(
StructuredTable
* table,
int
row);
365
366
// Input data, used as read only data to make decisions.
367
ColPartitionGrid
*
text_grid_
;
// Text ColPartitions
368
ColPartitionGrid
*
line_grid_
;
// Line ColPartitions
369
// Table constraints, a "good" table must satisfy these.
370
int
min_height_
;
371
int
min_width_
;
372
// Filters, used to prevent awkward partitions from destroying structure.
373
int
max_text_height_
;
// Horizontal lines may intersect taller text.
374
};
375
376
}
// namespace tesseract
377
378
#endif
/* TABLERECOG_H_ */
mnt
data
src
tesseract-ocr
textord
tablerecog.h
Generated on Thu Nov 1 2012 20:19:51 for Tesseract by
1.8.1