Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
paragraphs_internal.h
Go to the documentation of this file.
1
/**********************************************************************
 * File:        paragraphs_internal.h
 * Description: Paragraph Detection internal data structures.
 * Author:      David Eger
 * Created:     11 March 2011
 *
 * (C) Copyright 2011, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
19
20
#ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
21
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
22
23
#include "
paragraphs.h
"
24
#ifdef _MSC_VER
25
#include <string>
26
#else
27
#include "strings.h"
28
#endif
29
30
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
31
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
32
33
class
WERD_CHOICE
;
34
35
namespace
tesseract
{
36
37
// Return whether the given word is likely to be a list item start word.
38
bool
AsciiLikelyListItem
(
const
STRING
&word);
39
40
// Return the first Unicode Codepoint from werd[pos].
41
int
UnicodeFor
(
const
UNICHARSET
*u,
const
WERD_CHOICE
*werd,
int
pos);
42
43
// Set right word attributes given either a unicharset and werd or a utf8
44
// string.
45
void
RightWordAttributes
(
const
UNICHARSET
*unicharset,
const
WERD_CHOICE
*werd,
46
const
STRING
&utf8,
47
bool
*is_list,
bool
*starts_idea,
bool
*ends_idea);
48
49
// Set left word attributes given either a unicharset and werd or a utf8 string.
50
void
LeftWordAttributes
(
const
UNICHARSET
*unicharset,
const
WERD_CHOICE
*werd,
51
const
STRING
&utf8,
52
bool
*is_list,
bool
*starts_idea,
bool
*ends_idea);
53
54
// The role a text line plays within a paragraph. Each enumerator's value is
// a printable character code.
enum LineType {
  LT_START = 'S',    // First line of a paragraph.
  LT_BODY = 'C',     // Continuation line of a paragraph.
  LT_UNKNOWN = 'U',  // No clues.
  LT_MULTIPLE = 'M'  // Matches for both LT_START and LT_BODY.
};
60
61
// The first paragraph in a page of body text is often un-indented.
62
// This is a typographic convention which is common to indicate either that:
63
// (1) The paragraph is the continuation of a previous paragraph, or
64
// (2) The paragraph is the first paragraph in a chapter.
65
//
66
// I refer to such paragraphs as "crown"s, and the output of the paragraph
67
// detection algorithm attempts to give them the same paragraph model as
68
// the rest of the body text.
69
//
70
// Nonetheless, while building hypotheses, it is useful to mark the lines
71
// of crown paragraphs temporarily as crowns, either aligned left or right.
72
extern
const
ParagraphModel
*
kCrownLeft
;
73
extern
const
ParagraphModel
*
kCrownRight
;
74
75
inline
bool
StrongModel
(
const
ParagraphModel
*model) {
76
return
model !=
NULL
&& model !=
kCrownLeft
&& model !=
kCrownRight
;
77
}
78
79
struct
LineHypothesis
{
80
LineHypothesis
() :
ty
(
LT_UNKNOWN
),
model
(
NULL
) {}
81
LineHypothesis
(
LineType
line_type,
const
ParagraphModel
*m)
82
:
ty
(line_type),
model
(m) {}
83
LineHypothesis
(
const
LineHypothesis
&other)
84
:
ty
(other.
ty
),
model
(other.
model
) {}
85
86
bool
operator==
(
const
LineHypothesis
&other)
const
{
87
return
ty
== other.
ty
&&
model
== other.
model
;
88
}
89
90
LineType
ty
;
91
const
ParagraphModel
*
model
;
92
};
93
94
class
ParagraphTheory
;
// Forward Declaration
95
96
typedef
GenericVectorEqEq<const ParagraphModel *>
SetOfModels
;
97
98
// Row Scratch Registers are data generated by the paragraph detection
99
// algorithm based on a RowInfo input.
100
class
RowScratchRegisters
{
101
public
:
102
// We presume row will outlive us.
103
void
Init
(
const
RowInfo
&row);
104
105
LineType
GetLineType
()
const
;
106
107
LineType
GetLineType
(
const
ParagraphModel
*model)
const
;
108
109
// Mark this as a start line type, sans model. This is useful for the
110
// initial marking of probable body lines or paragraph start lines.
111
void
SetStartLine
();
112
113
// Mark this as a body line type, sans model. This is useful for the
114
// initial marking of probably body lines or paragraph start lines.
115
void
SetBodyLine
();
116
117
// Record that this row fits as a paragraph start line in the given model,
118
void
AddStartLine
(
const
ParagraphModel
*model);
119
// Record that this row fits as a paragraph body line in the given model,
120
void
AddBodyLine
(
const
ParagraphModel
*model);
121
122
// Clear all hypotheses about this line.
123
void
SetUnknown
() { hypotheses_.
truncate
(0); }
124
125
// Append all hypotheses of strong models that match this row as a start.
126
void
StartHypotheses
(
SetOfModels
*models)
const
;
127
128
// Append all hypotheses of strong models matching this row.
129
void
StrongHypotheses
(
SetOfModels
*models)
const
;
130
131
// Append all hypotheses for this row.
132
void
NonNullHypotheses
(
SetOfModels
*models)
const
;
133
134
// Discard any hypotheses whose model is not in the given list.
135
void
DiscardNonMatchingHypotheses
(
const
SetOfModels
&models);
136
137
// If we have only one hypothesis and that is that this line is a paragraph
138
// start line of a certain model, return that model. Else return NULL.
139
const
ParagraphModel
*
UniqueStartHypothesis
()
const
;
140
141
// If we have only one hypothesis and that is that this line is a paragraph
142
// body line of a certain model, return that model. Else return NULL.
143
const
ParagraphModel
*
UniqueBodyHypothesis
()
const
;
144
145
// Return the indentation for the side opposite of the aligned side.
146
int
OffsideIndent
(
tesseract::ParagraphJustification
just)
const
{
147
switch
(just) {
148
case
tesseract::JUSTIFICATION_RIGHT
:
return
lindent_
;
149
case
tesseract::JUSTIFICATION_LEFT
:
return
rindent_
;
150
default
:
return
lindent_
>
rindent_
?
lindent_
:
rindent_
;
151
}
152
}
153
154
// Return the indentation for the side the text is aligned to.
155
int
AlignsideIndent
(
tesseract::ParagraphJustification
just)
const
{
156
switch
(just) {
157
case
tesseract::JUSTIFICATION_RIGHT
:
return
rindent_
;
158
case
tesseract::JUSTIFICATION_LEFT
:
return
lindent_
;
159
default
:
return
lindent_
>
rindent_
?
lindent_
:
rindent_
;
160
}
161
}
162
163
// Append header fields to a vector of row headings.
164
static
void
AppendDebugHeaderFields
(
GenericVector<STRING>
*header);
165
166
// Append data for this row to a vector of debug strings.
167
void
AppendDebugInfo
(
const
ParagraphTheory
&theory,
168
GenericVector<STRING>
*dbg)
const
;
169
170
const
RowInfo
*
ri_
;
171
172
// These four constants form a horizontal box model for the white space
173
// on the edges of each line. At each point in the algorithm, the following
174
// shall hold:
175
// ri_->pix_ldistance = lmargin_ + lindent_
176
// ri_->pix_rdistance = rindent_ + rmargin_
177
int
lmargin_
;
178
int
lindent_
;
179
int
rindent_
;
180
int
rmargin_
;
181
182
private
:
183
// Hypotheses of either LT_START or LT_BODY
184
GenericVectorEqEq<LineHypothesis>
hypotheses_;
185
};
186
187
// A collection of convenience functions for wrapping the set of
188
// Paragraph Models we believe correctly model the paragraphs in the image.
189
class
ParagraphTheory
{
190
public
:
191
// We presume models will outlive us, and that models will take ownership
192
// of any ParagraphModel *'s we add.
193
explicit
ParagraphTheory
(
GenericVector<ParagraphModel *>
*
models
)
194
: models_(models) {}
195
GenericVector<ParagraphModel *>
&
models
() {
return
*models_; }
196
const
GenericVector<ParagraphModel *>
&
models
()
const
{
return
*models_; }
197
198
// Return an existing model if one that is Comparable() can be found.
199
// Else, allocate a new copy of model to save and return a pointer to it.
200
const
ParagraphModel
*
AddModel
(
const
ParagraphModel
&model);
201
202
// Discard any models we've made that are not in the list of used models.
203
void
DiscardUnusedModels
(
const
SetOfModels
&used_models);
204
205
// Return the set of all non-centered models.
206
void
NonCenteredModels
(
SetOfModels
*
models
);
207
208
// If any of the non-centered paragraph models we know about fit
209
// rows[start, end), return it. Else NULL.
210
const
ParagraphModel
*
Fits
(
const
GenericVector<RowScratchRegisters>
*rows,
211
int
start,
int
end)
const
;
212
213
int
IndexOf
(
const
ParagraphModel
*model)
const
;
214
215
private
:
216
GenericVector<ParagraphModel *>
*models_;
217
GenericVectorEqEq<ParagraphModel *>
models_we_added_;
218
};
219
220
// Predicates over the rows of a text block, given a paragraph model.
// NOTE(review): these are undocumented in this header; judging by the names
// they test whether rows[row] is acceptable as a first / body line of model,
// and whether rows a and b are compatible as crown lines for model.  Confirm
// the exact semantics in paragraphs.cpp.
bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
                    int row, const ParagraphModel *model);
bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
                   int row, const ParagraphModel *model);
bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
                     int a, int b, const ParagraphModel *model);
226
227
// A class for smearing Paragraph Model hypotheses to surrounding rows.
228
// The idea here is that StrongEvidenceClassify first marks only exceedingly
229
// obvious start and body rows and constructs models of them. Thereafter,
230
// we may have left over unmarked lines (mostly end-of-paragraph lines) which
231
// were too short to have much confidence about, but which fit the models we've
232
// constructed perfectly and which we ought to mark. This class is used to
233
// "smear" our models over the text.
234
class
ParagraphModelSmearer
{
235
public
:
236
ParagraphModelSmearer
(
GenericVector<RowScratchRegisters>
*rows,
237
int
row_start,
int
row_end,
238
ParagraphTheory
*theory);
239
240
// Smear forward paragraph models from existing row markings to subsequent
241
// text lines if they fit, and mark any thereafter still unmodeled rows
242
// with any model in the theory that fits them.
243
void
Smear
();
244
245
private
:
246
// Record in open_models_ for rows [start_row, end_row) the list of models
247
// currently open at each row.
248
// A model is still open in a row if some previous row has said model as a
249
// start hypothesis, and all rows since (including this row) would fit as
250
// either a body or start line in that model.
251
void
CalculateOpenModels(
int
row_start,
int
row_end);
252
253
SetOfModels
&OpenModels(
int
row) {
254
return
open_models_[row - row_start_ + 1];
255
}
256
257
ParagraphTheory
*theory_;
258
GenericVector<RowScratchRegisters>
*rows_;
259
int
row_start_;
260
int
row_end_;
261
262
// open_models_ corresponds to rows[start_row_ - 1, end_row_]
263
//
264
// open_models_: Contains models which there was an active (open) paragraph
265
// as of the previous line and for which the left and right
266
// indents admit the possibility that this text line continues
267
// to fit the same model.
268
// TODO(eger): Think about whether we can get rid of "Open" models and just
269
// use the current hypotheses on RowScratchRegisters.
270
GenericVector<SetOfModels>
open_models_;
271
};
272
273
// Clear all hypotheses about lines [start, end) and reset the margins to the
274
// percentile (0..100) value of the left and right row edges for this run of
275
// rows.
276
void
RecomputeMarginsAndClearHypotheses
(
277
GenericVector<RowScratchRegisters>
*rows,
int
start,
int
end,
278
int
percentile);
279
280
// Return the median inter-word space in rows[row_start, row_end).
281
int
InterwordSpace
(
const
GenericVector<RowScratchRegisters>
&rows,
282
int
row_start,
int
row_end);
283
284
// Return whether the first word on the after line can fit in the space at
285
// the end of the before line (knowing which way the text is aligned and read).
286
bool
FirstWordWouldHaveFit
(
const
RowScratchRegisters
&before,
287
const
RowScratchRegisters
&after,
288
tesseract::ParagraphJustification
justification);
289
290
// Return whether the first word on the after line can fit in the space at
291
// the end of the before line (not knowing the text alignment).
292
bool
FirstWordWouldHaveFit
(
const
RowScratchRegisters
&before,
293
const
RowScratchRegisters
&after);
294
295
// Do rows[start, end) form a single instance of the given paragraph model?
296
bool
RowsFitModel
(
const
GenericVector<RowScratchRegisters>
*rows,
297
int
start,
int
end,
const
ParagraphModel
*model);
298
299
// Do the text and geometry of two rows support a paragraph break between them?
300
bool
LikelyParagraphStart
(
const
RowScratchRegisters
&before,
301
const
RowScratchRegisters
&after,
302
tesseract::ParagraphJustification
j);
303
304
// Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
305
// normalize each row_owner to point to an actual PARA, and output the
306
// paragraphs in order onto paragraphs.
307
void
CanonicalizeDetectionResults
(
308
GenericVector<PARA *>
*row_owners,
309
PARA_LIST *paragraphs);
310
311
}
// namespace tesseract
312
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
mnt
data
src
tesseract-ocr
ccmain
paragraphs_internal.h
Generated on Thu Nov 1 2012 20:19:43 for Tesseract by
1.8.1