Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
resultiterator.cpp
Go to the documentation of this file.
1 
2 // File: resultiterator.cpp
3 // Description: Iterator for tesseract results that is capable of
4 // iterating in proper reading order over Bi Directional
5 // (e.g. mixed Hebrew and English) text.
6 // Author: David Eger
7 // Created: Fri May 27 13:58:06 PST 2011
8 //
9 // (C) Copyright 2011, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "resultiterator.h"
23 
24 #include "allheaders.h"
25 #include "pageres.h"
26 #include "strngs.h"
27 #include "tesseractclass.h"
28 #include "unicharset.h"
29 #include "unicodes.h"
30 
31 namespace tesseract {
32 
34  : LTRResultIterator(resit) {
35  in_minor_direction_ = false;
36  at_beginning_of_minor_run_ = false;
37  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
38  MoveToLogicalStartOfTextline();
39 }
40 
42  const LTRResultIterator &resit) {
43  return new ResultIterator(resit);
44 }
45 
47  return current_paragraph_is_ltr_;
48 }
49 
50 bool ResultIterator::CurrentParagraphIsLtr() const {
51  if (!it_->word())
52  return true; // doesn't matter.
53  LTRResultIterator it(*this);
54  it.RestartParagraph();
55  // Try to figure out the ltr-ness of the paragraph. The rules below
56  // make more sense in the context of a difficult paragraph example.
57  // Here we denote {ltr characters, RTL CHARACTERS}:
58  //
59  // "don't go in there!" DAIS EH
60  // EHT OTNI DEPMUJ FELSMIH NEHT DNA
61  // .GNIDLIUB GNINRUB
62  //
63  // On the first line, the left-most word is LTR and the rightmost word
64  // is RTL. Thus, we are better off taking the majority direction for
65  // the whole paragraph contents. So instead of "the leftmost word is LTR"
66  // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
67  // would not do: Typically an RTL paragraph would *not* start with an LTR
68  // word. So our heuristics are as follows:
69  //
70  // (1) If the first text line has an RTL word in the left-most position
71  // it is RTL.
72  // (2) If the first text line has an LTR word in the right-most position
73  // it is LTR.
74  // (3) If neither of the above is true, take the majority count for the
75  // paragraph -- if there are more rtl words, it is RTL. If there
76  // are more LTR words, it's LTR.
77  bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
78  bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
79  int num_ltr, num_rtl;
80  num_rtl = leftmost_rtl ? 1 : 0;
81  num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
82  for (it.Next(RIL_WORD);
83  !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
84  it.Next(RIL_WORD)) {
85  StrongScriptDirection dir = it.WordDirection();
86  rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
87  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
88  num_ltr += rightmost_ltr ? 1 : 0;
89  }
90  if (leftmost_rtl)
91  return false;
92  if (rightmost_ltr)
93  return true;
94  // First line is ambiguous. Take statistics on the whole paragraph.
95  if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) do {
96  StrongScriptDirection dir = it.WordDirection();
97  num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
98  num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
99  } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
100  return num_ltr >= num_rtl;
101 }
102 
// Negative marker values that CalculateTextlineOrder() interleaves with the
// (non-negative) word indices in the reading-order vector it produces.
// Consumers in this file treat any negative entry as a marker, not a word.
// Pushed immediately before the first word of a minor-direction run.
103 const int ResultIterator::kMinorRunStart = -1;
// Pushed immediately after the last word of a minor-direction run.
104 const int ResultIterator::kMinorRunEnd = -2;
// Pushed immediately after a word whose direction is DIR_MIX.
105 const int ResultIterator::kComplexWord = -3;
106 
107 void ResultIterator::CalculateBlobOrder(
108  GenericVector<int> *blob_indices) const {
109  bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
110  blob_indices->clear();
111  if (Empty(RIL_WORD)) return;
112  if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
113  // Easy! just return the blobs in order;
114  for (int i = 0; i < word_length_; i++)
115  blob_indices->push_back(i);
116  return;
117  }
118 
119  // The blobs are in left-to-right order, but the current reading context
120  // is right-to-left.
121  const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
122  const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
123  const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
124  const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
125  const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
126  const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
127  const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
128 
129  // Step 1: Scan for and mark European Number sequences
130  // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
131  GenericVector<int> letter_types;
132  for (int i = 0; i < word_length_; i++) {
133  letter_types.push_back(it_->word()->SymbolDirection(i));
134  }
135  // Convert a single separtor sandwiched between two EN's into an EN.
136  for (int i = 0; i + 2 < word_length_; i++) {
137  if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
138  (letter_types[i + 1] == U_EURO_NUM_SEP ||
139  letter_types[i + 1] == U_COMMON_NUM_SEP)) {
140  letter_types[i + 1] = U_EURO_NUM;
141  }
142  }
143  // Scan for sequences of European Number Terminators around ENs and convert
144  // them to ENs.
145  for (int i = 0; i < word_length_; i++) {
146  if (letter_types[i] == U_EURO_NUM_TERM) {
147  int j = i + 1;
148  while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) { j++; }
149  if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
150  // The sequence [i..j] should be converted to all European Numbers.
151  for (int k = i; k < j; k++) letter_types[k] = U_EURO_NUM;
152  }
153  j = i - 1;
154  while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) { j--; }
155  if (j > -1 && letter_types[j] == U_EURO_NUM) {
156  // The sequence [j..i] should be converted to all European Numbers.
157  for (int k = j; k <= i; k++) letter_types[k] = U_EURO_NUM;
158  }
159  }
160  }
161  // Step 2: Convert all remaining types to either L or R.
162  // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
163  // All other are R.
164  for (int i = 0; i < word_length_;) {
165  int ti = letter_types[i];
166  if (ti == U_LTR || ti == U_EURO_NUM) {
167  // Left to right sequence; scan to the end of it.
168  int last_good = i;
169  for (int j = i + 1; j < word_length_; j++) {
170  int tj = letter_types[j];
171  if (tj == U_LTR || tj == U_EURO_NUM) {
172  last_good = j;
173  } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
174  // do nothing.
175  } else {
176  break;
177  }
178  }
179  // [i..last_good] is the L sequence
180  for (int k = i; k <= last_good; k++) letter_types[k] = U_LTR;
181  i = last_good + 1;
182  } else {
183  letter_types[i] = U_RTL;
184  i++;
185  }
186  }
187 
188  // At this point, letter_types is entirely U_LTR or U_RTL.
189  for (int i = word_length_ - 1; i >= 0;) {
190  if (letter_types[i] == U_RTL) {
191  blob_indices->push_back(i);
192  i--;
193  } else {
194  // left to right sequence. scan to the beginning.
195  int j = i - 1;
196  for (; j >= 0 && letter_types[j] != U_RTL; j--) { } // pass
197  // Now (j, i] is LTR
198  for (int k = j + 1; k <= i; k++) blob_indices->push_back(k);
199  i = j;
200  }
201  }
202  ASSERT_HOST(blob_indices->size() == word_length_);
203 }
204 
205 static void PrintScriptDirs(const GenericVector<StrongScriptDirection> &dirs) {
206  for (int i = 0; i < dirs.size(); i++) {
207  switch (dirs[i]) {
208  case DIR_NEUTRAL: tprintf ("N "); break;
209  case DIR_LEFT_TO_RIGHT: tprintf("L "); break;
210  case DIR_RIGHT_TO_LEFT: tprintf("R "); break;
211  case DIR_MIX: tprintf("Z "); break;
212  default: tprintf("? "); break;
213  }
214  }
215  tprintf("\n");
216 }
217 
219  bool paragraph_is_ltr,
220  const LTRResultIterator &resit,
221  GenericVectorEqEq<int> *word_indices) const {
223  CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
224 }
225 
227  bool paragraph_is_ltr,
228  const LTRResultIterator &resit,
230  GenericVectorEqEq<int> *word_indices) const {
233  directions = (dirs_arg != NULL) ? dirs_arg : &dirs;
234  directions->truncate(0);
235 
236  // A LTRResultIterator goes strictly left-to-right word order.
237  LTRResultIterator ltr_it(resit);
238  ltr_it.RestartRow();
239  if (ltr_it.Empty(RIL_WORD)) return;
240  do {
241  directions->push_back(ltr_it.WordDirection());
242  } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
243 
244  word_indices->truncate(0);
245  CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
246 }
247 
249  bool paragraph_is_ltr,
250  const GenericVector<StrongScriptDirection> &word_dirs,
251  GenericVectorEqEq<int> *reading_order) {
252  reading_order->truncate(0);
253  if (word_dirs.size() == 0) return;
254 
255  // Take all of the runs of minor direction words and insert them
256  // in reverse order.
257  int minor_direction, major_direction, major_step, start, end;
258  if (paragraph_is_ltr) {
259  start = 0;
260  end = word_dirs.size();
261  major_step = 1;
262  major_direction = DIR_LEFT_TO_RIGHT;
263  minor_direction = DIR_RIGHT_TO_LEFT;
264  } else {
265  start = word_dirs.size() - 1;
266  end = -1;
267  major_step = -1;
268  major_direction = DIR_RIGHT_TO_LEFT;
269  minor_direction = DIR_LEFT_TO_RIGHT;
270  // Special rule: if there are neutral words at the right most side
271  // of a line adjacent to a left-to-right word in the middle of the
272  // line, we interpret the end of the line as a single LTR sequence.
273  if (word_dirs[start] == DIR_NEUTRAL) {
274  int neutral_end = start;
275  while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
276  neutral_end--;
277  }
278  if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
279  // LTR followed by neutrals.
280  // Scan for the beginning of the minor left-to-right run.
281  int left = neutral_end;
282  for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
283  if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
284  }
285  reading_order->push_back(kMinorRunStart);
286  for (int i = left; i < word_dirs.size(); i++) {
287  reading_order->push_back(i);
288  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
289  }
290  reading_order->push_back(kMinorRunEnd);
291  start = left - 1;
292  }
293  }
294  }
295  for (int i = start; i != end;) {
296  if (word_dirs[i] == minor_direction) {
297  int j = i;
298  while (j != end && word_dirs[j] != major_direction)
299  j += major_step;
300  if (j == end) j -= major_step;
301  while (j != i && word_dirs[j] != minor_direction)
302  j -= major_step;
303  // [j..i] is a minor direction run.
304  reading_order->push_back(kMinorRunStart);
305  for (int k = j; k != i; k -= major_step) {
306  reading_order->push_back(k);
307  }
308  reading_order->push_back(i);
309  reading_order->push_back(kMinorRunEnd);
310  i = j + major_step;
311  } else {
312  reading_order->push_back(i);
313  if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
314  i += major_step;
315  }
316  }
317 }
318 
319 int ResultIterator::LTRWordIndex() const {
320  int this_word_index = 0;
321  LTRResultIterator textline(*this);
322  textline.RestartRow();
323  while (!textline.PositionedAtSameWord(it_)) {
324  this_word_index++;
325  textline.Next(RIL_WORD);
326  }
327  return this_word_index;
328 }
329 
330 void ResultIterator::MoveToLogicalStartOfWord() {
331  if (word_length_ == 0) {
332  BeginWord(0);
333  return;
334  }
335  GenericVector<int> blob_order;
336  CalculateBlobOrder(&blob_order);
337  if (blob_order.size() == 0 || blob_order[0] == 0) return;
338  BeginWord(blob_order[0]);
339 }
340 
341 bool ResultIterator::IsAtFinalSymbolOfWord() const {
342  if (!it_->word()) return true;
343  GenericVector<int> blob_order;
344  CalculateBlobOrder(&blob_order);
345  return blob_order.size() == 0 || blob_order.back() == blob_index_;
346 }
347 
348 bool ResultIterator::IsAtFirstSymbolOfWord() const {
349  if (!it_->word()) return true;
350  GenericVector<int> blob_order;
351  CalculateBlobOrder(&blob_order);
352  return blob_order.size() == 0 || blob_order[0] == blob_index_;
353 }
354 
355 void ResultIterator::AppendSuffixMarks(STRING *text) const {
356  if (!it_->word()) return;
357  bool reading_direction_is_ltr =
358  current_paragraph_is_ltr_ ^ in_minor_direction_;
359  // scan forward to see what meta-information the word ordering algorithm
360  // left us.
361  // If this word is at the *end* of a minor run, insert the other
362  // direction's mark; else if this was a complex word, insert the
363  // current reading order's mark.
364  GenericVectorEqEq<int> textline_order;
365  CalculateTextlineOrder(current_paragraph_is_ltr_,
366  *this, &textline_order);
367  int this_word_index = LTRWordIndex();
368  int i = textline_order.get_index(this_word_index);
369  if (i < 0) return;
370 
371  int last_non_word_mark = 0;
372  for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
373  last_non_word_mark = textline_order[i];
374  }
375  if (last_non_word_mark == kComplexWord) {
376  *text += reading_direction_is_ltr ? kLRM : kRLM;
377  } else if (last_non_word_mark == kMinorRunEnd) {
378  if (current_paragraph_is_ltr_) {
379  *text += kRLM;
380  *text += kLRM;
381  } else {
382  *text += kRLM;
383  *text += kLRM;
384  }
385  }
386 }
387 
// Moves the iterator to the word (and, within it, the symbol) that comes
// first in reading order on the current text line, updating
// in_minor_direction_ and at_beginning_of_minor_run_ from any markers that
// precede that word in the computed textline order.
// Note: in_minor_direction_ is not reset on entry; it is only changed by
// the markers seen below.
388 void ResultIterator::MoveToLogicalStartOfTextline() {
389  GenericVectorEqEq<int> word_indices;
// Go to the start of the row before computing the reading order.
390  RestartRow();
391  CalculateTextlineOrder(current_paragraph_is_ltr_,
392  dynamic_cast<const LTRResultIterator&>(*this),
393  &word_indices);
394  int i = 0;
// Consume the leading (negative) markers, tracking whether the first word
// lies inside a minor-direction run.
395  for (; i < word_indices.size() && word_indices[i] < 0; i++) {
396  if (word_indices[i] == kMinorRunStart) in_minor_direction_ = true;
397  else if (word_indices[i] == kMinorRunEnd) in_minor_direction_ = false;
398  }
399  if (in_minor_direction_) at_beginning_of_minor_run_ = true;
// No word entry on this line: nothing to move to.
400  if (i >= word_indices.size()) return;
401  int first_word_index = word_indices[i];
// NOTE(review): listing line 403 (the body of this loop) is absent from
// this excerpt, so the loop appears empty here.  Presumably it advances the
// iterator one word per iteration -- confirm against the upstream source.
402  for (int j = 0; j < first_word_index; j++) {
404  }
405  MoveToLogicalStartOfWord();
406 }
407 
410  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
411  in_minor_direction_ = false;
412  at_beginning_of_minor_run_ = false;
413  MoveToLogicalStartOfTextline();
414 }
415 
417  if (it_->block() == NULL) return false; // already at end!
418  switch (level) {
419  case RIL_BLOCK: // explicit fall-through
420  case RIL_PARA: // explicit fall-through
421  case RIL_TEXTLINE:
422  if (!PageIterator::Next(level)) return false;
424  // if we've advanced to a new paragraph,
425  // recalculate current_paragraph_is_ltr_
426  current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
427  }
428  in_minor_direction_ = false;
429  MoveToLogicalStartOfTextline();
430  return it_->block() != NULL;
431  case RIL_SYMBOL:
432  {
433  GenericVector<int> blob_order;
434  CalculateBlobOrder(&blob_order);
435  int next_blob = 0;
436  while (next_blob < blob_order.size() &&
437  blob_index_ != blob_order[next_blob])
438  next_blob++;
439  next_blob++;
440  if (next_blob < blob_order.size()) {
441  // we're in the same word; simply advance one blob.
442  BeginWord(blob_order[next_blob]);
443  at_beginning_of_minor_run_ = false;
444  return true;
445  }
446  level = RIL_WORD; // we've fallen through to the next word.
447  }
448  case RIL_WORD: // explicit fall-through.
449  {
450  if (it_->word() == NULL) return Next(RIL_BLOCK);
451  GenericVectorEqEq<int> word_indices;
452  int this_word_index = LTRWordIndex();
453  CalculateTextlineOrder(current_paragraph_is_ltr_,
454  *this,
455  &word_indices);
456  int final_real_index = word_indices.size() - 1;
457  while (final_real_index > 0 && word_indices[final_real_index] < 0)
458  final_real_index--;
459  for (int i = 0; i < final_real_index; i++) {
460  if (word_indices[i] == this_word_index) {
461  int j = i + 1;
462  for (; j < final_real_index && word_indices[j] < 0; j++) {
463  if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
464  if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
465  }
466  at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
467  // awesome, we move to word_indices[j]
468  if (BidiDebug(3)) {
469  tprintf("Next(RIL_WORD): %d -> %d\n",
470  this_word_index, word_indices[j]);
471  }
473  for (int k = 0; k < word_indices[j]; k++) {
475  }
476  MoveToLogicalStartOfWord();
477  return true;
478  }
479  }
480  if (BidiDebug(3)) {
481  tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
482  }
483  // we're going off the end of the text line.
484  return Next(RIL_TEXTLINE);
485  }
486  }
487  ASSERT_HOST(false); // shouldn't happen.
488  return false;
489 }
490 
492  if (it_->block() == NULL) return false; // Already at the end!
493  if (it_->word() == NULL) return true; // In an image block.
494  if (level == RIL_SYMBOL) return true; // Always at beginning of a symbol.
495 
496  bool at_word_start = IsAtFirstSymbolOfWord();
497  if (level == RIL_WORD) return at_word_start;
498 
499  ResultIterator line_start(*this);
500  // move to the first word in the line...
501  line_start.MoveToLogicalStartOfTextline();
502 
503  bool at_textline_start = at_word_start && *line_start.it_ == *it_;
504  if (level == RIL_TEXTLINE) return at_textline_start;
505 
506  // now we move to the left-most word...
507  line_start.RestartRow();
508  bool at_block_start = at_textline_start &&
509  line_start.it_->block() != line_start.it_->prev_block();
510  if (level == RIL_BLOCK) return at_block_start;
511 
512  bool at_para_start = at_block_start ||
513  (at_textline_start &&
514  line_start.it_->row()->row->para() !=
515  line_start.it_->prev_row()->row->para());
516  if (level == RIL_PARA) return at_para_start;
517 
518  ASSERT_HOST(false); // shouldn't happen.
519  return false;
520 }
521 
528  PageIteratorLevel element) const {
529  if (Empty(element)) return true; // Already at the end!
530  // The result is true if we step forward by element and find we are
531  // at the end of the page or at the beginning of *all* levels in:
532  // [level, element).
533  // When there is more than one level difference between element and level,
534  // we could for instance move forward one symbol and still be at the first
535  // word on a line, so we also have to be at the first symbol in a word.
536  ResultIterator next(*this);
537  next.Next(element);
538  if (next.Empty(element)) return true; // Reached the end of the page.
539  while (element > level) {
540  element = static_cast<PageIteratorLevel>(element - 1);
541  if (!next.IsAtBeginningOf(element))
542  return false;
543  }
544  return true;
545 }
546 
552  if (it_->word() == NULL) return NULL; // Already at the end!
553  STRING text;
554  switch (level) {
555  case RIL_BLOCK:
556  {
557  ResultIterator pp(*this);
558  do {
559  pp.AppendUTF8ParagraphText(&text);
560  } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
561  }
562  break;
563  case RIL_PARA:
564  AppendUTF8ParagraphText(&text);
565  break;
566  case RIL_TEXTLINE:
567  {
568  ResultIterator it(*this);
569  it.MoveToLogicalStartOfTextline();
570  it.IterateAndAppendUTF8TextlineText(&text);
571  }
572  break;
573  case RIL_WORD:
574  AppendUTF8WordText(&text);
575  break;
576  case RIL_SYMBOL:
577  {
578  bool reading_direction_is_ltr =
579  current_paragraph_is_ltr_ ^ in_minor_direction_;
580  if (at_beginning_of_minor_run_) {
581  text += reading_direction_is_ltr ? kLRM : kRLM;
582  }
583  text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
584  if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
585  }
586  break;
587  }
588  int length = text.length() + 1;
589  char* result = new char[length];
590  strncpy(result, text.string(), length);
591  return result;
592 }
593 
// Appends the UTF-8 text of the current word to *text, with its symbols in
// reading order.  A directional mark is emitted first when the word begins
// a minor-direction run, and any required suffix marks are emitted last.
// Does nothing when there is no word at the current position.
594 void ResultIterator::AppendUTF8WordText(STRING *text) const {
595  if (!it_->word()) return;
// NOTE(review): listing line 596 is absent from this excerpt; whatever
// statement it held is not shown here -- check the upstream source.
597  bool reading_direction_is_ltr =
598  current_paragraph_is_ltr_ ^ in_minor_direction_;
// Prefix with the mark of the current reading direction at the start of a
// minor run.
599  if (at_beginning_of_minor_run_) {
600  *text += reading_direction_is_ltr ? kLRM : kRLM;
601  }
602 
603  GenericVector<int> blob_order;
604  CalculateBlobOrder(&blob_order);
// Emit each symbol in reading order; the second BestUTF8 argument is true
// when the reading direction is right-to-left.
605  for (int i = 0; i < blob_order.size(); i++) {
606  *text += it_->word()->BestUTF8(blob_order[i], !reading_direction_is_ltr);
607  }
608  AppendSuffixMarks(text);
609 }
610 
// Appends the text of the words from the current position up to the end of
// the text line to *text, separated by single spaces and terminated by
// line_separator_ (plus paragraph_separator_ when the line ended a
// paragraph or the page).  The iterator is advanced as a side effect: on
// return it is positioned past the words that were appended.
// If there is no word at the current position, the iterator is advanced by
// one word and nothing is appended.
611 void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
612  if (Empty(RIL_WORD)) {
613  Next(RIL_WORD);
614  return;
615  }
// Optional debug dump of the word directions and the computed reading
// order for this line.
616  if (BidiDebug(1)) {
617  GenericVectorEqEq<int> textline_order;
// NOTE(review): listing line 618 is absent from this excerpt; `dirs` used
// below has no visible declaration -- presumably it was declared there.
619  CalculateTextlineOrder(current_paragraph_is_ltr_,
620  *this, &dirs, &textline_order);
621  tprintf("Strong Script dirs [%p/P=%s]: ", it_->row(),
622  current_paragraph_is_ltr_ ? "ltr" : "rtl");
623  PrintScriptDirs(dirs);
624  tprintf("Logical textline order [%p/P=%s]: ", it_->row(),
625  current_paragraph_is_ltr_ ? "ltr" : "rtl");
626  for (int i = 0; i < textline_order.size(); i++) {
627  tprintf("%d ", textline_order[i]);
628  }
629  tprintf("\n");
630  }
631 
632  int words_appended = 0;
// Append word text followed by a space until Next() fails or the iterator
// reaches the beginning of another text line.
633  do {
634  AppendUTF8WordText(text);
635  words_appended++;
636  *text += " ";
637  } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
638  if (BidiDebug(1)) {
639  tprintf("%d words printed\n", words_appended);
640  }
// Drop the trailing space added after the last word.
641  text->truncate_at(text->length() - 1);
642  *text += line_separator_;
643  // If we just finished a paragraph, add an extra newline.
644  if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
645  *text += paragraph_separator_;
646 }
647 
648 void ResultIterator::AppendUTF8ParagraphText(STRING *text) const {
649  ResultIterator it(*this);
650  it.RestartParagraph();
651  it.MoveToLogicalStartOfTextline();
652  if (it.Empty(RIL_WORD)) return;
653  do {
654  it.IterateAndAppendUTF8TextlineText(text);
655  } while (it.it_->block() != NULL && !it.IsAtBeginningOf(RIL_PARA));
656 }
657 
// Returns true when the bidi debug level is at least min_level.
// The level defaults to 1 and is replaced by the value of the "bidi_debug"
// integer parameter when that parameter is found.
658 bool ResultIterator::BidiDebug(int min_level) const {
659  int debug_level = 1;
660  IntParam *p = ParamUtils::FindParam<IntParam>(
661  "bidi_debug", GlobalParams()->int_params,
// NOTE(review): listing line 662 is absent from this excerpt, so the
// FindParam call above is unterminated as shown; its remaining argument(s)
// and closing parenthesis must be restored from the upstream source.
663  if (p != NULL) debug_level = (inT32)(*p);
664  return debug_level >= min_level;
665 }
666 
667 } // namespace tesseract.