Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Results classes used by control.c
4  * Author: Phil Cheatle
5  * Created: Tue Sep 22 08:42:49 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #include "mfcpch.h"
20 #include <stdlib.h>
21 #ifdef __UNIX__
22 #include <assert.h>
23 #endif
24 #include "pageres.h"
25 #include "blobs.h"
26 
27 const char kBlameCorrect[] = "corr";
28 const char kBlameClassifier[] = "cl";
29 const char kBlameChopper[] = "chop";
30 const char kBlameClassLMTradeoff[] = "cl/LM";
31 const char kBlamePageLayout[] = "pglt";
32 const char kBlameSegsearchHeur[] = "ss_heur";
33 const char kBlameSegsearchPP[] = "ss_pp";
34 const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
35 const char kBlameAdaption[] = "adapt";
36 const char kBlameNoTruthSplit[] = "no_tr_spl";
37 const char kBlameNoTruth[] = "no_tr";
38 const char kBlameUnknown[] = "unkn";
39 
40 const char * const kIncorrectResultReasonNames[] = {
53 };
54 
56  return kIncorrectResultReasonNames[irr];
57 }
58 
59 const char *BlamerBundle::IncorrectReason() const {
61 }
62 
64  const WERD_CHOICE *choice,
65  STRING *debug) {
66  (*debug) += "Truth ";
67  for (int i = 0; i < this->truth_text.length(); ++i) {
68  (*debug) += this->truth_text[i];
69  }
70  if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)";
71  if (choice != NULL) {
72  (*debug) += " Choice ";
73  STRING choice_str;
74  choice->string_and_lengths(&choice_str, NULL);
75  (*debug) += choice_str;
76  }
77  if (msg.length() > 0) {
78  (*debug) += "\n";
79  (*debug) += msg;
80  }
81  (*debug) += "\n";
82 }
83 
86 /*************************************************************************
87  * PAGE_RES::PAGE_RES
88  *
89  * Constructor for page results
90  *************************************************************************/
92  BLOCK_LIST *the_block_list,
93  WERD_CHOICE **prev_word_best_choice_ptr) {
94  Init();
95  BLOCK_IT block_it(the_block_list);
96  BLOCK_RES_IT block_res_it(&block_res_list);
97  for (block_it.mark_cycle_pt();
98  !block_it.cycled_list(); block_it.forward()) {
99  block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
100  }
101  prev_word_best_choice = prev_word_best_choice_ptr;
102 }
103 
104 /*************************************************************************
105  * BLOCK_RES::BLOCK_RES
106  *
107  * Constructor for BLOCK results
108  *************************************************************************/
109 
111  ROW_IT row_it (the_block->row_list ());
112  ROW_RES_IT row_res_it(&row_res_list);
113 
114  char_count = 0;
115  rej_count = 0;
116  font_class = -1; //not assigned
117  x_height = -1.0;
119  bold = FALSE;
120  italic = FALSE;
121  row_count = 0;
122 
123  block = the_block;
124 
125  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
126  row_res_it.add_to_end(new ROW_RES(row_it.data()));
127  }
128 }
129 
130 
131 /*************************************************************************
132  * ROW_RES::ROW_RES
133  *
134  * Constructor for ROW results
135  *************************************************************************/
136 
138  WERD_IT word_it(the_row->word_list());
139  WERD_RES_IT word_res_it(&word_res_list);
140  WERD_RES *combo = NULL; // current combination of fuzzies
141  WERD_RES *word_res; // current word
142  WERD *copy_word;
143 
144  char_count = 0;
145  rej_count = 0;
147 
148  row = the_row;
149  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
150  word_res = new WERD_RES(word_it.data());
151  word_res->x_height = the_row->x_height();
152 
153  if (word_res->word->flag(W_FUZZY_NON)) {
154  ASSERT_HOST(combo != NULL);
155  word_res->part_of_combo = TRUE;
156  combo->copy_on(word_res);
157  }
158  if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
159  if (combo == NULL) {
160  copy_word = new WERD;
161  //deep copy
162  *copy_word = *(word_it.data());
163  combo = new WERD_RES(copy_word);
164  combo->x_height = the_row->x_height();
165  combo->combination = TRUE;
166  word_res_it.add_to_end(combo);
167  }
168  word_res->part_of_combo = TRUE;
169  } else {
170  combo = NULL;
171  }
172  word_res_it.add_to_end(word_res);
173  }
174 }
175 
176 
178  this->ELIST_LINK::operator=(source);
179  Clear();
180  if (source.combination) {
181  word = new WERD;
182  *word = *(source.word); // deep copy
183  } else {
184  word = source.word; // pt to same word
185  }
186  if (source.bln_boxes != NULL)
187  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
188  if (source.chopped_word != NULL)
189  chopped_word = new TWERD(*source.chopped_word);
190  if (source.rebuild_word != NULL)
191  rebuild_word = new TWERD(*source.rebuild_word);
192  // TODO(rays) Do we ever need to copy the seam_array?
193  denorm = source.denorm;
194  if (source.box_word != NULL)
195  box_word = new tesseract::BoxWord(*source.box_word);
196  best_state = source.best_state;
197  correct_text = source.correct_text;
198 
199  if (source.best_choice != NULL) {
200  best_choice = new WERD_CHOICE(*source.best_choice);
201  raw_choice = new WERD_CHOICE(*source.raw_choice);
203  }
204  else {
205  best_choice = NULL;
206  raw_choice = NULL;
209  }
210  }
211  for (int i = 0; i < source.alt_choices.length(); ++i) {
212  const WERD_CHOICE *choice = source.alt_choices[i];
213  ASSERT_HOST(choice != NULL);
214  alt_choices.push_back(new WERD_CHOICE(*choice));
215  }
216  alt_states = source.alt_states;
217  if (source.ep_choice != NULL) {
218  ep_choice = new WERD_CHOICE(*source.ep_choice);
219  } else {
220  ep_choice = NULL;
221  }
222  reject_map = source.reject_map;
223  combination = source.combination;
224  part_of_combo = source.part_of_combo;
225  CopySimpleFields(source);
226  if (source.blamer_bundle != NULL) {
227  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
228  }
229  return *this;
230 }
231 
232 // Copies basic fields that don't involve pointers that might be useful
233 // to copy when making one WERD_RES from another.
235  tess_failed = source.tess_failed;
236  tess_accepted = source.tess_accepted;
238  done = source.done;
240  small_caps = source.small_caps;
241  italic = source.italic;
242  bold = source.bold;
243  fontinfo = source.fontinfo;
244  fontinfo2 = source.fontinfo2;
247  x_height = source.x_height;
248  caps_height = source.caps_height;
249  guessed_x_ht = source.guessed_x_ht;
251  reject_spaces = source.reject_spaces;
252  uch_set = source.uch_set;
253  tesseract = source.tesseract;
254 }
255 
256 // Initializes a blank (default constructed) WERD_RES from one that has
257 // already been recognized.
258 // Use SetupFor*Recognition afterwards to complete the setup and make
259 // it ready for a retry recognition.
261  word = source.word;
262  CopySimpleFields(source);
263  if (source.blamer_bundle != NULL) {
264  blamer_bundle = new BlamerBundle();
266  }
267 }
268 
269 // Sets up the members used in recognition:
270 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
271 // Returns false if the word is empty and sets up fake results.
273  tesseract::Tesseract* tess, Pix* pix,
274  bool numeric_mode,
275  bool use_body_size,
276  ROW *row, BLOCK* block) {
277  tesseract = tess;
278  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
279  if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) {
280  // Empty words occur when all the blobs have been moved to the rej_blobs
281  // list, which seems to occur frequently in junk.
282  SetupFake(unicharset_in);
283  word->set_flag(W_REP_CHAR, false);
284  return false;
285  }
286  ClearResults();
287  SetupWordScript(unicharset_in);
289  if (use_body_size && row->body_size() > 0.0f) {
290  chopped_word->SetupBLNormalize(block, row, row->body_size(),
291  numeric_mode, &denorm);
292  } else {
293  chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
294  }
295  // The image will be 8-bit grey if the input was grey or color. Note that in
296  // a grey image 0 is black and 255 is white. If the input was binary, then
297  // the pix will be binary and 0 is white, with 1 being black.
298  // To tell the difference pixGetDepth() will return 8 or 1.
299  denorm.set_pix(pix);
300  // The inverse flag will be true iff the word has been determined to be white
301  // on black, and is independent of whether the pix is 8 bit or 1 bit.
306  best_choice = new WERD_CHOICE(&unicharset_in);
308  raw_choice = new WERD_CHOICE(&unicharset_in);
309  raw_choice->make_bad();
311  return true;
312 }
313 
314 // Sets up the members used in recognition:
315 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
316 // Returns false (after setting up fake results) if the word lies in a non-text (graphic) block.
318  tesseract::Tesseract* tess,
319  const BLOCK* block) {
320  tesseract = tess;
321  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
322  if (pb != NULL && !pb->IsText()) {
323  // Ignore words in graphic regions.
324  SetupFake(unicharset_in);
325  word->set_flag(W_REP_CHAR, false);
326  return false;
327  }
328  ClearResults();
329  SetupWordScript(unicharset_in);
330  TBOX word_box = word->bounding_box();
332  word_box.left(), word_box.bottom(),
333  1.0f, 1.0f, 0.0f, 0.0f);
335  return true;
336 }
337 
338 // Sets up the members used in recognition for an empty recognition result:
339 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
340 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
341  ClearResults();
342  SetupWordScript(unicharset_in);
343  chopped_word = new TWERD;
344  rebuild_word = new TWERD;
347  int blob_count = word->cblob_list()->length();
348  best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
349  TOP_CHOICE_PERM, unicharset_in);
350  raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
351  TOP_CHOICE_PERM, unicharset_in);
352  if (blob_count > 0) {
353  BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
354  // For non-text blocks, just pass any blobs through to the box_word
355  // and call the word failed with a fake classification.
356  C_BLOB_IT b_it(word->cblob_list());
357  int blob_id = 0;
358  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
359  TBOX box = b_it.data()->bounding_box();
360  box_word->InsertBox(box_word->length(), box);
361  fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
362  -1, -1, -1, 0, 0, false);
363  }
364  FakeClassifyWord(blob_count, fake_choices);
365  delete [] fake_choices;
366  }
367  tess_failed = true;
368 }
369 
371  uch_set = &uch;
372  int script = uch.default_sid();
373  word->set_script_id(script);
375  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
376 }
377 
378 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
380  if (blamer_bundle != NULL) {
381  blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale();
382  TPOINT topleft;
383  TPOINT botright;
384  TPOINT norm_topleft;
385  TPOINT norm_botright;
386  for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) {
387  const TBOX &box = blamer_bundle->truth_word.BlobBox(b);
388  topleft.x = box.left();
389  topleft.y = box.top();
390  botright.x = box.right();
391  botright.y = box.bottom();
392  denorm.NormTransform(topleft, &norm_topleft);
393  denorm.NormTransform(botright, &norm_botright);
394  TBOX norm_box(norm_topleft.x, norm_botright.y,
395  norm_botright.x, norm_topleft.y);
397  }
398  }
399 }
400 
// Simple helper that transfers ownership of the pointed-to object from *src
// to *dest. Whatever *dest owned beforehand is deleted, and *src is set to
// NULL afterwards so that only one owner remains.
// Moving a pointer onto itself (dest == src) is a no-op: without this guard
// the object would be deleted and the sole pointer to it nulled out.
template<class T> static void MovePointerData(T** dest, T** src) {
  if (dest == src)
    return;  // Self-move: nothing to transfer, and nothing must be destroyed.
  delete *dest;  // delete on a NULL pointer is harmless.
  *dest = *src;
  *src = NULL;
}
408 
409 // Moves the results fields from word to this. This takes ownership of all
410 // the data, so src can be destructed.
412  denorm = word->denorm;
413  MovePointerData(&chopped_word, &word->chopped_word);
414  MovePointerData(&rebuild_word, &word->rebuild_word);
415  MovePointerData(&box_word, &word->box_word);
416  if (seam_array != NULL)
418  seam_array = word->seam_array;
419  word->seam_array = NULL;
420  best_state.move(&word->best_state);
422  MovePointerData(&best_choice, &word->best_choice);
423  MovePointerData(&raw_choice, &word->raw_choice);
425  alt_choices.move(&word->alt_choices);
426  alt_states.move(&word->alt_states);
427  reject_map = word->reject_map;
428  if (word->blamer_bundle != NULL) {
429  assert(blamer_bundle != NULL);
431  }
432  CopySimpleFields(*word);
433 }
434 
435 // Replace the best choice and rebuild box word.
437  const WERD_CHOICE& choice,
438  const GenericVector<int>& segmentation_state) {
439  delete best_choice;
440  best_choice = new WERD_CHOICE(choice);
441  best_state = segmentation_state;
443  SetupBoxWord();
444  // Make up a fake reject map of the right length to keep the
445  // rejection pass happy.
446  reject_map.initialise(segmentation_state.length());
449 }
450 
451 // Builds the rebuild_word from the chopped_word and the best_state.
453  if (rebuild_word != NULL)
454  delete rebuild_word;
455  rebuild_word = new TWERD;
456  if (seam_array == NULL) {
458  }
459  TBLOB* prev_blob = NULL;
460  int start = 0;
461  for (int i = 0; i < best_state.size(); ++i) {
462  int length = best_state[i];
463  join_pieces(chopped_word->blobs, seam_array, start, start + length - 1);
464  TBLOB* blob = chopped_word->blobs;
465  for (int i = 0; i < start; ++i)
466  blob = blob->next;
467  TBLOB* copy_blob = new TBLOB(*blob);
468  if (prev_blob == NULL)
469  rebuild_word->blobs = copy_blob;
470  else
471  prev_blob->next = copy_blob;
472  prev_blob = copy_blob;
473  break_pieces(blob, seam_array, start, start + length - 1);
474  start += length;
475  }
476 }
477 
478 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
479 // Also sets up the output box_word.
481  if (rebuild_word != NULL)
482  delete rebuild_word;
484  SetupBoxWord();
485  int word_len = box_word->length();
486  best_state.reserve(word_len);
487  correct_text.reserve(word_len);
488  for (int i = 0; i < word_len; ++i) {
491  }
492 }
493 
494 // Sets/replaces the box_word with one made from the rebuild_word.
496  if (box_word != NULL)
497  delete box_word;
501 }
502 
503 // Sets up the script positions in the output boxword using the best_choice
504 // to get the unichars, and the unicharset to get the target positions.
507  best_choice);
508 }
509 
510 void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const {
511  int end = best_choice->length();
512  while (end > 0 &&
513  uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) &&
515  end--;
516  }
517  int start = 0;
518  while (start < end &&
521  start++;
522  }
523  *pstart = start;
524  *pend = end;
525 }
526 
528  const WERD_CHOICE &word, const GenericVector<int> &state,
529  int *pstart, int *pend) const {
530  int len = word.length();
531  *pstart = 0;
532  *pend = len;
533  if (len < 2) return;
534  if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) &&
535  !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return;
536 
537  // ok, now that we know the word ends in digits, do the expensive bit of
538  // figuring out if they're superscript.
539  WERD_RES copy(*this);
540  copy.ReplaceBestChoice(word, state);
541  copy.WithoutFootnoteSpan(pstart, pend);
542 }
543 
544 // Classifies the word with some already-calculated BLOB_CHOICEs.
545 // The choices are an array of blob_count pointers to BLOB_CHOICE,
546 // providing a single classifier result for each blob.
547 // The BLOB_CHOICEs are consumed and the word takes ownership.
548 // The number of blobs in the outword must match blob_count.
549 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
550  // Setup the WERD_RES.
552  ASSERT_HOST(blob_count == box_word->length());
554  BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST;
555  BLOB_CHOICE_LIST_C_IT bc_it(word_choices);
556  for (int c = 0; c < blob_count; ++c) {
558  choices[c]->unichar_id(), 1,
559  choices[c]->rating(), choices[c]->certainty());
560  BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
561  BLOB_CHOICE_IT choice_it(choice_list);
562  choice_it.add_after_then_move(choices[c]);
563  bc_it.add_after_then_move(choice_list);
564  }
565  best_choice->set_blob_choices(word_choices);
566  delete raw_choice;
568  reject_map.initialise(blob_count);
569 }
570 
571 // Copies the best_choice strings to the correct_text for adaption/training.
575  for (int i = 0; i < best_choice->length(); ++i) {
576  UNICHAR_ID choice_id = best_choice->unichar_id(i);
577  const char* blob_choice = uch_set->id_to_unichar(choice_id);
578  correct_text.push_back(STRING(blob_choice));
579  }
580 }
581 
582 // Merges 2 adjacent blobs in the result if the permanent callback
583 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
584 // callback box_cb is NULL or returns true, setting the merged blob
585 // result to the class returned from class_cb.
586 // Returns true if anything was merged.
590 
591  BLOB_CHOICE_LIST_CLIST *blob_choices) {
592  bool modified = false;
593  for (int i = 0; i + 1 < best_choice->length(); ++i) {
594  UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
595  best_choice->unichar_id(i+1));
596  if (new_id != INVALID_UNICHAR_ID &&
597  (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
598  box_word->BlobBox(i + 1)))) {
599  if (reject_map.length() == best_choice->length())
601  best_choice->set_unichar_id(new_id, i);
603  raw_choice->set_unichar_id(new_id, i);
605  modified = true;
606  rebuild_word->MergeBlobs(i, i + 2);
607  box_word->MergeBoxes(i, i + 2);
608  if (i + 1 < best_state.length()) {
609  best_state[i] += best_state[i + 1];
610  best_state.remove(i + 1);
611  }
612 
613  BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
614  for (int j = 0; j < i; ++j)
615  blob_choices_it.forward();
616  BLOB_CHOICE_IT it1(blob_choices_it.data()); // first choices
617  BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1);
618  BLOB_CHOICE_IT it2(target_choices); // second choices
619  float certainty = it2.data()->certainty();
620  float rating = it2.data()->rating();
621  if (it1.data()->certainty() < certainty) {
622  certainty = it1.data()->certainty();
623  rating = it1.data()->rating();
624  target_choices = blob_choices_it.data();
625  blob_choices_it.forward();
626  }
627  delete blob_choices_it.extract(); // get rid of spare
628  // TODO(rays) Fix the choices so they contain the desired result.
629  // Do we really need to ? Only needed for fix_quotes, which should be
630  // going away.
631  }
632  }
633  delete class_cb;
634  delete box_cb;
635  return modified;
636 }
637 
638 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
639 // training data.
640 
// Utility function for fix_quotes.
// signed_str points at a single UTF-8 encoded character that occupies
// length bytes. Returns non-zero if that character is one of:
//   '  (apostrophe) or ` (backtick), each 1 byte, or
//   the 3-byte sequences E2 80 98 / E2 80 99 (curved single quotes).
// Any other length yields 0 without reading the string.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so the values >= 0x80 match on every platform.
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the E2 80 prefix.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return 0;
}
657 
658 // Callback helper for fix_quotes returns a double quote if both
659 // arguments are quote, otherwise INVALID_UNICHAR_ID.
661  const char *ch = uch_set->id_to_unichar(id1);
662  const char *next_ch = uch_set->id_to_unichar(id2);
663  if (is_simple_quote(ch, strlen(ch)) &&
664  is_simple_quote(next_ch, strlen(next_ch)))
665  return uch_set->unichar_to_id("\"");
666  return INVALID_UNICHAR_ID;
667 }
668 
669 // Change pairs of quotes to double quotes.
670 void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) {
671  if (!uch_set->contains_unichar("\"") ||
673  return; // Don't create it if it is disallowed.
674 
677  NULL,
678  blob_choices);
679 }
680 
681 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
682 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
684  const char *ch = uch_set->id_to_unichar(id1);
685  const char *next_ch = uch_set->id_to_unichar(id2);
686  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
687  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
688  return uch_set->unichar_to_id("-");
689  return INVALID_UNICHAR_ID;
690 }
691 
692 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
693 // (assuming both on the same textline, are in order and a chopped em dash.)
694 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
695  return box1.right() >= box2.left();
696 }
697 
698 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
699 // Typically a long dash which has been segmented.
700 void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) {
701  if (!uch_set->contains_unichar("-") ||
703  return; // Don't create it if it is disallowed.
704 
708  blob_choices);
709 }
710 
711 // Callback helper for merge_tess_fails returns a space if both
712 // arguments are space, otherwise INVALID_UNICHAR_ID.
714  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
715  return id1;
716  else
717  return INVALID_UNICHAR_ID;
718 }
719 
720 // Change pairs of tess failures to a single one
725  int len = best_choice->length();
726  ASSERT_HOST(reject_map.length() == len);
727  ASSERT_HOST(box_word->length() == len);
728  }
729 }
730 
731 // Returns true if the collection of count pieces, starting at start, are all
732 // natural connected components, ie there are no real chops involved.
733 bool WERD_RES::PiecesAllNatural(int start, int count) const {
734  // all seams must have no splits.
735  for (int index = start; index < start + count - 1; ++index) {
736  if (index >= 0 && index < array_count(seam_array)) {
737  SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index));
738  if (seam != NULL && seam->split1 != NULL)
739  return false;
740  }
741  }
742  return true;
743 }
744 
745 
747  Clear();
748 }
749 
751  tess_failed = FALSE;
754  done = FALSE;
756  small_caps = false;
757  italic = FALSE;
758  bold = FALSE;
759  // The fontinfos and tesseract count as non-pointers as they point to
760  // data owned elsewhere.
761  fontinfo = NULL;
762  fontinfo2 = NULL;
763  tesseract = NULL;
764  fontinfo_id_count = 0;
765  fontinfo_id2_count = 0;
766  x_height = 0.0;
767  caps_height = 0.0;
768  guessed_x_ht = TRUE;
770  combination = FALSE;
773 }
774 
776  word = NULL;
777  bln_boxes = NULL;
778  uch_set = NULL;
779  chopped_word = NULL;
780  rebuild_word = NULL;
781  box_word = NULL;
782  seam_array = NULL;
783  best_choice = NULL;
784  raw_choice = NULL;
785  ep_choice = NULL;
787 }
788 
790  if (word != NULL && combination) {
791  delete word;
792  }
793  word = NULL;
794  delete blamer_bundle;
796  ClearResults();
797 }
798 
800  done = false;
801  fontinfo = NULL;
802  fontinfo2 = NULL;
803  fontinfo_id_count = 0;
804  fontinfo_id2_count = 0;
805  if (bln_boxes != NULL) {
806  delete bln_boxes;
807  bln_boxes = NULL;
808  }
809  if (chopped_word != NULL) {
810  delete chopped_word;
811  chopped_word = NULL;
812  }
813  if (rebuild_word != NULL) {
814  delete rebuild_word;
815  rebuild_word = NULL;
816  }
817  if (box_word != NULL) {
818  delete box_word;
819  box_word = NULL;
820  }
821  best_state.clear();
823  if (seam_array != NULL) {
825  seam_array = NULL;
826  }
827  if (best_choice != NULL) {
828  delete best_choice;
829  delete raw_choice;
830  best_choice = NULL;
831  raw_choice = NULL;
832  }
833  if (!alt_choices.empty()) {
835  alt_choices.clear();
836  }
837  alt_states.clear();
838  if (ep_choice != NULL) {
839  delete ep_choice;
840  ep_choice = NULL;
841  }
843 }
844 
845 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
846  return word_res == other.word_res &&
847  row_res == other.row_res &&
848  block_res == other.block_res;
849 }
850 
851 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
852  ASSERT_HOST(page_res == other.page_res);
853  if (other.block_res == NULL) {
854  // other points to the end of the page.
855  if (block_res == NULL)
856  return 0;
857  return -1;
858  }
859  if (block_res == NULL) {
860  return 1; // we point to the end of the page.
861  }
862  if (block_res == other.block_res) {
863  if (other.row_res == NULL || row_res == NULL) {
864  // this should only happen if we hit an image block.
865  return 0;
866  }
867  if (row_res == other.row_res) {
868  // we point to the same block and row.
869  ASSERT_HOST(other.word_res != NULL && word_res != NULL);
870  if (word_res == other.word_res) {
871  // we point to the same word!
872  return 0;
873  }
874 
875  WERD_RES_IT word_res_it(&row_res->word_res_list);
876  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
877  word_res_it.forward()) {
878  if (word_res_it.data() == word_res) {
879  return -1;
880  } else if (word_res_it.data() == other.word_res) {
881  return 1;
882  }
883  }
884  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
885  }
886 
887  // we both point to the same block, but different rows.
888  ROW_RES_IT row_res_it(&block_res->row_res_list);
889  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
890  row_res_it.forward()) {
891  if (row_res_it.data() == row_res) {
892  return -1;
893  } else if (row_res_it.data() == other.row_res) {
894  return 1;
895  }
896  }
897  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
898  }
899 
900  // We point to different blocks.
901  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
902  for (block_res_it.mark_cycle_pt();
903  !block_res_it.cycled_list(); block_res_it.forward()) {
904  if (block_res_it.data() == block_res) {
905  return -1;
906  } else if (block_res_it.data() == other.block_res) {
907  return 1;
908  }
909  }
910  // Shouldn't happen...
911  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
912  return 0;
913 }
914 
915 // Inserts the new_word and a corresponding WERD_RES before the current
916 // position. The simple fields of the WERD_RES are copied from clone_res and
917 // the resulting WERD_RES is returned for further setup with best_choice etc.
919  WERD* new_word) {
920  // Insert new_word into the ROW.
921  WERD_IT w_it(row()->row->word_list());
922  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
923  WERD* word = w_it.data();
924  if (word == word_res->word)
925  break;
926  }
927  ASSERT_HOST(!w_it.cycled_list());
928  w_it.add_before_then_move(new_word);
929  // Make a WERD_RES for the new_word.
930  WERD_RES* new_res = new WERD_RES(new_word);
931  new_res->CopySimpleFields(clone_res);
932  // Insert into the appropriate place in the ROW_RES.
933  WERD_RES_IT wr_it(&row()->word_res_list);
934  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
935  WERD_RES* word = wr_it.data();
936  if (word == word_res)
937  break;
938  }
939  ASSERT_HOST(!wr_it.cycled_list());
940  wr_it.add_before_then_move(new_res);
941  if (wr_it.at_first()) {
942  // This is the new first word, so reset the member iterator so it
943  // detects the cycled_list state correctly.
944  ResetWordIterator();
945  }
946  return new_res;
947 }
948 
949 // Deletes the current WERD_RES and its underlying WERD.
951  // Check that this word is as we expect. part_of_combos are NEVER iterated
952  // by the normal iterator, so we should never be trying to delete them.
953  ASSERT_HOST(!word_res->part_of_combo);
954  if (!word_res->combination) {
955  // Combinations own their own word, so we won't find the word on the
956  // row's word_list, but it is legitimate to try to delete them.
957  // Delete word from the ROW when not a combination.
958  WERD_IT w_it(row()->row->word_list());
959  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
960  if (w_it.data() == word_res->word) {
961  break;
962  }
963  }
964  ASSERT_HOST(!w_it.cycled_list());
965  delete w_it.extract();
966  }
967  // Remove the WERD_RES for the new_word.
968  // Remove the WORD_RES from the ROW_RES.
969  WERD_RES_IT wr_it(&row()->word_res_list);
970  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
971  if (wr_it.data() == word_res) {
972  word_res = NULL;
973  break;
974  }
975  }
976  ASSERT_HOST(!wr_it.cycled_list());
977  delete wr_it.extract();
978  ResetWordIterator();
979 }
980 
981 /*************************************************************************
982  * PAGE_RES_IT::restart_page
983  *
984  * Set things up at the start of the page
985  *************************************************************************/
986 
988  block_res_it.set_to_list(&page_res->block_res_list);
989  block_res_it.mark_cycle_pt();
990  prev_block_res = NULL;
991  prev_row_res = NULL;
992  prev_word_res = NULL;
993  block_res = NULL;
994  row_res = NULL;
995  word_res = NULL;
996  next_block_res = NULL;
997  next_row_res = NULL;
998  next_word_res = NULL;
999  internal_forward(true, empty_ok);
1000  return internal_forward(false, empty_ok);
1001 }
1002 
1003 // Recovers from operations on the current word, such as in InsertCloneWord
1004 // and DeleteCurrentWord.
1005 // Resets the word_res_it so that it is one past the next_word_res, as
1006 // it should be after internal_forward. If next_row_res != row_res,
1007 // then the next_word_res is in the next row, so there is no need to do
1008 // anything, since operations on the current word will not have disturbed
1009 // the word_res_it.
1010 void PAGE_RES_IT::ResetWordIterator() {
1011  if (row_res == next_row_res) {
1012  // Reset the member iterator so it can move forward and detect the
1013  // cycled_list state correctly.
1014  word_res_it.move_to_first();
1015  word_res_it.mark_cycle_pt();
1016  while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
1017  word_res_it.forward();
1018  ASSERT_HOST(!word_res_it.cycled_list());
1019  word_res_it.forward();
1020  }
1021 }
1022 
1023 /*************************************************************************
1024  * PAGE_RES_IT::internal_forward
1025  *
1026  * Find the next word on the page. If empty_ok is true, then non-text blocks
1027  * and text blocks with no text are visited as if they contain a single
1028  * imaginary word in a single imaginary row. (word() and row() both return NULL
1029  * in such a block and the return value is NULL.)
1030  * If empty_ok is false, the old behaviour is maintained. Each real word
1031  * is visited and empty and non-text blocks and rows are skipped.
1032  * new_block is used to initialize the iterators for a new block.
1033  * The iterator maintains pointers to block, row and word for the previous,
1034  * current and next words. These are correct, regardless of block/row
1035  * boundaries. NULL values denote start and end of the page.
1036  *************************************************************************/
1037 
1038 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1039  bool new_row = false;
1040 
1041  prev_block_res = block_res;
1042  prev_row_res = row_res;
1043  prev_word_res = word_res;
1044  block_res = next_block_res;
1045  row_res = next_row_res;
1046  word_res = next_word_res;
1047  next_block_res = NULL;
1048  next_row_res = NULL;
1049  next_word_res = NULL;
1050 
1051  while (!block_res_it.cycled_list()) {
1052  if (new_block) {
1053  new_block = false;
1054  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1055  row_res_it.mark_cycle_pt();
1056  if (row_res_it.empty() && empty_ok) {
1057  next_block_res = block_res_it.data();
1058  break;
1059  }
1060  new_row = true;
1061  }
1062  while (!row_res_it.cycled_list()) {
1063  if (new_row) {
1064  new_row = false;
1065  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1066  word_res_it.mark_cycle_pt();
1067  }
1068  // Skip any part_of_combo words.
1069  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1070  word_res_it.forward();
1071  if (!word_res_it.cycled_list()) {
1072  next_block_res = block_res_it.data();
1073  next_row_res = row_res_it.data();
1074  next_word_res = word_res_it.data();
1075  word_res_it.forward();
1076  goto foundword;
1077  }
1078  // end of row reached
1079  row_res_it.forward();
1080  new_row = true;
1081  }
1082  // end of block reached
1083  block_res_it.forward();
1084  new_block = true;
1085  }
1086  foundword:
1087  // Update prev_word_best_choice pointer.
1090  (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
1091  }
1092  return word_res;
1093 }
1094 
1095 /*************************************************************************
1096  * PAGE_RES_IT::restart_row()
1097  *
1098  * Move to the beginning (leftmost word) of the current row.
1099  *************************************************************************/
1101  ROW_RES *row = this->row();
1102  if (!row) return NULL;
1103  for (restart_page(); this->row() != row; forward()) {
1104  // pass
1105  }
1106  return word();
1107 }
1108 
1109 /*************************************************************************
1110  * PAGE_RES_IT::forward_paragraph
1111  *
1112  * Move to the beginning of the next paragraph, allowing empty blocks.
1113  *************************************************************************/
1114 
1116  while (block_res == next_block_res &&
1117  (next_row_res != NULL && next_row_res->row != NULL &&
1118  row_res->row->para() == next_row_res->row->para())) {
1119  internal_forward(false, true);
1120  }
1121  return internal_forward(false, true);
1122 }
1123 
1124 /*************************************************************************
1125  * PAGE_RES_IT::forward_block
1126  *
1127  * Move to the beginning of the next block, allowing empty blocks.
1128  *************************************************************************/
1129 
1131  while (block_res == next_block_res) {
1132  internal_forward(false, true);
1133  }
1134  return internal_forward(false, true);
1135 }
1136 
1138  inT16 chars_in_word;
1139  inT16 rejects_in_word = 0;
1140 
1141  chars_in_word = word_res->reject_map.length ();
1142  page_res->char_count += chars_in_word;
1143  block_res->char_count += chars_in_word;
1144  row_res->char_count += chars_in_word;
1145 
1146  rejects_in_word = word_res->reject_map.reject_count ();
1147 
1148  page_res->rej_count += rejects_in_word;
1149  block_res->rej_count += rejects_in_word;
1150  row_res->rej_count += rejects_in_word;
1151  if (chars_in_word == rejects_in_word)
1152  row_res->whole_word_rej_count += rejects_in_word;
1153 }