Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ratngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.cpp (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 13:23:29 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "mfcpch.h"
21 #include "ratngs.h"
22 
23 #include "callcpp.h"
24 #include "genericvector.h"
25 #include "unicharset.h"
26 
27 ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE);
28 
29 const float WERD_CHOICE::kBadRating = 100000.0;
30 
// Display names for the permuter types, private to this translation unit.
// kPermuterTypeNames is indexed by the numeric permuter value; the index of
// each entry is noted beside it.
namespace {

const char kPermuterTypeNoPerm[] = "None";
const char kPermuterTypePuncPerm[] = "Punctuation";
const char kPermuterTypeTopPerm[] = "Top Choice";
const char kPermuterTypeLowerPerm[] = "Top Lower Case";
const char kPermuterTypeUpperPerm[] = "Top Upper Case";
const char kPermuterTypeNgramPerm[] = "Ngram";
const char kPermuterTypeNumberPerm[] = "Number";
const char kPermuterTypeUserPatPerm[] = "User Pattern";
const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
const char kPermuterTypeCompoundPerm[] = "Compound";

const char* const kPermuterTypeNames[] = {
  kPermuterTypeNoPerm,        // 0
  kPermuterTypePuncPerm,      // 1
  kPermuterTypeTopPerm,       // 2
  kPermuterTypeLowerPerm,     // 3
  kPermuterTypeUpperPerm,     // 4
  kPermuterTypeNgramPerm,     // 5
  kPermuterTypeNumberPerm,    // 6
  kPermuterTypeUserPatPerm,   // 7
  kPermuterTypeSysDawgPerm,   // 8
  kPermuterTypeDocDawgPerm,   // 9
  kPermuterTypeUserDawgPerm,  // 10
  kPermuterTypeFreqDawgPerm,  // 11
  kPermuterTypeCompoundPerm   // 12
};

}  // namespace
60 
66 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
67  float src_rating, // rating
68  float src_cert, // certainty
69  inT16 src_fontinfo_id, // font
70  inT16 src_fontinfo_id2, // 2nd choice font
71  int src_script_id, // script
72  inT16 min_xheight, // min xheight allowed
73  inT16 max_xheight, // max xheight by this char
74  bool adapted // adapted match or not
75  ) {
76  unichar_id_ = src_unichar_id;
77  rating_ = src_rating;
78  certainty_ = src_cert;
79  fontinfo_id_ = src_fontinfo_id;
80  fontinfo_id2_ = src_fontinfo_id2;
81  script_id_ = src_script_id;
82  language_model_state_ = NULL;
83  min_xheight_ = min_xheight;
84  max_xheight_ = max_xheight;
85  adapted_ = adapted;
86 }
87 
94  unichar_id_ = other.unichar_id();
95  rating_ = other.rating();
96  certainty_ = other.certainty();
97  fontinfo_id_ = other.fontinfo_id();
98  fontinfo_id2_ = other.fontinfo_id2();
99  script_id_ = other.script_id();
100  language_model_state_ = NULL;
101  min_xheight_ = other.min_xheight_;
102  max_xheight_ = other.max_xheight_;
103  adapted_ = other.adapted_;
104 }
105 
112 WERD_CHOICE::WERD_CHOICE(const char *src_string,
113  const UNICHARSET &unicharset)
114  : unicharset_(&unicharset){
115  STRING src_lengths;
116  const char *ptr = src_string;
117  const char *end = src_string + strlen(src_string);
118  int step = unicharset.step(ptr);
119  for (; ptr < end && step > 0;
120  step = unicharset.step(ptr), src_lengths += step, ptr += step);
121  if (step != 0 && ptr == end) {
122  this->init(src_string, src_lengths.string(),
123  0.0, 0.0, NO_PERM);
124  } else { // there must have been an invalid unichar in the string
125  this->init(8);
126  this->make_bad();
127  }
128 }
129 
140 void WERD_CHOICE::init(const char *src_string,
141  const char *src_lengths,
142  float src_rating,
143  float src_certainty,
144  uinT8 src_permuter) {
145  int src_string_len = strlen(src_string);
146  if (src_string_len == 0) {
147  this->init(8);
148  } else {
149  this->init(src_lengths ? strlen(src_lengths): src_string_len);
150  length_ = reserved_;
151  int offset = 0;
152  for (int i = 0; i < length_; ++i) {
153  int unichar_length = src_lengths ? src_lengths[i] : 1;
154  unichar_ids_[i] =
155  unicharset_->unichar_to_id(src_string+offset, unichar_length);
156  fragment_lengths_[i] = 1;
157  offset += unichar_length;
158  }
159  }
160  rating_ = src_rating;
161  certainty_ = src_certainty;
162  permuter_ = src_permuter;
163 }
164 
169  delete[] unichar_ids_;
170  delete[] fragment_lengths_;
171  delete_blob_choices();
172 }
173 
174 const char *WERD_CHOICE::permuter_name() const {
175  return kPermuterTypeNames[permuter_];
176 }
177 
184 void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) {
185  if (blob_choices_ != blob_choices) {
186  delete_blob_choices();
187  blob_choices_ = blob_choices;
188  }
189 }
190 
191 
198  for (int i = 0; i < length_; ++i) {
199  if (unichar_ids_[i] == unichar_id) {
200  return true;
201  }
202  }
203  return false;
204 }
205 
213 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
214  ASSERT_HOST(start >= 0 && start + num <= length_);
215  for (int i = start; i+num < length_; ++i) {
216  unichar_ids_[i] = unichar_ids_[i+num];
217  fragment_lengths_[i] = fragment_lengths_[i+num];
218  }
219  length_ -= num;
220 }
221 
228  for (int i = 0; i < length_/2; ++i) {
229  UNICHAR_ID tmp_id = unichar_ids_[i];
230  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
231  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
232  }
233  if (length_ % 2 != 0) {
234  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
235  }
236 }
237 
245 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
246  *start = 0;
247  *end = length() - 1;
248  while (*start < length() &&
249  unicharset()->get_ispunctuation(unichar_id(*start))) {
250  (*start)++;
251  }
252  while (*end > -1 &&
253  unicharset()->get_ispunctuation(unichar_id(*end))) {
254  (*end)--;
255  }
256  (*end)++;
257 }
258 
259 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
260  ASSERT_HOST(start >= 0 && start <= length_);
261  ASSERT_HOST(end >= 0 && end <= length_);
262  if (end < start) { end = start; }
263  WERD_CHOICE retval(unicharset_, end - start);
264  for (int i = start; i < end; i++) {
266  unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f);
267  }
268  return retval;
269 }
270 
277  int i;
278  for (i = 0; i < length_; ++i) {
279  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
280  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
282  return true;
283  }
284  }
285  return false;
286 }
287 
295  STRING *word_lengths_str) const {
296  *word_str = "";
297  if (word_lengths_str != NULL) *word_lengths_str = "";
298  for (int i = 0; i < length_; ++i) {
299  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
300  *word_str += ch;
301  if (word_lengths_str != NULL) {
302  *word_lengths_str += strlen(ch);
303  }
304  }
305 }
306 
314  UNICHAR_ID unichar_id, char fragment_length,
315  float rating, float certainty) {
316  if (length_ == reserved_) {
317  this->double_the_size();
318  }
319  this->append_unichar_id_space_allocated(unichar_id, fragment_length,
320  rating, certainty);
321 }
322 
331  // TODO(daria): find out why the choice was cleared this way if any
332  // of the pieces are empty. Add the description of this behavior
333  // to the comments.
334  // if (word_string.length () == 0 || second.word_string.length () == 0) {
335  // word_string = NULL; //make it empty
336  // word_lengths = NULL;
337  // delete_blob_choices();
338  // } else {
339  ASSERT_HOST(unicharset_ == second.unicharset_);
340  while (reserved_ < length_ + second.length()) {
341  this->double_the_size();
342  }
343  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
344  const char *other_fragment_lengths = second.fragment_lengths();
345  for (int i = 0; i < second.length(); ++i) {
346  unichar_ids_[length_ + i] = other_unichar_ids[i];
347  fragment_lengths_[length_ + i] = other_fragment_lengths[i];
348  }
349  length_ += second.length();
350  rating_ += second.rating(); // add ratings
351  if (second.certainty() < certainty_) // take min
352  certainty_ = second.certainty();
353  if (permuter_ == NO_PERM) {
354  permuter_ = second.permuter();
355  } else if (second.permuter() != NO_PERM &&
356  second.permuter() != permuter_) {
357  permuter_ = COMPOUND_PERM;
358  }
359 
360  // Append a deep copy of second blob_choices if it exists.
361  if (second.blob_choices_ != NULL) {
362  if (this->blob_choices_ == NULL)
363  this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST;
364 
365  BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
366  BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
367 
368  this_blob_choices_it.set_to_list(this->blob_choices_);
369  this_blob_choices_it.move_to_last();
370 
371  second_blob_choices_it.set_to_list(second.blob_choices_);
372 
373  for (second_blob_choices_it.mark_cycle_pt();
374  !second_blob_choices_it.cycled_list();
375  second_blob_choices_it.forward()) {
376 
377  BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
378  blob_choices_copy->deep_copy(second_blob_choices_it.data(),
380 
381  this_blob_choices_it.add_after_then_move(blob_choices_copy);
382  }
383  }
384  return *this;
385 }
386 
387 
395  while (reserved_ < source.length()) {
396  this->double_the_size();
397  }
398 
399  unicharset_ = source.unicharset_;
400  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
401  const char *other_fragment_lengths = source.fragment_lengths();
402  for (int i = 0; i < source.length(); ++i) {
403  unichar_ids_[i] = other_unichar_ids[i];
404  fragment_lengths_[i] = other_fragment_lengths[i];
405  }
406  length_ = source.length();
407  rating_ = source.rating();
408  certainty_ = source.certainty();
409  permuter_ = source.permuter();
410  fragment_mark_ = source.fragment_mark();
411 
412  // Delete existing blob_choices
413  this->delete_blob_choices();
414 
415  // Deep copy blob_choices of source
416  if (source.blob_choices_ != NULL) {
417  BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
418  BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
419 
420  this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST();
421 
422  this_blob_choices_it.set_to_list(this->blob_choices_);
423  source_blob_choices_it.set_to_list(source.blob_choices_);
424 
425  for (source_blob_choices_it.mark_cycle_pt();
426  !source_blob_choices_it.cycled_list();
427  source_blob_choices_it.forward()) {
428 
429  BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
430  blob_choices_copy->deep_copy(source_blob_choices_it.data(),
432 
433  this_blob_choices_it.add_after_then_move(blob_choices_copy);
434  }
435  }
436  return *this;
437 }
438 
439 /**********************************************************************
440  * WERD_CHOICE::delete_blob_choices
441  *
442  * Clear the blob_choices list, delete it and set it to NULL.
443  **********************************************************************/
444 void WERD_CHOICE::delete_blob_choices() {
445  if (blob_choices_ != NULL) {
446  blob_choices_->deep_clear();
447  delete blob_choices_;
448  blob_choices_ = NULL;
449  }
450 }
451 
457 const void WERD_CHOICE::print(const char *msg) const {
458  tprintf("%s WERD_CHOICE:\n", msg);
459  tprintf("length_ %d reserved_ %d permuter_ %d\n",
460  length_, reserved_, permuter_);
461  tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_);
462  if (fragment_mark_) {
463  tprintf(" fragment_mark_ true");
464  }
465  tprintf("\n");
466  if (unichar_string_.length() > 0) {
467  tprintf("unichar_string_ %s unichar_lengths_ %s\n",
468  unichar_string_.string(), unichar_lengths_.string());
469  }
470  tprintf("unichar_ids: ");
471  int i;
472  for (i = 0; i < length_; ++i) {
473  tprintf("%d ", unichar_ids_[i]);
474  }
475  tprintf("\nfragment_lengths_: ");
476  for (i = 0; i < length_; ++i) {
477  tprintf("%d ", fragment_lengths_[i]);
478  }
479  tprintf("\n");
480  fflush(stdout);
481 }
482 
484  const WERD_CHOICE &word2) {
485  const UNICHARSET *uchset = word1.unicharset();
486  if (word2.unicharset() != uchset) return false;
487  int w1start, w1end;
488  word1.punct_stripped(&w1start, &w1end);
489  int w2start, w2end;
490  word2.punct_stripped(&w2start, &w2end);
491  if (w1end - w1start != w2end - w2start) return false;
492  for (int i = 0; i < w1end - w1start; i++) {
493  if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
494  uchset->to_lower(word2.unichar_id(w2start + i))) {
495  return false;
496  }
497  }
498  return true;
499 }
500 
511 void print_ratings_list(const char *msg,
512  BLOB_CHOICE_LIST *ratings,
513  const UNICHARSET &current_unicharset) {
514  if (ratings->length() == 0) {
515  tprintf("%s:<none>\n", msg);
516  return;
517  }
518  if (*msg != '\0') {
519  tprintf("%s\n", msg);
520  }
521  BLOB_CHOICE_IT c_it;
522  c_it.set_to_list(ratings);
523  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
524  c_it.data()->print(&current_unicharset);
525  if (!c_it.at_last()) tprintf("\n");
526  }
527  tprintf("\n");
528  fflush(stdout);
529 }
530 
536 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) {
537  if (ratings->length() == 0) {
538  tprintf("%s:<none>\n", msg);
539  return;
540  }
541  if (*msg != '\0') {
542  tprintf("%s\n", msg);
543  }
544  BLOB_CHOICE_IT c_it;
545  c_it.set_to_list(ratings);
546  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
547  c_it.data()->print(NULL);
548  if (!c_it.at_last()) tprintf("\n");
549  }
550  tprintf("\n");
551  fflush(stdout);
552 }
553 
564 void print_ratings_info(FILE *fp,
565  BLOB_CHOICE_LIST *ratings,
566  const UNICHARSET &current_unicharset) {
567  inT32 index; // to list
568  const char* first_char = NULL; // character
569  FLOAT32 first_rat; // rating
570  FLOAT32 first_cert; // certainty
571  const char* sec_char = NULL; // character
572  FLOAT32 sec_rat = 0.0f; // rating
573  FLOAT32 sec_cert = 0.0f; // certainty
574  BLOB_CHOICE_IT c_it = ratings; // iterator
575 
576  index = ratings->length();
577  if (index > 0) {
578  first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
579  first_rat = c_it.data()->rating();
580  first_cert = -c_it.data()->certainty();
581  if (index > 1) {
582  sec_char = current_unicharset.id_to_unichar(
583  c_it.data_relative(1)->unichar_id());
584  sec_rat = c_it.data_relative(1)->rating();
585  sec_cert = -c_it.data_relative(1)->certainty();
586  } else {
587  sec_char = NULL;
588  sec_rat = -1;
589  sec_cert = -1;
590  }
591  } else {
592  first_char = NULL;
593  first_rat = -1;
594  first_cert = -1;
595  }
596  if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
597  first_char = NULL;
598  if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
599  sec_char = NULL;
600  tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
601  ratings->length(),
602  first_char != NULL ? first_char : "~",
603  first_rat, first_cert, sec_char != NULL ? sec_char : "~",
604  sec_rat, sec_cert);
605 }
606 
610 void print_char_choices_list(const char *msg,
611  const BLOB_CHOICE_LIST_VECTOR &char_choices,
612  const UNICHARSET &current_unicharset,
613  BOOL8 detailed) {
614  if (*msg != '\0') tprintf("%s\n", msg);
615  for (int x = 0; x < char_choices.length(); ++x) {
616  BLOB_CHOICE_IT c_it;
617  c_it.set_to_list(char_choices.get(x));
618  tprintf("\nchar[%d]: %s\n", x,
619  current_unicharset.debug_str( c_it.data()->unichar_id()).string());
620  if (detailed)
621  print_ratings_list("", char_choices.get(x), current_unicharset);
622  }
623 }
624 
629  WERD_CHOICE *word,
630  GenericVector<WERD_CHOICE *> *alternates) {
631  if (!word || !alternates) return;
632 
633  STRING alternates_str;
634  for (int i = 0; i < alternates->size(); i++) {
635  if (i > 0) alternates_str += "\", \"";
636  alternates_str += alternates->get(i)->unichar_string();
637  }
638  tprintf("Alternates for \"%s\": {\"%s\"}\n",
639  word->unichar_string().string(), alternates_str.string());
640 }