Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
permdawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permdawg.c (Formerly permdawg.c)
5  * Description: Scale word choices by a dictionary
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "cutil.h"
30 #include "dawg.h"
31 #include "freelist.h"
32 #include "globals.h"
33 #include "ndminx.h"
34 #include "permute.h"
35 #include "stopper.h"
36 #include "tprintf.h"
37 #include "params.h"
38 
39 #include <ctype.h>
40 #include "dict.h"
41 #include "image.h"
42 
43 /*----------------------------------------------------------------------
44  F u n c t i o n s
45 ----------------------------------------------------------------------*/
46 namespace tesseract {
47 
48 static const float kPermDawgRatingPad = 5.0;
49 
68  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
69  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
70  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
71  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
72  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
73  word_ending = (char_choice_index == more_args->end_char_choice_index);
74  int word_index = word->length() - 1;
75 
76  if (ambigs_mode(*limit)) {
77  if (best_choice->rating() < *limit) return;
78  } else {
79  // Prune bad subwords
80  if (more_args->rating_array[word_index] == NO_RATING) {
81  more_args->rating_array[word_index] = word->rating();
82  } else {
83  float permdawg_limit = more_args->rating_array[word_index] *
84  more_args->rating_margin + kPermDawgRatingPad;
85  if (permdawg_limit < word->rating()) {
87  tprintf("early pruned word rating=%4.2f,"
88  " permdawg_limit=%4.2f, word=%s\n", word->rating(),
89  permdawg_limit, word->debug_string().string());
90  }
91  return;
92  }
93  }
94  }
95  // Deal with hyphens
96  if (word_ending && more_args->sought_word_length == kAnyWordLength &&
97  has_hyphen_end(*word) && !ambigs_mode(*limit)) {
98  // Copy more_args->active_dawgs to clean_active_dawgs removing
99  // dawgs of type DAWG_TYPE_PATTERN.
100  DawgInfoVector clean_active_dawgs;
101  const DawgInfoVector &active_dawgs = *(more_args->active_dawgs);
102  for (int i = 0; i < active_dawgs.size(); ++i) {
103  if (dawgs_[active_dawgs[i].dawg_index]->type() != DAWG_TYPE_PATTERN) {
104  clean_active_dawgs += active_dawgs[i];
105  }
106  }
107  if (clean_active_dawgs.size() > 0) {
109  tprintf("new hyphen choice = %s\n", word->debug_string().string());
110  word->set_permuter(more_args->permuter);
111  adjust_word(word, certainties, &char_choices, permute_debug);
112  set_hyphen_word(*word, *(more_args->active_dawgs),
113  *(more_args->constraints));
114  update_best_choice(*word, best_choice);
115  }
116  } else { // Look up char in DAWG
117  // TODO(daria): update the rest of the code that specifies alternative
118  // letter_is_okay_ functions (e.g. TessCharNgram class) to work with
119  // multi-byte unichars and/or unichar ids.
120 
121  // If the current unichar is an ngram first try calling
122  // letter_is_okay() for each unigram it contains separately.
123  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
124  bool checked_unigrams = false;
125  if (getUnicharset().get_isngram(orig_uch_id)) {
127  tprintf("checking unigrams in an ngram %s\n",
128  getUnicharset().debug_str(orig_uch_id).string());
129  }
130  int orig_num_fragments = word->fragment_length(word_index);
131  int num_unigrams = 0;
132  word->remove_last_unichar_id();
133  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
134  const char *ngram_str_end = ngram_str + strlen(ngram_str);
135  const char *ngram_ptr = ngram_str;
136  bool unigrams_ok = true;
137  // Construct DawgArgs that reflect the current state.
138  DawgInfoVector unigram_active_dawgs = *(more_args->active_dawgs);
139  DawgInfoVector unigram_constraints = *(more_args->constraints);
140  DawgInfoVector unigram_updated_active_dawgs;
141  DawgInfoVector unigram_updated_constraints;
142  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
143  &unigram_constraints,
144  &unigram_updated_active_dawgs,
145  &unigram_updated_constraints, 0.0,
146  more_args->permuter,
147  more_args->sought_word_length,
148  more_args->end_char_choice_index);
149  // Check unigrams in the ngram with letter_is_okay().
150  while (unigrams_ok && ngram_ptr < ngram_str_end) {
151  int step = getUnicharset().step(ngram_ptr);
152  UNICHAR_ID uch_id = (step <= 0) ? INVALID_UNICHAR_ID :
153  getUnicharset().unichar_to_id(ngram_ptr, step);
154  ngram_ptr += step;
155  ++num_unigrams;
156  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
157  unigrams_ok = unigrams_ok && (this->*letter_is_okay_)(
158  &unigram_dawg_args,
159  word->unichar_id(word_index+num_unigrams-1),
160  word_ending && (ngram_ptr == ngram_str_end));
161  (*unigram_dawg_args.active_dawgs) =
162  *(unigram_dawg_args.updated_active_dawgs);
163  (*unigram_dawg_args.constraints) =
164  *(unigram_dawg_args.updated_constraints);
166  tprintf("unigram %s is %s\n",
167  getUnicharset().debug_str(uch_id).string(),
168  unigrams_ok ? "OK" : "not OK");
169  }
170  }
171  // Restore the word and copy the updated dawg state if needed.
172  while (num_unigrams-- > 0) word->remove_last_unichar_id();
174  orig_uch_id, orig_num_fragments, 0.0, 0.0);
175  if (unigrams_ok) {
176  checked_unigrams = true;
177  more_args->permuter = unigram_dawg_args.permuter;
178  *(more_args->updated_active_dawgs) =
179  *(unigram_dawg_args.updated_active_dawgs);
180  *(more_args->updated_constraints) =
181  *(unigram_dawg_args.updated_constraints);
182  }
183  }
184 
185  // Check which dawgs from the dawgs_ vector contain the word
186  // up to and including the current unichar.
187  if (checked_unigrams || (this->*letter_is_okay_)(
188  more_args, word->unichar_id(word_index), word_ending)) {
189  // Add a new word choice
190  if (word_ending) {
192  tprintf("found word = %s\n", word->debug_string().string());
193  }
194  if (ambigs_mode(*limit) &&
195  strcmp(output_ambig_words_file.string(), "") != 0) {
196  if (output_ambig_words_file_ == NULL) {
197  output_ambig_words_file_ =
198  fopen(output_ambig_words_file.string(), "wb+");
199  if (output_ambig_words_file_ == NULL) {
200  tprintf("Failed to open output_ambig_words_file %s\n",
201  output_ambig_words_file.string());
202  exit(1);
203  }
204  }
205  STRING word_str;
206  word->string_and_lengths(&word_str, NULL);
207  word_str += " ";
208  fprintf(output_ambig_words_file_, word_str.string());
209  }
210  WERD_CHOICE *adjusted_word = word;
211  WERD_CHOICE hyphen_tail_word(&getUnicharset());
212  if (hyphen_base_size() > 0) {
213  hyphen_tail_word = *word;
214  remove_hyphen_head(&hyphen_tail_word);
215  adjusted_word = &hyphen_tail_word;
216  }
217  adjusted_word->set_permuter(more_args->permuter);
218  if (!ambigs_mode(*limit)) {
219  adjust_word(adjusted_word, &certainties[hyphen_base_size()],
220  &char_choices, permute_debug);
221  }
222  update_best_choice(*adjusted_word, best_choice);
223  } else { // search the next letter
224  // Make updated_* point to the next entries in the DawgInfoVector
225  // arrays (that were originally created in dawg_permute_and_select)
226  ++(more_args->updated_active_dawgs);
227  ++(more_args->updated_constraints);
228  // Make active_dawgs and constraints point to the updated ones.
229  ++(more_args->active_dawgs);
230  ++(more_args->constraints);
231  permute_choices(debug, char_choices, char_choice_index + 1,
232  prev_char_frag_info, word, certainties, limit,
233  best_choice, attempts_left, more_args);
234  // Restore previous state to explore another letter in this position.
235  --(more_args->updated_active_dawgs);
236  --(more_args->updated_constraints);
237  --(more_args->active_dawgs);
238  --(more_args->constraints);
239  }
240  } else {
242  tprintf("last unichar not OK at index %d in %s\n",
243  word_index, word->debug_string().string());
244  }
245  }
246  }
247 }
248 
264  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
265  int sought_word_length, int start_char_choice_index) {
266  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
267  best_choice->make_bad();
268  best_choice->set_rating(rating_limit);
269  if (char_choices.length() == 0) return best_choice;
270  DawgInfoVector *active_dawgs = new DawgInfoVector[char_choices.length() + 1];
271  DawgInfoVector *constraints = new DawgInfoVector[char_choices.length() + 1];
272  init_active_dawgs(sought_word_length, &(active_dawgs[0]),
273  ambigs_mode(rating_limit));
274  init_constraints(&(constraints[0]));
275  int end_char_choice_index = (sought_word_length == kAnyWordLength) ?
276  char_choices.length()-1 : start_char_choice_index+sought_word_length-1;
277  // Need to skip accumulating word choices if we are only searching a part of
278  // the word (e.g. for the phrase search in non-space delimited languages).
279  // Also need to skip accumulating choices if char_choices are expanded
280  // with ambiguities.
281  bool re_enable_choice_accum = ChoiceAccumEnabled();
282  if (sought_word_length != kAnyWordLength ||
283  ambigs_mode(rating_limit)) DisableChoiceAccum();
284  DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
285  &(active_dawgs[1]), &(constraints[1]),
288  NO_PERM, sought_word_length, end_char_choice_index);
290  copy_hyphen_info(&word);
291  // Discard rating and certainty of the hyphen base (if any).
292  word.set_rating(0.0);
293  word.set_certainty(0.0);
294  if (word.length() + char_choices.length() > MAX_WERD_LENGTH) {
295  delete[] active_dawgs;
296  delete[] constraints;
297  return best_choice; // the word is too long to permute
298  }
299  float certainties[MAX_WERD_LENGTH];
301  int attempts_left = max_permuter_attempts;
303  "permute_dawg_debug" : NULL,
304  char_choices, start_char_choice_index, NULL, &word,
305  certainties, &rating_limit, best_choice, &attempts_left,
306  &dawg_args);
307  delete[] active_dawgs;
308  delete[] constraints;
309  if (re_enable_choice_accum) EnableChoiceAccum();
310  return best_choice;
311 }
312 
313 } // namespace tesseract