Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
dict.cpp
Go to the documentation of this file.
1 
2 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <stdio.h>
20 
21 #include "dict.h"
22 #include "unicodes.h"
23 
24 #ifdef _MSC_VER
25 #pragma warning(disable:4244) // Conversion warnings
26 #endif
27 #include "tprintf.h"
28 
29 namespace tesseract {
30 
31 class Image;
32 
// Dict constructor: binds the default letter_is_okay_ /
// probability_in_context_ callbacks, stores the Image pointer, and registers
// every dictionary-related config parameter (via the *_MEMBER /
// *_INIT_MEMBER macros) with this language's parameter list; the trailing
// body resets all non-parameter runtime state to empty/NULL.
// NOTE(review): this listing is a Doxygen extraction; several original lines
// (e.g. 143, 146, 154, 159, 162, 165, 168, 200) are missing, so some
// *_MEMBER(...) openers are absent below — confirm against upstream 3.02.
33 Dict::Dict(Image* image_ptr)
34  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
35  probability_in_context_(&tesseract::Dict::def_probability_in_context),
36  image_ptr_(image_ptr),
37  STRING_INIT_MEMBER(user_words_suffix, "",
38  "A list of user-provided words.",
39  getImage()->getCCUtil()->params()),
40  STRING_INIT_MEMBER(user_patterns_suffix, "",
41  "A list of user-provided patterns.",
42  getImage()->getCCUtil()->params()),
43  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
44  getImage()->getCCUtil()->params()),
45  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
46  getImage()->getCCUtil()->params()),
47  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
48  getImage()->getCCUtil()->params()),
49  BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
50  " patterns.", getImage()->getCCUtil()->params()),
51  BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
52  " patterns.", getImage()->getCCUtil()->params()),
53  BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
54  " (e.g. for non-space delimited languages)",
55  getImage()->getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
57  "bigrams.", getImage()->getCCUtil()->params()),
58  double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
59  "Score multiplier for word matches which have good case and"
60  "are frequent in the given language (lower is better).",
61  getImage()->getCCUtil()->params()),
62  double_MEMBER(segment_penalty_dict_case_ok, 1.1,
63  "Score multiplier for word matches that have good case "
64  "(lower is better).", getImage()->getCCUtil()->params()),
65  double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
66  "Default score multiplier for word matches, which may have "
67  "case issues (lower is better).",
68  getImage()->getCCUtil()->params()),
69  double_MEMBER(segment_penalty_ngram_best_choice, 1.24,
70  "Multipler to for the best choice from the ngram model.",
71  getImage()->getCCUtil()->params()),
72  double_MEMBER(segment_penalty_dict_nonword, 1.25,
73  "Score multiplier for glyph fragment segmentations which "
74  "do not match a dictionary word (lower is better).",
75  getImage()->getCCUtil()->params()),
76  double_MEMBER(segment_penalty_garbage, 1.50,
77  "Score multiplier for poorly cased strings that are not in"
78  " the dictionary and generally look like garbage (lower is"
79  " better).", getImage()->getCCUtil()->params()),
80  STRING_MEMBER(output_ambig_words_file, "",
81  "Output file for ambiguities found in the dictionary",
82  getImage()->getCCUtil()->params()),
83  INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
84  ", to 2 for more details, to 3 to see all the debug messages",
85  getImage()->getCCUtil()->params()),
86  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
87  getImage()->getCCUtil()->params()),
88  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
89  getImage()->getCCUtil()->params()),
90  BOOL_MEMBER(use_only_first_uft8_step, false,
91  "Use only the first UTF8 step of the given string"
92  " when computing log probabilities.",
93  getImage()->getCCUtil()->params()),
94  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
95  getImage()->getCCUtil()->params()),
96  double_MEMBER(stopper_nondict_certainty_base, -2.50,
97  "Certainty threshold for non-dict words",
98  getImage()->getCCUtil()->params()),
99  double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
100  "Reject certainty offset",
101  getImage()->getCCUtil()->params()),
102  INT_MEMBER(stopper_smallword_size, 2,
103  "Size of dict word to be treated as non-dict word",
104  getImage()->getCCUtil()->params()),
105  double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
106  " for each dict char above small word size.",
107  getImage()->getCCUtil()->params()),
108  double_MEMBER(stopper_allowable_character_badness, 3.0,
109  "Max certaintly variation allowed in a word (in sigma)",
110  getImage()->getCCUtil()->params()),
111  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
112  getImage()->getCCUtil()->params()),
113  BOOL_MEMBER(stopper_no_acceptable_choices, false,
114  "Make AcceptableChoice() always return false. Useful"
115  " when there is a need to explore all segmentations",
116  getImage()->getCCUtil()->params()),
117  double_MEMBER(stopper_ambiguity_threshold_gain, 8.0,
118  "Gain factor for ambiguity threshold.",
119  getImage()->getCCUtil()->params()),
120  double_MEMBER(stopper_ambiguity_threshold_offset, 1.5,
121  "Certainty offset for ambiguity threshold.",
122  getImage()->getCCUtil()->params()),
123  BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
124  getImage()->getCCUtil()->params()),
125  INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
126  "Max words to keep in list",
127  getImage()->getCCUtil()->params()),
128  STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
129  " information should be printed to stdout",
130  getImage()->getCCUtil()->params()),
131  STRING_MEMBER(word_to_debug_lengths, "",
132  "Lengths of unichars in word_to_debug",
133  getImage()->getCCUtil()->params()),
134  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
135  getImage()->getCCUtil()->params()),
136  INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
137  getImage()->getCCUtil()->params()),
138  BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
139  getImage()->getCCUtil()->params()),
140  double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of"
141  " current best rate to prune other hypotheses",
142  getImage()->getCCUtil()->params()),
144  "Turn on word script consistency permuter",
145  getImage()->getCCUtil()->params()),
147  "incorporate segmentation cost in word rating?",
148  getImage()->getCCUtil()->params()),
149  BOOL_MEMBER(segment_nonalphabetic_script, false,
150  "Don't use any alphabetic-specific tricks."
151  "Set to true in the traineddata config file for"
152  " scripts that are cursive or inherently fixed-pitch",
153  getImage()->getCCUtil()->params()),
155  "Score multipler for script consistency within a word. "
156  "Being a 'reward' factor, it should be <= 1. "
157  "Smaller value implies bigger reward.",
158  getImage()->getCCUtil()->params()),
160  "Turn on fixed-length phrasebook search permuter",
161  getImage()->getCCUtil()->params()),
163  "Turn on character type (property) consistency permuter",
164  getImage()->getCCUtil()->params()),
166  "Score multipler for char type consistency within a word. ",
167  getImage()->getCCUtil()->params()),
169  "Score multipler for ngram permuter's best choice"
170  " (only used in the Han script path).",
171  getImage()->getCCUtil()->params()),
172  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
173  getImage()->getCCUtil()->params()),
174  BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
175  getImage()->getCCUtil()->params()),
176  double_MEMBER(doc_dict_pending_threshold, 0.0,
177  "Worst certainty for using pending dictionary",
178  getImage()->getCCUtil()->params()),
179  double_MEMBER(doc_dict_certainty_threshold, -2.25,
180  "Worst certainty for words that can be inserted into the"
181  "document dictionary", getImage()->getCCUtil()->params()),
182  BOOL_MEMBER(ngram_permuter_activated, false,
183  "Activate character-level n-gram-based permuter",
184  getImage()->getCCUtil()->params()),
185  INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
186  " character choices to consider during permutation."
187  " This limit is especially useful when user patterns"
188  " are specified, since overly generic patterns can result in"
189  " dawg search exploring an overly large number of options.",
190  getImage()->getCCUtil()->params()),
191  BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
192  getImage()->getCCUtil()->params()) {
// Non-parameter members start out empty/NULL here; the dawgs themselves are
// created later in Dict::Load().
193  dang_ambigs_table_ = NULL;
194  replace_ambigs_table_ = NULL;
195  keep_word_choices_ = false;
196  reject_offset_ = 0.0;
197  best_raw_choice_ = NULL;
198  best_choices_ = NIL_LIST;
199  raw_choices_ = NIL_LIST;
201  hyphen_word_ = NULL;
202  last_word_on_line_ = false;
203  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
204  document_words_ = NULL;
205  pending_words_ = NULL;
206  bigram_dawg_ = NULL;
207  freq_dawg_ = NULL;
208  punc_dawg_ = NULL;
209  max_fixed_length_dawgs_wdlen_ = -1;
210  wordseg_rating_adjust_factor_ = -1.0f;
211  output_ambig_words_file_ = NULL;
212 }
213 
// Destructor body: frees the pending hyphen word and closes the optional
// ambiguities output file opened during stopper debugging.
// NOTE(review): the signature line ("Dict::~Dict() {", orig. line 214) is
// missing from this extraction — confirm against upstream 3.02.
215  if (hyphen_word_ != NULL) delete hyphen_word_;
216  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
217 }
218 
// Dict::Load(): (re)loads all dictionaries for the current language —
// punctuation/system/number/bigram/frequent/unambiguous dawgs from the
// tessdata archive, optional user word and pattern tries, the document-words
// trie, the temporary pending-words trie, and optional fixed-length dawgs —
// then builds successors_ (for each dawg i, the indices of dawgs that may
// follow it per kDawgSuccessors and matching language).
// NOTE(review): this Doxygen extraction is missing several original lines
// (227-229, 231, 237, 243, 255, 260, 274-275, 278, 288, 290, 301, 307, 311)
// — e.g. the tessdata_manager initializer and trailing dawg_debug_level
// arguments of several constructor calls — confirm against upstream 3.02
// before compiling.
219 void Dict::Load() {
220  STRING name;
221  STRING &lang = getImage()->getCCUtil()->lang;
222 
223  if (dawgs_.length() != 0) this->End();
224 
225  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
226 
229 
230  TessdataManager &tessdata_manager =
232 
233  // Load dawgs_.
234  if (load_punc_dawg && tessdata_manager.SeekToStart(TESSDATA_PUNC_DAWG)) {
235  punc_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
236  DAWG_TYPE_PUNCTUATION, lang, PUNC_PERM,
238  dawgs_ += punc_dawg_;
239  }
240  if (load_system_dawg && tessdata_manager.SeekToStart(TESSDATA_SYSTEM_DAWG)) {
241  dawgs_ += new SquishedDawg(tessdata_manager.GetDataFilePtr(),
242  DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
244  }
245  if (load_number_dawg && tessdata_manager.SeekToStart(TESSDATA_NUMBER_DAWG)) {
246  dawgs_ +=
247  new SquishedDawg(tessdata_manager.GetDataFilePtr(),
248  DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
249  }
250  if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
251  bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
252  DAWG_TYPE_WORD, // doesn't actually matter.
253  lang,
254  COMPOUND_PERM, // doesn't actually matter.
256  }
257  if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
258  freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
259  DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
261  dawgs_ += freq_dawg_;
262  }
263  if (load_unambig_dawg &&
264  tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
265  unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
266  DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
268  dawgs_ += unambig_dawg_;
269  }
270 
// User-supplied word list: loaded into a Trie and added to the dawg set;
// failure to read the file is fatal (exit(1)).
271  if (((STRING &)user_words_suffix).length() > 0) {
272  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
273  kMaxUserDawgEdges, getUnicharset().size(),
276  name += user_words_suffix;
277  if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
279  tprintf("Error: failed to load %s\n", name.string());
280  exit(1);
281  }
282  dawgs_ += trie_ptr;
283  }
284 
285  if (((STRING &)user_patterns_suffix).length() > 0) {
286  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
287  kMaxUserDawgEdges, getUnicharset().size(),
289  trie_ptr->initialize_patterns(&(getUnicharset()));
291  name += user_patterns_suffix;
292  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
293  tprintf("Error: failed to load %s\n", name.string());
294  exit(1);
295  }
296  dawgs_ += trie_ptr;
297  }
298 
299  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
300  kMaxDocDawgEdges, getUnicharset().size(),
302  dawgs_ += document_words_;
303 
304  // This dawg is temporary and should not be searched by letter_is_ok.
305  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
306  kMaxDocDawgEdges, getUnicharset().size(),
308 
309  // Load fixed length dawgs if necessary (used for phrase search
310  // for non-space delimited languages).
312  tessdata_manager.SeekToStart(TESSDATA_FIXED_LENGTH_DAWGS)) {
313  ReadFixedLengthDawgs(DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
314  dawg_debug_level, tessdata_manager.GetDataFilePtr(),
315  &dawgs_, &max_fixed_length_dawgs_wdlen_);
316  }
317 
318  // Construct a list of corresponding successors for each dawg. Each entry i
319  // in the successors_ vector is a vector of integers that represent the
320  // indices into the dawgs_ vector of the successors for dawg i.
321  successors_.reserve(dawgs_.length());
322  for (int i = 0; i < dawgs_.length(); ++i) {
323  const Dawg *dawg = dawgs_[i];
324  SuccessorList *lst = new SuccessorList();
325  for (int j = 0; j < dawgs_.length(); ++j) {
326  const Dawg *other = dawgs_[j];
327  if (dawg != NULL && other != NULL &&
328  (dawg->lang() == other->lang()) &&
329  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
330  }
331  successors_ += lst;
332  }
333 }
334 
335 void Dict::End() {
336  if (dawgs_.length() == 0)
337  return; // Not safe to call twice.
338  dawgs_.delete_data_pointers();
339  successors_.delete_data_pointers();
340  dawgs_.clear();
341  delete bigram_dawg_;
342  successors_.clear();
343  document_words_ = NULL;
344  max_fixed_length_dawgs_wdlen_ = -1;
345  if (pending_words_ != NULL) {
346  delete pending_words_;
347  pending_words_ = NULL;
348  }
349 }
350 
351 // Create unicharset adaptations of known, short lists of UTF-8 equivalent
352 // characters (think all hyphen-like symbols). The first version of the
353 // list is taken as equivalent for matching against the dictionary.
354 void Dict::LoadEquivalenceList(const char *unichar_strings[]) {
355  equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
356  const UNICHARSET &unicharset = getUnicharset();
357  GenericVectorEqEq<UNICHAR_ID> *equiv_list = &equivalent_symbols_.back();
358  for (int i = 0; unichar_strings[i] != 0; i++) {
359  UNICHAR_ID unichar_id = unicharset.unichar_to_id(unichar_strings[i]);
360  if (unichar_id != INVALID_UNICHAR_ID) {
361  equiv_list->push_back(unichar_id);
362  }
363  }
364 }
365 
366 // Normalize all hyphen and apostrophes to the canonicalized one for
367 // matching; pass everything else through as is.
// Returns the first (canonical) member of the equivalence class containing
// unichar_id, or unichar_id itself when it belongs to no class.
// NOTE(review): the function signature line (orig. line 368) is missing from
// this extraction — confirm the exact name/signature against upstream 3.02.
369  for (int i = 0; i < equivalent_symbols_.size(); i++) {
370  if (equivalent_symbols_[i].contains(unichar_id)) {
371  return equivalent_symbols_[i][0];
372  }
373  }
374  return unichar_id;
375 }
376 
377 // Returns true if in light of the current state unichar_id is allowed
378 // according to at least one of the dawgs in the dawgs_ vector.
379 // See more extensive comments in dict.h where this function is declared.
// Decides whether unichar_id can legally extend the partial word described
// by dawg_args: walks every currently-active dawg looking for a matching
// edge and, when the letter is not found directly, probes that dawg's
// successor dawgs (applying any recorded punctuation constraints). The
// surviving dawg states are written to dawg_args->updated_active_dawgs /
// updated_constraints. Returns the resulting PermuterType (also stored in
// dawg_args->permuter); NO_PERM means the letter is rejected everywhere.
380 int Dict::def_letter_is_okay(void* void_dawg_args,
381  UNICHAR_ID unichar_id,
382  bool word_end) const {
383  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
384 
385  if (dawg_debug_level >= 3) {
386  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
387  " num active dawgs=%d num constraints=%d\n",
388  getUnicharset().debug_str(unichar_id).string(), word_end,
389  dawg_args->active_dawgs->length(),
390  dawg_args->constraints->length());
391  }
392 
393  // Do not accept words that contain kPatternUnicharID.
394  // (otherwise pattern dawgs would not function correctly).
395  // Do not accept words containing INVALID_UNICHAR_IDs.
396  if (unichar_id == Dawg::kPatternUnicharID ||
397  unichar_id == INVALID_UNICHAR_ID) {
398  dawg_args->permuter = NO_PERM;
399  return NO_PERM;
400  }
401 
402  // Initialization.
403  PermuterType curr_perm = NO_PERM;
404  dawg_args->updated_active_dawgs->clear();
405  const DawgInfoVector &constraints = *(dawg_args->constraints);
406  *dawg_args->updated_constraints = constraints;
407 
408  // Go over the active_dawgs vector and insert DawgInfo records with the
409  // updated ref (an edge with the corresponding unichar id) into
410  // dawg_args->updated_active_dawgs.
411  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
412  const DawgInfo &info = (*dawg_args->active_dawgs)[a];
413  const Dawg *dawg = dawgs_[info.dawg_index];
414  // dawg_unichar_id will contain the literal unichar_id to be found in the
415  // dawgs (e.g. didgit pattern if unichar_id is a digit and dawg contains
416  // number patterns, word pattern if dawg is a puncutation dawg and we
417  // reached an end of beginning puntuation pattern, etc).
418  UNICHAR_ID dawg_unichar_id = unichar_id;
419 
420  // If we are dealing with the pattern dawg, look up all the
421  // possible edges, not only for the exact unichar_id, but also
422  // for all its character classes (alpha, digit, etc).
423  if (dawg->type() == DAWG_TYPE_PATTERN) {
424  ProcessPatternEdges(dawg, info, dawg_unichar_id, word_end,
425  dawg_args, &curr_perm);
426  // There can't be any successors to dawg that is of type
427  // DAWG_TYPE_PATTERN, so we are done examining this DawgInfo.
428  continue;
429  }
430 
431  // The number dawg generalizes all digits to be kPatternUnicharID,
432  // so try to match kPatternUnicharID if the current unichar is a digit.
433  if (dawg->type() == DAWG_TYPE_NUMBER &&
434  getUnicharset().get_isdigit(dawg_unichar_id)) {
435  dawg_unichar_id = Dawg::kPatternUnicharID;
436  }
437 
438  // Find the edge out of the node for the dawg_unichar_id.
439  NODE_REF node = GetStartingNode(dawg, info.ref);
440  EDGE_REF edge = (node != NO_EDGE) ?
441  dawg->edge_char_of(node, dawg_unichar_id, word_end) : NO_EDGE;
442 
443  if (dawg_debug_level >= 3) {
444  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
445  info.dawg_index, node, edge);
446  }
447 
448  if (edge != NO_EDGE) { // the unichar was found in the current dawg
449  if (ConstraintsOk(*(dawg_args->updated_constraints),
450  word_end, dawg->type())) {
451  if (dawg_debug_level >=3) {
452  tprintf("Letter found in dawg %d\n", info.dawg_index);
453  }
454  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
455  dawg_args->updated_active_dawgs->add_unique(
456  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
457  "Append current dawg to updated active dawgs: ");
458  }
459  } else if (dawg_args->sought_word_length == kAnyWordLength) {
460  // The unichar was not found in the current dawg.
461  // Explore the successor dawgs (but only if we are not
462  // just searching one dawg with a fixed word length).
463 
464  // Handle leading/trailing punctuation dawgs that denote a word pattern
465  // as an edge with kPatternUnicharID. If such an edge is found we add a
466  // constraint denoting the state of the dawg before the word pattern.
467  // This constraint will be applied later when this dawg is found among
468  // successor dawgs as well potentially at the end of the word.
469  if (dawg->type() == DAWG_TYPE_PUNCTUATION) {
470  edge = dawg->edge_char_of(node, Dawg::kPatternUnicharID, word_end);
471  if (edge != NO_EDGE) {
472  dawg_args->updated_constraints->add_unique(
473  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
474  "Recording constraint: ");
475  } else {
476  // Do not explore successors of this dawg, since this
477  // must be invalid leading or trailing punctuation.
478  if (dawg_debug_level >= 3) {
479  tprintf("Invalid punctuation from dawg %d\n", info.dawg_index);
480  }
481  continue;
482  }
483  }
484 
485  if (info.ref == NO_EDGE) {
486  if (dawg_debug_level >= 3) {
487  tprintf("No letters matched in dawg %d\n", info.dawg_index);
488  }
489  continue;
490  }
491 
492  // Discard the dawg if the pattern can not end at previous letter.
493  if (edge == NO_EDGE && // previous part is not leading punctuation
494  !dawg->end_of_word(info.ref)) {
495  if (dawg_debug_level >= 3) {
496  tprintf("No valid pattern end in dawg %d\n", info.dawg_index);
497  }
498  continue;
499  }
500 
501  // Look for the unichar in each of this dawg's successors
502  // and append those in which it is found to active_dawgs.
503  const SuccessorList &slist = *(successors_[info.dawg_index]);
504  for (int s = 0; s < slist.length(); ++s) {
505  int sdawg_index = slist[s];
506  const Dawg *sdawg = dawgs_[sdawg_index];
507  NODE_REF snode = 0;
508  // Apply constraints to the successor dawg.
509  for (int c = 0; c < constraints.length(); ++c) {
510  // If the successor dawg is described in the constraints change
511  // the start ref from 0 to the one recorded as the constraint.
512  const DawgInfo &cinfo = constraints[c];
513  if (cinfo.dawg_index == sdawg_index) {
514  snode = sdawg->next_node(cinfo.ref);
515  // Make sure we do not search the successor dawg if after
516  // applying the saved constraint we are at the end of the word.
517  if (snode == 0) snode = NO_EDGE;
518  if (dawg_debug_level >= 3) {
519  tprintf("Applying constraint [%d, " REFFORMAT "]\n",
520  sdawg_index, snode);
521  }
522  }
523  }
524  // Look for the letter in this successor dawg.
525  EDGE_REF sedge = sdawg->edge_char_of(snode, unichar_id, word_end);
526  // If we found the letter append sdawg to the active_dawgs list.
527  if (sedge != NO_EDGE &&
528  ConstraintsOk(*(dawg_args->updated_constraints), word_end,
529  dawgs_[sdawg_index]->type())) {
530  if (dawg_debug_level >= 3) {
531  tprintf("Letter found in the successor dawg %d\n", sdawg_index);
532  }
533  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
534  if (sdawg->next_node(sedge) != 0) { // if not word end
535  dawg_args->updated_active_dawgs->add_unique(
536  DawgInfo(sdawg_index, sedge), dawg_debug_level > 0,
537  "Append successor to updated active dawgs: ");
538  }
539  }
540  } // end successors loop
541  } // end if/else
542  } // end for
543  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
544  // or if we found the current letter in a non-punctuation dawg. This
545  // allows preserving information on which dawg the "core" word came from.
546  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
547  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
548  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
549  dawg_args->permuter = curr_perm;
550  }
551  return dawg_args->permuter;
552 }
553 
554 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
555  UNICHAR_ID unichar_id, bool word_end,
556  DawgArgs *dawg_args,
557  PermuterType *curr_perm) const {
558  NODE_REF node = GetStartingNode(dawg, info.ref);
559  // Try to find the edge corresponding to the exact unichar_id and to all the
560  // edges corresponding to the character class of unichar_id.
561  GenericVector<UNICHAR_ID> unichar_id_patterns;
562  unichar_id_patterns.push_back(unichar_id);
563  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
564  &unichar_id_patterns);
565  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
566  // On the first iteration check all the outgoing edges.
567  // On the second iteration check all self-loops.
568  for (int k = 0; k < 2; ++k) {
569  EDGE_REF edge = (k == 0) ?
570  dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
571  : dawg->pattern_loop_edge(info.ref, unichar_id_patterns[i], word_end);
572  if (edge != NO_EDGE) {
573  if (dawg_debug_level >= 3) {
574  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
575  info.dawg_index, node, edge);
576  }
577  if (ConstraintsOk(*(dawg_args->updated_constraints),
578  word_end, dawg->type())) {
579  if (dawg_debug_level >=3) {
580  tprintf("Letter found in pattern dawg %d\n", info.dawg_index);
581  }
582  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
583  dawg_args->updated_active_dawgs->add_unique(
584  DawgInfo(info.dawg_index, edge), dawg_debug_level > 0,
585  "Append current dawg to updated active dawgs: ");
586  }
587  }
588  }
589  }
590 }
591 
// Reads a set of fixed-length dawgs from 'file' into dawg_vec, placing each
// dawg at the index equal to its word length (missing lengths stay NULL),
// reports the largest word length via *max_wdlen, and re-appends whatever
// dawgs were previously in dawg_vec after the fixed-length slots.
// NOTE(review): the opening signature line (orig. line 592,
// "void Dict::ReadFixedLengthDawgs(...)") is missing from this extraction;
// the parameter list below starts mid-declaration.
// NOTE(review): the fread() return values are unchecked — a truncated file
// leaves num_dawgs/word_length uninitialized; byte-order is inferred from
// num_dawgs > MAX_WERD_LENGTH rather than an explicit marker.
593  PermuterType perm, int debug_level,
594  FILE *file, DawgVector *dawg_vec,
595  int *max_wdlen) {
596  int i;
597  DawgVector dawg_vec_copy;
598  dawg_vec_copy.move(dawg_vec); // save the input dawg_vec.
599  inT32 num_dawgs;
600  fread(&num_dawgs, sizeof(inT32), 1, file);
601  bool swap = (num_dawgs > MAX_WERD_LENGTH);
602  if (swap) num_dawgs = reverse32(num_dawgs);
603  inT32 word_length;
604  int max_word_length = 0;
605  // Read and record pointers to fixed-length dawgs such that:
606  // dawg_vec[word_length] = pointer to dawg with word length of word_length,
607  // NULL if such fixed-length dawg does not exist.
608  for (i = 0; i < num_dawgs; ++i) {
609  fread(&word_length, sizeof(inT32), 1, file);
610  if (swap) word_length = reverse32(word_length);
611  ASSERT_HOST(word_length > 0 && word_length <= MAX_WERD_LENGTH);
612  while (word_length >= dawg_vec->size()) dawg_vec->push_back(NULL);
613  (*dawg_vec)[word_length] =
614  new SquishedDawg(file, type, lang, perm, debug_level);
615  if (word_length > max_word_length) max_word_length = word_length;
616  }
617  *max_wdlen = max_word_length;
618  // Entries dawg_vec[0] to dawg_vec[max_word_length] now hold pointers
619  // to fixed-length dawgs. The rest of the vector will contain the dawg
620  // pointers from the original input dawg_vec.
621  for (i = 0; i < dawg_vec_copy.size(); ++i) {
622  dawg_vec->push_back(dawg_vec_copy[i]);
623  }
624 }
625 
// Writes num_dawgs followed by each non-NULL fixed-length dawg in dawg_vec
// (slots 1..size-1) as a (word length, squished dawg) pair — the inverse of
// ReadFixedLengthDawgs above.
// NOTE(review): the opening signature line (orig. line 626) is missing from
// this extraction; the parameter list below starts mid-declaration.
// NOTE(review): fwrite() return values are unchecked.
627  const GenericVector<SquishedDawg *> &dawg_vec,
628  int num_dawgs, int debug_level, FILE *output_file) {
629  fwrite(&num_dawgs, sizeof(inT32), 1, output_file);
630  if (debug_level) tprintf("Writing %d split length dawgs\n", num_dawgs);
631  for (int i = 1; i < dawg_vec.size(); ++i) {
632  if ((dawg_vec)[i] != NULL) {
633  fwrite(&i, sizeof(inT32), 1, output_file);
634  dawg_vec[i]->write_squished_dawg(output_file);
635  if (debug_level) tprintf("Wrote Dawg with word length %d\n", i);
636  }
637  }
638 }
639 
640 // Fill the given active_dawgs vector with dawgs that could contain the
641 // beginning of the word. If hyphenated() returns true, copy the entries
642 // from hyphen_active_dawgs_ instead.
643 void Dict::init_active_dawgs(int sought_word_length,
644  DawgInfoVector *active_dawgs,
645  bool ambigs_mode) const {
646  int i;
647  if (sought_word_length != kAnyWordLength) {
648  // Only search one fixed word length dawg.
649  if (sought_word_length <= max_fixed_length_dawgs_wdlen_ &&
650  dawgs_[sought_word_length] != NULL) {
651  *active_dawgs += DawgInfo(sought_word_length, NO_EDGE);
652  }
653  } else if (hyphenated()) {
654  *active_dawgs = hyphen_active_dawgs_;
655  if (dawg_debug_level >= 3) {
656  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
657  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
658  hyphen_active_dawgs_[i].dawg_index,
659  hyphen_active_dawgs_[i].ref);
660  }
661  }
662  } else {
663  for (i = 0; i < dawgs_.length(); ++i) {
664  if (dawgs_[i] != NULL && kBeginningDawgsType[(dawgs_[i])->type()] &&
665  !(ambigs_mode && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
666  *active_dawgs += DawgInfo(i, NO_EDGE);
667  if (dawg_debug_level >= 3) {
668  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
669  }
670  }
671  }
672  }
673 }
674 
675 // If hyphenated() returns true, copy the entries from hyphen_constraints_
676 // into the given constraints vector.
677 void Dict::init_constraints(DawgInfoVector *constraints) const {
678  if (hyphenated()) {
679  *constraints = hyphen_constraints_;
680  if (dawg_debug_level >= 3) {
681  for (int i = 0; i < hyphen_constraints_.size(); ++i) {
682  tprintf("Adding hyphen constraint [%d, " REFFORMAT "]\n",
683  hyphen_constraints_[i].dawg_index,
684  hyphen_constraints_[i].ref);
685  }
686  }
687  }
688 }
689 
690 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
691  // Do not add hyphenated word parts to the document dawg.
692  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
693  // called when the first part of the hyphenated word is
694  // discovered and while the second part of the word is recognized.
695  // hyphen_word_ is cleared in cc_recg() before the next word on
696  // the line is recognized.
697  if (hyphen_word_) return;
698 
699  char filename[CHARS_PER_LINE];
700  FILE *doc_word_file;
701  int stringlen = best_choice.length();
702 
703  if (!doc_dict_enable || valid_word(best_choice) ||
704  CurrentWordAmbig() || stringlen < 2)
705  return;
706 
707  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
708  if (best_choice.length() >= kDocDictMaxRepChars) {
709  int num_rep_chars = 1;
710  UNICHAR_ID uch_id = best_choice.unichar_id(0);
711  for (int i = 1; i < best_choice.length(); ++i) {
712  if (best_choice.unichar_id(i) != uch_id) {
713  num_rep_chars = 1;
714  uch_id = best_choice.unichar_id(i);
715  } else {
716  ++num_rep_chars;
717  if (num_rep_chars == kDocDictMaxRepChars) return;
718  }
719  }
720  }
721 
722  if (best_choice.certainty() < doc_dict_certainty_threshold ||
723  stringlen == 2) {
724  if (best_choice.certainty() < doc_dict_pending_threshold)
725  return;
726 
727  if (!pending_words_->word_in_dawg(best_choice)) {
728  if (stringlen > 2 ||
729  (stringlen == 2 &&
730  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
731  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
732  pending_words_->add_word_to_dawg(best_choice);
733  }
734  return;
735  }
736  }
737 
738  if (save_doc_words) {
739  strcpy(filename, getImage()->getCCUtil()->imagefile.string());
740  strcat(filename, ".doc");
741  doc_word_file = open_file (filename, "a");
742  fprintf(doc_word_file, "%s\n",
743  best_choice.debug_string().string());
744  fclose(doc_word_file);
745  }
746  document_words_->add_word_to_dawg(best_choice);
747 }
748 
// Applies case/punctuation/frequency-based score multipliers to the word's
// rating and logs the adjusted choice: non-dictionary words get the nonword
// or garbage penalty, dictionary words get the frequent-word, case-ok or
// case-bad multiplier. kRatingPad is added before and subtracted after the
// multiplication. Han-script words bypass the case/punctuation checks.
// NOTE(review): the opening signature line (orig. line 749,
// "void Dict::adjust_word(WERD_CHOICE *word," per the call pattern below) is
// missing from this extraction; the parameter list starts mid-declaration.
750  float *certainty_array,
751  const BLOB_CHOICE_LIST_VECTOR *char_choices,
752  bool nonword,
753  float additional_adjust,
754  bool debug) {
755  bool is_han = (char_choices != NULL &&
757  get_top_word_script(*char_choices, getUnicharset()) ==
758  getUnicharset().han_sid());
759  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
760  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
761 
762  float adjust_factor = additional_adjust;
763  float new_rating = word->rating();
764  if (debug) {
765  tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
766  word->debug_string().string(), word->rating());
767  }
768  new_rating += kRatingPad;
769  if (nonword) { // non-dictionary word
770  if (case_is_ok && punc_is_ok) {
771  adjust_factor += segment_penalty_dict_nonword;
772  new_rating *= adjust_factor;
773  if (debug) tprintf(", W");
774  } else {
775  adjust_factor += segment_penalty_garbage;
776  new_rating *= adjust_factor;
777  if (debug) {
778  if (!case_is_ok) tprintf(", C");
779  if (!punc_is_ok) tprintf(", P");
780  }
781  }
782  } else { // dictionary word
783  if (case_is_ok) {
784  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
785  word->set_permuter(FREQ_DAWG_PERM);
786  adjust_factor += segment_penalty_dict_frequent_word;
787  new_rating *= adjust_factor;
788  if (debug) tprintf(", F");
789  } else {
790  adjust_factor += segment_penalty_dict_case_ok;
791  new_rating *= adjust_factor;
792  if (debug) tprintf(", ");
793  }
794  } else {
795  adjust_factor += segment_penalty_dict_case_bad;
796  new_rating *= adjust_factor;
797  if (debug) tprintf(", C");
798  }
799  }
800  new_rating -= kRatingPad;
801  word->set_rating(new_rating);
802  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
803  LogNewChoice(adjust_factor, certainty_array, false, word,
804  *char_choices);
805 }
806 
807 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
808  const WERD_CHOICE *word_ptr = &word;
809  WERD_CHOICE temp_word(word.unicharset());
810  if (hyphenated()) {
811  copy_hyphen_info(&temp_word);
812  temp_word += word;
813  word_ptr = &temp_word;
814  }
815  if (word_ptr->length() == 0) return NO_PERM;
816  // Allocate vectors for holding current and updated
817  // active_dawgs and constraints and initialize them.
818  DawgInfoVector *active_dawgs = new DawgInfoVector[2];
819  DawgInfoVector *constraints = new DawgInfoVector[2];
820  init_active_dawgs(kAnyWordLength, &(active_dawgs[0]), false);
821  init_constraints(&(constraints[0]));
822  DawgArgs dawg_args(&(active_dawgs[0]), &(constraints[0]),
823  &(active_dawgs[1]), &(constraints[1]),
824  0.0, NO_PERM, kAnyWordLength, 0);
825  int last_index = word_ptr->length() - 1;
826  // Call leter_is_okay for each letter in the word.
827  for (int i = hyphen_base_size(); i <= last_index; ++i) {
828  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
829  i == last_index))) break;
830  // Swap active_dawgs, constraints with the corresponding updated vector.
831  if (dawg_args.updated_active_dawgs == &(active_dawgs[1])) {
832  dawg_args.updated_active_dawgs = &(active_dawgs[0]);
833  dawg_args.updated_constraints = &(constraints[0]);
834  ++(dawg_args.active_dawgs);
835  ++(dawg_args.constraints);
836  } else {
837  ++(dawg_args.updated_active_dawgs);
838  ++(dawg_args.updated_constraints);
839  dawg_args.active_dawgs = &(active_dawgs[0]);
840  dawg_args.constraints = &(constraints[0]);
841  }
842  }
843  delete[] active_dawgs;
844  delete[] constraints;
845  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
846  dawg_args.permuter : NO_PERM;
847 }
848 
849 bool Dict::valid_bigram(const WERD_CHOICE &word1,
850  const WERD_CHOICE &word2) const {
851  if (bigram_dawg_ == NULL) return false;
852 
853  // Extract the core word from the middle of each word with any digits
854  // replaced with question marks.
855  int w1start, w1end, w2start, w2end;
856  word1.punct_stripped(&w1start, &w1end);
857  word2.punct_stripped(&w2start, &w2end);
858 
859  // We don't want to penalize a single guillemet, hyphen, etc.
860  // But our bigram list doesn't have any information about punctuation.
861  if (w1start >= w1end) return word1.length() < 3;
862  if (w2start >= w2end) return word2.length() < 3;
863 
864  const UNICHARSET& uchset = getUnicharset();
865  STRING bigram_string;
866  for (int i = w1start; i < w1end; i++) {
868  bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
869  }
870  bigram_string += " ";
871  for (int i = w2start; i < w2end; i++) {
873  bigram_string += uchset.get_isdigit(ch) ? "?" : uchset.id_to_unichar(ch);
874  }
875  WERD_CHOICE normalized_word(bigram_string.string(), uchset);
876  return bigram_dawg_->word_in_dawg(normalized_word);
877 }
878 
880  if (word.length() == 0) return NO_PERM;
881  int i;
882  WERD_CHOICE new_word(word.unicharset());
883  int last_index = word.length() - 1;
884  int new_len = 0;
885  for (i = 0; i <= last_index; ++i) {
886  UNICHAR_ID unichar_id = (word.unichar_id(i));
887  if (getUnicharset().get_ispunctuation(unichar_id)) {
888  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
889  } else if (!getUnicharset().get_isalpha(unichar_id) &&
890  !getUnicharset().get_isdigit(unichar_id)) {
891  return false; // neither punc, nor alpha, nor digit
892  } else if ((new_len = new_word.length()) == 0 ||
893  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
894  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
895  }
896  }
897  for (i = 0; i < dawgs_.size(); ++i) {
898  if (dawgs_[i] != NULL &&
899  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
900  dawgs_[i]->word_in_dawg(new_word)) return true;
901  }
902  return false;
903 }
904 
// Returns the "dominant" script ID for the word. By "dominant", the script
// must account for at least half the characters. Otherwise, it returns 0.
// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
909  const UNICHARSET &unicharset) {
910  int max_script = unicharset.get_script_table_size();
911  int *sid = new int[max_script];
912  int x;
913  for (x = 0; x < max_script; x++) sid[x] = 0;
914  for (x = 0; x < char_choices.length(); ++x) {
915  BLOB_CHOICE_IT blob_choice_it(char_choices.get(x));
916  sid[blob_choice_it.data()->script_id()]++;
917  }
918  if (unicharset.han_sid() != unicharset.null_sid()) {
919  // Add the Hiragana & Katakana counts to Han and zero them out.
920  if (unicharset.hiragana_sid() != unicharset.null_sid()) {
921  sid[unicharset.han_sid()] += sid[unicharset.hiragana_sid()];
922  sid[unicharset.hiragana_sid()] = 0;
923  }
924  if (unicharset.katakana_sid() != unicharset.null_sid()) {
925  sid[unicharset.han_sid()] += sid[unicharset.katakana_sid()];
926  sid[unicharset.katakana_sid()] = 0;
927  }
928  }
929  // Note that high script ID overrides lower one on a tie, thus biasing
930  // towards non-Common script (if sorted that way in unicharset file).
931  int max_sid = 0;
932  for (x = 1; x < max_script; x++)
933  if (sid[x] >= sid[max_sid]) max_sid = x;
934  if (sid[max_sid] < char_choices.length() / 2)
935  max_sid = unicharset.null_sid();
936  delete[] sid;
937  return max_sid;
938 }
939 
940 } // namespace tesseract