Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
dawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.c (Formerly dawg.c)
5  * Description: Use a Directed Accyclic Word Graph
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #ifdef _MSC_VER
30 #pragma warning(disable:4244) // Conversion warnings
31 #pragma warning(disable:4800) // int/bool warnings
32 #endif
33 #include "dawg.h"
34 
35 #include "cutil.h"
36 #include "dict.h"
37 #include "emalloc.h"
38 #include "freelist.h"
39 #include "helpers.h"
40 #include "strngs.h"
41 #include "tprintf.h"
42 
43 /*----------------------------------------------------------------------
44  F u n c t i o n s f o r D a w g
45 ----------------------------------------------------------------------*/
46 namespace tesseract {
47 
48 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
49  if (word.length() == 0) return false;
50  NODE_REF node = 0;
51  int end_index = word.length() - 1;
52  for (int i = 0; i <= end_index; i++) {
53  if (debug_level_ > 1) {
54  tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
56  tprintf("\n");
57  }
58  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
59  if (edge != NO_EDGE) {
60  node = next_node(edge);
61  if (node == 0) node = NO_EDGE;
62  } else {
63  return false;
64  }
65  }
66  return true;
67 }
68 
70  const UNICHARSET &unicharset,
71  bool enable_wildcard) const {
72  if (filename == NULL) return 0;
73 
74  FILE *word_file;
75  char string [CHARS_PER_LINE];
76  int misses = 0;
77  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
78 
79  word_file = open_file (filename, "r");
80 
81  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
82  chomp_string(string); // remove newline
83  WERD_CHOICE word(string, unicharset);
84  if (word.length() > 0 &&
85  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
86  if (!match_words(&word, 0, 0,
87  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
88  tprintf("Missing word: %s\n", string);
89  ++misses;
90  }
91  } else {
92  tprintf("Failed to create a valid word from %s\n", string);
93  }
94  }
95  fclose (word_file);
96  // Make sure the user sees this with fprintf instead of tprintf.
97  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
98  return misses;
99 }
100 
101 void Dawg::iterate_words(const UNICHARSET &unicharset,
102  TessCallback1<const char *> *cb) const {
103  WERD_CHOICE word(&unicharset);
104  iterate_words_rec(word, 0, cb);
105 }
106 
107 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
108  NODE_REF to_explore,
109  TessCallback1<const char *> *cb) const {
110  NodeChildVector children;
111  this->unichar_ids_of(to_explore, &children);
112  for (int i = 0; i < children.size(); i++) {
113  WERD_CHOICE next_word(word_so_far);
114  next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
115  if (this->end_of_word(children[i].edge_ref)) {
116  STRING s;
117  next_word.string_and_lengths(&s, NULL);
118  cb->Run(s.string());
119  }
120  NODE_REF next = next_node(children[i].edge_ref);
121  if (next != 0) {
122  iterate_words_rec(next_word, next, cb);
123  }
124  }
125 }
126 
128  NODE_REF node, UNICHAR_ID wildcard) const {
129  EDGE_REF edge;
130  inT32 word_end;
131 
132  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
133  bool any_matched = false;
134  NodeChildVector vec;
135  this->unichar_ids_of(node, &vec);
136  for (int i = 0; i < vec.size(); ++i) {
137  word->set_unichar_id(vec[i].unichar_id, index);
138  if (match_words(word, index, node, wildcard))
139  any_matched = true;
140  }
141  word->set_unichar_id(wildcard, index);
142  return any_matched;
143  } else {
144  word_end = index == word->length() - 1;
145  edge = edge_char_of(node, word->unichar_id(index), word_end);
146  if (edge != NO_EDGE) { // normal edge in DAWG
147  node = next_node(edge);
148  if (word_end) {
149  if (debug_level_ > 1) word->print("match_words() found: ");
150  return true;
151  } else if (node != 0) {
152  return match_words(word, index+1, node, wildcard);
153  }
154  }
155  }
156  return false;
157 }
158 
159 void Dawg::init(DawgType type, const STRING &lang,
160  PermuterType perm, int unicharset_size, int debug_level) {
161  type_ = type;
162  lang_ = lang;
163  perm_ = perm;
164  ASSERT_HOST(unicharset_size > 0);
165  unicharset_size_ = unicharset_size;
166  // Set bit masks.
167  flag_start_bit_ = ceil(log(static_cast<double>(unicharset_size_)) / log(2.0));
169  letter_mask_ = ~(~0 << flag_start_bit_);
172 
173  debug_level_ = debug_level;
174 }
175 
176 
177 /*----------------------------------------------------------------------
178  F u n c t i o n s f o r S q u i s h e d D a w g
179 ----------------------------------------------------------------------*/
180 
182 
184  UNICHAR_ID unichar_id,
185  bool word_end) const {
186  EDGE_REF edge = node;
187  if (node == 0) { // binary search
188  EDGE_REF start = 0;
189  EDGE_REF end = num_forward_edges_in_node0 - 1;
190  int compare;
191  while (start <= end) {
192  edge = (start + end) >> 1; // (start + end) / 2
193  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
194  unichar_id, edges_[edge]);
195  if (compare == 0) { // given == vec[k]
196  return edge;
197  } else if (compare == 1) { // given > vec[k]
198  start = edge + 1;
199  } else { // given < vec[k]
200  end = edge - 1;
201  }
202  }
203  } else { // linear search
204  if (edge != NO_EDGE && edge_occupied(edge)) {
205  do {
206  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
207  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
208  return (edge);
209  } while (!last_edge(edge++));
210  }
211  }
212  return (NO_EDGE); // not found
213 }
214 
215 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const {
216  EDGE_REF edge = node;
217  inT32 num = 0;
218 
219  if (forward_edge (edge)) {
220  do {
221  num++;
222  } while (!last_edge(edge++));
223  }
224 
225  return (num);
226 }
227 
228 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
229  if (node == NO_EDGE) return; // nothing to print
230 
231  EDGE_REF edge = node;
232  const char *forward_string = "FORWARD";
233  const char *backward_string = " ";
234 
235  const char *last_string = "LAST";
236  const char *not_last_string = " ";
237 
238  const char *eow_string = "EOW";
239  const char *not_eow_string = " ";
240 
241  const char *direction;
242  const char *is_last;
243  const char *eow;
244 
245  UNICHAR_ID unichar_id;
246 
247  if (edge_occupied(edge)) {
248  do {
249  direction =
250  forward_edge(edge) ? forward_string : backward_string;
251  is_last = last_edge(edge) ? last_string : not_last_string;
252  eow = end_of_word(edge) ? eow_string : not_eow_string;
253 
254  unichar_id = edge_letter(edge);
255  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
256  edge, next_node(edge), unichar_id,
257  direction, is_last, eow);
258 
259  if (edge - node > max_num_edges) return;
260  } while (!last_edge(edge++));
261 
262  if (edge < num_edges_ &&
263  edge_occupied(edge) && backward_edge(edge)) {
264  do {
265  direction =
266  forward_edge(edge) ? forward_string : backward_string;
267  is_last = last_edge(edge) ? last_string : not_last_string;
268  eow = end_of_word(edge) ? eow_string : not_eow_string;
269 
270  unichar_id = edge_letter(edge);
271  tprintf(REFFORMAT " : next = " REFFORMAT
272  ", unichar_id = %d, %s %s %s\n",
273  edge, next_node(edge), unichar_id,
274  direction, is_last, eow);
275 
276  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
277  } while (!last_edge(edge++));
278  }
279  }
280  else {
281  tprintf(REFFORMAT " : no edges in this node\n", node);
282  }
283  tprintf("\n");
284 }
285 
286 void SquishedDawg::print_edge(EDGE_REF edge) const {
287  if (edge == NO_EDGE) {
288  tprintf("NO_EDGE\n");
289  } else {
290  tprintf(REFFORMAT " : next = " REFFORMAT
291  ", unichar_id = '%d', %s %s %s\n", edge,
292  next_node(edge), edge_letter(edge),
293  (forward_edge(edge) ? "FORWARD" : " "),
294  (last_edge(edge) ? "LAST" : " "),
295  (end_of_word(edge) ? "EOW" : ""));
296  }
297 }
298 
299 void SquishedDawg::read_squished_dawg(FILE *file,
300  DawgType type,
301  const STRING &lang,
302  PermuterType perm,
303  int debug_level) {
304  if (debug_level) tprintf("Reading squished dawg\n");
305 
306  // Read the magic number and if it does not match kDawgMagicNumber
307  // set swap to true to indicate that we need to switch endianness.
308  inT16 magic;
309  fread(&magic, sizeof(inT16), 1, file);
310  bool swap = (magic != kDawgMagicNumber);
311 
312  int unicharset_size;
313  fread(&unicharset_size, sizeof(inT32), 1, file);
314  fread(&num_edges_, sizeof(inT32), 1, file);
315 
316  if (swap) {
317  unicharset_size = reverse32(unicharset_size);
318  num_edges_ = reverse32(num_edges_);
319  }
320  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
321  Dawg::init(type, lang, perm, unicharset_size, debug_level);
322 
323  edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
324  fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
325  EDGE_REF edge;
326  if (swap) {
327  for (edge = 0; edge < num_edges_; ++edge) {
328  edges_[edge] = reverse64(edges_[edge]);
329  }
330  }
331  if (debug_level > 2) {
332  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
333  type_, lang_.string(), perm_, unicharset_size_, num_edges_);
334  for (edge = 0; edge < num_edges_; ++edge)
335  print_edge(edge);
336  }
337 }
338 
339 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
340  EDGE_REF edge;
341  NODE_MAP node_map;
342  inT32 node_counter;
343  inT32 num_edges;
344 
345  node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
346 
347  for (edge = 0; edge < num_edges_; edge++) // init all slots
348  node_map [edge] = -1;
349 
350  node_counter = num_forward_edges(0);
351 
352  *num_nodes = 0;
353  for (edge = 0; edge < num_edges_; edge++) { // search all slots
354 
355  if (forward_edge(edge)) {
356  (*num_nodes)++; // count nodes links
357  node_map[edge] = (edge ? node_counter : 0);
358  num_edges = num_forward_edges(edge);
359  if (edge != 0) node_counter += num_edges;
360  edge += num_edges;
361  if (edge >= num_edges_) break;
362  if (backward_edge(edge)) while (!last_edge(edge++));
363  edge--;
364  }
365  }
366  return (node_map);
367 }
368 
370  EDGE_REF edge;
371  inT32 num_edges;
372  inT32 node_count = 0;
373  NODE_MAP node_map;
374  EDGE_REF old_index;
375  EDGE_RECORD temp_record;
376 
377  if (debug_level_) tprintf("write_squished_dawg\n");
378 
379  node_map = build_node_map(&node_count);
380 
381  // Write the magic number to help detecting a change in endianness.
382  inT16 magic = kDawgMagicNumber;
383  fwrite(&magic, sizeof(inT16), 1, file);
384  fwrite(&unicharset_size_, sizeof(inT32), 1, file);
385 
386  // Count the number of edges in this Dawg.
387  num_edges = 0;
388  for (edge=0; edge < num_edges_; edge++)
389  if (forward_edge(edge))
390  num_edges++;
391 
392  fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file
393 
394  if (debug_level_) {
395  tprintf("%d nodes in DAWG\n", node_count);
396  tprintf("%d edges in DAWG\n", num_edges);
397  }
398 
399  for (edge = 0; edge < num_edges_; edge++) {
400  if (forward_edge(edge)) { // write forward edges
401  do {
402  old_index = next_node_from_edge_rec(edges_[edge]);
403  set_next_node(edge, node_map[old_index]);
404  temp_record = edges_[edge];
405  fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
406  set_next_node(edge, old_index);
407  } while (!last_edge(edge++));
408 
409  if (edge >= num_edges_) break;
410  if (backward_edge(edge)) // skip back links
411  while (!last_edge(edge++));
412 
413  edge--;
414  }
415  }
416  free(node_map);
417 }
418 
419 } // namespace tesseract