Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
dawg.h
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.h (Formerly dawg.h)
5  * Description: Definition of a class that represents Directed Accyclic Word
6  * Graph (DAWG), functions to build and manipulate the DAWG.
7  * Author: Mark Seaman, SW Productivity
8  * Created: Fri Oct 16 14:37:00 1987
9  * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
10  * Language: C
11  * Package: N/A
12  * Status: Reusable Software Component
13  *
14  * (c) Copyright 1987, Hewlett-Packard Company.
15  ** Licensed under the Apache License, Version 2.0 (the "License");
16  ** you may not use this file except in compliance with the License.
17  ** You may obtain a copy of the License at
18  ** http://www.apache.org/licenses/LICENSE-2.0
19  ** Unless required by applicable law or agreed to in writing, software
20  ** distributed under the License is distributed on an "AS IS" BASIS,
21  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  ** See the License for the specific language governing permissions and
23  ** limitations under the License.
24  *
25  *********************************************************************************/
26 
27 #ifndef DICT_DAWG_H_
28 #define DICT_DAWG_H_
29 
30 /*----------------------------------------------------------------------
31  I n c l u d e s
32 ----------------------------------------------------------------------*/
33 
34 #include "elst.h"
35 #include "ratngs.h"
36 #include "params.h"
37 #include "tesscallback.h"
38 
39 #ifndef __GNUC__
40 #ifdef _WIN32
41 #define NO_EDGE (inT64) 0xffffffffffffffffi64
42 #endif /*_WIN32*/
43 #else
44 #define NO_EDGE (inT64) 0xffffffffffffffffll
45 #endif /*__GNUC__*/
46 
47 /*----------------------------------------------------------------------
48  T y p e s
49 ----------------------------------------------------------------------*/
50 class UNICHARSET;
51 
52 typedef uinT64 EDGE_RECORD;
54 typedef inT64 EDGE_REF;
55 typedef inT64 NODE_REF;
56 typedef EDGE_REF *NODE_MAP;
57 
58 namespace tesseract {
59 
60 struct NodeChild {
64  NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
65 };
66 
70 
71 enum DawgType {
76 
77  DAWG_TYPE_COUNT // number of enum entries
78 };
79 
80 /*----------------------------------------------------------------------
81  C o n s t a n t s
82 ----------------------------------------------------------------------*/
83 
84 #define FORWARD_EDGE (inT32) 0
85 #define BACKWARD_EDGE (inT32) 1
86 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
87 #define MARKER_FLAG (inT64) 1
88 #define DIRECTION_FLAG (inT64) 2
89 #define WERD_END_FLAG (inT64) 4
90 #define LETTER_START_BIT 0
91 #define NUM_FLAG_BITS 3
92 #define REFFORMAT "%lld"
93 
94 // Set kBeginningDawgsType[i] to true if a Dawg of
95 // DawgType i can contain the beginning of a word.
96 static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
97 
98 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
99  { 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION
100  { 1, 0, 0, 0 }, // for DAWG_TYPE_WORD
101  { 1, 0, 0, 0 }, // for DAWG_TYPE_NUMBER
102  { 0, 0, 0, 0 }, // for DAWG_TYPE_PATTERN
103 };
104 
105 static const char kWildcard[] = "*";
106 
107 
108 /*----------------------------------------------------------------------
109  C l a s s e s a n d S t r u c t s
110 ----------------------------------------------------------------------*/
111 //
121 //
122 class Dawg {
123  public:
125  static const inT16 kDawgMagicNumber = 42;
129  static const UNICHAR_ID kPatternUnicharID = 0;
130 
131  inline DawgType type() const { return type_; }
132  inline const STRING &lang() const { return lang_; }
133  inline PermuterType permuter() const { return perm_; }
134 
135  virtual ~Dawg() {};
136 
138  bool word_in_dawg(const WERD_CHOICE &word) const;
139 
142  int check_for_words(const char *filename,
143  const UNICHARSET &unicharset,
144  bool enable_wildcard) const;
145 
146  // For each word in the Dawg, call the given (permanent) callback with the
147  // text (UTF-8) version of the word.
148  void iterate_words(const UNICHARSET &unicharset,
149  TessCallback1<const char *> *cb) const;
150 
151  // Pure virtual function that should be implemented by the derived classes.
152 
154  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
155  bool word_end) const = 0;
156 
159  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
160 
163  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
164 
167  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
168 
170  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
171 
174  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
175 
178  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
179  const UNICHARSET &unicharset,
180  GenericVector<UNICHAR_ID> *vec) const {};
181 
186  EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
187  return false;
188  }
189 
190  protected:
191  Dawg() {}
192 
194  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
195  return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
196  }
198  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
199  return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
200  }
202  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
203  return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
205  }
207  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
208  return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
209  }
212  const EDGE_RECORD &edge_rec) const {
213  return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
214  }
217  EDGE_RECORD *edge_rec, EDGE_REF value) {
218  *edge_rec &= (~next_node_mask_);
219  *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
220  }
222  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
223  *edge_rec |= (MARKER_FLAG << flag_start_bit_);
224  }
233  bool word_end,
234  UNICHAR_ID unichar_id,
235  const EDGE_RECORD &edge_rec) const {
236  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
237  NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
238  bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
239  if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
240  curr_word_end, curr_unichar_id)) return 0;
241  if (unichar_id > curr_unichar_id) return 1;
242  if (unichar_id == curr_unichar_id) {
243  if (next_node > curr_next_node) return 1;
244  if (next_node == curr_next_node) {
245  if (word_end > curr_word_end) return 1;
246  }
247  }
248  return -1;
249  }
254  bool word_end,
255  UNICHAR_ID unichar_id,
256  NODE_REF other_next_node,
257  bool other_word_end,
258  UNICHAR_ID other_unichar_id) const {
259  return ((unichar_id == other_unichar_id) &&
260  (next_node == NO_EDGE || next_node == other_next_node) &&
261  (!word_end || (word_end == other_word_end)));
262  }
263 
266  void init(DawgType type, const STRING &lang,
267  PermuterType perm, int unicharset_size, int debug_level);
268 
274  bool match_words(WERD_CHOICE *word, inT32 index,
275  NODE_REF node, UNICHAR_ID wildcard) const;
276 
277  // Recursively iterate over all words in a dawg (see public iterate_words).
278  void iterate_words_rec(const WERD_CHOICE &word_so_far,
279  NODE_REF to_explore,
280  TessCallback1<const char *> *cb) const;
281 
282  // Member Variables.
286  PermuterType perm_;
287  // Variables to construct various edge masks. Formerly:
288  // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
289  // #define FLAGS_MASK (inT64) 0x0000000700000000i64
290  // #define LETTER_MASK (inT64) 0x00000000ffffffffi64
297  // Level of debug statements to print to stdout.
299 };
300 
301 //
304 //
305 struct DawgInfo {
306  DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
307  DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
308  bool operator==(const DawgInfo &other) {
309  return (this->dawg_index == other.dawg_index && this->ref == other.ref);
310  }
313 };
314 class DawgInfoVector : public GenericVector<DawgInfo> {
315  public:
318  if (size_reserved_ > 0) {
319  delete[] data_;
320  size_used_ = 0;
321  size_reserved_ = 0;
322  }
323  }
326  void clear() { size_used_ = 0; }
330  inline bool add_unique(const DawgInfo &new_info, bool debug,
331  const char *debug_msg) {
332  for (int i = 0; i < size_used_; ++i) {
333  if (data_[i] == new_info) return false;
334  }
335  push_back(new_info);
336  if (debug) {
337  tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
338  new_info.dawg_index, new_info.ref);
339  }
340  return true;
341  }
342 };
343 
344 //
351 //
352 class SquishedDawg : public Dawg {
353  public:
354  SquishedDawg(FILE *file, DawgType type, const STRING &lang,
355  PermuterType perm, int debug_level) {
356  read_squished_dawg(file, type, lang, perm, debug_level);
357  num_forward_edges_in_node0 = num_forward_edges(0);
358  }
360  const STRING &lang, PermuterType perm, int debug_level) {
361  FILE *file = fopen(filename, "rb");
362  if (file == NULL) {
363  tprintf("Failed to open dawg file %s\n", filename);
364  exit(1);
365  }
366  read_squished_dawg(file, type, lang, perm, debug_level);
367  num_forward_edges_in_node0 = num_forward_edges(0);
368  fclose(file);
369  }
370  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
371  const STRING &lang, PermuterType perm,
372  int unicharset_size, int debug_level) :
373  edges_(edges), num_edges_(num_edges) {
374  init(type, lang, perm, unicharset_size, debug_level);
375  num_forward_edges_in_node0 = num_forward_edges(0);
376  if (debug_level > 3) print_all("SquishedDawg:");
377  }
378  ~SquishedDawg();
379 
380  int NumEdges() { return num_edges_; }
381 
383  EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
384  bool word_end) const;
385 
388  void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
389  EDGE_REF edge = node;
390  if (!edge_occupied(edge) || edge == NO_EDGE) return;
391  assert(forward_edge(edge)); // we don't expect any backward edges to
392  do { // be present when this funciton is called
393  vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
394  } while (!last_edge(edge++));
395  }
396 
400  return next_node_from_edge_rec((edges_[edge]));
401  }
402 
405  bool end_of_word(EDGE_REF edge_ref) const {
406  return end_of_word_from_edge_rec((edges_[edge_ref]));
407  }
408 
410  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
411  return unichar_id_from_edge_rec((edges_[edge_ref]));
412  }
413 
416  void print_node(NODE_REF node, int max_num_edges) const;
417 
419  void write_squished_dawg(FILE *file);
420 
423  void write_squished_dawg(const char *filename) {
424  FILE *file = fopen(filename, "wb");
425  if (file == NULL) {
426  tprintf("Error opening %s\n", filename);
427  exit(1);
428  }
429  this->write_squished_dawg(file);
430  fclose(file);
431  }
432 
433  private:
435  inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
436  set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
437  }
439  inline void set_empty_edge(EDGE_REF edge_ref) {
440  (edges_[edge_ref] = next_node_mask_);
441  }
443  inline void clear_all_edges() {
444  for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
445  }
447  inline void clear_marker_flag(EDGE_REF edge_ref) {
448  (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
449  }
451  inline bool forward_edge(EDGE_REF edge_ref) const {
452  return (edge_occupied(edge_ref) &&
453  (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
454  }
456  inline bool backward_edge(EDGE_REF edge_ref) const {
457  return (edge_occupied(edge_ref) &&
458  (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
459  }
461  inline bool edge_occupied(EDGE_REF edge_ref) const {
462  return (edges_[edge_ref] != next_node_mask_);
463  }
465  inline bool last_edge(EDGE_REF edge_ref) const {
466  return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
467  }
468 
470  inT32 num_forward_edges(NODE_REF node) const;
471 
473  void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
474  PermuterType perm, int debug_level);
475 
477  void print_edge(EDGE_REF edge) const;
478 
480  void print_all(const char* msg) {
481  tprintf("\n__________________________\n%s\n", msg);
482  for (int i = 0; i < num_edges_; ++i) print_edge(i);
483  tprintf("__________________________\n");
484  }
486  NODE_MAP build_node_map(inT32 *num_nodes) const;
487 
488 
489  // Member variables.
490  EDGE_ARRAY edges_;
491  int num_edges_;
492  int num_forward_edges_in_node0;
493 };
494 
495 } // namespace tesseract
496 
497 #endif // DICT_DAWG_H_