Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
wordlist2dawg.cpp
Go to the documentation of this file.
1 
2 // File: wordlist2dawg.cpp
3 // Description: Program to generate a DAWG from a word list file
4 // Author: Thomas Kielbus
5 // Created: Thu May 10 18:11:42 PDT 2007
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 // Given a file that contains a list of words (one word per line) this program
21 // generates the corresponding squished DAWG file.
22 
23 #include <stdio.h>
24 
25 #include "classify.h"
26 #include "dawg.h"
27 #include "dict.h"
28 #include "emalloc.h"
29 #include "freelist.h"
30 #include "helpers.h"
31 #include "serialis.h"
32 #include "trie.h"
33 #include "unicharset.h"
34 
// Passed as the fourth constructor argument of every tesseract::Trie built in
// main(). Presumably the upper bound on the number of edges a Trie may hold --
// confirm against trie.h.
35 static const int kMaxNumEdges = 30000000;
36 
// Program entry point: converts a word list into DAWG file(s), or checks a
// word list against an existing DAWG.
//
// The invocation form is selected purely by argc plus argv[1]:
//   argc == 4                 word_list dawg unicharset
//       Build a Trie from the word list and write it as a squished DAWG.
//   argc == 5, "-t"           word_list dawg unicharset
//       Load the DAWG from dawg_file and check the word list against it.
//   argc == 6, "-r" <policy>  word_list dawg unicharset
//       Same as the 4-argument form, with reverse_policy set from the integer
//       in argv[2].
//   argc == 7, "-l" <min> <max>  word_list dawg unicharset
//       Put words of each length in [min, max] into a separate Trie and write
//       all of them to dawg_file.
//
// Returns 1 on a usage error or when the unicharset cannot be loaded, 0 on
// success. Other failures terminate the process through exit(1).
//
// NOTE(review): the embedded listing numbers skip 54, 61, 96, 117, 138, 162
// and 167, so the statements spanning those lines are incomplete in this
// extract. Each gap is flagged below; confirm against the original file.
37 int main(int argc, char** argv) {
38  int min_word_length;
39  int max_word_length;
    // Validate the argument count / option flag. For "-l" the two length
    // bounds are parsed here as part of the validation, so they are only
    // initialized when argc == 7.
40  if (!(argc == 4 || (argc == 5 && strcmp(argv[1], "-t") == 0) ||
41  (argc == 6 && strcmp(argv[1], "-r") == 0) ||
42  (argc == 7 && strcmp(argv[1], "-l") == 0 &&
43  sscanf(argv[2], "%d", &min_word_length) == 1 &&
44  sscanf(argv[3], "%d", &max_word_length) == 1))) {
    // NOTE(review): the usage text brackets "[reverse policy]" as if it were
    // optional, but the check above only accepts "-r" with argc == 6, i.e.
    // with the policy value present.
45  printf("Usage: %s [-t | -r [reverse policy] |"
46  " -l min_len max_len] word_list_file"
47  " dawg_file unicharset_file\n", argv[0]);
48  return 1;
49  }
50  tesseract::Classify *classify = new tesseract::Classify();
    // argv_index tracks the last consumed argument; the three file names are
    // read further down with pre-increment. It is advanced by 1 for "-t",
    // by 2 for "-r <policy>" and by 3 for "-l <min> <max>".
51  int argv_index = 0;
52  if (argc == 5) ++argv_index;
53  tesseract::Trie::RTLReversePolicy reverse_policy =
    // NOTE(review): listing line 54 (the initializer of reverse_policy) is
    // missing from this extract.
55  if (argc == 6) {
56  ++argv_index;
57  int tmp_int;
    // NOTE(review): the sscanf result is not checked; if argv[2] is not an
    // integer, tmp_int is used uninitialized in the cast below.
58  sscanf(argv[++argv_index], "%d", &tmp_int);
59  reverse_policy = static_cast<tesseract::Trie::RTLReversePolicy>(tmp_int);
60  tprintf("Set reverse_policy to %s\n",
    // NOTE(review): listing line 61 (the %s argument and the closing of this
    // tprintf call) is missing from this extract.
62  }
63  if (argc == 7) argv_index += 3;
    // The last three arguments are always the same, in this order.
64  const char* wordlist_filename = argv[++argv_index];
65  const char* dawg_filename = argv[++argv_index];
66  const char* unicharset_file = argv[++argv_index];
67  tprintf("Loading unicharset from '%s'\n", unicharset_file);
68  if (!classify->getDict().getUnicharset().load_from_file(unicharset_file)) {
69  tprintf("Failed to load unicharset from '%s'\n", unicharset_file);
70  delete classify;
71  return 1;
72  }
73  const UNICHARSET &unicharset = classify->getDict().getUnicharset();
    // Mode 1 (plain or "-r"): word list -> Trie -> single squished DAWG.
74  if (argc == 4 || argc == 6) {
75  tesseract::Trie trie(
76  // the first 3 arguments are not used in this case
77  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
78  kMaxNumEdges, unicharset.size(),
79  classify->getDict().dawg_debug_level);
80  tprintf("Reading word list from '%s'\n", wordlist_filename);
81  if (!trie.read_word_list(wordlist_filename, unicharset, reverse_policy)) {
82  tprintf("Failed to read word list from '%s'\n", wordlist_filename);
    // NOTE(review): this and the later exit(1) calls leave without
    // `delete classify`, unlike the unicharset failure path above.
83  exit(1);
84  }
85  tprintf("Reducing Trie to SquishedDawg\n");
86  tesseract::SquishedDawg *dawg = trie.trie_to_dawg();
    // The output file is only written when the conversion produced a
    // non-NULL dawg with at least one edge.
87  if (dawg != NULL && dawg->NumEdges() > 0) {
88  tprintf("Writing squished DAWG to '%s'\n", dawg_filename);
89  dawg->write_squished_dawg(dawg_filename);
90  } else {
91  tprintf("Dawg is empty, skip producing the output file\n");
92  }
93  delete dawg;
    // Mode 2 ("-t"): load an existing DAWG and check the word list against it.
94  } else if (argc == 5) {
95  tprintf("Loading dawg DAWG from '%s'\n", dawg_filename);
    // NOTE(review): listing line 96 is missing; it presumably opens the
    // declaration of `words`, whose constructor arguments follow -- confirm.
97  dawg_filename,
98  // these 3 arguments are not used in this case
99  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
100  classify->getDict().dawg_debug_level);
101  tprintf("Checking word list from '%s'\n", wordlist_filename);
102  words.check_for_words(wordlist_filename, unicharset, true);
    // Mode 3 ("-l"): one Trie per word length, all written to one file.
103  } else if (argc == 7) {
104  // Place words of different lengths in separate Dawgs.
105  char str[CHARS_PER_LINE];
106  FILE *word_file = fopen(wordlist_filename, "rb");
107  if (word_file == NULL) {
108  tprintf("Failed to open wordlist file %s\n", wordlist_filename);
109  exit(1);
110  }
111  FILE *dawg_file = fopen(dawg_filename, "wb");
112  if (dawg_file == NULL) {
113  tprintf("Failed to open dawg output file %s\n", dawg_filename);
114  exit(1);
115  }
116  tprintf("Reading word list from '%s'\n", wordlist_filename);
    // NOTE(review): listing line 117 is missing; it presumably declares
    // trie_vec -- confirm.
118  int i;
    // Create one Trie per length in [min_word_length, max_word_length];
    // the Trie for length n is stored at index n - min_word_length.
119  for (i = min_word_length; i <= max_word_length; ++i) {
120  trie_vec.push_back(new tesseract::Trie(
121  // the first 3 arguments are not used in this case
122  tesseract::DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM,
123  kMaxNumEdges, unicharset.size(),
124  classify->getDict().dawg_debug_level));
125  }
    // Read the word list one line at a time.
126  while (fgets(str, CHARS_PER_LINE, word_file) != NULL) {
127  chomp_string(str); // remove newline
128  int badpos;
    // Words that cannot be encoded with the unicharset are reported and
    // skipped; they do not abort the run.
129  if (!unicharset.encodable_string(str, &badpos)) {
130  tprintf("String '%s' not compatible with unicharset. "
131  "Bad chars here: '%s'\n", str, str + badpos);
132  continue;
133  }
134  WERD_CHOICE word(str, unicharset);
    // reverse_policy is only reassigned when argc == 6, so in this branch it
    // still holds its initial value.
135  if ((reverse_policy == tesseract::Trie::RRP_REVERSE_IF_HAS_RTL &&
136  word.has_rtl_unichar_id()) ||
137  reverse_policy == tesseract::Trie::RRP_FORCE_REVERSE) {
    // NOTE(review): listing line 138 (the body of this if, presumably the
    // call that reverses the word) is missing from this extract.
139  }
    // Words outside the requested length range, or containing an invalid
    // unichar id, are skipped silently.
140  if (word.length() >= min_word_length &&
141  word.length() <= max_word_length &&
142  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
143  tesseract::Trie *curr_trie = trie_vec[word.length()-min_word_length];
    // Add each distinct word once, then verify it can be found again.
144  if (!curr_trie->word_in_dawg(word)) {
145  if (!curr_trie->add_word_to_dawg(word)) {
146  tprintf("Failed to add the following word to dawg:\n");
147  word.print();
148  exit(1);
149  }
150  if (classify->getDict().dawg_debug_level > 1) {
151  tprintf("Added word %s of length %d\n", str, word.length());
152  }
153  if (!curr_trie->word_in_dawg(word)) {
154  tprintf("Error: word '%s' not in DAWG after adding it\n", str);
155  exit(1);
156  }
157  }
158  }
159  }
160  fclose(word_file);
161  tprintf("Writing fixed length dawgs to '%s'\n", dawg_filename);
    // NOTE(review): listing line 162 is missing; it presumably declares
    // dawg_vec -- confirm.
    // dawg_vec is indexed by word length: entries below min_word_length are
    // NULL, the rest are the squished form of the matching Trie.
163  for (i = 0; i <= max_word_length; ++i) {
164  dawg_vec.push_back(i < min_word_length ? NULL :
165  trie_vec[i-min_word_length]->trie_to_dawg());
166  }
    // NOTE(review): listing line 167 (the name of the function called with
    // the arguments below) is missing from this extract. The count passed is
    // the number of Tries created above.
168  dawg_vec, max_word_length - min_word_length + 1,
169  classify->getDict().dawg_debug_level, dawg_file);
170  fclose(dawg_file);
    // Release the heap-allocated dawgs and tries held by the two vectors.
171  dawg_vec.delete_data_pointers();
172  trie_vec.delete_data_pointers();
173  } else { // should never get here
174  tprintf("Invalid command-line options\n");
175  exit(1);
176  }
177  delete classify;
178  return 0;
179 }