Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tessdatamanager.cpp
Go to the documentation of this file.
1 
2 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <stdio.h>
27 
28 #include "serialis.h"
29 #include "strngs.h"
30 #include "tprintf.h"
31 #include "params.h"
32 
33 namespace tesseract {
34 
35 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
36  int i;
37  debug_level_ = debug_level;
38  data_file_ = fopen(data_file_name, "rb");
39  if (data_file_ == NULL) {
40  tprintf("Error opening data file %s\n", data_file_name);
41  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
42  "to the parent directory of your \"tessdata\" directory.\n");
43  return false;
44  }
45  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
46  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
47  if (swap_) {
48  actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
49  }
50  ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
51  fread(offset_table_, sizeof(inT64),
52  actual_tessdata_num_entries_, data_file_);
53  if (swap_) {
54  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
55  offset_table_[i] = reverse64(offset_table_[i]);
56  }
57  }
58  if (debug_level_) {
59  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
60  actual_tessdata_num_entries_);
61  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
62  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
63  }
64  }
65  return true;
66 }
67 
68 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
69  bool newline_end, inT64 num_bytes_to_copy) {
70  if (num_bytes_to_copy == 0) return;
71  int buffer_size = 1024;
72  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
73  buffer_size = num_bytes_to_copy;
74  }
75  inT64 num_bytes_copied = 0;
76  char *chunk = new char[buffer_size];
77  int bytes_read;
78  char last_char = 0x0;
79  while ((bytes_read = fread(chunk, sizeof(char),
80  buffer_size, input_file))) {
81  fwrite(chunk, sizeof(char), bytes_read, output_file);
82  last_char = chunk[bytes_read-1];
83  if (num_bytes_to_copy > 0) {
84  num_bytes_copied += bytes_read;
85  if (num_bytes_copied == num_bytes_to_copy) break;
86  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
87  buffer_size = num_bytes_to_copy - num_bytes_copied;
88  }
89  }
90  }
91  if (newline_end) ASSERT_HOST(last_char == '\n');
92  delete[] chunk;
93 }
94 
95 void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) {
96  fseek(output_file, 0, SEEK_SET);
97  inT32 num_entries = TESSDATA_NUM_ENTRIES;
98  fwrite(&num_entries, sizeof(inT32), 1, output_file);
99  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
100  fclose(output_file);
101 
102  tprintf("TessdataManager combined tesseract data files.\n");
103  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
104  tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
105  }
106 }
107 
// Builds one combined data file at output_filename out of the individual
// component files whose names are language_data_path_prefix followed by each
// entry of kTessdataFileSuffixes. Returns false if the output cannot be
// opened or a required component is missing, true otherwise.
// NOTE(review): the line that opens this definition (listing line 108: return
// type, name and '(') is absent from this extract, as are listing lines 123
// and 129. `type` is used below without a visible declaration, and the
// arguments on line 130 belong to a call whose opening is not shown. Restore
// those lines from the upstream source before compiling.
109  const char *language_data_path_prefix,
110  const char *output_filename) {
111  int i;
112  inT64 offset_table[TESSDATA_NUM_ENTRIES];
// An offset stays at -1 for every component whose file could not be opened.
113  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
114  FILE *output_file = fopen(output_filename, "wb");
115  if (output_file == NULL) {
116  tprintf("Error opening %s for writing\n", output_filename);
117  return false;
118  }
119  // Leave some space for recording the offset_table.
// The skipped region is one inT32 plus TESSDATA_NUM_ENTRIES inT64 values,
// the same amount WriteMetadata writes at the start of the file.
120  fseek(output_file,
121  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
122 
124  bool text_file = false;
125  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
126 
127  // Load individual tessdata components from files.
128  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
130  kTessdataFileSuffixes[i], &type, &text_file));
131  STRING filename = language_data_path_prefix;
132  filename += kTessdataFileSuffixes[i];
133  file_ptr[i] = fopen(filename.string(), "rb");
134  if (file_ptr[i] != NULL) {
// Record where this component starts in the output, then append the whole
// component file (-1 means "copy until the input is exhausted").
135  offset_table[type] = ftell(output_file);
136  CopyFile(file_ptr[i], output_file, text_file, -1);
// The stream is closed here; file_ptr[i] is afterwards only compared against
// NULL to tell whether the component was found.
137  fclose(file_ptr[i]);
138  }
139  }
140 
141  // Make sure that the required components are present.
// On either failure below the output file is closed but not removed, so a
// partially written file remains on disk.
142  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
143  tprintf("Error opening unicharset file\n");
144  fclose(output_file);
145  return false;
146  }
// inttemp is optional, but when it is present pffmtable and normproto must
// be present as well.
147  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
148  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
149  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
150  tprintf("Error opening pffmtable and/or normproto files"
151  " while inttemp file was present\n");
152  fclose(output_file);
153  return false;
154  }
155 
// WriteMetadata fills in the reserved header and closes output_file.
156  WriteMetadata(offset_table, output_file);
157  return true;
158 }
159 
// Writes a new combined data file to new_traineddata_filename. Components
// for which a file in component_filenames could be opened are taken from
// that file; every other component is copied from the already opened
// data_file_ when SeekToStart succeeds for it. Returns false only if the
// output file cannot be opened.
// NOTE(review): the line that opens this definition (listing line 160) and
// listing line 166 are absent from this extract; `type` is used below without
// a visible declaration. Restore those lines from the upstream source before
// compiling.
161  const char *new_traineddata_filename,
162  char **component_filenames,
163  int num_new_components) {
164  int i;
165  inT64 offset_table[TESSDATA_NUM_ENTRIES];
167  bool text_file = false;
168  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
// Start with "no offset recorded" and "no replacement file" for every type.
169  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
170  offset_table[i] = -1;
171  file_ptr[i] = NULL;
172  }
173  FILE *output_file = fopen(new_traineddata_filename, "wb");
174  if (output_file == NULL) {
175  tprintf("Error opening %s for writing\n", new_traineddata_filename);
176  return false;
177  }
178 
179  // Leave some space for recording the offset_table.
180  fseek(output_file,
181  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
182 
183  // Open the files with the new components.
// NOTE(review): the result of TessdataTypeFromFileName is ignored. When it
// returns false, `type` keeps its previous value and is still used to index
// file_ptr; confirm that value can never lie outside the array.
// NOTE(review): a failed fopen is not reported; the NULL entry makes the loop
// below fall back to the component from data_file_. If two names map to the
// same type, the earlier stream is overwritten without being closed.
184  for (i = 0; i < num_new_components; ++i) {
185  TessdataTypeFromFileName(component_filenames[i], &type, &text_file);
186  file_ptr[type] = fopen(component_filenames[i], "rb");
187  }
188 
189  // Write updated data to the output traineddata file.
190  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
191  if (file_ptr[i] != NULL) {
192  // Get the data from the opened component file.
193  offset_table[i] = ftell(output_file);
194  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
195  fclose(file_ptr[i]);
196  } else {
197  // Get this data component from the loaded data file.
// If SeekToStart fails, the component is skipped and its offset stays -1.
198  if (SeekToStart(static_cast<TessdataType>(i))) {
199  offset_table[i] = ftell(output_file);
// The "+ 1" suggests GetEndOffset returns the offset of the component's last
// byte (inclusive) - TODO confirm against its definition.
200  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
201  GetEndOffset(static_cast<TessdataType>(i)) -
202  ftell(data_file_) + 1);
203  }
204  }
205  }
206 
// WriteMetadata fills in the reserved header and closes output_file.
207  WriteMetadata(offset_table, output_file);
208  return true;
209 }
210 
// Looks up suffix in kTessdataFileSuffixes using an exact strcmp match.
// On a match, stores the matching index in *type and the corresponding
// kTessdataFileIsText entry in *text_file, and returns true.
// If nothing matches, prints a message with printf, leaves *type and
// *text_file untouched and returns false.
// NOTE(review): the line that opens this definition (listing line 211) is
// absent from this extract; the name used here is taken from the call on
// listing line 230.
212  const char *suffix, TessdataType *type, bool *text_file) {
213  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
214  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
215  *type = static_cast<TessdataType>(i);
216  *text_file = kTessdataFileIsText[i];
217  return true;
218  }
219  }
220  printf("TessdataManager can't determine which tessdata"
221  " component is represented by %s\n", suffix);
222  return false;
223 }
224 
// Determines the tessdata component type from the extension of filename and
// forwards to TessdataTypeFromFileSuffix with the text after the last '.'.
// Returns false, without touching *type or *text_file, when filename has no
// '.' or nothing follows the last '.'.
// NOTE(review): the line that opens this definition (listing line 225) is
// absent from this extract; the name used here is taken from the call on
// listing line 185.
226  const char *filename, TessdataType *type, bool *text_file) {
227  // Get the file suffix (extension)
228  const char *suffix = strrchr(filename, '.');
// ++suffix steps over the '.', so the emptiness test applies to the extension.
229  if (suffix == NULL || *(++suffix) == '\0') return false;
230  return TessdataTypeFromFileSuffix(suffix, type, text_file);
231 }
232 
// Writes one component of the loaded data file out to filename. The visible
// code seeks to the component selected by `type`, creates filename, copies
// the component into it and returns true; it returns false when SeekToStart
// fails.
// NOTE(review): listing lines 233, 234, 236 and 247 are absent from this
// extract. The signature and the declaration of `type` are not visible, and
// the arguments on lines 237 and 248-249 belong to calls whose openings are
// not shown. Restore those lines from the upstream source before compiling.
235  bool text_file = false;
237  filename, &type, &text_file));
238  if (!SeekToStart(type)) return false;
239 
240  FILE *output_file = fopen(filename, "wb");
// Unlike the other failure paths in this file, this one terminates the
// process instead of returning false.
241  if (output_file == NULL) {
242  printf("Error openning %s\n", filename);
243  exit(1);
244  }
// After SeekToStart the data file position is where the component begins.
245  inT64 begin_offset = ftell(GetDataFilePtr());
246  inT64 end_offset = GetEndOffset(type);
// The "+ 1" suggests end_offset is the offset of the component's last byte
// (inclusive) - TODO confirm against GetEndOffset.
248  GetDataFilePtr(), output_file, text_file,
249  end_offset - begin_offset + 1);
250  fclose(output_file);
251  return true;
252 }
253 
254 } // namespace tesseract