Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
shapetable.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: shapetable.cpp
5 // Description: Class to map a classifier shape index to unicharset
6 // indices and font indices.
7 // Author: Ray Smith
8 // Created: Tue Nov 02 15:31:32 PDT 2010
9 //
10 // (C) Copyright 2010, Google Inc.
11 // Licensed under the Apache License, Version 2.0 (the "License");
12 // you may not use this file except in compliance with the License.
13 // You may obtain a copy of the License at
14 // http://www.apache.org/licenses/LICENSE-2.0
15 // Unless required by applicable law or agreed to in writing, software
16 // distributed under the License is distributed on an "AS IS" BASIS,
17 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 // See the License for the specific language governing permissions and
19 // limitations under the License.
20 //
22 
23 #include "shapetable.h"
24 
25 #include "intfeaturespace.h"
26 #include "strngs.h"
27 #include "unicharset.h"
28 
29 namespace tesseract {
30 
31 // Writes to the given file. Returns false in case of error.
32 bool UnicharAndFonts::Serialize(FILE* fp) const {
33  if (fwrite(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
34  if (!font_ids.Serialize(fp)) return false;
35  return true;
36 }
37 // Reads from the given file. Returns false in case of error.
38 // If swap is true, assumes a big/little-endian swap is needed.
39 bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
40  if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
41  if (swap)
42  ReverseN(&unichar_id, sizeof(unichar_id));
43  if (!font_ids.DeSerialize(swap, fp)) return false;
44  return true;
45 }
46 
47 // Sort function to sort a pair of UnicharAndFonts by unichar_id.
48 int UnicharAndFonts::SortByUnicharId(const void* v1, const void* v2) {
49  const UnicharAndFonts* p1 = reinterpret_cast<const UnicharAndFonts*>(v1);
50  const UnicharAndFonts* p2 = reinterpret_cast<const UnicharAndFonts*>(v2);
51  return p1->unichar_id - p2->unichar_id;
52 }
53 
54 // Writes to the given file. Returns false in case of error.
55 bool Shape::Serialize(FILE* fp) const {
56  uinT8 sorted = unichars_sorted_;
57  if (fwrite(&sorted, sizeof(sorted), 1, fp) != 1)
58  return false;
59  if (!unichars_.SerializeClasses(fp)) return false;
60  return true;
61 }
62 // Reads from the given file. Returns false in case of error.
63 // If swap is true, assumes a big/little-endian swap is needed.
64 bool Shape::DeSerialize(bool swap, FILE* fp) {
65  uinT8 sorted;
66  if (fread(&sorted, sizeof(sorted), 1, fp) != 1)
67  return false;
68  unichars_sorted_ = sorted != 0;
69  if (!unichars_.DeSerializeClasses(swap, fp)) return false;
70  return true;
71 }
72 
73 // Adds a font_id for the given unichar_id. If the unichar_id is not
74 // in the shape, it is added.
75 void Shape::AddToShape(int unichar_id, int font_id) {
76  for (int c = 0; c < unichars_.size(); ++c) {
77  if (unichars_[c].unichar_id == unichar_id) {
78  // Found the unichar in the shape table.
79  GenericVector<int>& font_list = unichars_[c].font_ids;
80  for (int f = 0; f < font_list.size(); ++f) {
81  if (font_list[f] == font_id)
82  return; // Font is already there.
83  }
84  font_list.push_back(font_id);
85  return;
86  }
87  }
88  // Unichar_id is not in shape, so add it to shape.
89  unichars_.push_back(UnicharAndFonts(unichar_id, font_id));
90  unichars_sorted_ = unichars_.size() <= 1;
91 }
92 
93 // Adds everything in other to this.
94 void Shape::AddShape(const Shape& other) {
95  for (int c = 0; c < other.unichars_.size(); ++c) {
96  for (int f = 0; f < other.unichars_[c].font_ids.size(); ++f) {
97  AddToShape(other.unichars_[c].unichar_id,
98  other.unichars_[c].font_ids[f]);
99  }
100  }
101  unichars_sorted_ = unichars_.size() <= 1;
102 }
103 
104 // Returns true if the shape contains the given unichar_id, font_id pair.
105 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const {
106  for (int c = 0; c < unichars_.size(); ++c) {
107  if (unichars_[c].unichar_id == unichar_id) {
108  // Found the unichar, so look for the font.
109  GenericVector<int>& font_list = unichars_[c].font_ids;
110  for (int f = 0; f < font_list.size(); ++f) {
111  if (font_list[f] == font_id)
112  return true;
113  }
114  return false;
115  }
116  }
117  return false;
118 }
119 
120 // Returns true if the shape contains the given unichar_id, ignoring font.
121 bool Shape::ContainsUnichar(int unichar_id) const {
122  for (int c = 0; c < unichars_.size(); ++c) {
123  if (unichars_[c].unichar_id == unichar_id) {
124  return true;
125  }
126  }
127  return false;
128 }
129 
130 // Returns true if the shape contains the given font, ignoring unichar_id.
131 bool Shape::ContainsFont(int font_id) const {
132  for (int c = 0; c < unichars_.size(); ++c) {
133  GenericVector<int>& font_list = unichars_[c].font_ids;
134  for (int f = 0; f < font_list.size(); ++f) {
135  if (font_list[f] == font_id)
136  return true;
137  }
138  }
139  return false;
140 }
141 
142 // Returns true if this is a subset (including equal) of other.
143 bool Shape::IsSubsetOf(const Shape& other) const {
144  for (int c = 0; c < unichars_.size(); ++c) {
145  int unichar_id = unichars_[c].unichar_id;
146  const GenericVector<int>& font_list = unichars_[c].font_ids;
147  for (int f = 0; f < font_list.size(); ++f) {
148  if (!other.ContainsUnicharAndFont(unichar_id, font_list[f]))
149  return false;
150  }
151  }
152  return true;
153 }
154 
155 // Returns true if the lists of unichar ids are the same in this and other,
156 // ignoring fonts.
157 // NOT const, as it will sort the unichars on demand.
159  if (unichars_.size() != other->unichars_.size()) return false;
160  if (!unichars_sorted_) SortUnichars();
161  if (!other->unichars_sorted_) other->SortUnichars();
162  for (int c = 0; c < unichars_.size(); ++c) {
163  if (unichars_[c].unichar_id != other->unichars_[c].unichar_id)
164  return false;
165  }
166  return true;
167 }
168 
169 // Sorts the unichars_ vector by unichar.
170 void Shape::SortUnichars() {
172  unichars_sorted_ = true;
173 }
174 
175 ShapeTable::ShapeTable() : unicharset_(NULL) {
176 }
178  : unicharset_(&unicharset) {
179 }
180 
181 // Writes to the given file. Returns false in case of error.
182 bool ShapeTable::Serialize(FILE* fp) const {
183  if (!shape_table_.Serialize(fp)) return false;
184  return true;
185 }
186 // Reads from the given file. Returns false in case of error.
187 // If swap is true, assumes a big/little-endian swap is needed.
188 bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
189  if (!shape_table_.DeSerialize(swap, fp)) return false;
190  return true;
191 }
192 
193 // Returns a string listing the classes/fonts in a shape.
194 STRING ShapeTable::DebugStr(int shape_id) const {
195  if (shape_id < 0 || shape_id >= shape_table_.size())
196  return STRING("INVALID_UNICHAR_ID");
197  const Shape& shape = GetShape(shape_id);
198  STRING result;
199  result.add_str_int("Shape", shape_id);
200  if (shape.size() > 100) {
201  result.add_str_int(" Num unichars=", shape.size());
202  return result;
203  }
204  for (int c = 0; c < shape.size(); ++c) {
205  result.add_str_int(" c_id=", shape[c].unichar_id);
206  result += "=";
207  result += unicharset_->id_to_unichar(shape[c].unichar_id);
208  if (shape.size() < 10) {
209  result.add_str_int(", ", shape[c].font_ids.size());
210  result += " fonts =";
211  int num_fonts = shape[c].font_ids.size();
212  if (num_fonts > 10) {
213  result.add_str_int(" ", shape[c].font_ids[0]);
214  result.add_str_int(" ... ", shape[c].font_ids[num_fonts - 1]);
215  } else {
216  for (int f = 0; f < num_fonts; ++f) {
217  result.add_str_int(" ", shape[c].font_ids[f]);
218  }
219  }
220  }
221  }
222  return result;
223 }
224 
225 // Returns a debug string summarizing the table.
227  int max_unichars = 0;
228  int num_multi_shapes = 0;
229  int num_master_shapes = 0;
230  for (int s = 0; s < shape_table_.size(); ++s) {
231  if (MasterDestinationIndex(s) != s) continue;
232  ++num_master_shapes;
233  int shape_size = GetShape(s).size();
234  if (shape_size > 1)
235  ++num_multi_shapes;
236  if (shape_size > max_unichars)
237  max_unichars = shape_size;
238  }
239  STRING result;
240  result.add_str_int("Number of shapes = ", num_master_shapes);
241  result.add_str_int(" max unichars = ", max_unichars);
242  result.add_str_int(" number with multiple unichars = ", num_multi_shapes);
243  return result;
244 }
245 
246 
247 // Adds a new shape starting with the given unichar_id and font_id.
248 // Returns the assigned index.
249 int ShapeTable::AddShape(int unichar_id, int font_id) {
250  int index = shape_table_.size();
251  Shape* shape = new Shape;
252  shape->AddToShape(unichar_id, font_id);
253  shape_table_.push_back(shape);
254  return index;
255 }
256 
257 // Adds a copy of the given shape.
258 // Returns the assigned index.
259 int ShapeTable::AddShape(const Shape& other) {
260  int index = shape_table_.size();
261  Shape* shape = new Shape(other);
262  shape_table_.push_back(shape);
263  return index;
264 }
265 
266 // Removes the shape given by the shape index.
267 void ShapeTable::DeleteShape(int shape_id) {
268  delete shape_table_[shape_id];
269  shape_table_[shape_id] = NULL;
270  shape_table_.remove(shape_id);
271 }
272 
273 // Adds a font_id to the given existing shape index for the given
274 // unichar_id. If the unichar_id is not in the shape, it is added.
275 void ShapeTable::AddToShape(int shape_id, int unichar_id, int font_id) {
276  Shape& shape = *shape_table_[shape_id];
277  shape.AddToShape(unichar_id, font_id);
278 }
279 
280 // Adds the given shape to the existing shape with the given index.
281 void ShapeTable::AddShapeToShape(int shape_id, const Shape& other) {
282  Shape& shape = *shape_table_[shape_id];
283  shape.AddShape(other);
284 }
285 
286 // Returns the id of the shape that contains the given unichar and font.
287 // If not found, returns -1.
288 // If font_id < 0, the font_id is ignored and the first shape that matches
289 // the unichar_id is returned.
290 int ShapeTable::FindShape(int unichar_id, int font_id) const {
291  for (int s = 0; s < shape_table_.size(); ++s) {
292  const Shape& shape = GetShape(s);
293  for (int c = 0; c < shape.size(); ++c) {
294  if (shape[c].unichar_id == unichar_id) {
295  if (font_id < 0)
296  return s; // We don't care about the font.
297  for (int f = 0; f < shape[c].font_ids.size(); ++f) {
298  if (shape[c].font_ids[f] == font_id)
299  return s;
300  }
301  }
302  }
303  }
304  return -1;
305 }
306 
307 // Returns the first unichar_id and font_id in the given shape.
309  int* unichar_id, int* font_id) const {
310  const UnicharAndFonts& unichar_and_fonts = (*shape_table_[shape_id])[0];
311  *unichar_id = unichar_and_fonts.unichar_id;
312  *font_id = unichar_and_fonts.font_ids[0];
313 }
314 
315 // Expands all the classes/fonts in the shape individually to build
316 // a ShapeTable.
318  const ShapeTable& master_shapes) {
319  int num_masters = 0;
320  for (int u_ind = 0; u_ind < shape.size(); ++u_ind) {
321  for (int f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) {
322  int c = shape[u_ind].unichar_id;
323  int f = shape[u_ind].font_ids[f_ind];
324  if (FindShape(c, f) < 0) {
325  int shape_id = AddShape(c, f);
326  int master_id = master_shapes.FindShape(c, f);
327  if (master_id >= 0 && shape.size() > 1) {
328  const Shape& master = master_shapes.GetShape(master_id);
329  if (master.IsSubsetOf(shape) && !shape.IsSubsetOf(master)) {
330  // Add everything else from the master shape.
331  shape_table_[shape_id]->AddShape(master);
332  ++num_masters;
333  }
334  }
335  }
336  }
337  }
338  return num_masters;
339 }
340 
341 // Returns true if the shapes are already merged.
342 bool ShapeTable::AlreadyMerged(int shape_id1, int shape_id2) const {
343  return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2);
344 }
345 
346 // Returns true if any shape contains multiple unichars.
348  int num_shapes = NumShapes();
349  for (int s1 = 0; s1 < num_shapes; ++s1) {
350  if (MasterDestinationIndex(s1) != s1) continue;
351  if (GetShape(s1).size() > 1)
352  return true;
353  }
354  return false;
355 }
356 
357 // Returns the maximum number of unichars over all shapes.
359  int max_num_unichars = 0;
360  int num_shapes = NumShapes();
361  for (int s = 0; s < num_shapes; ++s) {
362  if (GetShape(s).size() > max_num_unichars)
363  max_num_unichars = GetShape(s).size();
364  }
365  return max_num_unichars;
366 }
367 
368 
369 // Merges shapes with a common unichar over the [start, end) interval.
370 // Assumes single unichar per shape.
371 void ShapeTable::ForceFontMerges(int start, int end) {
372  for (int s1 = start; s1 < end; ++s1) {
373  if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) {
374  int unichar_id = GetShape(s1)[0].unichar_id;
375  for (int s2 = s1 + 1; s2 < end; ++s2) {
376  if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 &&
377  unichar_id == GetShape(s2)[0].unichar_id) {
378  MergeShapes(s1, s2);
379  }
380  }
381  }
382  }
383  ShapeTable compacted(*unicharset_);
384  compacted.AppendMasterShapes(*this);
385  *this = compacted;
386 }
387 
388 // Returns the number of unichars in the master shape.
389 int ShapeTable::MasterUnicharCount(int shape_id) const {
390  int master_id = MasterDestinationIndex(shape_id);
391  return GetShape(master_id).size();
392 }
393 
394 // Returns the sum of the font counts in the master shape.
395 int ShapeTable::MasterFontCount(int shape_id) const {
396  int master_id = MasterDestinationIndex(shape_id);
397  const Shape& shape = GetShape(master_id);
398  int font_count = 0;
399  for (int c = 0; c < shape.size(); ++c) {
400  font_count += shape[c].font_ids.size();
401  }
402  return font_count;
403 }
404 
405 // Returns the number of unichars that would result from merging the shapes.
406 int ShapeTable::MergedUnicharCount(int shape_id1, int shape_id2) const {
407  // Do it the easy way for now.
408  int master_id1 = MasterDestinationIndex(shape_id1);
409  int master_id2 = MasterDestinationIndex(shape_id2);
410  Shape combined_shape(*shape_table_[master_id1]);
411  combined_shape.AddShape(*shape_table_[master_id2]);
412  return combined_shape.size();
413 }
414 
415 // Merges two shape_ids, leaving shape_id2 marked as merged.
416 void ShapeTable::MergeShapes(int shape_id1, int shape_id2) {
417  int master_id1 = MasterDestinationIndex(shape_id1);
418  int master_id2 = MasterDestinationIndex(shape_id2);
419  // Point master_id2 (and all merged shapes) to master_id1.
420  shape_table_[master_id2]->set_destination_index(master_id1);
421  // Add all the shapes of master_id2 to master_id1.
422  shape_table_[master_id1]->AddShape(*shape_table_[master_id2]);
423 }
424 
425 // Returns the destination of this shape, (if merged), taking into account
426 // the fact that the destination may itself have been merged.
427 int ShapeTable::MasterDestinationIndex(int shape_id) const {
428  int dest_id = shape_table_[shape_id]->destination_index();
429  if (dest_id == shape_id || dest_id < 0)
430  return shape_id; // Is master already.
431  int master_id = shape_table_[dest_id]->destination_index();
432  if (master_id == dest_id || master_id < 0)
433  return dest_id; // Dest is the master and shape_id points to it.
434  master_id = MasterDestinationIndex(master_id);
435  return master_id;
436 }
437 
438 // Appends the master shapes from other to this.
440  for (int s = 0; s < other.shape_table_.size(); ++s) {
441  if (other.shape_table_[s]->destination_index() < 0) {
442  AddShape(*other.shape_table_[s]);
443  }
444  }
445 }
446 
447 // Returns the number of master shapes remaining after merging.
449  int num_shapes = 0;
450  for (int s = 0; s < shape_table_.size(); ++s) {
451  if (shape_table_[s]->destination_index() < 0)
452  ++num_shapes;
453  }
454  return num_shapes;
455 }
456 
457 
458 } // namespace tesseract
459 
460