Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicharset.h
Go to the documentation of this file.
1 
2 // File: unicharset.h
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_UNICHARSET_H__
21 #define TESSERACT_CCUTIL_UNICHARSET_H__
22 
23 #include "errcode.h"
24 #include "helpers.h"
25 #include "strngs.h"
26 #include "tesscallback.h"
27 #include "unichar.h"
28 #include "unicharmap.h"
29 
31  public:
32  // Minimum number of characters used for fragment representation.
33  static const int kMinLen = 6;
34  // Maximum number of characters used for fragment representation.
35  static const int kMaxLen = 3 + UNICHAR_LEN + 2;
36  // Maximum number of fragments per character.
37  static const int kMaxChunks = 5;
38 
39  // Setters and Getters.
40  inline void set_all(const char *unichar, int pos, int total, bool natural) {
41  set_unichar(unichar);
42  set_pos(pos);
43  set_total(total);
44  set_natural(natural);
45  }
46  inline void set_unichar(const char *uch) {
47  strncpy(this->unichar, uch, UNICHAR_LEN);
48  this->unichar[UNICHAR_LEN] = '\0';
49  }
50  inline void set_pos(int p) { this->pos = p; }
51  inline void set_total(int t) { this->total = t; }
52  inline const char* get_unichar() const { return this->unichar; }
53  inline int get_pos() const { return this->pos; }
54  inline int get_total() const { return this->total; }
55 
56  // Returns the string that represents a fragment
57  // with the given unichar, pos and total.
58  static STRING to_string(const char *unichar, int pos, int total,
59  bool natural);
60  // Returns the string that represents this fragment.
61  STRING to_string() const {
62  return to_string(unichar, pos, total, natural);
63  }
64 
65  // Checks whether a fragment has the same unichar,
66  // position and total as the given inputs.
67  inline bool equals(const char *other_unichar,
68  int other_pos, int other_total) const {
69  return (strcmp(this->unichar, other_unichar) == 0 &&
70  this->pos == other_pos && this->total == other_total);
71  }
72  inline bool equals(const CHAR_FRAGMENT *other) const {
73  return this->equals(other->get_unichar(),
74  other->get_pos(),
75  other->get_total());
76  }
77 
78  // Checks whether a given fragment is a continuation of this fragment.
79  // Assumes that the given fragment pointer is not NULL.
80  inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
81  return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
82  this->total == fragment->get_total() &&
83  this->pos == fragment->get_pos() + 1);
84  }
85 
86  // Returns true if this fragment is a beginning fragment.
87  inline bool is_beginning() const { return this->pos == 0; }
88 
89  // Returns true if this fragment is an ending fragment.
90  inline bool is_ending() const { return this->pos == this->total-1; }
91 
92  // Returns true if the fragment was a separate component to begin with,
93  // ie did not need chopping to be isolated, but may have been separated
94  // out from a multi-outline blob.
95  inline bool is_natural() const { return natural; }
96  void set_natural(bool value) { natural = value; }
97 
98  // Parses the string to see whether it represents a character fragment
99  // (rather than a regular character). If so, allocates memory for a new
100  // CHAR_FRAGMENT instance and fills it in with the corresponding fragment
101  // information. Fragments are of the form:
102  // |m|1|2, meaning chunk 1 of 2 of character m, or
103  // |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
104  // to divide the parts, as they were already separate connected components.
105  //
106  // If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
107  // instance, otherwise (if the string does not represent a fragment or it
108  // looks like it does, but parsing it as a fragment fails) returns NULL.
109  //
110  // Note: The caller is responsible for deallocating memory
111  // associated with the returned pointer.
112  static CHAR_FRAGMENT *parse_from_string(const char *str);
113 
114  private:
115  char unichar[UNICHAR_LEN + 1];
116  // True if the fragment was a separate component to begin with,
117  // ie did not need chopping to be isolated, but may have been separated
118  // out from a multi-outline blob.
119  bool natural;
120  inT16 pos; // fragment position in the character
121  inT16 total; // total number of fragments in the character
122 };
123 
124 // The UNICHARSET class is a utility class for Tesseract that holds the
125 // set of characters that are used by the engine. Each character is identified
126 // by a unique number, from 0 to (size - 1).
127 class UNICHARSET {
128  public:
129  // Custom list of characters and their ligature forms (UTF8)
130  // These map to unicode values in the private use area (PUC) and are supported
131  // by only few font families (eg. Wyld, Adobe Caslon Pro).
132  static const char* kCustomLigatures[][2];
133 
// ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
// The numeric values mirror ICU's, so they must not be reordered.
enum Direction {
  U_LEFT_TO_RIGHT               = 0,
  U_RIGHT_TO_LEFT               = 1,
  U_EUROPEAN_NUMBER             = 2,
  U_EUROPEAN_NUMBER_SEPARATOR   = 3,
  U_EUROPEAN_NUMBER_TERMINATOR  = 4,
  U_ARABIC_NUMBER               = 5,
  U_COMMON_NUMBER_SEPARATOR     = 6,
  U_BLOCK_SEPARATOR             = 7,
  U_SEGMENT_SEPARATOR           = 8,
  U_WHITE_SPACE_NEUTRAL         = 9,
  U_OTHER_NEUTRAL               = 10,
  U_LEFT_TO_RIGHT_EMBEDDING     = 11,
  U_LEFT_TO_RIGHT_OVERRIDE      = 12,
  U_RIGHT_TO_LEFT_ARABIC        = 13,
  U_RIGHT_TO_LEFT_EMBEDDING     = 14,
  U_RIGHT_TO_LEFT_OVERRIDE      = 15,
  U_POP_DIRECTIONAL_FORMAT      = 16,
  U_DIR_NON_SPACING_MARK        = 17,
  U_BOUNDARY_NEUTRAL            = 18,
  U_CHAR_DIRECTION_COUNT
};
157 
158  // Create an empty UNICHARSET
159  UNICHARSET();
160 
161  ~UNICHARSET();
162 
163  // Return the UNICHAR_ID of a given unichar representation within the
164  // UNICHARSET.
165  const UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
166 
167  // Return the UNICHAR_ID of a given unichar representation within the
168  // UNICHARSET. Only the first length characters from unichar_repr are used.
169  const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
170  int length) const;
171 
172  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
173  // while leaving a legal UNICHAR_ID afterwards. In other words, if there
174  // is both a short and a long match to the string, return the length that
175  // ensures there is a legal match after it.
176  int step(const char* str) const;
177 
178  // Return whether the given UTF-8 string is encodable with this UNICHARSET.
179  // If not encodable, write the first byte offset which cannot be converted
180  // into the second (return) argument.
181  bool encodable_string(const char *str, int *first_bad_position) const;
182 
183  // Return the unichar representation corresponding to the given UNICHAR_ID
184  // within the UNICHARSET.
185  const char* const id_to_unichar(UNICHAR_ID id) const;
186 
187  // Return the UTF8 representation corresponding to the given UNICHAR_ID after
188  // resolving any private encodings internal to Tesseract. This method is
189  // preferable to id_to_unichar for outputting text that will be visible to
190  // external applications.
191  const char* const id_to_unichar_ext(UNICHAR_ID id) const;
192 
193  // Return a STRING that reformats the utf8 str into the str followed
194  // by its hex unicodes.
195  static STRING debug_utf8_str(const char* str);
196 
197  // Return a STRING containing debug information on the unichar, including
198  // the id_to_unichar, its hex unicodes and the properties.
199  STRING debug_str(UNICHAR_ID id) const;
200  STRING debug_str(const char * unichar_repr) const {
201  return debug_str(unichar_to_id(unichar_repr));
202  }
203 
204  // Add a unichar representation to the set.
205  void unichar_insert(const char* const unichar_repr);
206 
207  // Return true if the given unichar id exists within the set.
208  // Relies on the fact that unichar ids are contiguous in the unicharset.
209  bool contains_unichar_id(UNICHAR_ID unichar_id) const {
210  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
211  unichar_id >= 0;
212  }
213 
214  // Return true if the given unichar representation exists within the set.
215  bool contains_unichar(const char* const unichar_repr) const;
216  bool contains_unichar(const char* const unichar_repr, int length) const;
217 
218  // Return true if the given unichar representation corresponds to the given
219  // UNICHAR_ID within the set.
220  bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
221 
222  // Delete CHAR_FRAGMENTs stored in properties of unichars array.
224  for (int i = 0; i < size_used; ++i) {
225  if (unichars[i].properties.fragment != NULL) {
226  delete unichars[i].properties.fragment;
227  unichars[i].properties.fragment = NULL;
228  }
229  }
230  }
231 
232  // Clear the UNICHARSET (all the previous data is lost).
233  void clear() {
234  if (script_table != NULL) {
235  for (int i = 0; i < script_table_size_used; ++i)
236  delete[] script_table[i];
237  delete[] script_table;
238  script_table = NULL;
239  script_table_size_used = 0;
240  }
241  if (unichars != NULL) {
243  delete[] unichars;
244  unichars = NULL;
245  }
246  script_table_size_reserved = 0;
247  size_reserved = 0;
248  size_used = 0;
249  ids.clear();
250  top_bottom_set_ = false;
251  script_has_upper_lower_ = false;
252  script_has_xheight_ = false;
253  null_sid_ = 0;
254  common_sid_ = 0;
255  latin_sid_ = 0;
256  cyrillic_sid_ = 0;
257  greek_sid_ = 0;
258  han_sid_ = 0;
259  hiragana_sid_ = 0;
260  katakana_sid_ = 0;
261  }
262 
263  // Return the size of the set (the number of different UNICHAR it holds).
264  int size() const {
265  return size_used;
266  }
267 
268  // Reserve enough memory space for the given number of UNICHARS
269  void reserve(int unichars_number);
270 
271  // Opens the file indicated by filename and saves unicharset to that file.
272  // Returns true if the operation is successful.
273  bool save_to_file(const char * const filename) const {
274  FILE* file = fopen(filename, "w+b");
275  if (file == NULL) return false;
276  bool result = save_to_file(file);
277  fclose(file);
278  return result;
279  }
280 
281  // Saves the content of the UNICHARSET to the given file.
282  // Returns true if the operation is successful.
283  bool save_to_file(FILE *file) const;
284 
285  // Load a unicharset from a unicharset file that has been loaded into
286  // the given memory buffer.
287  // Returns true if the operation is successful.
288  bool load_from_inmemory_file(const char* const memory, int mem_size,
289  bool skip_fragments);
290  // Returns true if the operation is successful.
291  bool load_from_inmemory_file(const char* const memory, int mem_size) {
292  return load_from_inmemory_file(memory, mem_size, false);
293  }
294 
295  // Opens the file indicated by filename and loads the UNICHARSET
296  // from the given file. The previous data is lost.
297  // Returns true if the operation is successful.
298  bool load_from_file(const char* const filename, bool skip_fragments) {
299  FILE* file = fopen(filename, "rb");
300  if (file == NULL) return false;
301  bool result = load_from_file(file, skip_fragments);
302  fclose(file);
303  return result;
304  }
305  // returns true if the operation is successful.
306  bool load_from_file(const char* const filename) {
307  return load_from_file(filename, false);
308  }
309 
310  // Loads the UNICHARSET from the given file. The previous data is lost.
311  // Returns true if the operation is successful.
312  bool load_from_file(FILE *file, bool skip_fragments);
313  bool load_from_file(FILE *file) { return load_from_file(file, false); }
314 
315  // Sets up internal data after loading the file, based on the char
316  // properties. Called from load_from_file, but also needs to be run
317  // during set_unicharset_properties.
318  void post_load_setup();
319 
320  // Returns true if right_to_left scripts are significant in the unicharset,
321  // but without being so sensitive that "universal" unicharsets containing
322  // characters from many scripts, like orientation and script detection,
323  // look like they are right_to_left.
324  bool major_right_to_left() const;
325 
326  // Set a whitelist and/or blacklist of characters to recognize.
327  // An empty or NULL whitelist enables everything (minus any blacklist).
328  // An empty or NULL blacklist disables nothing.
329  // The blacklist overrides the whitelist.
330  // Each list is a string of utf8 character strings. Boundaries between
331  // unicharset units are worked out automatically, and characters not in
332  // the unicharset are silently ignored.
333  void set_black_and_whitelist(const char* blacklist, const char* whitelist);
334 
335  // Set the isalpha property of the given unichar to the given value.
336  void set_isalpha(UNICHAR_ID unichar_id, bool value) {
337  unichars[unichar_id].properties.isalpha = value;
338  }
339 
340  // Set the islower property of the given unichar to the given value.
341  void set_islower(UNICHAR_ID unichar_id, bool value) {
342  unichars[unichar_id].properties.islower = value;
343  }
344 
345  // Set the isupper property of the given unichar to the given value.
346  void set_isupper(UNICHAR_ID unichar_id, bool value) {
347  unichars[unichar_id].properties.isupper = value;
348  }
349 
350  // Set the isdigit property of the given unichar to the given value.
351  void set_isdigit(UNICHAR_ID unichar_id, bool value) {
352  unichars[unichar_id].properties.isdigit = value;
353  }
354 
355  // Set the ispunctuation property of the given unichar to the given value.
356  void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
357  unichars[unichar_id].properties.ispunctuation = value;
358  }
359 
360  // Set the isngram property of the given unichar to the given value.
361  void set_isngram(UNICHAR_ID unichar_id, bool value) {
362  unichars[unichar_id].properties.isngram = value;
363  }
364 
365  // Set the script name of the given unichar to the given value.
366  // Value is copied and thus can be a temporary;
367  void set_script(UNICHAR_ID unichar_id, const char* value) {
368  unichars[unichar_id].properties.script_id = add_script(value);
369  }
370 
371  // Set other_case unichar id in the properties for the given unichar id.
372  void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
373  unichars[unichar_id].properties.other_case = other_case;
374  }
375 
376  // Set the direction property of the given unichar to the given value.
378  unichars[unichar_id].properties.direction = value;
379  }
380 
381  // Set mirror unichar id in the properties for the given unichar id.
382  void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
383  unichars[unichar_id].properties.mirror = mirror;
384  }
385 
386  // Record normalized version of unichar with the given unichar_id.
387  void set_normed(UNICHAR_ID unichar_id, const char* normed) {
388  unichars[unichar_id].properties.normed = normed;
389  }
390 
391  // Return the isalpha property of the given unichar.
392  bool get_isalpha(UNICHAR_ID unichar_id) const {
393  if (INVALID_UNICHAR_ID == unichar_id) return false;
394  ASSERT_HOST(contains_unichar_id(unichar_id));
395  return unichars[unichar_id].properties.isalpha;
396  }
397 
398  // Return the islower property of the given unichar.
399  bool get_islower(UNICHAR_ID unichar_id) const {
400  if (INVALID_UNICHAR_ID == unichar_id) return false;
401  ASSERT_HOST(contains_unichar_id(unichar_id));
402  return unichars[unichar_id].properties.islower;
403  }
404 
405  // Return the isupper property of the given unichar.
406  bool get_isupper(UNICHAR_ID unichar_id) const {
407  if (INVALID_UNICHAR_ID == unichar_id) return false;
408  ASSERT_HOST(contains_unichar_id(unichar_id));
409  return unichars[unichar_id].properties.isupper;
410  }
411 
412  // Return the isdigit property of the given unichar.
413  bool get_isdigit(UNICHAR_ID unichar_id) const {
414  if (INVALID_UNICHAR_ID == unichar_id) return false;
415  ASSERT_HOST(contains_unichar_id(unichar_id));
416  return unichars[unichar_id].properties.isdigit;
417  }
418 
419  // Return the ispunctuation property of the given unichar.
420  bool get_ispunctuation(UNICHAR_ID unichar_id) const {
421  if (INVALID_UNICHAR_ID == unichar_id) return false;
422  ASSERT_HOST(contains_unichar_id(unichar_id));
423  return unichars[unichar_id].properties.ispunctuation;
424  }
425 
426  // Return the isngram property of the given unichar.
427  bool get_isngram(UNICHAR_ID unichar_id) const {
428  if (INVALID_UNICHAR_ID == unichar_id) return false;
429  ASSERT_HOST(contains_unichar_id(unichar_id));
430  return unichars[unichar_id].properties.isngram;
431  }
432 
433  // Returns whether the unichar id represents a unicode value in the private
434  // use area.
435  bool get_isprivate(UNICHAR_ID unichar_id) const;
436 
437  // Returns true if the ids have useful min/max top/bottom values.
438  bool top_bottom_useful() const {
439  return top_bottom_set_;
440  }
441  // Sets all ranges to empty, so they can be expanded to set the values.
442  void set_ranges_empty();
443  // Sets all the properties for this unicharset given a src_unicharset with
444  // everything set. The unicharsets don't have to be the same, and graphemes
445  // are correctly accounted for.
446  void SetPropertiesFromOther(const UNICHARSET& src);
447  // Expands the tops and bottoms and widths for this unicharset given a
448  // src_unicharset with ranges in it. The unicharsets don't have to be the
449  // same, and graphemes are correctly accounted for.
450  void ExpandRangesFromOther(const UNICHARSET& src);
451  // For each id in src, if it does not occur in this, add it, as in
452  // SetPropertiesFromOther, otherwise expand the ranges, as in
453  // ExpandRangesFromOther.
454  void AppendOtherUnicharset(const UNICHARSET& src);
455  // Returns the min and max bottom and top of the given unichar in
456  // baseline-normalized coordinates, ie, where the baseline is
457  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
458  // (See normalis.h for the definitions).
459  void get_top_bottom(UNICHAR_ID unichar_id,
460  int* min_bottom, int* max_bottom,
461  int* min_top, int* max_top) const {
462  if (INVALID_UNICHAR_ID == unichar_id) {
463  *min_bottom = *min_top = 0;
464  *max_bottom = *max_top = 256; // kBlnCellHeight
465  return;
466  }
467  ASSERT_HOST(contains_unichar_id(unichar_id));
468  *min_bottom = unichars[unichar_id].properties.min_bottom;
469  *max_bottom = unichars[unichar_id].properties.max_bottom;
470  *min_top = unichars[unichar_id].properties.min_top;
471  *max_top = unichars[unichar_id].properties.max_top;
472  }
473  void set_top_bottom(UNICHAR_ID unichar_id,
474  int min_bottom, int max_bottom,
475  int min_top, int max_top) {
476  unichars[unichar_id].properties.min_bottom =
477  static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
478  unichars[unichar_id].properties.max_bottom =
479  static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
480  unichars[unichar_id].properties.min_top =
481  static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
482  unichars[unichar_id].properties.max_top =
483  static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
484  }
485  // Returns the width range of the given unichar in baseline-normalized
486  // coordinates, ie, where the baseline is kBlnBaselineOffset and the
487  // meanline is kBlnBaselineOffset + kBlnXHeight.
488  // (See normalis.h for the definitions).
489  void get_width_range(UNICHAR_ID unichar_id,
490  int* min_width, int* max_width) const {
491  if (INVALID_UNICHAR_ID == unichar_id) {
492  *min_width = 0;
493  *max_width = 256; // kBlnCellHeight;
494  return;
495  }
496  ASSERT_HOST(contains_unichar_id(unichar_id));
497  *min_width = unichars[unichar_id].properties.min_width;
498  *max_width = unichars[unichar_id].properties.max_width;
499  }
500  void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) {
501  unichars[unichar_id].properties.min_width =
502  static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
503  unichars[unichar_id].properties.max_width =
504  static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
505  }
506  // Returns the range of the x-bearing of the given unichar in
507  // baseline-normalized coordinates, ie, where the baseline is
508  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
509  // (See normalis.h for the definitions).
510  void get_bearing_range(UNICHAR_ID unichar_id,
511  int* min_bearing, int* max_bearing) const {
512  if (INVALID_UNICHAR_ID == unichar_id) {
513  *min_bearing = *max_bearing = 0;
514  return;
515  }
516  ASSERT_HOST(contains_unichar_id(unichar_id));
517  *min_bearing = unichars[unichar_id].properties.min_bearing;
518  *max_bearing = unichars[unichar_id].properties.max_bearing;
519  }
520  void set_bearing_range(UNICHAR_ID unichar_id,
521  int min_bearing, int max_bearing) {
522  unichars[unichar_id].properties.min_bearing =
523  static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
524  unichars[unichar_id].properties.max_bearing =
525  static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
526  }
527  // Returns the range of the x-advance of the given unichar in
528  // baseline-normalized coordinates, ie, where the baseline is
529  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
530  // (See normalis.h for the definitions).
531  void get_advance_range(UNICHAR_ID unichar_id,
532  int* min_advance, int* max_advance) const {
533  if (INVALID_UNICHAR_ID == unichar_id) {
534  *min_advance = *max_advance = 0;
535  return;
536  }
537  ASSERT_HOST(contains_unichar_id(unichar_id));
538  *min_advance = unichars[unichar_id].properties.min_advance;
539  *max_advance = unichars[unichar_id].properties.max_advance;
540  }
541  void set_advance_range(UNICHAR_ID unichar_id,
542  int min_advance, int max_advance) {
543  unichars[unichar_id].properties.min_advance =
544  static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
545  unichars[unichar_id].properties.max_advance =
546  static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
547  }
548 
549  // Return the script name of the given unichar.
550  // The returned pointer will always be the same for the same script, it's
551  // managed by unicharset and thus MUST NOT be deleted
552  int get_script(UNICHAR_ID unichar_id) const {
553  if (INVALID_UNICHAR_ID == unichar_id) return null_sid_;
554  ASSERT_HOST(contains_unichar_id(unichar_id));
555  return unichars[unichar_id].properties.script_id;
556  }
557 
558  // Return the character properties, eg. alpha/upper/lower/digit/punct,
559  // as a bit field of unsigned int.
560  unsigned int get_properties(UNICHAR_ID unichar_id) const;
561 
562  // Return the character property as a single char. If a character has
563  // multiple attributes, the main property is defined by the following order:
564  // upper_case : 'A'
565  // lower_case : 'a'
566  // alpha : 'x'
567  // digit : '0'
568  // punctuation: 'p'
569  char get_chartype(UNICHAR_ID unichar_id) const;
570 
571  // Get other_case unichar id in the properties for the given unichar id.
573  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
574  ASSERT_HOST(contains_unichar_id(unichar_id));
575  return unichars[unichar_id].properties.other_case;
576  }
577 
578  // Returns the direction property of the given unichar.
579  Direction get_direction(UNICHAR_ID unichar_id) const {
580  if (INVALID_UNICHAR_ID == unichar_id) return UNICHARSET::U_OTHER_NEUTRAL;
581  ASSERT_HOST(contains_unichar_id(unichar_id));
582  return unichars[unichar_id].properties.direction;
583  }
584 
585  // Get mirror unichar id in the properties for the given unichar id.
586  UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
587  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
588  ASSERT_HOST(contains_unichar_id(unichar_id));
589  return unichars[unichar_id].properties.mirror;
590  }
591 
592  // Returns UNICHAR_ID of the corresponding lower-case unichar.
593  UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
594  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
595  ASSERT_HOST(contains_unichar_id(unichar_id));
596  if (unichars[unichar_id].properties.islower) return unichar_id;
597  return unichars[unichar_id].properties.other_case;
598  }
599 
600  // Returns UNICHAR_ID of the corresponding upper-case unichar.
601  UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
602  if (INVALID_UNICHAR_ID == unichar_id) return INVALID_UNICHAR_ID;
603  ASSERT_HOST(contains_unichar_id(unichar_id));
604  if (unichars[unichar_id].properties.isupper) return unichar_id;
605  return unichars[unichar_id].properties.other_case;
606  }
607 
608  // Return a pointer to the CHAR_FRAGMENT class if the given
609  // unichar id represents a character fragment.
610  const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
611  if (INVALID_UNICHAR_ID == unichar_id) return NULL;
612  ASSERT_HOST(contains_unichar_id(unichar_id));
613  return unichars[unichar_id].properties.fragment;
614  }
615 
616  // Return the isalpha property of the given unichar representation.
617  bool get_isalpha(const char* const unichar_repr) const {
618  return get_isalpha(unichar_to_id(unichar_repr));
619  }
620 
621  // Return the islower property of the given unichar representation.
622  bool get_islower(const char* const unichar_repr) const {
623  return get_islower(unichar_to_id(unichar_repr));
624  }
625 
626  // Return the isupper property of the given unichar representation.
627  bool get_isupper(const char* const unichar_repr) const {
628  return get_isupper(unichar_to_id(unichar_repr));
629  }
630 
631  // Return the isdigit property of the given unichar representation.
632  bool get_isdigit(const char* const unichar_repr) const {
633  return get_isdigit(unichar_to_id(unichar_repr));
634  }
635 
636  // Return the ispunctuation property of the given unichar representation.
637  bool get_ispunctuation(const char* const unichar_repr) const {
638  return get_ispunctuation(unichar_to_id(unichar_repr));
639  }
640 
641  // Return the character properties, eg. alpha/upper/lower/digit/punct,
642  // of the given unichar representation
643  unsigned int get_properties(const char* const unichar_repr) const {
644  return get_properties(unichar_to_id(unichar_repr));
645  }
646 
647  char get_chartype(const char* const unichar_repr) const {
648  return get_chartype(unichar_to_id(unichar_repr));
649  }
650 
651  // Return the script name of the given unichar representation.
652  // The returned pointer will always be the same for the same script, it's
653  // managed by unicharset and thus MUST NOT be deleted
654  int get_script(const char* const unichar_repr) const {
655  return get_script(unichar_to_id(unichar_repr));
656  }
657 
658  // Return a pointer to the CHAR_FRAGMENT class struct if the given
659  // unichar representation represents a character fragment.
660  const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
661  if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
662  !ids.contains(unichar_repr)) {
663  return NULL;
664  }
665  return get_fragment(unichar_to_id(unichar_repr));
666  }
667 
668  // Return the isalpha property of the given unichar representation.
669  // Only the first length characters from unichar_repr are used.
670  bool get_isalpha(const char* const unichar_repr,
671  int length) const {
672  return get_isalpha(unichar_to_id(unichar_repr, length));
673  }
674 
675  // Return the islower property of the given unichar representation.
676  // Only the first length characters from unichar_repr are used.
677  bool get_islower(const char* const unichar_repr,
678  int length) const {
679  return get_islower(unichar_to_id(unichar_repr, length));
680  }
681 
682  // Return the isupper property of the given unichar representation.
683  // Only the first length characters from unichar_repr are used.
684  bool get_isupper(const char* const unichar_repr,
685  int length) const {
686  return get_isupper(unichar_to_id(unichar_repr, length));
687  }
688 
689  // Return the isdigit property of the given unichar representation.
690  // Only the first length characters from unichar_repr are used.
691  bool get_isdigit(const char* const unichar_repr,
692  int length) const {
693  return get_isdigit(unichar_to_id(unichar_repr, length));
694  }
695 
696  // Return the ispunctuation property of the given unichar representation.
697  // Only the first length characters from unichar_repr are used.
698  bool get_ispunctuation(const char* const unichar_repr,
699  int length) const {
700  return get_ispunctuation(unichar_to_id(unichar_repr, length));
701  }
702 
703  // Returns normalized version of unichar with the given unichar_id.
704  const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
705  return unichars[unichar_id].properties.normed.string();
706  }
707 
708  // Return the script name of the given unichar representation.
709  // Only the first length characters from unichar_repr are used.
710  // The returned pointer will always be the same for the same script, it's
711  // managed by unicharset and thus MUST NOT be deleted
712  int get_script(const char* const unichar_repr,
713  int length) const {
714  return get_script(unichar_to_id(unichar_repr, length));
715  }
716 
717  // Return the (current) number of scripts in the script table
718  int get_script_table_size() const {
719  return script_table_size_used;
720  }
721 
722  // Return the script string from its id
723  const char* get_script_from_script_id(int id) const {
724  if (id >= script_table_size_used || id < 0)
725  return null_script;
726  return script_table[id];
727  }
728 
729  // Returns the id from the name of the script, or 0 if script is not found.
730  // Note that this is an expensive operation since it involves iteratively
731  // comparing strings in the script table. To avoid dependency on STL, we
732  // won't use a hash. Instead, the calling function can use this to lookup
733  // and save the ID for relevant scripts for fast comparisons later.
734  int get_script_id_from_name(const char* script_name) const;
735 
736  // Return true if the given script is the null script
737  bool is_null_script(const char* script) const {
738  return script == null_script;
739  }
740 
741  // Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
742  // then the returned id will be the same.
743  // The script parameter is copied and thus can be a temporary.
744  int add_script(const char* script);
745 
746  // Return the enabled property of the given unichar.
747  bool get_enabled(UNICHAR_ID unichar_id) const {
748  return unichars[unichar_id].properties.enabled;
749  }
750 
751 
752  int null_sid() const { return null_sid_; }
753  int common_sid() const { return common_sid_; }
754  int latin_sid() const { return latin_sid_; }
755  int cyrillic_sid() const { return cyrillic_sid_; }
756  int greek_sid() const { return greek_sid_; }
757  int han_sid() const { return han_sid_; }
758  int hiragana_sid() const { return hiragana_sid_; }
759  int katakana_sid() const { return katakana_sid_; }
760  int default_sid() const { return default_sid_; }
761 
762  // Returns true if the unicharset has the concept of upper/lower case.
763  bool script_has_upper_lower() const {
764  return script_has_upper_lower_;
765  }
766  // Returns true if the unicharset has the concept of x-height.
767  // script_has_xheight can be true even if script_has_upper_lower is not,
768  // when the script has a sufficiently predominant top line with ascenders,
769  // such as Devanagari and Thai.
770  bool script_has_xheight() const {
     // Plain accessor: reports the stored script_has_xheight_ flag
     // without recomputing anything.
771  return script_has_xheight_;
772  }
773 
774  private:
775 
// Property record kept for each unichar: boolean flags, bounding-box and
// metric ranges in baseline-normalized coordinates, script id, case and
// mirror partners, normalized form and optional fragment information.
776  struct UNICHAR_PROPERTIES {
777  UNICHAR_PROPERTIES();
778  // Initializes all properties to sensible default values.
779  void Init();
780  // Sets all ranges wide open. Initialization default in case there are
781  // no useful values available.
782  void SetRangesOpen();
783  // Sets all ranges to empty. Used before expanding with font-based data.
784  void SetRangesEmpty();
785  // Returns true if any of the top/bottom/width/bearing/advance ranges is
786  // empty.
787  bool AnyRangeEmpty() const;
788  // Expands the ranges with the ranges from the src properties.
789  void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
790  // Copies the properties from src into this.
791  void CopyFrom(const UNICHAR_PROPERTIES& src);
792 
     // Boolean per-unichar flags. "enabled" is the value reported by
     // UNICHARSET::get_enabled().
793  bool isalpha;
794  bool islower;
795  bool isupper;
796  bool isdigit;
797  bool ispunctuation;
798  bool isngram;
799  bool enabled;
800  // Possible limits of the top and bottom of the bounding box in
801  // baseline-normalized coordinates, ie, where the baseline is
802  // kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
803  // (See normalis.h for the definitions).
804  uinT8 min_bottom;
805  uinT8 max_bottom;
806  uinT8 min_top;
807  uinT8 max_top;
808  // Limits on the widths of bounding box, also in baseline-normalized coords.
809  inT16 min_width;
810  inT16 max_width;
811  // Limits on the x-bearing and advance, also in baseline-normalized coords.
812  inT16 min_bearing;
813  inT16 max_bearing;
814  inT16 min_advance;
815  inT16 max_advance;
     // Script id of this unichar.
     // NOTE(review): presumably an index into the owning unicharset's
     // script_table - confirm.
816  int script_id;
817  UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
818  Direction direction; // direction of this unichar
819  // Mirror property is useful for reverse DAWG lookup for words in
820  // right-to-left languages (e.g. "(word)" would be
821  // '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string).
822  // However, what we want in our DAWG is
823  // '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
824  // '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
825  UNICHAR_ID mirror;
826  STRING normed; // normalized version of this unichar
827  // Contains meta information about the fragment if a unichar represents
828  // a fragment of a character, otherwise should be set to NULL.
829  // It is assumed that character fragments are added to the unicharset
830  // after the corresponding 'base' characters.
     // NOTE(review): ownership of the pointed-to CHAR_FRAGMENT is not
     // visible from this declaration - confirm who deletes it.
831  CHAR_FRAGMENT *fragment;
832  };
833 
// One element of the unichars array: the stored text of a unichar together
// with its property record.
834  struct UNICHAR_SLOT {
     // NOTE(review): sized UNICHAR_LEN + 1, presumably UNICHAR_LEN bytes of
     // text plus a terminating '\0' - confirm.
835  char representation[UNICHAR_LEN + 1];
836  UNICHAR_PROPERTIES properties;
837  };
838 
839  // Gets the properties for a grapheme string, combining properties for
840  // multiple characters in a meaningful way where possible.
841  // Returns false if no valid match was found in the unicharset.
842  // NOTE that script_id, mirror, and other_case refer to this unicharset on
843  // return and will need redirecting if the target unicharset is different.
// Declaration only; the definition is not part of this header.
//   utf8_str: the grapheme string whose properties are requested.
//   props:    receives the combined properties on return.
// NOTE(review): the contents of *props when false is returned are not
// visible from this declaration - confirm before relying on them.
844  bool GetStrProperties(const char* utf8_str,
845  UNICHAR_PROPERTIES* props) const;
846 
847  // Load ourselves from a "file" where our only interface to the file is
848  // an implementation of fgets(). This is the parsing primitive accessed by
849  // the public routines load_from_file() and load_from_inmemory_file().
// Declaration only; the definition is not part of this header.
//   fgets_cb: callback standing in for fgets().
//             NOTE(review): from the template arguments it presumably
//             takes (char* buffer, int size) and returns char* - confirm,
//             and confirm who owns/deletes the callback.
//   skip_fragments: NOTE(review): presumably makes the loader ignore
//             character-fragment entries - confirm against the definition.
// NOTE(review): the bool result presumably reports success - confirm.
850  bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
851  bool skip_fragments);
852 
// Array of per-unichar slots, indexed by UNICHAR_ID (see get_enabled()).
853  UNICHAR_SLOT* unichars;
     // NOTE(review): presumably maps unichar strings to their ids - confirm
     // in unicharmap.h.
854  UNICHARMAP ids;
     // NOTE(review): presumably the number of slots in use and the number
     // allocated in unichars, respectively - confirm.
855  int size_used;
856  int size_reserved;
     // Script names indexed by script id; ids in
     // [0, script_table_size_used) are treated as valid by the lookup code.
857  char** script_table;
858  int script_table_size_used;
     // NOTE(review): presumably the allocated capacity of script_table -
     // confirm.
859  int script_table_size_reserved;
     // Returned for out-of-range script ids and compared by pointer in
     // is_null_script().
860  const char* null_script;
861  // True if the unichars have their tops/bottoms set.
862  bool top_bottom_set_;
863  // True if the unicharset has significant upper/lower case chars.
864  bool script_has_upper_lower_;
865  // True if the unicharset has a significant mean-line with significant
866  // ascenders above that.
867  bool script_has_xheight_;
868 
869  // A few convenient script name-to-id mappings without using a hash.
870  // These are initialized when unicharset file is loaded. Anything
871  // missing from this list can be looked up using get_script_id_from_name.
872  int null_sid_;
873  int common_sid_;
874  int latin_sid_;
875  int cyrillic_sid_;
876  int greek_sid_;
877  int han_sid_;
878  int hiragana_sid_;
879  int katakana_sid_;
880  // The most frequently occurring script in the charset.
881  int default_sid_;
882 };
883 
884 #endif // TESSERACT_CCUTIL_UNICHARSET_H__