Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
unicharset.cpp
Go to the documentation of this file.
1 
2 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include <assert.h>
21 #include <stdio.h>
22 #include <string.h>
23 
24 #include "tesscallback.h"
25 #include "tprintf.h"
26 #include "unichar.h"
27 #include "unicharset.h"
28 #include "params.h"
29 
// Delimiter that brackets the parts of a character-fragment representation.
static const char kSeparator = '|';
// Marker that takes the place of a delimiter for 'natural' fragments.
static const char kNaturalFlag = 'n';

// Bit flags used to pack the boolean character properties into one integer.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
static const int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static const int kMeanlineThreshold = 220;
// With C = number of alpha chars whose tops all exceed kMeanlineThreshold and
// X = number of alpha chars whose tops are all below it, the unicharset
// "has x-height" when X > C * kMinXHeightFraction and
// C > X * kMinCapHeightFraction, or when more than half of the alpha
// characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
53 
54 /*static */
55 const char* UNICHARSET::kCustomLigatures[][2] = {
56  {"ct", "\uE003"}, // c + t -> U+E003
57  {"ſh", "\uE006"}, // long-s + h -> U+E006
58  {"ſi", "\uE007"}, // long-s + i -> U+E007
59  {"ſl", "\uE008"}, // long-s + l -> U+E008
60  {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
61  {NULL, NULL}
62 };
63 
64 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
65  Init();
66 }
67 
68 // Initialize all properties to sensible default values.
69 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
70  isalpha = false;
71  islower = false;
72  isupper = false;
73  isdigit = false;
74  ispunctuation = false;
75  isngram = false;
76  enabled = false;
77  SetRangesOpen();
78  script_id = 0;
79  other_case = 0;
80  mirror = 0;
81  normed = "";
83  fragment = NULL;
84 }
85 
86 // Sets all ranges wide open. Initialization default in case there are
87 // no useful values available.
88 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
89  min_bottom = 0;
90  max_bottom = MAX_UINT8;
91  min_top = 0;
92  max_top = MAX_UINT8;
93  min_width = 0;
94  max_width = MAX_INT16;
95  min_bearing = 0;
96  max_bearing = MAX_INT16;
97  min_advance = 0;
98  max_advance = MAX_INT16;
99 }
100 
101 // Sets all ranges to empty. Used before expanding with font-based data.
102 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
103  min_bottom = MAX_UINT8;
104  max_bottom = 0;
105  min_top = MAX_UINT8;
106  max_top = 0;
107  min_width = MAX_INT16;
108  max_width = 0;
109  min_bearing = MAX_INT16;
110  max_bearing = 0;
111  min_advance = MAX_INT16;
112  max_advance = 0;
113 }
114 
115 // Returns true if any of the top/bottom/width/bearing/advance ranges is
116 // emtpy.
117 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
118  return min_bottom > max_bottom || min_top > max_top ||
119  min_width > max_width || min_bearing > max_bearing ||
120  min_advance > max_advance;
121 }
122 
123 // Expands the ranges with the ranges from the src properties.
124 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
125  const UNICHAR_PROPERTIES& src) {
126  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
127  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
128  UpdateRange(src.min_top, &min_top, &max_top);
129  UpdateRange(src.max_top, &min_top, &max_top);
130  UpdateRange(src.min_width, &min_width, &max_width);
131  UpdateRange(src.max_width, &min_width, &max_width);
132  UpdateRange(src.min_bearing, &min_bearing, &max_bearing);
133  UpdateRange(src.max_bearing, &min_bearing, &max_bearing);
134  UpdateRange(src.min_advance, &min_advance, &max_advance);
135  UpdateRange(src.max_advance, &min_advance, &max_advance);
136 }
137 
138 // Copies the properties from src into this.
139 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
140  // Apart from the fragment, everything else can be done with a default copy.
141  CHAR_FRAGMENT* saved_fragment = fragment;
142  *this = src; // Bitwise copy.
143  fragment = saved_fragment;
144 }
145 
147  unichars(NULL),
148  ids(),
149  size_used(0),
150  size_reserved(0),
151  script_table(NULL),
152  script_table_size_used(0),
153  null_script("NULL") {
154  clear();
155 }
156 
158  clear();
159 }
160 
161 void UNICHARSET::reserve(int unichars_number) {
162  if (unichars_number > size_reserved) {
163  UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
164  for (int i = 0; i < size_used; ++i)
165  unichars_new[i] = unichars[i];
166  for (int j = size_used; j < unichars_number; ++j) {
167  unichars_new[j].properties.script_id = add_script(null_script);
168  }
169  delete[] unichars;
170  unichars = unichars_new;
171  size_reserved = unichars_number;
172  }
173 }
174 
175 const UNICHAR_ID
176 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
177  return ids.contains(unichar_repr) ?
178  ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
179 }
180 
181 const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
182  int length) const {
183  assert(length > 0 && length <= UNICHAR_LEN);
184  return ids.contains(unichar_repr, length) ?
185  ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
186 }
187 
188 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
189 // while leaving a legal UNICHAR_ID afterwards. In other words, if there
190 // is both a short and a long match to the string, return the length that
191 // ensures there is a legal match after it.
192 int UNICHARSET::step(const char* str) const {
193  // Find the length of the first matching unicharset member.
194  int minlength = ids.minmatch(str);
195  if (minlength == 0)
196  return 0; // Empty string or illegal char.
197 
198  int goodlength = minlength;
199  while (goodlength <= UNICHAR_LEN) {
200  if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
201  return goodlength; // This length works!
202 
203  // The next char is illegal so find the next usable length.
204  do {
205  ++goodlength;
206  } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
207  !ids.contains(str, goodlength));
208  if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
209  // This does not constitute a good length!
210  return minlength;
211  }
212  }
213  // Search to find a subsequent legal char failed so return the minlength.
214  return minlength;
215 }
216 
217 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
218 // If not encodable, write the first byte offset which cannot be converted
219 // into the second (return) argument.
220 bool UNICHARSET::encodable_string(const char *str,
221  int *first_bad_position) const {
222  for (int i = 0, len = strlen(str); i < len; ) {
223  int increment = step(str + i);
224  if (increment == 0) {
225  if (first_bad_position) *first_bad_position = i;
226  return false;
227  }
228  i += increment;
229  }
230  return true;
231 }
232 
233 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
234  if (id == INVALID_UNICHAR_ID) {
235  return INVALID_UNICHAR;
236  }
237  ASSERT_HOST(id < this->size());
238  return unichars[id].representation;
239 }
240 
241 const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
242  if (id == INVALID_UNICHAR_ID) {
243  return INVALID_UNICHAR;
244  }
245  ASSERT_HOST(id < this->size());
246  // Resolve from the kCustomLigatures table if this is a private encoding.
247  if (get_isprivate(id)) {
248  const char* ch = id_to_unichar(id);
249  for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
250  if (!strcmp(ch, kCustomLigatures[i][1])) {
251  return kCustomLigatures[i][0];
252  }
253  }
254  }
255  // Otherwise return the stored representation.
256  return unichars[id].representation;
257 }
258 
259 // Return a STRING that reformats the utf8 str into the str followed
260 // by its hex unicodes.
262  STRING result = str;
263  result += " [";
264  int step = 1;
265  // Chop into unicodes and code each as hex.
266  for (int i = 0; str[i] != '\0'; i += step) {
267  char hex[sizeof(int) * 2 + 1];
268  step = UNICHAR::utf8_step(str + i);
269  if (step == 0) {
270  step = 1;
271  sprintf(hex, "%x", str[i]);
272  } else {
273  UNICHAR ch(str + i, step);
274  sprintf(hex, "%x", ch.first_uni());
275  }
276  result += hex;
277  result += " ";
278  }
279  result += "]";
280  return result;
281 }
282 
283 // Return a STRING containing debug information on the unichar, including
284 // the id_to_unichar, its hex unicodes and the properties.
286  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
287  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
288  if (fragment) {
289  return fragment->to_string();
290  }
291  const char* str = id_to_unichar(id);
292  STRING result = debug_utf8_str(str);
293  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
294  if (get_isalpha(id)) {
295  if (get_islower(id))
296  result += "a";
297  else if (get_isupper(id))
298  result += "A";
299  else
300  result += "x";
301  }
302  // Append 0 if a digit.
303  if (get_isdigit(id)) {
304  result += "0";
305  }
306  // Append p if a punctuation symbol.
307  if (get_ispunctuation(id)) {
308  result += "p";
309  }
310  return result;
311 }
312 
313 // Returns whether the unichar id represents a unicode value in the private use
314 // area. We use this range only internally to represent uncommon ligatures
315 // (eg. 'ct') that do not have regular unicode values.
316 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
317  UNICHAR uc(id_to_unichar(unichar_id), -1);
318  int uni = uc.first_uni();
319  return (uni >= 0xE000 && uni <= 0xF8FF);
320 }
321 
322 
323 // Sets all ranges to empty, so they can be expanded to set the values.
325  for (int id = 0; id < size_used; ++id) {
326  unichars[id].properties.SetRangesEmpty();
327  }
328 }
329 
330 // Sets all the properties for this unicharset given a src unicharset with
331 // everything set. The unicharsets don't have to be the same, and graphemes
332 // are correctly accounted for.
334  for (int ch = 0; ch < size_used; ++ch) {
335  const char* utf8 = id_to_unichar(ch);
336  UNICHAR_PROPERTIES properties;
337  if (src.GetStrProperties(utf8, &properties)) {
338  // Setup the script_id, other_case, and mirror properly.
339  const char* script = src.get_script_from_script_id(properties.script_id);
340  properties.script_id = add_script(script);
341  const char* other_case = src.id_to_unichar(properties.other_case);
342  if (contains_unichar(other_case)) {
343  properties.other_case = unichar_to_id(other_case);
344  } else {
345  properties.other_case = ch;
346  }
347  const char* mirror_str = src.id_to_unichar(properties.mirror);
348  if (contains_unichar(mirror_str)) {
349  properties.mirror = unichar_to_id(mirror_str);
350  } else {
351  properties.mirror = ch;
352  }
353  unichars[ch].properties.CopyFrom(properties);
354  }
355  }
356 }
357 
358 // Expands the tops and bottoms and widths for this unicharset given a
359 // src unicharset with ranges in it. The unicharsets don't have to be the
360 // same, and graphemes are correctly accounted for.
362  for (int ch = 0; ch < size_used; ++ch) {
363  const char* utf8 = id_to_unichar(ch);
364  UNICHAR_PROPERTIES properties;
365  if (src.GetStrProperties(utf8, &properties)) {
366  // Expand just the ranges from properties.
367  unichars[ch].properties.ExpandRangesFrom(properties);
368  }
369  }
370 }
371 
372 // For each id in src, if it does not occur in this, add it, as in
373 // SetPropertiesFromOther, otherwise expand the ranges, as in
374 // ExpandRangesFromOther.
376  for (int ch = 0; ch < src.size_used; ++ch) {
377  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
378  const char* utf8 = src.id_to_unichar(ch);
379  if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
380  // Only use fully valid entries.
381  tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
382  utf8, src_props.min_bottom, src_props.max_bottom,
383  src_props.min_top, src_props.max_top,
384  src_props.min_width, src_props.max_width,
385  src_props.min_bearing, src_props.max_bearing,
386  src_props.min_advance, src_props.max_advance);
387  continue;
388  }
389  int id = size_used;
390  if (contains_unichar(utf8)) {
391  id = unichar_to_id(utf8);
392  } else {
393  unichar_insert(utf8);
394  unichars[id].properties.SetRangesEmpty();
395  }
396  if (!unichars[id].properties.AnyRangeEmpty()) {
397  // Just expand current ranges.
398  unichars[id].properties.ExpandRangesFrom(src_props);
399  } else {
400  // Copy properties from src_props.
401  unichars[id].properties.CopyFrom(src_props);
402  // Setup the script_id, other_case and mirror properly.
403  const char* script = src.get_script_from_script_id(src_props.script_id);
404  unichars[id].properties.script_id = add_script(script);
405  const char* other_case = src.id_to_unichar(src_props.other_case);
406  if (!contains_unichar(other_case)) {
407  unichar_insert(other_case);
408  unichars[size_used - 1].properties.SetRangesEmpty();
409  // Other_case will have its ranges set later as it is contained in src.
410  }
411  unichars[id].properties.other_case = unichar_to_id(other_case);
412  const char* mirror_str = src.id_to_unichar(src_props.mirror);
413  if (!contains_unichar(mirror_str)) {
414  unichar_insert(mirror_str);
415  unichars[size_used - 1].properties.SetRangesEmpty();
416  // Mirror will have its ranges set later as it is contained in src.
417  }
418  unichars[id].properties.mirror = unichar_to_id(mirror_str);
419  }
420  }
421 }
422 
423 // Gets the properties for a grapheme string, combining properties for
424 // multiple characters in a meaningful way where possible.
425 // Returns false if no valid match was found in the unicharset.
426 // NOTE that script_id, mirror, and other_case refer to this unicharset on
427 // return and will need translation if the target unicharset is different.
428 bool UNICHARSET::GetStrProperties(const char* utf8_str,
429  UNICHAR_PROPERTIES* props) const {
430  props->Init();
431  props->SetRangesEmpty();
432  props->min_advance = 0;
433  props->max_advance = 0;
434  int utf8_step = 0;
435  int total_unicodes = 0;
436  for (int offset = 0; utf8_str[offset] != '\0'; offset += utf8_step) {
437  utf8_step = step(utf8_str + offset);
438  if (utf8_step == 0) return false;
439  int id = unichar_to_id(utf8_str + offset, utf8_step);
440  if (id < 0) return false;
441  const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
442  // Logical OR all the bools.
443  if (src_props.isalpha) props->isalpha = true;
444  if (src_props.islower) props->islower = true;
445  if (src_props.isupper) props->isupper = true;
446  if (src_props.isdigit) props->isdigit = true;
447  if (src_props.ispunctuation) props->ispunctuation = true;
448  if (src_props.isngram) props->isngram = true;
449  if (src_props.enabled) props->enabled = true;
450  // Min/max the tops/bottoms.
451  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
452  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
453  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
454  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
455  int bearing = props->min_advance + src_props.min_bearing;
456  if (total_unicodes == 0 || bearing < props->min_bearing)
457  props->min_bearing = bearing;
458  bearing = props->max_advance + src_props.max_bearing;
459  if (total_unicodes == 0 || bearing < props->max_bearing)
460  props->max_bearing = bearing;
461  props->min_advance += src_props.min_advance;
462  props->max_advance += src_props.max_advance;
463  // With a single width, just use the widths stored in the unicharset.
464  props->min_width = src_props.min_width;
465  props->max_width = src_props.max_width;
466  // Use the first script id, other_case, mirror, direction.
467  // Note that these will need translation, except direction.
468  if (total_unicodes == 0) {
469  props->script_id = src_props.script_id;
470  props->other_case = src_props.other_case;
471  props->mirror = src_props.mirror;
472  props->direction = src_props.direction;
473  }
474  // The normed string for the compound character is the concatenation of
475  // the normed versions of the individual characters.
476  props->normed += src_props.normed;
477  ++total_unicodes;
478  }
479  if (total_unicodes > 1) {
480  // Estimate the total widths from the advance - bearing.
481  props->min_width = props->min_advance - props->max_bearing;
482  props->max_width = props->max_advance - props->min_bearing;
483  }
484  return total_unicodes > 0;
485 }
486 
487 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
488  unsigned int properties = 0;
489  if (this->get_isalpha(id))
490  properties |= ISALPHA_MASK;
491  if (this->get_islower(id))
492  properties |= ISLOWER_MASK;
493  if (this->get_isupper(id))
494  properties |= ISUPPER_MASK;
495  if (this->get_isdigit(id))
496  properties |= ISDIGIT_MASK;
497  if (this->get_ispunctuation(id))
498  properties |= ISPUNCTUATION_MASK;
499  return properties;
500 }
501 
503  if (this->get_isupper(id)) return 'A';
504  if (this->get_islower(id)) return 'a';
505  if (this->get_isalpha(id)) return 'x';
506  if (this->get_isdigit(id)) return '0';
507  if (this->get_ispunctuation(id)) return 'p';
508  return 0;
509 }
510 
511 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
512  if (!ids.contains(unichar_repr)) {
513  if (strlen(unichar_repr) > UNICHAR_LEN) {
514  fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
515  int(strlen(unichar_repr)), unichar_repr);
516  return;
517  }
518  if (size_used == size_reserved) {
519  if (size_used == 0)
520  reserve(8);
521  else
522  reserve(2 * size_used);
523  }
524 
525  strcpy(unichars[size_used].representation, unichar_repr);
526  this->set_script(size_used, null_script);
527  // If the given unichar_repr represents a fragmented character, set
528  // fragment property to a pointer to CHAR_FRAGMENT class instance with
529  // information parsed from the unichar representation. Use the script
530  // of the base unichar for the fragmented character if possible.
531  CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
532  this->unichars[size_used].properties.fragment = frag;
533  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
534  this->unichars[size_used].properties.script_id =
535  this->get_script(frag->get_unichar());
536  }
537  this->unichars[size_used].properties.enabled = true;
538  ids.insert(unichar_repr, size_used);
539  ++size_used;
540  }
541 }
542 
543 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
544  return ids.contains(unichar_repr);
545 }
546 
547 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
548  int length) const {
549  if (length == 0) {
550  return false;
551  }
552  return ids.contains(unichar_repr, length);
553 }
554 
555 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
556  const char* const unichar_repr) const {
557  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
558 }
559 
560 bool UNICHARSET::save_to_file(FILE *file) const {
561  fprintf(file, "%d\n", this->size());
562  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
563  int min_bottom, max_bottom, min_top, max_top;
564  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
565  int min_width, max_width;
566  get_width_range(id, &min_width, &max_width);
567  int min_bearing, max_bearing;
568  get_bearing_range(id, &min_bearing, &max_bearing);
569  int min_advance, max_advance;
570  get_advance_range(id, &min_advance, &max_advance);
571  unsigned int properties = this->get_properties(id);
572  if (strcmp(this->id_to_unichar(id), " ") == 0) {
573  fprintf(file, "%s %x %s %d\n", "NULL", properties,
574  this->get_script_from_script_id(this->get_script(id)),
575  this->get_other_case(id));
576  } else {
577  fprintf(file,
578  "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
579  this->id_to_unichar(id), properties,
580  min_bottom, max_bottom, min_top, max_top, min_width, max_width,
581  min_bearing, max_bearing, min_advance, max_advance,
582  this->get_script_from_script_id(this->get_script(id)),
583  this->get_other_case(id), this->get_direction(id),
584  this->get_mirror(id), this->get_normed_unichar(id),
585  this->debug_str(id).string());
586  }
587  }
588  return true;
589 }
590 
592  public:
593  InMemoryFilePointer(const char *memory, int mem_size)
594  : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
595 
596  char *fgets(char *orig_dst, int size) {
597  const char *src_end = memory_ + mem_size_;
598  char *dst_end = orig_dst + size - 1;
599  if (size < 1) {
600  return fgets_ptr_ < src_end ? orig_dst : NULL;
601  }
602 
603  char *dst = orig_dst;
604  char ch = '^';
605  while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
606  ch = *dst++ = *fgets_ptr_++;
607  }
608  *dst = 0;
609  return (dst == orig_dst) ? NULL : orig_dst;
610  }
611 
612  private:
613  const char *memory_;
614  const char *fgets_ptr_;
615  const int mem_size_;
616 };
617 
619  const char *memory, int mem_size, bool skip_fragments) {
620  InMemoryFilePointer mem_fp(memory, mem_size);
623  bool success = load_via_fgets(fgets_cb, skip_fragments);
624  delete fgets_cb;
625  return success;
626 }
627 
629  public:
630  LocalFilePointer(FILE *stream) : fp_(stream) {}
631  char *fgets(char *dst, int size) {
632  return ::fgets(dst, size, fp_);
633  }
634  private:
635  FILE *fp_;
636 };
637 
638 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
639  LocalFilePointer lfp(file);
642  bool success = load_via_fgets(fgets_cb, skip_fragments);
643  delete fgets_cb;
644  return success;
645 }
646 
647 bool UNICHARSET::load_via_fgets(
649  bool skip_fragments) {
650  int unicharset_size;
651  char buffer[256];
652 
653  this->clear();
654  if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
655  sscanf(buffer, "%d", &unicharset_size) != 1) {
656  return false;
657  }
658  this->reserve(unicharset_size);
659  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
660  char unichar[256];
661  unsigned int properties;
662  char script[64];
663 
664  strcpy(script, null_script);
665  int min_bottom = 0;
666  int max_bottom = MAX_UINT8;
667  int min_top = 0;
668  int max_top = MAX_UINT8;
669  int min_width = 0;
670  int max_width = MAX_INT16;
671  int min_bearing = 0;
672  int max_bearing = MAX_INT16;
673  int min_advance = 0;
674  int max_advance = MAX_INT16;
675  // TODO(eger): check that this default is ok
676  // after enabling BiDi iterator for Arabic+Cube.
678  UNICHAR_ID other_case = id;
679  UNICHAR_ID mirror = id;
680  char normed[64];
681  int v = -1;
682  if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
683  ((v = sscanf(buffer,
684  "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
685  unichar, &properties,
686  &min_bottom, &max_bottom, &min_top, &max_top,
687  &min_width, &max_width, &min_bearing, &max_bearing,
688  &min_advance, &max_advance, script, &other_case,
689  &direction, &mirror, normed)) != 17 &&
690  (v = sscanf(buffer,
691  "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
692  unichar, &properties,
693  &min_bottom, &max_bottom, &min_top, &max_top,
694  &min_width, &max_width, &min_bearing, &max_bearing,
695  &min_advance, &max_advance,
696  script, &other_case, &direction, &mirror)) != 16 &&
697  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
698  unichar, &properties,
699  &min_bottom, &max_bottom, &min_top, &max_top,
700  script, &other_case, &direction, &mirror)) != 10 &&
701  (v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
702  &min_bottom, &max_bottom, &min_top, &max_top,
703  script, &other_case)) != 8 &&
704  (v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
705  script, &other_case)) != 4 &&
706  (v = sscanf(buffer, "%s %x %63s",
707  unichar, &properties, script)) != 3 &&
708  (v = sscanf(buffer, "%s %x", unichar, &properties) != 2))) {
709  return false;
710  }
711 
712  // Skip fragments if needed.
713  CHAR_FRAGMENT *frag = NULL;
714  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
715  delete frag;
716  continue;
717  }
718  // Insert unichar into unicharset and set its properties.
719  if (strcmp(unichar, "NULL") == 0)
720  this->unichar_insert(" ");
721  else
722  this->unichar_insert(unichar);
723 
724  this->set_isalpha(id, properties & ISALPHA_MASK);
725  this->set_islower(id, properties & ISLOWER_MASK);
726  this->set_isupper(id, properties & ISUPPER_MASK);
727  this->set_isdigit(id, properties & ISDIGIT_MASK);
728  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
729  this->set_isngram(id, false);
730  this->set_script(id, script);
731  this->unichars[id].properties.enabled = true;
732  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
733  this->set_width_range(id, min_width, max_width);
734  this->set_bearing_range(id, min_bearing, max_bearing);
735  this->set_advance_range(id, min_advance, max_advance);
736  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
737  ASSERT_HOST(other_case < unicharset_size);
738  this->set_other_case(id, (v>3) ? other_case : id);
739  ASSERT_HOST(mirror < unicharset_size);
740  this->set_mirror(id, (v>8) ? mirror : id);
741  this->set_normed(id, (v>16) ? normed : unichar);
742  }
743  post_load_setup();
744  return true;
745 }
746 
747 // Sets up internal data after loading the file, based on the char
748 // properties. Called from load_from_file, but also needs to be run
749 // during set_unicharset_properties.
751  // Number of alpha chars with the case property minus those without,
752  // in order to determine that half the alpha chars have case.
753  int net_case_alphas = 0;
754  int x_height_alphas = 0;
755  int cap_height_alphas = 0;
756  top_bottom_set_ = false;
757  for (UNICHAR_ID id = 0; id < size_used; ++id) {
758  int min_bottom = 0;
759  int max_bottom = MAX_UINT8;
760  int min_top = 0;
761  int max_top = MAX_UINT8;
762  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
763  if (min_top > 0)
764  top_bottom_set_ = true;
765  if (get_isalpha(id)) {
766  if (get_islower(id) || get_isupper(id))
767  ++net_case_alphas;
768  else
769  --net_case_alphas;
770  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
771  ++x_height_alphas;
772  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
773  ++cap_height_alphas;
774  }
775  }
776 
777  script_has_upper_lower_ = net_case_alphas > 0;
778  script_has_xheight_ = script_has_upper_lower_ ||
779  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
780  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
781 
782  null_sid_ = get_script_id_from_name(null_script);
783  ASSERT_HOST(null_sid_ == 0);
784  common_sid_ = get_script_id_from_name("Common");
785  latin_sid_ = get_script_id_from_name("Latin");
786  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
787  greek_sid_ = get_script_id_from_name("Greek");
788  han_sid_ = get_script_id_from_name("Han");
789  hiragana_sid_ = get_script_id_from_name("Hiragana");
790  katakana_sid_ = get_script_id_from_name("Katakana");
791 
792  // Compute default script. Use the highest-counting alpha script, that is
793  // not the common script, as that still contains some "alphas".
794  int* script_counts = new int[script_table_size_used];
795  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
796  for (int id = 0; id < size_used; ++id) {
797  if (get_isalpha(id)) {
798  ++script_counts[get_script(id)];
799  }
800  }
801  default_sid_ = 0;
802  for (int s = 1; s < script_table_size_used; ++s) {
803  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
804  default_sid_ = s;
805  }
806  delete [] script_counts;
807 }
808 
809 // Returns true if right_to_left scripts are significant in the unicharset,
810 // but without being so sensitive that "universal" unicharsets containing
811 // characters from many scripts, like orientation and script detection,
812 // look like they are right_to_left.
814  int ltr_count = 0;
815  int rtl_count = 0;
816  for (int id = 0; id < size_used; ++id) {
817  int dir = get_direction(id);
818  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
819  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
821  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
822  }
823  return rtl_count > ltr_count;
824 }
825 
826 // Set a whitelist and/or blacklist of characters to recognize.
827 // An empty or NULL whitelist enables everything (minus any blacklist).
828 // An empty or NULL blacklist disables nothing.
829 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
830  const char* whitelist) {
831  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
832  // Set everything to default
833  for (int ch = 0; ch < size_used; ++ch)
834  unichars[ch].properties.enabled = def_enabled;
835  int ch_step;
836  if (!def_enabled) {
837  // Enable the whitelist.
838  for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
839  ch_step = step(whitelist + w_ind);
840  if (ch_step > 0) {
841  UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
842  if (u_id != INVALID_UNICHAR_ID) {
843  unichars[u_id].properties.enabled = true;
844  }
845  } else {
846  ch_step = 1;
847  }
848  }
849  }
850  if (blacklist != NULL && blacklist[0] != '\0') {
851  // Disable the blacklist.
852  for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
853  ch_step = step(blacklist + b_ind);
854  if (ch_step > 0) {
855  UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
856  if (u_id != INVALID_UNICHAR_ID) {
857  unichars[u_id].properties.enabled = false;
858  }
859  } else {
860  ch_step = 1;
861  }
862  }
863  }
864 }
865 
866 int UNICHARSET::add_script(const char* script) {
867  for (int i = 0; i < script_table_size_used; ++i) {
868  if (strcmp(script, script_table[i]) == 0)
869  return i;
870  }
871  if (script_table_size_reserved == 0) {
872  script_table_size_reserved = 8;
873  script_table = new char*[script_table_size_reserved];
874  }
875  if (script_table_size_used + 1 >= script_table_size_reserved) {
876  char** new_script_table = new char*[script_table_size_reserved * 2];
877  memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
878  delete[] script_table;
879  script_table = new_script_table;
880  script_table_size_reserved = 2 * script_table_size_reserved;
881  }
882  script_table[script_table_size_used] = new char[strlen(script) + 1];
883  strcpy(script_table[script_table_size_used], script);
884  return script_table_size_used++;
885 }
886 
887 // Returns the string that represents a fragment
888 // with the given unichar, pos and total.
889 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
890  bool natural) {
891  if (total == 1) return STRING(unichar);
892  STRING result = "";
893  result += kSeparator;
894  result += unichar;
895  char buffer[kMaxLen];
896  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
897  natural ? kNaturalFlag : kSeparator, total);
898  result += buffer;
899  return result;
900 }
901 
903  const char *ptr = string;
904  int len = strlen(string);
905  if (len < kMinLen || *ptr != kSeparator) {
906  return NULL; // this string can not represent a fragment
907  }
908  ptr++; // move to the next character
909  int step = 0;
910  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
911  step += UNICHAR::utf8_step(ptr + step);
912  }
913  if (step == 0 || step > UNICHAR_LEN) {
914  return NULL; // no character for unichar or the character is too long
915  }
916  char unichar[UNICHAR_LEN + 1];
917  strncpy(unichar, ptr, step);
918  unichar[step] = '\0'; // null terminate unichar
919  ptr += step; // move to the next fragment separator
920  int pos = 0;
921  int total = 0;
922  bool natural = false;
923  char *end_ptr = NULL;
924  for (int i = 0; i < 2; i++) {
925  if (ptr > string + len || *ptr != kSeparator) {
926  if (i == 1 && *ptr == kNaturalFlag)
927  natural = true;
928  else
929  return NULL; // Failed to parse fragment representation.
930  }
931  ptr++; // move to the next character
932  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
933  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
934  ptr = end_ptr;
935  }
936  if (ptr != string + len) {
937  return NULL; // malformed fragment representation
938  }
939  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
940  fragment->set_all(unichar, pos, total, natural);
941  return fragment;
942 }
943 
944 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
945  for (int i = 0; i < script_table_size_used; ++i) {
946  if (strcmp(script_name, script_table[i]) == 0)
947  return i;
948  }
949  return 0; // 0 is always the null_script
950 }