Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  * Created: Mon May 9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "mfcpch.h"
25 #include <ctype.h>
26 #include "docqual.h"
27 #include "tfacep.h"
28 #include "reject.h"
29 #include "tesscallback.h"
30 #include "tessvars.h"
31 #include "secname.h"
32 #include "globals.h"
33 #include "tesseractclass.h"
34 
35 namespace tesseract{
36 
37 // A little class to provide the callbacks as we have no pre-bound args.
39  explicit DocQualCallbacks(WERD_RES* word0)
40  : word(word0), match_count(0), accepted_match_count(0) {}
41 
42  void CountMatchingBlobs(int index) {
43  ++match_count;
44  }
45 
46  void CountAcceptedBlobs(int index) {
47  if (word->reject_map[index].accepted())
49  ++match_count;
50  }
51 
52  void AcceptIfGoodQuality(int index) {
53  if (word->reject_map[index].accept_if_good_quality())
54  word->reject_map[index].setrej_quality_accept();
55  }
56 
60 };
61 
62 /*************************************************************************
63  * word_blob_quality()
64  * How many blobs in the box_word are identical to those of the inword?
65  * ASSUME blobs in both initial word and box_word are in ascending order of
66  * left hand blob edge.
67  *************************************************************************/
// NOTE(review): embedded source lines 68, 74 and 76 are absent from this
// listing (the function signature and the first/last lines of the statement
// that receives *word->rebuild_word). Restore them from the upstream file
// before building; the surviving lines are kept untouched below.
69  if (word->bln_boxes == NULL ||
70  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
// Nothing to compare: report zero matching blobs.
71  return 0;
72 
// cb starts with all counters at zero; its match_count is what gets returned.
73  DocQualCallbacks cb(word);
75  *word->rebuild_word,
77  return cb.match_count;
78 }
79 
// NOTE(review): embedded source line 80 (the function signature) is absent
// from this listing - confirm the exact name and signature against upstream.
// The body sums count_outline_errs() over the blobs of word->rebuild_word,
// pairing the i-th blob with the i-th char of the best-choice string, and
// returns 0 when rebuild_word is NULL.
81  inT16 i = 0;
82  inT16 err_count = 0;
83 
84  if (word->rebuild_word != NULL) {
85  TBLOB* blob = word->rebuild_word->blobs;
86  for (; blob != NULL; blob = blob->next) {
// NOTE(review): unichar_string() is indexed by blob number with no bounds
// check visible here - presumably one char per blob is assumed; TODO confirm.
87  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
88  blob->NumOutlines());
89  i++;
90  }
91  }
92  return err_count;
93 }
94 
95 /*************************************************************************
96  * word_char_quality()
97  * Combination of blob quality and outline quality - how many good chars are
98  * there? - I.e chars which pass the blob AND outline tests.
99  *************************************************************************/
// NOTE(review): embedded source lines 100, 109 and 111 are absent from this
// listing (the first line of the signature and the first/last lines of the
// statement that receives *word->rebuild_word). Restore them from upstream.
101  ROW *row,
102  inT16 *match_count,
103  inT16 *accepted_match_count) {
104  if (word->bln_boxes == NULL ||
105  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
// Early exit leaves *match_count and *accepted_match_count unwritten, so
// callers see whatever values those variables already held.
106  return;
107 
108  DocQualCallbacks cb(word);
110  *word->rebuild_word,
// Hand both tallies collected in cb back through the output pointers.
112  *match_count = cb.match_count;
113  *accepted_match_count = cb.accepted_match_count;
114 }
115 
116 /*************************************************************************
117  * unrej_good_chs()
118  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
119  *************************************************************************/
// NOTE(review): embedded source lines 120, 126 and 128 are absent from this
// listing (the function signature and the first/last lines of the statement
// that receives *word->rebuild_word). Restore them from upstream.
121  if (word->bln_boxes == NULL ||
122  word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
// Without box data or rebuilt blobs there is nothing to unreject.
123  return;
124 
// Only the callback object is used here; nothing is returned to the caller.
125  DocQualCallbacks cb(word);
127  *word->rebuild_word,
129 }
130 
131 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
132  int expected_outline_count;
133 
134  if (STRING (outlines_odd).contains (c))
135  return 0; //Dont use this char
136  else if (STRING (outlines_2).contains (c))
137  expected_outline_count = 2;
138  else
139  expected_outline_count = 1;
140  return abs (outline_count - expected_outline_count);
141 }
142 
// NOTE(review): embedded source line 143 (the first line of the signature,
// which presumably declares page_res_it) is absent from this listing -
// confirm the exact name and signature against upstream.
// Runs the rejection passes in order: optional unrejection of good words,
// then document/block rejection, then optional tilde crunching.
144  BOOL8 good_quality_doc) {
// Unrejection runs only when enabled AND the document was judged good.
145  if ((tessedit_good_quality_unrej && good_quality_doc))
146  unrej_good_quality_words(page_res_it);
147  doc_and_block_rejection(page_res_it, good_quality_doc);
// Crunch marking always precedes crunch deletion.
148  if (unlv_tilde_crunching) {
149  tilde_crunch(page_res_it);
150  tilde_delete(page_res_it);
151  }
152 }
153 
154 
155 /*************************************************************************
156  * unrej_good_quality_words()
157  * Accept potential rejects in words which pass the following checks:
158  * - Contains a potential reject
159  * - Word looks like a sensible alpha word.
160  * - Word segmentation is the same as the original image
161  * - All characters have the expected number of outlines
162  * NOTE - the rejection counts are recalculated after unrejection
163  * - CANT do it in a single pass without a bit of fiddling
164  * - keep it simple but inefficient
165  *************************************************************************/
// Two passes over the page. Pass 1 walks every word and unrejects potential
// rejects (all of them when bland_unrej is set, otherwise only in rows that
// pass the reject-ratio test). Pass 2 zeroes the page/block/row counters and
// calls rej_stat_word() for every word.
// NOTE(review): embedded source lines 187, 189-191 and 193 are absent from
// this listing (the tail of the row test and most of the word test). Restore
// them from upstream before building.
166 void Tesseract::unrej_good_quality_words( //unreject potential
167  PAGE_RES_IT &page_res_it) {
168  WERD_RES *word;
169  ROW_RES *current_row;
170  BLOCK_RES *current_block;
171  int i;
172 
173  page_res_it.restart_page ();
174  while (page_res_it.word () != NULL) {
175  check_debug_pt (page_res_it.word (), 100);
176  if (bland_unrej) {
// Unconditional mode: accept every entry that allows a quality accept.
177  word = page_res_it.word ();
178  for (i = 0; i < word->reject_map.length (); i++) {
179  if (word->reject_map[i].accept_if_good_quality ())
180  word->reject_map[i].setrej_quality_accept ();
181  }
182  page_res_it.forward ();
183  }
// Row test: char_count > 0 guards the division that follows.
184  else if ((page_res_it.row ()->char_count > 0) &&
185  ((page_res_it.row ()->rej_count /
186  (float) page_res_it.row ()->char_count) <=
188  word = page_res_it.word ();
192  word->best_choice->unichar_string().string(),
194  != AC_UNACCEPTABLE)) {
195  unrej_good_chs(word, page_res_it.row ()->row);
196  }
197  page_res_it.forward ();
198  }
199  else {
200  /* Skip to end of dodgy row */
201  current_row = page_res_it.row ();
202  while ((page_res_it.word () != NULL) &&
203  (page_res_it.row () == current_row))
204  page_res_it.forward ();
205  }
206  check_debug_pt (page_res_it.word (), 110);
207  }
// Pass 2: reset the counters at each level the first time that level is
// entered, then let rej_stat_word() process each word.
208  page_res_it.restart_page ();
209  page_res_it.page_res->char_count = 0;
210  page_res_it.page_res->rej_count = 0;
211  current_block = NULL;
212  current_row = NULL;
213  while (page_res_it.word () != NULL) {
214  if (current_block != page_res_it.block ()) {
215  current_block = page_res_it.block ();
216  current_block->char_count = 0;
217  current_block->rej_count = 0;
218  }
219  if (current_row != page_res_it.row ()) {
220  current_row = page_res_it.row ();
221  current_row->char_count = 0;
222  current_row->rej_count = 0;
223  current_row->whole_word_rej_count = 0;
224  }
225  page_res_it.rej_stat_word ();
226  page_res_it.forward ();
227  }
228 }
229 
230 
231 /*************************************************************************
232  * doc_and_block_rejection()
233  *
234  * If the page has too many rejects - reject all of it.
235  * If any block has too many rejects - reject all words in the block
236  *************************************************************************/
237 
// Applies rejection at three nested levels: whole page, then each block,
// then each row inside a block that was not rejected.
// NOTE(review): many embedded source lines are absent from this listing
// (numbering gaps at 252, 254, 260, 275-276, 284, 286, 288-289, 308, 313,
// 319, 338, 341-342, 354-355, 358, 360-361, 379, 384, 390). They include the
// threshold operands, the debug guards and the calls that perform the actual
// word rejection. Restore them from upstream before building.
238 void Tesseract::doc_and_block_rejection( //reject big chunks
239  PAGE_RES_IT &page_res_it,
240  BOOL8 good_quality_doc) {
241  inT16 block_no = 0;
242  inT16 row_no = 0;
243  BLOCK_RES *current_block;
244  ROW_RES *current_row;
245 
246  BOOL8 rej_word;
247  BOOL8 prev_word_rejected;
248  inT16 char_quality = 0;
249  inT16 accepted_char_quality;
250 
// Page level: percentage of rejected chars on the page.
251  if (page_res_it.page_res->rej_count * 100.0 /
253  reject_whole_page(page_res_it);
255  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
256  page_res_it.page_res->char_count,
257  page_res_it.page_res->rej_count);
258  }
259  } else {
261  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
262  page_res_it.page_res->char_count,
263  page_res_it.page_res->rej_count);
264  }
265 
266  /* Walk blocks testing for block rejection */
267 
268  page_res_it.restart_page();
269  WERD_RES* word;
270  while ((word = page_res_it.word()) != NULL) {
271  current_block = page_res_it.block();
272  block_no = current_block->block->index();
// Block level: char_count > 0 guards the percentage division.
273  if (current_block->char_count > 0 &&
274  (current_block->rej_count * 100.0 / current_block->char_count) >
277  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
278  block_no, current_block->char_count,
279  current_block->rej_count);
280  }
281  prev_word_rejected = FALSE;
282  while ((word = page_res_it.word()) != NULL &&
283  (page_res_it.block() == current_block)) {
285  rej_word = word->reject_map.reject_count() > 0 ||
287  if (rej_word && tessedit_dont_blkrej_good_wds &&
290  *word->uch_set,
291  word->best_choice->unichar_string().string(),
292  word->best_choice->unichar_lengths().string()) !=
293  AC_UNACCEPTABLE) {
// NOTE(review): word_char_quality() returns without writing its outputs when
// the word has no bln_boxes/rebuild_word, so char_quality may still hold the
// value from an earlier word (or its initial 0) in the comparison below.
294  word_char_quality(word, page_res_it.row()->row,
295  &char_quality,
296  &accepted_char_quality);
297  rej_word = char_quality != word->reject_map.length();
298  }
299  } else {
300  rej_word = TRUE;
301  }
302  if (rej_word) {
303  /*
304  Reject spacing if both current and prev words are rejected.
305  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
306  generated more space errors.
307  */
309  prev_word_rejected &&
310  page_res_it.prev_row() == page_res_it.row() &&
311  word->word->space() == 1)
312  word->reject_spaces = TRUE;
314  }
315  prev_word_rejected = rej_word;
316  page_res_it.forward();
317  }
318  } else {
320  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
321  block_no, page_res_it.block()->char_count,
322  page_res_it.block()->rej_count);
323  }
324 
325  /* Walk rows in block testing for row rejection */
// row_no restarts at 0 here and is incremented once per row visited.
326  row_no = 0;
327  while ((word = page_res_it.word()) != NULL &&
328  page_res_it.block() == current_block) {
329  current_row = page_res_it.row();
330  row_no++;
331  /* Reject whole row if:
332  fraction of chars on row which are rejected exceed a limit AND
333  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
334  limit
335  */
336  if (current_row->char_count > 0 &&
337  (current_row->rej_count * 100.0 / current_row->char_count) >
339  (current_row->whole_word_rej_count * 100.0 /
340  current_row->rej_count) <
343  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
344  row_no, current_row->char_count,
345  current_row->rej_count);
346  }
347  prev_word_rejected = FALSE;
348  while ((word = page_res_it.word()) != NULL &&
349  page_res_it.row () == current_row) {
350  /* Preserve words on good docs unless they are mostly rejected*/
351  if (!tessedit_row_rej_good_docs && good_quality_doc) {
352  rej_word = word->reject_map.reject_count() /
353  static_cast<float>(word->reject_map.length()) >
356  /* Preserve perfect words anyway */
357  rej_word = word->reject_map.reject_count() > 0 ||
359  if (rej_word && tessedit_dont_rowrej_good_wds &&
362  word->best_choice->unichar_string().string(),
363  word->best_choice->unichar_lengths().string()) !=
364  AC_UNACCEPTABLE) {
// NOTE(review): same stale-char_quality caveat as in the block-level loop.
365  word_char_quality(word, page_res_it.row()->row,
366  &char_quality,
367  &accepted_char_quality);
368  rej_word = char_quality != word->reject_map.length();
369  }
370  } else {
371  rej_word = TRUE;
372  }
373  if (rej_word) {
374  /*
375  Reject spacing if both current and prev words are rejected.
376  NOTE - this is NOT restricted to FUZZY spaces. - When tried
377  this generated more space errors.
378  */
380  prev_word_rejected &&
381  page_res_it.prev_row() == page_res_it.row() &&
382  word->word->space () == 1)
383  word->reject_spaces = TRUE;
385  }
386  prev_word_rejected = rej_word;
387  page_res_it.forward();
388  }
389  } else {
391  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
392  row_no, current_row->char_count, current_row->rej_count);
393  }
// Row kept: skip over its remaining words to reach the next row.
394  while (page_res_it.word() != NULL &&
395  page_res_it.row() == current_row)
396  page_res_it.forward();
397  }
398  }
399  }
400  }
401  }
402 }
403 
404 } // namespace tesseract
405 
406 
407 /*************************************************************************
408  * reject_whole_page()
409  * Don't believe any of it - set the reject map to 00..00 in all words
410  *
411  *************************************************************************/
412 
413 void reject_whole_page(PAGE_RES_IT &page_res_it) {
414  page_res_it.restart_page ();
415  while (page_res_it.word () != NULL) {
416  page_res_it.word ()->reject_map.rej_word_doc_rej ();
417  page_res_it.forward ();
418  }
419  //whole page is rejected
420  page_res_it.page_res->rejected = TRUE;
421 }
422 
423 namespace tesseract {
// NOTE(review): embedded source lines 424, 441, 444, 462 and 484 are absent
// from this listing (the function signature, two guard lines ahead of
// convert_bad_unlv_chs()/merge_tess_fails(), and the statements that set the
// crunch mode on the current word). Restore them from upstream.
// Walks every word in text blocks and decides which wholly-rejected words to
// crunch. A run of "potential" crunch words is only crunched if a "terrible"
// word is found adjacent to the run.
425  WERD_RES *word;
426  GARBAGE_LEVEL garbage_level;
427  PAGE_RES_IT copy_it;
428  BOOL8 prev_potential_marked = FALSE;
429  BOOL8 found_terrible_word = FALSE;
430  BOOL8 ok_dict_word;
431 
432  page_res_it.restart_page();
433  while (page_res_it.word() != NULL) {
// Blocks with a poly_block that is not text are skipped entirely.
434  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
435  if (pb != NULL && !pb->IsText()) {
436  page_res_it.forward();
437  continue;
438  }
439  word = page_res_it.word();
440 
442  convert_bad_unlv_chs(word);
443 
445  word->merge_tess_fails();
446 
// Any accepted char in the word resets the crunch state machine.
447  if (word->reject_map.accept_count () != 0) {
448  found_terrible_word = FALSE;
449  //Forget earlier potential crunches
450  prev_potential_marked = FALSE;
451  }
452  else {
453  ok_dict_word = safe_dict_word(word);
454  garbage_level = garbage_word (word, ok_dict_word);
455 
456  if ((garbage_level != G_NEVER_CRUNCH) &&
457  (terrible_word_crunch (word, garbage_level))) {
458  if (crunch_debug > 0) {
459  tprintf ("T CRUNCHING: \"%s\"\n",
460  word->best_choice->unichar_string().string());
461  }
// Back-fill: crunch the potential words remembered from copy_it up to (but
// not including) the current terrible word.
463  if (prev_potential_marked) {
464  while (copy_it.word () != word) {
465  if (crunch_debug > 0) {
466  tprintf ("P1 CRUNCHING: \"%s\"\n",
467  copy_it.word()->best_choice->unichar_string().string());
468  }
469  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
470  copy_it.forward ();
471  }
472  prev_potential_marked = FALSE;
473  }
474  found_terrible_word = TRUE;
475  }
476  else if ((garbage_level != G_NEVER_CRUNCH) &&
477  (potential_word_crunch (word,
478  garbage_level, ok_dict_word))) {
479  if (found_terrible_word) {
480  if (crunch_debug > 0) {
481  tprintf ("P2 CRUNCHING: \"%s\"\n",
482  word->best_choice->unichar_string().string());
483  }
485  }
// First potential word of a run: remember where the run starts.
486  else if (!prev_potential_marked) {
487  copy_it = page_res_it;
488  prev_potential_marked = TRUE;
489  if (crunch_debug > 1) {
490  tprintf ("P3 CRUNCHING: \"%s\"\n",
491  word->best_choice->unichar_string().string());
492  }
493  }
494  }
495  else {
496  found_terrible_word = FALSE;
497  //Forget earlier potential crunches
498  prev_potential_marked = FALSE;
499  if (crunch_debug > 2) {
500  tprintf ("NO CRUNCH: \"%s\"\n",
501  word->best_choice->unichar_string().string());
502  }
503  }
504  }
505  page_res_it.forward ();
506  }
507 }
508 
509 
// NOTE(review): embedded source line 510 (the first line of the signature,
// which presumably declares word) is absent from this listing.
// Returns TRUE when the word should be crunched outright. crunch_mode records
// which test fired (1-5, 0 = none) and is only used for the debug message.
511  GARBAGE_LEVEL garbage_level) {
512  float rating_per_ch;
513  int adjusted_len;
514  int crunch_mode = 0;
515 
// Mode 1: the best-choice string is empty or consists only of spaces.
516  if ((word->best_choice->unichar_string().length () == 0) ||
517  (strspn (word->best_choice->unichar_string().string(), " ") ==
518  word->best_choice->unichar_string().length ()))
519  crunch_mode = 1;
520  else {
// Cap the divisor at crunch_rating_max before computing rating per char.
// NOTE(review): no visible guard for reject_map.length() == 0 here.
521  adjusted_len = word->reject_map.length ();
522  if (adjusted_len > crunch_rating_max)
523  adjusted_len = crunch_rating_max;
524  rating_per_ch = word->best_choice->rating () / adjusted_len;
525 
526  if (rating_per_ch > crunch_terrible_rating)
527  crunch_mode = 2;
528  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
529  crunch_mode = 3;
530  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
531  (garbage_level != G_OK))
532  crunch_mode = 4;
533  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
534  (garbage_level != G_OK))
535  crunch_mode = 5;
536  }
537  if (crunch_mode > 0) {
538  if (crunch_debug > 2) {
539  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
540  crunch_mode, word->best_choice->unichar_string().string());
541  }
542  return TRUE;
543  }
544  else
545  return FALSE;
546 }
547 
// NOTE(review): embedded source lines 548, 560 and 578 are absent from this
// listing (the first line of the signature, the call whose arguments end in
// "str, lengths", and the second operand of the certainty test).
// Counts up to three independent "poor" indicators (rating, certainty,
// garbage level) and returns whether the count reaches crunch_pot_indicators.
549  GARBAGE_LEVEL garbage_level,
550  BOOL8 ok_dict_word) {
551  float rating_per_ch;
552  int adjusted_len;
553  const char *str = word->best_choice->unichar_string().string();
554  const char *lengths = word->best_choice->unichar_lengths().string();
555  BOOL8 word_crunchable;
556  int poor_indicator_count = 0;
557 
558  word_crunchable = !crunch_leave_accept_strings ||
559  word->reject_map.length() < 3 ||
561  str, lengths) == AC_UNACCEPTABLE &&
562  !ok_dict_word);
563 
// The divisor is capped at 10 so long words are not favoured.
// NOTE(review): no visible guard for reject_map.length() == 0 here.
564  adjusted_len = word->reject_map.length();
565  if (adjusted_len > 10)
566  adjusted_len = 10;
567  rating_per_ch = word->best_choice->rating() / adjusted_len;
568 
// Indicator 1: rating per char.
569  if (rating_per_ch > crunch_pot_poor_rate) {
570  if (crunch_debug > 2) {
571  tprintf("Potential poor rating on \"%s\"\n",
572  word->best_choice->unichar_string().string());
573  }
574  poor_indicator_count++;
575  }
576 
// Indicator 2: certainty, only considered for crunchable words.
577  if (word_crunchable &&
579  if (crunch_debug > 2) {
580  tprintf("Potential poor cert on \"%s\"\n",
581  word->best_choice->unichar_string().string());
582  }
583  poor_indicator_count++;
584  }
585 
// Indicator 3: any garbage level other than G_OK.
586  if (garbage_level != G_OK) {
587  if (crunch_debug > 2) {
588  tprintf("Potential garbage on \"%s\"\n",
589  word->best_choice->unichar_string().string());
590  }
591  poor_indicator_count++;
592  }
593  return poor_indicator_count >= crunch_pot_indicators;
594 }
595 
// NOTE(review): embedded source lines 596 and 659 are absent from this
// listing (the function signature and a line ahead of merge_tess_fails()).
// Walks all words and applies the mode returned by word_deletable() to
// deletable words that start a line, continue a start-of-line run, or end a
// line (together with the deletable run marked just before that EOL word).
597  WERD_RES *word;
598  PAGE_RES_IT copy_it;
599  BOOL8 deleting_from_bol = FALSE;
600  BOOL8 marked_delete_point = FALSE;
601  inT16 debug_delete_mode;
602  CRUNCH_MODE delete_mode;
603  inT16 x_debug_delete_mode;
604  CRUNCH_MODE x_delete_mode;
605 
606  page_res_it.restart_page();
607  while (page_res_it.word() != NULL) {
608  word = page_res_it.word();
609 
// debug_delete_mode receives the numeric reason code for the decision.
610  delete_mode = word_deletable (word, debug_delete_mode);
611  if (delete_mode != CR_NONE) {
612  if (word->word->flag (W_BOL) || deleting_from_bol) {
613  if (crunch_debug > 0) {
614  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
615  debug_delete_mode,
616  word->best_choice->unichar_string().string());
617  }
618  word->unlv_crunch_mode = delete_mode;
619  deleting_from_bol = TRUE;
620  } else if (word->word->flag(W_EOL)) {
// Back-fill: apply deletion to the words from the marked point up to (but not
// including) this end-of-line word, re-evaluating each one.
621  if (marked_delete_point) {
622  while (copy_it.word() != word) {
623  x_delete_mode = word_deletable (copy_it.word (),
624  x_debug_delete_mode);
625  if (crunch_debug > 0) {
626  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
627  x_debug_delete_mode,
628  copy_it.word()->best_choice->unichar_string().string());
629  }
630  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
631  copy_it.forward ();
632  }
633  }
634  if (crunch_debug > 0) {
635  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
636  debug_delete_mode,
637  word->best_choice->unichar_string().string());
638  }
639  word->unlv_crunch_mode = delete_mode;
640  deleting_from_bol = FALSE;
641  marked_delete_point = FALSE;
642  }
643  else {
// Mid-line deletable word: remember the first one of the run only.
644  if (!marked_delete_point) {
645  copy_it = page_res_it;
646  marked_delete_point = TRUE;
647  }
648  }
649  }
650  else {
651  deleting_from_bol = FALSE;
652  //Forget earlier potential crunches
653  marked_delete_point = FALSE;
654  }
655  /*
656  The following step has been left till now as the tess fails are used to
657  determine if the word is deletable.
658  */
660  word->merge_tess_fails();
661  page_res_it.forward ();
662  }
663 }
664 
665 
// NOTE(review): embedded source line 666 (the function signature, which
// presumably declares word_res) is absent from this listing.
// Rewrites characters in the best choice: "~" becomes "-" and "^" becomes
// " ". Any rewritten position that was accepted is marked as an unlv reject.
667  int i;
668  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
669  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
670  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
671  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
// NOTE(review): modified is written below but never read in the visible code.
672  bool modified = false;
673  for (i = 0; i < word_res->reject_map.length(); ++i) {
674  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
675  word_res->best_choice->set_unichar_id(unichar_dash, i);
676  modified = true;
677  if (word_res->reject_map[i].accepted ())
678  word_res->reject_map[i].setrej_unlv_rej ();
679  }
680  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
681  word_res->best_choice->set_unichar_id(unichar_space, i);
682  modified = true;
683  if (word_res->reject_map[i].accepted ())
684  word_res->reject_map[i].setrej_unlv_rej ();
685  }
686  }
687 }
688 
// NOTE(review): embedded source lines 689 and 827 are absent from this
// listing (the function signature, and the opening line of the block that
// closes at embedded line 829). Restore them from upstream.
// Scans the best-choice string one unichar at a time with a small state
// machine, gathering counts (isolated digits/alphas, bad chars, spaces,
// case runs, repetitions), then grades the word as G_NEVER_CRUNCH, G_OK,
// G_DODGY or G_TERRIBLE from those counts.
690  enum STATES
691  {
692  JUNK,
693  FIRST_UPPER,
694  FIRST_LOWER,
695  FIRST_NUM,
696  SUBSEQUENT_UPPER,
697  SUBSEQUENT_LOWER,
698  SUBSEQUENT_NUM
699  };
700  const char *str = word->best_choice->unichar_string().string();
701  const char *lengths = word->best_choice->unichar_lengths().string();
702  STATES state = JUNK;
703  int len = 0;
704  int isolated_digits = 0;
705  int isolated_alphas = 0;
706  int bad_char_count = 0;
707  int tess_rejs = 0;
708  int dodgy_chars = 0;
709  int ok_chars;
710  UNICHAR_ID last_char = -1;
711  int alpha_repetition_count = 0;
712  int longest_alpha_repetition_count = 0;
713  int longest_lower_run_len = 0;
714  int lower_string_count = 0;
715  int longest_upper_run_len = 0;
716  int upper_string_count = 0;
717  int total_alpha_count = 0;
718  int total_digit_count = 0;
719 
// Each iteration consumes one unichar: str advances by the byte length held
// in *lengths, and lengths advances by one. len counts unichars, not bytes.
720  for (; *str != '\0'; str += *(lengths++)) {
721  len++;
722  if (word->uch_set->get_isupper (str, *lengths)) {
723  total_alpha_count++;
724  switch (state) {
725  case SUBSEQUENT_UPPER:
726  case FIRST_UPPER:
727  state = SUBSEQUENT_UPPER;
728  upper_string_count++;
729  if (longest_upper_run_len < upper_string_count)
730  longest_upper_run_len = upper_string_count;
731  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
732  alpha_repetition_count++;
733  if (longest_alpha_repetition_count < alpha_repetition_count) {
734  longest_alpha_repetition_count = alpha_repetition_count;
735  }
736  }
737  else {
738  last_char = word->uch_set->unichar_to_id(str, *lengths);
739  alpha_repetition_count = 1;
740  }
741  break;
// No break after the increment: FIRST_NUM falls through into default.
742  case FIRST_NUM:
743  isolated_digits++;
744  default:
745  state = FIRST_UPPER;
746  last_char = word->uch_set->unichar_to_id(str, *lengths);
747  alpha_repetition_count = 1;
748  upper_string_count = 1;
749  break;
750  }
751  }
752  else if (word->uch_set->get_islower (str, *lengths)) {
753  total_alpha_count++;
754  switch (state) {
755  case SUBSEQUENT_LOWER:
756  case FIRST_LOWER:
757  state = SUBSEQUENT_LOWER;
758  lower_string_count++;
759  if (longest_lower_run_len < lower_string_count)
760  longest_lower_run_len = lower_string_count;
761  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
762  alpha_repetition_count++;
763  if (longest_alpha_repetition_count < alpha_repetition_count) {
764  longest_alpha_repetition_count = alpha_repetition_count;
765  }
766  }
767  else {
768  last_char = word->uch_set->unichar_to_id(str, *lengths);
769  alpha_repetition_count = 1;
770  }
771  break;
// No break after the increment: FIRST_NUM falls through into default.
772  case FIRST_NUM:
773  isolated_digits++;
774  default:
775  state = FIRST_LOWER;
776  last_char = word->uch_set->unichar_to_id(str, *lengths);
777  alpha_repetition_count = 1;
778  lower_string_count = 1;
779  break;
780  }
781  }
782  else if (word->uch_set->get_isdigit (str, *lengths)) {
783  total_digit_count++;
784  switch (state) {
// FIRST_NUM sets the new state and then falls into the SUBSEQUENT_NUM break.
785  case FIRST_NUM:
786  state = SUBSEQUENT_NUM;
787  case SUBSEQUENT_NUM:
788  break;
// A single letter followed by a digit counts as an isolated alpha; falls
// through into default.
789  case FIRST_UPPER:
790  case FIRST_LOWER:
791  isolated_alphas++;
792  default:
793  state = FIRST_NUM;
794  break;
795  }
796  }
797  else {
// Neither letter nor digit: a single-byte space is tallied in tess_rejs,
// anything else is a bad char.
798  if (*lengths == 1 && *str == ' ')
799  tess_rejs++;
800  else
801  bad_char_count++;
802  switch (state) {
803  case FIRST_NUM:
804  isolated_digits++;
805  break;
806  case FIRST_UPPER:
807  case FIRST_LOWER:
808  isolated_alphas++;
809  default:
810  break;
811  }
812  state = JUNK;
813  }
814  }
815 
// Close out the final run: a word ending on a lone digit or lone letter
// contributes one more isolated count.
816  switch (state) {
817  case FIRST_NUM:
818  isolated_digits++;
819  break;
820  case FIRST_UPPER:
821  case FIRST_LOWER:
822  isolated_alphas++;
823  default:
824  break;
825  }
826 
828  total_alpha_count += total_digit_count - isolated_digits;
829  }
830 
// NOTE(review): the scan loop above advanced str and lengths to the end of
// their strings, so the acceptable_word_string() and strpbrk() calls below
// receive pointers to the terminator rather than to the start of the word.
// TODO confirm whether that is intended.
831  if (crunch_leave_ok_strings && len >= 4 &&
832  2 * (total_alpha_count - isolated_alphas) > len &&
833  longest_alpha_repetition_count < crunch_long_repetitions) {
834  if ((crunch_accept_ok &&
835  acceptable_word_string(*word->uch_set, str, lengths) !=
836  AC_UNACCEPTABLE) ||
837  longest_lower_run_len > crunch_leave_lc_strings ||
838  longest_upper_run_len > crunch_leave_uc_strings)
839  return G_NEVER_CRUNCH;
840  }
841  if (word->reject_map.length() > 1 &&
842  strpbrk(str, " ") == NULL &&
843  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
844  word->best_choice->permuter() == FREQ_DAWG_PERM ||
845  word->best_choice->permuter() == USER_DAWG_PERM ||
846  word->best_choice->permuter() == NUMBER_PERM ||
847  acceptable_word_string(*word->uch_set, str, lengths) !=
848  AC_UNACCEPTABLE || ok_dict_word))
849  return G_OK;
850 
851  ok_chars = len - bad_char_count - isolated_digits -
852  isolated_alphas - tess_rejs;
853 
854  if (crunch_debug > 3) {
855  tprintf("garbage_word: \"%s\"\n",
856  word->best_choice->unichar_string().string());
857  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
858  len,
859  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
860  }
// Clean words: no bad chars or spaces, and either not made up solely of
// isolated chars or at most two chars long.
861  if (bad_char_count == 0 &&
862  tess_rejs == 0 &&
863  (len > isolated_digits + isolated_alphas || len <= 2))
864  return G_OK;
865 
866  if (tess_rejs > ok_chars ||
867  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
868  return G_TERRIBLE;
869 
// Spaces are weighted double in dodgy_chars; words longer than four chars
// also count isolated digits/alphas.
870  if (len > 4) {
871  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
872  isolated_alphas;
873  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
874  return G_DODGY;
875  else
876  return G_OK;
877  } else {
878  dodgy_chars = 2 * tess_rejs + bad_char_count;
879  if ((len == 4 && dodgy_chars > 2) ||
880  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
881  return G_DODGY;
882  else
883  return G_OK;
884  }
885 }
886 
887 
888 /*************************************************************************
889  * word_deletable()
890  * DELETE WERDS AT ENDS OF ROWS IF
891  * Word is crunched &&
892  * ( string length = 0 OR
893  * > 50% of chars are "|" (before merging) OR
894  * certainty < -10 OR
895  * rating /char > 60 OR
896  * TOP of word is more than 0.5 xht BELOW baseline OR
897  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
898  * length of word < 3xht OR
899  * height of word < 0.7 xht OR
900  * height of word > 3.0 xht OR
901  * >75% of the outline BBs have longest dimension < 0.5xht
902  *************************************************************************/
903 
// NOTE(review): embedded source lines 904, 950 and 956 are absent from this
// listing (the function signature, the condition for delete_mode 9, and the
// right-hand side of the box.bottom() comparison). Restore from upstream.
// Decides whether a crunched word may be deleted. The return value is the
// crunch mode to apply; delete_mode receives a numeric reason code (0 when
// the word is not deletable) that callers print in debug output.
905  int word_len = word->reject_map.length ();
906  float rating_per_ch;
907  TBOX box; //BB of word
908 
// Words that were never crunched are not candidates.
909  if (word->unlv_crunch_mode == CR_NONE) {
910  delete_mode = 0;
911  return CR_NONE;
912  }
913 
914  if (word_len == 0) {
915  delete_mode = 1;
916  return CR_DELETE;
917  }
918 
919  if (word->rebuild_word != NULL) {
920  // Cube leaves rebuild_word NULL.
921  box = word->rebuild_word->bounding_box();
922  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
923  delete_mode = 4;
924  return CR_DELETE;
925  }
926 
927  if (noise_outlines(word->rebuild_word)) {
928  delete_mode = 5;
929  return CR_DELETE;
930  }
931  }
932 
// word_len is non-zero from here on, so the division further down is safe.
933  if ((failure_count (word) * 1.5) > word_len) {
934  delete_mode = 2;
935  return CR_LOOSE_SPACE;
936  }
937 
938  if (word->best_choice->certainty () < crunch_del_cert) {
939  delete_mode = 7;
940  return CR_LOOSE_SPACE;
941  }
942 
943  rating_per_ch = word->best_choice->rating () / word_len;
944 
945  if (rating_per_ch > crunch_del_rating) {
946  delete_mode = 8;
947  return CR_LOOSE_SPACE;
948  }
949 
// NOTE(review): when rebuild_word is NULL, box was never assigned above, so
// the geometry tests below run on a default-constructed TBOX - TODO confirm
// that this is the intended behaviour.
951  delete_mode = 9;
952  return CR_LOOSE_SPACE;
953  }
954 
955  if (box.bottom () >
957  delete_mode = 10;
958  return CR_LOOSE_SPACE;
959  }
960 
961  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
962  delete_mode = 11;
963  return CR_LOOSE_SPACE;
964  }
965 
966  if (box.width () < crunch_del_min_width * kBlnXHeight) {
967  delete_mode = 3;
968  return CR_LOOSE_SPACE;
969  }
970 
971  delete_mode = 0;
972  return CR_NONE;
973 }
974 
// NOTE(review): embedded source line 975 (the function signature, which
// presumably declares word) is absent from this listing.
// Returns the number of ' ' bytes in the word's best-choice string.
976  const char *str = word->best_choice->unichar_string().string();
977  int tess_rejs = 0;
978 
// Byte-wise scan up to the terminating NUL.
979  for (; *str != '\0'; str++) {
980  if (*str == ' ')
981  tess_rejs++;
982  }
983  return tess_rejs;
984 }
985 
986 
// NOTE(review): embedded source line 987 (the function signature, which
// presumably declares word) is absent from this listing.
// Counts the outlines of every blob in word and how many of them are "small"
// (longest bounding-box side below kBlnXHeight * crunch_small_outlines_size).
// Returns true only when every outline is small; that is also the result
// when the word has no outlines at all (0 >= 0).
988  TBOX box; // BB of outline
989  inT16 outline_count = 0;
990  inT16 small_outline_count = 0;
991  inT16 max_dimension;
992  float small_limit = kBlnXHeight * crunch_small_outlines_size;
993 
994  for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
995  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
996  outline_count++;
997  box = ol->bounding_box();
// max_dimension is the longer side of the outline's bounding box.
998  if (box.height() > box.width())
999  max_dimension = box.height();
1000  else
1001  max_dimension = box.width();
1002  if (max_dimension < small_limit)
1003  small_outline_count++;
1004  }
1005  }
1006  return (small_outline_count >= outline_count);
1007 }
1008 } // namespace tesseract