Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
reject.cpp File Reference
#include "mfcpch.h"
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "memry.h"
#include "reject.h"
#include "tfacep.h"
#include "imgs.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"
#include "notdll.h"

Go to the source code of this file.

Namespaces

namespace  tesseract

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
void reject_blanks (WERD_RES *word)
void reject_poor_matches (WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices)
float compute_reject_threshold (BLOB_CHOICE_LIST_CLIST *blob_choices)

Function Documentation

CLISTIZEH ( STRING  )

Definition at line 55 of file reject.cpp.

{
/**
 * Sets word->done according to the acceptance policy selected by
 * tessedit_ok_mode.
 *
 *   0: done simply mirrors word->tess_accepted.
 *   1: additionally requires that the best choice string contains no ' ',
 *      and on pass 1 that one_ell_conflict(word, FALSE) is false.
 *   2: as 1, and on pass 1 the permuter must be one of SYSTEM_DAWG_PERM,
 *      FREQ_DAWG_PERM, USER_DAWG_PERM or NUMBER_PERM.
 *   3: as 2, but the permuter requirement applies on every pass.
 *   4: as 2, and on pass 1 the word is also vetoed if test_ambig_word()
 *      is true.
 *   5: as 3, and the word is also vetoed on every pass if
 *      test_ambig_word() is true.
 * Any other mode value prints "BAD tessedit_ok_mode" and leaves word->done
 * untouched.
 *
 * @param word  Word result; reads tess_accepted and best_choice, writes done.
 * @param pass  Pass number; checks marked "pass 1" only run when pass == 1.
 */
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
  /*
  0: Original heuristic used in Tesseract and Ray's prototype Resaljet
  */
  if (tessedit_ok_mode == 0) {
    /* NOTE - done even if word contains some or all spaces !!! */
    word->done = word->tess_accepted;
  }
  /*
  1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
  */
  else if (tessedit_ok_mode == 1) {
    word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
    if (word->done && (pass == 1) && one_ell_conflict(word, FALSE))
      word->done = FALSE;
  }
  /*
  2: as 1 + only accept dict words or numerics in pass 1
  */
  else if (tessedit_ok_mode == 2) {
    word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
    if (word->done && (pass == 1) && one_ell_conflict(word, FALSE))
      word->done = FALSE;
    if (word->done &&
        (pass == 1) &&
        (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
        (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
        (word->best_choice->permuter() != USER_DAWG_PERM) &&
        (word->best_choice->permuter() != NUMBER_PERM)) {
      #ifndef SECURE_NAMES
      if (tessedit_rejection_debug)
        tprintf("\nVETO Tess accepting poor word \"%s\"\n",
                word->best_choice->unichar_string().string());
      #endif
      word->done = FALSE;
    }
  }
  /*
  3: as 2 + only accept dict words or numerics in pass 2 as well
  */
  else if (tessedit_ok_mode == 3) {
    word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
    if (word->done && (pass == 1) && one_ell_conflict(word, FALSE))
      word->done = FALSE;
    if (word->done &&
        (word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
        (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
        (word->best_choice->permuter() != USER_DAWG_PERM) &&
        (word->best_choice->permuter() != NUMBER_PERM)) {
      #ifndef SECURE_NAMES
      if (tessedit_rejection_debug)
        tprintf("\nVETO Tess accepting poor word \"%s\"\n",
                word->best_choice->unichar_string().string());
      #endif
      word->done = FALSE;
    }
  }
  /*
  4: as 2 + reject dict ambigs in pass 1
  */
  else if (tessedit_ok_mode == 4) {
    word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
    if (word->done && (pass == 1) && one_ell_conflict(word, FALSE))
      word->done = FALSE;
    // test_ambig_word() is only reached when the permuter test did not
    // already veto the word (short-circuit ||).
    if (word->done &&
        (pass == 1) &&
        (((word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
          (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
          (word->best_choice->permuter() != USER_DAWG_PERM) &&
          (word->best_choice->permuter() != NUMBER_PERM)) ||
         (test_ambig_word(word)))) {
      #ifndef SECURE_NAMES
      if (tessedit_rejection_debug)
        tprintf("\nVETO Tess accepting poor word \"%s\"\n",
                word->best_choice->unichar_string().string());
      #endif
      word->done = FALSE;
    }
  }
  /*
  5: as 3 + reject dict ambigs in both passes
  */
  else if (tessedit_ok_mode == 5) {
    word->done = word->tess_accepted &&
      (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
    if (word->done && (pass == 1) && one_ell_conflict(word, FALSE))
      word->done = FALSE;
    if (word->done &&
        (((word->best_choice->permuter() != SYSTEM_DAWG_PERM) &&
          (word->best_choice->permuter() != FREQ_DAWG_PERM) &&
          (word->best_choice->permuter() != USER_DAWG_PERM) &&
          (word->best_choice->permuter() != NUMBER_PERM)) ||
         (test_ambig_word(word)))) {
      #ifndef SECURE_NAMES
      if (tessedit_rejection_debug)
        tprintf("\nVETO Tess accepting poor word \"%s\"\n",
                word->best_choice->unichar_string().string());
      #endif
      word->done = FALSE;
    }
  }
  else {
    tprintf("BAD tessedit_ok_mode\n");
  }
}
/*************************************************************************
 * make_reject_map()
 *
 * Sets the done flag to indicate whether the result is acceptable.
 *
 * Sets a reject map for the word.
 *
 * word         - word result to process; its done flag and reject_map
 *                are written.
 * blob_choices - per-blob choice lists, forwarded to
 *                reject_poor_matches() in reject mode 0.
 * row          - not referenced by this function.
 * pass         - pass number, forwarded to set_done().
 *
 * tessedit_reject_mode selects the policy: 0 and 5 are handled, any other
 * value prints "BAD tessedit_reject_mode".
 *
 * NOTE(review): the listing this was taken from had lost several
 * statements (the reject_map initialisation and reject_blanks() call, the
 * whole-word rej_word_*() calls, the AC_UNACCEPTABLE comparison and the
 * get_isalpha() test). They are reconstructed here - confirm against the
 * REJMAP and UNICHARSET declarations before relying on them.
 *************************************************************************/
void Tesseract::make_reject_map(      //make rej map for wd //detailed results
                                WERD_RES *word,
                                BLOB_CHOICE_LIST_CLIST *blob_choices,
                                ROW *row,
                                inT16 pass  //1st or 2nd?
                               ) {
  int i;
  int offset;

  flip_0O(word);
  check_debug_pt(word, -1);  // For trap only
  set_done(word, pass);      // Set acceptance
  // One reject-map entry per unichar of the best choice, then flag blanks.
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);
  /*
  0: Rays original heuristic - the baseline
  */
  if (tessedit_reject_mode == 0) {
    if (!word->done)
      reject_poor_matches(word, blob_choices);
  } else if (tessedit_reject_mode == 5) {
    /*
    5: Reject I/1/l from words where there is no strong contextual confirmation;
      the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
      and the whole of any words which are very small
    */
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, TRUE);
      /*
        Originally the code here just used the done flag. Now I have duplicated
        and unpacked the conditions for setting the done flag so that each
        mechanism can be turned on or off independently. This works WITHOUT
        affecting the done flag setting.
      */
      if (rej_use_tess_accepted && !word->tess_accepted)
        word->reject_map.rej_word_not_tess_accepted();

      if (rej_use_tess_blanks &&
          (strchr(word->best_choice->unichar_string().string(), ' ') != NULL))
        word->reject_map.rej_word_contains_blanks();

      WERD_CHOICE* best_choice = word->best_choice;
      if (rej_use_good_perm) {
        if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
             best_choice->permuter() == FREQ_DAWG_PERM ||
             best_choice->permuter() == USER_DAWG_PERM) &&
            (!rej_use_sensible_wd ||
             acceptable_word_string(*word->uch_set,
                                    best_choice->unichar_string().string(),
                                    best_choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE)) {
          // PASSED TEST
        } else if (best_choice->permuter() == NUMBER_PERM) {
          if (rej_alphas_in_number_perm) {
            // i indexes unichars; offset is the byte position of unichar i
            // inside unichar_string().
            for (i = 0, offset = 0;
                 best_choice->unichar_string()[offset] != '\0';
                 offset += best_choice->unichar_lengths()[i++]) {
              if (word->reject_map[i].accepted() &&
                  word->uch_set->get_isalpha(
                      best_choice->unichar_string().string() + offset,
                      best_choice->unichar_lengths()[i]))
                word->reject_map[i].setrej_bad_permuter();
              // rej alpha
            }
          }
        } else {
          word->reject_map.rej_word_bad_permuter();
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
  }

  if (tessedit_image_border > -1)
    reject_edge_blobs(word);

  check_debug_pt(word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
    tprintf("Certainty: %f Rating: %f\n",
            word->best_choice->certainty(), word->best_choice->rating());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}
} // namespace tesseract
float compute_reject_threshold ( BLOB_CHOICE_LIST_CLIST *  blob_choices)

Definition at line 370 of file reject.cpp.

{
inT16 index; //to ratings
inT16 blob_count; //no of blobs in word
inT16 ok_blob_count = 0; //non TESS rej blobs in word
float *ratings; //array of confidences
float threshold; //rejection threshold
float bestgap; //biggest gap
float gapstart; //bottom of gap
//super iterator
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
BLOB_CHOICE_IT choice_it; //real iterator
blob_count = blob_choices->length ();
ratings = (float *) alloc_mem (blob_count * sizeof (float));
for (list_it.mark_cycle_pt (), index = 0;
!list_it.cycled_list (); list_it.forward (), index++) {
choice_it.set_to_list (list_it.data ());
if (choice_it.length () > 0) {
ratings[ok_blob_count] = choice_it.data ()->certainty ();
//get in an array
// tprintf("Rating[%d]=%c %g %g\n",
// index,choice_it.data()->char_class(),
// choice_it.data()->rating(),choice_it.data()->certainty());
ok_blob_count++;
}
}
ASSERT_HOST (index == blob_count);
qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
//sort them
bestgap = 0;
gapstart = ratings[0] - 1; //all reject if none better
if (ok_blob_count >= 3) {
for (index = 0; index < ok_blob_count - 1; index++) {
if (ratings[index + 1] - ratings[index] > bestgap) {
bestgap = ratings[index + 1] - ratings[index];
//find biggest
gapstart = ratings[index];
}
}
}
threshold = gapstart + bestgap / 2;
// tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
// ratings[0],ratings[index],bestgap,threshold);
free_mem(ratings);
return threshold;
}
void reject_blanks ( WERD_RES word)

Definition at line 290 of file reject.cpp.

{
inT16 i;
inT16 offset;
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
offset += word->best_choice->unichar_lengths()[i], i += 1) {
if (word->best_choice->unichar_string()[offset] == ' ')
//rej unrecognised blobs
word->reject_map[i].setrej_tess_failure ();
}
}
void reject_poor_matches ( WERD_RES word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Definition at line 319 of file reject.cpp.

{
float threshold;
inT16 i = 0;
inT16 offset = 0;
//super iterator
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
BLOB_CHOICE_IT choice_it; //real iterator
#ifndef SECURE_NAMES
if (strlen(word->best_choice->unichar_lengths().string()) !=
list_it.length()) {
("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
word->box_word->length());
}
#endif
ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
list_it.length ());
ASSERT_HOST(word->box_word->length() == list_it.length());
threshold = compute_reject_threshold (blob_choices);
for (list_it.mark_cycle_pt ();
!list_it.cycled_list (); list_it.forward (), i++,
offset += word->best_choice->unichar_lengths()[i]) {
/* NB - only compares the threshold against the TOP choice char in the
choices list for a blob !! - the selected one may be below the threshold
*/
choice_it.set_to_list (list_it.data ());
if ((word->best_choice->unichar_string()[offset] == ' ') ||
(choice_it.length () == 0))
//rej unrecognised blobs
word->reject_map[i].setrej_tess_failure ();
else if (choice_it.data ()->certainty () < threshold)
//rej poor score blob
word->reject_map[i].setrej_poor_match ();
}
}