Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::TessLangModel Class Reference

#include <tess_lang_model.h>

Inheritance diagram for tesseract::TessLangModel:
tesseract::LangModel

List of all members.

Public Member Functions

 TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
 ~TessLangModel ()
TessLangModEdge * Root ()
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
bool IsLeadingPunc (char_32 ch)
bool IsTrailingPunc (char_32 ch)
bool IsDigit (char_32 ch)
void RemoveInvalidCharacters (string *lm_str)
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
virtual ~LangModel ()
bool OOD ()
bool Numeric ()
bool WordList ()
bool Punc ()
void SetOOD (bool ood)
void SetNumeric (bool numeric)
void SetWordList (bool word_list)
void SetPunc (bool punc_enabled)

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
bool numeric_enabled_
bool word_list_enabled_
bool punc_enabled_

Detailed Description

Definition at line 38 of file tess_lang_model.h.


Constructor & Destructor Documentation

tesseract::TessLangModel::TessLangModel ( const string &  lm_params,
const string &  data_file_path,
bool  load_system_dawg,
TessdataManager *  tessdata_manager,
CubeRecoContext *  cntxt 
)

Definition at line 60 of file tess_lang_model.cpp.

{
  // Remember the recognition context and cache its HasCase() answer.
  cntxt_ = cntxt;
  has_case_ = cntxt_->HasCase();

  // Load the remaining language model elements described by lm_params.
  LoadLangModelElements(lm_params);

  // Word DAWGs are only set up when the tessdata manager can seek to the
  // Cube unicharset entry; otherwise the model runs with no DAWG vector.
  if (!tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
    word_dawgs_ = NULL;
    return;
  }

  word_dawgs_ = new DawgVector();

  // The system DAWG is added only when the caller asked for it and the
  // tessdata manager can seek to the Cube system DAWG entry.
  const bool use_system_dawg =
      load_system_dawg &&
      tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG);
  if (use_system_dawg) {
    // The final constructor argument (the debug level) stays false until
    // Cube has a way to express its preferred debug level.
    *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(),
                                     cntxt_->Lang().c_str(),
                                     SYSTEM_DAWG_PERM, false);
  }
}
tesseract::TessLangModel::~TessLangModel ( )
inline

Definition at line 45 of file tess_lang_model.h.

{
  // Nothing to release when no DAWG vector was ever allocated.
  if (word_dawgs_ == NULL) {
    return;
  }
  // Release what the vector's entries point to first, then the vector itself.
  word_dawgs_->delete_data_pointers();
  delete word_dawgs_;
}

Member Function Documentation

LangModEdge ** tesseract::TessLangModel::GetEdges ( CharAltList *  alt_list,
LangModEdge *  edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 169 of file tess_lang_model.cpp.

{
// Builds and returns a new[]-allocated array of the edges that FanOut
// produces for alt_list, and stores the number of edges written in
// *edge_cnt. A NULL incoming edge means "start from the root"; otherwise
// the fan-out continues from the given edge. Returns NULL if the array
// pointer is NULL after allocation.
// NOTE(review): this body calls the incoming edge lang_mod_edge, while the
// signature printed with this listing names it edge -- presumably the
// declaration and the definition use different parameter names; confirm.
TessLangModEdge *tess_lm_edge =
reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
LangModEdge **edge_array = NULL;
(*edge_cnt) = 0;
// if we are starting from the root, we'll instantiate every DAWG
// and get the all the edges that emerge from the root
if (tess_lm_edge == NULL) {
// get DAWG count from Tesseract
int dawg_cnt = NumDawgs();
// preallocate the edge buffer
// (*edge_cnt temporarily holds the buffer capacity, not an edge count)
(*edge_cnt) = dawg_cnt * max_edge_;
edge_array = new LangModEdge *[(*edge_cnt)];
// On this early return *edge_cnt still holds the capacity computed above.
if (edge_array == NULL) {
return NULL;
}
// The loop initializer resets *edge_cnt to 0; from here on it is the running
// number of edges written and the offset at which FanOut writes next.
for (int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
const Dawg *curr_dawg = GetDawg(dawg_idx);
// Only look through word Dawgs (since there is a special way of
// handling numbers and punctuation).
if (curr_dawg->type() == DAWG_TYPE_WORD) {
(*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true,
edge_array + (*edge_cnt));
}
} // dawg
// The number and OOD fan-outs below run unconditionally in this body.
// NOTE(review): the buffer holds dawg_cnt * max_edge_ entries; whether that
// also covers the number_dawg_ and ood_dawg_ fan-outs depends on what
// NumDawgs() counts and on how many edges FanOut can emit -- confirm.
(*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
edge_array + (*edge_cnt));
// OOD: it is intentionally not added to the list to make sure it comes
// at the end
(*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
edge_array + (*edge_cnt));
// set the root flag for all root edges
for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
edge_array[edge_idx]->SetRoot(true);
}
} else { // not starting at the root
// preallocate the edge buffer
(*edge_cnt) = max_edge_;
// allocate memory for edges
edge_array = new LangModEdge *[(*edge_cnt)];
// On this early return *edge_cnt still holds max_edge_.
if (edge_array == NULL) {
return NULL;
}
// get the FanOut edges from the root of each dawg
// (here the fan-out continues from the incoming edge's own DAWG, end edge,
// edge mask and edge string, and its result replaces *edge_cnt)
(*edge_cnt) = FanOut(alt_list,
tess_lm_edge->GetDawg(),
tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(),
tess_lm_edge->EdgeString(), false, edge_array);
}
return edge_array;
}
bool tesseract::TessLangModel::IsDigit ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 162 of file tess_lang_model.cpp.

{
  // ch counts as a digit exactly when it occurs in digits_.
  const bool in_digit_set = (digits_.find(ch) != string::npos);
  return in_digit_set;
}
bool tesseract::TessLangModel::IsLeadingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 154 of file tess_lang_model.cpp.

{
  // ch counts as leading punctuation exactly when it occurs in lead_punc_.
  const bool in_lead_punc_set = (lead_punc_.find(ch) != string::npos);
  return in_lead_punc_set;
}
bool tesseract::TessLangModel::IsTrailingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 158 of file tess_lang_model.cpp.

{
  // ch counts as trailing punctuation exactly when it occurs in trail_punc_.
  const bool in_trail_punc_set = (trail_punc_.find(ch) != string::npos);
  return in_trail_punc_set;
}
bool tesseract::TessLangModel::IsValidSequence ( const char_32 *  sequence,
bool  eow_flag,
LangModEdge **  final_edge = NULL 
)
virtual

Implements tesseract::LangModel.

Definition at line 145 of file tess_lang_model.cpp.

{
  // When the caller supplied an output slot, clear it before validating.
  if (final_edge) {
    *final_edge = NULL;
  }
  // Forward to the four-argument IsValidSequence overload, passing NULL as
  // its first argument.
  return IsValidSequence(NULL, sequence, eow_flag, final_edge);
}
void tesseract::TessLangModel::RemoveInvalidCharacters ( string *  lm_str)

Definition at line 482 of file tess_lang_model.cpp.

{
  // Rewrites *lm_str without the characters that the context's character set
  // maps to INVALID_UNICHAR_ID. The string is only touched when at least one
  // character was dropped.
  CharSet *char_set = cntxt_->CharacterSet();

  // UTF-32 working copy of the input.
  // NOTE(review): this declaration was missing from the listing, which used
  // lm_str32 without declaring it. The type is assumed to be the char_32
  // string that CubeUtils::UTF8ToUTF32 fills -- confirm against the original
  // tess_lang_model.cpp.
  std::basic_string<char_32> lm_str32;
  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &lm_str32);

  int len = CubeUtils::StrLen(lm_str32.c_str());
  // Scratch buffer sized for every input character plus the terminator.
  char_32 *clean_str32 = new char_32[len + 1];
  if (!clean_str32)
    return;

  // Copy across only the characters that have a valid class id.
  int clean_len = 0;
  for (int i = 0; i < len; ++i) {
    int class_id = char_set->ClassID((char_32)lm_str32[i]);
    if (class_id != INVALID_UNICHAR_ID) {
      clean_str32[clean_len] = lm_str32[i];
      ++clean_len;
    }
  }
  clean_str32[clean_len] = 0;

  // Convert back to UTF-8 only if something was actually removed.
  if (clean_len < len) {
    lm_str->clear();
    CubeUtils::UTF32ToUTF8(clean_str32, lm_str);
  }
  delete [] clean_str32;
}
TessLangModEdge* tesseract::TessLangModel::Root ( )
inline virtual

Implements tesseract::LangModel.

Definition at line 53 of file tess_lang_model.h.

{
// The root is represented by a NULL edge pointer; GetEdges takes its
// "starting from the root" branch when the incoming edge is NULL.
return NULL;
}

The documentation for this class was generated from the following files: