Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

List of all members.

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level)
virtual ~Trie ()
void clear ()
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
void unichar_ids_of (NODE_REF node, NodeChildVector *vec) const
NODE_REF next_node (EDGE_REF edge_ref) const
bool end_of_word (EDGE_REF edge_ref) const
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
void print_node (NODE_REF node, int max_num_edges) const
SquishedDawgtrie_to_dawg ()
bool read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
void initialize_patterns (UNICHARSET *unicharset)
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
bool add_word_to_dawg (const WERD_CHOICE &word)
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
const STRINGlang () const
PermuterType permuter () const
virtual ~Dawg ()
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg.
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char * > *cb) const

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)

Static Public Attributes

static const int kSaneNumConcreteChars = 4
static const char kAlphaPatternUnicode [] = "\u2000"
static const char kDigitPatternUnicode [] = "\u2001"
static const char kAlphanumPatternUnicode [] = "\u2002"
static const char kPuncPatternUnicode [] = "\u2003"
static const char kLowerPatternUnicode [] = "\u2004"
static const char kUpperPatternUnicode [] = "\u2005"
- Static Public Attributes inherited from tesseract::Dawg
static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file.
static const UNICHAR_ID kPatternUnicharID = 0

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void print_edge_rec (const EDGE_RECORD &edge_rec) const
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
void print_all (const char *msg, int max_num_edges)
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
NODE_REF new_dawg_node ()
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes)
void sort_edges (EDGE_VECTOR *edges)
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
UNICHAR_ID character_class_to_pattern (char ch)
- Protected Member Functions inherited from tesseract::Dawg
 Dawg ()
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge.
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge.
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge.
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word.
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge.
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg.
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges.
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
void init (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const char * > *cb) const

Protected Attributes

TRIE_NODES nodes_
uinT64 num_edges_
uinT64 max_num_edges_
uinT64 deref_direction_mask_
uinT64 deref_node_index_mask_
bool initialized_patterns_
UNICHAR_ID alpha_pattern_
UNICHAR_ID digit_pattern_
UNICHAR_ID alphanum_pattern_
UNICHAR_ID punc_pattern_
UNICHAR_ID lower_pattern_
UNICHAR_ID upper_pattern_
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
STRING lang_
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg.
int unicharset_size_
int flag_start_bit_
int next_node_start_bit_
uinT64 next_node_mask_
uinT64 flags_mask_
uinT64 letter_mask_
int debug_level_

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 62 of file trie.h.


Member Enumeration Documentation

Enumerator:
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 64 of file trie.h.


Constructor & Destructor Documentation

tesseract::Trie::Trie ( DawgType  type,
const STRING lang,
PermuterType  perm,
uinT64  max_num_edges,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 89 of file trie.h.

{
init(type, lang, perm, unicharset_size, debug_level);
max_num_edges_ = max_num_edges;
new_dawg_node(); // need to allocate node 0
}
virtual tesseract::Trie::~Trie ( )
inlinevirtual

Definition at line 98 of file trie.h.


Member Function Documentation

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 123 of file trie.cpp.

{
if (num_edges_ == max_num_edges_) return false;
&(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
int search_index;
if (node1 == 0) {
search_index = 0; // find the index to make the add sorted
while (search_index < vec->size() &&
given_greater_than_edge_rec(node2, word_end, unichar_id,
(*vec)[search_index]) == 1) {
search_index++;
}
} else {
search_index = vec->size(); // add is unsorted, so index does not matter
}
EDGE_RECORD edge_rec;
link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
if (search_index < vec->size()) {
vec->insert(edge_rec, search_index);
} else {
vec->push_back(edge_rec);
}
if (debug_level_ > 1) {
tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
print_edge_rec(edge_rec);
tprintf("\n");
}
return true;
}
bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 332 of file trie.h.

{
return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
word_end, unichar_id) &&
add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
word_end, unichar_id));
}
void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 156 of file trie.cpp.

{
EDGE_RECORD *back_edge_ptr;
EDGE_INDEX back_edge_index;
ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
unichar_id, &back_edge_ptr, &back_edge_index));
if (marker_flag) {
*back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
*edge_ptr |= (MARKER_FLAG << flag_start_bit_);
}
// Mark both directions as end of word.
*back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const GenericVector< bool > *  repetitions 
)

Definition at line 173 of file trie.cpp.

{
if (word.length() <= 0) return false; // can't add empty words
if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
// Make sure the word does not contain invalid unchar ids.
for (int i = 0; i < word.length(); ++i) {
if (word.unichar_id(i) < 0 ||
word.unichar_id(i) >= unicharset_size_) return false;
}
EDGE_RECORD *edge_ptr;
NODE_REF last_node = 0;
NODE_REF the_next_node;
bool marker_flag = false;
EDGE_INDEX edge_index;
int i;
inT32 still_finding_chars = true;
inT32 word_end = false;
bool add_failed = false;
bool found;
if (debug_level_ > 1) word.print("\nAdding word: ");
UNICHAR_ID unichar_id;
for (i = 0; i < word.length() - 1; ++i) {
unichar_id = word.unichar_id(i);
marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
if (still_finding_chars) {
found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
unichar_id, &edge_ptr, &edge_index);
if (found && debug_level_ > 1) {
tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
edge_index, last_node);
}
if (!found) {
still_finding_chars = false;
} else if (next_node_from_edge_rec(*edge_ptr) == 0) {
word_end = true;
still_finding_chars = false;
remove_edge(last_node, 0, word_end, unichar_id);
} else {
if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
last_node = next_node_from_edge_rec(*edge_ptr);
}
}
if (!still_finding_chars) {
the_next_node = new_dawg_node();
if (debug_level_ > 1)
tprintf("adding node " REFFORMAT "\n", the_next_node);
if (the_next_node == 0) {
add_failed = true;
break;
}
if (!add_new_edge(last_node, the_next_node,
marker_flag, word_end, unichar_id)) {
add_failed = true;
break;
}
word_end = false;
last_node = the_next_node;
}
}
the_next_node = 0;
unichar_id = word.unichar_id(i);
marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
if (still_finding_chars &&
edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
unichar_id, &edge_ptr, &edge_index)) {
// An extension of this word already exists in the trie, so we
// only have to add the ending flags in both directions.
marker_flag, unichar_id);
} else {
if (!add_failed &&
!add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
add_failed = true;
}
if (add_failed) {
tprintf("Re-initializing document dictionary...\n");
clear();
return false;
} else {
return true;
}
}
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word)
inline

Definition at line 245 of file trie.h.

{
return add_word_to_dawg(word, NULL);
}
bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec)
inlineprotected

Definition at line 302 of file trie.h.

{
NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
return (node_ref != NO_EDGE &&
nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
}
UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

Definition at line 346 of file trie.cpp.

{
if (ch == 'c') {
} else if (ch == 'd') {
} else if (ch == 'n') {
} else if (ch == 'p') {
return punc_pattern_;
} else if (ch == 'a') {
} else if (ch == 'A') {
} else {
return INVALID_UNICHAR_ID;
}
}
void tesseract::Trie::clear ( )

Definition at line 65 of file trie.cpp.

{
new_dawg_node(); // Need to allocate node 0.
}
EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inlineprotected

Definition at line 267 of file trie.h.

{
int edge_index = static_cast<int>(
(edge_ref & letter_mask_) >> LETTER_START_BIT);
int node_index = static_cast<int>(
TRIE_NODE_RECORD *node_rec = nodes_[node_index];
return &(node_rec->forward_edges[edge_index]);
}
EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 104 of file trie.h.

{
EDGE_RECORD *edge_ptr;
EDGE_INDEX edge_index;
if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
&edge_ptr, &edge_index)) return NO_EDGE;
return make_edge_ref(node_ref, edge_index);
}
bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const
protected

Definition at line 72 of file trie.cpp.

{
if (debug_level_ == 3) {
tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
" direction %d word_end %d unichar_id %d, exploring node:\n",
node_ref, next_node, direction, word_end, unichar_id);
if (node_ref != NO_EDGE) {
print_node(node_ref, nodes_[node_ref]->forward_edges.size());
}
}
if (node_ref == NO_EDGE) return false;
assert(node_ref < nodes_.size());
nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
int vec_size = vec.size();
if (node_ref == 0) { // binary search
EDGE_INDEX start = 0;
EDGE_INDEX end = vec_size - 1;
int compare;
while (start <= end) {
k = (start + end) >> 1; // (start + end) / 2
unichar_id, vec[k]);
if (compare == 0) { // given == vec[k]
*edge_ptr = &(vec[k]);
*edge_index = k;
return true;
} else if (compare == 1) { // given > vec[k]
start = k + 1;
} else { // given < vec[k]
end = k - 1;
}
}
} else { // linear search
for (int i = 0; i < vec_size; ++i) {
EDGE_RECORD &edge_rec = vec[i];
if (edge_rec_match(next_node, word_end, unichar_id,
*edge_ptr = &(edge_rec);
*edge_index = i;
return true;
}
}
}
return false; // not found
}
UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inlinevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 145 of file trie.h.

{
if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
}
bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
)
protected

Definition at line 512 of file trie.cpp.

{
if (debug_level_ > 1) {
tprintf("\nCollapsing node %d:\n", node);
tprintf("Candidate edges: ");
tprintf(", ");
tprintf("\n\n");
}
NODE_REF next_node1 = next_node_from_edge_rec(edge1);
NODE_REF next_node2 = next_node_from_edge_rec(edge2);
TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
// Translate all edges going to/from next_node2 to go to/from next_node1.
EDGE_RECORD *edge_ptr = NULL;
EDGE_INDEX edge_index;
int i;
// Remove the backward link in node to next_node2.
const EDGE_RECORD &fwd_edge = next_node2_ptr->forward_edges[0];
// Copy all the backward links in next_node2 to node next_node1
for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
curr_word_end, curr_unichar_id);
// Relocate the corresponding forward edge in curr_next_node
ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
curr_word_end, curr_unichar_id,
&edge_ptr, &edge_index));
set_next_node_in_edge_rec(edge_ptr, next_node1);
}
int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
next_node2_ptr->backward_edges.size());
if (debug_level_ > 1) {
tprintf("removed %d edges from node " REFFORMAT "\n",
next_node2_num_edges, next_node2);
}
next_node2_ptr->forward_edges.clear();
next_node2_ptr->backward_edges.clear();
num_edges_ -= next_node2_num_edges;
return true;
}
bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inlinevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 139 of file trie.h.

{
if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
}
const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

Definition at line 60 of file trie.cpp.

{
return RTLReversePolicyNames[reverse_policy];
}
void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Sets up this edge record to the requested values.

Definition at line 282 of file trie.h.

{
EDGE_RECORD flags = 0;
if (repeats) flags |= MARKER_FLAG;
if (word_end) flags |= WERD_END_FLAG;
*edge = ((nxt << next_node_start_bit_) |
(static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
(static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
}
EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inlineprotected

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 276 of file trie.h.

{
return ((node_index << flag_start_bit_) |
(edge_index << LETTER_START_BIT));
}
NODE_REF tesseract::Trie::new_dawg_node ( )
protected

Definition at line 261 of file trie.cpp.

{
if (node == NULL) return 0; // failed to create new node
return nodes_.length() - 1;
}
NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inlinevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 130 of file trie.h.

{
if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
}
virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 223 of file trie.h.

{
if (edge_ref == NO_EDGE) return NO_EDGE;
EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
return (marker_flag_from_edge_rec(*edge_rec) &&
unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
word_end == end_of_word_from_edge_rec(*edge_rec)) ?
edge_ref : NO_EDGE;
}
void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inlineprotected

Definition at line 310 of file trie.h.

{
tprintf("\n__________________________\n%s\n", msg);
for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
tprintf("__________________________\n");
}
void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const
inlineprotected

Prints the given EDGE_RECORD.

Definition at line 293 of file trie.h.

{
tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
marker_flag_from_edge_rec(edge_rec) ? "R," : "",
(direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
}
void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 648 of file trie.cpp.

{
if (node == NO_EDGE) return; // nothing to print
TRIE_NODE_RECORD *node_ptr = nodes_[node];
int num_fwd = node_ptr->forward_edges.size();
int num_bkw = node_ptr->backward_edges.size();
for (int dir = 0; dir < 2; ++dir) {
if (dir == 0) {
vec = &(node_ptr->forward_edges);
tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
} else {
vec = &(node_ptr->backward_edges);
tprintf("\t");
}
int i;
for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
i < max_num_edges; ++i) {
print_edge_rec((*vec)[i]);
tprintf(" ");
}
if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
tprintf("\n");
}
}
bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 364 of file trie.cpp.

{
tprintf("please call initialize_patterns() before read_pattern_list()\n");
return false;
}
FILE *pattern_file = open_file (filename, "r");
if (pattern_file == NULL) {
tprintf("Error opening pattern file %s\n", filename);
return false;
}
int pattern_count = 0;
char string[CHARS_PER_LINE];
while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
chomp_string(string); // remove newline
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
bool failed = false;
while (step > 0) {
UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
if (step == 1 && *str_ptr == '\\') {
++str_ptr;
if (*str_ptr == '\\') { // regular '\' unichar that was escaped
curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
} else {
if (word.length() < kSaneNumConcreteChars) {
tprintf("Please provide at least %d concrete characters at the"
" beginning of the pattern\n", kSaneNumConcreteChars);
failed = true;
break;
}
// Parse character class from expression.
curr_unichar_id = character_class_to_pattern(*str_ptr);
}
} else {
curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
}
if (curr_unichar_id == INVALID_UNICHAR_ID) {
failed = true;
break; // failed to parse this pattern
}
word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
repetitions_vec.push_back(false);
str_ptr += step;
step = unicharset.step(str_ptr);
// Check if there is a repetition pattern specified after this unichar.
if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
repetitions_vec[repetitions_vec.size()-1] = true;
str_ptr += 2;
step = unicharset.step(str_ptr);
}
}
if (failed) {
tprintf("Invalid user pattern %s\n", string);
continue;
}
// Insert the pattern into the trie.
if (debug_level_ > 2) {
tprintf("Inserting expanded user pattern %s\n",
word.debug_string().string());
}
if (!this->word_in_dawg(word)) {
this->add_word_to_dawg(word, &repetitions_vec);
if (!this->word_in_dawg(word)) {
tprintf("Error: failed to insert pattern '%s'\n", string);
}
}
++pattern_count;
}
if (debug_level_) {
tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
}
fclose(pattern_file);
return true;
}
bool tesseract::Trie::read_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 268 of file trie.cpp.

{
FILE *word_file;
char string[CHARS_PER_LINE];
int word_count = 0;
word_file = open_file (filename, "r");
while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
chomp_string(string); // remove newline
WERD_CHOICE word(string, unicharset);
if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
word.has_rtl_unichar_id()) ||
reverse_policy == RRP_FORCE_REVERSE) {
word.reverse_and_mirror_unichar_ids();
}
++word_count;
if (debug_level_ && word_count % 10000 == 0)
tprintf("Read %d words so far\n", word_count);
if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
if (!this->word_in_dawg(word)) {
this->add_word_to_dawg(word);
if (!this->word_in_dawg(word)) {
tprintf("Error: word '%s' not in DAWG after adding it\n", string);
return false;
}
}
} else if (debug_level_) {
tprintf("Skipping invalid word %s\n", string);
if (debug_level_ >= 3) word.print();
}
}
tprintf("Read %d words total.\n", word_count);
fclose(word_file);
return true;
}
bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
const EDGE_VECTOR backward_edges,
NODE_MARKER  reduced_nodes 
)
protected

Definition at line 563 of file trie.cpp.

{
if (debug_level_ > 1)
tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
// Compare each of the edge pairs with the given unichar_id.
bool did_something = false;
for (int i = edge_index; i < backward_edges.size() - 1; ++i) {
// Find the first edge that can be eliminated.
UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
while (i < backward_edges.size() &&
((curr_unichar_id = unichar_id_from_edge_rec(backward_edges[i])) ==
unichar_id) &&
!can_be_eliminated(backward_edges[i])) ++i;
if (i == backward_edges.size() || curr_unichar_id != unichar_id) break;
const EDGE_RECORD &edge_rec = backward_edges[i];
// Compare it to the rest of the edges with the given unichar_id.
for (int j = i + 1; j < backward_edges.size(); ++j) {
const EDGE_RECORD &next_edge_rec = backward_edges[j];
if (unichar_id_from_edge_rec(next_edge_rec) != unichar_id) break;
if (end_of_word_from_edge_rec(next_edge_rec) ==
can_be_eliminated(next_edge_rec) &&
eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
did_something = true;
--j; // do not increment j if next_edge_rec was removed
}
}
}
return did_something;
}
void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

Definition at line 615 of file trie.cpp.

{
if (debug_level_ > 1) {
tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
}
EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
if (node != 0) sort_edges(&backward_edges);
EDGE_INDEX edge_index = 0;
while (edge_index < backward_edges.size()) {
UNICHAR_ID unichar_id =
unichar_id_from_edge_rec(backward_edges[edge_index]);
while (reduce_lettered_edges(edge_index, unichar_id, node,
backward_edges, reduced_nodes));
while (++edge_index < backward_edges.size() &&
unichar_id_from_edge_rec(backward_edges[edge_index]) == unichar_id);
}
reduced_nodes[node] = true; // mark as reduced
if (debug_level_ > 1) {
tprintf("Node " REFFORMAT " after reduction:\n", node);
}
for (int i = 0; i < backward_edges.size(); ++i) {
if (next_node != 0 && !reduced_nodes[next_node]) {
reduce_node_input(next_node, reduced_nodes);
}
}
}
void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 357 of file trie.h.

{
remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
}
void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 446 of file trie.cpp.

{
EDGE_RECORD *edge_ptr = NULL;
EDGE_INDEX edge_index = 0;
ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
unichar_id, &edge_ptr, &edge_index));
if (debug_level_ > 1) {
tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
print_edge_rec(*edge_ptr);
tprintf("\n");
}
nodes_[node1]->forward_edges.remove(edge_index);
} else {
nodes_[node1]->backward_edges.remove(edge_index);
}
}
void tesseract::Trie::sort_edges ( EDGE_VECTOR edges)
protected

Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 598 of file trie.cpp.

{
int num_edges = edges->size();
if (num_edges <= 1) return;
for (int i = 0; i < num_edges - 1; ++i) {
int min = i;
for (int j = (i + 1); j < num_edges; ++j) {
if (unichar_id_from_edge_rec((*edges)[j]) <
unichar_id_from_edge_rec((*edges)[min])) min = j;
}
if (i != min) {
EDGE_RECORD temp = (*edges)[i];
(*edges)[i] = (*edges)[min];
(*edges)[min] = temp;
}
}
}
SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 465 of file trie.cpp.

{
if (debug_level_ > 2) {
print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
}
NODE_MARKER reduced_nodes = new bool[nodes_.size()];
for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
this->reduce_node_input(0, reduced_nodes);
delete[] reduced_nodes;
if (debug_level_ > 2) {
print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
}
// Build a translation map from node indices in nodes_ vector to
// their target indices in EDGE_ARRAY.
NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
int i, j;
node_ref_map[0] = 0;
for (i = 0; i < nodes_.size(); ++i) {
node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
}
int num_forward_edges = node_ref_map[i];
// Convert nodes_ vector into EDGE_ARRAY translating the next node references
// in edges using node_ref_map. Empty nodes and backward edges are dropped.
EDGE_ARRAY edge_array =
(EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
EDGE_ARRAY edge_array_ptr = edge_array;
for (i = 0; i < nodes_.size(); ++i) {
TRIE_NODE_RECORD *node_ptr = nodes_[i];
int end = node_ptr->forward_edges.size();
for (j = 0; j < end; ++j) {
EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
ASSERT_HOST(node_ref < nodes_.size());
UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
end_of_word_from_edge_rec(edge_rec), unichar_id);
if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
++edge_array_ptr;
}
}
delete[] node_ref_map;
return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
}
void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const
virtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 324 of file trie.cpp.

{
bool is_alpha = unicharset.get_isalpha(unichar_id);
if (is_alpha) {
if (unicharset.get_islower(unichar_id)) {
} else if (unicharset.get_isupper(unichar_id)) {
}
}
if (unicharset.get_isdigit(unichar_id)) {
if (!is_alpha) vec->push_back(alphanum_pattern_);
}
if (unicharset.get_ispunctuation(unichar_id)) {
}
}
void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec 
) const
inlinevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 117 of file trie.h.

{
const EDGE_VECTOR &forward_edges =
nodes_[static_cast<int>(node)]->forward_edges;
for (int i = 0; i < forward_edges.size(); ++i) {
vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
make_edge_ref(node, i)));
}
}

Member Data Documentation

UNICHAR_ID tesseract::Trie::alpha_pattern_
protected

Definition at line 403 of file trie.h.

UNICHAR_ID tesseract::Trie::alphanum_pattern_
protected

Definition at line 405 of file trie.h.

uinT64 tesseract::Trie::deref_direction_mask_
protected

Definition at line 398 of file trie.h.

uinT64 tesseract::Trie::deref_node_index_mask_
protected

Definition at line 399 of file trie.h.

UNICHAR_ID tesseract::Trie::digit_pattern_
protected

Definition at line 404 of file trie.h.

bool tesseract::Trie::initialized_patterns_
protected

Definition at line 402 of file trie.h.

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002"
static

Definition at line 77 of file trie.h.

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000"
static

Definition at line 75 of file trie.h.

const char tesseract::Trie::kDigitPatternUnicode = "\u2001"
static

Definition at line 76 of file trie.h.

const char tesseract::Trie::kLowerPatternUnicode = "\u2004"
static

Definition at line 79 of file trie.h.

const char tesseract::Trie::kPuncPatternUnicode = "\u2003"
static

Definition at line 78 of file trie.h.

const int tesseract::Trie::kSaneNumConcreteChars = 4
static

Definition at line 71 of file trie.h.

const char tesseract::Trie::kUpperPatternUnicode = "\u2005"
static

Definition at line 80 of file trie.h.

UNICHAR_ID tesseract::Trie::lower_pattern_
protected

Definition at line 407 of file trie.h.

uinT64 tesseract::Trie::max_num_edges_
protected

Definition at line 397 of file trie.h.

TRIE_NODES tesseract::Trie::nodes_
protected

Definition at line 395 of file trie.h.

uinT64 tesseract::Trie::num_edges_
protected

Definition at line 396 of file trie.h.

UNICHAR_ID tesseract::Trie::punc_pattern_
protected

Definition at line 406 of file trie.h.

UNICHAR_ID tesseract::Trie::upper_pattern_
protected

Definition at line 408 of file trie.h.


The documentation for this class was generated from the following files: