Tesseract
3.02
|
#include <trie.h>
Public Types | |
enum | RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE } |
Public Member Functions | |
Trie (DawgType type, const STRING &lang, PermuterType perm, uinT64 max_num_edges, int unicharset_size, int debug_level) | |
virtual | ~Trie () |
void | clear () |
EDGE_REF | edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const |
void | unichar_ids_of (NODE_REF node, NodeChildVector *vec) const |
NODE_REF | next_node (EDGE_REF edge_ref) const |
bool | end_of_word (EDGE_REF edge_ref) const |
UNICHAR_ID | edge_letter (EDGE_REF edge_ref) const |
void | print_node (NODE_REF node, int max_num_edges) const |
SquishedDawg * | trie_to_dawg () |
bool | read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse) |
bool | read_pattern_list (const char *filename, const UNICHARSET &unicharset) |
void | initialize_patterns (UNICHARSET *unicharset) |
void | unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const |
virtual EDGE_REF | pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const |
bool | add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions) |
bool | add_word_to_dawg (const WERD_CHOICE &word) |
![]() | |
DawgType | type () const |
const STRING & | lang () const |
PermuterType | permuter () const |
virtual | ~Dawg () |
bool | word_in_dawg (const WERD_CHOICE &word) const |
Returns true if the given word is in the Dawg. | |
int | check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const |
void | iterate_words (const UNICHARSET &unicharset, TessCallback1< const char * > *cb) const |
Static Public Member Functions | |
static const char * | get_reverse_policy_name (RTLReversePolicy reverse_policy) |
Static Public Attributes | |
static const int | kSaneNumConcreteChars = 4 |
static const char | kAlphaPatternUnicode [] = "\u2000" |
static const char | kDigitPatternUnicode [] = "\u2001" |
static const char | kAlphanumPatternUnicode [] = "\u2002" |
static const char | kPuncPatternUnicode [] = "\u2003" |
static const char | kLowerPatternUnicode [] = "\u2004" |
static const char | kUpperPatternUnicode [] = "\u2005" |
![]() | |
static const inT16 | kDawgMagicNumber = 42 |
Magic number to determine endianness when reading the Dawg from file. | |
static const UNICHAR_ID | kPatternUnicharID = 0 |
Protected Member Functions | |
EDGE_RECORD * | deref_edge_ref (EDGE_REF edge_ref) const |
EDGE_REF | make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const |
void | link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | print_edge_rec (const EDGE_RECORD &edge_rec) const |
bool | can_be_eliminated (const EDGE_RECORD &edge_rec) |
void | print_all (const char *msg, int max_num_edges) |
bool | edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const |
bool | add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id) |
bool | add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id) |
void | add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id) |
NODE_REF | new_dawg_node () |
void | remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id) |
void | remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) |
bool | eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2) |
bool | reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, const EDGE_VECTOR &backward_edges, NODE_MARKER reduced_nodes) |
void | sort_edges (EDGE_VECTOR *edges) |
void | reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes) |
UNICHAR_ID | character_class_to_pattern (char ch) |
![]() | |
Dawg () | |
NODE_REF | next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the next node visited by following this edge. | |
bool | marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the marker flag of this edge. | |
int | direction_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns the direction flag of this edge. | |
bool | end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns true if this edge marks the end of a word. | |
UNICHAR_ID | unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const |
Returns UNICHAR_ID recorded in this edge. | |
void | set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value) |
Sets the next node link for this edge in the Dawg. | |
void | set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec) |
Sets this edge record to be the last one in a sequence of edges. | |
int | given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const |
bool | edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const |
void | init (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level) |
bool | match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const |
void | iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const char * > *cb) const |
Protected Attributes | |
TRIE_NODES | nodes_ |
uinT64 | num_edges_ |
uinT64 | max_num_edges_ |
uinT64 | deref_direction_mask_ |
uinT64 | deref_node_index_mask_ |
bool | initialized_patterns_ |
UNICHAR_ID | alpha_pattern_ |
UNICHAR_ID | digit_pattern_ |
UNICHAR_ID | alphanum_pattern_ |
UNICHAR_ID | punc_pattern_ |
UNICHAR_ID | lower_pattern_ |
UNICHAR_ID | upper_pattern_ |
![]() | |
DawgType | type_ |
STRING | lang_ |
PermuterType | perm_ |
Permuter code that should be used if the word is found in this Dawg. | |
int | unicharset_size_ |
int | flag_start_bit_ |
int | next_node_start_bit_ |
uinT64 | next_node_mask_ |
uinT64 | flags_mask_ |
uinT64 | letter_mask_ |
int | debug_level_ |
Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.
|
inline |
Definition at line 89 of file trie.h.
|
inlinevirtual |
Definition at line 98 of file trie.h.
|
protected |
Definition at line 123 of file trie.cpp.
|
inlineprotected |
Definition at line 332 of file trie.h.
|
protected |
Definition at line 156 of file trie.cpp.
bool tesseract::Trie::add_word_to_dawg | ( | const WERD_CHOICE & | word, |
const GenericVector< bool > * | repetitions | ||
) |
Definition at line 173 of file trie.cpp.
|
inline |
|
inlineprotected |
|
protected |
Definition at line 346 of file trie.cpp.
void tesseract::Trie::clear | ( | ) |
|
inlineprotected |
Definition at line 267 of file trie.h.
|
inlinevirtual |
Returns the edge that corresponds to the letter out of this node.
Implements tesseract::Dawg.
Definition at line 104 of file trie.h.
|
protected |
Definition at line 72 of file trie.cpp.
|
inlinevirtual |
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 145 of file trie.h.
|
protected |
Definition at line 512 of file trie.cpp.
|
inlinevirtual |
Returns true if the edge indicated by the given EDGE_REF marks the end of a word.
Implements tesseract::Dawg.
Definition at line 139 of file trie.h.
|
static |
void tesseract::Trie::initialize_patterns | ( | UNICHARSET * | unicharset | ) |
Definition at line 307 of file trie.cpp.
|
inlineprotected |
Sets up this edge record to the requested values.
Definition at line 282 of file trie.h.
|
inlineprotected |
Constructs EDGE_REF from the given node_index and edge_index.
Returns the next node visited by following the edge indicated by the given EDGE_REF.
Implements tesseract::Dawg.
Definition at line 130 of file trie.h.
|
inlinevirtual |
Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.
Reimplemented from tesseract::Dawg.
Definition at line 223 of file trie.h.
|
inlineprotected |
|
inlineprotected |
Prints the given EDGE_RECORD.
|
virtual |
Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.
Implements tesseract::Dawg.
Definition at line 648 of file trie.cpp.
bool tesseract::Trie::read_pattern_list | ( | const char * | filename, |
const UNICHARSET & | unicharset | ||
) |
Definition at line 364 of file trie.cpp.
bool tesseract::Trie::read_word_list | ( | const char * | filename, |
const UNICHARSET & | unicharset, | ||
Trie::RTLReversePolicy | reverse | ||
) |
Definition at line 268 of file trie.cpp.
|
protected |
Definition at line 563 of file trie.cpp.
|
protected |
Eliminates any redundant edges from this node in the Trie.
Definition at line 615 of file trie.cpp.
|
inlineprotected |
Definition at line 357 of file trie.h.
|
protected |
Definition at line 446 of file trie.cpp.
|
protected |
Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.
Definition at line 598 of file trie.cpp.
SquishedDawg * tesseract::Trie::trie_to_dawg | ( | ) |
Definition at line 465 of file trie.cpp.
|
virtual |
Fills vec with unichar ids that represent the character classes of the given unichar_id.
Reimplemented from tesseract::Dawg.
Definition at line 324 of file trie.cpp.
|
inlinevirtual |
Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.
Implements tesseract::Dawg.
Definition at line 117 of file trie.h.
|
protected |
|
protected |
|
protected |
|
static |
|
static |
|
static |
|
static |
|
static |
|
static |
|
protected |
|
protected |
|
protected |
|
protected |