29 #pragma warning(disable:4244) // Conversion warnings
30 #pragma warning(disable:4800) // int/bool warnings
77 " direction %d word_end %d unichar_id %d, exploring node:\n",
78 node_ref, next_node, direction, word_end, unichar_id);
79 if (node_ref != NO_EDGE) {
83 if (node_ref == NO_EDGE)
return false;
86 nodes_[node_ref]->forward_edges :
nodes_[node_ref]->backward_edges;
87 int vec_size = vec.
size();
93 while (start <= end) {
94 k = (start + end) >> 1;
98 *edge_ptr = &(vec[k]);
101 }
else if (compare == 1) {
108 for (
int i = 0; i < vec_size; ++i) {
114 *edge_ptr = &(edge_rec);
128 &(
nodes_[node1]->forward_edges) : &(
nodes_[node1]->backward_edges);
132 while (search_index < vec->size() &&
134 (*vec)[search_index]) == 1) {
138 search_index = vec->
size();
141 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
142 if (search_index < vec->size()) {
143 vec->
insert(edge_rec, search_index);
163 unichar_id, &back_edge_ptr, &back_edge_index));
175 if (word.
length() <= 0)
return false;
178 for (
int i = 0; i < word.
length(); ++i) {
186 bool marker_flag =
false;
189 inT32 still_finding_chars =
true;
190 inT32 word_end =
false;
191 bool add_failed =
false;
197 for (i = 0; i < word.
length() - 1; ++i) {
199 marker_flag = (repetitions !=
NULL) ? (*repetitions)[i] :
false;
201 if (still_finding_chars) {
203 unichar_id, &edge_ptr, &edge_index);
206 edge_index, last_node);
209 still_finding_chars =
false;
212 still_finding_chars =
false;
219 if (!still_finding_chars) {
223 if (the_next_node == 0) {
228 marker_flag, word_end, unichar_id)) {
233 last_node = the_next_node;
238 marker_flag = (repetitions !=
NULL) ? (*repetitions)[i] :
false;
240 if (still_finding_chars &&
242 unichar_id, &edge_ptr, &edge_index)) {
246 marker_flag, unichar_id);
249 !
add_new_edge(last_node, the_next_node, marker_flag,
true, unichar_id))
253 tprintf(
"Re-initializing document dictionary...\n");
263 if (node ==
NULL)
return 0;
287 tprintf(
"Read %d words so far\n", word_count);
292 tprintf(
"Error: word '%s' not in DAWG after adding it\n",
string);
297 tprintf(
"Skipping invalid word %s\n",
string);
302 tprintf(
"Read %d words total.\n", word_count);
327 bool is_alpha = unicharset.
get_isalpha(unichar_id);
349 }
else if (ch ==
'd') {
351 }
else if (ch ==
'n') {
353 }
else if (ch ==
'p') {
355 }
else if (ch ==
'a') {
357 }
else if (ch ==
'A') {
360 return INVALID_UNICHAR_ID;
367 tprintf(
"please call initialize_patterns() before read_pattern_list()\n");
371 FILE *pattern_file =
open_file (filename,
"r");
372 if (pattern_file ==
NULL) {
373 tprintf(
"Error opening pattern file %s\n", filename);
377 int pattern_count = 0;
385 const char *str_ptr = string;
386 int step = unicharset.
step(str_ptr);
389 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
390 if (step == 1 && *str_ptr ==
'\\') {
392 if (*str_ptr ==
'\\') {
396 tprintf(
"Please provide at least %d concrete characters at the"
407 if (curr_unichar_id == INVALID_UNICHAR_ID) {
414 step = unicharset.
step(str_ptr);
416 if (step == 1 && *str_ptr ==
'\\' && *(str_ptr+1) ==
'*') {
417 repetitions_vec[repetitions_vec.
size()-1] =
true;
419 step = unicharset.
step(str_ptr);
423 tprintf(
"Invalid user pattern %s\n",
string);
428 tprintf(
"Inserting expanded user pattern %s\n",
434 tprintf(
"Error: failed to insert pattern '%s'\n",
string);
440 tprintf(
"Read %d valid patterns from %s\n", pattern_count, filename);
442 fclose(pattern_file);
451 unichar_id, &edge_ptr, &edge_index));
470 for (
int i = 0; i <
nodes_.
size(); i++) reduced_nodes[i] = 0;
472 delete[] reduced_nodes;
483 node_ref_map[i+1] = node_ref_map[i] +
nodes_[i]->forward_edges.
size();
485 int num_forward_edges = node_ref_map[i];
495 for (j = 0; j < end; ++j) {
506 delete[] node_ref_map;
516 tprintf(
"\nCollapsing node %d:\n", node);
544 curr_word_end, curr_unichar_id);
547 curr_word_end, curr_unichar_id,
548 &edge_ptr, &edge_index));
555 next_node2_num_edges, next_node2);
571 bool did_something =
false;
572 for (
int i = edge_index; i < backward_edges.
size() - 1; ++i) {
574 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
575 while (i < backward_edges.
size() &&
579 if (i == backward_edges.
size() || curr_unichar_id != unichar_id)
break;
582 for (
int j = i + 1; j < backward_edges.
size(); ++j) {
583 const EDGE_RECORD &next_edge_rec = backward_edges[j];
590 did_something =
true;
595 return did_something;
599 int num_edges = edges->
size();
600 if (num_edges <= 1)
return;
601 for (
int i = 0; i < num_edges - 1; ++i) {
603 for (
int j = (i + 1); j < num_edges; ++j) {
609 (*edges)[i] = (*edges)[min];
610 (*edges)[min] = temp;
625 while (edge_index < backward_edges.
size()) {
629 backward_edges, reduced_nodes));
630 while (++edge_index < backward_edges.
size() &&
633 reduced_nodes[node] =
true;
640 for (
int i = 0; i < backward_edges.
size(); ++i) {
642 if (next_node != 0 && !reduced_nodes[next_node]) {
649 if (node == NO_EDGE)
return;
654 for (
int dir = 0; dir < 2; ++dir) {
663 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
664 i < max_num_edges; ++i) {
668 if (dir == 0 ? i < num_fwd : i < num_bkw)
tprintf(
"...");