dict/trie.cpp File Reference

#include "trie.h"
#include "callcpp.h"
#include <stdio.h>

Go to the source code of this file.

Functions

Variables


Function Documentation

void add_edge_linkage ( EDGE_ARRAY  dawg,
NODE_REF  node1,
NODE_REF  node2,
INT32  direction,
char  character,
INT32  word_end 
)

Add a single edge linkage to between the two nodes;.

can be used to add either forward or backward links.

Definition at line 46 of file trie.cpp.

References backward_edge, copy_edges, cprintf(), debug, DIRECTION_FLAG, edge_loop, edges_in_node(), FORWARD_EDGE, forward_edge, LAST_FLAG, link_edge, and WERD_END_FLAG.

Referenced by add_new_edge().

00051                                       {
00052   EDGE_REF edge1 = node1;
00053   EDGE_REF edge2;
00054   INT32      num_edges = edges_in_node (dawg, node1);
00055   INT32      last_one;
00056 
00057   word_end  = (word_end ? WERD_END_FLAG : 0);
00058 
00059   if (num_edges == 0) {          /* No edges yet */
00060     direction = ((direction == FORWARD_EDGE) ? DIRECTION_FLAG : 0);
00061     link_edge  (dawg, edge1, node2,  character,
00062       LAST_FLAG | direction | word_end);
00063   }
00064   else {
00065                                  /* Forward links */
00066     if (direction == FORWARD_EDGE) {
00067       last_one = (forward_edge (dawg, edge1) ? 0 : LAST_FLAG);
00068       if (debug)
00069         cprintf ("moving edges (nodes = %ld, %dl, num = %ld)\n",
00070           edge1, edge1+1, num_edges);
00071       copy_edges (dawg, edge1, edge1+1, num_edges);
00072       link_edge  (dawg, edge1, node2,  character,
00073         last_one | DIRECTION_FLAG | word_end);
00074     }
00075     else {                       /* Backward links */
00076 
00077       if (forward_edge (dawg, edge1))
00078         edge_loop(dawg, edge1); 
00079 
00080                                  /* Existing back edges */
00081       if (backward_edge (dawg,edge1)) {
00082         num_edges = 0;
00083         edge2 = edge1;
00084         do { num_edges++; }
00085         edge_loop(dawg, edge2); 
00086 
00087         if (debug)
00088           cprintf ("moving edges (nodes = %ld, %ld, num = %ld)\n",
00089             edge1, edge1+1, num_edges);
00090         copy_edges (dawg, edge1, edge1+1, num_edges);
00091         link_edge(dawg, edge1, node2, character, word_end); 
00092       }
00093       else {                     /* First back edge */
00094         link_edge  (dawg, edge1, node2,  character,
00095           LAST_FLAG | word_end);
00096       }
00097     }
00098   }
00099 }

void add_new_edge ( EDGE_ARRAY  dawg,
NODE_REF node1,
NODE_REF node2,
char  character,
INT32  word_end,
INT32  max_num_edges,
INT32  reserved_edges 
)

Add an edge between two nodes in the DAWG & link the nodes both ways.

Definition at line 105 of file trie.cpp.

References add_edge_linkage(), BACKWARD_EDGE, cprintf(), debug, direction(), edge_counter, FORWARD_EDGE, and move_node_if_needed.

Referenced by add_word_to_dawg().

00111                                         {
00112   int direction;
00113 
00114   if (debug)
00115     cprintf ("add_new_edge (nodes = %ld, %ld, ch = '%c', end = %d)\n",
00116       *node1, *node2,  character, word_end);
00117   edge_counter++;
00118 
00119   move_node_if_needed(dawg, *node1, max_num_edges, reserved_edges); 
00120   move_node_if_needed(dawg, *node2, max_num_edges, reserved_edges); 
00121 
00122   direction = (int) FORWARD_EDGE;
00123 
00124   add_edge_linkage(dawg, *node1, *node2, direction, character, word_end); 
00125 
00126   direction = (int) BACKWARD_EDGE;
00127   add_edge_linkage(dawg, *node2, *node1, direction, character, word_end); 
00128 }

void add_word_to_dawg ( EDGE_ARRAY  dawg,
char *  string,
INT32  max_num_edges,
INT32  reserved_edges 
)

Add in a word by creating the necessary nodes and edges.

Definition at line 134 of file trie.cpp.

References add_new_edge(), case_sensative, cprintf(), debug, DEFAULT_NODE_SIZE, edge_char_of(), edges_in_node(), FALSE, new_dawg_node(), next_node, NO_EDGE, remove_edge(), and TRUE.

Referenced by add_document_word(), and read_word_list().

00137                                             {
00138   EDGE_REF    edge;
00139   NODE_REF    last_node = 0;
00140   NODE_REF    the_next_node;
00141   INT32         i;
00142   INT32         still_finding_chars = TRUE;
00143   INT32         word_end = FALSE;
00144 
00145   for (i=0; i<strlen(string)-1; i++) {
00146     if (still_finding_chars) {
00147       edge = edge_char_of (dawg, last_node, string[i], word_end);
00148       if (debug) cprintf ("exploring edge = %d\n", edge);
00149       if (edge == NO_EDGE)
00150         still_finding_chars = FALSE;
00151       else
00152       if (next_node (dawg, edge) == 0) {
00153         word_end = TRUE;
00154         still_finding_chars = FALSE;
00155         if (! case_sensative) string[i] = tolower (string[i]);
00156         remove_edge (dawg, last_node, 0, string[i], word_end);
00157       }
00158       else {
00159         last_node = next_node (dawg, edge);
00160       }
00161     }
00162 
00163     if (! still_finding_chars) {
00164       the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
00165       max_num_edges, reserved_edges);
00166       if (edges_in_node (dawg, last_node) + last_node == the_next_node) {
00167         cprintf ("Node collision at %d\n", the_next_node);
00168         the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
00169          max_num_edges, reserved_edges);
00170       }
00171       if (! case_sensative) string[i] = tolower (string[i]);
00172       add_new_edge (dawg, &last_node, &the_next_node,
00173         string[i], word_end, max_num_edges, reserved_edges);
00174       word_end = FALSE;
00175       if (debug)
00176         cprintf ("adding node = %ld\n", the_next_node);
00177       last_node = the_next_node;
00178     }
00179   }
00180 
00181   the_next_node = 0;
00182   if (! case_sensative) string[i] = tolower (string[i]);
00183   add_new_edge (dawg, &last_node, &the_next_node,
00184     string[i], TRUE, max_num_edges, reserved_edges);
00185 
00186   if (edges_in_node (dawg, 0) > reserved_edges) {
00187     cprintf ("error: Not enough room in root node, %d\n",
00188       edges_in_node (dawg, 0));
00189     exit (1);
00190   }
00191 }

void initialize_dawg ( EDGE_ARRAY  dawg,
INT32  max_num_edges 
)

Initialize the DAWG data structure for further used & reset each of the edge cells to NO_EDGE.

Definition at line 198 of file trie.cpp.

References clear_all_edges.

Referenced by init_permute(), and read_word_list().

00198                                                            { 
00199   INT32 x;
00200 
00201   clear_all_edges(dawg, x, max_num_edges); 
00202 }

NODE_REF move_node ( EDGE_ARRAY  dawg,
NODE_REF  node,
INT32  max_num_edges,
INT32  reserved_edges 
)

Move the location in the edge array of this node in the DAWG.

Definition at line 208 of file trie.cpp.

References backward_edge, cprintf(), debug, edge_loop, EDGE_NUM_MARGIN, edges_in_node(), forward_edge, move_counter, move_edges, new_dawg_node(), next_node, print_dawg_node(), and relocate_edge().

00211                                          {
00212   NODE_REF   this_new_node;
00213   EDGE_REF   edge;
00214   INT32        num_edges = edges_in_node (dawg, node);
00215 
00216   if (debug)
00217     print_dawg_node(dawg, node); 
00218 
00219   this_new_node = new_dawg_node (dawg, num_edges + EDGE_NUM_MARGIN,
00220    max_num_edges, reserved_edges);
00221 
00222   if (debug)
00223     cprintf ("move_node  (from = %ld, to = %ld, num = %ld)\n",
00224       node, this_new_node, num_edges);
00225 
00226   move_edges(dawg, node, this_new_node, num_edges); 
00227 
00228   if (debug)
00229     print_dawg_node(dawg, this_new_node); 
00230 
00231   edge = this_new_node;
00232   if (forward_edge (dawg, edge)) {
00233     do {
00234       relocate_edge (dawg, next_node (dawg, edge), node, this_new_node);
00235     } edge_loop (dawg, edge);
00236   }
00237   if (backward_edge (dawg, edge)) {
00238     do {
00239       relocate_edge (dawg, next_node (dawg, edge), node, this_new_node);
00240     } edge_loop (dawg, edge);
00241   }
00242 
00243   move_counter++;
00244   return (this_new_node);
00245 }

NODE_REF new_dawg_node ( EDGE_ARRAY  dawg,
INT32  num_edges,
INT32  max_num_edges,
INT32  reserved_edges 
)

Make space for node in DAWG.

Create a space within the DAWG data structure to house a node that consists of the requested number of edges.

Definition at line 254 of file trie.cpp.

References cprintf(), edge_counter, edge_occupied, FALSE, long_rand(), MAX_WERD_LENGTH, move_counter, new_counter, NUM_PLACEMENT_ATTEMPTS, and TRUE.

Referenced by add_word_to_dawg(), and move_node().

00257                                              {
00258   INT32        i;
00259   INT32        n;
00260   INT32        edge_index;
00261   INT32        edge_collision;
00262   /* Try several times */
00263   for (i=0; i<NUM_PLACEMENT_ATTEMPTS; i++) {
00264 
00265     edge_index = reserved_edges +
00266       long_rand (max_num_edges - MAX_WERD_LENGTH - reserved_edges);
00267     edge_collision = FALSE;
00268     /* Find N adjacent slots */
00269     for  (n=0; n<num_edges && !edge_collision; n++) {
00270       if (edge_occupied (dawg, edge_index + n))
00271         edge_collision = TRUE;
00272     }
00273 
00274     if (! edge_collision) {
00275       new_counter++;
00276       //?   max_new_attempts = max (max_new_attempts, i);
00277       return (edge_index);
00278     }
00279   }
00280 
00281   cprintf ("DAWG Table is too full, nodes = %ld, edges = %ld, moves %ld\n",
00282     new_counter, edge_counter, move_counter);
00283   exit (1);
00284   return 0;
00285 }

void read_word_list ( char *  filename,
EDGE_ARRAY  dawg,
INT32  max_num_edges,
INT32  reserved_edges 
)

Read the requested file (containing a list of words) and add all the words to the DAWG.

Definition at line 292 of file trie.cpp.

References add_word_to_dawg(), CHARS_PER_LINE, cprintf(), initialize_dawg(), NULL, open_file(), and word_in_dawg().

Referenced by init_permute().

00295                                           {
00296   FILE       *word_file;
00297   char       string [CHARS_PER_LINE];
00298 
00299   word_file = open_file (filename, "r");
00300 
00301   initialize_dawg(dawg, max_num_edges); 
00302 
00303   while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
00304     string [strlen (string) - 1] = (char) 0;
00305     if (strlen (string)) {
00306       add_word_to_dawg(dawg, string, max_num_edges, reserved_edges); 
00307 
00308       if (! word_in_dawg (dawg, string)) {
00309         cprintf ("error: word not in DAWG after adding it '%s'\n", string);
00310         return;
00311       }
00312     }
00313   }
00314 }

void relocate_edge ( EDGE_ARRAY  dawg,
NODE_REF  node,
NODE_REF  old_node,
NODE_REF  new_node 
)

The location of a node has moved, so an edge entry in another node must be changed & change the value of this edge entry to match the new location of the node.

Definition at line 322 of file trie.cpp.

References backward_edge, cprintf(), debug, edge_loop, forward_edge, next_node, and set_next_edge.

Referenced by move_node().

00325                                       {
00326   EDGE_REF   edge;
00327 
00328   if (debug) cprintf ("relocate (%ld, %ld ==> %ld)\n",
00329    node, old_node, new_node);
00330 
00331   edge = node;
00332   if (forward_edge (dawg, edge)) {
00333     do {
00334       if (next_node (dawg, edge) == old_node) {
00335         if (debug)
00336           cprintf ("forward assign (%ld, %ld ==> %ld)\n",
00337          edge, old_node, new_node);
00338 
00339         set_next_edge(dawg, edge, new_node); 
00340       }
00341     } edge_loop (dawg, edge);
00342   }
00343 
00344   if (backward_edge (dawg, edge)) {
00345     do {
00346       if (next_node (dawg, edge) == old_node) {
00347         if (debug)
00348           cprintf ("backward assign (%ld, %ld ==> %ld)\n",
00349          edge, old_node, new_node);
00350 
00351         set_next_edge(dawg, edge, new_node); 
00352       }
00353     }
00354     edge_loop(dawg, edge); 
00355   }
00356 }

void remove_edge ( EDGE_ARRAY  dawg,
NODE_REF  node1,
NODE_REF  node2,
char  character,
INT32  word_end 
)

Add a single edge linkage to between the two nodes; can be used to add either forward or backward links.

Definition at line 363 of file trie.cpp.

References BACKWARD_EDGE, FORWARD_EDGE, and remove_edge_linkage().

Referenced by add_word_to_dawg().

00367                                  {
00368   remove_edge_linkage(dawg, node1, node2, FORWARD_EDGE, character, word_end); 
00369 
00370   remove_edge_linkage(dawg, node2, node1, BACKWARD_EDGE, character, word_end); 
00371 }

void remove_edge_linkage ( EDGE_ARRAY  dawg,
NODE_REF  node,
NODE_REF  next,
INT32  direction,
char  character,
INT32  word_end 
)

Remove this edge reference from this node & move the edge entries in this node to fill the gap.

Definition at line 378 of file trie.cpp.

References backward_edge, cprintf(), debug, edge_letter, edges_in_node(), end_of_word, FORWARD_EDGE, forward_edge, last_edge, move_edges, next_node, num_forward_edges(), print_dawg_node(), set_empty_edge, and set_last_flag.

Referenced by remove_edge().

00383                                          {
00384   INT32      forward_edges;
00385   INT32      num_edges;
00386   INT32      e = node;
00387   INT32      last_flag;
00388 
00389   forward_edges = num_forward_edges (dawg, node);
00390   num_edges = edges_in_node (dawg, node);
00391 
00392   for (e=node; e<node+num_edges; e++) {
00393     /* Is this the edge*/
00394     if ((edge_letter (dawg, e) == character) &&
00395       ((direction == FORWARD_EDGE) ?
00396       forward_edge(dawg,e) : backward_edge(dawg,e)) &&
00397       (next_node (dawg, e) == next) &&
00398     (word_end ? end_of_word(dawg,e) : (! end_of_word(dawg,e)))) {
00399 
00400       if (debug)
00401         cprintf ("remove (edge = %ld, character is '%c')\n",
00402           e, edge_letter (dawg, e));
00403 
00404       /* Delete the slot */
00405       last_flag = last_edge (dawg, e);
00406       set_empty_edge(dawg, e); 
00407       move_edges (dawg, e+1, e, num_edges+node-e-1);
00408       /* Restore 'last' flag */
00409       if (direction == FORWARD_EDGE) {
00410         if ((forward_edges - 1) &&
00411         (forward_edges - 1 == e - node)) {
00412           set_last_flag (dawg, e - 1);
00413         }
00414       }
00415       else {
00416         if ((num_edges - forward_edges - 1) &&
00417         (num_edges - 1 == e - node)) {
00418           set_last_flag (dawg, e - 1);
00419         }
00420       }
00421       if (debug)
00422         print_dawg_node(dawg, node); 
00423       return;
00424     }
00425   }
00426   cprintf ("error: Could not find the edge to remove, %d\n", next);
00427   print_dawg_node(dawg, node); 
00428   exit (1);
00429 }

INT32 room_in_node ( EDGE_ARRAY  dawg,
NODE_REF  node 
)

Check to see if there is enough room left in this node for one more edge link which may be a forward or backward link.

Definition at line 436 of file trie.cpp.

References edge_occupied, edges_in_node(), FALSE, and TRUE.

00436                                                    { 
00437   EDGE_REF   edge = node;
00438 
00439   if (edge_occupied (dawg, edge + edges_in_node (dawg, node))) {
00440     return (FALSE);
00441   }
00442   else {
00443     return (TRUE);
00444   }
00445 }


Variable Documentation

INT32 edge_counter = 0 [static]

Definition at line 36 of file trie.cpp.

Referenced by add_new_edge(), and new_dawg_node().

INT32 move_counter = 0 [static]

Note:
File: trie.cpp (Formerly trie.c)
Functions to build a trie data structure.
Author:
Mark Seaman, OCR Technology
Date:
Oct 16 14:37:00 1987 Fri Jul 26 12:18:10 1991 (Mark Seaman) marks
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 34 of file trie.cpp.

Referenced by move_node(), and new_dawg_node().

INT32 new_counter = 0 [static]

Definition at line 35 of file trie.cpp.

Referenced by new_dawg_node().


Generated on Wed Feb 28 19:49:23 2007 for Tesseract by  doxygen 1.5.1