00001
00020
00021
00022
00023 #include "trie.h"
00024 #include "callcpp.h"
00025
00026 #ifdef __UNIX__
00027 #include <assert.h>
00028 #endif
00029 #include <stdio.h>
00030
00031
00032
00033
00034 static INT32 move_counter = 0;
00035 static INT32 new_counter = 0;
00036 static INT32 edge_counter = 0;
00037
00038
00039
00040
00046 void add_edge_linkage(EDGE_ARRAY dawg,
00047 NODE_REF node1,
00048 NODE_REF node2,
00049 INT32 direction,
00050 char character,
00051 INT32 word_end) {
00052 EDGE_REF edge1 = node1;
00053 EDGE_REF edge2;
00054 INT32 num_edges = edges_in_node (dawg, node1);
00055 INT32 last_one;
00056
00057 word_end = (word_end ? WERD_END_FLAG : 0);
00058
00059 if (num_edges == 0) {
00060 direction = ((direction == FORWARD_EDGE) ? DIRECTION_FLAG : 0);
00061 link_edge (dawg, edge1, node2, character,
00062 LAST_FLAG | direction | word_end);
00063 }
00064 else {
00065
00066 if (direction == FORWARD_EDGE) {
00067 last_one = (forward_edge (dawg, edge1) ? 0 : LAST_FLAG);
00068 if (debug)
00069 cprintf ("moving edges (nodes = %ld, %dl, num = %ld)\n",
00070 edge1, edge1+1, num_edges);
00071 copy_edges (dawg, edge1, edge1+1, num_edges);
00072 link_edge (dawg, edge1, node2, character,
00073 last_one | DIRECTION_FLAG | word_end);
00074 }
00075 else {
00076
00077 if (forward_edge (dawg, edge1))
00078 edge_loop(dawg, edge1);
00079
00080
00081 if (backward_edge (dawg,edge1)) {
00082 num_edges = 0;
00083 edge2 = edge1;
00084 do { num_edges++; }
00085 edge_loop(dawg, edge2);
00086
00087 if (debug)
00088 cprintf ("moving edges (nodes = %ld, %ld, num = %ld)\n",
00089 edge1, edge1+1, num_edges);
00090 copy_edges (dawg, edge1, edge1+1, num_edges);
00091 link_edge(dawg, edge1, node2, character, word_end);
00092 }
00093 else {
00094 link_edge (dawg, edge1, node2, character,
00095 LAST_FLAG | word_end);
00096 }
00097 }
00098 }
00099 }
00100
00101
00105 void add_new_edge(EDGE_ARRAY dawg,
00106 NODE_REF *node1,
00107 NODE_REF *node2,
00108 char character,
00109 INT32 word_end,
00110 INT32 max_num_edges,
00111 INT32 reserved_edges) {
00112 int direction;
00113
00114 if (debug)
00115 cprintf ("add_new_edge (nodes = %ld, %ld, ch = '%c', end = %d)\n",
00116 *node1, *node2, character, word_end);
00117 edge_counter++;
00118
00119 move_node_if_needed(dawg, *node1, max_num_edges, reserved_edges);
00120 move_node_if_needed(dawg, *node2, max_num_edges, reserved_edges);
00121
00122 direction = (int) FORWARD_EDGE;
00123
00124 add_edge_linkage(dawg, *node1, *node2, direction, character, word_end);
00125
00126 direction = (int) BACKWARD_EDGE;
00127 add_edge_linkage(dawg, *node2, *node1, direction, character, word_end);
00128 }
00129
00130
00134 void add_word_to_dawg(EDGE_ARRAY dawg,
00135 char *string,
00136 INT32 max_num_edges,
00137 INT32 reserved_edges) {
00138 EDGE_REF edge;
00139 NODE_REF last_node = 0;
00140 NODE_REF the_next_node;
00141 INT32 i;
00142 INT32 still_finding_chars = TRUE;
00143 INT32 word_end = FALSE;
00144
00145 for (i=0; i<strlen(string)-1; i++) {
00146 if (still_finding_chars) {
00147 edge = edge_char_of (dawg, last_node, string[i], word_end);
00148 if (debug) cprintf ("exploring edge = %d\n", edge);
00149 if (edge == NO_EDGE)
00150 still_finding_chars = FALSE;
00151 else
00152 if (next_node (dawg, edge) == 0) {
00153 word_end = TRUE;
00154 still_finding_chars = FALSE;
00155 if (! case_sensative) string[i] = tolower (string[i]);
00156 remove_edge (dawg, last_node, 0, string[i], word_end);
00157 }
00158 else {
00159 last_node = next_node (dawg, edge);
00160 }
00161 }
00162
00163 if (! still_finding_chars) {
00164 the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
00165 max_num_edges, reserved_edges);
00166 if (edges_in_node (dawg, last_node) + last_node == the_next_node) {
00167 cprintf ("Node collision at %d\n", the_next_node);
00168 the_next_node = new_dawg_node (dawg, DEFAULT_NODE_SIZE,
00169 max_num_edges, reserved_edges);
00170 }
00171 if (! case_sensative) string[i] = tolower (string[i]);
00172 add_new_edge (dawg, &last_node, &the_next_node,
00173 string[i], word_end, max_num_edges, reserved_edges);
00174 word_end = FALSE;
00175 if (debug)
00176 cprintf ("adding node = %ld\n", the_next_node);
00177 last_node = the_next_node;
00178 }
00179 }
00180
00181 the_next_node = 0;
00182 if (! case_sensative) string[i] = tolower (string[i]);
00183 add_new_edge (dawg, &last_node, &the_next_node,
00184 string[i], TRUE, max_num_edges, reserved_edges);
00185
00186 if (edges_in_node (dawg, 0) > reserved_edges) {
00187 cprintf ("error: Not enough room in root node, %d\n",
00188 edges_in_node (dawg, 0));
00189 exit (1);
00190 }
00191 }
00192
00193
00198 void initialize_dawg(EDGE_ARRAY dawg, INT32 max_num_edges) {
00199 INT32 x;
00200
00201 clear_all_edges(dawg, x, max_num_edges);
00202 }
00203
00204
00208 NODE_REF move_node(EDGE_ARRAY dawg,
00209 NODE_REF node,
00210 INT32 max_num_edges,
00211 INT32 reserved_edges) {
00212 NODE_REF this_new_node;
00213 EDGE_REF edge;
00214 INT32 num_edges = edges_in_node (dawg, node);
00215
00216 if (debug)
00217 print_dawg_node(dawg, node);
00218
00219 this_new_node = new_dawg_node (dawg, num_edges + EDGE_NUM_MARGIN,
00220 max_num_edges, reserved_edges);
00221
00222 if (debug)
00223 cprintf ("move_node (from = %ld, to = %ld, num = %ld)\n",
00224 node, this_new_node, num_edges);
00225
00226 move_edges(dawg, node, this_new_node, num_edges);
00227
00228 if (debug)
00229 print_dawg_node(dawg, this_new_node);
00230
00231 edge = this_new_node;
00232 if (forward_edge (dawg, edge)) {
00233 do {
00234 relocate_edge (dawg, next_node (dawg, edge), node, this_new_node);
00235 } edge_loop (dawg, edge);
00236 }
00237 if (backward_edge (dawg, edge)) {
00238 do {
00239 relocate_edge (dawg, next_node (dawg, edge), node, this_new_node);
00240 } edge_loop (dawg, edge);
00241 }
00242
00243 move_counter++;
00244 return (this_new_node);
00245 }
00246
00247
00254 NODE_REF new_dawg_node(EDGE_ARRAY dawg,
00255 INT32 num_edges,
00256 INT32 max_num_edges,
00257 INT32 reserved_edges) {
00258 INT32 i;
00259 INT32 n;
00260 INT32 edge_index;
00261 INT32 edge_collision;
00262
00263 for (i=0; i<NUM_PLACEMENT_ATTEMPTS; i++) {
00264
00265 edge_index = reserved_edges +
00266 long_rand (max_num_edges - MAX_WERD_LENGTH - reserved_edges);
00267 edge_collision = FALSE;
00268
00269 for (n=0; n<num_edges && !edge_collision; n++) {
00270 if (edge_occupied (dawg, edge_index + n))
00271 edge_collision = TRUE;
00272 }
00273
00274 if (! edge_collision) {
00275 new_counter++;
00276
00277 return (edge_index);
00278 }
00279 }
00280
00281 cprintf ("DAWG Table is too full, nodes = %ld, edges = %ld, moves %ld\n",
00282 new_counter, edge_counter, move_counter);
00283 exit (1);
00284 return 0;
00285 }
00286
00287
00292 void read_word_list(char *filename,
00293 EDGE_ARRAY dawg,
00294 INT32 max_num_edges,
00295 INT32 reserved_edges) {
00296 FILE *word_file;
00297 char string [CHARS_PER_LINE];
00298
00299 word_file = open_file (filename, "r");
00300
00301 initialize_dawg(dawg, max_num_edges);
00302
00303 while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
00304 string [strlen (string) - 1] = (char) 0;
00305 if (strlen (string)) {
00306 add_word_to_dawg(dawg, string, max_num_edges, reserved_edges);
00307
00308 if (! word_in_dawg (dawg, string)) {
00309 cprintf ("error: word not in DAWG after adding it '%s'\n", string);
00310 return;
00311 }
00312 }
00313 }
00314 }
00315
00316
00322 void relocate_edge(EDGE_ARRAY dawg,
00323 NODE_REF node,
00324 NODE_REF old_node,
00325 NODE_REF new_node) {
00326 EDGE_REF edge;
00327
00328 if (debug) cprintf ("relocate (%ld, %ld ==> %ld)\n",
00329 node, old_node, new_node);
00330
00331 edge = node;
00332 if (forward_edge (dawg, edge)) {
00333 do {
00334 if (next_node (dawg, edge) == old_node) {
00335 if (debug)
00336 cprintf ("forward assign (%ld, %ld ==> %ld)\n",
00337 edge, old_node, new_node);
00338
00339 set_next_edge(dawg, edge, new_node);
00340 }
00341 } edge_loop (dawg, edge);
00342 }
00343
00344 if (backward_edge (dawg, edge)) {
00345 do {
00346 if (next_node (dawg, edge) == old_node) {
00347 if (debug)
00348 cprintf ("backward assign (%ld, %ld ==> %ld)\n",
00349 edge, old_node, new_node);
00350
00351 set_next_edge(dawg, edge, new_node);
00352 }
00353 }
00354 edge_loop(dawg, edge);
00355 }
00356 }
00357
00358
00363 void remove_edge(EDGE_ARRAY dawg,
00364 NODE_REF node1,
00365 NODE_REF node2,
00366 char character,
00367 INT32 word_end) {
00368 remove_edge_linkage(dawg, node1, node2, FORWARD_EDGE, character, word_end);
00369
00370 remove_edge_linkage(dawg, node2, node1, BACKWARD_EDGE, character, word_end);
00371 }
00372
00373
00378 void remove_edge_linkage(EDGE_ARRAY dawg,
00379 NODE_REF node,
00380 NODE_REF next,
00381 INT32 direction,
00382 char character,
00383 INT32 word_end) {
00384 INT32 forward_edges;
00385 INT32 num_edges;
00386 INT32 e = node;
00387 INT32 last_flag;
00388
00389 forward_edges = num_forward_edges (dawg, node);
00390 num_edges = edges_in_node (dawg, node);
00391
00392 for (e=node; e<node+num_edges; e++) {
00393
00394 if ((edge_letter (dawg, e) == character) &&
00395 ((direction == FORWARD_EDGE) ?
00396 forward_edge(dawg,e) : backward_edge(dawg,e)) &&
00397 (next_node (dawg, e) == next) &&
00398 (word_end ? end_of_word(dawg,e) : (! end_of_word(dawg,e)))) {
00399
00400 if (debug)
00401 cprintf ("remove (edge = %ld, character is '%c')\n",
00402 e, edge_letter (dawg, e));
00403
00404
00405 last_flag = last_edge (dawg, e);
00406 set_empty_edge(dawg, e);
00407 move_edges (dawg, e+1, e, num_edges+node-e-1);
00408
00409 if (direction == FORWARD_EDGE) {
00410 if ((forward_edges - 1) &&
00411 (forward_edges - 1 == e - node)) {
00412 set_last_flag (dawg, e - 1);
00413 }
00414 }
00415 else {
00416 if ((num_edges - forward_edges - 1) &&
00417 (num_edges - 1 == e - node)) {
00418 set_last_flag (dawg, e - 1);
00419 }
00420 }
00421 if (debug)
00422 print_dawg_node(dawg, node);
00423 return;
00424 }
00425 }
00426 cprintf ("error: Could not find the edge to remove, %d\n", next);
00427 print_dawg_node(dawg, node);
00428 exit (1);
00429 }
00430
00431
00436 INT32 room_in_node(EDGE_ARRAY dawg, NODE_REF node) {
00437 EDGE_REF edge = node;
00438
00439 if (edge_occupied (dawg, edge + edges_in_node (dawg, node))) {
00440 return (FALSE);
00441 }
00442 else {
00443 return (TRUE);
00444 }
00445 }