00001
00020
00021
00022
00023 #include "dawg.h"
00024 #include "cutil.h"
00025 #include "callcpp.h"
00026 #include "context.h"
00027 #include "strngs.h"
00028
00029
00030
00031
00032 INT32 debug = 0;
00033 INT32 case_sensative = 0;
00034
00035
00036
00037
00041 EDGE_REF edge_char_of(EDGE_ARRAY dawg,
00042 NODE_REF node,
00043 int character,
00044 int word_end) {
00045 EDGE_REF edge = node;
00046
00047 if (! case_sensative) character = tolower (character);
00048
00049 if (edge_occupied (dawg, edge)) {
00050 do {
00051 if ((edge_letter (dawg, edge) == character) &&
00052 (! word_end || end_of_word(dawg,edge)))
00053 return (edge);
00054
00055 } edge_loop (dawg, edge);
00056 }
00057
00058 return (NO_EDGE);
00059 }
00060
00061
00066 INT32 edges_in_node(EDGE_ARRAY dawg, NODE_REF node) {
00067 EDGE_REF edge = node;
00068
00069 if (edge_occupied (dawg, edge)) {
00070 edge_loop(dawg, edge);
00071 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00072 edge_loop(dawg, edge);
00073 return (edge - node);
00074 }
00075 else {
00076 return (edge - node);
00077 }
00078 }
00079 else {
00080 return (edge - node);
00081 }
00082 }
00083
00084
00096 INT32 letter_is_okay(EDGE_ARRAY dawg,
00097 NODE_REF *node,
00098 INT32 char_index,
00099 char prevchar,
00100 const char *word,
00101 INT32 word_end) {
00102 EDGE_REF edge;
00103 STRING dummy_word(word);
00104
00105 if (*node == NO_EDGE) {
00106 if (trailing_punc (dummy_word [char_index])
00107 && (!trailing_punc (prevchar)
00108 || punctuation_ok(dummy_word.string())>=0))
00109 return (TRUE);
00110 else
00111 return (FALSE);
00112 }
00113 else {
00114
00115 if (*node == 0 &&
00116 char_index != 0 &&
00117 isalpha (dummy_word [char_index]) &&
00118 ! leading_punc (dummy_word [char_index-1]) &&
00119 dummy_word [char_index-1] != '-') {
00120 return (FALSE);
00121 }
00122 }
00123
00124 if (dummy_word [char_index] == '-') {
00125 if (char_index>0 && !word_end
00126 && word [char_index-1] == '-'
00127 && word [char_index+1] == '-')
00128 return FALSE;
00129 dummy_word [char_index] = (char) 0;
00130 if (word_in_dawg (dawg, dummy_word.string())) {
00131 dummy_word [char_index] = '-';
00132 *node = 0;
00133 return (TRUE);
00134 }
00135 else {
00136 dummy_word [char_index] = '-';
00137 return (FALSE);
00138 }
00139 }
00140
00141 edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end);
00142
00143 if (edge != NO_EDGE) {
00144 if (case_sensative || case_is_okay (dummy_word, char_index)) {
00145
00146 *node = (dawg)[edge] & NO_EDGE;
00147 return (TRUE);
00148 }
00149 else {
00150 return (FALSE);
00151 }
00152 }
00153 else {
00154
00155 if (leading_punc (word [char_index]) &&
00156 (char_index == 0 || leading_punc (dummy_word [char_index-1]))) {
00157 *node = 0;
00158 if (leading_punc (prevchar) || punctuation_ok (word)>=0)
00159 return (TRUE);
00160 else
00161 return FALSE;
00162 }
00163
00164 if (verify_trailing_punct (dawg, &dummy_word[0], char_index)) {
00165 *node = NO_EDGE;
00166 return (TRUE);
00167 }
00168
00169 return (FALSE);
00170 }
00171 }
00172
00173
00177 INT32 num_forward_edges(EDGE_ARRAY dawg, NODE_REF node) {
00178 EDGE_REF edge = node;
00179 INT32 num = 0;
00180
00181 if (forward_edge (dawg, edge)) {
00182 do {
00183 num++;
00184 } edge_loop (dawg, edge);
00185 }
00186
00187 return (num);
00188 }
00189
00190
00194 void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) {
00195 EDGE_REF edge = node;
00196 const char *forward_string = "FORWARD";
00197 const char *backward_string = " ";
00198
00199 const char *last_string = "LAST";
00200 const char *not_last_string = " ";
00201
00202 const char *eow_string = "EOW";
00203 const char *not_eow_string = " ";
00204
00205 const char *direction;
00206 const char *is_last;
00207 const char *eow;
00208
00209 char ch;
00210
00211 if (edge_occupied (dawg, edge)) {
00212 do {
00213 if (forward_edge (dawg, edge)) direction = forward_string;
00214 else direction = backward_string;
00215
00216 if (last_edge (dawg, edge)) is_last = last_string;
00217 else is_last = not_last_string;
00218
00219 if (end_of_word (dawg, edge)) eow = eow_string;
00220 else eow = not_eow_string;
00221
00222 ch = edge_letter (dawg, edge);
00223 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00224 (int) edge, (int) next_node (dawg, edge), ch,
00225 direction, is_last, eow);
00226
00227 if (edge - node > MAX_NODE_EDGES) return;
00228 } edge_loop (dawg, edge);
00229
00230 if (edge_occupied (dawg, edge) && backward_edge (dawg, edge)) {
00231 do {
00232 if (forward_edge (dawg, edge)) direction = forward_string;
00233 else direction = backward_string;
00234
00235 if (last_edge (dawg, edge)) is_last = last_string;
00236 else is_last = not_last_string;
00237
00238 if (end_of_word (dawg, edge)) eow = eow_string;
00239 else eow = not_eow_string;
00240
00241 ch = edge_letter (dawg, edge);
00242 cprintf ("%7d : next = %7d, char = '%c', %s %s %s\n",
00243 (int) edge, (int) next_node (dawg, edge), ch,
00244 direction, is_last, eow);
00245
00246 if (edge - node > MAX_NODE_EDGES) return;
00247 } edge_loop (dawg, edge);
00248 }
00249 }
00250 else {
00251 cprintf ("%5d : no edges in this node\n", node);
00252 }
00253 new_line();
00254 }
00255
00256
00262 void read_squished_dawg(char *filename, EDGE_ARRAY dawg, INT32 max_num_edges) {
00263 FILE *file;
00264 EDGE_REF edge;
00265 INT32 num_edges;
00266 INT32 node_count = 0;
00267
00268 if (debug) print_string ("read_debug");
00269
00270 clear_all_edges(dawg, edge, max_num_edges);
00271
00272 #ifdef __UNIX__
00273 file = open_file (filename, "r");
00274 #else
00275 file = open_file (filename, "rb");
00276 #endif
00277 fseek(file, 0, SEEK_END);
00278 long fsize = ftell(file);
00279 rewind(file);
00280 fread (&num_edges, sizeof (int), 1, file);
00281
00282
00283 long diff1 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00284 reverse32(&num_edges);
00285 long diff2 = sizeof(EDGE_RECORD)*num_edges + sizeof(int) - fsize;
00286 reverse32(&num_edges);
00287
00288
00289 if (diff1 < 0) diff1 = -diff1;
00290 if (diff2 < 0) diff2 = -diff2;
00291 bool swap = diff2 < diff1;
00292 if (swap)
00293 reverse32(&num_edges);
00294 fread (&dawg[0], sizeof (EDGE_RECORD), num_edges, file);
00295 fclose(file);
00296 if (swap)
00297 for (edge=0;edge<num_edges;edge++)
00298 reverse32(&dawg[edge]);
00299
00300 for (edge=0; edge<max_num_edges; edge++)
00301 if (last_edge (dawg, edge)) node_count++;
00302 }
00303
00304
00314 INT32 verify_trailing_punct(EDGE_ARRAY dawg, char *word, INT32 char_index) {
00315 char last_char;
00316 char *first_char;
00317
00318 if (trailing_punc (word [char_index])) {
00319
00320 last_char = word [char_index];
00321 word [char_index] = (char) 0;
00322
00323 for (first_char = word; leading_punc (first_char[0]); first_char++);
00324
00325 if (word_in_dawg (dawg, first_char)) {
00326 word [char_index] = last_char;
00327 return (TRUE);
00328 }
00329 word [char_index] = last_char;
00330 }
00331 return (FALSE);
00332 }
00333
00334
00338 INT32 word_in_dawg(EDGE_ARRAY dawg, const char *string) {
00339 NODE_REF node = 0;
00340 INT32 i;
00341 INT32 length;
00342
00343 length=strlen(string);
00344 if (length==0)
00345 return FALSE;
00346 for (i=0; i<length; i++) {
00347 if (debug) {
00348 print_dawg_node(dawg, node);
00349 new_line();
00350 }
00351
00352 if (! letter_is_okay (dawg, &node, i,'\0', string, (string[i+1]==0))) {
00353 return (FALSE);
00354 }
00355 }
00356
00357 return (TRUE);
00358 }