00001
00019
00020
00021
00022 #include "permdawg.h"
00023 #include "debug.h"
00024 #include "hyphen.h"
00025 #include "permute.h"
00026 #include "tordvars.h"
00027 #include "context.h"
00028 #include "stopper.h"
00029 #include "freelist.h"
00030 #include "globals.h"
00031 #include "dawg.h"
00032 #include <ctype.h>
00033
00034
00035
00036
/*
 * Default values for the word-rating multipliers registered further down
 * (freq_word, good_word, ok_word).
 */
#define FREQ_WERD       1.0
#define GOOD_WERD       1.1
#define OK_WERD         1.25

/* Number of EDGE_RECORDs allocated for the frequent-words dawg. */
#define MAX_FREQ_EDGES  1000

/*
 * Sentinel stored in rating_array[] slots that have not been rated yet.
 * Parenthesized so the negative literal stays a single operand wherever
 * the macro is expanded.
 */
#define NO_RATING       (-1)
00042
00043
00044
00045
00046 static EDGE_ARRAY frequent_words;
00047 static float rating_margin;
00048 static float rating_pad = 5.0;
00049
00052 make_toggle_var (dawg_debug, 0, make_dawg_debug,
00053 8, 10, set_dawg_debug, "DAWG Debug ");
00054 make_float_var (ok_word, OK_WERD, make_ok_word,
00055 8, 17, set_ok_word, "Bad word adjustment");
00056 make_float_var (good_word, GOOD_WERD, make_good_word,
00057 8, 18, set_good_word, "Good word adjustment");
00058 make_float_var (freq_word, FREQ_WERD, make_freq_word,
00059 8, 19, set_freq_word, "Freq word adjustment");
00062
00063
00064
00065
00066
00077 void adjust_word(A_CHOICE *best_choice, float *certainty_array) {
00078 char *this_word;
00079 int punct_status;
00080 float adjust_factor;
00081
00082 if (adjust_debug)
00083 cprintf ("%s %4.2f ",
00084 class_string (best_choice), class_probability (best_choice));
00085
00086 this_word = class_string (best_choice);
00087 punct_status = punctuation_ok (this_word);
00088
00089 class_probability (best_choice) += RATING_PAD;
00090 if (case_ok (this_word) && punct_status != -1) {
00091 if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) {
00092 class_probability (best_choice) *= freq_word;
00093 class_permuter (best_choice) = FREQ_DAWG_PERM;
00094 adjust_factor = freq_word;
00095 if (adjust_debug)
00096 cprintf (", F, %4.2f ", freq_word);
00097 }
00098 else {
00099 class_probability (best_choice) *= good_word;
00100 adjust_factor = good_word;
00101 if (adjust_debug)
00102 cprintf (", %4.2f ", good_word);
00103 }
00104 }
00105 else {
00106 class_probability (best_choice) *= ok_word;
00107 adjust_factor = ok_word;
00108 if (adjust_debug) {
00109 if (!case_ok (this_word))
00110 cprintf (", C");
00111 if (punctuation_ok (this_word) == -1)
00112 cprintf (", P");
00113 cprintf (", %4.2f ", ok_word);
00114 }
00115 }
00116
00117 class_probability (best_choice) -= RATING_PAD;
00118
00119 LogNewWordChoice(best_choice, adjust_factor, certainty_array);
00120
00121 if (adjust_debug)
00122 cprintf (" --> %4.2f\n", class_probability (best_choice));
00123 }
00124
00125
00149 void append_next_choice(
00150 EDGE_ARRAY dawg,
00151 NODE_REF node,
00152 char permuter,
00153 char *word,
00154 CHOICES_LIST choices,
00155 int char_index,
00156 A_CHOICE *this_choice,
00157 char prevchar,
00158 float *limit,
00159 float rating,
00160 float certainty,
00161 float *rating_array,
00162 float *certainty_array,
00163 int word_ending,
00164 int last_word,
00165 CHOICES *result) {
00166 A_CHOICE *better_choice;
00167
00168 word[char_index] = class_string (this_choice)[0];
00169 word[char_index + 1] = 0;
00170 if (word[char_index] == 0)
00171 word[char_index] = ' ';
00172 certainty_array[char_index] = class_certainty (this_choice);
00173
00174 rating += class_probability (this_choice);
00175 certainty = min (class_certainty (this_choice), certainty);
00176
00177 if (rating_array[char_index] == NO_RATING) {
00178
00179 rating_array[char_index] = rating;
00180 }
00181 else {
00182 if (rating_array[char_index] * rating_margin + rating_pad < rating) {
00183 if (dawg_debug)
00184 cprintf ("early pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
00185 word, rating, *limit);
00186 return;
00187 }
00188 }
00189
00190
00191 if (word_ending && last_word && word[char_index] == '-' && char_index > 0) {
00192 *limit = rating;
00193 if (dawg_debug)
00194 cprintf ("new hyphen choice = %s\n", word);
00195
00196 better_choice = new_choice (word, rating, certainty, -1, permuter);
00197 adjust_word(better_choice, certainty_array);
00198 push_on(*result, better_choice);
00199 set_hyphen_word(word, rating, node);
00200 }
00201
00202 else if (letter_is_okay (dawg, &node, char_index, prevchar,
00203 word, word_ending)) {
00204
00205 if (word_ending) {
00206 if (dawg_debug == 1)
00207 cprintf ("new choice = %s\n", word);
00208 *limit = rating;
00209
00210 better_choice = new_choice (hyphen_tail (word), rating, certainty,
00211 -1, permuter);
00212 adjust_word (better_choice, &certainty_array[hyphen_base_size ()]);
00213 push_on(*result, better_choice);
00214 }
00215 else {
00216
00217 JOIN_ON (*result,
00218 dawg_permute (dawg, node, permuter,
00219 choices, char_index + 1, limit,
00220 word, rating, certainty,
00221 rating_array, certainty_array, last_word));
00222 }
00223 }
00224 }
00225
00226
00237 CHOICES dawg_permute(EDGE_ARRAY dawg,
00238 NODE_REF node,
00239 char permuter,
00240 CHOICES_LIST choices,
00241 int char_index,
00242 float *limit,
00243 char *word,
00244 float rating,
00245 float certainty,
00246 float *rating_array,
00247 float *certainty_array,
00248 int last_word) {
00249 CHOICES result = NIL;
00250 CHOICES c;
00251 char *prevchar;
00252 int word_ending = FALSE;
00253
00254 if (dawg_debug) {
00255 cprintf ("dawg_permute (node=%d, char_index=%d, limit=%4.2f, ",
00256 node, char_index, *limit);
00257 cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
00258 word, rating, certainty);
00259 }
00260
00261 if (1 + char_index == array_count (choices) + hyphen_base_size ())
00262 word_ending = TRUE;
00263
00264 if (char_index < array_count (choices) + hyphen_base_size ()) {
00265 prevchar = NULL;
00266 iterate_list (c,
00267 (CHOICES) array_index (choices,
00268 char_index - hyphen_base_size ())) {
00269 append_next_choice (dawg, node, permuter, word, choices, char_index,
00270 (A_CHOICE *) first (c),
00271 prevchar != NULL ? *prevchar : '\0', limit,
00272 rating, certainty, rating_array, certainty_array,
00273 word_ending, last_word, &result);
00274 prevchar = best_string (c);
00275 }
00276 }
00277
00278 if (result && (dawg_debug == 1))
00279 print_choices ("dawg_permute", result);
00280 return (result);
00281 }
00282
00283
00302 void dawg_permute_and_select(const char *string,
00303 EDGE_ARRAY dawg,
00304 char permuter,
00305 CHOICES_LIST character_choices,
00306 A_CHOICE *best_choice,
00307 INT16 system_words) {
00308 CHOICES result = NIL;
00309 char word[MAX_WERD_LENGTH + 1];
00310 float certainty_array[MAX_WERD_LENGTH + 1];
00311 float rating_array[MAX_WERD_LENGTH + 1];
00312 float rating;
00313 int char_index;
00314 NODE_REF dawg_node = 0;
00315
00316
00317 rating_margin = ok_word / good_word;
00318
00319 word[0] = '\0';
00320 rating = class_probability (best_choice);
00321
00322 for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++)
00323 rating_array[char_index] = NO_RATING;
00324 char_index = 0;
00325
00326 if (!is_last_word () && hyphen_string) {
00327 strcpy(word, hyphen_string);
00328 char_index = strlen (hyphen_string);
00329 if (system_words)
00330 dawg_node = hyphen_state;
00331 }
00332 result = dawg_permute (dawg, dawg_node, permuter, character_choices,
00333 char_index, &rating, word, 0.0, 0.0,
00334 rating_array, certainty_array, is_last_word ());
00335
00336 #ifdef TEXT_VERBOSE
00337
00338 cprintf("p");
00339 #endif
00340 if (display_ratings && result)
00341 print_choices(string, result);
00342
00343 while (result != NIL) {
00344 if (best_probability (result) < class_probability (best_choice)) {
00345 clone_choice (best_choice, first (result));
00346 }
00347 free_choice (first (result));
00348 pop_off(result);
00349 }
00350 }
00351
00355 void init_permdawg() {
00356 char name[1024];
00357 make_dawg_debug();
00358 make_ok_word();
00359 make_good_word();
00360 make_freq_word();
00361
00362 frequent_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) *
00363 MAX_FREQ_EDGES);
00364 strcpy(name, demodir);
00365 strcat (name, "tessdata/freq-dawg");
00366 read_squished_dawg(name, frequent_words, MAX_FREQ_EDGES);
00367 }
00368
00372 void end_permdawg() {
00373 memfree(frequent_words);
00374 frequent_words = NULL;
00375 }
00376
00380 int test_freq_words(const char *word) {
00381 return (word_in_dawg (frequent_words, word));
00382 }