dict/permdawg.cpp

Go to the documentation of this file.
00001 
00019 /*----------------------------------------------------------------------
00020               I n c l u d e s
00021 ----------------------------------------------------------------------*/
00022 #include "permdawg.h"
00023 #include "debug.h"
00024 #include "hyphen.h"
00025 #include "permute.h"
00026 #include "tordvars.h"
00027 #include "context.h"
00028 #include "stopper.h"
00029 #include "freelist.h"
00030 #include "globals.h"
00031 #include "dawg.h"
00032 #include <ctype.h>
00033 
00034 /*----------------------------------------------------------------------
00035               T y p e s
00036 ----------------------------------------------------------------------*/
/*
 * Default rating multipliers, used as the initial values of the
 * freq_word, good_word and ok_word tunables declared further down.
 */
#define FREQ_WERD         1.0
#define GOOD_WERD         1.1
#define OK_WERD           1.25

/* Number of EDGE_RECORDs reserved for the frequent-word DAWG. */
#define MAX_FREQ_EDGES    1000

/* Value held by a rating_array slot that has not been given a rating yet. */
#define NO_RATING         (-1)
00042 
00043 /*----------------------------------------------------------------------
00044               V a r i a b l e s
00045 ----------------------------------------------------------------------*/
/* Frequent-word DAWG: allocated and loaded by init_permdawg(), released by
   end_permdawg(), queried by adjust_word() and test_freq_words(). */
static EDGE_ARRAY frequent_words;
/* Multiplicative part of the sub-word pruning test in append_next_choice();
   set to ok_word / good_word at the start of dawg_permute_and_select(). */
static float rating_margin;
/* Additive part of the same pruning test. */
static float rating_pad = 5.0;
00049 
/*
 * Run-time tunables of this module.  Each invocation names the variable,
 * its default value, the make_*() routine that init_permdawg() calls, a
 * set_*() handler and a display label.
 * NOTE(review): the two integer arguments look like menu/page coordinates;
 * confirm against the make_toggle_var / make_float_var macro definitions.
 */

/* Non-zero enables the cprintf tracing of the DAWG search in this file. */
make_toggle_var (dawg_debug, 0, make_dawg_debug,
8, 10, set_dawg_debug, "DAWG Debug ");
/* Multiplier adjust_word() applies when the case or punctuation check fails. */
make_float_var (ok_word, OK_WERD, make_ok_word,
8, 17, set_ok_word, "Bad word adjustment");
/* Multiplier adjust_word() applies to a word that passes both checks but is
   not treated as a frequent word. */
make_float_var (good_word, GOOD_WERD, make_good_word,
8, 18, set_good_word, "Good word adjustment");
/* Multiplier adjust_word() applies to a word found in the frequent-word DAWG. */
make_float_var (freq_word, FREQ_WERD, make_freq_word,
8, 19, set_freq_word, "Freq word adjustment");
00062 //extern char *demodir;
00063 
00064 /*----------------------------------------------------------------------
00065               F u n c t i o n s
00066 ----------------------------------------------------------------------*/
00077 void adjust_word(A_CHOICE *best_choice, float *certainty_array) { 
00078   char *this_word;
00079   int punct_status;
00080   float adjust_factor;
00081 
00082   if (adjust_debug)
00083     cprintf ("%s %4.2f ",
00084       class_string (best_choice), class_probability (best_choice));
00085 
00086   this_word = class_string (best_choice);
00087   punct_status = punctuation_ok (this_word);
00088 
00089   class_probability (best_choice) += RATING_PAD;
00090   if (case_ok (this_word) && punct_status != -1) {
00091     if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) {
00092       class_probability (best_choice) *= freq_word;
00093       class_permuter (best_choice) = FREQ_DAWG_PERM;
00094       adjust_factor = freq_word;
00095       if (adjust_debug)
00096         cprintf (", F, %4.2f ", freq_word);
00097     }
00098     else {
00099       class_probability (best_choice) *= good_word;
00100       adjust_factor = good_word;
00101       if (adjust_debug)
00102         cprintf (", %4.2f ", good_word);
00103     }
00104   }
00105   else {
00106     class_probability (best_choice) *= ok_word;
00107     adjust_factor = ok_word;
00108     if (adjust_debug) {
00109       if (!case_ok (this_word))
00110         cprintf (", C");
00111       if (punctuation_ok (this_word) == -1)
00112         cprintf (", P");
00113       cprintf (", %4.2f ", ok_word);
00114     }
00115   }
00116 
00117   class_probability (best_choice) -= RATING_PAD;
00118 
00119   LogNewWordChoice(best_choice, adjust_factor, certainty_array);
00120 
00121   if (adjust_debug)
00122     cprintf (" --> %4.2f\n", class_probability (best_choice));
00123 }
00124 
00125 
00149 void append_next_choice(
00150                         EDGE_ARRAY dawg,
00151                         NODE_REF node,
00152                         char permuter,
00153                         char *word,
00154                         CHOICES_LIST choices,
00155                         int char_index,
00156                         A_CHOICE *this_choice,
00157                         char prevchar,
00158                         float *limit,
00159                         float rating,
00160                         float certainty,
00161                         float *rating_array,
00162                         float *certainty_array,
00163                         int word_ending,
00164                         int last_word,
00165                         CHOICES *result) {
00166   A_CHOICE *better_choice;
00167   /* Add new character */
00168   word[char_index] = class_string (this_choice)[0];
00169   word[char_index + 1] = 0;
00170   if (word[char_index] == 0)
00171     word[char_index] = ' ';
00172   certainty_array[char_index] = class_certainty (this_choice);
00173 
00174   rating += class_probability (this_choice);
00175   certainty = min (class_certainty (this_choice), certainty);
00176 
00177   if (rating_array[char_index] == NO_RATING) {
00178                                  /* Prune bad subwords */
00179     rating_array[char_index] = rating;
00180   }
00181   else {
00182     if (rating_array[char_index] * rating_margin + rating_pad < rating) {
00183       if (dawg_debug)
00184         cprintf ("early pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
00185           word, rating, *limit);
00186       return;
00187     }
00188   }
00189 
00190   /* Deal with hyphens */
00191   if (word_ending && last_word && word[char_index] == '-' && char_index > 0) {
00192     *limit = rating;
00193     if (dawg_debug)
00194       cprintf ("new hyphen choice = %s\n", word);
00195 
00196     better_choice = new_choice (word, rating, certainty, -1, permuter);
00197     adjust_word(better_choice, certainty_array);
00198     push_on(*result, better_choice);
00199     set_hyphen_word(word, rating, node);
00200   }
00201   /* Look up char in DAWG */
00202   else if (letter_is_okay (dawg, &node, char_index, prevchar,
00203   word, word_ending)) {
00204     /* Add a new word choice */
00205     if (word_ending) {
00206       if (dawg_debug == 1)
00207         cprintf ("new choice = %s\n", word);
00208       *limit = rating;
00209 
00210       better_choice = new_choice (hyphen_tail (word), rating, certainty,
00211         -1, permuter);
00212       adjust_word (better_choice, &certainty_array[hyphen_base_size ()]);
00213       push_on(*result, better_choice);
00214     }
00215     else {
00216                                  /* Search the next letter */
00217       JOIN_ON (*result,
00218         dawg_permute (dawg, node, permuter,
00219         choices, char_index + 1, limit,
00220         word, rating, certainty,
00221         rating_array, certainty_array, last_word));
00222     }
00223   }
00224 }
00225 
00226 
00237 CHOICES dawg_permute(EDGE_ARRAY dawg,
00238                      NODE_REF node,
00239                      char permuter,
00240                      CHOICES_LIST choices,
00241                      int char_index,
00242                      float *limit,
00243                      char *word,
00244                      float rating,
00245                      float certainty,
00246                      float *rating_array,
00247                      float *certainty_array,
00248                      int last_word) {
00249   CHOICES result = NIL;
00250   CHOICES c;
00251   char *prevchar;
00252   int word_ending = FALSE;
00253 
00254   if (dawg_debug) {
00255     cprintf ("dawg_permute (node=%d, char_index=%d, limit=%4.2f, ",
00256       node, char_index, *limit);
00257     cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
00258       word, rating, certainty);
00259   }
00260   /* Check for End of Word (EOW) */
00261   if (1 + char_index == array_count (choices) + hyphen_base_size ())
00262     word_ending = TRUE;
00263 
00264   if (char_index < array_count (choices) + hyphen_base_size ()) {
00265     prevchar = NULL;
00266     iterate_list (c,
00267       (CHOICES) array_index (choices,
00268     char_index - hyphen_base_size ())) {
00269       append_next_choice (dawg, node, permuter, word, choices, char_index,
00270         (A_CHOICE *) first (c),
00271         prevchar != NULL ? *prevchar : '\0', limit,
00272         rating, certainty, rating_array, certainty_array,
00273         word_ending, last_word, &result);
00274       prevchar = best_string (c);
00275     }
00276   }
00277 
00278   if (result && (dawg_debug == 1))
00279     print_choices ("dawg_permute", result);
00280   return (result);
00281 }
00282 
00283 
00302 void dawg_permute_and_select(const char *string,
00303                              EDGE_ARRAY dawg,
00304                              char permuter,
00305                              CHOICES_LIST character_choices,
00306                              A_CHOICE *best_choice,
00307                              INT16 system_words) {
00308   CHOICES result = NIL;
00309   char word[MAX_WERD_LENGTH + 1];
00310   float certainty_array[MAX_WERD_LENGTH + 1];
00311   float rating_array[MAX_WERD_LENGTH + 1];
00312   float rating;
00313   int char_index;
00314   NODE_REF dawg_node = 0;
00315 
00316                                  /* Pruning margin ratio */
00317   rating_margin = ok_word / good_word;
00318 
00319   word[0] = '\0';
00320   rating = class_probability (best_choice);
00321 
00322   for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++)
00323     rating_array[char_index] = NO_RATING;
00324   char_index = 0;
00325 
00326   if (!is_last_word () && hyphen_string) {
00327     strcpy(word, hyphen_string);
00328     char_index = strlen (hyphen_string);
00329     if (system_words)
00330       dawg_node = hyphen_state;
00331   }
00332   result = dawg_permute (dawg, dawg_node, permuter, character_choices,
00333     char_index, &rating, word, 0.0, 0.0,
00334     rating_array, certainty_array, is_last_word ());
00335 
00336 #ifdef TEXT_VERBOSE
00337   // gets a 'p', see ccmain/tesseractmain.dox
00338   cprintf("p");
00339 #endif
00340   if (display_ratings && result)
00341     print_choices(string, result);
00342 
00343   while (result != NIL) {
00344     if (best_probability (result) < class_probability (best_choice)) {
00345       clone_choice (best_choice, first (result));
00346     }
00347     free_choice (first (result));
00348     pop_off(result);
00349   }
00350 }
00351 
00355 void init_permdawg() { 
00356   char name[1024];
00357   make_dawg_debug();
00358   make_ok_word();
00359   make_good_word();
00360   make_freq_word();
00361 
00362   frequent_words = (EDGE_ARRAY) memalloc (sizeof (EDGE_RECORD) *
00363     MAX_FREQ_EDGES);
00364   strcpy(name, demodir);
00365   strcat (name, "tessdata/freq-dawg");
00366   read_squished_dawg(name, frequent_words, MAX_FREQ_EDGES);
00367 }
00368 
00372 void end_permdawg() {
00373   memfree(frequent_words);
00374   frequent_words = NULL;
00375 }
00376 
00380 int test_freq_words(const char *word) { 
00381   return (word_in_dawg (frequent_words, word));
00382 }

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1