dict/permnum.cpp

Go to the documentation of this file.
00001 
00019 /*----------------------------------------------------------------------
00020               I n c l u d e s
00021 ----------------------------------------------------------------------*/
00022 #include "const.h"
00023 #include "permnum.h"
00024 #include "debug.h"
00025 #include "permute.h"
00026 #include "dawg.h"
00027 #include "tordvars.h"
00028 #include "stopper.h"
00029 
00030 #include <math.h>
00031 #include <ctype.h>
00032 
00033 /*----------------------------------------------------------------------
00034               V a r i a b l e s
00035 ----------------------------------------------------------------------*/
/*
 * Three-letter words that may appear inside a number string (lower-case
 * month abbreviations).  The list is NULL-terminated so it can be scanned
 * without a separate element count.
 */
static const char *allowed_alpha_strs[] = {
  "jan", "feb", "mar",
  "apr", "may", "jun",
  "jul", "aug", "sep",
  "oct", "nov", "dec",
  NULL
};
00040 
/*
 * Letters that occur at each position of the words in allowed_alpha_strs.
 * number_character_type() uses entry 0 while no allowed letter has been
 * seen yet, entry 1 after the first letter and entry 2 after the second.
 */
static const char *allowed_char_strs[] = {
  "adfjmnos",    /* 1st letters: j f m a s o n d          */
  "aceopu",      /* 2nd letters: a e p u c o              */
  "bcglnrptvy"   /* 3rd letters: n b r y l g p t v c      */
};
00054 
/* Number of states (rows) in number_state_table. */
const int kNumStates = 7;

/*
 * Transition table of the number recogniser:
 *   number_state_table[current state][character type] -> next state.
 *
 * Columns are the character types returned by number_character_type():
 *   0 (l) leading punctuation      4 (t) trailing punctuation
 *   1 (d) digit                    5 (1) allowed 1st letter of a word
 *   2 (o) operator                 6 (2) allowed 2nd letter of a word
 *   3 (a) other alpha character    7 (3) allowed 3rd letter of a word
 *
 * Entries:
 *   >= 0        the next state.
 *   -99         the character is not allowed here.
 *   -1, -2      the next state would be 1 or 2, but number_state_change()
 *               only accepts the transition from state 6 and only when the
 *               three preceding letters spell one of allowed_alpha_strs;
 *               from states 4 and 5 these entries therefore always reject.
 */
static int number_state_table[kNumStates][8] = {
  /*  l    d    o    a    t    1    2    3                                 */
  { 0,   1,   1,   -99, -99, 4,   -99, -99 },  /* 0. Beginning of string        */
  { -99, 1,   1,   3,   2,   4,   3,   3   },  /* 1. After a digit or operator  */
  { -99, -99, 1,   -99, 2,   -99, -99, -99 },  /* 2. After trailing punctuation */
  { -99, -99, 3,   3,   2,   3,   3,   3   },  /* 3. After an alpha character   */
  { -99, -1,  -1,  -99, -2,  -99, 5,   -99 },  /* 4. After 1st char of a word   */
  { -99, -1,  -1,  -99, -2,  -99, -99, 6   },  /* 5. After 2nd char of a word   */
  { -99, -1,  -1,  -99, -2,  -99, -99, -99 }   /* 6. After 3rd char of a word   */
};
00084 
00085 // The state is coded with its true state shifted left by kStateShift.
00086 // A repeat count (starting with 0) is stored in the lower bits
00087 // No state is allowed to occur more than kMaxRepeats times.
00088 const int kStateShift = 4;
00089 const int kRepeatMask = (1 << kStateShift) - 1;
00090 
00091 const int kMaxRepeats[kNumStates] = {
00092   3, 10, 3, 3, 3, 3, 3
00093 };
00094 
00097 make_float_var (good_number, GOOD_NUMBER, make_good_number,
00098 8, 15, set_good_number, "Good number adjustment");
00099 
00100 make_float_var (ok_number, OK_NUMBER, make_ok_number,
00101 8, 16, set_ok_number, "Bad number adjustment");
00102 
00103 make_toggle_var (number_debug, 0, make_number_debug,
00104 8, 23, set_number_debug, "Number debug");
00105 
00106 make_int_var (number_depth, 3, make_number_depth,
00107 8, 24, set_number_depth, "Number depth");
00110 /*----------------------------------------------------------------------
00111               M a c r o s
00112 ----------------------------------------------------------------------*/
/*
 * isleading
 *
 * True if ch is one of the leading punctuation characters { [ ( # @ $.
 * The argument is parenthesized so that expression arguments (for example
 * `c | 0x20` or a conditional expression) are compared as a whole.
 * ch may be evaluated several times; do not pass expressions with side
 * effects.
 */
#define isleading(ch)       \
  (((ch) == '{') ||         \
   ((ch) == '[') ||         \
   ((ch) == '(') ||         \
   ((ch) == '#') ||         \
   ((ch) == '@') ||         \
   ((ch) == '$'))
00124 
/*
 * istrailing
 *
 * True if ch is one of the trailing punctuation characters
 * } ] ) ; : , . %.
 * The argument is parenthesized so that expression arguments are compared
 * as a whole.  ch may be evaluated several times; do not pass expressions
 * with side effects.
 */
#define istrailing(ch)      \
  (((ch) == '}') ||         \
   ((ch) == ']') ||         \
   ((ch) == ')') ||         \
   ((ch) == ';') ||         \
   ((ch) == ':') ||         \
   ((ch) == ',') ||         \
   ((ch) == '.') ||         \
   ((ch) == '%'))
00138 
/*
 * isoperator
 *
 * True if ch is one of the operator characters * + - / . : ,
 * The argument is parenthesized so that expression arguments are compared
 * as a whole.  ch may be evaluated several times; do not pass expressions
 * with side effects.
 */
#define isoperator(ch)      \
  (((ch) == '*') ||         \
   ((ch) == '+') ||         \
   ((ch) == '-') ||         \
   ((ch) == '/') ||         \
   ((ch) == '.') ||         \
   ((ch) == ':') ||         \
   ((ch) == ','))
00151 
00152 /*----------------------------------------------------------------------
00153               F u n c t i o n s
00154 ----------------------------------------------------------------------*/
00159 void adjust_number(A_CHOICE *best_choice, float *certainty_array) { 
00160   float adjust_factor;
00161 
00162   if (adjust_debug)
00163     cprintf ("Number: %s %4.2f ",
00164       class_string (best_choice), class_probability (best_choice));
00165 
00166   class_probability (best_choice) += RATING_PAD;
00167   if (pure_number (class_string (best_choice))) {
00168     class_probability (best_choice) *= good_number;
00169     adjust_factor = good_number;
00170     if (adjust_debug)
00171       cprintf (", %4.2f ", good_number);
00172   }
00173   else {
00174     class_probability (best_choice) *= ok_number;
00175     adjust_factor = ok_number;
00176     if (adjust_debug)
00177       cprintf (", N, %4.2f ", ok_number);
00178   }
00179 
00180   class_probability (best_choice) -= RATING_PAD;
00181   LogNewWordChoice(best_choice, adjust_factor, certainty_array);
00182   if (adjust_debug)
00183     cprintf (" --> %4.2f\n", class_probability (best_choice));
00184 }
00185 
00186 
00192 void append_number_choices(int state,
00193                            char *word,
00194                            CHOICES_LIST choices,
00195                            int char_index,
00196                            A_CHOICE *this_choice,
00197                            float *limit,
00198                            float rating,
00199                            float certainty,
00200                            float *certainty_array,
00201                            CHOICES *result) {
00202   int word_ending = FALSE;
00203   int x;
00204 
00205   if (char_index == (array_count (choices) - 1))
00206     word_ending = TRUE;
00207 
00208   word[char_index] = class_string (this_choice)[0];
00209   word[char_index + 1] = '\0';
00210   if (word[char_index] == '\0')
00211     word[char_index] = ' ';
00212   certainty_array[char_index] = class_certainty (this_choice);
00213 
00214   rating += class_probability (this_choice);
00215   certainty = min (class_certainty (this_choice), certainty);
00216 
00217   if (rating < *limit) {
00218 
00219     state = number_state_change (state, word + char_index);
00220     if (number_debug)
00221       cprintf ("%-20s prob=%4.2f  state=%d\n", word, rating, state);
00222 
00223     if (state != -1) {
00224 
00225       if ((state >> kStateShift) == 3 &&
00226           char_index + 3 < array_count (choices)) {
00227         return;
00228       }
00229 
00230       if (word_ending) {
00231         for (x = 0; x <= char_index; x++) {
00232           if (isdigit (word[x])) {
00233             if (number_debug)
00234               cprintf ("new choice = %s\n", word);
00235             push_on (*result, new_choice (word, rating, certainty,
00236               -1, NUMBER_PERM));
00237             adjust_number ((A_CHOICE *) first (*result),
00238               certainty_array);
00239             if (best_probability (*result) > *limit) {
00240               free_choice (first (*result));
00241               pop_off(*result);
00242             }
00243             else {
00244               *limit = best_probability (*result);
00245               break;
00246             }
00247           }
00248         }
00249       }
00250       else {
00251         JOIN_ON (*result,
00252           number_permute (state, choices, char_index + 1, limit,
00253           word, rating, certainty,
00254           certainty_array));
00255       }
00256     }
00257   }
00258   else {
00259     if (number_debug)
00260       cprintf ("pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
00261         word, rating, *limit);
00262   }
00263 }
00264 
00265 
00270 void init_permnum() { 
00271   make_good_number(); 
00272   make_ok_number(); 
00273   make_number_debug(); 
00274   make_number_depth(); 
00275 }
00276 
00277 
00283 int number_character_type(  //current state
00284                           char ch,
00285                           int state) {
00286   char lower_char = tolower (ch);
00287 
00288   if (isalpha (ch)) {
00289     if (state < 4 && strchr (allowed_char_strs[0], lower_char) != NULL)
00290       return 5;
00291     else if (state == 4
00292       && strchr (allowed_char_strs[1], lower_char) != NULL)
00293       return 6;
00294     else if (state == 5
00295       && strchr (allowed_char_strs[2], lower_char) != NULL)
00296       return 7;
00297     return 3;
00298   }
00299   else if (isdigit (ch))
00300     return (1);
00301   else if (isoperator (ch))
00302     return (2);
00303   else if (istrailing (ch))
00304     return (4);
00305   else if (isleading (ch))
00306     return (0);
00307   else
00308     return (-1);
00309 }
00310 
00311 
00321 int number_state_change(int state,           //current state
00322                         const char *word) {  //current char
00323   int char_type;                 //type of char
00324   int new_state;                 //state to return
00325   int old_state = state >> kStateShift;
00326   int repeats = state & kRepeatMask;
00327   int index;
00328   char copy_word[4];             //tolowered chars
00329 
00330   char_type = number_character_type (*word, old_state);
00331   if (char_type == -1)
00332     return -1;
00333   new_state = number_state_table[old_state][char_type];
00334   if (new_state == old_state) {
00335     ++repeats;
00336     if (repeats >= kMaxRepeats[old_state])
00337       return -1;
00338   } else {
00339     repeats = 0;
00340   }
00341   if (new_state >= 0)
00342     return (new_state << kStateShift) | repeats;
00343   if (new_state == -99)
00344     return -1;
00345 
00346   //now check to see if the last state-3 chars in the word
00347   //make an allowable word. For now only 3 letter words
00348   //are allowed
00349   if (old_state != 6)
00350     return -1;                   //only 3 letters now
00351   copy_word[0] = tolower (word[-3]);
00352   copy_word[1] = tolower (word[-2]);
00353   copy_word[2] = tolower (word[-1]);
00354   copy_word[3] = '\0';
00355   for (index = 0; allowed_alpha_strs[index] != NULL; index++) {
00356     if (strcmp (copy_word, allowed_alpha_strs[index]) == 0)
00357       return (-new_state) << kStateShift;
00358   }
00359   return -1;                     //not a good word
00360 }
00361 
00362 
00370 CHOICES number_permute(int state,
00371                        CHOICES_LIST choices,
00372                        int char_index,
00373                        float *limit,
00374                        char *word,
00375                        float rating,
00376                        float certainty,
00377                        float *certainty_array) {
00378   CHOICES result = NIL;
00379   CHOICES c;
00380   int depth = 0;
00381 
00382   if (number_debug) {
00383     cprintf ("number_permute (state=%d, char_index=%d, limit=%4.2f, ",
00384       state, char_index, *limit);
00385     cprintf ("word=%s, rating=%4.2f, certainty=%4.2f)\n",
00386       word, rating, certainty);
00387   }
00388   if (char_index < array_count (choices)) {
00389     iterate_list (c, (CHOICES) array_index (choices, char_index)) {
00390       if (depth++ < number_depth)
00391         append_number_choices (state, word, choices, char_index,
00392           (A_CHOICE *) first (c), limit, rating,
00393           certainty, certainty_array, &result);
00394     }
00395   }
00396   if (result && number_debug == 1)
00397     print_choices ("number_permute:", result);
00398   return (result);
00399 }
00400 
00401 
00406 A_CHOICE *number_permute_and_select(CHOICES_LIST char_choices,
00407                                     float rating_limit) {
00408   CHOICES result = NIL;
00409   char word[MAX_WERD_LENGTH + 1];
00410   float certainty_array[MAX_WERD_LENGTH + 1];
00411   float rating = rating_limit;
00412   A_CHOICE *best_choice;
00413 
00414   best_choice = new_choice (NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
00415 
00416   if (array_count (char_choices) <= MAX_WERD_LENGTH) {
00417     word[0] = '\0';
00418     result = number_permute (0, char_choices, 0, &rating,
00419       word, 0.0, 0.0, certainty_array);
00420 
00421     if (display_ratings && result)
00422       print_choices ("number_permuter", result);
00423 
00424     while (result != NIL) {
00425       if (best_probability (result) < class_probability (best_choice)) {
00426         clone_choice (best_choice, first (result));
00427       }
00428       free_choice (first (result));
00429       pop_off(result);
00430     }
00431   }
00432   return (best_choice);
00433 }
00434 
00435 
/*
 * pure_number
 *
 * Scan the string from its end and decide on the first character that is
 * a digit or a letter: returns 1 if that character is a digit, 0 if it is
 * a letter.  Returns 0 when the string holds neither (including the empty
 * string).
 *
 * Characters are converted to unsigned char before classification because
 * the <ctype.h> functions are not defined for negative values, and the
 * position is kept in a size_t so the string length is never narrowed.
 */
int pure_number(const char *string) {
  size_t pos = strlen (string);

  while (pos > 0) {
    const unsigned char ch = (unsigned char) string[--pos];

    if (isdigit (ch))
      return 1;
    if (isalpha (ch))
      return 0;
  }
  return 0;
}
00452 
00453 
00459 int valid_number(const char *string) { 
00460   int state = 0;
00461   int char_index;
00462   int num_chars = strlen (string);
00463   int num_digits = 0;
00464 
00465   for (char_index = 0; char_index < num_chars; char_index++) {
00466 
00467     state = number_state_change (state, string + char_index);
00468     if (state == -1)
00469       return (FALSE);
00470     if (isdigit (string[char_index]))
00471       num_digits++;
00472   }
00473   return num_digits > num_chars - num_digits;
00474 }

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1