00001 
00029 #include "mfcpch.h"
00030 #include          "tessvars.h"
00031 #ifdef __UNIX__
00032 #include          <assert.h>
00033 #include          <errno.h>
00034 #endif
00035 #include          "scanutils.h"
00036 #include          <ctype.h>
00037 #include          <string.h>
00038 
00039 #include          "memry.h"
00040 #include          "reject.h"
00041 #include          "tfacep.h"
00042 #include          "mainblk.h"
00043 #include          "charcut.h"
00044 #include          "imgs.h"
00045 #include          "scaleimg.h"
00046 #include          "control.h"
00047 #include          "docqual.h"
00048 #include          "secname.h"
00049 
00050 
00051 
00052 
00053 #include          "callnet.h"
00054 
00055 
00056 #include          "notdll.h"
00057 
00058 CLISTIZEH (STRING) CLISTIZE (STRING)  /* presumably instantiates the STRING_CLIST / STRING_C_IT list types used by char_ambiguities - TODO confirm against the CLISTIZE macros */
00059 #define EXTERN  /* EXTERN is defined empty here, so every "EXTERN" line below expands to nothing */
00060 /* Tunable variables. Each *_VAR line appears to take (name, initial value, description string) - TODO confirm against the macro definitions. */
00062 EXTERN
00063 INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");  /* make_reject_map handles only the values 0 and 5 */
00064 EXTERN
00065 INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");  /* set_done handles only the values 0..5 */
00066 EXTERN
00067 BOOL_VAR (tessedit_use_nn, FALSE, "");  /* gates the call to nn_recover_rejects on pass 2 in make_reject_map */
00068 EXTERN
00069 BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");  /* enables the tprintf diagnostics in set_done, make_reject_map and char_ambiguities */
00070 EXTERN
00071 BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
00072 EXTERN
00073 BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
00074 EXTERN
00075 double_VAR (tessedit_lower_flip_hyphen, 1.5,
00076 "Aspect ratio dot/hyphen test");
00077 EXTERN
00078 double_VAR (tessedit_upper_flip_hyphen, 1.8,
00079 "Aspect ratio dot/hyphen test");
00080 /* Controls read by one_ell_conflict when deciding whether a dictionary result can be trusted. */
00081 EXTERN
00082 BOOL_VAR (rej_trust_doc_dawg, FALSE,
00083 "Use DOC dawg in 11l conf. detector");
00084 EXTERN
00085 BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
00086 EXTERN
00087 BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
00088 /* Miscellaneous display/debug switches and neural-net (NN) confidence controls. */
00089 EXTERN
00090 BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
00091 EXTERN
00092 BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");  /* nn_match_word opens a window and shows each clipped image when set */
00093 EXTERN
00094 BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");  /* nn_recover_rejects prints the reject map before and after when set */
00095 EXTERN
00096 BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
00097 EXTERN
00098 BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
00099 EXTERN
00100 BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
00101 EXTERN
00102 BOOL_VAR (nn_conf_double_check_dict, TRUE,
00103 "Double check for confusions");
00104 EXTERN
00105 BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
00106 EXTERN
00107 BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
00108 EXTERN
00109 BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
00110 EXTERN
00111 BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
00112 EXTERN
00113 BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
00114 EXTERN
00115 BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
00116 EXTERN
00117 BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
00118 "Require stronger NN match");
00119 EXTERN
00120 double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
00121 EXTERN
00122 INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
00123 EXTERN
00124 INT_VAR (nn_conf_initial_i_level, 3,
00125 "NN accept initial Ii match level ");  /* minimum nn_match_char level for nn_match_word to accept a leading 'I' or 'i' */
00126 /* The no_unrej_* switches are read by nn_recover_rejects; the rej_use_* and rej_alphas_* switches are read by make_reject_map in reject mode 5. */
00127 EXTERN
00128 BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
00129 EXTERN
00130 BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
00131 EXTERN
00132 BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
00133 EXTERN
00134 BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
00135 EXTERN
00136 BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
00137 EXTERN
00138 BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
00139 EXTERN
00140 BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
00141 EXTERN
00142 BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
00143 /* Fraction/ratio thresholds; rej_mostly_reject_mode == 1 makes nn_recover_rejects call reject_mostly_rejects. */
00144 EXTERN
00145 double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
00146 "if >this fract");
00147 EXTERN
00148 INT_VAR (rej_mostly_reject_mode, 1,
00149 "0-never, 1-afterNN, 2-after new xht");
00150 EXTERN
00151 double_VAR (tessed_fullstop_aspect_ratio, 1.2,
00152 "if >this fract then reject");
00153 /* NN input image geometry (read by nn_match_word) and the page-edge margin used by reject_edge_blobs. */
00154 EXTERN
00155 INT_VAR (net_image_width, 40, "NN input image width");
00156 EXTERN
00157 INT_VAR (net_image_height, 36, "NN input image height");
00158 EXTERN
00159 INT_VAR (net_image_x_height, 22, "NN input image x_height");
00160 EXTERN
00161 INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");  /* make_reject_map skips reject_edge_blobs when this is below 0 */
00162 
00163 EXTERN
00164 INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
00165 
00166 EXTERN
00167 double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
00168 EXTERN
00169 double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
00170 
00171 /* Character sets. "\075" is the octal escape for '='. */
00172 EXTERN
00173 STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
00174 "Allow NN to unrej");
00175 EXTERN
00176 STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
00177 "Allow NN to unrej");
00178 EXTERN
00179 STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");  /* the set tested by one_ell_conflict and reject_I_1_L */
00180 EXTERN
00181 STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
00182 EXTERN
00183 STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
00184 EXTERN
00185 STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
00186 "Unreliable chars");
00187 EXTERN
00188 STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
00189 "Unreliable chars");
00190 /* In reject mode 5, make_reject_map rejects the whole word when bln_x_height / denorm scale is at or below this value. */
00191 EXTERN
00192 INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
00213 void set_done(
00214               WERD_RES *word,
00215               INT16 pass) {
00216   
00217 
00218 
00219   if (tessedit_ok_mode == 0) {
00220     
00221     word->done = word->tess_accepted;
00222   }
00223   
00224 
00225 
00226   else if (tessedit_ok_mode == 1) {
00227     word->done = word->tess_accepted &&
00228       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00229 
00230     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00231       word->done = FALSE;
00232   }
00233   
00234 
00235 
00236   else if (tessedit_ok_mode == 2) {
00237     word->done = word->tess_accepted &&
00238       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00239 
00240     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00241       word->done = FALSE;
00242 
00243     if (word->done &&
00244       (pass == 1) &&
00245       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00246       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00247       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00248     (word->best_choice->permuter () != NUMBER_PERM)) {
00249       #ifndef SECURE_NAMES
00250       if (tessedit_rejection_debug)
00251         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00252           word->best_choice->string ().string ());
00253       #endif
00254       word->done = FALSE;
00255     }
00256   }
00257   
00258 
00259 
00260   else if (tessedit_ok_mode == 3) {
00261     word->done = word->tess_accepted &&
00262       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00263 
00264     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00265       word->done = FALSE;
00266 
00267     if (word->done &&
00268       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00269       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00270       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00271     (word->best_choice->permuter () != NUMBER_PERM)) {
00272       #ifndef SECURE_NAMES
00273       if (tessedit_rejection_debug)
00274         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00275           word->best_choice->string ().string ());
00276       #endif
00277       word->done = FALSE;
00278     }
00279   }
00280   
00281 
00282 
00283   else if (tessedit_ok_mode == 4) {
00284     word->done = word->tess_accepted &&
00285       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00286 
00287     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00288       word->done = FALSE;
00289 
00290     if (word->done &&
00291       (pass == 1) &&
00292       ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00293       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00294       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00295       (word->best_choice->permuter () != NUMBER_PERM)) ||
00296     (test_ambig_word (word))) {
00297       #ifndef SECURE_NAMES
00298       if (tessedit_rejection_debug)
00299         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00300           word->best_choice->string ().string ());
00301       #endif
00302       word->done = FALSE;
00303     }
00304   }
00305   
00306 
00307 
00308   else if (tessedit_ok_mode == 5) {
00309     word->done = word->tess_accepted &&
00310       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00311 
00312     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00313       word->done = FALSE;
00314 
00315     if (word->done &&
00316       ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00317       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00318       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00319       (word->best_choice->permuter () != NUMBER_PERM)) ||
00320     (test_ambig_word (word))) {
00321       #ifndef SECURE_NAMES
00322       if (tessedit_rejection_debug)
00323         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00324           word->best_choice->string ().string ());
00325       #endif
00326       word->done = FALSE;
00327     }
00328   }
00329 
00330   else {
00331     tprintf ("BAD tessedit_ok_mode\n");
00332     err_exit();
00333   }
00334 }
00335 
00336 
00357 void make_reject_map(
00358                      WERD_RES *word,
00359                      BLOB_CHOICE_LIST_CLIST *blob_choices,
00360                      ROW *row,
00361                      INT16 pass  
00362                     ) {
00363   INT16 i;
00364 
00365   flip_0O(word);
00366   check_debug_pt (word, -1);     
00367   set_done(word, pass);  
00368   word->reject_map.initialise (word->best_choice->string ().length ());
00369   reject_blanks(word);
00370   
00371 
00372 
00373   if (tessedit_reject_mode == 0) {
00374     if (!word->done)
00375       reject_poor_matches(word, blob_choices);
00376   }
00377   
00378 
00379 
00380 
00381 
00382   else if (tessedit_reject_mode == 5) {
00383     if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
00384       word->reject_map.rej_word_small_xht ();
00385     else {
00386       one_ell_conflict(word, TRUE);
00387       
00388 
00389 
00390 
00391 
00392 
00393       if (rej_use_tess_accepted && !word->tess_accepted)
00394         word->reject_map.rej_word_not_tess_accepted ();
00395 
00396       if (rej_use_tess_blanks &&
00397         (strchr (word->best_choice->string ().string (), ' ') != NULL))
00398         word->reject_map.rej_word_contains_blanks ();
00399 
00400       if (rej_use_good_perm) {
00401         if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00402           (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00403           (word->best_choice->permuter () == USER_DAWG_PERM)) &&
00404           (!rej_use_sensible_wd ||
00405           (acceptable_word_string
00406           (word->best_choice->string ().string ()) != AC_UNACCEPTABLE))) {
00407           
00408         }
00409         else if (word->best_choice->permuter () == NUMBER_PERM) {
00410           if (rej_alphas_in_number_perm) {
00411             for (i = 0; word->best_choice->string ()[i] != '\0';
00412             i++) {
00413               if (word->reject_map[i].accepted () &&
00414                 isalpha (word->best_choice->string ()[i]))
00415                 word->reject_map[i].setrej_bad_permuter ();
00416               
00417             }
00418           }
00419         }
00420         else {
00421           word->reject_map.rej_word_bad_permuter ();
00422         }
00423       }
00424 
00425       
00426 
00427     }
00428   }
00429   else {
00430     tprintf ("BAD tessedit_reject_mode\n");
00431     err_exit();
00432   }
00433 
00434   if (tessedit_image_border > -1)
00435     reject_edge_blobs(word);
00436 
00437   check_debug_pt (word, 10);
00438   if (tessedit_rejection_debug) {
00439     tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
00440     tprintf ("Certainty: %f     Rating: %f\n",
00441       word->best_choice->certainty (), word->best_choice->rating ());
00442     tprintf ("Dict word: %d\n",
00443       dict_word (word->best_choice->string ().string ()));
00444   }
00445 
00446   
00447 
00448   if (tessedit_use_nn && (pass == 2) &&
00449     word->reject_map.recoverable_rejects ())
00450     nn_recover_rejects(word, row);
00451   flip_hyphens(word);
00452   check_debug_pt (word, 20);
00453 }
00454 
00455 
00463 void reject_blanks(WERD_RES *word) {
00464   INT16 i;
00465 
00466   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00467     if (word->best_choice->string ()[i] == ' ')
00468       word->reject_map[i].setrej_tess_failure (); 
00469   }
00470 }
00471 
00472 
00481 void reject_I_1_L(WERD_RES *word) {
00482   INT16 i;
00483 
00484   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00485     if (STRING (conflict_set_I_l_1).
00486     contains (word->best_choice->string ()[i])) {
00487       word->reject_map[i].setrej_1Il_conflict (); 
00488     }
00489   }
00490 }
00491 
00492 
00501 void reject_poor_matches(  
00502                          WERD_RES *word,
00503                          BLOB_CHOICE_LIST_CLIST *blob_choices) {
00504   float threshold;
00505   INT16 i = 0;
00506                                  
00507   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00508   BLOB_CHOICE_IT choice_it;      
00509 
00510   #ifndef SECURE_NAMES
00511   if (strlen (word->best_choice->string ().string ()) != list_it.length ()) {
00512     tprintf
00513       ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
00514       word->best_choice->string ().string (),
00515       strlen (word->best_choice->string ().string ()), list_it.length (),
00516       word->outword->blob_list ()->length ());
00517   }
00518   #endif
00519   ASSERT_HOST (strlen (word->best_choice->string ().string ()) ==
00520     list_it.length ());
00521   ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
00522   threshold = compute_reject_threshold (blob_choices);
00523 
00524   for (list_it.mark_cycle_pt ();
00525   !list_it.cycled_list (); list_it.forward (), i++) {
00526     
00527 
00528 
00529 
00530     choice_it.set_to_list (list_it.data ());
00531     if ((word->best_choice->string ()[i] == ' ') ||
00532       (choice_it.length () == 0))
00533       word->reject_map[i].setrej_tess_failure (); 
00534     else if (choice_it.data ()->certainty () < threshold)
00535       word->reject_map[i].setrej_poor_match (); 
00536   }
00537 }
00538 
00539 
00549 float compute_reject_threshold(
00550                                BLOB_CHOICE_LIST_CLIST *blob_choices) {
00551   INT16 index;                   
00552   INT16 blob_count;              
00553   INT16 ok_blob_count = 0;       
00554   float *ratings;                
00555   float threshold;               
00556   float bestgap;                 
00557   float gapstart;                
00558                                  
00559   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00560   BLOB_CHOICE_IT choice_it;      
00561 
00562   blob_count = blob_choices->length ();
00563   ratings = (float *) alloc_mem (blob_count * sizeof (float));
00564   for (list_it.mark_cycle_pt (), index = 0;
00565   !list_it.cycled_list (); list_it.forward (), index++) {
00566     choice_it.set_to_list (list_it.data ());
00567     if (choice_it.length () > 0) {
00568       ratings[ok_blob_count] = choice_it.data ()->certainty ();
00569       
00570       
00571       
00572       
00573       ok_blob_count++;
00574     }
00575   }
00576   ASSERT_HOST (index == blob_count);
00577   qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
00578   
00579   bestgap = 0;
00580   gapstart = ratings[0] - 1;     
00581   if (ok_blob_count >= 3) {
00582     for (index = 0; index < ok_blob_count - 1; index++) {
00583       if (ratings[index + 1] - ratings[index] > bestgap) {
00584         bestgap = ratings[index + 1] - ratings[index];
00585         
00586         gapstart = ratings[index];
00587       }
00588     }
00589   }
00590   threshold = gapstart + bestgap / 2;
00591   
00592   
00593 
00594   free_mem(ratings);
00595   return threshold;
00596 }
00597 
00598 
/**
 * sort_floats
 *
 * qsort() comparison callback ordering floats ascending.
 *
 * @param arg1  pointer to the first float
 * @param arg2  pointer to the second float
 * @return 1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0
 */
int sort_floats(const void *arg1, const void *arg2) {
  const float lhs = *((const float *) arg1);
  const float rhs = *((const float *) arg2);
  const float delta = lhs - rhs;

  return (delta > 0) - (delta < 0);
}
00619 
00620 
00631 void reject_edge_blobs(WERD_RES *word) {
00632   BOX word_box = word->word->bounding_box ();
00633   BOX blob_box;
00634   PBLOB_IT blob_it = word->outword->blob_list ();
00635   
00636   int blobindex = 0;
00637   float centre;
00638 
00639   if ((word_box.left () < tessedit_image_border) ||
00640     (word_box.bottom () < tessedit_image_border) ||
00641     (word_box.right () + tessedit_image_border >
00642     page_image.get_xsize () - 1) ||
00643   (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
00644     ASSERT_HOST (word->reject_map.length () == blob_it.length ());
00645     for (blobindex = 0, blob_it.mark_cycle_pt ();
00646     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00647       blob_box = blob_it.data ()->bounding_box ();
00648       centre = (blob_box.left () + blob_box.right ()) / 2.0;
00649       if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
00650         (word->denorm.y (blob_box.bottom (), centre) <
00651         tessedit_image_border) ||
00652         (word->denorm.x (blob_box.right ()) + tessedit_image_border >
00653         page_image.get_xsize () - 1) ||
00654         (word->denorm.y (blob_box.top (), centre)
00655       + tessedit_image_border > page_image.get_ysize () - 1)) {
00656         word->reject_map[blobindex].setrej_edge_char (); 
00657       }
00658     }
00659   }
00660 }
00661 
00662 
00684 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00685   const char *word;
00686   INT16 word_len;                
00687   INT16 first_alphanum_idx;
00688   INT16 i;
00689   BOOL8 non_conflict_set_char;   
00690   BOOL8 conflict = FALSE;
00691   BOOL8 allow_1s;
00692   ACCEPTABLE_WERD_TYPE word_type;
00693   BOOL8 dict_perm_type;
00694   BOOL8 dict_word_ok;
00695   int dict_word_type;
00696 
00697   word = word_res->best_choice->string ().string ();
00698   word_len = strlen (word);
00699   
00700 
00701 
00702 
00703   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00704     return FALSE;
00705 
00706   
00707 
00708 
00709 
00710 
00711   for (i = 0, non_conflict_set_char = FALSE;
00712     (i < word_len) && !non_conflict_set_char; i++)
00713   non_conflict_set_char = isalnum (word[i]) &&
00714       !STRING (conflict_set_I_l_1).contains (word[i]);
00715   if (!non_conflict_set_char) {
00716     if (update_map)
00717       reject_I_1_L(word_res);
00718     return TRUE;
00719   }
00720 
00721   
00722 
00723 
00724 
00725 
00726 
00727   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00728     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00729     (rej_trust_doc_dawg &&
00730     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00731     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00732   dict_word_type = dict_word (word);
00733   dict_word_ok = (dict_word_type > 0) &&
00734     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00735 
00736   if ((rej_1Il_use_dict_word && dict_word_ok) ||
00737     (rej_1Il_trust_permuter_type && dict_perm_type) ||
00738   (dict_perm_type && dict_word_ok)) {
00739     first_alphanum_idx = first_alphanum_pos (word);
00740     if (word[first_alphanum_idx] == 'I') {
00741       word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00742       if (safe_dict_word (word) > 0) {
00743         word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00744         if (update_map)
00745           word_res->reject_map[first_alphanum_idx].
00746             setrej_1Il_conflict();
00747         return TRUE;
00748       }
00749       else {
00750         word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00751         return FALSE;
00752       }
00753     }
00754 
00755     if (word[first_alphanum_idx] == 'l') {
00756       word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00757       if (safe_dict_word (word) > 0) {
00758         word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00759         if (update_map)
00760           word_res->reject_map[first_alphanum_idx].
00761           setrej_1Il_conflict();
00762         return TRUE;
00763       }
00764       else {
00765         word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00766         return FALSE;
00767       }
00768     }
00769     return FALSE;
00770   }
00771 
00772   
00773 
00774 
00775 
00776 
00777 
00778 
00779 
00780 
00781 
00782   
00783 
00784 
00785 
00786   first_alphanum_idx = first_alphanum_pos (word);
00787   if (word[first_alphanum_idx] == 'l') {
00788     word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00789     if (safe_dict_word (word) > 0)
00790       return FALSE;
00791     else
00792       word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00793   }
00794   else if (word[first_alphanum_idx] == 'I') {
00795     word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00796     if (safe_dict_word (word) > 0)
00797       return FALSE;
00798     else
00799       word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00800   }
00801   
00802 
00803 
00804 
00805 
00806 
00807   if (word_contains_non_1_digit (word)) {
00808     allow_1s = (alpha_count (word) == 0) ||
00809       (word_res->best_choice->permuter () == NUMBER_PERM);
00810 
00811     conflict = FALSE;
00812     for (i = 0; i < word_len; i++) {
00813       if ((!allow_1s || (word[i] != '1')) &&
00814       STRING (conflict_set_I_l_1).contains (word[i])) {
00815         if (update_map)
00816           word_res->reject_map[i].setrej_1Il_conflict ();
00817         conflict = TRUE;
00818       }
00819     }
00820     return conflict;
00821   }
00822   
00823 
00824 
00825 
00826   word_type = acceptable_word_string (word);
00827   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
00828     first_alphanum_idx = first_alphanum_pos (word);
00829     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_idx])) {
00830       if (update_map)
00831         word_res->reject_map[first_alphanum_idx].setrej_1Il_conflict ();
00832       return TRUE;
00833     }
00834     else
00835       return FALSE;
00836   }
00837   else if (word_type == AC_UPPER_CASE) {
00838     return FALSE;
00839   }
00840   else {
00841     if (update_map)
00842       reject_I_1_L(word_res);
00843     return TRUE;
00844   }
00845 }
00846 
00847 
00851 INT16 first_alphanum_pos(const char *word) {
00852   INT16 i;
00853 
00854   for (i = 0; word[i] != '\0'; i++) {
00855     if (isalnum (word[i]))
00856       return i;
00857   }
00858   return -1;
00859 }
00860 
00861 
00865 INT16 alpha_count(const char *word) {
00866   INT16 i;
00867   INT16 count = 0;
00868 
00869   for (i = 0; word[i] != '\0'; i++) {
00870     if (isalpha (word[i]))
00871       count++;
00872   }
00873   return count;
00874 }
00875 
00876 
00880 BOOL8 word_contains_non_1_digit(const char *word) {
00881   INT16 i;
00882 
00883   for (i = 0; word[i] != '\0'; i++) {
00884     if (isdigit (word[i]) && word[i] != '1')
00885       return TRUE;
00886   }
00887   return FALSE;
00888 }
00889 
00890 
00896 BOOL8 test_ambig_word(
00897                       WERD_RES *word) {
00898   BOOL8 ambig = FALSE;
00899 
00900   if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00901     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00902   (word->best_choice->permuter () == USER_DAWG_PERM)) {
00903     ambig = !NoDangerousAmbig(word->best_choice->string().string(), NULL);
00904   }
00905   return ambig;
00906 }
00907 
00908 
00919 BOOL8 ambig_word(
00920                  const char *start_word,
00921                  char *temp_word,
00922                  INT16 test_char_pos
00923                 ) {
00924   const char *ambigs;            
00925 
00926   if (*(temp_word + test_char_pos) == '\0') {
00927     if (safe_dict_word (temp_word)) {
00928       if (strcmp (start_word, temp_word) == 0)
00929         return FALSE;
00930       else
00931         return TRUE;
00932     }
00933     else
00934       return FALSE;
00935   }
00936   else {
00937     ambigs = char_ambiguities (*(temp_word + test_char_pos));
00938     if (ambigs == NULL)
00939       return ambig_word (start_word, temp_word, test_char_pos + 1);
00940     else {
00941       while (*ambigs != '\0') {
00942         *(temp_word + test_char_pos) = *ambigs++;
00943         
00944         if (ambig_word (start_word, temp_word, test_char_pos + 1))
00945           return TRUE;
00946       }
00947       return FALSE;
00948     }
00949   }
00950 }
00951 
00952 
00971 const char *char_ambiguities(char c) {
00972   static STRING_CLIST conflict_sets;
00973   static BOOL8 read_conflict_sets = FALSE;
00974   STRING_C_IT cs_it(&conflict_sets);
00975   const char *cs;
00976   STRING cs_file_name;
00977   FILE *cs_file;
00978   char buff[1024];
00979 
00980   if (!read_conflict_sets) {
00981     cs_file_name = datadir + "confsets";
00982     if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
00983       CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
00984         cs_file_name.string (), errno);
00985     }
00986     while (fscanf (cs_file, "%s", buff) == 1) {
00987       cs_it.add_after_then_move (new STRING (buff));
00988     }
00989     read_conflict_sets = TRUE;
00990     cs_it.move_to_first ();
00991     if (tessedit_rejection_debug) {
00992       for (cs_it.mark_cycle_pt ();
00993       !cs_it.cycled_list (); cs_it.forward ()) {
00994         tprintf ("\"%s\"\n", cs_it.data ()->string ());
00995       }
00996     }
00997   }
00998 
00999   cs_it.move_to_first ();
01000   for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
01001     cs = cs_it.data ()->string ();
01002     if (strchr (cs, c) != NULL)
01003       return cs;
01004   }
01005   return NULL;
01006 }
01007 
01008 #ifndef EMBEDDED
01009 
01018 void test_ambigs(const char *word) {
01019   char orig_word[80];
01020   char temp_word[80];
01021 
01022   if (strlen (word) > 80)
01023     tprintf ("Ridiculously long word \"%s\"\n", word);
01024   else {
01025     strcpy(orig_word, word);
01026     while (strlen (orig_word) > 0) {
01027       strcpy(temp_word, orig_word);
01028 
01029       #ifndef SECURE_NAMES
01030       if (ambig_word (orig_word, temp_word, 0))
01031         tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);
01032       else
01033         tprintf ("NO Ambiguities for  \"%s\"\n", orig_word);
01034       tprintf ("Next Word > ");
01035       #endif
01036       scanf ("%s", orig_word);
01037     }
01038   }
01039 }
01040 #endif
01041 
01042 
01054 void nn_recover_rejects(WERD_RES *word, ROW *row) {
01055   REJMAP old_map = word->reject_map; 
01056 
01057   
01058 
01059 
01060 
01061 
01062 
01063 
01064   set_global_subsubloc_code(SUBSUBLOC_NN);
01065   nn_match_word(word, row);
01066 
01067   if (no_unrej_1Il)
01068     dont_allow_1Il(word);
01069   if (no_unrej_dubious_chars)
01070     dont_allow_dubious_chars(word);
01071 
01072   if (rej_mostly_reject_mode == 1)
01073     reject_mostly_rejects(word);
01074   
01075 
01076 
01077 
01078 
01079 
01080   if (no_unrej_no_alphanum_wds &&
01081     (count_alphanums (word) < 1) &&
01082     !((word->best_choice->string ().length () == 1) &&
01083     STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->
01084     string ()[0]))
01085     && !repeated_nonalphanum_wd (word, row))
01086 
01087     word->reject_map.rej_word_no_alphanums ();
01088 
01089   #ifndef SECURE_NAMES
01090 
01091   if (nn_debug) {
01092     tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());
01093     old_map.print (stdout);
01094     tprintf ("->");
01095     word->reject_map.print (stdout);
01096     tprintf ("\n");
01097   }
01098   #endif
01099   set_global_subsubloc_code(SUBSUBLOC_OTHER);
01100 }
01101 
01102 
01121 void nn_match_word(
01122                    WERD_RES *word,
01123                    ROW *row) {
01124   PIXROW_LIST *pixrow_list;
01125   PIXROW_IT pixrow_it;
01126   IMAGELINE *imlines;            
01127   BOX pix_box;                   
01128 #ifndef GRAPHICS_DISABLED
01129   WINDOW win = NULL;
01130 #endif
01131   IMAGE clip_image;
01132   IMAGE scaled_image;
01133   float baseline_pos;
01134   INT16 net_image_size;
01135   INT16 clip_image_size;
01136   WERD copy_outword;             
01137   INT16 i;
01138 
01139   const char *word_string;
01140   BOOL8 word_in_dict;            
01141   BOOL8 checked_dict_word;       
01142   BOOL8 sensible_word;           
01143   BOOL8 centre;                  
01144   BOOL8 good_quality_word;
01145   INT16 char_quality;
01146   INT16 accepted_char_quality;
01147 
01148   INT16 conf_level; 
01149   INT16 first_alphanum_idx;
01150 
01151   word_string = word->best_choice->string ().string ();
01152   first_alphanum_idx = first_alphanum_pos (word_string);
01153   word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01154     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01155     (word->best_choice->permuter () == USER_DAWG_PERM));
01156   checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);
01157   sensible_word = acceptable_word_string (word_string) != AC_UNACCEPTABLE;
01158 
01159   word_char_quality(word, row, &char_quality, &accepted_char_quality);
01160   good_quality_word = word->best_choice->string ().length () == char_quality;
01161 
01162   #ifndef SECURE_NAMES
01163   if (nn_reject_debug) {
01164     tprintf ("Dict: %c   Checked Dict: %c   Sensible: %c   Quality: %c\n",
01165       word_in_dict ? 'T' : 'F',
01166       checked_dict_word ? 'T' : 'F',
01167       sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
01168   }
01169   #endif
01170 
01171   if (word->best_choice->string ().length () !=
01172   word->outword->blob_list ()->length ()) {
01173     #ifndef SECURE_NAMES
01174     tprintf ("nn_match_word ASSERT FAIL String:\"%s\";  #Blobs=%d\n",
01175       word->best_choice->string ().string (),
01176       word->outword->blob_list ()->length ());
01177     #endif
01178     err_exit();
01179   }
01180 
01181   copy_outword = *(word->outword);
01182   copy_outword.baseline_denormalise (&word->denorm);
01183   
01184 
01185 
01186 
01187 
01188 
01189   net_image_size = (net_image_width > net_image_height) ?
01190     net_image_width : net_image_height;
01191   clip_image_size = (INT16) floor (0.5 +
01192     net_image_size * word->x_height /
01193     net_image_x_height);
01194   if ((clip_image_size <= 1) || (net_image_size <= 1)) {
01195     return;
01196   }
01197 
01198   
01199 
01200 
01201   char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
01202 #ifndef GRAPHICS_DISABLED
01203   if (show_char_clipping) {
01204     win = display_clip_image (©_outword, page_image,
01205       pixrow_list, pix_box);
01206   }
01207 #endif
01208   pixrow_it.set_to_list (pixrow_list);
01209   pixrow_it.move_to_first ();
01210   for (pixrow_it.mark_cycle_pt (), i = 0;
01211   !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
01212     if (pixrow_it.data ()->
01213       bad_box (page_image.get_xsize (), page_image.get_ysize ()))
01214       continue;
01215     clip_image.create (clip_image_size, clip_image_size, 1);
01216     
01217     if (!copy_outword.flag (W_INVERSE))
01218       invert_image(&clip_image);  
01219     pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
01220       clip_image, baseline_pos);
01221     if (copy_outword.flag (W_INVERSE))
01222       invert_image(&clip_image);  
01223     scaled_image.create (net_image_size, net_image_size, 1);
01224     scale_image(clip_image, scaled_image);
01225     baseline_pos *= net_image_size / clip_image_size;
01226     
01227     centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
01228 
01229     conf_level = nn_match_char (scaled_image, baseline_pos,
01230       word_in_dict, checked_dict_word,
01231       sensible_word, centre,
01232       good_quality_word, word_string[i]);
01233     if (word->reject_map[i].recoverable ()) {
01234       if ((i == first_alphanum_idx) &&
01235       ((word_string[i] == 'I') || (word_string[i] == 'i'))) {
01236         if (conf_level >= nn_conf_initial_i_level)
01237           word->reject_map[i].setrej_nn_accept (); 
01238       }
01239       else if (conf_level > 0)
01240         word->reject_map[i].setrej_nn_accept (); 
01241     }
01242 #ifndef GRAPHICS_DISABLED
01243     if (show_char_clipping)
01244       display_images(clip_image, scaled_image);
01245 #endif
01246     clip_image.destroy ();
01247     scaled_image.destroy ();
01248   }
01249 
01250   delete[]imlines;               
01251   delete pixrow_list;
01252 
01253 #ifndef GRAPHICS_DISABLED
01254   if (show_char_clipping) {
01255     destroy_window(win);
01256   }
01257 #endif
01258 }
01259 
01260 
01276 INT16 nn_match_char(
01277                     IMAGE &scaled_image,
01278                     float baseline_pos,       
01279                     BOOL8 dict_word,          
01280                     BOOL8 checked_dict_word,  
01281                     BOOL8 sensible_word,      
01282                     BOOL8 centre,             
01283                     BOOL8 good_quality_word,  
01284                     char tess_ch              
01285                    ) {
01286   INT16 conf_level;              
01287   INT32 row;
01288   INT32 col;
01289   INT32 y_size = scaled_image.get_ysize ();
01290   INT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
01291   INT32 end_y = start_y - net_image_height + 1;
01292   IMAGELINE imline;
01293   float *input_vector;
01294   float *input_vec_ptr;
01295   char top;
01296   float top_score;
01297   char next;
01298   float next_score;
01299   INT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
01300   INT16 j;
01301 
01302   input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
01303   input_vec_ptr = input_vector;
01304 
01305   invert_image(&scaled_image);  
01306   for (row = start_y; row >= end_y; row--) {
01307     scaled_image.fast_get_line (0, row, net_image_width, &imline);
01308     for (col = 0; col < net_image_width; col++)
01309       *input_vec_ptr++ = imline.pixels[col];
01310   }
01311   
01312 
01313 
01314 
01315   baseline_pos -= (y_size - net_image_height) / 2.0;
01316   
01317 
01318 
01319 
01320 
01321 
01322 
01323   if (baseline_pos < 0)
01324     baseline_pos = 0;
01325   else if (baseline_pos >= net_image_height)
01326     baseline_pos = net_image_height + 1;
01327   else
01328     baseline_pos = baseline_pos + 1;
01329   baseline_pos = baseline_pos / (net_image_height + 1);
01330 
01331   if (net_bl_nodes > 0) {
01332     baseline_pos *= 1.7;         
01333     if (net_bl_nodes > 1) {
01334       
01335       for (j = 0; j < net_bl_nodes; j++) {
01336         if (baseline_pos > ((float) j / net_bl_nodes))
01337           *input_vec_ptr++ = 1.0;
01338         else
01339           *input_vec_ptr++ = 0.0;
01340       }
01341     }
01342     else {
01343       
01344       *input_vec_ptr++ = baseline_pos;
01345     }
01346   }
01347 
01348   callnet(input_vector, &top, &top_score, &next, &next_score);
01349   conf_level = evaluate_net_match (top, top_score, next, next_score,
01350     tess_ch, dict_word, checked_dict_word,
01351     sensible_word, centre, good_quality_word);
01352   #ifndef SECURE_NAMES
01353   if (nn_reject_debug) {
01354     tprintf ("top:\"%c\" %4.2f   next:\"%c\" %4.2f  TESS:\"%c\" Conf: %d\n",
01355       top, top_score, next, next_score, tess_ch, conf_level);
01356   }
01357   #endif
01358   free_mem(input_vector);
01359   return conf_level;
01360 }
01361 
01386 INT16 evaluate_net_match(char top,
01387                          float top_score,
01388                          char next,
01389                          float next_score,
01390                          char tess_ch,
01391                          BOOL8 dict_word,
01392                          BOOL8 checked_dict_word,
01393                          BOOL8 sensible_word,
01394                          BOOL8 centre,
01395                          BOOL8 good_quality_word) {
01396   INT16 accept_level; 
01397   BOOL8 good_top_choice;
01398   BOOL8 excellent_top_choice;
01399   BOOL8 confusion_match = FALSE;
01400   BOOL8 dodgy_char = !isalnum (tess_ch);
01401 
01402   good_top_choice = (top_score > nn_reject_threshold) &&
01403     (nn_reject_head_and_shoulders * top_score > next_score);
01404 
01405   excellent_top_choice = good_top_choice &&
01406     (top_score > nn_dodgy_char_threshold);
01407 
01408   if (top == tess_ch) {
01409     if (excellent_top_choice)
01410       accept_level = 0;
01411     else if (good_top_choice)
01412       accept_level = 1;          
01413     else
01414       accept_level = 2;          
01415   }
01416   else if ((nn_conf_1Il &&
01417     STRING (conflict_set_I_l_1).contains (tess_ch) &&
01418     STRING (conflict_set_I_l_1).contains (top)) ||
01419     (nn_conf_hyphen &&
01420     STRING (conflict_set_hyphen).contains (tess_ch) &&
01421     STRING (conflict_set_hyphen).contains (top)) ||
01422     (nn_conf_Ss &&
01423     STRING (conflict_set_S_s).contains (tess_ch) &&
01424   STRING (conflict_set_S_s).contains (top))) {
01425     confusion_match = TRUE;
01426     if (good_top_choice)
01427       accept_level = 1;          
01428     else
01429       accept_level = 2;          
01430   }
01431   else if ((nn_conf_1Il &&
01432     STRING (conflict_set_I_l_1).contains (tess_ch) &&
01433     STRING (conflict_set_I_l_1).contains (next)) ||
01434     (nn_conf_hyphen &&
01435     STRING (conflict_set_hyphen).contains (tess_ch) &&
01436     STRING (conflict_set_hyphen).contains (next)) ||
01437     (nn_conf_Ss &&
01438     STRING (conflict_set_S_s).contains (tess_ch) &&
01439   STRING (conflict_set_S_s).contains (next))) {
01440     confusion_match = TRUE;
01441     if (!good_top_choice)
01442       accept_level = 3;          
01443     else
01444       accept_level = 4;          
01445   }
01446   else if (next == tess_ch) {
01447     if (!good_top_choice)
01448       accept_level = 3;          
01449     else
01450       accept_level = 4;          
01451   }
01452   else
01453     accept_level = 5;
01454 
01455   
01456 
01457   
01458 
01459 
01460   if ((accept_level == 0) && !confusion_match)
01461     return 3;
01462 
01463   if ((accept_level <= 1) &&
01464     (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
01465     return 3;
01466 
01467   if ((accept_level == 2) &&
01468     !confusion_match && !dodgy_char &&
01469     good_quality_word &&
01470     dict_word &&
01471     (checked_dict_word || !nn_double_check_dict) && sensible_word)
01472     return 2;
01473 
01474   if (confusion_match &&
01475     (accept_level <= nn_conf_accept_level) &&
01476     (good_quality_word ||
01477     (!nn_conf_test_good_qual &&
01478     !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
01479     (dict_word || !nn_conf_test_dict) &&
01480     (checked_dict_word || !nn_conf_double_check_dict) &&
01481     (sensible_word || !nn_conf_test_sensible))
01482     return 1;
01483 
01484   if (!confusion_match &&
01485     nn_lax &&
01486     (accept_level == 3) &&
01487     (good_quality_word || !nn_conf_test_good_qual) &&
01488     (dict_word || !nn_conf_test_dict) &&
01489     (sensible_word || !nn_conf_test_sensible))
01490     return 1;
01491   else
01492     return 0;
01493 }
01494 
01495 
01505 void dont_allow_dubious_chars(WERD_RES *word) {
01506   int i = 0;
01507   int rej_pos;
01508   int word_len = word->reject_map.length ();
01509 
01510   while (i < word_len) {
01511     
01512 
01513     while ((i < word_len) && (word->reject_map[i].accepted ()))
01514       i++;
01515 
01516     if (i < word_len) {
01517       rej_pos = i;
01518 
01519       
01520       i--;
01521       while ((i >= 0) &&
01522         STRING (dubious_chars_left_of_reject).contains (word->
01523         best_choice->
01524         string ()
01525       [i])) {
01526         word->reject_map[i--].setrej_dubious ();
01527       }
01528 
01529       
01530 
01531       for (i = rej_pos;
01532         (i < word_len) && (word->reject_map[i].rejected ()); i++);
01533 
01534       
01535 
01536       while ((i < word_len) &&
01537         STRING (dubious_chars_right_of_reject).contains (word->
01538         best_choice->
01539         string ()
01540       [i])) {
01541         word->reject_map[i++].setrej_dubious ();
01542       }
01543     }
01544   }
01545 }
01546 
01547 
01555 void dont_allow_1Il(WERD_RES *word) {
01556   int i = 0;
01557   int word_len = word->reject_map.length ();
01558   const char *s = word->best_choice->string ().string ();
01559   BOOL8 accepted_1Il = FALSE;
01560 
01561   for (i = 0; i < word_len; i++) {
01562     if (word->reject_map[i].accepted ()) {
01563       if (STRING (conflict_set_I_l_1).contains (s[i]))
01564         accepted_1Il = TRUE;
01565       else {
01566         if (isalnum (s[i]))
01567           return;                
01568       }
01569     }
01570   }
01571   if (!accepted_1Il)
01572     return;                      
01573 
01574   for (i = 0; i < word_len; i++) {
01575     if (STRING (conflict_set_I_l_1).contains (s[i]) &&
01576       word->reject_map[i].accepted ())
01577       word->reject_map[i].setrej_postNN_1Il ();
01578   }
01579 }
01580 
01581 
01585 INT16 count_alphanums(
01586                       WERD_RES *word) {
01587   int count = 0;
01588   int i;
01589 
01590   for (i = 0; i < word->reject_map.length (); i++) {
01591     if ((word->reject_map[i].accepted ()) &&
01592       (isalnum (word->best_choice->string ()[i])))
01593       count++;
01594   }
01595   return count;
01596 }
01597 
01606 void reject_mostly_rejects(
01607                            WERD_RES *word) {
01608 
01609   if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
01610     rej_whole_of_mostly_reject_word_fract)
01611     word->reject_map.rej_word_mostly_rej ();
01612 }
01613 
01614 
01618 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
01619   INT16 char_quality;
01620   INT16 accepted_char_quality;
01621 
01622   if (word->best_choice->string ().length () <= 1)
01623     return FALSE;
01624 
01625   if (!STRING (ok_repeated_ch_non_alphanum_wds).
01626     contains (word->best_choice->string ()[0]))
01627     return FALSE;
01628 
01629   if (!repeated_ch_string (word->best_choice->string ().string ()))
01630     return FALSE;
01631 
01632   word_char_quality(word, row, &char_quality, &accepted_char_quality);
01633 
01634   if ((word->best_choice->string ().length () == char_quality) &&
01635     (char_quality == accepted_char_quality))
01636     return TRUE;
01637   else
01638     return FALSE;
01639 }
01640 
01641 
01645 BOOL8 repeated_ch_string(const char *rep_ch_str) {
01646   char c;
01647 
01648   if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
01649     return FALSE;
01650   }
01651 
01652   c = *rep_ch_str;
01653   rep_ch_str++;
01654   while (*rep_ch_str == c) {
01655     rep_ch_str++;
01656   }
01657   if (*rep_ch_str == '\0')
01658     return TRUE;
01659   return FALSE;
01660 }
01661 
01662 
01666 INT16 safe_dict_word(const char *s) {
01667   int dict_word_type;
01668 
01669   dict_word_type = dict_word (s);
01670   if (dict_word_type == DOC_DAWG_PERM)
01671     return 0;
01672   else
01673     return dict_word_type;
01674 }
01675 
01676 
01686 void flip_hyphens(WERD_RES *word) {
01687   char *str = (char *) word->best_choice->string ().string ();
01688   int i = 0;
01689   PBLOB_IT outword_it;
01690   int prev_right = -9999;
01691   int next_left;
01692   BOX out_box;
01693   float aspect_ratio;
01694 
01695   if (tessedit_lower_flip_hyphen <= 1)
01696     return;
01697 
01698   outword_it.set_to_list (word->outword->blob_list ());
01699 
01700   for (outword_it.mark_cycle_pt ();
01701   !outword_it.cycled_list (); outword_it.forward (), i++) {
01702     out_box = outword_it.data ()->bounding_box ();
01703     if (outword_it.at_last ())
01704       next_left = 9999;
01705     else
01706       next_left = outword_it.data_relative (1)->bounding_box ().left ();
01707     
01708 
01709 
01710     if ((out_box.width () > 8 * word->denorm.scale ()) &&
01711     (out_box.left () > prev_right) && (out_box.right () < next_left)) {
01712       aspect_ratio = out_box.width () / (float) out_box.height ();
01713       if (str[i] == '.') {
01714         if (aspect_ratio >= tessedit_upper_flip_hyphen) {
01715           str[i] = '-'; 
01716           if (word->reject_map[i].rejected ())
01717             word->reject_map[i].setrej_hyphen_accept ();
01718         }
01719         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
01720           word->reject_map[i].accepted ())
01721           word->reject_map[i].setrej_hyphen (); 
01722       }
01723       else if (str[i] == '-') {
01724         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
01725           (word->reject_map[i].rejected ()))
01726           word->reject_map[i].setrej_hyphen_accept (); 
01727 
01728         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
01729           (word->reject_map[i].accepted ()))
01730           word->reject_map[i].setrej_hyphen (); 
01731       }
01732     }
01733     prev_right = out_box.right ();
01734   }
01735 }
01736 
01737 
01746 void flip_0O(WERD_RES *word) {
01747   char *str = (char *) word->best_choice->string ().string ();
01748   int i;
01749   PBLOB_IT outword_it;
01750   BOX out_box;
01751 
01752   if (!tessedit_flip_0O)
01753     return;
01754 
01755   outword_it.set_to_list (word->outword->blob_list ());
01756 
01757   for (i = 0, outword_it.mark_cycle_pt ();
01758   !outword_it.cycled_list (); i++, outword_it.forward ()) {
01759     if (isupper (str[i]) || isdigit (str[i])) {
01760       out_box = outword_it.data ()->bounding_box ();
01761       if ((out_box.top () < bln_baseline_offset + bln_x_height) ||
01762         (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))
01763         return;   
01764     }
01765   }
01766 
01767   for (i = 1; str[i] != '\0'; i++, outword_it.forward ()) {
01768     if ((str[i] == '0') || (str[i] == 'O')) {
01769       
01770       if (non_O_upper (str[i - 1]) && non_O_upper (str[i + 1])) {
01771         str[i] = 'O';
01772       }
01773       
01774       if (non_O_upper (str[i - 1])
01775       && ((str[i + 1] == '0') || (str[i + 1] == 'O'))
01776       && non_O_upper (str[i + 2])) {
01777         str[i] = 'O';
01778         str[i + 1] = 'O';
01779         i++;
01780       }
01781       
01782       if ((i > 1) &&
01783         non_O_upper (str[i - 2]) &&
01784         non_O_upper (str[i - 1]) &&
01785         !isdigit (str[i + 1]) &&
01786       (str[i + 1] != 'l') && (str[i + 1] != 'I')) {
01787         str[i] = 'O';
01788       }
01789       
01790       if (non_0_digit (str[i - 1]) && non_0_digit (str[i + 1])) {
01791         str[i] = '0';
01792       }
01793       
01794       if (non_0_digit (str[i - 1]) &&
01795         ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01796       ((str[i + 2] == '0') || (str[i + 2] == 'O'))) {
01797         str[i] = '0';
01798         str[i + 1] = '0';
01799         str[i + 2] = '0';
01800         i += 2;
01801       }
01802       
01803       if (non_0_digit (str[i - 1]) &&
01804         ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01805       !isupper (str[i + 2])) {
01806         str[i] = '0';
01807         str[i + 1] = '0';
01808         i++;
01809       }
01810       
01811       if (non_0_digit (str[i - 1]) && !isupper (str[i + 1])) {
01812         str[i] = '0';
01813       }
01814       
01815       if ((i > 1) &&
01816         ((str[i - 1] == '.') || (str[i - 1] == ',')) &&
01817       (isdigit (str[i - 2]) || (str[i - 2] == 'O'))) {
01818         if (str[i - 2] == 'O')
01819           str[i - 2] = '0';
01820         while ((str[i] == 'O') || (str[i] == '0')) {
01821           str[i++] = '0';
01822         }
01823         i--;
01824       }
01825     }
01826   }
01827 }
01828 
01829 
01833 BOOL8 non_O_upper(char c) {
01834   return isupper (c) && (c != 'O');
01835 }
01836 
01837 
01841 BOOL8 non_0_digit(char c) {
01842   return isdigit (c) && (c != '0');
01843 }