Tesseract: ccmain/reject.cpp Source File

00001 
00029 #include "mfcpch.h"
00030 #include          "tessvars.h"
00031 #ifdef __UNIX__
00032 #include          <assert.h>
00033 #include          <errno.h>
00034 #endif
00035 #include          "scanutils.h"
00036 #include          <ctype.h>
00037 #include          <string.h>
00038 //#include                                      "tessbox.h"
00039 #include          "memry.h"
00040 #include          "reject.h"
00041 #include          "tfacep.h"
00042 #include          "mainblk.h"
00043 #include          "charcut.h"
00044 #include          "imgs.h"
00045 #include          "scaleimg.h"
00046 #include          "control.h"
00047 #include          "docqual.h"
00048 #include          "secname.h"
00049 
00050 /* #define SECURE_NAMES done in secnames.h when necessary */
00051 
00052 //extern "C" {
00053 #include          "callnet.h"
00054 //}
00055 
00056 #include          "notdll.h"
00057 
// Instantiate the CLIST-of-STRING list machinery (char_ambiguities() keeps
// its conflict sets in a STRING_CLIST).
00058 CLISTIZEH (STRING) CLISTIZE (STRING)
// EXTERN expands to nothing here, so every EXTERN-prefixed *_VAR line below
// is written without any extra storage-class keyword.
00059 #define EXTERN
00060 
// Algorithm selectors: make_reject_map() switches on tessedit_reject_mode and
// set_done() switches on tessedit_ok_mode.
00062 EXTERN
00063 INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
00064 EXTERN
00065 INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
00066 EXTERN
00067 BOOL_VAR (tessedit_use_nn, FALSE, "");
00068 EXTERN
00069 BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
00070 EXTERN
00071 BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
00072 EXTERN
00073 BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
00074 EXTERN
00075 double_VAR (tessedit_lower_flip_hyphen, 1.5,
00076 "Aspect ratio dot/hyphen test");
00077 EXTERN
00078 double_VAR (tessedit_upper_flip_hyphen, 1.8,
00079 "Aspect ratio dot/hyphen test");
00080 
// Controls read by one_ell_conflict() when deciding whether a dictionary
// result can be trusted for words containing 1/I/l.
00081 EXTERN
00082 BOOL_VAR (rej_trust_doc_dawg, FALSE,
00083 "Use DOC dawg in 11l conf. detector");
00084 EXTERN
00085 BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
00086 EXTERN
00087 BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
00088 
// Neural-net (NN) related switches. Apart from nn_debug (printed in
// nn_recover_rejects()) these are not referenced in the visible part of this
// file.
00089 EXTERN
00090 BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
00091 EXTERN
00092 BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
00093 EXTERN
00094 BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
00095 EXTERN
00096 BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
00097 EXTERN
00098 BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
00099 EXTERN
00100 BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
00101 EXTERN
00102 BOOL_VAR (nn_conf_double_check_dict, TRUE,
00103 "Double check for confusions");
00104 EXTERN
00105 BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
00106 EXTERN
00107 BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
00108 EXTERN
00109 BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
00110 EXTERN
00111 BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
00112 EXTERN
00113 BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
00114 EXTERN
00115 BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
00116 EXTERN
00117 BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
00118 "Require stronger NN match");
00119 EXTERN
00120 double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
00121 EXTERN
00122 INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
00123 EXTERN
00124 INT_VAR (nn_conf_initial_i_level, 3,
00125 "NN accept initial Ii match level ");
00126 
// no_unrej_* are consulted by nn_recover_rejects(); rej_use_* and
// rej_alphas_in_number_perm gate the individual tests of reject mode 5 in
// make_reject_map().
00127 EXTERN
00128 BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
00129 EXTERN
00130 BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
00131 EXTERN
00132 BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
00133 EXTERN
00134 BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
00135 EXTERN
00136 BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
00137 EXTERN
00138 BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
00139 EXTERN
00140 BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
00141 EXTERN
00142 BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
00143 
// rej_mostly_reject_mode == 1 makes nn_recover_rejects() call
// reject_mostly_rejects().
00144 EXTERN
00145 double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
00146 "if >this fract");
00147 EXTERN
00148 INT_VAR (rej_mostly_reject_mode, 1,
00149 "0-never, 1-afterNN, 2-after new xht");
00150 EXTERN
00151 double_VAR (tessed_fullstop_aspect_ratio, 1.2,
00152 "if >this fract then reject");
00153 
// NN input image geometry, plus the image-border margin that
// reject_edge_blobs() compares blob positions against.
00154 EXTERN
00155 INT_VAR (net_image_width, 40, "NN input image width");
00156 EXTERN
00157 INT_VAR (net_image_height, 36, "NN input image height");
00158 EXTERN
00159 INT_VAR (net_image_x_height, 22, "NN input image x_height");
00160 EXTERN
00161 INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");
00162 
00163 EXTERN
00164 INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
00165 
00166 EXTERN
00167 double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
00168 EXTERN
00169 double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
00170 
// Character sets. conflict_set_I_l_1 is the set tested by reject_I_1_L() and
// one_ell_conflict(); ok_single_ch_non_alphanum_wds is tested by
// nn_recover_rejects().
00171 /* NOTE - ctoh doesn't handle "=" properly, hence \075 */
00172 EXTERN
00173 STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
00174 "Allow NN to unrej");
00175 EXTERN
00176 STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
00177 "Allow NN to unrej");
00178 EXTERN
00179 STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
00180 EXTERN
00181 STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
00182 EXTERN
00183 STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
00184 EXTERN
00185 STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
00186 "Unreliable chars");
00187 EXTERN
00188 STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
00189 "Unreliable chars");
00190 
// Reject mode 5 in make_reject_map() rejects the whole word when its x-height
// in pixels is less than or equal to this value.
00191 EXTERN
00192 INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
00213 void set_done(
00214               WERD_RES *word,
00215               INT16 pass) {
00216   /*
00217   0: Original heuristic used in Tesseract and Ray's prototype Resaljet
00218   */
00219   if (tessedit_ok_mode == 0) {
00220     /* NOTE - done even if word contains some or all spaces !!! */
00221     word->done = word->tess_accepted;
00222   }
00223   /*
00224   1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
00225   */
00226   else if (tessedit_ok_mode == 1) {
00227     word->done = word->tess_accepted &&
00228       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00229 
00230     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00231       word->done = FALSE;
00232   }
00233   /*
00234   2: as 1 + only accept dict words or numerics in pass 1
00235   */
00236   else if (tessedit_ok_mode == 2) {
00237     word->done = word->tess_accepted &&
00238       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00239 
00240     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00241       word->done = FALSE;
00242 
00243     if (word->done &&
00244       (pass == 1) &&
00245       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00246       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00247       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00248     (word->best_choice->permuter () != NUMBER_PERM)) {
00249       #ifndef SECURE_NAMES
00250       if (tessedit_rejection_debug)
00251         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00252           word->best_choice->string ().string ());
00253       #endif
00254       word->done = FALSE;
00255     }
00256   }
00257   /*
00258   3: as 2 + only accept dict words or numerics in pass 2 as well
00259   */
00260   else if (tessedit_ok_mode == 3) {
00261     word->done = word->tess_accepted &&
00262       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00263 
00264     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00265       word->done = FALSE;
00266 
00267     if (word->done &&
00268       (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00269       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00270       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00271     (word->best_choice->permuter () != NUMBER_PERM)) {
00272       #ifndef SECURE_NAMES
00273       if (tessedit_rejection_debug)
00274         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00275           word->best_choice->string ().string ());
00276       #endif
00277       word->done = FALSE;
00278     }
00279   }
00280   /*
00281   4: as 2 + reject dict ambigs in pass 1
00282   */
00283   else if (tessedit_ok_mode == 4) {
00284     word->done = word->tess_accepted &&
00285       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00286 
00287     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00288       word->done = FALSE;
00289 
00290     if (word->done &&
00291       (pass == 1) &&
00292       ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00293       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00294       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00295       (word->best_choice->permuter () != NUMBER_PERM)) ||
00296     (test_ambig_word (word))) {
00297       #ifndef SECURE_NAMES
00298       if (tessedit_rejection_debug)
00299         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00300           word->best_choice->string ().string ());
00301       #endif
00302       word->done = FALSE;
00303     }
00304   }
00305   /*
00306   5: as 3 + reject dict ambigs in both passes
00307   */
00308   else if (tessedit_ok_mode == 5) {
00309     word->done = word->tess_accepted &&
00310       (strchr (word->best_choice->string ().string (), ' ') == NULL);
00311 
00312     if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00313       word->done = FALSE;
00314 
00315     if (word->done &&
00316       ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00317       (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00318       (word->best_choice->permuter () != USER_DAWG_PERM) &&
00319       (word->best_choice->permuter () != NUMBER_PERM)) ||
00320     (test_ambig_word (word))) {
00321       #ifndef SECURE_NAMES
00322       if (tessedit_rejection_debug)
00323         tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00324           word->best_choice->string ().string ());
00325       #endif
00326       word->done = FALSE;
00327     }
00328   }
00329 
00330   else {
00331     tprintf ("BAD tessedit_ok_mode\n");
00332     err_exit();
00333   }
00334 }
00335 
00336 
00357 void make_reject_map(
00358                      WERD_RES *word,
00359                      BLOB_CHOICE_LIST_CLIST *blob_choices,
00360                      ROW *row,
00361                      INT16 pass  //1st or 2nd?
00362                     ) {
00363   INT16 i;
00364 
00365   flip_0O(word);
00366   check_debug_pt (word, -1);     // For trap only
00367   set_done(word, pass);  // Set acceptance
00368   word->reject_map.initialise (word->best_choice->string ().length ());
00369   reject_blanks(word);
00370   /*
00371   0: Rays original heuristic - the baseline
00372   */
00373   if (tessedit_reject_mode == 0) {
00374     if (!word->done)
00375       reject_poor_matches(word, blob_choices);
00376   }
00377   /*
00378   5: Reject I/1/l from words where there is no strong contextual confirmation;
00379     the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
00380     and the whole of any words which are very small
00381   */
00382   else if (tessedit_reject_mode == 5) {
00383     if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
00384       word->reject_map.rej_word_small_xht ();
00385     else {
00386       one_ell_conflict(word, TRUE);
00387       /*
00388        Originally the code here just used the done flag. Now I have
00389        duplicated and unpacked the conditions for setting the done flag so
00390       that each mechanism can be turned on or off independently. This works
00391       WITHOUT affecting the done flag setting.
00392       */
00393       if (rej_use_tess_accepted && !word->tess_accepted)
00394         word->reject_map.rej_word_not_tess_accepted ();
00395 
00396       if (rej_use_tess_blanks &&
00397         (strchr (word->best_choice->string ().string (), ' ') != NULL))
00398         word->reject_map.rej_word_contains_blanks ();
00399 
00400       if (rej_use_good_perm) {
00401         if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00402           (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00403           (word->best_choice->permuter () == USER_DAWG_PERM)) &&
00404           (!rej_use_sensible_wd ||
00405           (acceptable_word_string
00406           (word->best_choice->string ().string ()) != AC_UNACCEPTABLE))) {
00407           // PASSED TEST
00408         }
00409         else if (word->best_choice->permuter () == NUMBER_PERM) {
00410           if (rej_alphas_in_number_perm) {
00411             for (i = 0; word->best_choice->string ()[i] != '\0';
00412             i++) {
00413               if (word->reject_map[i].accepted () &&
00414                 isalpha (word->best_choice->string ()[i]))
00415                 word->reject_map[i].setrej_bad_permuter ();
00416               // rej alpha
00417             }
00418           }
00419         }
00420         else {
00421           word->reject_map.rej_word_bad_permuter ();
00422         }
00423       }
00424 
00425       /* Ambig word rejection was here once !!*/
00426 
00427     }
00428   }
00429   else {
00430     tprintf ("BAD tessedit_reject_mode\n");
00431     err_exit();
00432   }
00433 
00434   if (tessedit_image_border > -1)
00435     reject_edge_blobs(word);
00436 
00437   check_debug_pt (word, 10);
00438   if (tessedit_rejection_debug) {
00439     tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
00440     tprintf ("Certainty: %f     Rating: %f\n",
00441       word->best_choice->certainty (), word->best_choice->rating ());
00442     tprintf ("Dict word: %d\n",
00443       dict_word (word->best_choice->string ().string ()));
00444   }
00445 
00446   /* Un-reject any rejected characters if NN permits */
00447 
00448   if (tessedit_use_nn && (pass == 2) &&
00449     word->reject_map.recoverable_rejects ())
00450     nn_recover_rejects(word, row);
00451   flip_hyphens(word);
00452   check_debug_pt (word, 20);
00453 }
00454 
00455 
00463 void reject_blanks(WERD_RES *word) {
00464   INT16 i;
00465 
00466   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00467     if (word->best_choice->string ()[i] == ' ')
00468       word->reject_map[i].setrej_tess_failure (); // rej unrecognised blobs
00469   }
00470 }
00471 
00472 
00481 void reject_I_1_L(WERD_RES *word) {
00482   INT16 i;
00483 
00484   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00485     if (STRING (conflict_set_I_l_1).
00486     contains (word->best_choice->string ()[i])) {
00487       word->reject_map[i].setrej_1Il_conflict (); // rej 1Il conflict
00488     }
00489   }
00490 }
00491 
00492 
00501 void reject_poor_matches(  //detailed results
00502                          WERD_RES *word,
00503                          BLOB_CHOICE_LIST_CLIST *blob_choices) {
00504   float threshold;
00505   INT16 i = 0;
00506                                  //super iterator
00507   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00508   BLOB_CHOICE_IT choice_it;      //real iterator
00509 
00510   #ifndef SECURE_NAMES
00511   if (strlen (word->best_choice->string ().string ()) != list_it.length ()) {
00512     tprintf
00513       ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
00514       word->best_choice->string ().string (),
00515       strlen (word->best_choice->string ().string ()), list_it.length (),
00516       word->outword->blob_list ()->length ());
00517   }
00518   #endif
00519   ASSERT_HOST (strlen (word->best_choice->string ().string ()) ==
00520     list_it.length ());
00521   ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
00522   threshold = compute_reject_threshold (blob_choices);
00523 
00524   for (list_it.mark_cycle_pt ();
00525   !list_it.cycled_list (); list_it.forward (), i++) {
00526     /*
00527    NB - only compares the threshold against the TOP choice char in the
00528     choices list for a blob !! - the selected one may be below the threshold
00529    */
00530     choice_it.set_to_list (list_it.data ());
00531     if ((word->best_choice->string ()[i] == ' ') ||
00532       (choice_it.length () == 0))
00533       word->reject_map[i].setrej_tess_failure (); //rej unrecognised blobs
00534     else if (choice_it.data ()->certainty () < threshold)
00535       word->reject_map[i].setrej_poor_match (); //rej poor score blob
00536   }
00537 }
00538 
00539 
00549 float compute_reject_threshold(
00550                                BLOB_CHOICE_LIST_CLIST *blob_choices) {
00551   INT16 index;                   //to ratings
00552   INT16 blob_count;              //no of blobs in word
00553   INT16 ok_blob_count = 0;       //non TESS rej blobs in word
00554   float *ratings;                //array of confidences
00555   float threshold;               //rejection threshold
00556   float bestgap;                 //biggest gap
00557   float gapstart;                //bottom of gap
00558                                  //super iterator
00559   BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00560   BLOB_CHOICE_IT choice_it;      // real iterator
00561 
00562   blob_count = blob_choices->length ();
00563   ratings = (float *) alloc_mem (blob_count * sizeof (float));
00564   for (list_it.mark_cycle_pt (), index = 0;
00565   !list_it.cycled_list (); list_it.forward (), index++) {
00566     choice_it.set_to_list (list_it.data ());
00567     if (choice_it.length () > 0) {
00568       ratings[ok_blob_count] = choice_it.data ()->certainty ();
00569       //get in an array
00570       //  tprintf("Rating[%d]=%c %g %g\n",
00571       //    index,choice_it.data()->char_class(),
00572       //    choice_it.data()->rating(),choice_it.data()->certainty());
00573       ok_blob_count++;
00574     }
00575   }
00576   ASSERT_HOST (index == blob_count);
00577   qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
00578   // sort them
00579   bestgap = 0;
00580   gapstart = ratings[0] - 1;     // all reject if none better
00581   if (ok_blob_count >= 3) {
00582     for (index = 0; index < ok_blob_count - 1; index++) {
00583       if (ratings[index + 1] - ratings[index] > bestgap) {
00584         bestgap = ratings[index + 1] - ratings[index];
00585         // find biggest
00586         gapstart = ratings[index];
00587       }
00588     }
00589   }
00590   threshold = gapstart + bestgap / 2;
00591   //      tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
00592   //              ratings[0],ratings[index],bestgap,threshold);
00593 
00594   free_mem(ratings);
00595   return threshold;
00596 }
00597 
00598 
/**
 * sort_floats()
 *
 * qsort() comparison callback ordering floats ascending.
 *
 * The arguments are read through pointers to const (the callback must not
 * modify the elements) and compared directly rather than through the sign
 * of a subtraction.
 *
 * @param arg1  pointer to the first float
 * @param arg2  pointer to the second float
 * @return 1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0
 */
int sort_floats(
                const void *arg1,
                const void *arg2) {
  const float lhs = *((const float *) arg1);
  const float rhs = *((const float *) arg2);

  if (lhs > rhs)
    return 1;
  if (lhs < rhs)
    return -1;
  return 0;
}
00619 
00620 
00631 void reject_edge_blobs(WERD_RES *word) {
00632   BOX word_box = word->word->bounding_box ();
00633   BOX blob_box;
00634   PBLOB_IT blob_it = word->outword->blob_list ();
00635   //blobs
00636   int blobindex = 0;
00637   float centre;
00638 
00639   if ((word_box.left () < tessedit_image_border) ||
00640     (word_box.bottom () < tessedit_image_border) ||
00641     (word_box.right () + tessedit_image_border >
00642     page_image.get_xsize () - 1) ||
00643   (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
00644     ASSERT_HOST (word->reject_map.length () == blob_it.length ());
00645     for (blobindex = 0, blob_it.mark_cycle_pt ();
00646     !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00647       blob_box = blob_it.data ()->bounding_box ();
00648       centre = (blob_box.left () + blob_box.right ()) / 2.0;
00649       if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
00650         (word->denorm.y (blob_box.bottom (), centre) <
00651         tessedit_image_border) ||
00652         (word->denorm.x (blob_box.right ()) + tessedit_image_border >
00653         page_image.get_xsize () - 1) ||
00654         (word->denorm.y (blob_box.top (), centre)
00655       + tessedit_image_border > page_image.get_ysize () - 1)) {
00656         word->reject_map[blobindex].setrej_edge_char (); // close to edge
00657       }
00658     }
00659   }
00660 }
00661 
00662 
// one_ell_conflict()
//
// Decide whether the best-choice string of word_res contains an unresolved
// ambiguity between the characters of conflict_set_I_l_1.
//
// word_res    word result to examine
// update_map  when TRUE, the conflicting positions are also flagged in
//             word_res->reject_map with setrej_1Il_conflict() (directly or
//             through reject_I_1_L()); when FALSE the map is left alone
//
// Returns TRUE when a conflict is found, FALSE otherwise.
//
// NOTE(review): the trial "flips" below write through
// best_choice->string ()[...] and then test the result through the local
// pointer `word`, so the code relies on `word` pointing into that same
// buffer - confirm against the STRING implementation.
00684 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00685   const char *word;
00686   INT16 word_len;                // its length
00687   INT16 first_alphanum_idx;
00688   INT16 i;
00689   BOOL8 non_conflict_set_char;   // non conf set a/n?
00690   BOOL8 conflict = FALSE;
00691   BOOL8 allow_1s;
00692   ACCEPTABLE_WERD_TYPE word_type;
00693   BOOL8 dict_perm_type;
00694   BOOL8 dict_word_ok;
00695   int dict_word_type;
00696 
00697   word = word_res->best_choice->string ().string ();
00698   word_len = strlen (word);
00699   /*
00700   If there are no occurrences of the conflict set characters then the word
00701   is OK.
00702   */
00703   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00704     return FALSE;
00705 
00706   /*
00707   There is a conflict if there are NO other (confirmed) alphanumerics apart
00708   from those in the conflict set.
00709   */
00710 
  // The scan stops at the first alphanumeric that is outside the conflict set.
00711   for (i = 0, non_conflict_set_char = FALSE;
00712     (i < word_len) && !non_conflict_set_char; i++)
00713   non_conflict_set_char = isalnum (word[i]) &&
00714       !STRING (conflict_set_I_l_1).contains (word[i]);
00715   if (!non_conflict_set_char) {
00716     if (update_map)
00717       reject_I_1_L(word_res);
00718     return TRUE;
00719   }
00720 
  // From here on the word is known to contain at least one alphanumeric, so
  // first_alphanum_pos() below always finds a position.
00721   /*
00722   If the word is accepted by a dawg permuter, and the first alpha character
00723   is "I" or "l", check to see if the alternative is also a dawg word. If it
00724   is, then there is a potential error otherwise the word is ok.
00725   */
00726 
00727   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00728     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00729     (rej_trust_doc_dawg &&
00730     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00731     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00732   dict_word_type = dict_word (word);
00733   dict_word_ok = (dict_word_type > 0) &&
00734     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00735 
00736   if ((rej_1Il_use_dict_word && dict_word_ok) ||
00737     (rej_1Il_trust_permuter_type && dict_perm_type) ||
00738   (dict_perm_type && dict_word_ok)) {
00739     first_alphanum_idx = first_alphanum_pos (word);
    // Trial flip of a leading 'I' to 'l'; the original character is restored
    // on both outcomes.
00740     if (word[first_alphanum_idx] == 'I') {
00741       word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00742       if (safe_dict_word (word) > 0) {
00743         word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00744         if (update_map)
00745           word_res->reject_map[first_alphanum_idx].
00746             setrej_1Il_conflict();
00747         return TRUE;
00748       }
00749       else {
00750         word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00751         return FALSE;
00752       }
00753     }
00754 
    // Same trial in the other direction: leading 'l' flipped to 'I'.
00755     if (word[first_alphanum_idx] == 'l') {
00756       word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00757       if (safe_dict_word (word) > 0) {
00758         word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00759         if (update_map)
00760           word_res->reject_map[first_alphanum_idx].
00761           setrej_1Il_conflict();
00762         return TRUE;
00763       }
00764       else {
00765         word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00766         return FALSE;
00767       }
00768     }
00769     return FALSE;
00770   }
00771 
00772   /*
00773   NEW 1Il code.
00774 
00775   The old code relied on permuter types too much. In fact,
00776   tess will use TOP_CHOICE permute for good things like "palette".
00777 
00778   In this code the string is examined independently to see if it looks like
00779   a well formed word.
00780   */
00781 
00782   /*
00783   REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
00784   dictionary word.
00785   */
  // NOTE(review): unlike the block above, when the flipped spelling passes
  // safe_dict_word() the function returns FALSE WITHOUT restoring the
  // original character, so the flipped spelling stays in best_choice.
  // Confirm that this is intended.
00786   first_alphanum_idx = first_alphanum_pos (word);
00787   if (word[first_alphanum_idx] == 'l') {
00788     word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00789     if (safe_dict_word (word) > 0)
00790       return FALSE;
00791     else
00792       word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00793   }
00794   else if (word[first_alphanum_idx] == 'I') {
00795     word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00796     if (safe_dict_word (word) > 0)
00797       return FALSE;
00798     else
00799       word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00800   }
00801   /*
00802   For strings containing digits:
00803   If there are no alphas OR the numeric permuter liked the word,
00804      reject any non 1 conflict chars
00805   Else reject all conflict chars
00806   */
00807   if (word_contains_non_1_digit (word)) {
00808     allow_1s = (alpha_count (word) == 0) ||
00809       (word_res->best_choice->permuter () == NUMBER_PERM);
00810 
00811     conflict = FALSE;
00812     for (i = 0; i < word_len; i++) {
00813       if ((!allow_1s || (word[i] != '1')) &&
00814       STRING (conflict_set_I_l_1).contains (word[i])) {
00815         if (update_map)
00816           word_res->reject_map[i].setrej_1Il_conflict ();
00817         conflict = TRUE;
00818       }
00819     }
00820     return conflict;
00821   }
00822   /*
00823   For anything else. See if it conforms to an acceptable word type. If so,
00824   treat accordingly.
00825   */
00826   word_type = acceptable_word_string (word);
00827   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Only the first alphanumeric is tested here: it is a conflict when it
    // belongs to the conflict set.
00828     first_alphanum_idx = first_alphanum_pos (word);
00829     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_idx])) {
00830       if (update_map)
00831         word_res->reject_map[first_alphanum_idx].setrej_1Il_conflict ();
00832       return TRUE;
00833     }
00834     else
00835       return FALSE;
00836   }
00837   else if (word_type == AC_UPPER_CASE) {
00838     return FALSE;
00839   }
00840   else {
    // Any other word type: every conflict-set character is treated as a
    // conflict.
00841     if (update_map)
00842       reject_I_1_L(word_res);
00843     return TRUE;
00844   }
00845 }
00846 
00847 
00851 INT16 first_alphanum_pos(const char *word) {
00852   INT16 i;
00853 
00854   for (i = 0; word[i] != '\0'; i++) {
00855     if (isalnum (word[i]))
00856       return i;
00857   }
00858   return -1;
00859 }
00860 
00861 
00865 INT16 alpha_count(const char *word) {
00866   INT16 i;
00867   INT16 count = 0;
00868 
00869   for (i = 0; word[i] != '\0'; i++) {
00870     if (isalpha (word[i]))
00871       count++;
00872   }
00873   return count;
00874 }
00875 
00876 
00880 BOOL8 word_contains_non_1_digit(const char *word) {
00881   INT16 i;
00882 
00883   for (i = 0; word[i] != '\0'; i++) {
00884     if (isdigit (word[i]) && word[i] != '1')
00885       return TRUE;
00886   }
00887   return FALSE;
00888 }
00889 
00890 
00896 BOOL8 test_ambig_word(
00897                       WERD_RES *word) {
00898   BOOL8 ambig = FALSE;
00899 
00900   if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00901     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00902   (word->best_choice->permuter () == USER_DAWG_PERM)) {
00903     ambig = !NoDangerousAmbig(word->best_choice->string().string(), NULL);
00904   }
00905   return ambig;
00906 }
00907 
00908 
00919 BOOL8 ambig_word(
00920                  const char *start_word,
00921                  char *temp_word,
00922                  INT16 test_char_pos
00923                 ) {
00924   const char *ambigs;            // Ambiguities for char
00925 
00926   if (*(temp_word + test_char_pos) == '\0') {
00927     if (safe_dict_word (temp_word)) {
00928       if (strcmp (start_word, temp_word) == 0)
00929         return FALSE;
00930       else
00931         return TRUE;
00932     }
00933     else
00934       return FALSE;
00935   }
00936   else {
00937     ambigs = char_ambiguities (*(temp_word + test_char_pos));
00938     if (ambigs == NULL)
00939       return ambig_word (start_word, temp_word, test_char_pos + 1);
00940     else {
00941       while (*ambigs != '\0') {
00942         *(temp_word + test_char_pos) = *ambigs++;
00943         // test next ambiguity
00944         if (ambig_word (start_word, temp_word, test_char_pos + 1))
00945           return TRUE;
00946       }
00947       return FALSE;
00948     }
00949   }
00950 }
00951 
00952 
00971 const char *char_ambiguities(char c) {
00972   static STRING_CLIST conflict_sets;
00973   static BOOL8 read_conflict_sets = FALSE;
00974   STRING_C_IT cs_it(&conflict_sets);
00975   const char *cs;
00976   STRING cs_file_name;
00977   FILE *cs_file;
00978   char buff[1024];
00979 
00980   if (!read_conflict_sets) {
00981     cs_file_name = datadir + "confsets";
00982     if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
00983       CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
00984         cs_file_name.string (), errno);
00985     }
00986     while (fscanf (cs_file, "%s", buff) == 1) {
00987       cs_it.add_after_then_move (new STRING (buff));
00988     }
00989     read_conflict_sets = TRUE;
00990     cs_it.move_to_first ();
00991     if (tessedit_rejection_debug) {
00992       for (cs_it.mark_cycle_pt ();
00993       !cs_it.cycled_list (); cs_it.forward ()) {
00994         tprintf ("\"%s\"\n", cs_it.data ()->string ());
00995       }
00996     }
00997   }
00998 
00999   cs_it.move_to_first ();
01000   for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
01001     cs = cs_it.data ()->string ();
01002     if (strchr (cs, c) != NULL)
01003       return cs;
01004   }
01005   return NULL;
01006 }
01007 
01008 #ifndef EMBEDDED
01009 
01018 void test_ambigs(const char *word) {
01019   char orig_word[80];
01020   char temp_word[80];
01021 
01022   if (strlen (word) > 80)
01023     tprintf ("Ridiculously long word \"%s\"\n", word);
01024   else {
01025     strcpy(orig_word, word);
01026     while (strlen (orig_word) > 0) {
01027       strcpy(temp_word, orig_word);
01028 
01029       #ifndef SECURE_NAMES
01030       if (ambig_word (orig_word, temp_word, 0))
01031         tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);
01032       else
01033         tprintf ("NO Ambiguities for  \"%s\"\n", orig_word);
01034       tprintf ("Next Word > ");
01035       #endif
01036       scanf ("%s", orig_word);
01037     }
01038   }
01039 }
01040 #endif
01041 
01042 
01054 void nn_recover_rejects(WERD_RES *word, ROW *row) {
01055   REJMAP old_map = word->reject_map; // copy for debug
01056 
01057   /*
01058   NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS
01059   MATCHED BY THE NN MATCHER. IT COULD EASILY BE RESTRICTED TO JUST THE
01060   REJECT CHARACTERS (Though initial use is when words are total rejects
01061   anyway).
01062   */
01063 
01064   set_global_subsubloc_code(SUBSUBLOC_NN);
01065   nn_match_word(word, row);
01066 
01067   if (no_unrej_1Il)
01068     dont_allow_1Il(word);
01069   if (no_unrej_dubious_chars)
01070     dont_allow_dubious_chars(word);
01071 
01072   if (rej_mostly_reject_mode == 1)
01073     reject_mostly_rejects(word);
01074   /*
01075     IF there are no unrejected alphanumerics AND
01076       The word is not an acceptable single non alphanum char word  AND
01077       The word is not an acceptable repeated non alphanum char word
01078     THEN Reject whole word
01079   */
01080   if (no_unrej_no_alphanum_wds &&
01081     (count_alphanums (word) < 1) &&
01082     !((word->best_choice->string ().length () == 1) &&
01083     STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->
01084     string ()[0]))
01085     && !repeated_nonalphanum_wd (word, row))
01086 
01087     word->reject_map.rej_word_no_alphanums ();
01088 
01089   #ifndef SECURE_NAMES
01090 
01091   if (nn_debug) {
01092     tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());
01093     old_map.print (stdout);
01094     tprintf ("->");
01095     word->reject_map.print (stdout);
01096     tprintf ("\n");
01097   }
01098   #endif
01099   set_global_subsubloc_code(SUBSUBLOC_OTHER);
01100 }
01101 
01102 
01121 void nn_match_word(
01122                    WERD_RES *word,
01123                    ROW *row) {
01124   PIXROW_LIST *pixrow_list;
01125   PIXROW_IT pixrow_it;
01126   IMAGELINE *imlines;            // lines of the image
01127   BOX pix_box;                   // box of imlines extent
01128 #ifndef GRAPHICS_DISABLED
01129   WINDOW win = NULL;
01130 #endif
01131   IMAGE clip_image;
01132   IMAGE scaled_image;
01133   float baseline_pos;
01134   INT16 net_image_size;
01135   INT16 clip_image_size;
01136   WERD copy_outword;             // copy to denorm
01137   INT16 i;
01138 
01139   const char *word_string;
01140   BOOL8 word_in_dict;            // Tess wd in dict
01141   BOOL8 checked_dict_word;       // Tess wd definitely in dict
01142   BOOL8 sensible_word;           // OK char string
01143   BOOL8 centre;                  // Not at word end chs
01144   BOOL8 good_quality_word;
01145   INT16 char_quality;
01146   INT16 accepted_char_quality;
01147 
01148   INT16 conf_level; // 0:REJECT, 1:DODGY ACCEPT, 2:DICT ACCEPT, 3:CLEAR ACCEPT
01149   INT16 first_alphanum_idx;
01150 
01151   word_string = word->best_choice->string ().string ();
01152   first_alphanum_idx = first_alphanum_pos (word_string);
01153   word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01154     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01155     (word->best_choice->permuter () == USER_DAWG_PERM));
01156   checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);
01157   sensible_word = acceptable_word_string (word_string) != AC_UNACCEPTABLE;
01158 
01159   word_char_quality(word, row, &char_quality, &accepted_char_quality);
01160   good_quality_word = word->best_choice->string ().length () == char_quality;
01161 
01162   #ifndef SECURE_NAMES
01163   if (nn_reject_debug) {
01164     tprintf ("Dict: %c   Checked Dict: %c   Sensible: %c   Quality: %c\n",
01165       word_in_dict ? 'T' : 'F',
01166       checked_dict_word ? 'T' : 'F',
01167       sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
01168   }
01169   #endif
01170 
01171   if (word->best_choice->string ().length () !=
01172   word->outword->blob_list ()->length ()) {
01173     #ifndef SECURE_NAMES
01174     tprintf ("nn_match_word ASSERT FAIL String:\"%s\";  #Blobs=%d\n",
01175       word->best_choice->string ().string (),
01176       word->outword->blob_list ()->length ());
01177     #endif
01178     err_exit();
01179   }
01180 
01181   copy_outword = *(word->outword);
01182   copy_outword.baseline_denormalise (&word->denorm);
01183   /*
01184   For each character, generate and match a new image, containing JUST the
01185   character we have clipped, centered in the image, on a white background.
01186   Note that we MUST have a square image so that we can scale it uniformly in
01187   x and y.  We base the size on x_height as this can be found fairly reliably.
01188   */
01189   net_image_size = (net_image_width > net_image_height) ?
01190     net_image_width : net_image_height;
01191   clip_image_size = (INT16) floor (0.5 +
01192     net_image_size * word->x_height /
01193     net_image_x_height);
01194   if ((clip_image_size <= 1) || (net_image_size <= 1)) {
01195     return;
01196   }
01197 
01198   /*
01199   Get the image of the word and the pix positions of each char
01200   */
01201   char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
01202 #ifndef GRAPHICS_DISABLED
01203   if (show_char_clipping) {
01204     win = display_clip_image (&copy_outword, page_image,
01205       pixrow_list, pix_box);
01206   }
01207 #endif
01208   pixrow_it.set_to_list (pixrow_list);
01209   pixrow_it.move_to_first ();
01210   for (pixrow_it.mark_cycle_pt (), i = 0;
01211   !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
01212     if (pixrow_it.data ()->
01213       bad_box (page_image.get_xsize (), page_image.get_ysize ()))
01214       continue;
01215     clip_image.create (clip_image_size, clip_image_size, 1);
01216     // make bin imge
01217     if (!copy_outword.flag (W_INVERSE))
01218       invert_image(&clip_image);  // white background for black on white
01219     pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
01220       clip_image, baseline_pos);
01221     if (copy_outword.flag (W_INVERSE))
01222       invert_image(&clip_image);  // invert white on black for scaling &NN
01223     scaled_image.create (net_image_size, net_image_size, 1);
01224     scale_image(clip_image, scaled_image);
01225     baseline_pos *= net_image_size / clip_image_size;
01226     // scale with im
01227     centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
01228 
01229     conf_level = nn_match_char (scaled_image, baseline_pos,
01230       word_in_dict, checked_dict_word,
01231       sensible_word, centre,
01232       good_quality_word, word_string[i]);
01233     if (word->reject_map[i].recoverable ()) {
01234       if ((i == first_alphanum_idx) &&
01235       ((word_string[i] == 'I') || (word_string[i] == 'i'))) {
01236         if (conf_level >= nn_conf_initial_i_level)
01237           word->reject_map[i].setrej_nn_accept (); // un-reject char
01238       }
01239       else if (conf_level > 0)
01240         word->reject_map[i].setrej_nn_accept (); // un-reject char
01241     }
01242 #ifndef GRAPHICS_DISABLED
01243     if (show_char_clipping)
01244       display_images(clip_image, scaled_image);
01245 #endif
01246     clip_image.destroy ();
01247     scaled_image.destroy ();
01248   }
01249 
01250   delete[]imlines;               // Free array of imlines
01251   delete pixrow_list;
01252 
01253 #ifndef GRAPHICS_DISABLED
01254   if (show_char_clipping) {
01255     destroy_window(win);
01256   }
01257 #endif
01258 }
01259 
01260 
01276 INT16 nn_match_char(
01277                     IMAGE &scaled_image,
01278                     float baseline_pos,       // rel to scaled_image
01279                     BOOL8 dict_word,          // part of dict wd?
01280                     BOOL8 checked_dict_word,  // part of dict wd?
01281                     BOOL8 sensible_word,      // part acceptable str?
01282                     BOOL8 centre,             // not at word ends?
01283                     BOOL8 good_quality_word,  // initial segmentation
01284                     char tess_ch              // confirm this?
01285                    ) {
01286   INT16 conf_level;              // 0..2 same as nn_match_word()
01287   INT32 row;
01288   INT32 col;
01289   INT32 y_size = scaled_image.get_ysize ();
01290   INT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
01291   INT32 end_y = start_y - net_image_height + 1;
01292   IMAGELINE imline;
01293   float *input_vector;
01294   float *input_vec_ptr;
01295   char top;
01296   float top_score;
01297   char next;
01298   float next_score;
01299   INT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
01300   INT16 j;
01301 
01302   input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
01303   input_vec_ptr = input_vector;
01304 
01305   invert_image(&scaled_image);  // cos nns work better
01306   for (row = start_y; row >= end_y; row--) {
01307     scaled_image.fast_get_line (0, row, net_image_width, &imline);
01308     for (col = 0; col < net_image_width; col++)
01309       *input_vec_ptr++ = imline.pixels[col];
01310   }
01311   /*
01312   The bit map presented to the net may be shorter than the image, so shift
01313   the coord to be relative to the bitmap portion.
01314   */
01315   baseline_pos -= (y_size - net_image_height) / 2.0;
01316   /*
01317   Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.
01318   This is represented to the net as a set of bl_nodes, an initial proportion
01319   of which are set to 1.0, indicating the level of the baseline. The
01320   remainder are 0.0
01321   */
01322 
01323   if (baseline_pos < 0)
01324     baseline_pos = 0;
01325   else if (baseline_pos >= net_image_height)
01326     baseline_pos = net_image_height + 1;
01327   else
01328     baseline_pos = baseline_pos + 1;
01329   baseline_pos = baseline_pos / (net_image_height + 1);
01330 
01331   if (net_bl_nodes > 0) {
01332     baseline_pos *= 1.7;         // Use a wider range
01333     if (net_bl_nodes > 1) {
01334       /* Multi-node baseline representation */
01335       for (j = 0; j < net_bl_nodes; j++) {
01336         if (baseline_pos > ((float) j / net_bl_nodes))
01337           *input_vec_ptr++ = 1.0;
01338         else
01339           *input_vec_ptr++ = 0.0;
01340       }
01341     }
01342     else {
01343       /* Single node baseline */
01344       *input_vec_ptr++ = baseline_pos;
01345     }
01346   }
01347 
01348   callnet(input_vector, &top, &top_score, &next, &next_score);
01349   conf_level = evaluate_net_match (top, top_score, next, next_score,
01350     tess_ch, dict_word, checked_dict_word,
01351     sensible_word, centre, good_quality_word);
01352   #ifndef SECURE_NAMES
01353   if (nn_reject_debug) {
01354     tprintf ("top:\"%c\" %4.2f   next:\"%c\" %4.2f  TESS:\"%c\" Conf: %d\n",
01355       top, top_score, next, next_score, tess_ch, conf_level);
01356   }
01357   #endif
01358   free_mem(input_vector);
01359   return conf_level;
01360 }
01361 
01386 INT16 evaluate_net_match(char top,
01387                          float top_score,
01388                          char next,
01389                          float next_score,
01390                          char tess_ch,
01391                          BOOL8 dict_word,
01392                          BOOL8 checked_dict_word,
01393                          BOOL8 sensible_word,
01394                          BOOL8 centre,
01395                          BOOL8 good_quality_word) {
01396   INT16 accept_level; // 0..5
01397   BOOL8 good_top_choice;
01398   BOOL8 excellent_top_choice;
01399   BOOL8 confusion_match = FALSE;
01400   BOOL8 dodgy_char = !isalnum (tess_ch);
01401 
01402   good_top_choice = (top_score > nn_reject_threshold) &&
01403     (nn_reject_head_and_shoulders * top_score > next_score);
01404 
01405   excellent_top_choice = good_top_choice &&
01406     (top_score > nn_dodgy_char_threshold);
01407 
01408   if (top == tess_ch) {
01409     if (excellent_top_choice)
01410       accept_level = 0;
01411     else if (good_top_choice)
01412       accept_level = 1;          // Top correct and well matched
01413     else
01414       accept_level = 2;          // Top correct but poor match
01415   }
01416   else if ((nn_conf_1Il &&
01417     STRING (conflict_set_I_l_1).contains (tess_ch) &&
01418     STRING (conflict_set_I_l_1).contains (top)) ||
01419     (nn_conf_hyphen &&
01420     STRING (conflict_set_hyphen).contains (tess_ch) &&
01421     STRING (conflict_set_hyphen).contains (top)) ||
01422     (nn_conf_Ss &&
01423     STRING (conflict_set_S_s).contains (tess_ch) &&
01424   STRING (conflict_set_S_s).contains (top))) {
01425     confusion_match = TRUE;
01426     if (good_top_choice)
01427       accept_level = 1;          // Good top confusion
01428     else
01429       accept_level = 2;          // Poor top confusion
01430   }
01431   else if ((nn_conf_1Il &&
01432     STRING (conflict_set_I_l_1).contains (tess_ch) &&
01433     STRING (conflict_set_I_l_1).contains (next)) ||
01434     (nn_conf_hyphen &&
01435     STRING (conflict_set_hyphen).contains (tess_ch) &&
01436     STRING (conflict_set_hyphen).contains (next)) ||
01437     (nn_conf_Ss &&
01438     STRING (conflict_set_S_s).contains (tess_ch) &&
01439   STRING (conflict_set_S_s).contains (next))) {
01440     confusion_match = TRUE;
01441     if (!good_top_choice)
01442       accept_level = 3;          // Next confusion and top match dodgy
01443     else
01444       accept_level = 4;          // Next confusion and good top match
01445   }
01446   else if (next == tess_ch) {
01447     if (!good_top_choice)
01448       accept_level = 3;          // Next match and top match dodgy
01449     else
01450       accept_level = 4;          // Next match and good top match
01451   }
01452   else
01453     accept_level = 5;
01454 
01455   /* Could allow some match flexibility here sS$ etc */
01456 
01457   /* Now set confirmation level according to how much we can believe the tess
01458     char. */
01459 
01460   if ((accept_level == 0) && !confusion_match)
01461     return 3;
01462 
01463   if ((accept_level <= 1) &&
01464     (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
01465     return 3;
01466 
01467   if ((accept_level == 2) &&
01468     !confusion_match && !dodgy_char &&
01469     good_quality_word &&
01470     dict_word &&
01471     (checked_dict_word || !nn_double_check_dict) && sensible_word)
01472     return 2;
01473 
01474   if (confusion_match &&
01475     (accept_level <= nn_conf_accept_level) &&
01476     (good_quality_word ||
01477     (!nn_conf_test_good_qual &&
01478     !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
01479     (dict_word || !nn_conf_test_dict) &&
01480     (checked_dict_word || !nn_conf_double_check_dict) &&
01481     (sensible_word || !nn_conf_test_sensible))
01482     return 1;
01483 
01484   if (!confusion_match &&
01485     nn_lax &&
01486     (accept_level == 3) &&
01487     (good_quality_word || !nn_conf_test_good_qual) &&
01488     (dict_word || !nn_conf_test_dict) &&
01489     (sensible_word || !nn_conf_test_sensible))
01490     return 1;
01491   else
01492     return 0;
01493 }
01494 
01495 
01505 void dont_allow_dubious_chars(WERD_RES *word) {
01506   int i = 0;
01507   int rej_pos;
01508   int word_len = word->reject_map.length ();
01509 
01510   while (i < word_len) {
01511     /* Find next reject */
01512 
01513     while ((i < word_len) && (word->reject_map[i].accepted ()))
01514       i++;
01515 
01516     if (i < word_len) {
01517       rej_pos = i;
01518 
01519       /* Reject dubious chars to the left */
01520       i--;
01521       while ((i >= 0) &&
01522         STRING (dubious_chars_left_of_reject).contains (word->
01523         best_choice->
01524         string ()
01525       [i])) {
01526         word->reject_map[i--].setrej_dubious ();
01527       }
01528 
01529       /* Skip adjacent rejects */
01530 
01531       for (i = rej_pos;
01532         (i < word_len) && (word->reject_map[i].rejected ()); i++);
01533 
01534       /* Reject dubious chars to the right */
01535 
01536       while ((i < word_len) &&
01537         STRING (dubious_chars_right_of_reject).contains (word->
01538         best_choice->
01539         string ()
01540       [i])) {
01541         word->reject_map[i++].setrej_dubious ();
01542       }
01543     }
01544   }
01545 }
01546 
01547 
01555 void dont_allow_1Il(WERD_RES *word) {
01556   int i = 0;
01557   int word_len = word->reject_map.length ();
01558   const char *s = word->best_choice->string ().string ();
01559   BOOL8 accepted_1Il = FALSE;
01560 
01561   for (i = 0; i < word_len; i++) {
01562     if (word->reject_map[i].accepted ()) {
01563       if (STRING (conflict_set_I_l_1).contains (s[i]))
01564         accepted_1Il = TRUE;
01565       else {
01566         if (isalnum (s[i]))
01567           return;                // >=1 non 1Il ch accepted
01568       }
01569     }
01570   }
01571   if (!accepted_1Il)
01572     return;                      // Nothing to worry about
01573 
01574   for (i = 0; i < word_len; i++) {
01575     if (STRING (conflict_set_I_l_1).contains (s[i]) &&
01576       word->reject_map[i].accepted ())
01577       word->reject_map[i].setrej_postNN_1Il ();
01578   }
01579 }
01580 
01581 
01585 INT16 count_alphanums(
01586                       WERD_RES *word) {
01587   int count = 0;
01588   int i;
01589 
01590   for (i = 0; i < word->reject_map.length (); i++) {
01591     if ((word->reject_map[i].accepted ()) &&
01592       (isalnum (word->best_choice->string ()[i])))
01593       count++;
01594   }
01595   return count;
01596 }
01597 
01606 void reject_mostly_rejects(
01607                            WERD_RES *word) {
01608 
01609   if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
01610     rej_whole_of_mostly_reject_word_fract)
01611     word->reject_map.rej_word_mostly_rej ();
01612 }
01613 
01614 
01618 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
01619   INT16 char_quality;
01620   INT16 accepted_char_quality;
01621 
01622   if (word->best_choice->string ().length () <= 1)
01623     return FALSE;
01624 
01625   if (!STRING (ok_repeated_ch_non_alphanum_wds).
01626     contains (word->best_choice->string ()[0]))
01627     return FALSE;
01628 
01629   if (!repeated_ch_string (word->best_choice->string ().string ()))
01630     return FALSE;
01631 
01632   word_char_quality(word, row, &char_quality, &accepted_char_quality);
01633 
01634   if ((word->best_choice->string ().length () == char_quality) &&
01635     (char_quality == accepted_char_quality))
01636     return TRUE;
01637   else
01638     return FALSE;
01639 }
01640 
01641 
01645 BOOL8 repeated_ch_string(const char *rep_ch_str) {
01646   char c;
01647 
01648   if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
01649     return FALSE;
01650   }
01651 
01652   c = *rep_ch_str;
01653   rep_ch_str++;
01654   while (*rep_ch_str == c) {
01655     rep_ch_str++;
01656   }
01657   if (*rep_ch_str == '\0')
01658     return TRUE;
01659   return FALSE;
01660 }
01661 
01662 
01666 INT16 safe_dict_word(const char *s) {
01667   int dict_word_type;
01668 
01669   dict_word_type = dict_word (s);
01670   if (dict_word_type == DOC_DAWG_PERM)
01671     return 0;
01672   else
01673     return dict_word_type;
01674 }
01675 
01676 
01686 void flip_hyphens(WERD_RES *word) {
01687   char *str = (char *) word->best_choice->string ().string ();
01688   int i = 0;
01689   PBLOB_IT outword_it;
01690   int prev_right = -9999;
01691   int next_left;
01692   BOX out_box;
01693   float aspect_ratio;
01694 
01695   if (tessedit_lower_flip_hyphen <= 1)
01696     return;
01697 
01698   outword_it.set_to_list (word->outword->blob_list ());
01699 
01700   for (outword_it.mark_cycle_pt ();
01701   !outword_it.cycled_list (); outword_it.forward (), i++) {
01702     out_box = outword_it.data ()->bounding_box ();
01703     if (outword_it.at_last ())
01704       next_left = 9999;
01705     else
01706       next_left = outword_it.data_relative (1)->bounding_box ().left ();
01707     /*
01708       Dont touch small or touching blobs - it is too dangerous
01709     */
01710     if ((out_box.width () > 8 * word->denorm.scale ()) &&
01711     (out_box.left () > prev_right) && (out_box.right () < next_left)) {
01712       aspect_ratio = out_box.width () / (float) out_box.height ();
01713       if (str[i] == '.') {
01714         if (aspect_ratio >= tessedit_upper_flip_hyphen) {
01715           str[i] = '-'; // Certain HYPHEN
01716           if (word->reject_map[i].rejected ())
01717             word->reject_map[i].setrej_hyphen_accept ();
01718         }
01719         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
01720           word->reject_map[i].accepted ())
01721           word->reject_map[i].setrej_hyphen (); // Suspected HYPHEN
01722       }
01723       else if (str[i] == '-') {
01724         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
01725           (word->reject_map[i].rejected ()))
01726           word->reject_map[i].setrej_hyphen_accept (); // Certain HYPHEN
01727 
01728         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
01729           (word->reject_map[i].accepted ()))
01730           word->reject_map[i].setrej_hyphen (); // Suspected HYPHEN
01731       }
01732     }
01733     prev_right = out_box.right ();
01734   }
01735 }
01736 
01737 
01746 void flip_0O(WERD_RES *word) {
01747   char *str = (char *) word->best_choice->string ().string ();
01748   int i;
01749   PBLOB_IT outword_it;
01750   BOX out_box;
01751 
01752   if (!tessedit_flip_0O)
01753     return;
01754 
01755   outword_it.set_to_list (word->outword->blob_list ());
01756 
01757   for (i = 0, outword_it.mark_cycle_pt ();
01758   !outword_it.cycled_list (); i++, outword_it.forward ()) {
01759     if (isupper (str[i]) || isdigit (str[i])) {
01760       out_box = outword_it.data ()->bounding_box ();
01761       if ((out_box.top () < bln_baseline_offset + bln_x_height) ||
01762         (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))
01763         return;   // Beware words with sub/superscripts
01764     }
01765   }
01766 
01767   for (i = 1; str[i] != '\0'; i++, outword_it.forward ()) {
01768     if ((str[i] == '0') || (str[i] == 'O')) {
01769       /* A0A */
01770       if (non_O_upper (str[i - 1]) && non_O_upper (str[i + 1])) {
01771         str[i] = 'O';
01772       }
01773       /* A00A */
01774       if (non_O_upper (str[i - 1])
01775       && ((str[i + 1] == '0') || (str[i + 1] == 'O'))
01776       && non_O_upper (str[i + 2])) {
01777         str[i] = 'O';
01778         str[i + 1] = 'O';
01779         i++;
01780       }
01781       /* AA0<non digit or end of word> */
01782       if ((i > 1) &&
01783         non_O_upper (str[i - 2]) &&
01784         non_O_upper (str[i - 1]) &&
01785         !isdigit (str[i + 1]) &&
01786       (str[i + 1] != 'l') && (str[i + 1] != 'I')) {
01787         str[i] = 'O';
01788       }
01789       /* 9O9 */
01790       if (non_0_digit (str[i - 1]) && non_0_digit (str[i + 1])) {
01791         str[i] = '0';
01792       }
01793       /* 9OOO */
01794       if (non_0_digit (str[i - 1]) &&
01795         ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01796       ((str[i + 2] == '0') || (str[i + 2] == 'O'))) {
01797         str[i] = '0';
01798         str[i + 1] = '0';
01799         str[i + 2] = '0';
01800         i += 2;
01801       }
01802       /* 9OO<non upper> */
01803       if (non_0_digit (str[i - 1]) &&
01804         ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01805       !isupper (str[i + 2])) {
01806         str[i] = '0';
01807         str[i + 1] = '0';
01808         i++;
01809       }
01810       /* 9O<non upper> */
01811       if (non_0_digit (str[i - 1]) && !isupper (str[i + 1])) {
01812         str[i] = '0';
01813       }
01814       /* 9[.,]OOO.. */
01815       if ((i > 1) &&
01816         ((str[i - 1] == '.') || (str[i - 1] == ',')) &&
01817       (isdigit (str[i - 2]) || (str[i - 2] == 'O'))) {
01818         if (str[i - 2] == 'O')
01819           str[i - 2] = '0';
01820         while ((str[i] == 'O') || (str[i] == '0')) {
01821           str[i++] = '0';
01822         }
01823         i--;
01824       }
01825     }
01826   }
01827 }
01828 
01829 
01833 BOOL8 non_O_upper(char c) {
01834   return isupper (c) && (c != 'O');
01835 }
01836 
01837 
01841 BOOL8 non_0_digit(char c) {
01842   return isdigit (c) && (c != '0');
01843 }