ccmain/adaptions.cpp

Go to the documentation of this file.
00001 
00019 #include "mfcpch.h"
00020 #ifdef __UNIX__
00021 #include          <assert.h>
00022 #endif
00023 #include          <ctype.h>
00024 #include          <string.h>
00025 #include          "tessbox.h"
00026 #include          "tessvars.h"
00027 #include          "memry.h"
00028 #include          "mainblk.h"
00029 #include          "charcut.h"
00030 #include          "imgs.h"
00031 #include          "scaleimg.h"
00032 #include          "reject.h"
00033 #include          "control.h"
00034 #include          "adaptions.h"
00035 #include          "stopper.h"
00036 #include          "charsample.h"
00037 #include          "matmatch.h"
00038 #include          "secname.h"
00039 
// Word number currently being shown by the demo display.  The adapt_to_good_*
// routines in this file set it to their running word count immediately before
// the demo call to match_score() and reset it to 0 straight afterwards.
00040 INT32 demo_word = 0;
00041 
// EXTERN is defined as empty here, so it expands to nothing in the
// declarations below.
00042 #define EXTERN
00043 
// NOTE(review): BOOL_VAR / INT_VAR / double_VAR / STRING_VAR are project
// macros defined elsewhere; each use below supplies a variable name, a default
// value and a help string.  The notes on each group describe only how the
// variable is used in the visible part of this file.

// --- Blanket handling of 'm' characters --------------------------------------
// If either flag is set, collect_ems_for_adaption() collects nothing and
// adapt_to_good_ems() calls reject_all_ems() / reject_suspect_ems() instead of
// matching against clusters.
00045 EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
00046 EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
00047 
// --- Clustering thresholds ---------------------------------------------------
// cluster_sample() compares a sample's best match score with these:
//   score <  t1            : sample belongs to the best cluster
//   score >  t2            : sample starts a new cluster
//   otherwise              : sample is parked on the waiting list
// t3 is only consulted when tessedit_mm_use_prototypes is off: a sample that
// matches better than t3 is then not added to the cluster.
00048 EXTERN double_VAR (tessedit_cluster_t1, 0.20,
00049 "t1 threshold for clustering samples");
00050 EXTERN double_VAR (tessedit_cluster_t2, 0.40,
00051 "t2 threshold for clustering samples");
00052 EXTERN double_VAR (tessedit_cluster_t3, 0.12,
00053 "Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
// The next two are not referenced in the visible part of this file.
00054 EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
00055 "Largest fraction of characters in cluster for it to be used for adaption");
00056 EXTERN INT_VAR (tessedit_cluster_min_size, 3,
00057 "Smallest number of samples in a cluster for it to be used for adaption");
// Enables the tprintf() tracing found throughout the clustering code.
00058 EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
00059 "Generate and print debug information for adaption by clustering");
// complete_clustering() calls find_best_sample() on each cluster when set.
00060 EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
00061 "Use best sample from cluster when adapting");
// Test mode: collect_characters_for_adaption() only marks words (rejecting
// those it would not have used) and adapt_to_good_samples() returns early.
00062 EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
00063 "Set reject map to enable cluster input to be measured");
00064 
// --- Matrix matcher controls -------------------------------------------------
// Selects between clip_sample() (image clipped from the page) and building a
// CHAR_SAMPLE directly from the blob in the "em" adaption routines.
00065 EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
// When set, collect_characters_for_adaption() tests each character against
// tessedit_non_adaption_set before sampling it.
00066 EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
00067 "Don't try to adapt to characters on this list");
00068 EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
00069 "Characters to be avoided when adapting");
// complete_clustering() calls build_prototype() on each cluster when this is
// set and tessedit_use_best_sample is not.
00070 EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
00071 "Use prototypes when adapting");
// When set, wait-list samples are scored against the cluster
// (CHAR_SAMPLES::match_score) rather than against a single sample, and the
// t3 "too good a match" test is bypassed.
00072 EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
00073 "Use prototypes as clusters are built");
// When set, the cluster adaption routines process every word regardless of
// word_adaptable() and consult the per-character reject map.
00074 EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
00075 "Adapt to characters using reject map");
// When set, adapt_to_good_samples() checks every rejected character, not only
// the recoverable ones.
00076 EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
00077 "Adapt to all characters using, matrix matcher");
// Not referenced in the visible part of this file.
00078 EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
00079 "Only match samples against clusters for the same character");
// Enables treating the character pair "rn" as a possible 'm' in the "em"
// adaption routines.
00080 EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
00081 
// --- Demo display ------------------------------------------------------------
// When tessedit_demo_adaption is set the adapt_to_good_* routines print extra
// messages and may display the clipped word; the word numbers and file name
// below are compared against the running word count and imagebasename to
// decide when to display.
00082 EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
00083 "Display cut images and matrix match for demo purposes");
00084 EXTERN INT_VAR (tessedit_demo_word1, 62,
00085 "Word number of first word to display");
00086 EXTERN INT_VAR (tessedit_demo_word2, 64,
00087 "Word number of second word to display");
00088 EXTERN STRING_VAR (tessedit_demo_file, "academe",
00089 "Name of document containing demo words");
00100 BOOL8 word_adaptable(
00101                      WERD_RES *word,
00102                      UINT16 mode) {
00103   BOOL8 status = FALSE;
00104   BITS16 flags(mode); 
00105 
00106   enum MODES
00107   {
00108     ADAPTABLE_WERD,
00109     ACCEPTABLE_WERD,
00110     CHECK_DAWGS,
00111     CHECK_SPACES,
00112     CHECK_ONE_ELL_CONFLICT,
00113     CHECK_AMBIG_WERD
00114   };
00115 
00116   /*
00117   0: NO adaption
00118   */
00119   if (mode == 0) {
00120     return FALSE;
00121   }
00122 
00123   if (flags.bit (ADAPTABLE_WERD))
00124     status |= word->tess_would_adapt;
00125 
00126   if (flags.bit (ACCEPTABLE_WERD))
00127     status |= word->tess_accepted;
00128 
00129   if (!status)                   // If not set then
00130     return FALSE;                // ignore other checks
00131 
00132   if (flags.bit (CHECK_DAWGS) &&
00133     (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00134     (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00135     (word->best_choice->permuter () != USER_DAWG_PERM) &&
00136     (word->best_choice->permuter () != NUMBER_PERM))
00137     return FALSE;
00138 
00139   if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE))
00140     return FALSE;
00141 
00142   if (flags.bit (CHECK_SPACES) &&
00143     (strchr (word->best_choice->string ().string (), ' ') != NULL))
00144     return FALSE;
00145 
00146 //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
00147   if (flags.bit (CHECK_AMBIG_WERD) &&
00148       !NoDangerousAmbig(word->best_choice->string().string(), NULL))
00149     return FALSE;
00150 
00151   return status;
00152 }
00153 
00154 
00163 void collect_ems_for_adaption(WERD_RES *word,
00164                               CHAR_SAMPLES_LIST *char_clusters,
00165                               CHAR_SAMPLE_LIST *chars_waiting) {
00166   PBLOB_LIST *blobs = word->outword->blob_list ();
00167   PBLOB_IT blob_it(blobs); 
00168   INT16 i;
00169   CHAR_SAMPLE *sample;
00170   PIXROW_LIST *pixrow_list;
00171   PIXROW_IT pixrow_it;
00172   IMAGELINE *imlines;            
00173   BOX pix_box;                   
00174   // extent
00175   WERD copy_outword;             
00176   PBLOB_IT copy_blob_it;
00177   OUTLINE_IT copy_outline_it;
00178   INT32 resolution = page_image.get_res ();
00179 
00180   if (tessedit_reject_ems || tessedit_reject_suspect_ems)
00181     return;                      // Do nothing
00182 
00183   if (word->word->bounding_box ().height () > resolution / 3)
00184     return;
00185 
00186   if (tessedit_demo_adaption)
00187     tessedit_display_mm.set_value (FALSE); 
00188 
00189   if (word_adaptable (word, tessedit_em_adaption_mode)
00190     && word->reject_map.reject_count () == 0
00191     && (strchr (word->best_choice->string ().string (), 'm') != NULL
00192     || (tessedit_process_rns
00193     && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00194     if (tessedit_process_rns
00195     && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00196       copy_outword = *(word->outword);
00197       copy_blob_it.set_to_list (copy_outword.blob_list ());
00198       i = 0;
00199       while (word->best_choice->string ()[i] != '\0') {
00200         if (word->best_choice->string ()[i] == 'r'
00201         && word->best_choice->string ()[i + 1] == 'n') {
00202           copy_outline_it.set_to_list (copy_blob_it.data ()->
00203             out_list ());
00204           copy_outline_it.add_list_after (copy_blob_it.
00205             data_relative (1)->
00206             out_list ());
00207           copy_blob_it.forward ();
00208           delete (copy_blob_it.extract ());
00209           i++;
00210         }
00211         copy_blob_it.forward ();
00212         i++;
00213       }
00214     }
00215     else
00216       copy_outword = *(word->outword);
00217 
00218     copy_outword.baseline_denormalise (&word->denorm);
00219     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00220     pixrow_it.set_to_list (pixrow_list);
00221     pixrow_it.move_to_first ();
00222 
00223     blob_it.move_to_first ();
00224     for (i = 0;
00225       word->best_choice->string ()[i] != '\0';
00226     i++, pixrow_it.forward (), blob_it.forward ()) {
00227 
00228       if (word->best_choice->string ()[i] == 'm'
00229         || (word->best_choice->string ()[i] == 'r'
00230       && word->best_choice->string ()[i + 1] == 'n')) {
00231         #ifndef SECURE_NAMES
00232         if (tessedit_cluster_debug)
00233           tprintf ("Sample %c for adaption found in %s, index %d\n",
00234             word->best_choice->string ()[i],
00235             word->best_choice->string ().string (), i);
00236         #endif
00237         if (tessedit_matrix_match) {
00238           sample = clip_sample (pixrow_it.data (),
00239             imlines,
00240             pix_box,
00241             copy_outword.flag (W_INVERSE),
00242             word->best_choice->string ()[i]);
00243 
00244           if (sample == NULL) {  //Clip failed
00245             #ifndef SECURE_NAMES
00246             tprintf ("Unable to clip sample from %s, index %d\n",
00247               word->best_choice->string ().string (), i);
00248             #endif
00249             if (word->best_choice->string ()[i] == 'r')
00250               i++;
00251 
00252             continue;
00253           }
00254         }
00255         else
00256           sample = new CHAR_SAMPLE (blob_it.data (),
00257             &word->denorm,
00258             word->best_choice->string ()[i]);
00259 
00260         cluster_sample(sample, char_clusters, chars_waiting); 
00261 
00262         if (word->best_choice->string ()[i] == 'r')
00263           i++;                   // Skip next character
00264       }
00265     }
00266     delete[]imlines;             // Free array of imlines
00267     delete pixrow_list;
00268   }
00269 }
00270 
00271 
00284 void collect_characters_for_adaption(WERD_RES *word,
00285                                      CHAR_SAMPLES_LIST *char_clusters,
00286                                      CHAR_SAMPLE_LIST *chars_waiting) {
00287   PBLOB_LIST *blobs = word->outword->blob_list ();
00288   PBLOB_IT blob_it(blobs);
00289   INT16 i;
00290   CHAR_SAMPLE *sample;
00291   PIXROW_LIST *pixrow_list;
00292   PIXROW_IT pixrow_it;
00293   IMAGELINE *imlines;            // lines of the image
00294   BOX pix_box;                   // box of imlines
00295   // extent
00296   WERD copy_outword;             // copy to denorm
00297   INT32 resolution = page_image.get_res ();
00298 
00299   if (word->word->bounding_box ().height () > resolution / 3)
00300     return;
00301 
00302   if (tessedit_demo_adaption)
00303     tessedit_display_mm.set_value (FALSE); // Make sure not set
00304 
00305   if ((word_adaptable (word, tessedit_cluster_adaption_mode)
00306   && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
00307     if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00308       return;                    // Reject map set to acceptable
00309     /* Collect information about good matches */
00310     copy_outword = *(word->outword);
00311     copy_outword.baseline_denormalise (&word->denorm);
00312     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00313     pixrow_it.set_to_list (pixrow_list);
00314     pixrow_it.move_to_first ();
00315 
00316     blob_it.move_to_first ();
00317     for (i = 0;
00318       word->best_choice->string ()[i] != '\0';
00319     i++, pixrow_it.forward (), blob_it.forward ()) {
00320 
00321       if (!(tessedit_mm_use_non_adaption_set
00322         && STRING (tessedit_non_adaption_set).contains (word->
00323         best_choice->
00324         string ()[i]))
00325       || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
00326         #ifndef SECURE_NAMES
00327         if (tessedit_cluster_debug)
00328           tprintf ("Sample %c for adaption found in %s, index %d\n",
00329             word->best_choice->string ()[i],
00330             word->best_choice->string ().string (), i);
00331         #endif
00332         sample = clip_sample (pixrow_it.data (),
00333           imlines,
00334           pix_box,
00335           copy_outword.flag (W_INVERSE),
00336           word->best_choice->string ()[i]);
00337 
00338         if (sample == NULL) {    // Clip failed
00339           #ifndef SECURE_NAMES
00340           tprintf ("Unable to clip sample from %s, index %d\n",
00341             word->best_choice->string ().string (), i);
00342           #endif
00343           continue;
00344         }
00345         cluster_sample(sample, char_clusters, chars_waiting);
00346       }
00347     }
00348     delete[]imlines;             // Free array of imlines
00349     delete pixrow_list;
00350   }
00351   else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00352     word->reject_map.rej_word_tess_failure (); // Set word to all rejects
00353 }
00354 
00355 
00364 void cluster_sample(CHAR_SAMPLE *sample,
00365                     CHAR_SAMPLES_LIST *char_clusters,
00366                     CHAR_SAMPLE_LIST *chars_waiting) {
00367   CHAR_SAMPLES *best_cluster = NULL;
00368   CHAR_SAMPLES_IT c_it = char_clusters;
00369   CHAR_SAMPLE_IT cw_it = chars_waiting;
00370   float score;
00371   float best_score = MAX_INT32;
00372 
00373   if (c_it.empty ())
00374     c_it.add_to_end (new CHAR_SAMPLES (sample));
00375   else {
00376     for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00377       score = c_it.data ()->match_score (sample);
00378       if (score < best_score) {
00379         best_score = score;
00380         best_cluster = c_it.data ();
00381       }
00382     }
00383 
00384     if (tessedit_cluster_debug)
00385       tprintf ("Sample's best score %f\n", best_score);
00386 
00387     if (best_score < tessedit_cluster_t1) {
00388       if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
00389         best_cluster->add_sample (sample);
00390         check_wait_list(chars_waiting, sample, best_cluster);
00391         #ifndef SECURE_NAMES
00392         if (tessedit_cluster_debug)
00393           tprintf ("Sample added to an existing cluster\n");
00394         #endif
00395       }
00396       else {
00397         #ifndef SECURE_NAMES
00398         if (tessedit_cluster_debug)
00399           tprintf
00400             ("Sample dropped, good match to an existing cluster\n");
00401         #endif
00402       }
00403     }
00404     else if (best_score > tessedit_cluster_t2) {
00405       c_it.add_to_end (new CHAR_SAMPLES (sample));
00406       #ifndef SECURE_NAMES
00407       if (tessedit_cluster_debug)
00408         tprintf ("New cluster created for this sample\n");
00409       #endif
00410     }
00411     else {
00412       cw_it.add_to_end (sample);
00413       if (tessedit_cluster_debug)
00414         tprintf ("Sample added to the wait list\n");
00415     }
00416   }
00417 }
00418 
00419 
00429 void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
00430                      CHAR_SAMPLE *sample,
00431                      CHAR_SAMPLES *best_cluster) {
00432   CHAR_SAMPLE *wait_sample;
00433   CHAR_SAMPLE *test_sample = sample;
00434   CHAR_SAMPLE_IT cw_it = chars_waiting;
00435   CHAR_SAMPLE_LIST add_list;     // Samples added to best cluster
00436   CHAR_SAMPLE_IT add_it = &add_list;
00437   float score;
00438 
00439   add_list.clear ();
00440 
00441   if (!cw_it.empty ()) {
00442     do {
00443       if (!add_list.empty ()) {
00444         add_it.forward ();
00445         test_sample = add_it.extract ();
00446         best_cluster->add_sample (test_sample);
00447       }
00448 
00449       for (cw_it.mark_cycle_pt ();
00450       !cw_it.cycled_list (); cw_it.forward ()) {
00451         wait_sample = cw_it.data ();
00452         if (tessedit_mm_use_prototypes)
00453           score = best_cluster->match_score (wait_sample);
00454         else
00455           score = sample->match_sample (wait_sample, FALSE);
00456         if (score < tessedit_cluster_t1) {
00457           if (score > tessedit_cluster_t3
00458           || tessedit_mm_use_prototypes) {
00459             add_it.add_after_stay_put (cw_it.extract ());
00460             #ifndef SECURE_NAMES
00461             if (tessedit_cluster_debug)
00462               tprintf
00463                 ("Wait sample added to an existing cluster\n");
00464             #endif
00465           }
00466           else {
00467             #ifndef SECURE_NAMES
00468             if (tessedit_cluster_debug)
00469               tprintf
00470                 ("Wait sample dropped, good match to an existing cluster\n");
00471             #endif
00472           }
00473         }
00474       }
00475     }
00476     while (!add_list.empty ());
00477   }
00478 }
00479 
00490 void complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
00491                          CHAR_SAMPLE_LIST *chars_waiting) {
00492   CHAR_SAMPLES *best_cluster;
00493   CHAR_SAMPLES_IT c_it = char_clusters;
00494   CHAR_SAMPLE_IT cw_it = chars_waiting;
00495   CHAR_SAMPLE *sample;
00496   INT32 total_sample_count = 0;
00497 
00498   while (!cw_it.empty ()) {
00499     cw_it.move_to_first ();
00500     sample = cw_it.extract ();
00501     best_cluster = new CHAR_SAMPLES (sample);
00502     c_it.add_to_end (best_cluster);
00503     check_wait_list(chars_waiting, sample, best_cluster);
00504   }
00505 
00506   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00507     c_it.data ()->assign_to_char ();
00508     if (tessedit_use_best_sample)
00509       c_it.data ()->find_best_sample ();
00510     else if (tessedit_mm_adapt_using_prototypes)
00511       c_it.data ()->build_prototype ();
00512 
00513     if (tessedit_cluster_debug)
00514       total_sample_count += c_it.data ()->n_samples ();
00515   }
00516   #ifndef SECURE_NAMES
00517   if (tessedit_cluster_debug)
00518     tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
00519   #endif
00520 
00521 #ifndef GRAPHICS_DISABLED
00522   if (tessedit_demo_adaption)
00523     display_cluster_prototypes(char_clusters);
00524 #endif
00525 
00526 }
00527 
00528 
00541 void adapt_to_good_ems(WERD_RES *word,
00542                        CHAR_SAMPLES_LIST *char_clusters,
00543                        CHAR_SAMPLE_LIST *chars_waiting) {
00544   PBLOB_LIST *blobs = word->outword->blob_list ();
00545   PBLOB_IT blob_it(blobs);
00546   INT16 i;
00547   CHAR_SAMPLE *sample;
00548   CHAR_SAMPLES_IT c_it = char_clusters;
00549   CHAR_SAMPLE_IT cw_it = chars_waiting;
00550   float score;
00551   float best_score;
00552   char best_char;
00553   CHAR_SAMPLES *best_cluster;
00554   PIXROW_LIST *pixrow_list;
00555   PIXROW_IT pixrow_it;
00556   IMAGELINE *imlines;            // lines of the image
00557   BOX pix_box;                   // box of imlines
00558   // extent
00559   WERD copy_outword;             // copy to denorm
00560   BOX b_box;
00561   PBLOB_IT copy_blob_it;
00562   OUTLINE_IT copy_outline_it;
00563   PIXROW *pixrow = NULL;
00564 
00565   static INT32 word_number = 0;
00566 
00567 #ifndef GRAPHICS_DISABLED
00568   WINDOW demo_win = NULL;
00569 #endif
00570 
00571   INT32 resolution = page_image.get_res ();
00572 
00573   if (word->word->bounding_box ().height () > resolution / 3)
00574     return;
00575 
00576   word_number++;
00577 
00578   if (strchr (word->best_choice->string ().string (), 'm') == NULL
00579     && (tessedit_process_rns
00580     && strstr (word->best_choice->string ().string (), "rn") == NULL))
00581     return;
00582 
00583   if (tessedit_reject_ems)
00584     reject_all_ems(word);
00585   else if (tessedit_reject_suspect_ems)
00586     reject_suspect_ems(word);
00587   else {
00588     if (char_clusters->length () == 0) {
00589       #ifndef SECURE_NAMES
00590       if (tessedit_cluster_debug)
00591         tprintf ("No clusters to use for em adaption\n");
00592       #endif
00593       return;
00594     }
00595 
00596     if (!cw_it.empty ()) {
00597       complete_clustering(char_clusters, chars_waiting);
00598       print_em_stats(char_clusters, chars_waiting);
00599     }
00600 
00601     if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
00602       word->reject_map.reject_count () != 0)
00603       && (strchr (word->best_choice->string ().string (), 'm') != NULL
00604       || (tessedit_process_rns
00605       && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00606       if (tessedit_process_rns
00607         && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00608         copy_outword = *(word->outword);
00609         copy_blob_it.set_to_list (copy_outword.blob_list ());
00610         i = 0;
00611         while (word->best_choice->string ()[i] != '\0') {
00612           if (word->best_choice->string ()[i] == 'r'
00613           && word->best_choice->string ()[i + 1] == 'n') {
00614             copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ());
00615             copy_outline_it.add_list_after (copy_blob_it. data_relative (1)->
00616               out_list ());
00617             copy_blob_it.forward ();
00618             delete (copy_blob_it.extract ());
00619             i++;
00620           }
00621           copy_blob_it.forward ();
00622           i++;
00623         }
00624       }
00625       else
00626         copy_outword = *(word->outword);
00627 
00628       copy_outword.baseline_denormalise (&word->denorm);
00629       copy_blob_it.set_to_list (copy_outword.blob_list ());
00630       char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00631       pixrow_it.set_to_list (pixrow_list);
00632       pixrow_it.move_to_first ();
00633 
00634       // For debugging only
00635       b_box = copy_outword.bounding_box ();
00636       pixrow = pixrow_it.data ();
00637 
00638       blob_it.move_to_first ();
00639       copy_blob_it.move_to_first ();
00640       for (i = 0;
00641         word->best_choice->string ()[i] != '\0';
00642         i++, pixrow_it.forward (), blob_it.forward (),
00643       copy_blob_it.forward ()) {
00644         if ((word->best_choice->string ()[i] == 'm'
00645           || (word->best_choice->string ()[i] == 'r'
00646           && word->best_choice->string ()[i + 1] == 'n'))
00647         && !word->reject_map[i].perm_rejected ()) {
00648           if (tessedit_cluster_debug)
00649             tprintf ("Sample %c to check found in %s, index %d\n",
00650               word->best_choice->string ()[i],
00651               word->best_choice->string ().string (), i);
00652 
00653           if (tessedit_demo_adaption)
00654             tprintf
00655               ("Sample %c to check found in %s (%d), index %d\n",
00656               word->best_choice->string ()[i],
00657               word->best_choice->string ().string (), word_number,
00658               i);
00659 
00660           if (tessedit_matrix_match) {
00661             BOX copy_box = copy_blob_it.data ()->bounding_box ();
00662 
00663             sample = clip_sample (pixrow_it.data (),
00664               imlines,
00665               pix_box,
00666               copy_outword.flag (W_INVERSE),
00667               word->best_choice->string ()[i]);
00668 
00669             //Clip failed
00670             if (sample == NULL) {
00671               tprintf
00672                 ("Unable to clip sample from %s, index %d\n",
00673                 word->best_choice->string ().string (), i);
00674               #ifndef SECURE_NAMES
00675               if (tessedit_cluster_debug)
00676                 tprintf ("Sample rejected (no sample)\n");
00677               #endif
00678               word->reject_map[i].setrej_mm_reject ();
00679               if (word->best_choice->string ()[i] == 'r') {
00680                 word->reject_map[i + 1].setrej_mm_reject ();
00681                 i++;
00682               }
00683               continue;
00684             }
00685           }
00686           else
00687             sample = new CHAR_SAMPLE (blob_it.data (),
00688               &word->denorm,
00689               word->best_choice->
00690               string ()[i]);
00691 
00692           best_score = MAX_INT32;
00693           best_char = '\0';
00694           best_cluster = NULL;
00695 
00696           for (c_it.mark_cycle_pt ();
00697           !c_it.cycled_list (); c_it.forward ()) {
00698             if (c_it.data ()->character () != '\0') {
00699               score = c_it.data ()->match_score (sample);
00700               if (score < best_score) {
00701                 best_cluster = c_it.data ();
00702                 best_score = score;
00703                 best_char = c_it.data ()->character ();
00704               }
00705             }
00706           }
00707 
00708           if (best_score > tessedit_cluster_t1) {
00709             #ifndef SECURE_NAMES
00710             if (tessedit_cluster_debug)
00711               tprintf ("Sample rejected (score %f)\n", best_score);
00712             if (tessedit_demo_adaption)
00713               tprintf ("Sample rejected (score %f)\n", best_score);
00714             #endif
00715             word->reject_map[i].setrej_mm_reject ();
00716             if (word->best_choice->string ()[i] == 'r')
00717               word->reject_map[i + 1].setrej_mm_reject ();
00718           }
00719           else {
00720             if (word->best_choice->string ()[i] == best_char) {
00721               #ifndef SECURE_NAMES
00722               if (tessedit_cluster_debug)
00723                 tprintf ("Sample accepted (score %f)\n",
00724                   best_score);
00725               if (tessedit_demo_adaption)
00726                 tprintf ("Sample accepted (score %f)\n",
00727                   best_score);
00728               #endif
00729               word->reject_map[i].setrej_mm_accept ();
00730               if (word->best_choice->string ()[i] == 'r')
00731                 word->reject_map[i + 1].setrej_mm_accept ();
00732             }
00733             else {
00734               #ifndef SECURE_NAMES
00735               if (tessedit_cluster_debug)
00736                 tprintf ("Sample rejected (char %c, score %f)\n",
00737                   best_char, best_score);
00738               if (tessedit_demo_adaption)
00739                 tprintf ("Sample rejected (char %c, score %f)\n",
00740                   best_char, best_score);
00741               #endif
00742               word->reject_map[i].setrej_mm_reject ();
00743               if (word->best_choice->string ()[i] == 'r')
00744                 word->reject_map[i + 1].setrej_mm_reject ();
00745             }
00746           }
00747 
00748           if (tessedit_demo_adaption) {
00749             if (strcmp (imagebasename.string (),
00750               tessedit_demo_file.string ()) != 0
00751               || word_number == tessedit_demo_word1
00752             || word_number == tessedit_demo_word2) {
00753 #ifndef GRAPHICS_DISABLED
00754               demo_win =
00755                 display_clip_image(&copy_outword,
00756                                    page_image,
00757                                    pixrow_list,
00758                                    pix_box);
00759 #endif
00760               demo_word = word_number;
00761               best_cluster->match_score (sample);
00762               demo_word = 0;
00763             }
00764           }
00765           if (word->best_choice->string ()[i] == 'r')
00766             i++;                 // Skip next character
00767         }
00768       }
00769       delete[]imlines;           // Free array of imlines
00770       delete pixrow_list;
00771     }
00772   }
00773 }
00774 
00775 
00784 void adapt_to_good_samples(WERD_RES *word,
00785                            CHAR_SAMPLES_LIST *char_clusters,
00786                            CHAR_SAMPLE_LIST *chars_waiting) {
00787   PBLOB_LIST *blobs = word->outword->blob_list ();
00788   PBLOB_IT blob_it(blobs);
00789   INT16 i;
00790   CHAR_SAMPLE *sample;
00791   CHAR_SAMPLES_IT c_it = char_clusters;
00792   CHAR_SAMPLE_IT cw_it = chars_waiting;
00793   float score;
00794   float best_score;
00795   char best_char;
00796   CHAR_SAMPLES *best_cluster;
00797   PIXROW_LIST *pixrow_list;
00798   PIXROW_IT pixrow_it;
00799   IMAGELINE *imlines;            // lines of the image
00800   BOX pix_box;                   // box of imlines
00801   // extent
00802   WERD copy_outword;             // copy to denorm
00803   BOX b_box;
00804   PBLOB_IT copy_blob_it;
00805   PIXROW *pixrow = NULL;
00806 
00807   static INT32 word_number = 0;
00808 
00809 #ifndef GRAPHICS_DISABLED
00810   WINDOW demo_win = NULL;
00811 #endif
00812 
00813   INT32 resolution = page_image.get_res ();
00814 
00815   word_number++;
00816 
00817   if (tessedit_test_cluster_input)
00818     return;
00819 
00820   if (word->word->bounding_box ().height () > resolution / 3)
00821     return;
00822 
00823   if (char_clusters->length () == 0) {
00824     #ifndef SECURE_NAMES
00825     if (tessedit_cluster_debug)
00826       tprintf ("No clusters to use for adaption\n");
00827     #endif
00828     return;
00829   }
00830 
00831   if (!cw_it.empty ()) {
00832     complete_clustering(char_clusters, chars_waiting);
00833     print_em_stats(char_clusters, chars_waiting);
00834   }
00835 
00836   if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
00837   && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
00838     if (tessedit_cluster_debug) {
00839       tprintf ("\nChecking: \"%s\"  MAP ",
00840         word->best_choice->string ().string ());
00841       word->reject_map.print (debug_fp);
00842       tprintf ("\n");
00843     }
00844 
00845     copy_outword = *(word->outword);
00846     copy_outword.baseline_denormalise (&word->denorm);
00847     copy_blob_it.set_to_list (copy_outword.blob_list ());
00848     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
00849     pixrow_it.set_to_list (pixrow_list);
00850     pixrow_it.move_to_first ();
00851 
00852                                  // For debugging only
00853     b_box = copy_outword.bounding_box ();
00854     pixrow = pixrow_it.data ();
00855 
00856     blob_it.move_to_first ();
00857     copy_blob_it.move_to_first ();
00858     for (i = 0;
00859       word->best_choice->string ()[i] != '\0';
00860       i++, pixrow_it.forward (), blob_it.forward (),
00861     copy_blob_it.forward ()) {
00862       if (word->reject_map[i].recoverable ()
00863       || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
00864         BOX copy_box = copy_blob_it.data ()->bounding_box ();
00865 
00866         if (tessedit_cluster_debug)
00867           tprintf ("Sample %c to check found in %s, index %d\n",
00868             word->best_choice->string ()[i],
00869             word->best_choice->string ().string (), i);
00870 
00871         if (tessedit_demo_adaption)
00872           tprintf ("Sample %c to check found in %s (%d), index %d\n",
00873             word->best_choice->string ()[i],
00874             word->best_choice->string ().string (),
00875             word_number, i);
00876 
00877         sample = clip_sample (pixrow_it.data (),
00878           imlines,
00879           pix_box,
00880           copy_outword.flag (W_INVERSE),
00881           word->best_choice->string ()[i]);
00882 
00883         if (sample == NULL) {    //Clip failed
00884           tprintf ("Unable to clip sample from %s, index %d\n",
00885             word->best_choice->string ().string (), i);
00886           #ifndef SECURE_NAMES
00887           if (tessedit_cluster_debug)
00888             tprintf ("Sample rejected (no sample)\n");
00889           #endif
00890           word->reject_map[i].setrej_mm_reject ();
00891 
00892           continue;
00893         }
00894 
00895         best_score = MAX_INT32;
00896         best_char = '\0';
00897         best_cluster = NULL;
00898 
00899         for (c_it.mark_cycle_pt ();
00900         !c_it.cycled_list (); c_it.forward ()) {
00901           if (c_it.data ()->character () != '\0') {
00902             score = c_it.data ()->match_score (sample);
00903             if (score < best_score) {
00904               best_cluster = c_it.data ();
00905               best_score = score;
00906               best_char = c_it.data ()->character ();
00907             }
00908           }
00909         }
00910 
00911         if (best_score > tessedit_cluster_t1) {
00912           #ifndef SECURE_NAMES
00913           if (tessedit_cluster_debug)
00914             tprintf ("Sample rejected (score %f)\n", best_score);
00915           if (tessedit_demo_adaption)
00916             tprintf ("Sample rejected (score %f)\n", best_score);
00917           #endif
00918           word->reject_map[i].setrej_mm_reject ();
00919         }
00920         else {
00921           if (word->best_choice->string ()[i] == best_char) {
00922             #ifndef SECURE_NAMES
00923             if (tessedit_cluster_debug)
00924               tprintf ("Sample accepted (score %f)\n", best_score);
00925             if (tessedit_demo_adaption)
00926               tprintf ("Sample accepted (score %f)\n", best_score);
00927             #endif
00928             if (tessedit_test_adaption)
00929               word->reject_map[i].setrej_minimal_rej_accept ();
00930             else
00931               word->reject_map[i].setrej_mm_accept ();
00932           }
00933           else {
00934             #ifndef SECURE_NAMES
00935             if (tessedit_cluster_debug)
00936               tprintf ("Sample rejected (char %c, score %f)\n",
00937                 best_char, best_score);
00938             if (tessedit_demo_adaption)
00939               tprintf ("Sample rejected (char %c, score %f)\n",
00940                 best_char, best_score);
00941             #endif
00942             word->reject_map[i].setrej_mm_reject ();
00943           }
00944         }
00945 
00946         if (tessedit_demo_adaption) {
00947           if (strcmp (imagebasename.string (),
00948             tessedit_demo_file.string ()) != 0
00949             || word_number == tessedit_demo_word1
00950           || word_number == tessedit_demo_word2) {
00951 #ifndef GRAPHICS_DISABLED
00952             demo_win =
00953               display_clip_image(&copy_outword,
00954                                  page_image,
00955                                  pixrow_list,
00956                                  pix_box);
00957 #endif
00958             demo_word = word_number;
00959             best_cluster->match_score (sample);
00960             demo_word = 0;
00961           }
00962         }
00963       }
00964     }
00965     delete[]imlines;             // Free array of imlines
00966     delete pixrow_list;
00967 
00968     if (tessedit_cluster_debug) {
00969       tprintf ("\nFinal: \"%s\"  MAP ",
00970         word->best_choice->string ().string ());
00971       word->reject_map.print (debug_fp);
00972       tprintf ("\n");
00973     }
00974   }
00975 }
00976 
00977 
00985 void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
00986                     CHAR_SAMPLE_LIST *chars_waiting) {
00987   CHAR_SAMPLES_IT c_it = char_clusters;
00988 
00989   if (!tessedit_cluster_debug)
00990     return;
00991   #ifndef SECURE_NAMES
00992   tprintf ("There are %d clusters and %d samples waiting\n",
00993     char_clusters->length (), chars_waiting->length ());
00994 
00995   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
00996     c_it.data ()->print (debug_fp);
00997   #endif
00998   tprintf ("\n");
00999 }
01000 
01001 
01013 CHAR_SAMPLE *clip_sample(
01014                          PIXROW *pixrow,
01015                          IMAGELINE *imlines,
01016                          BOX pix_box,
01017                          BOOL8 white_on_black,
01018                          char c) {
01019   IMAGE *image = new (IMAGE); // unscaled char image
01020   BOX b_box = pixrow->bounding_box ();
01021   float baseline_pos = 0; // baseline ht in image
01022   INT32 resolution = page_image.get_res ();
01023 
01024   if (!b_box.null_box ()) {
01025     ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
01026       b_box.height () < page_image.get_ysize ());
01027 
01028     if (b_box.width () > resolution || b_box.height () > resolution) {
01029       tprintf ("clip sample: sample too big (%d x %d)\n",
01030         b_box.width (), b_box.height ());
01031 
01032       return NULL;
01033     }
01034 
01035     IMAGE *image = new (IMAGE);
01036     if (image->create (b_box.width (), b_box.height (), 1) == -1) {
01037       tprintf ("clip sample: create image failed (%d x %d)\n",
01038         b_box.width (), b_box.height ());
01039 
01040       delete image;
01041       return NULL;
01042     }
01043 
01044     if (!white_on_black)
01045       invert_image(image);  // Set background to white
01046     pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
01047     if (white_on_black)
01048       invert_image(image);  //invert white on black for scaling &NN
01049     return new CHAR_SAMPLE (image, c);
01050   }
01051   else
01052     return NULL;
01053 }
01054 
01055 
01056 #ifndef GRAPHICS_DISABLED
01057 
01063 void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) { 
01064   INT16 proto_number = 0;
01065   CHAR_SAMPLES_IT c_it = char_clusters;
01066   char title[WINDOWNAMESIZE];
01067 
01068   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
01069     proto_number++;
01070 
01071     #ifndef SECURE_NAMES
01072     tprintf ("Displaying proto number %d\n", proto_number);
01073     #endif
01074 
01075     if (c_it.data ()->prototype () != NULL) {
01076       sprintf (title, "Proto - %d", proto_number);
01077       display_image (c_it.data ()->prototype ()->make_image (),
01078         title, (proto_number - 1) * 400, 0, FALSE);
01079     }
01080   }
01081 }
01082 #endif
01083 
01090 void reject_all_ems(WERD_RES *word) { 
01091   INT16 i;
01092 
01093   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01094     if (word->best_choice->string ()[i] == 'm')
01095                                  // reject all ems
01096       word->reject_map[i].setrej_mm_reject ();
01097   }
01098 }
01099 
01106 void reject_all_fullstops(WERD_RES *word) { 
01107   INT16 i;
01108 
01109   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01110     if (word->best_choice->string ()[i] == '.')
01111                                  // reject all fullstops
01112       word->reject_map[i].setrej_mm_reject ();
01113   }
01114 }
01115 
01116 
01123 void reject_suspect_ems(WERD_RES *word) { 
01124   INT16 i;
01125 
01126   if (!word_adaptable (word, tessedit_cluster_adaption_mode))
01127   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01128     if (word->best_choice->string ()[i] == 'm' && suspect_em (word, i))
01129                                  // reject all ems
01130       word->reject_map[i].setrej_mm_reject ();
01131   }
01132 }
01133 
01140 void reject_suspect_fullstops(WERD_RES *word) { 
01141   INT16 i;
01142 
01143   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01144     if (word->best_choice->string ()[i] == '.'
01145       && suspect_fullstop (word, i))
01146       word->reject_map[i].setrej_mm_reject (); // reject all commas
01147   }
01148 }
01149 
01150 
01158 BOOL8 suspect_em(WERD_RES *word, INT16 index) { 
01159   PBLOB_LIST *blobs = word->outword->blob_list ();
01160   PBLOB_IT blob_it(blobs);
01161   INT16 j;
01162 
01163   for (j = 0; j < index; j++)
01164     blob_it.forward ();
01165 
01166   return (blob_it.data ()->out_list ()->length () != 1);
01167 }
01168 
01169 
01177 BOOL8 suspect_fullstop(WERD_RES *word, INT16 i) { 
01178   float aspect_ratio;
01179   PBLOB_LIST *blobs = word->outword->blob_list ();
01180   PBLOB_IT blob_it(blobs);
01181   INT16 j;
01182   BOX box;
01183   INT16 width;
01184   INT16 height;
01185 
01186   for (j = 0; j < i; j++)
01187     blob_it.forward ();
01188 
01189   box = blob_it.data ()->bounding_box ();
01190 
01191   width = box.width ();
01192   height = box.height ();
01193 
01194   aspect_ratio = ((width > height) ? ((float) width) / height :
01195   ((float) height) / width);
01196 
01197   return (aspect_ratio > tessed_fullstop_aspect_ratio);
01198 }

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1