ccmain/fixxht.cpp

Go to the documentation of this file.
00001 
00021 #include "mfcpch.h"
00022 #include          <string.h>
00023 #include          <ctype.h>
00024 #include          "varable.h"
00025 #include          "tessvars.h"
00026 #include          "control.h"
00027 #include          "reject.h"
00028 #include          "fixxht.h"
00029 #include          "secname.h"
00030 #ifdef TEXT_VERBOSE
00031 #include    "callcpp.h"
00032 #endif
00033 
00037 #define EXTERN
00038 
00039 EXTERN double_VAR (x_ht_fraction_of_caps_ht, 0.7,
00040 "Fract of cps ht est of xht");
00041 EXTERN double_VAR (x_ht_variation, 0.35,
00042 "Err band as fract of caps/xht dist");
00043 EXTERN double_VAR (x_ht_sub_variation, 0.5,
00044 "Err band as fract of caps/xht dist");
00045 EXTERN BOOL_VAR (rej_trial_ambigs, TRUE,
00046 "reject x-ht ambigs when under trial");
00047 EXTERN BOOL_VAR (x_ht_conservative_ambigs, FALSE,
00048 "Dont rely on ambigs + maxht");
00049 EXTERN BOOL_VAR (x_ht_check_est, TRUE, "Cross check estimates");
00050 EXTERN BOOL_VAR (x_ht_case_flip, FALSE, "Flip or reject suspect case");
00051 EXTERN BOOL_VAR (x_ht_include_dodgy_blobs, TRUE,
00052 "Include blobs with possible noise?");
00053 EXTERN BOOL_VAR (x_ht_limit_flip_trials, TRUE,
00054 "Dont do trial flips when ambigs are close to xht?");
00055 EXTERN BOOL_VAR (rej_use_check_block_occ, TRUE,
00056 "Analyse rejection behaviour");
00057 
00058 EXTERN STRING_VAR (chs_non_ambig_caps_ht,
00059 "!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
00060 "Reliable ascenders");
00061 EXTERN STRING_VAR (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
00062 EXTERN STRING_VAR (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
00063 EXTERN STRING_VAR (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
00064 "X ht or caps ht chars");
00065 EXTERN STRING_VAR (chs_bl_ambig_caps_x, "pPyY", " Caps or descender ambigs");
00066 
00067 /* The following arent used in this module but are used in applybox.c */
00068 EXTERN STRING_VAR (chs_caps_ht,
00069 "!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
00070 "Ascender chars");
00071 EXTERN STRING_VAR (chs_desc, "gjpqy", "Descender chars");
00072 EXTERN STRING_VAR (chs_non_ambig_bl,
00073 "!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
00074 "Reliable baseline chars");
00075 EXTERN STRING_VAR (chs_odd_top, "ijt", "Chars with funny ascender region");
00076 EXTERN STRING_VAR (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");
00077 
00078 /* The following arent used but are defined for completeness */
00079 EXTERN STRING_VAR (chs_bl,
00080 "!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
00081 "Baseline chars");
00082 EXTERN STRING_VAR (chs_non_ambig_desc, "gq", "Reliable descender chars");
00147 void re_estimate_x_ht(                     //improve for 1 word
00148                       WERD_RES *word_res,  //word to do
00149                       float *trial_x_ht    //new match value
00150                      ) {
00151   PBLOB_IT blob_it;
00152   INT16 blob_ht_above_baseline;
00153 
00154   const char *word_str;
00155   INT16 i;
00156 
00157   STATS all_blobs_ht (0, 300);   //every blob in word
00158   STATS x_ht (0, 300);           //confirmed pts in wd
00159   STATS caps_ht (0, 300);        //confirmed pts in wd
00160   STATS case_ambig (0, 300);     //lower case ambigs
00161 
00162   INT16 rej_blobs_count = 0;
00163   INT16 rej_blobs_max_height = 0;
00164   INT32 rej_blobs_max_area = 0;
00165   float x_ht_ok_variation;
00166   float max_blob_ht;
00167   float marginally_above_x_ht;
00168 
00169   BOX blob_box;                  //blob bounding box
00170   float est_x_ht = 0.0;          //word estimate
00171   float est_caps_ht = 0.0;       //word estimate
00172                                  //based on hard data?
00173   BOOL8 est_caps_ht_certain = FALSE;
00174   BOOL8 est_x_ht_certain = FALSE;//based on hard data?
00175   BOOL8 trial = FALSE;           //Sepeculative values?
00176   BOOL8 no_comment = FALSE;      //No change in xht
00177   float ambig_lc_x_est;
00178   float ambig_uc_caps_est;
00179   INT16 x_ht_ambigs = 0;
00180   INT16 caps_ht_ambigs = 0;
00181 
00182 #ifdef TEXT_VERBOSE
00183   // gets a 'h', see ccmain/tesseractmain.dox
00184   cprintf("h");
00185 #endif
00186   /* Calculate default variation of blob x_ht from bln x_ht for bln word */
00187   x_ht_ok_variation =
00188     (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
00189 
00190   word_str = word_res->best_choice->string ().string ();
00191   /*
00192     Cycle blobs, allocating to one of the stats sets when possible.
00193   */
00194   blob_it.set_to_list (word_res->outword->blob_list ());
00195   for (blob_it.mark_cycle_pt (), i = 0;
00196   !blob_it.cycled_list (); blob_it.forward (), i++) {
00197     if (!dodgy_blob (blob_it.data ())) {
00198       blob_box = blob_it.data ()->bounding_box ();
00199       blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
00200       all_blobs_ht.add (blob_ht_above_baseline, 1);
00201 
00202       if (word_res->reject_map[i].rejected ()) {
00203         rej_blobs_count++;
00204         if (blob_box.height () > rej_blobs_max_height)
00205           rej_blobs_max_height = blob_box.height ();
00206         if (blob_box.area () > rej_blobs_max_area)
00207           rej_blobs_max_area = blob_box.area ();
00208       }
00209       else {
00210         if (STRING (chs_non_ambig_x_ht).contains (word_str[i]))
00211           x_ht.add (blob_ht_above_baseline, 1);
00212 
00213         if (STRING (chs_non_ambig_caps_ht).contains (word_str[i]))
00214           caps_ht.add (blob_ht_above_baseline, 1);
00215 
00216         if (STRING (chs_ambig_caps_x).contains (word_str[i])) {
00217           case_ambig.add (blob_ht_above_baseline, 1);
00218           if (STRING (chs_x_ht).contains (word_str[i]))
00219             x_ht_ambigs++;
00220           else
00221             caps_ht_ambigs++;
00222         }
00223 
00224         if (STRING (chs_bl_ambig_caps_x).contains (word_str[i])) {
00225           if (STRING (chs_x_ht).contains (word_str[i])) {
00226             /* confirm x_height provided > 15% total height below baseline */
00227             if ((bln_baseline_offset - blob_box.bottom ()) /
00228               (float) blob_box.height () > 0.15)
00229               x_ht.add (blob_ht_above_baseline, 1);
00230           }
00231           else {
00232             /* confirm caps_height provided < 5% total height below baseline */
00233             if ((bln_baseline_offset - blob_box.bottom ()) /
00234               (float) blob_box.height () < 0.05)
00235               caps_ht.add (blob_ht_above_baseline, 1);
00236           }
00237         }
00238       }
00239     }
00240   }
00241   est_caps_ht = estimate_from_stats (caps_ht);
00242   est_x_ht = estimate_from_stats (x_ht);
00243   est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est); 
00244   max_blob_ht = all_blobs_ht.ile (0.9999);
00245 
00246   #ifndef SECURE_NAMES
00247   if (debug_x_ht_level >= 20) {
00248     tprintf ("Mode20:A: %s ", word_str);
00249     word_res->reject_map.print (debug_fp);
00250     tprintf (" XHT:%f CAP:%f MAX:%f AMBIG X:%f CAP:%f\n",
00251       est_x_ht, est_caps_ht, max_blob_ht,
00252       ambig_lc_x_est, ambig_uc_caps_est);
00253   }
00254   #endif
00255   if (!x_ht_conservative_ambigs &&
00256     (ambig_lc_x_est > 0) &&
00257     (ambig_lc_x_est == ambig_uc_caps_est) &&
00258   (max_blob_ht > ambig_lc_x_est + x_ht_ok_variation)) {
00259                                  //may be zero but believe xht
00260     ambig_uc_caps_est = est_caps_ht;
00261     #ifndef SECURE_NAMES
00262     if (debug_x_ht_level >= 20)
00263       tprintf ("Mode20:B: Fiddle ambig_uc_caps_est to %f\n",
00264         ambig_lc_x_est);
00265     #endif
00266   }
00267 
00268   /* Now make some estimates */
00269 
00270   if ((est_x_ht > 0) ||
00271     (est_caps_ht > 0) ||
00272   ((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
00273     /* There is some sensible data to go on so make the most of it. */
00274     #ifndef SECURE_NAMES
00275     if (debug_x_ht_level >= 20)
00276       tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);
00277     #endif
00278     if (est_x_ht > 0) {
00279       est_x_ht_certain = TRUE;
00280       if (est_caps_ht == 0) {
00281         if ((ambig_uc_caps_est > ambig_lc_x_est) &&
00282           (ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
00283           est_caps_ht = ambig_uc_caps_est;
00284         else
00285           est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
00286       }
00287       if (case_ambig.get_total () > 0)
00288         improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); 
00289       est_caps_ht_certain = caps_ht.get_total () > 0;
00290       #ifndef SECURE_NAMES
00291       if (debug_x_ht_level >= 20)
00292         tprintf ("Mode20:D: Est from xht XHT:%f CAP:%f\n",
00293           est_x_ht, est_caps_ht);
00294       #endif
00295     }
00296     else if (est_caps_ht > 0) {
00297       est_caps_ht_certain = TRUE;
00298       if ((ambig_lc_x_est > 0) &&
00299         (ambig_lc_x_est < est_caps_ht - x_ht_ok_variation))
00300         est_x_ht = ambig_lc_x_est;
00301       else
00302         est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht;
00303       if (ambig_lc_x_est + ambig_uc_caps_est > 0)
00304         improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht); 
00305       est_x_ht_certain = x_ht.get_total () > 0;
00306       #ifndef SECURE_NAMES
00307       if (debug_x_ht_level >= 20)
00308         tprintf ("Mode20:E: Est from caps XHT:%f CAP:%f\n",
00309           est_x_ht, est_caps_ht);
00310       #endif
00311     }
00312     else {
00313       /*
00314      Do something based on case ambig chars alone - we have
00315      guessed that the ambigs are lower case.
00316      */
00317       est_x_ht = ambig_lc_x_est;
00318       est_x_ht_certain = TRUE;
00319       if (ambig_uc_caps_est > ambig_lc_x_est) {
00320         est_caps_ht = ambig_uc_caps_est;
00321         est_caps_ht_certain = TRUE;
00322       }
00323       else
00324         est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
00325 
00326       #ifndef SECURE_NAMES
00327       if (debug_x_ht_level >= 20)
00328         tprintf ("Mode20:F: Est from ambigs XHT:%f CAP:%f\n",
00329           est_x_ht, est_caps_ht);
00330       #endif
00331     }
00332     /*
00333      Check for sane interpretation of evidence:
00334       Try shifting caps ht if min certain caps ht is not significantly greater
00335       than the estimated x ht or the max certain x ht is not significantly less
00336       than the estimated caps ht.
00337     */
00338     if (x_ht_check_est) {
00339       if ((caps_ht.get_total () > 0) &&
00340       (est_x_ht + x_ht_ok_variation >= caps_ht.ile (0.0001))) {
00341         trial = TRUE;
00342         est_caps_ht = est_x_ht;
00343         est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
00344 
00345         #ifndef SECURE_NAMES
00346         if (debug_x_ht_level >= 20)
00347           tprintf ("Mode20:G: Trial XHT:%f CAP:%f\n",
00348             est_x_ht, est_caps_ht);
00349         #endif
00350       }
00351       else if ((x_ht.get_total () > 0) &&
00352       (est_caps_ht - x_ht_ok_variation <= x_ht.ile (0.9999))) {
00353         trial = TRUE;
00354         est_x_ht = est_caps_ht;
00355         est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
00356         #ifndef SECURE_NAMES
00357         if (debug_x_ht_level >= 20)
00358           tprintf ("Mode20:H: Trial XHT:%f CAP:%f\n",
00359             est_x_ht, est_caps_ht);
00360         #endif
00361       }
00362     }
00363   }
00364 
00365   else {
00366     /* There is no sensible data so we're in the dark. */
00367     marginally_above_x_ht = bln_x_height +
00368       x_ht_ok_variation * x_ht_sub_variation;
00369     /*
00370      If there are no rejects, or the only rejects have a narrow height,
00371      or have a small area compared to a normal char, then estimate the x-height
00372      as the original one. (I.e dont fiddle about if the only rejects look like
00373       punctuation) - we use max height as mean or median will be too low if
00374       there are only two blobs - Eg "F."
00375     */
00376 
00377     #ifndef SECURE_NAMES
00378     if (debug_x_ht_level >= 20)
00379       tprintf ("Mode20:I: In the dark\n");
00380     #endif
00381 
00382     if ((rej_blobs_count == 0) ||
00383       (rej_blobs_max_height < 0.3 * max_blob_ht) ||
00384     (rej_blobs_max_area < 0.3 * max_blob_ht * max_blob_ht)) {
00385       no_comment = TRUE;
00386       #ifndef SECURE_NAMES
00387       if (debug_x_ht_level >= 20)
00388         tprintf ("Mode20:J: No comment due to no rejects\n");
00389      #endif
00390     }
00391     else if (x_ht_limit_flip_trials &&
00392       ((max_blob_ht < marginally_above_x_ht) ||
00393       ((ambig_lc_x_est > 0) &&
00394       (ambig_lc_x_est == ambig_uc_caps_est) &&
00395     (ambig_lc_x_est < marginally_above_x_ht)))) {
00396       no_comment = TRUE;
00397       #ifndef SECURE_NAMES
00398       if (debug_x_ht_level >= 20)
00399         tprintf ("Mode20:K: No comment as close to xht %f < %f\n",
00400           ambig_lc_x_est, marginally_above_x_ht);
00401      #endif
00402     }
00403     else if (x_ht_conservative_ambigs && (ambig_uc_caps_est > 0)) {
00404       trial = TRUE;
00405       est_caps_ht = ambig_lc_x_est;
00406       est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;
00407 
00408       #ifndef SECURE_NAMES
00409       if (debug_x_ht_level >= 20)
00410         tprintf ("Mode20:L: Trial XHT:%f CAP:%f\n",
00411           est_x_ht, est_caps_ht);
00412       #endif
00413     }
00414     /*
00415    If the top of the word is nowhere near where we expect ascenders to be
00416      \(less than half the x_ht \-> caps_ht distance\)
00417      
00418      That is, suspect an all caps word at the x-ht & estimate x-ht
00419      accordingly, but only as a TRIAL!
00420 
00421       NOTE we do NOT check location of baseline. Commas can descend as much as
00422       real descenders so we would need to do something to make sure that any
00423       disqualifying descenders were not at the end.
00424     */
00425     else {
00426       if (max_blob_ht <
00427       (bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
00428         trial = TRUE;
00429         est_x_ht = x_ht_fraction_of_caps_ht * max_blob_ht;
00430         est_caps_ht = max_blob_ht;
00431 
00432         #ifndef SECURE_NAMES
00433         if (debug_x_ht_level >= 20)
00434           tprintf ("Mode20:M: Trial XHT:%f CAP:%f\n",
00435             est_x_ht, est_caps_ht);
00436         #endif
00437       }
00438       else {
00439         no_comment = TRUE;
00440         if (debug_x_ht_level >= 20)
00441           tprintf ("Mode20:N: No comment as nothing else matched\n");
00442       }
00443     }
00444   }
00445 
00446   /* Sanity check - reject word if fails */
00447   if (!no_comment &&
00448     ((est_x_ht > 2 * bln_x_height) ||
00449     (est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
00450   (est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
00451     no_comment = TRUE;
00452     if (!trial && rej_use_xht) {
00453       if (debug_x_ht_level >= 2) {
00454         tprintf ("Sanity check rejecting %s ", word_str);
00455         word_res->reject_map.print (debug_fp);
00456         tprintf ("\n");
00457       }
00458       word_res->reject_map.rej_word_xht_fixup ();
00459 
00460     }
00461     if (debug_x_ht_level >= 20)
00462       tprintf ("Mode20:O: No comment as nothing else matched\n");
00463   }
00464 
00465   if (no_comment || trial) {
00466     word_res->x_height = bln_x_height / word_res->denorm.scale ();
00467     word_res->guessed_x_ht = TRUE;
00468     word_res->caps_height = (bln_x_height / x_ht_fraction_of_caps_ht) /
00469       word_res->denorm.scale ();
00470     word_res->guessed_caps_ht = TRUE;
00471     /*
00472        Reject ambigs in the current word if we are uncertain and:
00473        \li there are rejects OR
00474        \li there is only one char which is an ambig OR
00475        \li there is conflict between the case of the ambigs even though there is
00476        \li no height separation Eg "Ms" recognised from "MS"
00477     */
00478     if (rej_trial_ambigs &&
00479       ((word_res->reject_map.reject_count () > 0) ||
00480       (word_res->reject_map.length () == 1) ||
00481     ((x_ht_ambigs > 0) && (caps_ht_ambigs > 0)))) {
00482       #ifndef SECURE_NAMES
00483       if (debug_x_ht_level >= 2) {
00484         tprintf ("TRIAL Rej Ambigs %s ", word_str);
00485         word_res->reject_map.print (debug_fp);
00486       }
00487       #endif
00488       reject_ambigs(word_res); 
00489       if (debug_x_ht_level >= 2) {
00490         tprintf (" ");
00491         word_res->reject_map.print (debug_fp);
00492         tprintf ("\n");
00493       }
00494     }
00495   }
00496   else {
00497     word_res->x_height = est_x_ht / word_res->denorm.scale ();
00498     word_res->guessed_x_ht = !est_x_ht_certain;
00499     word_res->caps_height = est_caps_ht / word_res->denorm.scale ();
00500     word_res->guessed_caps_ht = !est_caps_ht_certain;
00501   }
00502 
00503   if (!no_comment && (fabs (est_x_ht - bln_x_height) > x_ht_ok_variation))
00504     *trial_x_ht = est_x_ht / word_res->denorm.scale ();
00505   else
00506     *trial_x_ht = 0.0;
00507 
00508   #ifndef SECURE_NAMES
00509   if (((*trial_x_ht > 0) && (debug_x_ht_level >= 3)) ||
00510   (debug_x_ht_level >= 5)) {
00511     tprintf ("%s ", word_str);
00512     word_res->reject_map.print (debug_fp);
00513     tprintf
00514       (" X:%0.2f Cps:%0.2f Mxht:%0.2f RJ MxHt:%d MxAr:%d Rematch:%c\n",
00515       est_x_ht, est_caps_ht, max_blob_ht, rej_blobs_max_height,
00516       rej_blobs_max_area, *trial_x_ht > 0 ? '*' : ' ');
00517   }
00518   #endif
00519 
00520 }
00521 
00526 void check_block_occ(WERD_RES *word_res) { 
00527   PBLOB_IT blob_it;
00528   STRING new_string;
00529   REJMAP new_map = word_res->reject_map;
00530   WERD_CHOICE *new_choice;
00531 
00532   const char *word_str = word_res->best_choice->string ().string ();
00533   INT16 i;
00534   INT16 reject_count = 0;
00535   char confirmed_char;
00536   float x_ht;
00537   float caps_ht;
00538 
00539   if (word_res->x_height > 0)
00540     x_ht = word_res->x_height * word_res->denorm.scale ();
00541   else
00542     x_ht = bln_x_height;
00543 
00544   if (word_res->caps_height > 0)
00545     caps_ht = word_res->caps_height * word_res->denorm.scale ();
00546   else
00547     caps_ht = x_ht / x_ht_fraction_of_caps_ht;
00548 
00549   blob_it.set_to_list (word_res->outword->blob_list ());
00550 
00551   for (blob_it.mark_cycle_pt (), i = 0;
00552   !blob_it.cycled_list (); blob_it.forward (), i++) {
00553     new_string += word_str[i];   //default copy
00554     if (word_res->reject_map[i].accepted ()) {
00555       confirmed_char = check_blob_occ (word_str[i],
00556         blob_it.data ()->bounding_box ().
00557         top () - bln_baseline_offset, x_ht,
00558         caps_ht);
00559 
00560       if (confirmed_char == '\0') {
00561         if (rej_use_check_block_occ) {
00562           new_map[i].setrej_xht_fixup ();
00563           reject_count++;
00564         }
00565       }
00566       else
00567         new_string[i] = confirmed_char;
00568     }
00569   }
00570   if ((reject_count > 0) || (new_string != word_str)) {
00571     if (debug_x_ht_level >= 2) {
00572       tprintf ("Shape Verification: %s ", word_str);
00573       word_res->reject_map.print (debug_fp);
00574       tprintf (" -> %s ", new_string.string ());
00575       new_map.print (debug_fp);
00576       tprintf ("\n");
00577     }
00578     new_choice = new WERD_CHOICE (new_string.string (),
00579       word_res->best_choice->rating (),
00580       word_res->best_choice->certainty (),
00581       word_res->best_choice->permuter ());
00582     delete word_res->best_choice;
00583     word_res->best_choice = new_choice;
00584     word_res->reject_map = new_map;
00585   }
00586 }
00587 
00588 
00594 char check_blob_occ(char proposed_char,
00595                     INT16 blob_ht_above_baseline,
00596                     float x_ht,
00597                     float caps_ht) {
00598   BOOL8 blob_definite_x_ht;
00599   BOOL8 blob_definite_caps_ht;
00600   float acceptable_variation;
00601 
00602   acceptable_variation = (caps_ht - x_ht) * x_ht_variation;
00603   /*
00604   REJECT if:
00605   \li expected descender and nothing significantly below BL
00606   \li expected ascender and nothing significantly above x-ht 
00607   */
00608 
00609   /*
00610     IF AMBIG_CAPS_X_CHS
00611       IF blob is definitely an ascender ( > xht + xht err )AND
00612         char is an x-ht char
00613       THEN
00614         flip case
00615       IF blob is defintiely an x-ht ( <= xht + xht err ) AND
00616         char is an ascender char
00617       THEN
00618         flip case
00619   */
00620   blob_definite_x_ht = blob_ht_above_baseline <= x_ht + acceptable_variation;
00621   blob_definite_caps_ht = blob_ht_above_baseline >=
00622     caps_ht - acceptable_variation;
00623 
00624   if (STRING (chs_ambig_caps_x).contains (proposed_char)) {
00625     if ((!blob_definite_x_ht && !blob_definite_caps_ht) ||
00626       (proposed_char == '0' && !blob_definite_caps_ht) ||
00627       (proposed_char == 'o' && !blob_definite_x_ht))
00628       return '\0';
00629 
00630     else if (blob_definite_caps_ht &&
00631     STRING (chs_x_ht).contains (proposed_char)) {
00632       if (x_ht_case_flip)
00633                                  //flip to upper case
00634         return (char) toupper (proposed_char);
00635       else
00636         return '\0';
00637     }
00638 
00639     else if (blob_definite_x_ht &&
00640     !STRING (chs_x_ht).contains (proposed_char)) {
00641       if (x_ht_case_flip)
00642                                  //flip to lower case
00643         return (char) tolower (proposed_char);
00644       else
00645         return '\0';
00646     }
00647   }
00648   else
00649   if ((STRING (chs_non_ambig_x_ht).contains (proposed_char)
00650     && !blob_definite_x_ht)
00651     || (STRING (chs_non_ambig_caps_ht).contains (proposed_char)
00652     && !blob_definite_caps_ht))
00653     return '\0';
00654   return proposed_char;
00655 }
00656 
00657 
00663 float estimate_from_stats(STATS &stats) { 
00664   if (stats.get_total () <= 0)
00665     return 0.0;
00666   else if (stats.get_total () >= 3)
00667     return stats.ile (0.5);      //median
00668   else
00669     return stats.mean ();
00670 }
00671 
00672 
00679 void improve_estimate(WERD_RES *word_res,
00680                       float &est_x_ht,
00681                       float &est_caps_ht,
00682                       STATS &x_ht,
00683                       STATS &caps_ht) {
00684   PBLOB_IT blob_it;
00685   INT16 blob_ht_above_baseline;
00686 
00687   const char *word_str;
00688   INT16 i;
00689   BOX blob_box;                  //blob bounding box
00690   char confirmed_char;
00691   float new_val;
00692 
00693   blob_it.set_to_list (word_res->outword->blob_list ());
00694   word_str = word_res->best_choice->string ().string ();
00695   for (blob_it.mark_cycle_pt (), i = 0;
00696   !blob_it.cycled_list (); blob_it.forward (), i++) {
00697     if ((STRING (chs_ambig_caps_x).contains (word_str[i])) &&
00698     (!dodgy_blob (blob_it.data ()))) {
00699       blob_box = blob_it.data ()->bounding_box ();
00700       blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
00701       confirmed_char = check_blob_occ (word_str[i],
00702         blob_ht_above_baseline,
00703         est_x_ht, est_caps_ht);
00704       if (confirmed_char != '\0')
00705         if (STRING (chs_x_ht).contains (confirmed_char))
00706           x_ht.add (blob_ht_above_baseline, 1);
00707       else
00708         caps_ht.add (blob_ht_above_baseline, 1);
00709     }
00710   }
00711   new_val = estimate_from_stats (x_ht);
00712   if (new_val > 0)
00713     est_x_ht = new_val;
00714   new_val = estimate_from_stats (caps_ht);
00715   if (new_val > 0)
00716     est_caps_ht = new_val;
00717 }
00718 
00722 void reject_ambigs(
00723                    WERD_RES *word) {
00724   const char *word_str;
00725   int i = 0;
00726 
00727   word_str = word->best_choice->string ().string ();
00728   while (*word_str != '\0') {
00729     if (STRING (chs_ambig_caps_x).contains (*word_str))
00730       word->reject_map[i].setrej_xht_fixup ();
00731     word_str++;
00732     i++;
00733   }
00734 }
00735 
00736 
00740 void est_ambigs(
00741                 WERD_RES *word_res,
00742                 STATS &stats,
00743                 float *ambig_lc_x_est,    //xht est
00744                 float *ambig_uc_caps_est  //caps est
00745                ) {
00746   float x_ht_ok_variation;
00747   STATS short_ambigs (0, 300);
00748   STATS tall_ambigs (0, 300);
00749   PBLOB_IT blob_it;
00750   BOX blob_box;                  //blob bounding box
00751   INT16 blob_ht_above_baseline;
00752 
00753   const char *word_str;
00754   INT16 i;
00755   float min;                     //min ambig ch ht
00756   float max;                     //max ambig ch ht
00757   float short_limit;             // for lower case
00758   float tall_limit;              // for upper case
00759 
00760   x_ht_ok_variation =
00761     (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;
00762 
00763   if (stats.get_total () == 0) {
00764     *ambig_lc_x_est = 0;
00765     *ambig_uc_caps_est = 0;
00766   }
00767   else {
00768     min = stats.ile (0.0);
00769     max = stats.ile (0.99999);
00770     if ((max - min) < x_ht_ok_variation) {
00771       *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
00772       //close enough
00773     }
00774     else {
00775     /* Try reclustering into lower and upper case chars */
00776       short_limit = min + (max - min) * x_ht_variation;
00777       tall_limit = max - (max - min) * x_ht_variation;
00778       word_str = word_res->best_choice->string ().string ();
00779       blob_it.set_to_list (word_res->outword->blob_list ());
00780       for (blob_it.mark_cycle_pt (), i = 0;
00781       !blob_it.cycled_list (); blob_it.forward (), i++) {
00782         if (word_res->reject_map[i].accepted () &&
00783           STRING (chs_ambig_caps_x).contains (word_str[i]) &&
00784         (!dodgy_blob (blob_it.data ()))) {
00785           blob_box = blob_it.data ()->bounding_box ();
00786           blob_ht_above_baseline =
00787             blob_box.top () - bln_baseline_offset;
00788           if (blob_ht_above_baseline <= short_limit)
00789             short_ambigs.add (blob_ht_above_baseline, 1);
00790           else if (blob_ht_above_baseline >= tall_limit)
00791             tall_ambigs.add (blob_ht_above_baseline, 1);
00792         }
00793       }
00794       *ambig_lc_x_est = short_ambigs.mean ();
00795       *ambig_uc_caps_est = tall_ambigs.mean ();
00796       /* Cop out if we havent got sensible clusters. */
00797       if (*ambig_uc_caps_est - *ambig_lc_x_est <= x_ht_ok_variation)
00798         *ambig_lc_x_est = *ambig_uc_caps_est = stats.mean ();
00799       //close enough
00800     }
00801   }
00802 }
00803 
00804 
00814 BOOL8 dodgy_blob(PBLOB *blob) { 
00815   OUTLINE_IT outline_it = blob->out_list ();
00816   INT16 highest_bottom = -MAX_INT16;
00817   INT16 lowest_top = MAX_INT16;
00818   BOX outline_box;
00819 
00820   if (x_ht_include_dodgy_blobs)
00821     return FALSE;                //no blob is ever dodgy
00822   for (outline_it.mark_cycle_pt ();
00823   !outline_it.cycled_list (); outline_it.forward ()) {
00824     outline_box = outline_it.data ()->bounding_box ();
00825     if (lowest_top > outline_box.top ())
00826       lowest_top = outline_box.top ();
00827     if (highest_bottom < outline_box.bottom ())
00828       highest_bottom = outline_box.bottom ();
00829   }
00830   return highest_bottom >= lowest_top;
00831 }

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1