ccmain/fixspace.cpp

Go to the documentation of this file.
00001 
00023 #include "mfcpch.h"
00024 #include          <ctype.h>
00025 #include          "reject.h"
00026 #include          "statistc.h"
00027 #include          "genblob.h"
00028 #include          "control.h"
00029 #include          "fixspace.h"
00030 #include          "tessvars.h"
00031 #include          "tessbox.h"
00032 #include          "secname.h"
00033 #ifdef TEXT_VERBOSE
00034 #include    "callcpp.h"
00035 #endif
00036 
00037 
// Tunable parameters used by the fix-space code in this file.  Each *_VAR
// invocation supplies a variable name, a default value and a description
// string; the resulting names are read like ordinary values further down
// (e.g. "if (fixsp_prefer_joined_1s)") and debug_fix_space_level is also
// written through set_value().
// NOTE(review): EXTERN is defined as empty here, presumably so that these
// lines act as the defining instances of variables declared in a header --
// confirm against fixspace.h.
00041 #define EXTERN
00042 
// --- Controls for the fixed-pitch "noise blob to space" repair ---
00043 EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
00044 "Try turning noise to space in fixed pitch");
00045 EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
00046 EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
00047 EXTERN INT_VAR (fixsp_non_noise_limit, 1,
00048 "How many non-noise blbs either side?");
// Multiplied by bln_x_height to obtain the "small outline" threshold.
00049 EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
00050 
// --- Controls for scoring candidate word spacings ---
00051 EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
00052 EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
00053 EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
00054 EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
00055 "Limit context word spacing");
00056 EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
00057 "Reward punctation joins");
// Modes tested in fixspace_thinks_word_done(): >0 enables the extra test,
// 2 additionally accepts words with no rejected characters, 3 accepts any.
00058 EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
00059 EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
00060 EXTERN STRING_VAR (numeric_punctuation, ".,",
00061 "Punct. chs expected WITHIN numbers");
// Sentinel score returned by eval_word_spacing() when every counted word is
// considered done; the permutation searches stop as soon as they see it.
00064 #define PERFECT_WERDS   999
00065 
// Upper bound handed to the STATS gap histogram in uniformly_spaced().
00066 #define MAXSPACING      128
00067 
00079 void fix_fuzzy_spaces(
00080                       volatile ETEXT_DESC *monitor,
00081                       INT32 word_count,
00082                       PAGE_RES *page_res) {
00083   BLOCK_RES_IT block_res_it;     //iterators
00084   ROW_RES_IT row_res_it;
00085   WERD_RES_IT word_res_it_from;
00086   WERD_RES_IT word_res_it_to;
00087   WERD_RES *word_res;
00088   WERD_RES_LIST fuzzy_space_words;
00089   INT16 new_length;
00090   BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
00091   INT32 word_index;              //current word
00092 
00093   block_res_it.set_to_list (&page_res->block_res_list);
00094   word_index = 0;
00095   for (block_res_it.mark_cycle_pt ();
00096   !block_res_it.cycled_list (); block_res_it.forward ()) {
00097     row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
00098     for (row_res_it.mark_cycle_pt ();
00099     !row_res_it.cycled_list (); row_res_it.forward ()) {
00100       word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
00101       while (!word_res_it_from.at_last ()) {
00102         word_res = word_res_it_from.data ();
00103         while (!word_res_it_from.at_last () &&
00104           !(word_res->combination ||
00105           word_res_it_from.data_relative (1)->
00106           word->flag (W_FUZZY_NON) ||
00107           word_res_it_from.data_relative (1)->
00108         word->flag (W_FUZZY_SP))) {
00109           fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00110           word_res = word_res_it_from.forward ();
00111           word_index++;
00112           if (monitor != NULL) {
00113             monitor->ocr_alive = TRUE;
00114             monitor->progress = 90 + 5 * word_index / word_count;
00115           }
00116         }
00117 
00118         if (!word_res_it_from.at_last ()) {
00119           word_res_it_to = word_res_it_from;
00120           prevent_null_wd_fixsp =
00121             word_res->word->gblob_list ()->empty ();
00122           if (check_debug_pt (word_res, 60))
00123             debug_fix_space_level.set_value (10);
00124           word_res_it_to.forward ();
00125           word_index++;
00126           if (monitor != NULL) {
00127             monitor->ocr_alive = TRUE;
00128             monitor->progress = 90 + 5 * word_index / word_count;
00129           }
00130           while (!word_res_it_to.at_last () &&
00131             (word_res_it_to.data_relative (1)->
00132             word->flag (W_FUZZY_NON) ||
00133             word_res_it_to.data_relative (1)->
00134           word->flag (W_FUZZY_SP))) {
00135             if (check_debug_pt (word_res, 60))
00136               debug_fix_space_level.set_value (10);
00137             if (word_res->word->gblob_list ()->empty ())
00138               prevent_null_wd_fixsp = TRUE;
00139             word_res = word_res_it_to.forward ();
00140           }
00141           if (check_debug_pt (word_res, 60))
00142             debug_fix_space_level.set_value (10);
00143           if (word_res->word->gblob_list ()->empty ())
00144             prevent_null_wd_fixsp = TRUE;
00145           if (prevent_null_wd_fixsp)
00146             word_res_it_from = word_res_it_to;
00147           else {
00148             fuzzy_space_words.assign_to_sublist (&word_res_it_from,
00149               &word_res_it_to);
00150             fix_fuzzy_space_list (fuzzy_space_words,
00151               row_res_it.data ()->row);
00152             new_length = fuzzy_space_words.length ();
00153             word_res_it_from.add_list_before (&fuzzy_space_words);
00154             for (;
00155               (!word_res_it_from.at_last () &&
00156             (new_length > 0)); new_length--) {
00157               word_res_it_from.forward ();
00158             }
00159           }
00160           if (test_pt)
00161             debug_fix_space_level.set_value (0);
00162         }
00163         fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00164         //Last word in row
00165       }
00166     }
00167   }
00168 }
00169 
00170 
00178 void fix_fuzzy_space_list(
00179                           WERD_RES_LIST &best_perm,
00180                           ROW *row) {
00181   INT16 best_score;
00182   WERD_RES_LIST current_perm;
00183   INT16 current_score;
00184   BOOL8 improved = FALSE;
00185 
00186 
00187   best_score = eval_word_spacing (best_perm); //default score
00188 
00189   dump_words (best_perm, best_score, 1, improved);
00190 
00191   if (best_score != PERFECT_WERDS)
00192     initialise_search(best_perm, current_perm); 
00193 
00194   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00195     match_current_words(current_perm, row); 
00196     current_score = eval_word_spacing (current_perm);
00197     dump_words (current_perm, current_score, 2, improved);
00198     if (current_score > best_score) {
00199       best_perm.clear ();
00200       best_perm.deep_copy (&current_perm);
00201       best_score = current_score;
00202       improved = TRUE;
00203     }
00204     if (current_score < PERFECT_WERDS)
00205       transform_to_next_perm(current_perm); 
00206   }
00207   dump_words (best_perm, best_score, 3, improved);
00208 }
00209 
00210 
00218 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) { 
00219   WERD_RES_IT src_it(&src_list); 
00220   WERD_RES_IT new_it(&new_list); 
00221   WERD_RES *src_wd;
00222   WERD_RES *new_wd;
00223 
00224   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00225     src_wd = src_it.data ();
00226     if (!src_wd->combination) {
00227       new_wd = new WERD_RES (*src_wd);
00228       new_wd->combination = FALSE;
00229       new_wd->part_of_combo = FALSE;
00230       new_it.add_after_then_move (new_wd);
00231     }
00232   }
00233 }
00234 
00235 
00244 void match_current_words(WERD_RES_LIST &words, ROW *row) { 
00245   WERD_RES_IT word_it(&words); 
00246   WERD_RES *word;
00247 
00248   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00249     word = word_it.data ();
00250     if ((!word->part_of_combo) && (word->outword == NULL))
00251       classify_word_pass2(word, row); 
00252   }
00253 }
00254 
00255 
00285 INT16 eval_word_spacing(WERD_RES_LIST &word_res_list) { 
00286   WERD_RES_IT word_res_it(&word_res_list); 
00287   INT16 total_score = 0;
00288   INT16 word_count = 0;
00289   INT16 done_word_count = 0;
00290   INT16 word_len;
00291   INT16 i;
00292   WERD_RES *word;                //current word
00293   INT16 prev_word_score = 0;
00294   BOOL8 prev_word_done = FALSE;
00295   BOOL8 prev_char_1 = FALSE;     //prev ch a "1/I/l"?
00296   BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
00297   BOOL8 current_char_1 = FALSE;
00298   BOOL8 current_word_ok_so_far;
00299   STRING punct_chars = "!\"`',.:;";
00300   BOOL8 prev_char_punct = FALSE;
00301   BOOL8 current_char_punct = FALSE;
00302   BOOL8 word_done = FALSE;
00303 
00304 #ifdef TEXT_VERBOSE
00305   // gets a 'z', see ccmain/tesseractmain.dox
00306   cprintf("z");
00307 #endif
00308   do {
00309     word = word_res_it.data ();
00310     word_done = fixspace_thinks_word_done (word);
00311     word_count++;
00312     if (word->tess_failed) {
00313       total_score += prev_word_score;
00314       if (prev_word_done)
00315         done_word_count++;
00316       prev_word_score = 0;
00317       prev_char_1 = FALSE;
00318       prev_char_digit = FALSE;
00319       prev_word_done = FALSE;
00320     }
00321     else {
00327       word_len = word->reject_map.length ();
00328       current_word_ok_so_far = FALSE;
00329       if (!((prev_char_1 &&
00330         digit_or_numeric_punct (word,
00331         word->best_choice->string ()[0])) ||
00332         (prev_char_digit &&
00333         ((word_done &&
00334         (word->best_choice->string ()[0] == '1')) ||
00335         (!word_done &&
00336         STRING (conflict_set_I_l_1).contains (word->best_choice->
00337            string ()[0])))))) {
00338         total_score += prev_word_score;
00339         if (prev_word_done)
00340           done_word_count++;
00341         current_word_ok_so_far = word_done;
00342       }
00343 
00344       if ((current_word_ok_so_far) &&
00345         (!tessedit_test_uniform_wd_spacing ||
00346         ((word->best_choice->permuter ()==NUMBER_PERM)||uniformly_spaced (word)))) {
00347         prev_word_done = TRUE;
00348         prev_word_score = word_len;
00349       }
00350       else {
00351         prev_word_done = FALSE;
00352         prev_word_score = 0;
00353       }
00354 
00355       if (fixsp_prefer_joined_1s) {
00356         /* Add 1 to total score for every joined 1 regardless of context and rejtn */
00357 
00358         for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
00359           current_char_1 = word->best_choice->string ()[i] == '1';
00360           if (prev_char_1 || (current_char_1 && (i > 0)))
00361             total_score++;
00362           prev_char_1 = current_char_1;
00363         }
00364       }
00365 
00366       /* Add 1 to total score for every joined punctuation regardless of context
00367         and rejtn */
00368       if (tessedit_prefer_joined_punct) {
00369         for (i = 0, prev_char_punct = FALSE; i < word_len; i++) {
00370           current_char_punct =
00371             punct_chars.contains (word->best_choice->string ()[i]);
00372           if (prev_char_punct || (current_char_punct && (i > 0)))
00373             total_score++;
00374           prev_char_punct = current_char_punct;
00375         }
00376       }
00377       prev_char_digit = digit_or_numeric_punct (word,
00378         word->best_choice->
00379         string ()[word_len - 1]);
00380       prev_char_1 =
00381         ((word_done
00382         && (word->best_choice->string ()[word_len - 1] == '1'))
00383         || (!word_done
00384         && STRING (conflict_set_I_l_1).contains (word->best_choice->
00385         string ()[word_len -
00386         1])));
00387     }
00388     /* Find next word */
00389     do
00390     word_res_it.forward ();
00391     while (word_res_it.data ()->part_of_combo);
00392   }
00393   while (!word_res_it.at_first ());
00394   total_score += prev_word_score;
00395   if (prev_word_done)
00396     done_word_count++;
00397   if (done_word_count == word_count)
00398     return PERFECT_WERDS;
00399   else
00400     return total_score;
00401 }
00402 
00403 
00411 BOOL8 digit_or_numeric_punct(WERD_RES *word, char ch) { 
00412   return (isdigit (ch) ||
00413     (fixsp_numeric_fix &&
00414     (word->best_choice->permuter () == NUMBER_PERM) &&
00415     STRING (numeric_punctuation).contains (ch)));
00416 }
00417 
00418 
// Advance a word permutation to the next, more joined-up, arrangement.
//
// Pass 1 measures the horizontal gap (left edge of a word minus right edge
// of the previous one) between successive entries that are not marked
// part_of_combo and records the smallest.  Pass 2 joins every neighbouring
// pair whose gap is <= that minimum into a combination word.
//
// If no gap could be measured (min_gap still MAX_INT16) the list is cleared;
// the search loops in this file stop when the permutation list is empty.
//
// words: the permutation to transform, modified in place.
00433 void transform_to_next_perm(WERD_RES_LIST &words) { 
00434   WERD_RES_IT word_it(&words); 
00435   WERD_RES_IT prev_word_it(&words); 
00436   WERD_RES *word;
00437   WERD_RES *prev_word;
00438   WERD_RES *combo;
00439   WERD *copy_word;
00440   INT16 prev_right = -1;
00441   BOX box;
00442   INT16 gap;
00443   INT16 min_gap = MAX_INT16;
00444 
// Pass 1: find the smallest gap between adjacent visible words.
// prev_right < 0 means "no previous word seen yet".
00445   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00446     word = word_it.data ();
00447     if (!word->part_of_combo) {
00448       box = word->word->bounding_box ();
00449       if (prev_right >= 0) {
00450         gap = box.left () - prev_right;
00451         if (gap < min_gap)
00452           min_gap = gap;
00453       }
00454       prev_right = box.right ();
00455     }
00456   }
00457   if (min_gap < MAX_INT16) {
// Pass 2: join across every gap that is no wider than the minimum.
00458     prev_right = -1;  //back to start
00459     word_it.set_to_list (&words);
// The loop runs until the iterator is back at the first element, but the
// (prev_right < 0) term lets it get started while still on the first one.
00460     for (;  //cant use cycle pt due to inserted combos at start of list
00461     (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
00462       word = word_it.data ();
00463       if (!word->part_of_combo) {
00464         box = word->word->bounding_box ();
00465         if (prev_right >= 0) {
00466           gap = box.left () - prev_right;
00467           if (gap <= min_gap) {
// prev_word_it trails behind on the word (or combination) to the left.
00468             prev_word = prev_word_it.data ();
00469             if (prev_word->combination)
00470               combo = prev_word;
00471             else {
00472               /* Make a new combination and insert before the first word being joined */
00473               copy_word = new WERD;
00474               *copy_word = *(prev_word->word);
00475               //deep copy
00476               combo = new WERD_RES (copy_word);
00477               combo->combination = TRUE;
00478               prev_word->part_of_combo = TRUE;
00479               prev_word_it.add_before_then_move (combo);
00480             }
// The combination takes its end-of-line flag from the word being appended.
00481             combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
00482             if (word->combination) {
00483               combo->word->join_on (word->word);
00484               //Move blbs to combo
00485               //old combo no longer needed
00486               delete word_it.extract ();
00487             }
00488             else {
00489               //Copy current wd to combo
00490               combo->copy_on (word);
00491               word->part_of_combo = TRUE;
00492             }
// Discard any earlier result on the combination: match_current_words()
// only classifies words whose outword is NULL.
00493             combo->done = FALSE;
00494             if (combo->outword != NULL) {
00495               delete combo->outword;
00496               delete combo->best_choice;
00497               delete combo->raw_choice;
00498               combo->outword = NULL;
00499               combo->best_choice = NULL;
00500               combo->raw_choice = NULL;
00501             }
00502           }
00503           else
// No join here, so the trailing iterator moves up to the current word.
// After a join it is left where it is, on the combination.
00504               //catch up
00505               prev_word_it = word_it;
00506         }
00507         prev_right = box.right ();
00508       }
00509     }
00510   }
00511   else
00512     words.clear ();              //signal termination
00513 }
00514 
00515 
00526 void dump_words(WERD_RES_LIST &perm, INT16 score, INT16 mode, BOOL8 improved) { 
00527   WERD_RES_IT word_res_it(&perm); 
00528   static STRING initial_str;
00529 
00530   if (debug_fix_space_level > 0) {
00531     if (mode == 1) {
00532       initial_str = "";
00533       for (word_res_it.mark_cycle_pt ();
00534       !word_res_it.cycled_list (); word_res_it.forward ()) {
00535         if (!word_res_it.data ()->part_of_combo) {
00536           initial_str += word_res_it.data ()->best_choice->string ();
00537           initial_str += ' ';
00538         }
00539       }
00540     }
00541 
00542     #ifndef SECURE_NAMES
00543     if (debug_fix_space_level > 1) {
00544       switch (mode) {
00545         case 1:
00546           tprintf ("EXTRACTED (%d): \"", score);
00547           break;
00548         case 2:
00549           tprintf ("TESTED (%d): \"", score);
00550           break;
00551         case 3:
00552           tprintf ("RETURNED (%d): \"", score);
00553           break;
00554       }
00555 
00556       for (word_res_it.mark_cycle_pt ();
00557       !word_res_it.cycled_list (); word_res_it.forward ()) {
00558         if (!word_res_it.data ()->part_of_combo)
00559           tprintf ("%s/%1d ",
00560             word_res_it.data ()->best_choice->string ().
00561             string (),
00562             (int) word_res_it.data ()->best_choice->permuter ());
00563       }
00564       tprintf ("\"\n");
00565     }
00566     else if (improved) {
00567       tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
00568       for (word_res_it.mark_cycle_pt ();
00569       !word_res_it.cycled_list (); word_res_it.forward ()) {
00570         if (!word_res_it.data ()->part_of_combo)
00571           tprintf ("%s/%1d ",
00572             word_res_it.data ()->best_choice->string ().
00573             string (),
00574             (int) word_res_it.data ()->best_choice->permuter ());
00575       }
00576       tprintf ("\"\n");
00577     }
00578     #endif
00579   }
00580 }
00581 
00582 
00597 BOOL8 uniformly_spaced(
00598                        WERD_RES *word) {
00599   PBLOB_IT blob_it;
00600   BOX box;
00601   INT16 prev_right = -MAX_INT16;
00602   INT16 gap;
00603   INT16 max_gap = -MAX_INT16;
00604   INT16 max_gap_count = 0;
00605   STATS gap_stats (0, MAXSPACING);
00606   BOOL8 result;
00607   const ROW *row = word->denorm.row ();
00608   float max_non_space;
00609   float normalised_max_nonspace;
00610   INT16 i = 0;
00611   STRING punct_chars = "\"`',.:;";
00612 
00613   blob_it.set_to_list (word->outword->blob_list ());
00614 
00615   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00616     box = blob_it.data ()->bounding_box ();
00617     if ((prev_right > -MAX_INT16) &&
00618       (!fixsp_ignore_punct ||
00619       (!punct_chars.contains (word->best_choice->string ()[i - 1]) &&
00620     !punct_chars.contains (word->best_choice->string ()[i])))) {
00621       gap = box.left () - prev_right;
00622       if (gap < max_gap)
00623         gap_stats.add (gap, 1);
00624       else if (gap == max_gap)
00625         max_gap_count++;
00626       else {
00627         if (max_gap_count > 0)
00628           gap_stats.add (max_gap, max_gap_count);
00629         max_gap = gap;
00630         max_gap_count = 1;
00631       }
00632     }
00633     prev_right = box.right ();
00634     i++;
00635   }
00636 
00637   max_non_space = (row->space () + 3 * row->kern ()) / 4;
00638   normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
00639 
00640   result = ((gap_stats.get_total () == 0) ||
00641     (max_gap <= normalised_max_nonspace) ||
00642     ((gap_stats.get_total () > 2) &&
00643     (max_gap <= 2 * gap_stats.median ())) ||
00644     ((gap_stats.get_total () <= 2) &&
00645     (max_gap <= 2 * gap_stats.mean ())));
00646   #ifndef SECURE_NAMES
00647   if ((debug_fix_space_level > 1)) {
00648     if (result)
00649       tprintf
00650         ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00651         word->best_choice->string ().string (), normalised_max_nonspace,
00652         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00653         gap_stats.median ());
00654     else
00655       tprintf
00656         ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00657         word->best_choice->string ().string (), normalised_max_nonspace,
00658         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00659         gap_stats.median ());
00660   }
00661   #endif
00662 
00663   return result;
00664 }
00665 
00666 
00675 BOOL8 fixspace_thinks_word_done(WERD_RES *word) { 
00676   if (word->done)
00677     return TRUE;
00678 
00679   if ((fixsp_done_mode > 0) &&
00680     (word->tess_accepted ||
00681     ((fixsp_done_mode == 2) &&
00682     (word->reject_map.reject_count () == 0)) ||
00683     (fixsp_done_mode == 3)) &&
00684     (strchr (word->best_choice->string ().string (), ' ') == NULL) &&
00685     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00686     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00687     (word->best_choice->permuter () == USER_DAWG_PERM) ||
00688     (word->best_choice->permuter () == NUMBER_PERM)))
00689     return TRUE;
00690   else
00691     return FALSE;
00692 }
00693 
00694 
00708 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row) { 
00709   WERD_RES *word_res;
00710   WERD_RES_LIST sub_word_list;
00711   WERD_RES_IT sub_word_list_it(&sub_word_list); 
00712   INT16 blob_index;
00713   INT16 new_length;
00714   float junk;
00715 
00716   word_res = word_res_it.data ();
00717   if (!fixsp_check_for_fp_noise_space ||
00718     word_res->word->flag (W_REP_CHAR) ||
00719     word_res->combination ||
00720     word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
00721     return;
00722 
00723   blob_index = worst_noise_blob (word_res, &junk);
00724   if (blob_index < 0)
00725     return;
00726 
00727   #ifndef SECURE_NAMES
00728   if (debug_fix_space_level > 1) {
00729     tprintf ("FP fixspace working on \"%s\"\n",
00730       word_res->best_choice->string ().string ());
00731   }
00732   #endif
00733   gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
00734   sub_word_list_it.add_after_stay_put (word_res_it.extract ());
00735   fix_noisy_space_list(sub_word_list, row); 
00736   new_length = sub_word_list.length ();
00737   word_res_it.add_list_before (&sub_word_list);
00738   for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
00739     word_res_it.forward ();
00740   }
00741 }
00742 
00743 
00751 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row) { 
00752   INT16 best_score;
00753   WERD_RES_IT best_perm_it(&best_perm); 
00754   WERD_RES_LIST current_perm;
00755   WERD_RES_IT current_perm_it(&current_perm); 
00756   WERD_RES *old_word_res;
00757   WERD_RES *new_word_res;
00758   INT16 current_score;
00759   BOOL8 improved = FALSE;
00760 
00761   best_score = fp_eval_word_spacing (best_perm); //default score
00762 
00763   dump_words (best_perm, best_score, 1, improved);
00764 
00765   new_word_res = new WERD_RES;
00766   old_word_res = best_perm_it.data ();
00767   old_word_res->combination = TRUE; //Kludge to force deep copy
00768   *new_word_res = *old_word_res; //deep copy
00769   old_word_res->combination = FALSE; //Undo kludge
00770   new_word_res->combination = FALSE; //Undo kludge
00771   current_perm_it.add_to_end (new_word_res);
00772 
00773   break_noisiest_blob_word(current_perm); 
00774 
00775   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00776     match_current_words(current_perm, row); 
00777     current_score = fp_eval_word_spacing (current_perm);
00778     dump_words (current_perm, current_score, 2, improved);
00779     if (current_score > best_score) {
00780       best_perm.clear ();
00781       best_perm.deep_copy (&current_perm);
00782       best_score = current_score;
00783       improved = TRUE;
00784     }
00785     if (current_score < PERFECT_WERDS)
00786       break_noisiest_blob_word(current_perm); 
00787   }
00788   dump_words (best_perm, best_score, 3, improved);
00789 }
00790 
00791 
00800 void break_noisiest_blob_word(WERD_RES_LIST &words) { 
00801   WERD_RES_IT word_it(&words); 
00802   WERD_RES_IT worst_word_it;
00803   float worst_noise_score = 9999;
00804   int worst_blob_index = -1;     //noisiest blb of noisiest wd
00805   int blob_index;                //of wds noisiest blb
00806   float noise_score;             //of wds noisiest blb
00807   WERD_RES *word_res;
00808   C_BLOB_IT blob_it;
00809   C_BLOB_IT rej_cblob_it;
00810   C_BLOB_LIST new_blob_list;
00811   C_BLOB_IT new_blob_it;
00812   C_BLOB_IT new_rej_cblob_it;
00813   WERD *new_word;
00814   INT16 start_of_noise_blob;
00815   INT16 i;
00816 
00817   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00818     blob_index = worst_noise_blob (word_it.data (), &noise_score);
00819     if ((blob_index > -1) && (worst_noise_score > noise_score)) {
00820       worst_noise_score = noise_score;
00821       worst_blob_index = blob_index;
00822       worst_word_it = word_it;
00823     }
00824   }
00825   if (worst_blob_index < 0) {
00826     words.clear ();              //signal termination
00827     return;
00828   }
00829 
00830   /* Now split the worst_word_it */
00831   word_res = worst_word_it.data ();
00832 
00833   /* Move blobs before noise blob to a new bloblist */
00834   new_blob_it.set_to_list (&new_blob_list);
00835   blob_it.set_to_list (word_res->word->cblob_list ());
00836   for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
00837     new_blob_it.add_after_then_move (blob_it.extract ());
00838   }
00839   start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
00840   delete blob_it.extract ();     //throw out noise blb
00841 
00842   new_word = new WERD (&new_blob_list, word_res->word);
00843   new_word->set_flag (W_EOL, FALSE);
00844   word_res->word->set_flag (W_BOL, FALSE);
00845   word_res->word->set_blanks (1);//After break
00846 
00847   new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
00848   rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
00849   for (;
00850     (!rej_cblob_it.empty () &&
00851     (rej_cblob_it.data ()->bounding_box ().left () <
00852   start_of_noise_blob)); rej_cblob_it.forward ()) {
00853     new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
00854   }
00855 
00856   worst_word_it.add_before_then_move (new WERD_RES (new_word));
00857 
00858   word_res->done = FALSE;
00859   if (word_res->outword != NULL) {
00860     delete word_res->outword;
00861     delete word_res->best_choice;
00862     delete word_res->raw_choice;
00863     word_res->outword = NULL;
00864     word_res->best_choice = NULL;
00865     word_res->raw_choice = NULL;
00866   }
00867 }
00868 
00869 
00880 INT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) { 
00881   PBLOB_IT blob_it;
00882   INT16 blob_count;
00883   float noise_score[512];
00884   int i;
00885   int min_noise_blob;            //1st contender
00886   int max_noise_blob;            //last contender
00887   int non_noise_count;
00888   int worst_noise_blob;          //Worst blob
00889   float small_limit = bln_x_height * fixsp_small_outlines_size;
00890   float non_noise_limit = bln_x_height * 0.8;
00891 
00892   blob_it.set_to_list (word_res->outword->blob_list ());
00893   //normalised
00894   blob_count = blob_it.length ();
00895   ASSERT_HOST (blob_count <= 512);
00896   if (blob_count < 5)
00897     return -1;                   //too short to split
00898   /* Get the noise scores for all blobs */
00899 
00900   #ifndef SECURE_NAMES
00901   if (debug_fix_space_level > 5)
00902     tprintf ("FP fixspace Noise metrics for \"%s\": ",
00903       word_res->best_choice->string ().string ());
00904   #endif
00905 
00906   for (i = 0; i < blob_count; i++, blob_it.forward ()) {
00907     if (word_res->reject_map[i].accepted ())
00908       noise_score[i] = non_noise_limit;
00909     else
00910       noise_score[i] = blob_noise_score (blob_it.data ());
00911 
00912     if (debug_fix_space_level > 5)
00913       tprintf ("%1.1f ", noise_score[i]);
00914   }
00915   if (debug_fix_space_level > 5)
00916     tprintf ("\n");
00917 
00918   /* Now find the worst one which is far enough away from the end of the word */
00919 
00920   non_noise_count = 0;
00921   for (i = 0;
00922   (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
00923     if (noise_score[i] >= non_noise_limit)
00924       non_noise_count++;
00925   }
00926   if (non_noise_count < fixsp_non_noise_limit)
00927     return -1;
00928   min_noise_blob = i;
00929 
00930   non_noise_count = 0;
00931   for (i = blob_count - 1;
00932   (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
00933     if (noise_score[i] >= non_noise_limit)
00934       non_noise_count++;
00935   }
00936   if (non_noise_count < fixsp_non_noise_limit)
00937     return -1;
00938   max_noise_blob = i;
00939 
00940   if (min_noise_blob > max_noise_blob)
00941     return -1;
00942 
00943   *worst_noise_score = small_limit;
00944   worst_noise_blob = -1;
00945   for (i = min_noise_blob; i <= max_noise_blob; i++) {
00946     if (noise_score[i] < *worst_noise_score) {
00947       worst_noise_blob = i;
00948       *worst_noise_score = noise_score[i];
00949     }
00950   }
00951   return worst_noise_blob;
00952 }
00953 
00954 
00963 float blob_noise_score(PBLOB *blob) { 
00964   OUTLINE_IT outline_it;
00965   BOX box;                       //BB of outline
00966   INT16 outline_count = 0;
00967   INT16 max_dimension;
00968   INT16 largest_outline_dimension = 0;
00969 
00970   outline_it.set_to_list (blob->out_list ());
00971   for (outline_it.mark_cycle_pt ();
00972   !outline_it.cycled_list (); outline_it.forward ()) {
00973     outline_count++;
00974     box = outline_it.data ()->bounding_box ();
00975     if (box.height () > box.width ())
00976       max_dimension = box.height ();
00977     else
00978       max_dimension = box.width ();
00979 
00980     if (largest_outline_dimension < max_dimension)
00981       largest_outline_dimension = max_dimension;
00982   }
00983 
00984   if (fixsp_noise_score_fixing) {
00985     if (outline_count > 5)
00986       largest_outline_dimension *= 2; //penalise LOTS of blobs
00987 
00988     box = blob->bounding_box ();
00989 
00990     if ((box.bottom () > bln_baseline_offset * 4) ||
00991       (box.top () < bln_baseline_offset / 2))
00992       largest_outline_dimension /= 2; //Lax blob is if high or low
00993   }
00994   return largest_outline_dimension;
00995 }
00996 
00997 
01006 void fixspace_dbg(WERD_RES *word) { 
01007   BOX box = word->word->bounding_box ();
01008   BOOL8 show_map_detail = FALSE;
01009   INT16 i;
01010 
01011   box.print ();
01012   #ifndef SECURE_NAMES
01013   tprintf (" \"%s\" ", word->best_choice->string ().string ());
01014   tprintf ("Blob count: %d (word); %d/%d (outword)\n",
01015     word->word->gblob_list ()->length (),
01016     word->outword->gblob_list ()->length (),
01017     word->outword->rej_blob_list ()->length ());
01018   word->reject_map.print (debug_fp);
01019   tprintf ("\n");
01020   if (show_map_detail) {
01021     tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01022     for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01023       tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01024       word->reject_map[i].full_print (debug_fp);
01025     }
01026   }
01027 
01028   tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01029   tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01030   #endif
01031 }
01032 
01033 
01045 INT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { 
01046   WERD_RES_IT word_it(&word_res_list); 
01047   WERD_RES *word;
01048   PBLOB_IT blob_it;
01049   INT16 word_length;
01050   INT16 score = 0;
01051   INT16 i;
01052   const char *chs;
01053   float small_limit = bln_x_height * fixsp_small_outlines_size;
01054 
01055   if (!fixsp_fp_eval)
01056     return (eval_word_spacing (word_res_list));
01057 
01058   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01059     word = word_it.data ();
01060     word_length = word->reject_map.length ();
01061     chs = word->best_choice->string ().string ();
01062     if ((word->done ||
01063       word->tess_accepted) ||
01064       (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01065       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01066       (word->best_choice->permuter () == USER_DAWG_PERM) ||
01067     (safe_dict_word (chs) > 0)) {
01068       blob_it.set_to_list (word->outword->blob_list ());
01069       for (i = 0; i < word_length; i++, blob_it.forward ()) {
01070         if ((chs[i] == ' ') ||
01071           (blob_noise_score (blob_it.data ()) < small_limit))
01072           score -= 1;            //penalise possibly erroneous non-space
01073 
01074         else if (word->reject_map[i].accepted ())
01075           score++;
01076       }
01077     }
01078   }
01079   if (score < 0)
01080     score = 0;
01081   return score;
01082 }

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1