ccmain/fixspace.cpp File Reference

#include "mfcpch.h"
#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "genblob.h"
#include "control.h"
#include "fixspace.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"

Go to the source code of this file.

Defines

Functions


Define Documentation

#define MAXSPACING   128

Max expected spacing in pix

Definition at line 66 of file fixspace.cpp.

Referenced by block_space_stat(), block_spacing_stats(), isolated_row_stats(), row_space_stat(), row_spacing_stats(), and uniformly_spaced().

#define PERFECT_WERDS   999

Definition at line 64 of file fixspace.cpp.

Referenced by eval_word_spacing(), fix_fuzzy_space_list(), and fix_noisy_space_list().


Function Documentation

float blob_noise_score ( PBLOB blob  ) 

Compute a noise score for a blob from the largest dimension (width or height) of any of its outlines.

Parameters:
blob Blob
Note:
Global: fixsp_noise_score_fixing, bln_baseline_offset
Returns:
score of blob's noise

Definition at line 963 of file fixspace.cpp.

References BOX::bottom(), PBLOB::bounding_box(), BOX::height(), PBLOB::out_list(), outline_it, BOX::top(), and BOX::width().

Referenced by fp_eval_word_spacing(), and worst_noise_blob().

00963                                     { 
        // Scores a blob by the largest width or height found among the bounding
        // boxes of its outlines. The INT16 result is returned as a float.
00964   OUTLINE_IT outline_it;
00965   BOX box;                       //BB of outline
00966   INT16 outline_count = 0;
00967   INT16 max_dimension;
00968   INT16 largest_outline_dimension = 0;
00969 
        // Visit every outline once, counting them and tracking the biggest
        // single dimension seen so far.
00970   outline_it.set_to_list (blob->out_list ());
00971   for (outline_it.mark_cycle_pt ();
00972   !outline_it.cycled_list (); outline_it.forward ()) {
00973     outline_count++;
00974     box = outline_it.data ()->bounding_box ();
00975     if (box.height () > box.width ())
00976       max_dimension = box.height ();
00977     else
00978       max_dimension = box.width ();
00979 
00980     if (largest_outline_dimension < max_dimension)
00981       largest_outline_dimension = max_dimension;
00982   }
00983 
        // Optional adjustments, applied only when the fixsp_noise_score_fixing
        // global is set.
00984   if (fixsp_noise_score_fixing) {
00985     if (outline_count > 5)
00986       largest_outline_dimension *= 2; //penalise LOTS of blobs
00987 
00988     box = blob->bounding_box ();
00989 
          // Halve the score when the whole-blob box has bottom () greater than
          // bln_baseline_offset * 4 or top () less than bln_baseline_offset / 2.
00990     if ((box.bottom () > bln_baseline_offset * 4) ||
00991       (box.top () < bln_baseline_offset / 2))
00992       largest_outline_dimension /= 2; //Lax blob is if high or low
00993   }
00994   return largest_outline_dimension;
00995 }

void break_noisiest_blob_word ( WERD_RES_LIST &  words  ) 

Find the word with the blob which looks like the worst noise.

Parameters:
words Words
Returns:
none, modifies words
Break the word into two, deleting the noise blob.

Definition at line 800 of file fixspace.cpp.

References WERD_RES::best_choice, WERD::cblob_list(), WERD_RES::done, FALSE, NULL, WERD_RES::outword, WERD_RES::raw_choice, WERD::rej_cblob_list(), WERD::set_blanks(), WERD::set_flag(), W_BOL, W_EOL, WERD_RES::word, and worst_noise_blob().

Referenced by fix_noisy_space_list().

00800                                                     { 
        // Picks, across all words, the blob with the lowest score reported by
        // worst_noise_blob (), deletes it, and splits its word in two at that
        // point: blobs before it go into a new word inserted ahead of the
        // original, blobs after it stay in the original word.
        // If no word reports a candidate blob, the list is cleared instead.
00801   WERD_RES_IT word_it(&words); 
00802   WERD_RES_IT worst_word_it;
00803   float worst_noise_score = 9999;
00804   int worst_blob_index = -1;     //noisiest blb of noisiest wd
00805   int blob_index;                //of wds noisiest blb
00806   float noise_score;             //of wds noisiest blb
00807   WERD_RES *word_res;
00808   C_BLOB_IT blob_it;
00809   C_BLOB_IT rej_cblob_it;
00810   C_BLOB_LIST new_blob_list;
00811   C_BLOB_IT new_blob_it;
00812   C_BLOB_IT new_rej_cblob_it;
00813   WERD *new_word;
00814   INT16 start_of_noise_blob;
00815   INT16 i;
00816 
        // Keep the word/blob pair with the smallest noise score; a blob index
        // below 0 means the word offered no candidate.
00817   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00818     blob_index = worst_noise_blob (word_it.data (), &noise_score);
00819     if ((blob_index > -1) && (worst_noise_score > noise_score)) {
00820       worst_noise_score = noise_score;
00821       worst_blob_index = blob_index;
00822       worst_word_it = word_it;
00823     }
00824   }
00825   if (worst_blob_index < 0) {
00826     words.clear ();              //signal termination
00827     return;
00828   }
00829 
00830   /* Now split the worst_word_it */
00831   word_res = worst_word_it.data ();
00832 
00833   /* Move blobs before noise blob to a new bloblist */
00834   new_blob_it.set_to_list (&new_blob_list);
00835   blob_it.set_to_list (word_res->word->cblob_list ());
00836   for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
00837     new_blob_it.add_after_then_move (blob_it.extract ());
00838   }
        // Remember where the noise blob started so rejected blobs can be divided
        // between the two words below.
00839   start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
00840   delete blob_it.extract ();     //throw out noise blb
00841 
        // The new (left-hand) word is built from the extracted blobs and the
        // original word; it is no longer end-of-line and the original is no
        // longer beginning-of-line.
00842   new_word = new WERD (&new_blob_list, word_res->word);
00843   new_word->set_flag (W_EOL, FALSE);
00844   word_res->word->set_flag (W_BOL, FALSE);
00845   word_res->word->set_blanks (1);//After break
00846 
        // Move rejected blobs whose left edge precedes the noise blob into the
        // new word; stops at the first one that does not.
00847   new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
00848   rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
00849   for (;
00850     (!rej_cblob_it.empty () &&
00851     (rej_cblob_it.data ()->bounding_box ().left () <
00852   start_of_noise_blob)); rej_cblob_it.forward ()) {
00853     new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
00854   }
00855 
00856   worst_word_it.add_before_then_move (new WERD_RES (new_word));
00857 
        // Discard the original word's stored results; match_current_words ()
        // reclassifies any word whose outword is NULL.
00858   word_res->done = FALSE;
00859   if (word_res->outword != NULL) {
00860     delete word_res->outword;
00861     delete word_res->best_choice;
00862     delete word_res->raw_choice;
00863     word_res->outword = NULL;
00864     word_res->best_choice = NULL;
00865     word_res->raw_choice = NULL;
00866   }
00867 }

BOOL8 digit_or_numeric_punct ( WERD_RES word,
char  ch 
)

Check if ch in word is a digit or digit-compatible punctuation.

Parameters:
word 
ch 
Returns:
TRUE if matches

Definition at line 411 of file fixspace.cpp.

References WERD_RES::best_choice, and NUMBER_PERM.

Referenced by eval_word_spacing().

00411                                                       { 
        // TRUE when ch is a decimal digit, or when all of the following hold:
        // fixsp_numeric_fix is set, the word's best choice has the NUMBER_PERM
        // permuter, and ch occurs in the numeric_punctuation string.
00412   return (isdigit (ch) ||
00413     (fixsp_numeric_fix &&
00414     (word->best_choice->permuter () == NUMBER_PERM) &&
00415     STRING (numeric_punctuation).contains (ch)));
00416 }

void dump_words ( WERD_RES_LIST &  perm,
INT16  score,
INT16  mode,
BOOL8  improved 
)

Dump out the words recognized & stats.

Parameters:
perm ?
score score
mode mode
improved 0 or 1
Note:
Global: debug_fix_space_level
Returns:
none

Definition at line 526 of file fixspace.cpp.

References STRING::string(), and tprintf().

Referenced by fix_fuzzy_space_list(), and fix_noisy_space_list().

00526                                                                               { 
        // Debug-only reporting of a word permutation; does nothing unless
        // debug_fix_space_level > 0. Words flagged part_of_combo are skipped.
00527   WERD_RES_IT word_res_it(&perm); 
        // static: keeps the text captured by a mode 1 call so that a later call
        // can print it in the "FIX SPACING" line.
00528   static STRING initial_str;
00529 
00530   if (debug_fix_space_level > 0) {
          // Mode 1 records the starting text, one space after each word.
00531     if (mode == 1) {
00532       initial_str = "";
00533       for (word_res_it.mark_cycle_pt ();
00534       !word_res_it.cycled_list (); word_res_it.forward ()) {
00535         if (!word_res_it.data ()->part_of_combo) {
00536           initial_str += word_res_it.data ()->best_choice->string ();
00537           initial_str += ' ';
00538         }
00539       }
00540     }
00541 
          // All printing is compiled out when SECURE_NAMES is defined.
00542     #ifndef SECURE_NAMES
          // Level > 1: print a label chosen by mode, then each word as
          // text/permuter.
00543     if (debug_fix_space_level > 1) {
00544       switch (mode) {
00545         case 1:
00546           tprintf ("EXTRACTED (%d): \"", score);
00547           break;
00548         case 2:
00549           tprintf ("TESTED (%d): \"", score);
00550           break;
00551         case 3:
00552           tprintf ("RETURNED (%d): \"", score);
00553           break;
00554       }
00555 
00556       for (word_res_it.mark_cycle_pt ();
00557       !word_res_it.cycled_list (); word_res_it.forward ()) {
00558         if (!word_res_it.data ()->part_of_combo)
00559           tprintf ("%s/%1d ",
00560             word_res_it.data ()->best_choice->string ().
00561             string (),
00562             (int) word_res_it.data ()->best_choice->permuter ());
00563       }
00564       tprintf ("\"\n");
00565     }
          // Level exactly 1: only report when the caller says the result
          // improved, showing the recorded starting text and the new words.
00566     else if (improved) {
00567       tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
00568       for (word_res_it.mark_cycle_pt ();
00569       !word_res_it.cycled_list (); word_res_it.forward ()) {
00570         if (!word_res_it.data ()->part_of_combo)
00571           tprintf ("%s/%1d ",
00572             word_res_it.data ()->best_choice->string ().
00573             string (),
00574             (int) word_res_it.data ()->best_choice->permuter ());
00575       }
00576       tprintf ("\"\n");
00577     }
00578     #endif
00579   }
00580 }

INT16 eval_word_spacing ( WERD_RES_LIST &  word_res_list  ) 

Basic measure is number of characters in contextually confirmed words = the word is done.

Parameters:
word_res_list Input word list
Note:
Global:
  • conflict_set_I_l_1,
  • tessedit_test_uniform_wd_spacing,
  • fixsp_prefer_joined_1s,
  • tessedit_prefer_joined_punct
Returns:
score
If all words are contextually confirmed the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

Can we add the prev word score and potentially count this word? Yes, IF it didn't end in a 1 when the first char of this word is a digit AND it didn't end in a digit when the first char of this word is a 1.

Definition at line 285 of file fixspace.cpp.

References WERD_RES::best_choice, STRING::contains(), cprintf(), digit_or_numeric_punct(), FALSE, fixspace_thinks_word_done(), REJMAP::length(), NUMBER_PERM, PERFECT_WERDS, WERD_RES::reject_map, WERD_RES::tess_failed, TRUE, uniformly_spaced(), and word_count.

Referenced by fix_fuzzy_space_list(), and fp_eval_word_spacing().

00285                                                       { 
        // Scores one candidate spacing of the words in word_res_list.
        // A word's own score is held back in prev_word_score and only added to
        // total_score once the NEXT word has been examined, so that a suspicious
        // digit / "1" pair across the space can cancel it.
        // Returns PERFECT_WERDS when every visited word ended up counted as
        // done, otherwise the accumulated total_score.
00286   WERD_RES_IT word_res_it(&word_res_list); 
00287   INT16 total_score = 0;
00288   INT16 word_count = 0;
00289   INT16 done_word_count = 0;
00290   INT16 word_len;
00291   INT16 i;
00292   WERD_RES *word;                //current word
00293   INT16 prev_word_score = 0;
00294   BOOL8 prev_word_done = FALSE;
00295   BOOL8 prev_char_1 = FALSE;     //prev ch a "1/I/l"?
00296   BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
00297   BOOL8 current_char_1 = FALSE;
00298   BOOL8 current_word_ok_so_far;
00299   STRING punct_chars = "!\"`',.:;";
00300   BOOL8 prev_char_punct = FALSE;
00301   BOOL8 current_char_punct = FALSE;
00302   BOOL8 word_done = FALSE;
00303 
00304 #ifdef TEXT_VERBOSE
00305   // gets a 'z', see ccmain/tesseractmain.dox
00306   cprintf("z");
00307 #endif
        // NOTE(review): data () is read before any emptiness check, so this loop
        // appears to assume a non-empty list - confirm against callers.
00308   do {
00309     word = word_res_it.data ();
00310     word_done = fixspace_thinks_word_done (word);
00311     word_count++;
          // A word Tesseract failed on cannot cancel its predecessor: bank the
          // held-back score and reset all carried state.
00312     if (word->tess_failed) {
00313       total_score += prev_word_score;
00314       if (prev_word_done)
00315         done_word_count++;
00316       prev_word_score = 0;
00317       prev_char_1 = FALSE;
00318       prev_char_digit = FALSE;
00319       prev_word_done = FALSE;
00320     }
00321     else {
00327       word_len = word->reject_map.length ();
00328       current_word_ok_so_far = FALSE;
            // Bank the previous word's score UNLESS the space is suspect:
            //  - previous word ended in a "1"-like char and this word starts
            //    with a digit (or numeric punctuation), or
            //  - previous word ended in a digit and this word starts with '1'
            //    (done word) or any char of conflict_set_I_l_1 (not done).
            // In the suspect case the held-back score is dropped and this word
            // is not allowed to count either.
00329       if (!((prev_char_1 &&
00330         digit_or_numeric_punct (word,
00331         word->best_choice->string ()[0])) ||
00332         (prev_char_digit &&
00333         ((word_done &&
00334         (word->best_choice->string ()[0] == '1')) ||
00335         (!word_done &&
00336         STRING (conflict_set_I_l_1).contains (word->best_choice->
00337            string ()[0])))))) {
00338         total_score += prev_word_score;
00339         if (prev_word_done)
00340           done_word_count++;
00341         current_word_ok_so_far = word_done;
00342       }
00343 
            // The word earns a (held-back) score of word_len only if it is still
            // OK and, when tessedit_test_uniform_wd_spacing is set, is either a
            // NUMBER_PERM word or passes uniformly_spaced ().
00344       if ((current_word_ok_so_far) &&
00345         (!tessedit_test_uniform_wd_spacing ||
00346         ((word->best_choice->permuter ()==NUMBER_PERM)||uniformly_spaced (word)))) {
00347         prev_word_done = TRUE;
00348         prev_word_score = word_len;
00349       }
00350       else {
00351         prev_word_done = FALSE;
00352         prev_word_score = 0;
00353       }
00354 
00355       if (fixsp_prefer_joined_1s) {
00356         /* Add 1 to total score for every joined 1 regardless of context and rejtn */
00357 
              // One point for each side of a '1' that touches another character
              // inside the word. prev_char_1 is reused as scratch here and is
              // reassigned after the loops below.
00358         for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
00359           current_char_1 = word->best_choice->string ()[i] == '1';
00360           if (prev_char_1 || (current_char_1 && (i > 0)))
00361             total_score++;
00362           prev_char_1 = current_char_1;
00363         }
00364       }
00365 
00366       /* Add 1 to total score for every joined punctuation regardless of context
00367         and rejtn */
00368       if (tessedit_prefer_joined_punct) {
00369         for (i = 0, prev_char_punct = FALSE; i < word_len; i++) {
00370           current_char_punct =
00371             punct_chars.contains (word->best_choice->string ()[i]);
00372           if (prev_char_punct || (current_char_punct && (i > 0)))
00373             total_score++;
00374           prev_char_punct = current_char_punct;
00375         }
00376       }
            // Record how this word ends, for the suspect-space test on the next
            // word.
            // NOTE(review): indexes word_len - 1; if reject_map.length () can
            // be 0 this reads index -1 - confirm that cannot happen here.
00377       prev_char_digit = digit_or_numeric_punct (word,
00378         word->best_choice->
00379         string ()[word_len - 1]);
00380       prev_char_1 =
00381         ((word_done
00382         && (word->best_choice->string ()[word_len - 1] == '1'))
00383         || (!word_done
00384         && STRING (conflict_set_I_l_1).contains (word->best_choice->
00385         string ()[word_len -
00386         1])));
00387     }
00388     /* Find next word */
00389     do
00390     word_res_it.forward ();
00391     while (word_res_it.data ()->part_of_combo);
00392   }
        // Stop once forward () has brought the iterator back to the first
        // element.
00393   while (!word_res_it.at_first ());
        // Nothing follows the last word, so its held-back score always counts.
00394   total_score += prev_word_score;
00395   if (prev_word_done)
00396     done_word_count++;
00397   if (done_word_count == word_count)
00398     return PERFECT_WERDS;
00399   else
00400     return total_score;
00401 }

void fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW row 
)

Explore spaces in list.

Parameters:
best_perm for eval_word_spacing()
row Row
Returns:
none

Definition at line 178 of file fixspace.cpp.

References dump_words(), eval_word_spacing(), FALSE, initialise_search(), match_current_words(), PERFECT_WERDS, transform_to_next_perm(), and TRUE.

Referenced by fix_fuzzy_spaces().

00180                                     {
        // Searches alternative spacings of best_perm. Each candidate is
        // re-recognised and scored with eval_word_spacing (); best_perm is
        // replaced whenever a candidate scores strictly higher.
00181   INT16 best_score;
00182   WERD_RES_LIST current_perm;
00183   INT16 current_score;
00184   BOOL8 improved = FALSE;
00185 
00186 
00187   best_score = eval_word_spacing (best_perm); //default score
00188 
00189   dump_words (best_perm, best_score, 1, improved);
00190 
        // Only build a working copy when there is something to improve.
00191   if (best_score != PERFECT_WERDS)
00192     initialise_search(best_perm, current_perm); 
00193 
        // Ends on a perfect score, or when current_perm is empty (it starts
        // empty if the initial score was already perfect).
00194   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00195     match_current_words(current_perm, row); 
00196     current_score = eval_word_spacing (current_perm);
00197     dump_words (current_perm, current_score, 2, improved);
00198     if (current_score > best_score) {
00199       best_perm.clear ();
00200       best_perm.deep_copy (&current_perm);
00201       best_score = current_score;
00202       improved = TRUE;
00203     }
00204     if (current_score < PERFECT_WERDS)
00205       transform_to_next_perm(current_perm); 
00206   }
00207   dump_words (best_perm, best_score, 3, improved);
00208 }

void fix_fuzzy_spaces ( volatile ETEXT_DESC monitor,
INT32  word_count,
PAGE_RES page_res 
)

Walk over the page, finding sequences of words joined by fuzzy spaces.

Parameters:
monitor progress monitor
word_count count of words in doc
page_res 
Returns:
none
Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Definition at line 79 of file fixspace.cpp.

References check_debug_pt(), WERD_RES::combination, fix_fuzzy_space_list(), fix_sp_fp_word(), WERD::gblob_list(), NULL, TRUE, W_FUZZY_NON, W_FUZZY_SP, and WERD_RES::word.

Referenced by recog_all_words().

00082                                           {
        // For every row of every block: words that are not followed by a fuzzy
        // space are passed one at a time to fix_sp_fp_word (). A run of words
        // joined by fuzzy spaces (the following word carries W_FUZZY_NON or
        // W_FUZZY_SP) is lifted out as a sublist, given to
        // fix_fuzzy_space_list (), and spliced back into the row.
00083   BLOCK_RES_IT block_res_it;     //iterators
00084   ROW_RES_IT row_res_it;
00085   WERD_RES_IT word_res_it_from;
00086   WERD_RES_IT word_res_it_to;
00087   WERD_RES *word_res;
00088   WERD_RES_LIST fuzzy_space_words;
00089   INT16 new_length;
00090   BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
00091   INT32 word_index;              //current word
00092 
00093   block_res_it.set_to_list (&page_res->block_res_list);
00094   word_index = 0;
00095   for (block_res_it.mark_cycle_pt ();
00096   !block_res_it.cycled_list (); block_res_it.forward ()) {
00097     row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
00098     for (row_res_it.mark_cycle_pt ();
00099     !row_res_it.cycled_list (); row_res_it.forward ()) {
00100       word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
00101       while (!word_res_it_from.at_last ()) {
00102         word_res = word_res_it_from.data ();
              // Step over plain words: not a combination and the next word is
              // not marked fuzzy. Each gets the noise-blob treatment only.
00103         while (!word_res_it_from.at_last () &&
00104           !(word_res->combination ||
00105           word_res_it_from.data_relative (1)->
00106           word->flag (W_FUZZY_NON) ||
00107           word_res_it_from.data_relative (1)->
00108         word->flag (W_FUZZY_SP))) {
00109           fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00110           word_res = word_res_it_from.forward ();
00111           word_index++;
                // NOTE(review): divides by word_count; a word_count of 0 would
                // be a division by zero - confirm callers never pass 0.
00112           if (monitor != NULL) {
00113             monitor->ocr_alive = TRUE;
00114             monitor->progress = 90 + 5 * word_index / word_count;
00115           }
00116         }
00117 
              // Not at the row's last word, so a run starts here: advance
              // word_res_it_to to the end of the run.
00118         if (!word_res_it_from.at_last ()) {
00119           word_res_it_to = word_res_it_from;
00120           prevent_null_wd_fixsp =
00121             word_res->word->gblob_list ()->empty ();
00122           if (check_debug_pt (word_res, 60))
00123             debug_fix_space_level.set_value (10);
00124           word_res_it_to.forward ();
00125           word_index++;
00126           if (monitor != NULL) {
00127             monitor->ocr_alive = TRUE;
00128             monitor->progress = 90 + 5 * word_index / word_count;
00129           }
                // Extend the run while the following word is also fuzzy,
                // noting whether any word in the run has no blobs.
00130           while (!word_res_it_to.at_last () &&
00131             (word_res_it_to.data_relative (1)->
00132             word->flag (W_FUZZY_NON) ||
00133             word_res_it_to.data_relative (1)->
00134           word->flag (W_FUZZY_SP))) {
00135             if (check_debug_pt (word_res, 60))
00136               debug_fix_space_level.set_value (10);
00137             if (word_res->word->gblob_list ()->empty ())
00138               prevent_null_wd_fixsp = TRUE;
00139             word_res = word_res_it_to.forward ();
00140           }
00141           if (check_debug_pt (word_res, 60))
00142             debug_fix_space_level.set_value (10);
00143           if (word_res->word->gblob_list ()->empty ())
00144             prevent_null_wd_fixsp = TRUE;
                // A run containing a blobless word is skipped untouched;
                // otherwise it is extracted, fixed and re-inserted.
00145           if (prevent_null_wd_fixsp)
00146             word_res_it_from = word_res_it_to;
00147           else {
00148             fuzzy_space_words.assign_to_sublist (&word_res_it_from,
00149               &word_res_it_to);
00150             fix_fuzzy_space_list (fuzzy_space_words,
00151               row_res_it.data ()->row);
00152             new_length = fuzzy_space_words.length ();
00153             word_res_it_from.add_list_before (&fuzzy_space_words);
                  // Step forward up to new_length times (stopping early at
                  // the row's last word); presumably this moves past the
                  // words just re-inserted - confirm iterator semantics.
00154             for (;
00155               (!word_res_it_from.at_last () &&
00156             (new_length > 0)); new_length--) {
00157               word_res_it_from.forward ();
00158             }
00159           }
                // Presumably undoes the debug level raised above for a debug
                // point - test_pt is defined outside this listing.
00160           if (test_pt)
00161             debug_fix_space_level.set_value (0);
00162         }
00163         fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00164         //Last word in row
00165       }
00166     }
00167   }
00168 }

void fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW row 
)

Go down and fix spaces, starting with the worst/biggest.

Parameters:
best_perm to get scores
row Row
Returns:
void, modifies best_perm

Definition at line 751 of file fixspace.cpp.

References break_noisiest_blob_word(), WERD_RES::combination, dump_words(), FALSE, fp_eval_word_spacing(), match_current_words(), PERFECT_WERDS, and TRUE.

Referenced by fix_sp_fp_word().

00751                                                               { 
        // Starting from a copy of the first word in best_perm, repeatedly
        // removes the noisiest blob (splitting the word there), re-recognises
        // the result and keeps the best-scoring arrangement in best_perm.
00752   INT16 best_score;
00753   WERD_RES_IT best_perm_it(&best_perm); 
00754   WERD_RES_LIST current_perm;
00755   WERD_RES_IT current_perm_it(&current_perm); 
00756   WERD_RES *old_word_res;
00757   WERD_RES *new_word_res;
00758   INT16 current_score;
00759   BOOL8 improved = FALSE;
00760 
00761   best_score = fp_eval_word_spacing (best_perm); //default score
00762 
00763   dump_words (best_perm, best_score, 1, improved);
00764 
        // Seed the working list with a deep copy of the word. The combination
        // flag is set only for the duration of the assignment and cleared on
        // both copies afterwards.
00765   new_word_res = new WERD_RES;
00766   old_word_res = best_perm_it.data ();
00767   old_word_res->combination = TRUE; //Kludge to force deep copy
00768   *new_word_res = *old_word_res; //deep copy
00769   old_word_res->combination = FALSE; //Undo kludge
00770   new_word_res->combination = FALSE; //Undo kludge
00771   current_perm_it.add_to_end (new_word_res);
00772 
00773   break_noisiest_blob_word(current_perm); 
00774 
        // break_noisiest_blob_word () clears the list when it finds no blob to
        // remove, which ends this loop; a perfect score ends it too.
00775   while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00776     match_current_words(current_perm, row); 
00777     current_score = fp_eval_word_spacing (current_perm);
00778     dump_words (current_perm, current_score, 2, improved);
00779     if (current_score > best_score) {
00780       best_perm.clear ();
00781       best_perm.deep_copy (&current_perm);
00782       best_score = current_score;
00783       improved = TRUE;
00784     }
00785     if (current_score < PERFECT_WERDS)
00786       break_noisiest_blob_word(current_perm); 
00787   }
00788   dump_words (best_perm, best_score, 3, improved);
00789 }

void fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW row 
)

Test the current word to see if it can be split by deleting noise blobs.

Parameters:
word_res_it Word results?
row 
Note:
Global: fixsp_check_for_fp_noise_space debug_fix_space_level
Returns:
none
If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

Definition at line 708 of file fixspace.cpp.

References WERD_RES::best_choice, WERD_RES::combination, FALSE, fix_noisy_space_list(), WERD::flag(), gblob_sort_list(), WERD_RES::part_of_combo, WERD::rej_cblob_list(), tprintf(), W_DONT_CHOP, W_REP_CHAR, WERD_RES::word, and worst_noise_blob().

Referenced by fix_fuzzy_spaces().

00708                                                         { 
        // Tries to improve a single word by removing noise blobs: the word is
        // extracted into a one-element list, processed by
        // fix_noisy_space_list (), and the resulting word(s) are put back.
00709   WERD_RES *word_res;
00710   WERD_RES_LIST sub_word_list;
00711   WERD_RES_IT sub_word_list_it(&sub_word_list); 
00712   INT16 blob_index;
00713   INT16 new_length;
00714   float junk;
00715 
        // Only proceeds when fixsp_check_for_fp_noise_space is set and the word
        // is not W_REP_CHAR, not a combination, not part of a combo, and
        // carries W_DONT_CHOP.
00716   word_res = word_res_it.data ();
00717   if (!fixsp_check_for_fp_noise_space ||
00718     word_res->word->flag (W_REP_CHAR) ||
00719     word_res->combination ||
00720     word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
00721     return;
00722 
        // Nothing to do if the word has no candidate noise blob; the score
        // itself is not needed here.
00723   blob_index = worst_noise_blob (word_res, &junk);
00724   if (blob_index < 0)
00725     return;
00726 
00727   #ifndef SECURE_NAMES
00728   if (debug_fix_space_level > 1) {
00729     tprintf ("FP fixspace working on \"%s\"\n",
00730       word_res->best_choice->string ().string ());
00731   }
00732   #endif
        // The rejected-blob list is cast to PBLOB_LIST * for sorting.
00733   gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
00734   sub_word_list_it.add_after_stay_put (word_res_it.extract ());
00735   fix_noisy_space_list(sub_word_list, row); 
00736   new_length = sub_word_list.length ();
00737   word_res_it.add_list_before (&sub_word_list);
        // Step forward new_length - 1 times at most, stopping early at the end
        // of the list.
00738   for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
00739     word_res_it.forward ();
00740   }
00741 }

void fixspace_dbg ( WERD_RES word  ) 

Dump out info for fixing spaces functions.

Parameters:
word Word in question
Note:
Global: debug_fp
Returns:
none

Definition at line 1006 of file fixspace.cpp.

References WERD_RES::best_choice, WERD::bounding_box(), debug_fp, WERD_RES::done, FALSE, REJMAP::full_print(), WERD::gblob_list(), WERD_RES::outword, BOX::print(), REJMAP::print(), WERD::rej_blob_list(), WERD_RES::reject_map, WERD_RES::tess_accepted, tprintf(), and WERD_RES::word.

01006                                   { 
        // Prints a word's bounding box and, unless SECURE_NAMES is defined, its
        // text, blob counts, reject map and accepted/done flags.
01007   BOX box = word->word->bounding_box ();
        // Always FALSE, so the per-character detail below never runs as written.
01008   BOOL8 show_map_detail = FALSE;
01009   INT16 i;
01010 
01011   box.print ();
01012   #ifndef SECURE_NAMES
01013   tprintf (" \"%s\" ", word->best_choice->string ().string ());
        // NOTE(review): outword is dereferenced without a NULL check, while
        // other functions in this file treat a NULL outword as possible -
        // confirm callers only pass classified words.
01014   tprintf ("Blob count: %d (word); %d/%d (outword)\n",
01015     word->word->gblob_list ()->length (),
01016     word->outword->gblob_list ()->length (),
01017     word->outword->rej_blob_list ()->length ());
01018   word->reject_map.print (debug_fp);
01019   tprintf ("\n");
01020   if (show_map_detail) {
01021     tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01022     for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01023       tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01024       word->reject_map[i].full_print (debug_fp);
01025     }
01026   }
01027 
01028   tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01029   tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01030   #endif
01031 }

BOOL8 fixspace_thinks_word_done ( WERD_RES word  ) 

Use all the standard pass 2 conditions for mode 5 in set_done() BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT CARE WHETHER WE HAVE of/at on/an etc.

Parameters:
word Word working on
Returns:
TRUE if might be word

Definition at line 675 of file fixspace.cpp.

References WERD_RES::best_choice, WERD_RES::done, FALSE, FREQ_DAWG_PERM, NULL, NUMBER_PERM, REJMAP::reject_count(), WERD_RES::reject_map, SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, TRUE, and USER_DAWG_PERM.

Referenced by eval_word_spacing().

00675                                                 { 
        // A word already marked done is accepted immediately.
00676   if (word->done)
00677     return TRUE;
00678 
        // Otherwise ALL of the following must hold:
        //  - fixsp_done_mode > 0;
        //  - the word is tess_accepted, or mode is 2 with no rejected
        //    characters, or mode is 3;
        //  - the best-choice string contains no space;
        //  - the permuter is one of SYSTEM_DAWG_PERM, FREQ_DAWG_PERM,
        //    USER_DAWG_PERM or NUMBER_PERM.
00679   if ((fixsp_done_mode > 0) &&
00680     (word->tess_accepted ||
00681     ((fixsp_done_mode == 2) &&
00682     (word->reject_map.reject_count () == 0)) ||
00683     (fixsp_done_mode == 3)) &&
00684     (strchr (word->best_choice->string ().string (), ' ') == NULL) &&
00685     ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00686     (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00687     (word->best_choice->permuter () == USER_DAWG_PERM) ||
00688     (word->best_choice->permuter () == NUMBER_PERM)))
00689     return TRUE;
00690   else
00691     return FALSE;
00692 }

INT16 fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list  ) 

Evaluation function for fixed pitch word lists.

Parameters:
word_res_list list of word results
Note:
Global: bln_x_height, fixsp_small_outlines_size
Returns:
score
Basically, count the number of "nice" characters - those which are in tess acceptable words or in dict words and are not rejected. Penalise any potential noise chars

Definition at line 1045 of file fixspace.cpp.

References WERD_RES::best_choice, WERD::blob_list(), blob_noise_score(), WERD_RES::done, eval_word_spacing(), FREQ_DAWG_PERM, REJMAP::length(), WERD_RES::outword, WERD_RES::reject_map, safe_dict_word(), SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, and USER_DAWG_PERM.

Referenced by fix_noisy_space_list().

01045                                                          { 
        // Scores a word list for the noise-blob search. Falls back to
        // eval_word_spacing () when fixsp_fp_eval is off. Otherwise each
        // character of a qualifying word adds 1 if accepted and subtracts 1 if
        // it looks like noise; the result is clamped to a minimum of 0.
01046   WERD_RES_IT word_it(&word_res_list); 
01047   WERD_RES *word;
01048   PBLOB_IT blob_it;
01049   INT16 word_length;
01050   INT16 score = 0;
01051   INT16 i;
01052   const char *chs;
        // Blobs scoring below this limit are treated as possible noise.
01053   float small_limit = bln_x_height * fixsp_small_outlines_size;
01054 
01055   if (!fixsp_fp_eval)
01056     return (eval_word_spacing (word_res_list));
01057 
01058   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01059     word = word_it.data ();
01060     word_length = word->reject_map.length ();
01061     chs = word->best_choice->string ().string ();
          // Only words that are done, tess_accepted, chosen by one of the three
          // DAWG permuters, or accepted by safe_dict_word () contribute.
01062     if ((word->done ||
01063       word->tess_accepted) ||
01064       (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01065       (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01066       (word->best_choice->permuter () == USER_DAWG_PERM) ||
01067     (safe_dict_word (chs) > 0)) {
            // Walks the outword's blobs in step with the characters.
            // NOTE(review): assumes outword is non-NULL and holds at least
            // word_length blobs - confirm.
01068       blob_it.set_to_list (word->outword->blob_list ());
01069       for (i = 0; i < word_length; i++, blob_it.forward ()) {
01070         if ((chs[i] == ' ') ||
01071           (blob_noise_score (blob_it.data ()) < small_limit))
01072           score -= 1;            //penalise possibly erroneous non-space
01073 
01074         else if (word->reject_map[i].accepted ())
01075           score++;
01076       }
01077     }
01078   }
01079   if (score < 0)
01080     score = 0;
01081   return score;
01082 }

void initialise_search ( WERD_RES_LIST &  src_list,
WERD_RES_LIST &  new_list 
)

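Copy every non-combination word from src_list into new_list, clearing the combination and part_of_combo flags on each copy.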

Parameters:
src_list source list
new_list new list
Returns:
none, new_list gets modified

Definition at line 218 of file fixspace.cpp.

References WERD_RES::combination, FALSE, and WERD_RES::part_of_combo.

Referenced by fix_fuzzy_space_list().

00218                                                                          { 
        // Builds the starting permutation for the spacing search: new_list
        // receives a copy of each word in src_list that is not a combination.
00219   WERD_RES_IT src_it(&src_list); 
00220   WERD_RES_IT new_it(&new_list); 
00221   WERD_RES *src_wd;
00222   WERD_RES *new_wd;
00223 
00224   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00225     src_wd = src_it.data ();
00226     if (!src_wd->combination) {
            // Copy the word, clear both combo flags on the copy, and append it
            // so that source order is kept.
00227       new_wd = new WERD_RES (*src_wd);
00228       new_wd->combination = FALSE;
00229       new_wd->part_of_combo = FALSE;
00230       new_it.add_after_then_move (new_wd);
00231     }
00232   }
00233 }

void match_current_words ( WERD_RES_LIST &  words,
ROW row 
)

Runs classify_word_pass2() on every word in the list that is not part of a combo and does not yet have an outword.

Parameters:
words list of words
row Row
Note:
Global:
Returns:

Definition at line 244 of file fixspace.cpp.

References classify_word_pass2(), NULL, WERD_RES::outword, and WERD_RES::part_of_combo.

Referenced by fix_fuzzy_space_list(), and fix_noisy_space_list().

00244                                                          { 
        // Classifies the words in the list that still need it.
00245   WERD_RES_IT word_it(&words); 
00246   WERD_RES *word;
00247 
00248   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00249     word = word_it.data ();
          // Skip members of a combination and words that already have an
          // outword.
00250     if ((!word->part_of_combo) && (word->outword == NULL))
00251       classify_word_pass2(word, row); 
00252   }
00253 }

void transform_to_next_perm ( WERD_RES_LIST &  words  ) 

Checks current word list to find smallest word gap size.

Parameters:
words list of words
Returns:
none
It then walks the word list, closing any gaps of this size by either inserting new combination words or extending existing ones.

The routine COULD be limited to stop it building words longer than N blobs.

If there are no more gaps then it DELETES the entire list and returns the empty list to cause termination.

Definition at line 433 of file fixspace.cpp.

References WERD_RES::best_choice, WERD::bounding_box(), WERD_RES::combination, WERD_RES::copy_on(), WERD_RES::done, FALSE, WERD::flag(), WERD::join_on(), BOX::left(), MAX_INT16, NULL, WERD_RES::outword, WERD_RES::part_of_combo, WERD_RES::raw_choice, BOX::right(), WERD::set_flag(), TRUE, W_EOL, and WERD_RES::word.

Referenced by fix_fuzzy_space_list().

// transform_to_next_perm: find the smallest gap between consecutive words
// that are not part of a combination, then merge every pair of words whose
// gap is no larger than that into a combination word. If no gap was
// measured at all, the list is cleared instead.
00433                                                   { 
00434   WERD_RES_IT word_it(&words); 
00435   WERD_RES_IT prev_word_it(&words); // tracks the word on the left of the gap being examined
00436   WERD_RES *word;
00437   WERD_RES *prev_word;
00438   WERD_RES *combo;
00439   WERD *copy_word;
00440   INT16 prev_right = -1;  // -1 means "no previous word seen yet"
00441   BOX box;
00442   INT16 gap;
00443   INT16 min_gap = MAX_INT16;  // stays MAX_INT16 if no gap is ever measured
00444 
        // Pass 1: measure the smallest gap (left edge minus previous right edge).
00445   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00446     word = word_it.data ();
00447     if (!word->part_of_combo) {  // members of a combination are not measured
00448       box = word->word->bounding_box ();
00449       if (prev_right >= 0) {
00450         gap = box.left () - prev_right;
00451         if (gap < min_gap)
00452           min_gap = gap;
00453       }
00454       prev_right = box.right ();
00455     }
00456   }
00457   if (min_gap < MAX_INT16) {
        // Pass 2: close every gap that is no larger than min_gap.
00458     prev_right = -1;  //back to start
00459     word_it.set_to_list (&words);
00460     for (;  //cant use cycle pt due to inserted combos at start of list
        // "prev_right < 0" admits the first iteration; after that the loop
        // runs until the iterator is back at the first element.
00461     (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
00462       word = word_it.data ();
00463       if (!word->part_of_combo) {
00464         box = word->word->bounding_box ();
00465         if (prev_right >= 0) {
00466           gap = box.left () - prev_right;
00467           if (gap <= min_gap) {
00468             prev_word = prev_word_it.data ();
00469             if (prev_word->combination)
00470               combo = prev_word;  // left-hand word is already a combination: extend it
00471             else {
00472               /* Make a new combination and insert before the first word being joined */
00473               copy_word = new WERD;
00474               *copy_word = *(prev_word->word);
00475               //deep copy
00476               combo = new WERD_RES (copy_word);
00477               combo->combination = TRUE;
00478               prev_word->part_of_combo = TRUE;
00479               prev_word_it.add_before_then_move (combo);  // prev_word_it now refers to the new combo
00480             }
                // The combination takes its end-of-line flag from the word being joined on.
00481             combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
00482             if (word->combination) {
00483               combo->word->join_on (word->word);
00484               //Move blbs to combo
00485               //old combo no longer needed
00486               delete word_it.extract ();
00487             }
00488             else {
00489               //Copy current wd to combo
00490               combo->copy_on (word);
00491               word->part_of_combo = TRUE;
00492             }
                // The combination has changed: clear its done flag and discard
                // the previous outword and choices.
00493             combo->done = FALSE;
00494             if (combo->outword != NULL) {
00495               delete combo->outword;
00496               delete combo->best_choice;
00497               delete combo->raw_choice;
00498               combo->outword = NULL;
00499               combo->best_choice = NULL;
00500               combo->raw_choice = NULL;
00501             }
00502           }
00503           else
00504               //catch up
00505               prev_word_it = word_it;
00506         }
00507         prev_right = box.right ();
00508       }
00509     }
00510   }
00511   else
00512     words.clear ();              //signal termination
00513 }

BOOL8 uniformly_spaced ( WERD_RES word  ) 

Test if spacing of word is uniform/sensible.

Parameters:
word Word of interest
Note:
Global: bln_x_height
Returns:
TRUE if uniform
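Returns TRUE if at least one of the following holds:
- no gaps were accumulated in the gap statistics;
- the largest gap is no greater than the normalised maximum non-space, i.e. (space + 3 * kern) / 4 scaled by bln_x_height / x_height;
- more than two gaps were accumulated and the largest gap is at most twice their median;
- at most two gaps were accumulated and the largest gap is at most twice their mean.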
Note:
REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!

Definition at line 597 of file fixspace.cpp.

References STATS::add(), WERD_RES::best_choice, WERD::blob_list(), STRING::contains(), WERD_RES::denorm, STATS::get_total(), ROW::kern(), BOX::left(), MAX_INT16, MAXSPACING, STATS::mean(), STATS::median(), WERD_RES::outword, BOX::right(), DENORM::row(), ROW::space(), tprintf(), and ROW::x_height().

Referenced by eval_word_spacing().

// uniformly_spaced: decide whether the largest gap between adjacent blobs of
// word->outword is acceptable, compared with the row's space/kern values and
// with the other gaps in the word. Returns the result of that test.
00598                                        {
00599   PBLOB_IT blob_it;
00600   BOX box;
00601   INT16 prev_right = -MAX_INT16;  // sentinel: no previous blob yet
00602   INT16 gap;
00603   INT16 max_gap = -MAX_INT16;     // largest gap seen so far (held out of gap_stats)
00604   INT16 max_gap_count = 0;        // how many times max_gap has occurred
00605   STATS gap_stats (0, MAXSPACING);
00606   BOOL8 result;
00607   const ROW *row = word->denorm.row ();
00608   float max_non_space;
00609   float normalised_max_nonspace;
00610   INT16 i = 0;                    // blob index; also indexes best_choice's string (assumes one character per blob - TODO confirm)
00611   STRING punct_chars = "\"`',.:;";
00612 
00613   blob_it.set_to_list (word->outword->blob_list ());
00614 
00615   for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00616     box = blob_it.data ()->bounding_box ();
        // The sentinel test short-circuits on the first blob, so [i - 1] is
        // only evaluated once i >= 1. When fixsp_ignore_punct is set, gaps
        // next to a character from punct_chars are left out.
00617     if ((prev_right > -MAX_INT16) &&
00618       (!fixsp_ignore_punct ||
00619       (!punct_chars.contains (word->best_choice->string ()[i - 1]) &&
00620     !punct_chars.contains (word->best_choice->string ()[i])))) {
00621       gap = box.left () - prev_right;
00622       if (gap < max_gap)
00623         gap_stats.add (gap, 1);
00624       else if (gap == max_gap)
00625         max_gap_count++;
00626       else {
            // New maximum: move the previous maximum into the statistics.
00627         if (max_gap_count > 0)
00628           gap_stats.add (max_gap, max_gap_count);
00629         max_gap = gap;
00630         max_gap_count = 1;
00631       }
00632     }
00633     prev_right = box.right ();
00634     i++;
00635   }
00636 
        // Weighted mix of the row's space and kern values, rescaled by
        // bln_x_height / row x_height.
00637   max_non_space = (row->space () + 3 * row->kern ()) / 4;
00638   normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
00639 
        // Accept if: nothing was accumulated in gap_stats; or the largest gap
        // is within the normalised non-space limit; or it is at most twice the
        // median (more than 2 entries) or twice the mean (2 or fewer entries).
00640   result = ((gap_stats.get_total () == 0) ||
00641     (max_gap <= normalised_max_nonspace) ||
00642     ((gap_stats.get_total () > 2) &&
00643     (max_gap <= 2 * gap_stats.median ())) ||
00644     ((gap_stats.get_total () <= 2) &&
00645     (max_gap <= 2 * gap_stats.mean ())));
00646   #ifndef SECURE_NAMES
00647   if ((debug_fix_space_level > 1)) {
00648     if (result)
00649       tprintf
00650         ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00651         word->best_choice->string ().string (), normalised_max_nonspace,
00652         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00653         gap_stats.median ());
00654     else
00655       tprintf
00656         ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00657         word->best_choice->string ().string (), normalised_max_nonspace,
00658         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00659         gap_stats.median ());
00660   }
00661   #endif
00662 
00663   return result;
00664 }

INT16 worst_noise_blob ( WERD_RES word_res,
float *  worst_noise_score 
)

Examine the blobs of the word and return the index of the worst one, i.e. the blob with the lowest noise score below the small-outline limit.

Parameters:
word_res Word results
worst_noise_score Modified by this function
Note:
Global: bln_x_height, fixsp_small_outlines_size, fixsp_non_noise_limit, debug_fix_space_level
Returns:
index of the worst blob, or -1 if no blob qualifies

Definition at line 880 of file fixspace.cpp.

References ASSERT_HOST, WERD_RES::best_choice, blob_count, WERD::blob_list(), blob_noise_score(), WERD_RES::outword, WERD_RES::reject_map, tprintf(), and worst_noise_blob().

Referenced by break_noisiest_blob_word(), fix_sp_fp_word(), and worst_noise_blob().

// worst_noise_blob: score every blob of word_res->outword and return the
// index of the lowest-scoring blob that lies between the first and last
// fixsp_non_noise_limit non-noise blobs, or -1 if there is none.
// *worst_noise_score is only written once the final search is reached; the
// earlier "return -1" paths leave it untouched.
00880                                                                      { 
00881   PBLOB_IT blob_it;
00882   INT16 blob_count;
00883   float noise_score[512];        // fixed-size buffer; guarded by the ASSERT_HOST below
00884   int i;
00885   int min_noise_blob;            //1st contender
00886   int max_noise_blob;            //last contender
00887   int non_noise_count;
00888   int worst_noise_blob;          //Worst blob (local with the same name as the function)
00889   float small_limit = bln_x_height * fixsp_small_outlines_size;
00890   float non_noise_limit = bln_x_height * 0.8;
00891 
00892   blob_it.set_to_list (word_res->outword->blob_list ());
00893   //normalised
00894   blob_count = blob_it.length ();
00895   ASSERT_HOST (blob_count <= 512);
00896   if (blob_count < 5)
00897     return -1;                   //too short to split
00898   /* Get the noise scores for all blobs */
00899 
00900   #ifndef SECURE_NAMES
00901   if (debug_fix_space_level > 5)
00902     tprintf ("FP fixspace Noise metrics for \"%s\": ",
00903       word_res->best_choice->string ().string ());
00904   #endif
00905 
00906   for (i = 0; i < blob_count; i++, blob_it.forward ()) {
        // Accepted blobs get exactly non_noise_limit, so they count as
        // non-noise in the scans below; the rest are scored from their outlines.
00907     if (word_res->reject_map[i].accepted ())
00908       noise_score[i] = non_noise_limit;
00909     else
00910       noise_score[i] = blob_noise_score (blob_it.data ());
00911 
        // NOTE(review): this debug output and the "\n" below are outside the
        // SECURE_NAMES guard used for the header line above.
00912     if (debug_fix_space_level > 5)
00913       tprintf ("%1.1f ", noise_score[i]);
00914   }
00915   if (debug_fix_space_level > 5)
00916     tprintf ("\n");
00917 
00918   /* Now find the worst one which is far enough away from the end of the word */
00919 
        // Scan from the left until fixsp_non_noise_limit non-noise blobs have
        // been passed; i is then the first index past them.
00920   non_noise_count = 0;
00921   for (i = 0;
00922   (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
00923     if (noise_score[i] >= non_noise_limit)
00924       non_noise_count++;
00925   }
00926   if (non_noise_count < fixsp_non_noise_limit)
00927     return -1;                   // not enough non-noise blobs in the word
00928   min_noise_blob = i;
00929 
        // Same scan from the right; i ends one index before the blobs passed.
00930   non_noise_count = 0;
00931   for (i = blob_count - 1;
00932   (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
00933     if (noise_score[i] >= non_noise_limit)
00934       non_noise_count++;
00935   }
00936   if (non_noise_count < fixsp_non_noise_limit)
00937     return -1;
00938   max_noise_blob = i;
00939 
00940   if (min_noise_blob > max_noise_blob)
00941     return -1;                   // the two margins leave no candidates
00942 
        // Pick the lowest score strictly below small_limit among the candidates.
00943   *worst_noise_score = small_limit;
00944   worst_noise_blob = -1;
00945   for (i = min_noise_blob; i <= max_noise_blob; i++) {
00946     if (noise_score[i] < *worst_noise_score) {
00947       worst_noise_blob = i;
00948       *worst_noise_score = noise_score[i];
00949     }
00950   }
00951   return worst_noise_blob;       // -1 if no candidate scored below small_limit
00952 }


Generated on Wed Feb 28 19:49:14 2007 for Tesseract by  doxygen 1.5.1