textord/topitch.cpp

Go to the documentation of this file.
00001 
00020 #include          "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include          <assert.h>
00023 #endif
00024 #include          "stderr.h"
00025 #include          "blobbox.h"
00026 #include          "lmedsq.h"
00027 #include          "statistc.h"
00028 #include          "drawtord.h"
00029 #include          "makerow.h"
00030 #include          "pitsync1.h"
00031 #include          "pithsync.h"
00032 #include          "blobcmpl.h"
00033 #include          "tovars.h"
00034 #include          "wordseg.h"
00035 #include          "topitch.h"
00036 #include          "secname.h"
00037 
00038 #define EXTERN
00039 
00040 EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
00043 EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
00044 "Debug on fixed pitch test");
00045 EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
00046 "Turn off dp fixed pitch algorithm");
00047 EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
00048 "Do even faster pitch algorithm");
00049 EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
00050 "Write full metric stuff");
00051 EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
00052 EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
00053 EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
00054 "Use correct answer for fixed/prop");
00055 EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
00056 "Attempt whole doc/block fixed pitch");
00057 EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
00058 EXTERN double_VAR (textord_balance_factor, 1.0,
00059 "Ding rate for unbalanced char cells");
00060 EXTERN double_VAR (textord_repch_width_variance, 0.2,
00061 "Max width change of gap/blob");
00064 #define FIXED_WIDTH_MULTIPLE  5
00065 #define BLOCK_STATS_CLUSTERS  10
00066 #define MAX_ALLOWED_PITCH 100    //max pixel pitch.
00067 
00075 void compute_fixed_pitch(                             //determine pitch
00076                          ICOORD page_tr,              //top right
00077                          TO_BLOCK_LIST *port_blocks,  //input list
00078                          float gradient,              //page skew
00079                          FCOORD rotation,             //for drawing
00080                          BOOL8 testing_on             //correct orientation
00081                         ) {
00082   TO_BLOCK_IT block_it;          //iterator
00083   TO_BLOCK *block;               //current block;
00084   TO_ROW_IT row_it;              //row iterator
00085   TO_ROW *row;                   //current row
00086   int block_index;               //block number
00087   int row_index;                 //row number
00088 
00089 #ifndef GRAPHICS_DISABLED
00090   if (textord_show_initial_words && testing_on) {
00091     if (to_win == NO_WINDOW)
00092       create_to_win(page_tr);
00093   }
00094 #endif
00095 
00096   block_it.set_to_list (port_blocks);
00097   block_index = 1;
00098   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00099   block_it.forward ()) {
00100     block = block_it.data ();
00101     compute_block_pitch(block, rotation, block_index, testing_on);
00102     block_index++;
00103   }
00104 
00105   if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
00106     block_index = 1;
00107     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00108     block_it.forward ()) {
00109       block = block_it.data ();
00110       if (!try_block_fixed (block, block_index))
00111         try_rows_fixed(block, block_index, testing_on);
00112       block_index++;
00113     }
00114   }
00115 
00116   block_index = 1;
00117   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00118   block_it.forward ()) {
00119     block = block_it.data ();
00120     row_it.set_to_list (block->get_rows ());
00121     row_index = 1;
00122     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00123       row = row_it.data ();
00124       fix_row_pitch(row, block, port_blocks, row_index, block_index);
00125       row_index++;
00126     }
00127     if (testing_on
00128       && (textord_debug_pitch_test && block->block->text_region () != NULL
00129     || textord_blocksall_fixed || textord_blocksall_prop)) {
00130       tprintf ("Corr:");
00131       print_block_counts(block, block_index);
00132     }
00133     block_index++;
00134   }
00135 #ifndef GRAPHICS_DISABLED
00136   if (textord_show_initial_words && testing_on) {
00137     overlap_picture_ops(TRUE);
00138   }
00139 #endif
00140 }
00141 
00142 
00149 void fix_row_pitch(                        //get some value
00150                    TO_ROW *bad_row,        //row to fix
00151                    TO_BLOCK *bad_block,    //block of bad_row
00152                    TO_BLOCK_LIST *blocks,  //blocks to scan
00153                    INT32 row_target,       //number of row
00154                    INT32 block_target      //number of block
00155                   ) {
00156   const char *res_string;        //decision on line
00157   INT16 mid_cuts;
00158   int block_votes;               //votes in block
00159   int like_votes;                //votes over page
00160   int other_votes;               //votes of unlike blocks
00161   int block_index;               //number of block
00162   int row_index;                 //number of row
00163   int maxwidth;                  //max pitch
00164   TO_BLOCK_IT block_it = blocks; //block iterator
00165   TO_ROW_IT row_it;
00166   TO_BLOCK *block;               //current block
00167   TO_ROW *row;                   //current row
00168   float sp_sd;                   //space deviation
00169   STATS block_stats;             //pitches in block
00170   STATS like_stats;              //pitches in page
00171 
00172   block_votes = like_votes = other_votes = 0;
00173   maxwidth = (INT32) ceil (bad_row->xheight * textord_words_maxspace);
00174   if (bad_row->pitch_decision != PITCH_DEF_FIXED
00175   && bad_row->pitch_decision != PITCH_DEF_PROP) {
00176     block_stats.set_range (0, maxwidth);
00177     like_stats.set_range (0, maxwidth);
00178     block_index = 1;
00179     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00180     block_it.forward ()) {
00181       block = block_it.data ();
00182       row_index = 1;
00183       row_it.set_to_list (block->get_rows ());
00184       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00185       row_it.forward ()) {
00186         row = row_it.data ();
00187         if (bad_row->all_caps
00188           && row->xheight + row->ascrise
00189           <
00190           (bad_row->xheight + bad_row->ascrise) * (1 +
00191           textord_pitch_rowsimilarity)
00192           && row->xheight + row->ascrise >
00193           (bad_row->xheight + bad_row->ascrise) * (1 -
00194           textord_pitch_rowsimilarity)
00195           || !bad_row->all_caps
00196           && row->xheight <
00197           bad_row->xheight * (1 + textord_pitch_rowsimilarity)
00198           && row->xheight >
00199         bad_row->xheight * (1 - textord_pitch_rowsimilarity)) {
00200           if (block_index == block_target) {
00201             if (row->pitch_decision == PITCH_DEF_FIXED) {
00202               block_votes += textord_words_veto_power;
00203               block_stats.add ((INT32) row->fixed_pitch,
00204                 textord_words_veto_power);
00205             }
00206             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00207             || row->pitch_decision == PITCH_CORR_FIXED) {
00208               block_votes++;
00209               block_stats.add ((INT32) row->fixed_pitch, 1);
00210             }
00211             else if (row->pitch_decision == PITCH_DEF_PROP)
00212               block_votes -= textord_words_veto_power;
00213             else if (row->pitch_decision == PITCH_MAYBE_PROP
00214               || row->pitch_decision == PITCH_CORR_PROP)
00215               block_votes--;
00216           }
00217           else {
00218             if (row->pitch_decision == PITCH_DEF_FIXED) {
00219               like_votes += textord_words_veto_power;
00220               like_stats.add ((INT32) row->fixed_pitch,
00221                 textord_words_veto_power);
00222             }
00223             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00224             || row->pitch_decision == PITCH_CORR_FIXED) {
00225               like_votes++;
00226               like_stats.add ((INT32) row->fixed_pitch, 1);
00227             }
00228             else if (row->pitch_decision == PITCH_DEF_PROP)
00229               like_votes -= textord_words_veto_power;
00230             else if (row->pitch_decision == PITCH_MAYBE_PROP
00231               || row->pitch_decision == PITCH_CORR_PROP)
00232               like_votes--;
00233           }
00234         }
00235         else {
00236           if (row->pitch_decision == PITCH_DEF_FIXED)
00237             other_votes += textord_words_veto_power;
00238           else if (row->pitch_decision == PITCH_MAYBE_FIXED
00239             || row->pitch_decision == PITCH_CORR_FIXED)
00240             other_votes++;
00241           else if (row->pitch_decision == PITCH_DEF_PROP)
00242             other_votes -= textord_words_veto_power;
00243           else if (row->pitch_decision == PITCH_MAYBE_PROP
00244             || row->pitch_decision == PITCH_CORR_PROP)
00245             other_votes--;
00246         }
00247         row_index++;
00248       }
00249       block_index++;
00250     }
00251     if (block_votes > textord_words_veto_power) {
00252       bad_row->fixed_pitch = block_stats.ile (0.5);
00253       bad_row->pitch_decision = PITCH_CORR_FIXED;
00254     }
00255     else if (block_votes <= textord_words_veto_power && like_votes > 0) {
00256       bad_row->fixed_pitch = like_stats.ile (0.5);
00257       bad_row->pitch_decision = PITCH_CORR_FIXED;
00258     }
00259     else {
00260       bad_row->pitch_decision = PITCH_CORR_PROP;
00261       #ifndef SECURE_NAMES
00262       if (block_votes == 0 && like_votes == 0 && other_votes > 0
00263         && (textord_debug_pitch_test || textord_debug_pitch_metric))
00264         tprintf
00265           ("Warning:row %d of block %d set prop with no like rows against trend\n",
00266           row_target, block_target);
00267       #endif
00268     }
00269   }
00270   if (textord_debug_pitch_metric) {
00271     tprintf (":b_votes=%d:l_votes=%d:o_votes=%d",
00272       block_votes, like_votes, other_votes);
00273     if (bad_row->pitch_decision == PITCH_CORR_PROP
00274     || bad_row->pitch_decision == PITCH_DEF_PROP) {
00275       res_string = bad_block->block->text_region () != NULL ?
00276         (bad_block->block->text_region ()->
00277         is_prop ()? "CP" : "WP") : "XP";
00278     }
00279     else {
00280       res_string = bad_block->block->text_region () != NULL ?
00281         (bad_block->block->text_region ()->
00282         is_prop ()? "WF" : "CF") : "XF";
00283     }
00284     tprintf (":Blk=%d:Row=%d:%c:",
00285       block_target, row_target,
00286       bad_block->block->text_region () != NULL ?
00287       (bad_block->block->text_region ()->
00288       is_prop ()? 'P' : 'F') : 'X');
00289     tprintf ("x=%g:asc=%g:corr_res=%s\n", bad_row->xheight,
00290       bad_row->ascrise, res_string);
00291   }
00292   if (textord_pitch_cheat && bad_block->block->text_region () != NULL)
00293     bad_row->pitch_decision =
00294       bad_block->block->text_region ()->
00295       is_prop ()? PITCH_CORR_PROP : PITCH_CORR_FIXED;
00296   if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
00297     if (bad_row->fixed_pitch < textord_min_xheight) {
00298       if (block_votes > 0)
00299         bad_row->fixed_pitch = block_stats.ile (0.5);
00300       else if (block_votes == 0 && like_votes > 0)
00301         bad_row->fixed_pitch = like_stats.ile (0.5);
00302       else {
00303         tprintf
00304           ("Warning:guessing pitch as xheight on row %d, block %d\n",
00305           row_target, block_target);
00306         bad_row->fixed_pitch = bad_row->xheight;
00307       }
00308     }
00309     if (bad_row->fixed_pitch < textord_min_xheight)
00310       bad_row->fixed_pitch = (float) textord_min_xheight;
00311     bad_row->kern_size = bad_row->fixed_pitch / 4;
00312     bad_row->min_space = (INT32) (bad_row->fixed_pitch * 0.6);
00313     bad_row->max_nonspace = (INT32) (bad_row->fixed_pitch * 0.4);
00314     bad_row->space_threshold =
00315       (bad_row->min_space + bad_row->max_nonspace) / 2;
00316     bad_row->space_size = bad_row->fixed_pitch;
00317     if (bad_row->char_cells.empty ())
00318       tune_row_pitch (bad_row, &bad_row->projection,
00319         bad_row->projection_left, bad_row->projection_right,
00320         (bad_row->fixed_pitch +
00321         bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
00322         sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
00323   }
00324   else if (bad_row->pitch_decision == PITCH_CORR_PROP
00325   || bad_row->pitch_decision == PITCH_DEF_PROP) {
00326     bad_row->fixed_pitch = 0.0f;
00327     bad_row->char_cells.clear ();
00328   }
00329 }
00330 
00331 
00337 void compute_block_pitch(                    //process each block
00338                          TO_BLOCK *block,    //input list
00339                          FCOORD rotation,    //for drawing
00340                          INT32 block_index,  //block number
00341                          BOOL8 testing_on    //correct orientation
00342                         ) {
00343   BOX block_box;                 //bounding box
00344 
00345   block_box = block->block->bounding_box ();
00346   if (testing_on && textord_debug_pitch_test) {
00347     tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
00348       block_index,
00349       block_box.left (), block_box.bottom (),
00350       block_box.right (), block_box.top ());
00351   }
00352   block->min_space = (INT32) floor (block->xheight
00353     * textord_words_default_minspace);
00354   block->max_nonspace = (INT32) ceil (block->xheight
00355     * textord_words_default_nonspace);
00356   block->fixed_pitch = 0.0f;
00357   block->space_size = (float) block->min_space;
00358   block->kern_size = (float) block->max_nonspace;
00359   block->pr_nonsp = block->xheight * words_default_prop_nonspace;
00360   block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
00361   if (!block->get_rows ()->empty ()) {
00362     ASSERT_HOST (block->xheight > 0);
00363     if (textord_repeat_extraction)
00364       find_repeated_chars(block, textord_show_initial_words &&testing_on);
00365 #ifndef GRAPHICS_DISABLED
00366     if (textord_show_initial_words && testing_on)
00367       overlap_picture_ops(TRUE);
00368 #endif
00369     compute_rows_pitch(block,
00370                        block_index,
00371                        textord_debug_pitch_test &&testing_on);
00372   }
00373 }
00374 
00375 
00381 BOOL8 compute_rows_pitch(                    //find line stats
00382                          TO_BLOCK *block,    //block to do
00383                          INT32 block_index,  //block number
00384                          BOOL8 testing_on    //correct orientation
00385                         ) {
00386   INT32 maxwidth;                //of spaces
00387   TO_ROW *row;                   //current row
00388   INT32 row_index;               //row number.
00389   float lower, upper;            //cluster thresholds
00390   TO_ROW_IT row_it = block->get_rows ();
00391 
00392   row_index = 1;
00393   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00394     row = row_it.data ();
00395     ASSERT_HOST (row->xheight > 0);
00396     row->compute_vertical_projection ();
00397     maxwidth = (INT32) ceil (row->xheight * textord_words_maxspace);
00398     if (row_pitch_stats (row, maxwidth, testing_on)
00399       && find_row_pitch (row, maxwidth,
00400       textord_dotmatrix_gap + 1, block, block_index,
00401     row_index, testing_on)) {
00402       if (row->fixed_pitch == 0) {
00403         lower = row->pr_nonsp;
00404         upper = row->pr_space;
00405         row->space_size = upper;
00406         row->kern_size = lower;
00407       }
00408     }
00409     else {
00410       row->fixed_pitch = 0.0f;   //insufficient data
00411       row->pitch_decision = PITCH_DUNNO;
00412     }
00413     row_index++;
00414   }
00415   return FALSE;
00416 }
00417 
00418 
00424 BOOL8 try_doc_fixed(                             //determine pitch
00425                     ICOORD page_tr,              //top right
00426                     TO_BLOCK_LIST *port_blocks,  //input list
00427                     float gradient               //page skew
00428                    ) {
00429   INT16 master_x;                //uniform shifts
00430   INT16 pitch;                   //median pitch.
00431   int x;                         //profile coord
00432   int prop_blocks;               //correct counts
00433   int fixed_blocks;
00434   int total_row_count;           //total in page
00435                                  //iterator
00436   TO_BLOCK_IT block_it = port_blocks;
00437   TO_BLOCK *block;               //current block;
00438   TO_ROW_IT row_it;              //row iterator
00439   TO_ROW *row;                   //current row
00440   INT16 projection_left;         //edges
00441   INT16 projection_right;
00442   INT16 row_left;                //edges of row
00443   INT16 row_right;
00444   ICOORDELT_LIST *master_cells;  //cells for page
00445   float master_y;                //uniform shifts
00446   float shift_factor;            //page skew correction
00447   float row_shift;               //shift for row
00448   float final_pitch;             //output pitch
00449   float row_y;                   //baseline
00450   STATS projection;              //entire page
00451   STATS pitches (0, MAX_ALLOWED_PITCH);
00452   //for median
00453   float sp_sd;                   //space sd
00454   INT16 mid_cuts;                //no of cheap cuts
00455   float pitch_sd;                //sync rating
00456 
00457   if (block_it.empty ()
00458     //      || block_it.data()==block_it.data_relative(1)
00459     || !textord_blockndoc_fixed)
00460     return FALSE;
00461   shift_factor = gradient / (gradient * gradient + 1);
00462   row_it.set_to_list (block_it.data ()->get_rows ());
00463   master_x = row_it.data ()->projection_left;
00464   master_y = row_it.data ()->baseline.y (master_x);
00465   projection_left = MAX_INT16;
00466   projection_right = -MAX_INT16;
00467   prop_blocks = 0;
00468   fixed_blocks = 0;
00469   total_row_count = 0;
00470 
00471   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00472   block_it.forward ()) {
00473     block = block_it.data ();
00474     if (block->block->text_region () != NULL) {
00475       if (block->block->text_region ()->is_prop ())
00476         prop_blocks++;
00477       else
00478         fixed_blocks++;
00479     }
00480     row_it.set_to_list (block->get_rows ());
00481     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00482       row = row_it.data ();
00483       total_row_count++;
00484       if (row->fixed_pitch > 0)
00485         pitches.add ((INT32) (row->fixed_pitch), 1);
00486       //find median
00487       row_y = row->baseline.y (master_x);
00488       row_left =
00489         (INT16) (row->projection_left -
00490         shift_factor * (master_y - row_y));
00491       row_right =
00492         (INT16) (row->projection_right -
00493         shift_factor * (master_y - row_y));
00494       if (row_left < projection_left)
00495         projection_left = row_left;
00496       if (row_right > projection_right)
00497         projection_right = row_right;
00498     }
00499   }
00500   if (pitches.get_total () == 0)
00501     return FALSE;
00502   projection.set_range (projection_left, projection_right);
00503 
00504   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00505   block_it.forward ()) {
00506     block = block_it.data ();
00507     row_it.set_to_list (block->get_rows ());
00508     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00509       row = row_it.data ();
00510       row_y = row->baseline.y (master_x);
00511       row_left =
00512         (INT16) (row->projection_left -
00513         shift_factor * (master_y - row_y));
00514       for (x = row->projection_left; x < row->projection_right;
00515       x++, row_left++) {
00516         projection.add (row_left, row->projection.pile_count (x));
00517       }
00518     }
00519   }
00520 
00521   row_it.set_to_list (block_it.data ()->get_rows ());
00522   row = row_it.data ();
00523 #ifndef GRAPHICS_DISABLED
00524   if (textord_show_page_cuts && to_win != NO_WINDOW)
00525     projection.plot (to_win, projection_left,
00526       row->intercept (), 1.0f, -1.0f, CORAL);
00527 #endif
00528   final_pitch = pitches.ile (0.5);
00529   pitch = (INT16) final_pitch;
00530   pitch_sd =
00531     tune_row_pitch (row, &projection, projection_left, projection_right,
00532     pitch * 0.75, final_pitch, sp_sd, mid_cuts,
00533     &row->char_cells, FALSE);
00534 
00535   if (textord_debug_pitch_metric)
00536     tprintf
00537       ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
00538       prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
00539       pitch_sd / total_row_count, pitch_sd / pitch,
00540       pitch_sd / total_row_count / pitch);
00541 
00542 #ifndef GRAPHICS_DISABLED
00543   if (textord_show_page_cuts && to_win != NO_WINDOW) {
00544     master_cells = &row->char_cells;
00545     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00546     block_it.forward ()) {
00547       block = block_it.data ();
00548       row_it.set_to_list (block->get_rows ());
00549       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00550       row_it.forward ()) {
00551         row = row_it.data ();
00552         row_y = row->baseline.y (master_x);
00553         row_shift = shift_factor * (master_y - row_y);
00554         plot_row_cells(to_win, GOLDENROD, row, row_shift, master_cells);
00555       }
00556     }
00557   }
00558 #endif
00559   row->char_cells.clear ();
00560   return FALSE;
00561 }
00562 
00563 
00569 BOOL8 try_block_fixed(                   //find line stats
00570                       TO_BLOCK *block,   //block to do
00571                       INT32 block_index  //block number
00572                      ) {
00573   return FALSE;
00574 }
00575 
00576 
00582 BOOL8 try_rows_fixed(                    //find line stats
00583                      TO_BLOCK *block,    //block to do
00584                      INT32 block_index,  //block number
00585                      BOOL8 testing_on    //correct orientation
00586                     ) {
00587   INT32 maxwidth;                //of spaces
00588   TO_ROW *row;                   //current row
00589   INT32 row_index;               //row number.
00590   INT32 def_fixed = 0;           //counters
00591   INT32 def_prop = 0;
00592   INT32 maybe_fixed = 0;
00593   INT32 maybe_prop = 0;
00594   INT32 dunno = 0;
00595   INT32 corr_fixed = 0;
00596   INT32 corr_prop = 0;
00597   float lower, upper;            //cluster thresholds
00598   TO_ROW_IT row_it = block->get_rows ();
00599 
00600   row_index = 1;
00601   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00602     row = row_it.data ();
00603     ASSERT_HOST (row->xheight > 0);
00604     maxwidth = (INT32) ceil (row->xheight * textord_words_maxspace);
00605     if (row->fixed_pitch > 0 && fixed_pitch_row (row, block_index)) {
00606       if (row->fixed_pitch == 0) {
00607         lower = row->pr_nonsp;
00608         upper = row->pr_space;
00609         row->space_size = upper;
00610         row->kern_size = lower;
00611       }
00612     }
00613     row_index++;
00614   }
00615   count_block_votes(block,
00616                     def_fixed,
00617                     def_prop,
00618                     maybe_fixed,
00619                     maybe_prop,
00620                     corr_fixed,
00621                     corr_prop,
00622                     dunno);
00623   if (testing_on
00624     && (textord_debug_pitch_test
00625   || textord_blocksall_prop || textord_blocksall_fixed)) {
00626     tprintf ("Initially:");
00627     print_block_counts(block, block_index);
00628   }
00629   if (def_fixed > def_prop * textord_words_veto_power)
00630     block->pitch_decision = PITCH_DEF_FIXED;
00631   else if (def_prop > def_fixed * textord_words_veto_power)
00632     block->pitch_decision = PITCH_DEF_PROP;
00633   else if (def_fixed > 0 || def_prop > 0)
00634     block->pitch_decision = PITCH_DUNNO;
00635   else if (maybe_fixed > maybe_prop * textord_words_veto_power)
00636     block->pitch_decision = PITCH_MAYBE_FIXED;
00637   else if (maybe_prop > maybe_fixed * textord_words_veto_power)
00638     block->pitch_decision = PITCH_MAYBE_PROP;
00639   else
00640     block->pitch_decision = PITCH_DUNNO;
00641   return FALSE;
00642 }
00643 
00644 
00650 void print_block_counts(                   //find line stats
00651                         TO_BLOCK *block,   //block to do
00652                         INT32 block_index  //block number
00653                        ) {
00654   INT32 def_fixed = 0;           //counters
00655   INT32 def_prop = 0;
00656   INT32 maybe_fixed = 0;
00657   INT32 maybe_prop = 0;
00658   INT32 dunno = 0;
00659   INT32 corr_fixed = 0;
00660   INT32 corr_prop = 0;
00661 
00662   count_block_votes(block,
00663                     def_fixed,
00664                     def_prop,
00665                     maybe_fixed,
00666                     maybe_prop,
00667                     corr_fixed,
00668                     corr_prop,
00669                     dunno);
00670   tprintf ("Block %d has (%d,%d,%d)",
00671     block_index, def_fixed, maybe_fixed, corr_fixed);
00672   if ((textord_blocksall_prop
00673     || block->block->text_region () != NULL
00674     && block->block->text_region ()->is_prop ()) && (def_fixed
00675     || maybe_fixed
00676     || corr_fixed))
00677     tprintf (" (Wrongly)");
00678   tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
00679   if ((textord_blocksall_fixed
00680     || block->block->text_region () != NULL
00681     && !block->block->text_region ()->is_prop ()) && (def_prop
00682     || maybe_prop
00683     || corr_prop))
00684     tprintf (" (Wrongly)");
00685   tprintf (" prop, %d dunno\n", dunno);
00686 }
00687 
00688 
00694 void count_block_votes(                   //find line stats
00695                        TO_BLOCK *block,   //block to do
00696                        INT32 &def_fixed,  //add to counts
00697                        INT32 &def_prop,
00698                        INT32 &maybe_fixed,
00699                        INT32 &maybe_prop,
00700                        INT32 &corr_fixed,
00701                        INT32 &corr_prop,
00702                        INT32 &dunno) {
00703   TO_ROW *row;                   //current row
00704   TO_ROW_IT row_it = block->get_rows ();
00705 
00706   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00707     row = row_it.data ();
00708     switch (row->pitch_decision) {
00709       case PITCH_DUNNO:
00710         dunno++;
00711         break;
00712       case PITCH_DEF_PROP:
00713         def_prop++;
00714         break;
00715       case PITCH_MAYBE_PROP:
00716         maybe_prop++;
00717         break;
00718       case PITCH_DEF_FIXED:
00719         def_fixed++;
00720         break;
00721       case PITCH_MAYBE_FIXED:
00722         maybe_fixed++;
00723         break;
00724       case PITCH_CORR_PROP:
00725         corr_prop++;
00726         break;
00727       case PITCH_CORR_FIXED:
00728         corr_fixed++;
00729         break;
00730     }
00731   }
00732 }
00733 
00734 
00740 BOOL8 row_pitch_stats(                  //find line stats
00741                       TO_ROW *row,      //current row
00742                       INT32 maxwidth,   //of spaces
00743                       BOOL8 testing_on  //correct orientation
00744                      ) {
00745   BLOBNBOX *blob;                //current blob
00746   int gap_index;                 //current gap
00747   INT32 prev_x;                  //end of prev blob
00748   INT32 cluster_count;           //no of clusters
00749   INT32 prev_count;              //of clusters
00750   INT32 smooth_factor;           //for smoothing stats
00751   BOX blob_box;                  //bounding box
00752   float lower, upper;            //cluster thresholds
00753                                  //gap sizes
00754   float gaps[BLOCK_STATS_CLUSTERS];
00755                                  //blobs
00756   BLOBNBOX_IT blob_it = row->blob_list ();
00757   STATS gap_stats (0, maxwidth);
00758   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
00759   //clusters
00760 
00761   smooth_factor =
00762     (INT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
00763   if (!blob_it.empty ()) {
00764     prev_x = blob_it.data ()->bounding_box ().right ();
00765     blob_it.forward ();
00766     while (!blob_it.at_first ()) {
00767       blob = blob_it.data ();
00768       if (!blob->joined_to_prev ()) {
00769         blob_box = blob->bounding_box ();
00770         if (blob_box.left () - prev_x < maxwidth)
00771           gap_stats.add (blob_box.left () - prev_x, 1);
00772         prev_x = blob_box.right ();
00773       }
00774       blob_it.forward ();
00775     }
00776   }
00777   if (gap_stats.get_total () == 0) {
00778     return FALSE;
00779   }
00780   cluster_count = 0;
00781   lower = row->xheight * words_initial_lower;
00782   upper = row->xheight * words_initial_upper;
00783   gap_stats.smooth (smooth_factor);
00784   do {
00785     prev_count = cluster_count;
00786     cluster_count = gap_stats.cluster (lower, upper,
00787       textord_spacesize_ratioprop,
00788       BLOCK_STATS_CLUSTERS, cluster_stats);
00789   }
00790   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
00791   if (cluster_count < 1) {
00792     return FALSE;
00793   }
00794   for (gap_index = 0; gap_index < cluster_count; gap_index++)
00795     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00796   //get medians
00797   if (testing_on) {
00798     tprintf ("cluster_count=%d:", cluster_count);
00799     for (gap_index = 0; gap_index < cluster_count; gap_index++)
00800       tprintf (" %g(%d)", gaps[gap_index],
00801         cluster_stats[gap_index + 1].get_total ());
00802     tprintf ("\n");
00803   }
00804   qsort (gaps, cluster_count, sizeof (float), sort_floats2);
00805 
00806   //Try to find proportional non-space and space for row.
00807   lower = row->xheight * words_default_prop_nonspace;
00808   upper = row->xheight * textord_words_min_minspace;
00809   for (gap_index = 0; gap_index < cluster_count
00810     && gaps[gap_index] < lower; gap_index++);
00811   if (gap_index == 0) {
00812     if (testing_on)
00813       tprintf ("No clusters below nonspace threshold!!\n");
00814     if (cluster_count > 1) {
00815       row->pr_nonsp = gaps[0];
00816       row->pr_space = gaps[1];
00817     }
00818     else {
00819       row->pr_nonsp = lower;
00820       row->pr_space = gaps[0];
00821     }
00822   }
00823   else {
00824     row->pr_nonsp = gaps[gap_index - 1];
00825     while (gap_index < cluster_count && gaps[gap_index] < upper)
00826       gap_index++;
00827     if (gap_index == cluster_count) {
00828       if (testing_on)
00829         tprintf ("No clusters above nonspace threshold!!\n");
00830       row->pr_space = lower * textord_spacesize_ratioprop;
00831     }
00832     else
00833       row->pr_space = gaps[gap_index];
00834   }
00835 
00836   //Now try to find the fixed pitch space and non-space.
00837   upper = row->xheight * words_default_fixed_space;
00838   for (gap_index = 0; gap_index < cluster_count
00839     && gaps[gap_index] < upper; gap_index++);
00840   if (gap_index == 0) {
00841     if (testing_on)
00842       tprintf ("No clusters below space threshold!!\n");
00843     row->fp_nonsp = upper;
00844     row->fp_space = gaps[0];
00845   }
00846   else {
00847     row->fp_nonsp = gaps[gap_index - 1];
00848     if (gap_index == cluster_count) {
00849       if (testing_on)
00850         tprintf ("No clusters above space threshold!!\n");
00851       row->fp_space = row->xheight;
00852     }
00853     else
00854       row->fp_space = gaps[gap_index];
00855   }
00856   if (testing_on) {
00857     tprintf
00858       ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
00859       row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
00860   }
00861   return TRUE;                   //computed some stats
00862 }
00863 
00864 
00872 BOOL8 find_row_pitch(                    //find lines
00873                      TO_ROW *row,        //row to do
00874                      INT32 maxwidth,     //max permitted space
00875                      INT32 dm_gap,       //ignorable gaps
00876                      TO_BLOCK *block,    //block of row
00877                      INT32 block_index,  //block_number
00878                      INT32 row_index,    //number of row
00879                      BOOL8 testing_on    //correct orientation
00880                     ) {
00881   BOOL8 used_dm_model;           //looks lik dot matrix
00882   float min_space;               //estimate threshold
00883   float non_space;               //gap size
00884   float gap_iqr;                 //interquartile range
00885   float pitch_iqr;
00886   float dm_gap_iqr;              //interquartile range
00887   float dm_pitch_iqr;
00888   float dm_pitch;                //pitch with dm on
00889   float pitch;                   //revised estimate
00890   float initial_pitch;           //guess at pitch
00891   STATS gap_stats (0, maxwidth);
00892                                  //centre-centre
00893   STATS pitch_stats (0, maxwidth);
00894 
00895   row->fixed_pitch = 0.0f;
00896   initial_pitch = row->fp_space;
00897   if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
00898     initial_pitch = row->xheight;//keep pitch decent
00899   non_space = row->fp_nonsp;
00900   if (non_space > initial_pitch)
00901     non_space = initial_pitch;
00902   min_space = (initial_pitch + non_space) / 2;
00903 
00904   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
00905   initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
00906     dm_gap_iqr = 0.0001;
00907     dm_pitch_iqr = maxwidth * 2.0f;
00908     dm_pitch = initial_pitch;
00909   }
00910   else {
00911     dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00912     dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00913     dm_pitch = pitch_stats.ile (0.5);
00914   }
00915   gap_stats.clear ();
00916   pitch_stats.clear ();
00917   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,
00918   initial_pitch, min_space, TRUE, FALSE, 0)) {
00919     gap_iqr = 0.0001;
00920     pitch_iqr = maxwidth * 3.0f;
00921   }
00922   else {
00923     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00924     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00925     if (testing_on)
00926       tprintf
00927         ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00928         initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00929     initial_pitch = pitch_stats.ile (0.5);
00930     if (min_space > initial_pitch
00931       && count_pitch_stats (row, &gap_stats, &pitch_stats,
00932     initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
00933       min_space = initial_pitch;
00934       gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00935       pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00936       if (testing_on)
00937         tprintf
00938           ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00939           initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00940       initial_pitch = pitch_stats.ile (0.5);
00941     }
00942   }
00943   if (textord_debug_pitch_metric)
00944     tprintf ("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
00945       block_index, row_index,
00946       block->block->text_region () != NULL ?
00947       (block->block->text_region ()->is_prop ()? 'P' : 'F') : 'X',
00948     pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
00949     pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D'
00950     : (pitch_iqr * dm_gap_iqr <=
00951     dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
00952   if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {
00953     row->pitch_decision = PITCH_DUNNO;
00954     if (textord_debug_pitch_metric)
00955       tprintf ("\n");
00956     return FALSE;                //insufficient data
00957   }
00958   if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {
00959     if (testing_on)
00960       tprintf
00961         ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00962         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00963     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00964     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00965     pitch = pitch_stats.ile (0.5);
00966     used_dm_model = FALSE;
00967   }
00968   else {
00969     if (testing_on)
00970       tprintf
00971         ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00972         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00973     gap_iqr = dm_gap_iqr;
00974     pitch_iqr = dm_pitch_iqr;
00975     pitch = dm_pitch;
00976     used_dm_model = TRUE;
00977   }
00978   if (textord_debug_pitch_metric) {
00979     tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
00980       pitch_iqr, gap_iqr, pitch);
00981     tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
00982       pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
00983       pitch_iqr < gap_iqr * textord_fpiqr_ratio
00984       && pitch_iqr < block->xheight * textord_max_pitch_iqr
00985       && pitch < block->xheight * textord_words_default_maxspace
00986       ? 'F' : 'P');
00987   }
00988   if (pitch_iqr < gap_iqr * textord_fpiqr_ratio
00989     && pitch_iqr < block->xheight * textord_max_pitch_iqr
00990     && pitch < block->xheight * textord_words_default_maxspace)
00991     row->pitch_decision = PITCH_MAYBE_FIXED;
00992   else
00993     row->pitch_decision = PITCH_MAYBE_PROP;
00994   row->fixed_pitch = pitch;
00995   row->kern_size = gap_stats.ile (0.5);
00996   row->min_space = (INT32) (row->fixed_pitch + non_space) / 2;
00997   if (row->min_space > row->fixed_pitch)
00998     row->min_space = (INT32) row->fixed_pitch;
00999   row->max_nonspace = row->min_space;
01000   row->space_size = row->fixed_pitch;
01001   row->space_threshold = (row->max_nonspace + row->min_space) / 2;
01002   row->used_dm_model = used_dm_model;
01003   return TRUE;
01004 }
01005 
01006 
01015 BOOL8 fixed_pitch_row(                   //find lines
01016                       TO_ROW *row,       //row to do
01017                       INT32 block_index  //block_number
01018                      ) {
01019   const char *res_string;        //pitch result
01020   INT16 mid_cuts;                //no of cheap cuts
01021   float non_space;               //gap size
01022   float pitch_sd;                //error on pitch
01023   float sp_sd;                   //space sd
01024 
01025   non_space = row->fp_nonsp;
01026   if (non_space > row->fixed_pitch)
01027     non_space = row->fixed_pitch;
01028   if (textord_all_prop) {
01029     // Set the decision to definitely proportional.
01030     pitch_sd = textord_words_def_prop * row->fixed_pitch;
01031     row->pitch_decision = PITCH_DEF_PROP;
01032   } else {
01033     pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
01034                                row->projection_right,
01035                                (row->fixed_pitch + non_space * 3) / 4,
01036                                row->fixed_pitch, sp_sd, mid_cuts,
01037                                &row->char_cells,
01038                                block_index == textord_debug_block);
01039     if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch
01040       && ((pitsync_linear_version & 3) < 3
01041       || (pitsync_linear_version & 3) >= 3 && (row->used_dm_model
01042       || sp_sd > 20
01043       || pitch_sd == 0
01044     && sp_sd > 10))) {
01045       if (pitch_sd < textord_words_def_fixed * row->fixed_pitch
01046         && !row->all_caps
01047         && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
01048         row->pitch_decision = PITCH_DEF_FIXED;
01049       else
01050         row->pitch_decision = PITCH_MAYBE_FIXED;
01051     }
01052     else if ((pitsync_linear_version & 3) < 3
01053       || sp_sd > 20
01054       || mid_cuts > 0
01055       || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
01056       if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
01057         row->pitch_decision = PITCH_MAYBE_PROP;
01058       else
01059         row->pitch_decision = PITCH_DEF_PROP;
01060     }
01061     else
01062       row->pitch_decision = PITCH_DUNNO;
01063   }
01064 
01065   if (textord_debug_pitch_metric) {
01066     res_string = "??";
01067     switch (row->pitch_decision) {
01068       case PITCH_DEF_PROP:
01069         res_string = "DP";
01070         break;
01071       case PITCH_MAYBE_PROP:
01072         res_string = "MP";
01073         break;
01074       case PITCH_DEF_FIXED:
01075         res_string = "DF";
01076         break;
01077       case PITCH_MAYBE_FIXED:
01078         res_string = "MF";
01079       default:
01080         res_string = "??";
01081     }
01082     tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
01083       pitch_sd / row->fixed_pitch, sp_sd, res_string);
01084   }
01085   return TRUE;
01086 }
01087 
01088 
01098 BOOL8 count_pitch_stats(                       //find lines
01099                         TO_ROW *row,           //row to do
01100                         STATS *gap_stats,      //blob gaps
01101                         STATS *pitch_stats,    //centre-centre stats
01102                         float initial_pitch,   //guess at pitch
01103                         float min_space,       //estimate space size
01104                         BOOL8 ignore_outsize,  //discard big objects
01105                         BOOL8 split_outsize,   //split big objects
01106                         INT32 dm_gap           //ignorable gaps
01107                        ) {
01108   BOOL8 prev_valid;              //not word broken
01109   BLOBNBOX *blob;                //current blob
01110                                  //blobs
01111   BLOBNBOX_IT blob_it = row->blob_list ();
01112   INT32 prev_right;              //end of prev blob
01113   INT32 prev_centre;             //centre of previous blob
01114   INT32 x_centre;                //centre of this blob
01115   INT32 blob_width;              //width of blob
01116   INT32 width_units;             //no of widths in blob
01117   float width;                   //blob width
01118   BOX blob_box;                  //bounding box
01119   BOX joined_box;                //of super blob
01120 
01121   gap_stats->clear ();
01122   pitch_stats->clear ();
01123   if (blob_it.empty ())
01124     return FALSE;
01125   prev_valid = FALSE;
01126   prev_centre = 0;
01127   prev_right = 0;                //stop complier warning
01128   joined_box = blob_it.data ()->bounding_box ();
01129   do {
01130     blob_it.forward ();
01131     blob = blob_it.data ();
01132     if (!blob->joined_to_prev ()) {
01133       blob_box = blob->bounding_box ();
01134       if (blob_box.left () - joined_box.right () < dm_gap
01135         && !blob_it.at_first ()
01136         || blob->cblob () == NULL && blob->blob () == NULL)
01137         joined_box += blob_box;  //merge blobs
01138       else {
01139         blob_width = joined_box.width ();
01140         if (split_outsize) {
01141           width_units =
01142             (INT32) floor ((float) blob_width / initial_pitch + 0.5);
01143           if (width_units < 1)
01144             width_units = 1;
01145           width_units--;
01146         }
01147         else if (ignore_outsize) {
01148           width = (float) blob_width / initial_pitch;
01149           width_units = width < 1 + words_default_fixed_limit
01150             && width > 1 - words_default_fixed_limit ? 0 : -1;
01151         }
01152         else
01153           width_units = 0;       //everything in
01154         x_centre = (INT32) (joined_box.left ()
01155           + (blob_width -
01156           width_units * initial_pitch) / 2);
01157         if (prev_valid && width_units >= 0) {
01158           // if (width_units>0)
01159           // {
01160           //   tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
01161           //         width_units,blob_width,x_centre,x_centre-prev_centre);
01162           // }
01163           gap_stats->add (joined_box.left () - prev_right, 1);
01164           pitch_stats->add (x_centre - prev_centre, 1);
01165         }
01166         prev_centre = (INT32) (x_centre + width_units * initial_pitch);
01167         prev_right = joined_box.right ();
01168         prev_valid = blob_box.left () - joined_box.right () < min_space;
01169         prev_valid = prev_valid && width_units >= 0;
01170         joined_box = blob_box;
01171       }
01172     }
01173   }
01174   while (!blob_it.at_first ());
01175   return gap_stats->get_total () >= 3;
01176 }
01177 
01178 
01185 float tune_row_pitch(                             //find fp cells
01186                      TO_ROW *row,                 //row to do
01187                      STATS *projection,           //vertical projection
01188                      INT16 projection_left,       //edge of projection
01189                      INT16 projection_right,      //edge of projection
01190                      float space_size,            //size of blank
01191                      float &initial_pitch,        //guess at pitch
01192                      float &best_sp_sd,           //space sd
01193                      INT16 &best_mid_cuts,        //no of cheap cuts
01194                      ICOORDELT_LIST *best_cells,  //row cells
01195                      BOOL8 testing_on             //inidividual words
01196                     ) {
01197   int pitch_delta;               //offset pitch
01198   INT16 mid_cuts;                //cheap cuts
01199   float pitch_sd;                //current sd
01200   float best_sd;                 //best result
01201   float best_pitch;              //pitch for best result
01202   float initial_sd;              //starting error
01203   float sp_sd;                   //space sd
01204   ICOORDELT_LIST test_cells;     //row cells
01205   ICOORDELT_IT best_it;          //start of best list
01206 
01207   if (textord_fast_pitch_test)
01208     return tune_row_pitch2 (row, projection, projection_left,
01209       projection_right, space_size, initial_pitch,
01210       best_sp_sd,
01211     //space sd
01212       best_mid_cuts, best_cells, testing_on);
01213   if (textord_disable_pitch_test) {
01214     best_sp_sd = initial_pitch;
01215     return initial_pitch;
01216   }
01217   initial_sd =
01218     compute_pitch_sd(row,
01219                      projection,
01220                      projection_left,
01221                      projection_right,
01222                      space_size,
01223                      initial_pitch,
01224                      best_sp_sd,
01225                      best_mid_cuts,
01226                      best_cells,
01227                      testing_on);
01228   best_sd = initial_sd;
01229   best_pitch = initial_pitch;
01230   if (testing_on)
01231     tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
01232   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
01233     pitch_sd =
01234       compute_pitch_sd (row, projection, projection_left, projection_right,
01235       space_size, initial_pitch + pitch_delta, sp_sd,
01236       mid_cuts, &test_cells, testing_on);
01237     if (testing_on)
01238       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
01239         pitch_sd);
01240     if (pitch_sd < best_sd) {
01241       best_sd = pitch_sd;
01242       best_mid_cuts = mid_cuts;
01243       best_sp_sd = sp_sd;
01244       best_pitch = initial_pitch + pitch_delta;
01245       best_cells->clear ();
01246       best_it.set_to_list (best_cells);
01247       best_it.add_list_after (&test_cells);
01248     }
01249     else
01250       test_cells.clear ();
01251     if (pitch_sd > initial_sd)
01252       break;                     //getting worse
01253   }
01254   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {
01255     pitch_sd =
01256       compute_pitch_sd (row, projection, projection_left, projection_right,
01257       space_size, initial_pitch - pitch_delta, sp_sd,
01258       mid_cuts, &test_cells, testing_on);
01259     if (testing_on)
01260       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
01261         pitch_sd);
01262     if (pitch_sd < best_sd) {
01263       best_sd = pitch_sd;
01264       best_mid_cuts = mid_cuts;
01265       best_sp_sd = sp_sd;
01266       best_pitch = initial_pitch - pitch_delta;
01267       best_cells->clear ();
01268       best_it.set_to_list (best_cells);
01269       best_it.add_list_after (&test_cells);
01270     }
01271     else
01272       test_cells.clear ();
01273     if (pitch_sd > initial_sd)
01274       break;
01275   }
01276   initial_pitch = best_pitch;
01277 
01278   if (textord_debug_pitch_metric)
01279     print_pitch_sd(row,
01280                    projection,
01281                    projection_left,
01282                    projection_right,
01283                    space_size,
01284                    best_pitch);
01285 
01286   return best_sd;
01287 }
01288 
01289 
01296 float tune_row_pitch2(                             //find fp cells
01297                       TO_ROW *row,                 //row to do
01298                       STATS *projection,           //vertical projection
01299                       INT16 projection_left,       //edge of projection
01300                       INT16 projection_right,      //edge of projection
01301                       float space_size,            //size of blank
01302                       float &initial_pitch,        //guess at pitch
01303                       float &best_sp_sd,           //space sd
01304                       INT16 &best_mid_cuts,        //no of cheap cuts
01305                       ICOORDELT_LIST *best_cells,  //row cells
01306                       BOOL8 testing_on             //inidividual words
01307                      ) {
01308   int pitch_delta;               //offset pitch
01309   INT16 pixel;                   //pixel coord
01310   INT16 best_pixel;              //pixel coord
01311   INT16 best_delta;              //best pitch
01312   INT16 best_pitch;              //best pitch
01313   INT16 start;                   //of good range
01314   INT16 end;                     //of good range
01315   INT32 best_count;              //lowest sum
01316   float best_sd;                 //best result
01317   STATS *sum_proj;               //summed projection
01318 
01319   best_sp_sd = initial_pitch;
01320 
01321   if (textord_disable_pitch_test) {
01322     return initial_pitch;
01323   }
01324   sum_proj = new STATS[textord_pitch_range * 2 + 1];
01325   if (sum_proj == NULL)
01326     return initial_pitch;
01327   best_pitch = (INT32) initial_pitch;
01328 
01329   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01330     pitch_delta++)
01331   sum_proj[textord_pitch_range + pitch_delta].set_range (0,
01332       best_pitch +
01333       pitch_delta + 1);
01334   for (pixel = projection_left; pixel <= projection_right; pixel++) {
01335     for (pitch_delta = -textord_pitch_range;
01336       pitch_delta <= textord_pitch_range; pitch_delta++)
01337     sum_proj[textord_pitch_range +
01338         pitch_delta].add ((pixel - projection_left) % (best_pitch +
01339         pitch_delta),
01340         projection->pile_count (pixel));
01341   }
01342   best_count = sum_proj[textord_pitch_range].pile_count (0);
01343   best_delta = 0;
01344   best_pixel = 0;
01345   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01346   pitch_delta++) {
01347     for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
01348       if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
01349       < best_count) {
01350         best_count =
01351           sum_proj[textord_pitch_range +
01352           pitch_delta].pile_count (pixel);
01353         best_delta = pitch_delta;
01354         best_pixel = pixel;
01355       }
01356     }
01357   }
01358   if (testing_on)
01359     tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
01360       initial_pitch, best_delta, best_count);
01361   best_pitch += best_delta;
01362   initial_pitch = best_pitch;
01363   best_count++;
01364   best_count += best_count;
01365   for (start = best_pixel - 2; start > best_pixel - best_pitch
01366     && sum_proj[textord_pitch_range +
01367     best_delta].pile_count (start % best_pitch) <= best_count;
01368     start--);
01369   for (end = best_pixel + 2;
01370     end < best_pixel + best_pitch
01371     && sum_proj[textord_pitch_range +
01372     best_delta].pile_count (end % best_pitch) <= best_count;
01373     end++);
01374 
01375   best_sd =
01376     compute_pitch_sd(row,
01377                      projection,
01378                      projection_left,
01379                      projection_right,
01380                      space_size,
01381                      initial_pitch,
01382                      best_sp_sd,
01383                      best_mid_cuts,
01384                      best_cells,
01385                      testing_on,
01386                      start,
01387                      end);
01388   if (testing_on)
01389     tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
01390       best_sd);
01391 
01392   if (textord_debug_pitch_metric)
01393     print_pitch_sd(row,
01394                    projection,
01395                    projection_left,
01396                    projection_right,
01397                    space_size,
01398                    initial_pitch);
01399 
01400   delete[]sum_proj;
01401 
01402   return best_sd;
01403 }
01404 
01405 
01412 float compute_pitch_sd(                            //find fp cells
01413                        TO_ROW *row,                //row to do
01414                        STATS *projection,          //vertical projection
01415                        INT16 projection_left,      //edge
01416                        INT16 projection_right,     //edge
01417                        float space_size,           //size of blank
01418                        float initial_pitch,        //guess at pitch
01419                        float &sp_sd,               //space sd
01420                        INT16 &mid_cuts,            //no of free cuts
01421                        ICOORDELT_LIST *row_cells,  //list of chop pts
01422                        BOOL8 testing_on,           //inidividual words
01423                        INT16 start,                //start of good range
01424                        INT16 end                   //end of good range
01425                       ) {
01426   INT16 occupation;              //no of cells in word.
01427                                  //blobs
01428   BLOBNBOX_IT blob_it = row->blob_list ();
01429   BLOBNBOX_IT start_it;          //start of word
01430   BLOBNBOX_IT plot_it;           //for plotting
01431   INT16 blob_count;              //no of blobs
01432   BOX blob_box;                  //bounding box
01433   BOX prev_box;                  //of super blob
01434   INT32 prev_right;              //of word sync
01435   int scale_factor;              //on scores for big words
01436   INT32 sp_count;                //spaces
01437   FPSEGPT_LIST seg_list;         //char cells
01438   FPSEGPT_IT seg_it;             //iterator
01439   INT16 segpos;                  //position of segment
01440   INT16 cellpos;                 //previous cell boundary
01441                                  //iterator
01442   ICOORDELT_IT cell_it = row_cells;
01443   ICOORDELT *cell;               //new cell
01444   double sqsum;                  //sum of squares
01445   double spsum;                  //of spaces
01446   double sp_var;                 //space error
01447   double word_sync;              //result for word
01448   INT32 total_count;             //total blobs
01449 
01450   if ((pitsync_linear_version & 3) > 1) {
01451     word_sync = compute_pitch_sd2 (row, projection, projection_left,
01452       projection_right, initial_pitch,
01453       occupation, mid_cuts, row_cells,
01454       testing_on, start, end);
01455     sp_sd = occupation;
01456     return word_sync;
01457   }
01458   mid_cuts = 0;
01459   cellpos = 0;
01460   total_count = 0;
01461   sqsum = 0;
01462   sp_count = 0;
01463   spsum = 0;
01464   prev_right = -1;
01465   if (blob_it.empty ())
01466     return space_size * 10;
01467 #ifndef GRAPHICS_DISABLED
01468   if (testing_on && to_win > 0) {
01469     blob_box = blob_it.data ()->bounding_box ();
01470     projection->plot (to_win, projection_left,
01471       row->intercept (), 1.0f, -1.0f, CORAL);
01472   }
01473 #endif
01474   start_it = blob_it;
01475   blob_count = 0;
01476   blob_box = box_next (&blob_it);//first blob
01477   blob_it.mark_cycle_pt ();
01478   do {
01479     for (; blob_count > 0; blob_count--)
01480       box_next(&start_it);
01481     do {
01482       prev_box = blob_box;
01483       blob_count++;
01484       blob_box = box_next (&blob_it);
01485     }
01486     while (!blob_it.cycled_list ()
01487       && blob_box.left () - prev_box.right () < space_size);
01488     plot_it = start_it;
01489     if (pitsync_linear_version & 3)
01490       word_sync =
01491         check_pitch_sync2 (&start_it, blob_count, (INT16) initial_pitch, 2,
01492         projection, projection_left, projection_right,
01493         row->xheight * textord_projection_scale,
01494         occupation, &seg_list, start, end);
01495     else
01496       word_sync =
01497         check_pitch_sync (&start_it, blob_count, (INT16) initial_pitch, 2,
01498         projection, &seg_list);
01499     if (testing_on) {
01500       tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
01501         prev_box.right (), prev_box.top (),
01502         seg_list.length () - 1, word_sync);
01503       seg_it.set_to_list (&seg_list);
01504       for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
01505       seg_it.forward ()) {
01506         if (seg_it.data ()->faked)
01507           tprintf ("(F)");
01508         tprintf ("%d, ", seg_it.data ()->position ());
01509         //          tprintf("C=%g, s=%g, sq=%g\n",
01510         //                  seg_it.data()->cost_function(),
01511         //                  seg_it.data()->sum(),
01512         //                  seg_it.data()->squares());
01513       }
01514       tprintf ("\n");
01515     }
01516 #ifndef GRAPHICS_DISABLED
01517     if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
01518       plot_fp_cells2(to_win, GOLDENROD, row, &seg_list);
01519 #endif
01520     seg_it.set_to_list (&seg_list);
01521     if (prev_right >= 0) {
01522       sp_var = seg_it.data ()->position () - prev_right;
01523       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
01524       sp_var *= sp_var;
01525       spsum += sp_var;
01526       sp_count++;
01527     }
01528     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01529       segpos = seg_it.data ()->position ();
01530       if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
01531                                  //big gap
01532         while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
01533           cell = new ICOORDELT (cellpos + (INT16) initial_pitch, 0);
01534           cell_it.add_after_then_move (cell);
01535           cellpos += (INT16) initial_pitch;
01536         }
01537                                  //make new one
01538         cell = new ICOORDELT (segpos, 0);
01539         cell_it.add_after_then_move (cell);
01540         cellpos = segpos;
01541       }
01542       else if (segpos > cellpos - initial_pitch / 2) {
01543         cell = cell_it.data ();
01544                                  //average positions
01545         cell->set_x ((cellpos + segpos) / 2);
01546         cellpos = cell->x ();
01547       }
01548     }
01549     seg_it.move_to_last ();
01550     prev_right = seg_it.data ()->position ();
01551     if (textord_pitch_scalebigwords) {
01552       scale_factor = (seg_list.length () - 2) / 2;
01553       if (scale_factor < 1)
01554         scale_factor = 1;
01555     }
01556     else
01557       scale_factor = 1;
01558     sqsum += word_sync * scale_factor;
01559     total_count += (seg_list.length () - 1) * scale_factor;
01560     seg_list.clear ();
01561   }
01562   while (!blob_it.cycled_list ());
01563   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01564   return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
01565 }
01566 
01567 
01574 float compute_pitch_sd2(                            //find fp cells
01575                         TO_ROW *row,                //row to do
01576                         STATS *projection,          //vertical projection
01577                         INT16 projection_left,      //edge
01578                         INT16 projection_right,     //edge
01579                         float initial_pitch,        //guess at pitch
01580                         INT16 &occupation,          //no of occupied cells
01581                         INT16 &mid_cuts,            //no of free cuts
01582                         ICOORDELT_LIST *row_cells,  //list of chop pts
01583                         BOOL8 testing_on,           //inidividual words
01584                         INT16 start,                //start of good range
01585                         INT16 end                   //end of good range
01586                        ) {
01587                                  //blobs
01588   BLOBNBOX_IT blob_it = row->blob_list ();
01589   BLOBNBOX_IT plot_it;
01590   INT16 blob_count;              //no of blobs
01591   BOX blob_box;                  //bounding box
01592   FPSEGPT_LIST seg_list;         //char cells
01593   FPSEGPT_IT seg_it;             //iterator
01594   INT16 segpos;                  //position of segment
01595                                  //iterator
01596   ICOORDELT_IT cell_it = row_cells;
01597   ICOORDELT *cell;               //new cell
01598   double word_sync;              //result for word
01599 
01600   mid_cuts = 0;
01601   if (blob_it.empty ()) {
01602     occupation = 0;
01603     return initial_pitch * 10;
01604   }
01605 #ifndef GRAPHICS_DISABLED
01606   if (testing_on && to_win > 0) {
01607     projection->plot (to_win, projection_left,
01608       row->intercept (), 1.0f, -1.0f, CORAL);
01609   }
01610 #endif
01611   blob_count = 0;
01612   blob_it.mark_cycle_pt ();
01613   do {
01614                                  //first blob
01615     blob_box = box_next (&blob_it);
01616     blob_count++;
01617   }
01618   while (!blob_it.cycled_list ());
01619   plot_it = blob_it;
01620   word_sync = check_pitch_sync2 (&blob_it, blob_count, (INT16) initial_pitch,
01621     2, projection, projection_left,
01622     projection_right,
01623     row->xheight * textord_projection_scale,
01624     occupation, &seg_list, start, end);
01625   if (testing_on) {
01626     tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
01627       blob_box.right (), blob_box.top (),
01628       seg_list.length () - 1, word_sync);
01629     seg_it.set_to_list (&seg_list);
01630     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01631       if (seg_it.data ()->faked)
01632         tprintf ("(F)");
01633       tprintf ("%d, ", seg_it.data ()->position ());
01634       //     tprintf("C=%g, s=%g, sq=%g\n",
01635       //             seg_it.data()->cost_function(),
01636       //             seg_it.data()->sum(),
01637       //             seg_it.data()->squares());
01638     }
01639     tprintf ("\n");
01640   }
01641 #ifndef GRAPHICS_DISABLED
01642   if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
01643     plot_fp_cells2(to_win, GOLDENROD, row, &seg_list);
01644 #endif
01645   seg_it.set_to_list (&seg_list);
01646   for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01647     segpos = seg_it.data ()->position ();
01648                                  //make new one
01649     cell = new ICOORDELT (segpos, 0);
01650     cell_it.add_after_then_move (cell);
01651     if (seg_it.at_last ())
01652       mid_cuts = seg_it.data ()->cheap_cuts ();
01653   }
01654   seg_list.clear ();
01655   return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
01656 }
01657 
01658 
01665 void print_pitch_sd(                        //find fp cells
01666                     TO_ROW *row,            //row to do
01667                     STATS *projection,      //vertical projection
01668                     INT16 projection_left,  //edges //size of blank
01669                     INT16 projection_right,
01670                     float space_size,
01671                     float initial_pitch     //guess at pitch
01672                    ) {
01673   const char *res2;              //pitch result
01674   INT16 occupation;              //used cells
01675   float sp_sd;                   //space sd
01676                                  //blobs
01677   BLOBNBOX_IT blob_it = row->blob_list ();
01678   BLOBNBOX_IT start_it;          //start of word
01679   BLOBNBOX_IT row_start;         //start of row
01680   INT16 blob_count;              //no of blobs
01681   INT16 total_blob_count;        //total blobs in line
01682   BOX blob_box;                  //bounding box
01683   BOX prev_box;                  //of super blob
01684   INT32 prev_right;              //of word sync
01685   int scale_factor;              //on scores for big words
01686   INT32 sp_count;                //spaces
01687   FPSEGPT_LIST seg_list;         //char cells
01688   FPSEGPT_IT seg_it;             //iterator
01689   double sqsum;                  //sum of squares
01690   double spsum;                  //of spaces
01691   double sp_var;                 //space error
01692   double word_sync;              //result for word
01693   double total_count;            //total cuts
01694 
01695   if (blob_it.empty ())
01696     return;
01697   row_start = blob_it;
01698   total_blob_count = 0;
01699 
01700   total_count = 0;
01701   sqsum = 0;
01702   sp_count = 0;
01703   spsum = 0;
01704   prev_right = -1;
01705   blob_it = row_start;
01706   start_it = blob_it;
01707   blob_count = 0;
01708   blob_box = box_next (&blob_it);//first blob
01709   blob_it.mark_cycle_pt ();
01710   do {
01711     for (; blob_count > 0; blob_count--)
01712       box_next(&start_it);
01713     do {
01714       prev_box = blob_box;
01715       blob_count++;
01716       blob_box = box_next (&blob_it);
01717     }
01718     while (!blob_it.cycled_list ()
01719       && blob_box.left () - prev_box.right () < space_size);
01720     word_sync =
01721       check_pitch_sync2 (&start_it, blob_count, (INT16) initial_pitch, 2,
01722       projection, projection_left, projection_right,
01723       row->xheight * textord_projection_scale,
01724       occupation, &seg_list, 0, 0);
01725     total_blob_count += blob_count;
01726     seg_it.set_to_list (&seg_list);
01727     if (prev_right >= 0) {
01728       sp_var = seg_it.data ()->position () - prev_right;
01729       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
01730       sp_var *= sp_var;
01731       spsum += sp_var;
01732       sp_count++;
01733     }
01734     seg_it.move_to_last ();
01735     prev_right = seg_it.data ()->position ();
01736     if (textord_pitch_scalebigwords) {
01737       scale_factor = (seg_list.length () - 2) / 2;
01738       if (scale_factor < 1)
01739         scale_factor = 1;
01740     }
01741     else
01742       scale_factor = 1;
01743     sqsum += word_sync * scale_factor;
01744     total_count += (seg_list.length () - 1) * scale_factor;
01745     seg_list.clear ();
01746   }
01747   while (!blob_it.cycled_list ());
01748   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01749   word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
01750   tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
01751     word_sync, word_sync / initial_pitch, sp_sd,
01752     word_sync < textord_words_pitchsd_threshold * initial_pitch
01753     ? 'F' : 'P');
01754 
01755   start_it = row_start;
01756   blob_it = row_start;
01757   word_sync =
01758     check_pitch_sync2 (&blob_it, total_blob_count, (INT16) initial_pitch, 2,
01759     projection, projection_left, projection_right,
01760     row->xheight * textord_projection_scale, occupation,
01761     &seg_list, 0, 0);
01762   if (occupation > 1)
01763     word_sync /= occupation;
01764   word_sync = sqrt (word_sync);
01765 
01766 #ifndef GRAPHICS_DISABLED
01767   if (textord_show_row_cuts && to_win != NO_WINDOW)
01768     plot_fp_cells2(to_win, CORAL, row, &seg_list);
01769 #endif
01770   seg_list.clear ();
01771   if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {
01772     if (word_sync < textord_words_def_fixed * initial_pitch
01773       && !row->all_caps)
01774       res2 = "DF";
01775     else
01776       res2 = "MF";
01777   }
01778   else
01779     res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
01780   tprintf
01781     ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
01782     word_sync, word_sync / initial_pitch,
01783     word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
01784     occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
01785 }
01786 
01787 
/**
 * sort_floats2
 *
 * Comparison callback for qsort that orders floats ascending.
 *
 * @param arg1 pointer to the first float
 * @param arg2 pointer to the second float
 * @return 1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0
 *         (including when either value is NaN).
 *
 * The values are read through const pointers, so the const qualifier
 * of the arguments is preserved, and they are compared directly rather
 * than via the sign of their difference, so the result never depends
 * on how the subtraction rounds.
 */
int sort_floats2(                   //qsort function
                 const void *arg1,  //ptrs to floats
                 const void *arg2) {
  const float first = *(const float *) arg1;
  const float second = *(const float *) arg2;

  if (first > second)
    return 1;
  if (first < second)
    return -1;
  return 0;
}
01804 
01805 
/**
 * find_repeated_chars
 *
 * Walks every row of the block looking for runs of small, evenly sized and
 * evenly spaced blobs that compare_blobs rates as alike.  A run starts at a
 * blob whose box height is less than half the row xheight, which is not
 * flagged joined_to_prev and which has a blob() or cblob() outline.  When
 * at least textord_repeat_threshold blobs matched, make_real_word is called
 * for the run and the resulting word is flagged W_REP_CHAR and W_DONT_CHOP
 * and appended to the row's rep_words list.
 *
 * With testing_on set, the reason a run was abandoned and every word found
 * are reported through tprintf; found words are also outlined in to_win
 * unless GRAPHICS_DISABLED is defined.
 */
01812 void find_repeated_chars(                  //search for equal chars
01813                          TO_BLOCK *block,  //block to search
01814                          BOOL8 testing_on  //debug mode
01815                         ) {
01816   BOOL8 bol;                     //start of line
01817   TO_ROW *row;                   //current row
01818   TO_ROW_IT row_it = block->get_rows ();
01819   ROW *real_row;                 //temporary ROW handed to compare_blobs
01820   WERD_IT word_it;               //new words
01821   WERD *word;                    //new word
01822   BLOBNBOX *bblob;               //current blob
01823   BLOBNBOX *nextblob;            //neighbour to compare
01824   BLOBNBOX_IT box_it;            //iterator
01825   BLOBNBOX_IT search_it;         //forward search
01826   INT32 blobcount;               //blobs in the run, incl. joined bits
01827   INT32 matched_blobcount;       //no of matches
01828   INT32 blobindex;               //in row
01829   INT32 row_length;              //blobs in row
01830   INT32 width_change;            //max width change
01831   INT32 blob_width;              //required blob width
01832   INT32 space_width;             //required gap width
01833   INT32 prev_right;              //right edge of last blob
01834   float rating;                  //match rating
01835   PBLOB *pblob1;                 //polygonal blob
01836   PBLOB *pblob2;                 //second blob
01837   BOX word_box;                  //for plotting
01838 
01839   if (row_it.empty ())
01840     return;                      //empty block
01841   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01842     row = row_it.data ();
01843     box_it.set_to_list (row->blob_list ());
        //length is sampled once, before any word is made from the row
01844     row_length = row->blob_list ()->length ();
01845     blobindex = 0;
01846     word_it.set_to_list (&row->rep_words);
01847     bol = TRUE;
01848     if (!box_it.empty ()) {
          //built once per row and deleted after the row's blobs are done
01849       real_row = new ROW (row,
01850         (INT16) block->kern_size,
01851         (INT16) block->space_size);
01852       do {
01853         bblob = box_it.data ();
01854         blobcount = 1;
01855         search_it = box_it;
01856         search_it.forward ();
01857         matched_blobcount = 1;
01858         width_change = MAX_INT16;
01859         blob_width = 0;
01860         space_width = 0;
01861         prev_right = bblob->bounding_box ().right ();
              //only a short, unjoined blob with an outline can start a run
01862         if (bblob->bounding_box ().height () * 2 < row->xheight
01863           && !bblob->joined_to_prev ()
01864         && (bblob->blob () != NULL || bblob->cblob () != NULL)) {
                //a PBLOB made from the cblob is owned here and deleted
                //after the scan; one taken from blob() is only borrowed
01865           if (bblob->cblob () != NULL)
01866             pblob1 = new PBLOB (bblob->cblob (), row->xheight);
01867           else
01868             pblob1 = bblob->blob ();
01869 
01870           rating = 0.0f;
                //extend the run while the last rating was good, blobs
                //remain in the row and the next blob is short with an
                //outline; nextblob is assigned inside the condition
01871           while (rating < textord_repeat_rating
01872             && blobindex + blobcount < row_length
01873             && ((nextblob = search_it.data ())->blob () != NULL
01874             || nextblob->cblob () != NULL)
01875             && nextblob->bounding_box ().height () * 2 <
01876           row->xheight) {
                  //first neighbour fixes the expected width and gap, with
                  //a tolerance of textord_repch_width_variance times the
                  //larger of the two, but never less than 3
01877             if (blobcount == 1) {
01878               space_width = nextblob->bounding_box ().left ()
01879                 - bblob->bounding_box ().right ();
01880               blob_width = bblob->bounding_box ().width ();
01881               width_change =
01882                 blob_width >
01883                 space_width ? blob_width : space_width;
01884               width_change =
01885                 (INT32) (width_change *
01886                 textord_repch_width_variance);
01887               if (width_change < 3)
01888                 width_change = 3;
01889             }
                  //stop if the width or the gap strays outside tolerance
01890             if (nextblob->bounding_box ().width () >
01891               blob_width + width_change
01892               || nextblob->bounding_box ().width () <
01893               blob_width - width_change
01894               || nextblob->bounding_box ().left () - prev_right >
01895               space_width + width_change
01896               || nextblob->bounding_box ().left () - prev_right <
01897             space_width - width_change) {
01898               if (testing_on)
01899                 tprintf
01900                   ("Repch terminated:bw=%d, sw=%d, wc=%d, pr=%d, nb=(%d,%d)\n",
01901                   blob_width, space_width, width_change,
01902                   prev_right, nextblob->bounding_box ().left (),
01903                   nextblob->bounding_box ().right ());
01904               break;             //not good enough
01905             }
                  //compare against the neighbour, making a temporary
                  //PBLOB from its cblob when it has no blob() of its own
01906             if (nextblob->blob () != NULL)
01907               rating = compare_blobs (pblob1, real_row,
01908                 nextblob->blob (), real_row);
01909             else {
01910               pblob2 =
01911                 new PBLOB (nextblob->cblob (), row->xheight);
01912               rating =
01913                 compare_blobs(pblob1, real_row, pblob2, real_row);
01914               delete pblob2;
01915             }
01916             if (rating < textord_repeat_rating) {
01917               //    if (testing_on)
01918               //       tprintf("Blob at (%d,%d)->(%d,%d) had rating %g\n",
01919               //               nextblob->bounding_box().left(),
01920               //               nextblob->bounding_box().bottom(),
01921               //               nextblob->bounding_box().right(),
01922               //               nextblob->bounding_box().top(),
01923               //               rating);
01924               blobcount++;
01925               search_it.forward ();
01926               matched_blobcount++;
01927               while (blobindex + blobcount < row_length
01928               && search_it.data ()->joined_to_prev ()) {
01929                 search_it.forward ();
01930                 blobcount++;     //suck in joined bits
01931               }
01932             }
01933             prev_right = nextblob->bounding_box ().right ();
01934           }
                //release pblob1 only if it was allocated above
01935           if (bblob->cblob () != NULL)
01936             delete pblob1;
01937 
01938           if (matched_blobcount >= textord_repeat_threshold) {
                  //NOTE(review): what make_real_word does to box_it and
                  //to the row's blob list is not visible here; confirm the
                  //forward() and blobindex updates below still line up
01939             word =
01940               make_real_word (&box_it, blobcount, bol, FALSE, FALSE,
01941               1);
01942 #ifndef GRAPHICS_DISABLED
                  //NOTE(review): to_win is used without the validity test
                  //that other plotting code in this file applies - confirm
01943             if (testing_on) {
01944               word_box = word->bounding_box ();
01945               tprintf
01946                 ("Found repeated word of %d blobs (%d matched) from (%d,%d)->(%d,%d)\n",
01947                 blobcount, matched_blobcount, word_box.left (),
01948                 word_box.bottom (), word_box.right (),
01949                 word_box.top ());
01950               perimeter_color_index(to_win, RED);
01951               interior_style(to_win, INT_HOLLOW, TRUE);
01952               rectangle (to_win, word_box.left (),
01953                 word_box.bottom (), word_box.right (),
01954                 word_box.top ());
01955             }
01956 #endif
01957             word->set_flag (W_REP_CHAR, TRUE);
01958             word->set_flag (W_DONT_CHOP, TRUE);
01959             word_it.add_after_then_move (word);
01960             blobindex += blobcount;
01961           }
01962         }
01963         bol = FALSE;
01964         box_it.forward ();       //next one
01965         blobindex++;
01966       }
01967                                  //until the iterator is back at the first blob
01968       while (!box_it.at_first ());
01969       delete real_row;
01970     }
01971   }
01972 }
01973 
01974 
01980 #ifndef GRAPHICS_DISABLED
01981 void plot_fp_word(                  //draw block of words
01982                   TO_BLOCK *block,  //block to draw
01983                   float pitch,      //pitch to draw with
01984                   float nonspace    //for space threshold
01985                  ) {
01986   TO_ROW *row;                   //current row
01987   TO_ROW_IT row_it = block->get_rows ();
01988 
01989   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01990     row = row_it.data ();
01991     row->min_space = (INT32) ((pitch + nonspace) / 2);
01992     row->max_nonspace = row->min_space;
01993     row->space_threshold = row->min_space;
01994     plot_word_decisions (to_win, (INT16) pitch, row);
01995   }
01996 }
01997 #endif

Generated on Wed Feb 28 19:49:12 2007 for Tesseract by doxygen 1.5.1