textord/topitch.cpp File Reference

#include "mfcpch.h"
#include "stderr.h"
#include "blobbox.h"
#include "lmedsq.h"
#include "statistc.h"
#include "drawtord.h"
#include "makerow.h"
#include "pitsync1.h"
#include "pithsync.h"
#include "blobcmpl.h"
#include "tovars.h"
#include "wordseg.h"
#include "topitch.h"
#include "secname.h"

Go to the source code of this file.

Defines

Functions


Define Documentation

#define BLOCK_STATS_CLUSTERS   10

Definition at line 65 of file topitch.cpp.

Referenced by row_pitch_stats(), and row_words2().

#define EXTERN

Note:
File: topitch.cpp (Formerly to_pitch.c)
Code to determine whether text is fixed pitch and, if so, what the pitch is.
Author:
Ray Smith
Date:
Aug 24 16:57:29 BST 1993
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 38 of file topitch.cpp.

#define FIXED_WIDTH_MULTIPLE   5

Definition at line 64 of file topitch.cpp.

#define MAX_ALLOWED_PITCH   100

Definition at line 66 of file topitch.cpp.

Referenced by try_doc_fixed().


Function Documentation

EXTERN BOOL_VAR ( textord_all_prop  ,
FALSE  ,
"All doc is proportial text"   
)

void compute_block_pitch ( TO_BLOCK *  block,
FCOORD  rotation,
INT32  block_index,
BOOL8  testing_on 
)

Process each block.

Decide whether each block is fixed pitch individually.

Definition at line 337 of file topitch.cpp.

References ASSERT_HOST, BOX::bottom(), compute_rows_pitch(), find_repeated_chars(), BOX::left(), overlap_picture_ops, BOX::right(), BOX::top(), tprintf(), and TRUE.

Referenced by compute_fixed_pitch().

// Body of compute_block_pitch(). Seeds the block's default spacing fields
// from its xheight and then, when the block has rows, optionally pulls out
// repeated characters and runs the per-row pitch analysis.
// The rotation argument is not referenced anywhere in this body.
00342                           {
00343   BOX block_box;                 //bounding box
00344 
00345   block_box = block->block->bounding_box ();
        // Debug trace of the block's bounding box.
00346   if (testing_on && textord_debug_pitch_test) {
00347     tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
00348       block_index,
00349       block_box.left (), block_box.bottom (),
00350       block_box.right (), block_box.top ());
00351   }
        // Default thresholds scale with xheight: min_space is rounded down,
        // max_nonspace is rounded up.
00352   block->min_space = (INT32) floor (block->xheight
00353     * textord_words_default_minspace);
00354   block->max_nonspace = (INT32) ceil (block->xheight
00355     * textord_words_default_nonspace);
        // No pitch has been established yet; space/kern sizes start out as
        // the thresholds just computed.
00356   block->fixed_pitch = 0.0f;
00357   block->space_size = (float) block->min_space;
00358   block->kern_size = (float) block->max_nonspace;
        // pr_space is derived from pr_nonsp, so the order of these two
        // assignments matters.
00359   block->pr_nonsp = block->xheight * words_default_prop_nonspace;
00360   block->pr_space = block->pr_nonsp * textord_spacesize_ratioprop;
        // Everything below needs at least one row and a positive xheight.
00361   if (!block->get_rows ()->empty ()) {
00362     ASSERT_HOST (block->xheight > 0);
00363     if (textord_repeat_extraction)
00364       find_repeated_chars(block, textord_show_initial_words &&testing_on);
00365 #ifndef GRAPHICS_DISABLED
00366     if (textord_show_initial_words && testing_on)
00367       overlap_picture_ops(TRUE);
00368 #endif
          // The return value of compute_rows_pitch() is ignored here.
00369     compute_rows_pitch(block,
00370                        block_index,
00371                        textord_debug_pitch_test &&testing_on);
00372   }
00373 }

void compute_fixed_pitch ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient,
FCOORD  rotation,
BOOL8  testing_on 
)

Determine pitch.

Decide whether each row is fixed pitch individually. Correlate definite and uncertain results to obtain an individual result for each row in the TO_ROW class.

Definition at line 75 of file topitch.cpp.

References compute_block_pitch(), create_to_win(), fix_row_pitch(), NO_WINDOW, NULL, overlap_picture_ops, print_block_counts(), rotation, to_win, tprintf(), TRUE, try_block_fixed(), try_doc_fixed(), and try_rows_fixed().

Referenced by make_words().

// Body of compute_fixed_pitch(). Makes three passes over port_blocks:
//   1. compute_block_pitch() on every block;
//   2. if try_doc_fixed() fails, try_block_fixed() on every block and,
//      where that also fails, try_rows_fixed();
//   3. fix_row_pitch() on every row of every block, with optional debug
//      output of the per-block counts.
// Block and row indices passed to the helpers are 1-based.
00081                           {
00082   TO_BLOCK_IT block_it;          //iterator
00083   TO_BLOCK *block;               //current block;
00084   TO_ROW_IT row_it;              //row iterator
00085   TO_ROW *row;                   //current row
00086   int block_index;               //block number
00087   int row_index;                 //row number
00088 
00089 #ifndef GRAPHICS_DISABLED
        // Create the debug window on first use.
00090   if (textord_show_initial_words && testing_on) {
00091     if (to_win == NO_WINDOW)
00092       create_to_win(page_tr);
00093   }
00094 #endif
00095 
        // Pass 1: per-block defaults and per-row pitch statistics.
00096   block_it.set_to_list (port_blocks);
00097   block_index = 1;
00098   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00099   block_it.forward ()) {
00100     block = block_it.data ();
00101     compute_block_pitch(block, rotation, block_index, testing_on);
00102     block_index++;
00103   }
00104 
        // Pass 2: only when the document-level attempt fails, fall back to
        // block level and then to row level.
00105   if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
00106     block_index = 1;
00107     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00108     block_it.forward ()) {
00109       block = block_it.data ();
00110       if (!try_block_fixed (block, block_index))
00111         try_rows_fixed(block, block_index, testing_on);
00112       block_index++;
00113     }
00114   }
00115 
        // Pass 3: per-row fix-up, run regardless of the outcome of pass 2.
00116   block_index = 1;
00117   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00118   block_it.forward ()) {
00119     block = block_it.data ();
00120     row_it.set_to_list (block->get_rows ());
00121     row_index = 1;
00122     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00123       row = row_it.data ();
00124       fix_row_pitch(row, block, port_blocks, row_index, block_index);
00125       row_index++;
00126     }
          // && binds tighter than ||, so this reads as
          // testing_on && ((debug_pitch_test && text_region != NULL)
          //                || blocksall_fixed || blocksall_prop).
00127     if (testing_on
00128       && (textord_debug_pitch_test && block->block->text_region () != NULL
00129     || textord_blocksall_fixed || textord_blocksall_prop)) {
00130       tprintf ("Corr:");
00131       print_block_counts(block, block_index);
00132     }
00133     block_index++;
00134   }
00135 #ifndef GRAPHICS_DISABLED
00136   if (textord_show_initial_words && testing_on) {
00137     overlap_picture_ops(TRUE);
00138   }
00139 #endif
00140 }

float compute_pitch_sd ( TO_ROW *  row,
STATS *  projection,
INT16  projection_left,
INT16  projection_right,
float  space_size,
float  initial_pitch,
float &  sp_sd,
INT16 &  mid_cuts,
ICOORDELT_LIST *  row_cells,
BOOL8  testing_on,
INT16  start,
INT16  end 
)

Find fp cells.

Use a dynamic-programming (dp) algorithm to fit the character cells and return the standard deviation (sd) of the cell size over the row.

Definition at line 1412 of file topitch.cpp.

References blob_count, TO_ROW::blob_list(), box_next(), check_pitch_sync(), check_pitch_sync2(), compute_pitch_sd2(), CORAL, GOLDENROD, TO_ROW::intercept(), BOX::left(), STATS::plot(), plot_fp_cells2(), projection, BOX::right(), seg_list, ICOORD::set_x(), to_win, BOX::top(), tprintf(), and TO_ROW::xheight.

Referenced by tune_row_pitch(), and tune_row_pitch2().

// Body of compute_pitch_sd().
// When (pitsync_linear_version & 3) > 1 the whole row is handed to
// compute_pitch_sd2() and sp_sd receives the occupation it reports.
// Otherwise the row is cut into words wherever the gap between successive
// boxes reaches space_size, each word is scored by check_pitch_sync2() or
// check_pitch_sync(), and the resulting cut positions are merged into
// row_cells.
// Returns sqrt(sqsum / total_count), or space_size * 10 when the row has no
// blobs or nothing was counted. On this path sp_sd receives
// sqrt(spsum / sp_count), or 0 when no inter-word spacing was measured, and
// mid_cuts is set to 0.
01425                         {
01426   INT16 occupation;              //no of cells in word.
01427                                  //blobs
01428   BLOBNBOX_IT blob_it = row->blob_list ();
01429   BLOBNBOX_IT start_it;          //start of word
01430   BLOBNBOX_IT plot_it;           //for plotting
01431   INT16 blob_count;              //no of blobs
01432   BOX blob_box;                  //bounding box
01433   BOX prev_box;                  //of super blob
01434   INT32 prev_right;              //of word sync
01435   int scale_factor;              //on scores for big words
01436   INT32 sp_count;                //spaces
01437   FPSEGPT_LIST seg_list;         //char cells
01438   FPSEGPT_IT seg_it;             //iterator
01439   INT16 segpos;                  //position of segment
01440   INT16 cellpos;                 //previous cell boundary
01441                                  //iterator
01442   ICOORDELT_IT cell_it = row_cells;
01443   ICOORDELT *cell;               //new cell
01444   double sqsum;                  //sum of squares
01445   double spsum;                  //of spaces
01446   double sp_var;                 //space error
01447   double word_sync;              //result for word
01448   INT32 total_count;             //total blobs
01449 
        // Whole-row variant: delegate and return straight away.
01450   if ((pitsync_linear_version & 3) > 1) {
01451     word_sync = compute_pitch_sd2 (row, projection, projection_left,
01452       projection_right, initial_pitch,
01453       occupation, mid_cuts, row_cells,
01454       testing_on, start, end);
01455     sp_sd = occupation;
01456     return word_sync;
01457   }
01458   mid_cuts = 0;
01459   cellpos = 0;
01460   total_count = 0;
01461   sqsum = 0;
01462   sp_count = 0;
01463   spsum = 0;
        // -1 marks "no previous word yet"; see the prev_right >= 0 test below.
01464   prev_right = -1;
01465   if (blob_it.empty ())
01466     return space_size * 10;
01467 #ifndef GRAPHICS_DISABLED
01468   if (testing_on && to_win > 0) {
          // blob_box is reassigned by box_next() below before it is read.
01469     blob_box = blob_it.data ()->bounding_box ();
01470     projection->plot (to_win, projection_left,
01471       row->intercept (), 1.0f, -1.0f, CORAL);
01472   }
01473 #endif
01474   start_it = blob_it;
01475   blob_count = 0;
01476   blob_box = box_next (&blob_it);//first blob
01477   blob_it.mark_cycle_pt ();
        // One iteration of this outer loop handles one word.
01478   do {
          // Move start_it past the blobs of the previous word; the countdown
          // also resets blob_count to 0 for the new word.
01479     for (; blob_count > 0; blob_count--)
01480       box_next(&start_it);
          // Extend the word until the list has cycled or the gap to the next
          // box is at least space_size.
01481     do {
01482       prev_box = blob_box;
01483       blob_count++;
01484       blob_box = box_next (&blob_it);
01485     }
01486     while (!blob_it.cycled_list ()
01487       && blob_box.left () - prev_box.right () < space_size);
          // plot_it is assigned here but not read again in this body.
01488     plot_it = start_it;
          // Any non-zero (version & 3) selects check_pitch_sync2(); values
          // above 1 were already handled by the early return.
01489     if (pitsync_linear_version & 3)
01490       word_sync =
01491         check_pitch_sync2 (&start_it, blob_count, (INT16) initial_pitch, 2,
01492         projection, projection_left, projection_right,
01493         row->xheight * textord_projection_scale,
01494         occupation, &seg_list, start, end);
01495     else
01496       word_sync =
01497         check_pitch_sync (&start_it, blob_count, (INT16) initial_pitch, 2,
01498         projection, &seg_list);
01499     if (testing_on) {
01500       tprintf ("Word ending at (%d,%d), len=%d, sync rating=%g, ",
01501         prev_box.right (), prev_box.top (),
01502         seg_list.length () - 1, word_sync);
01503       seg_it.set_to_list (&seg_list);
01504       for (seg_it.mark_cycle_pt (); !seg_it.cycled_list ();
01505       seg_it.forward ()) {
01506         if (seg_it.data ()->faked)
01507           tprintf ("(F)");
01508         tprintf ("%d, ", seg_it.data ()->position ());
01509         //          tprintf("C=%g, s=%g, sq=%g\n",
01510         //                  seg_it.data()->cost_function(),
01511         //                  seg_it.data()->sum(),
01512         //                  seg_it.data()->squares());
01513       }
01514       tprintf ("\n");
01515     }
01516 #ifndef GRAPHICS_DISABLED
01517     if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
01518       plot_fp_cells2(to_win, GOLDENROD, row, &seg_list);
01519 #endif
01520     seg_it.set_to_list (&seg_list);
          // Inter-word spacing error: distance from the previous word's last
          // cut to this word's first cut, minus the nearest whole multiple
          // of initial_pitch, squared and accumulated.
01521     if (prev_right >= 0) {
01522       sp_var = seg_it.data ()->position () - prev_right;
01523       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;
01524       sp_var *= sp_var;
01525       spsum += sp_var;
01526       sp_count++;
01527     }
          // Merge this word's cut positions into row_cells.
01528     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01529       segpos = seg_it.data ()->position ();
            // More than half a pitch beyond the last cell (or no cells yet):
            // a new cell is needed.
01530       if (cell_it.empty () || segpos > cellpos + initial_pitch / 2) {
01531                                  //big gap
              // Fill gaps wider than 1.5 pitches with cells one pitch apart.
01532         while (!cell_it.empty () && segpos > cellpos + initial_pitch * 3 / 2) {
01533           cell = new ICOORDELT (cellpos + (INT16) initial_pitch, 0);
01534           cell_it.add_after_then_move (cell);
01535           cellpos += (INT16) initial_pitch;
01536         }
01537                                  //make new one
01538         cell = new ICOORDELT (segpos, 0);
01539         cell_it.add_after_then_move (cell);
01540         cellpos = segpos;
01541       }
            // Within half a pitch either side of the last cell: treat it as
            // the same boundary and average the two positions. Positions
            // further left than that are dropped.
01542       else if (segpos > cellpos - initial_pitch / 2) {
01543         cell = cell_it.data ();
01544                                  //average positions
01545         cell->set_x ((cellpos + segpos) / 2);
01546         cellpos = cell->x ();
01547       }
01548     }
          // Remember the last cut of this word for the next spacing check.
01549     seg_it.move_to_last ();
01550     prev_right = seg_it.data ()->position ();
          // Optional weighting: (length - 2) / 2 in integer arithmetic,
          // never less than 1.
01551     if (textord_pitch_scalebigwords) {
01552       scale_factor = (seg_list.length () - 2) / 2;
01553       if (scale_factor < 1)
01554         scale_factor = 1;
01555     }
01556     else
01557       scale_factor = 1;
01558     sqsum += word_sync * scale_factor;
01559     total_count += (seg_list.length () - 1) * scale_factor;
01560     seg_list.clear ();
01561   }
01562   while (!blob_it.cycled_list ());
01563   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01564   return total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;
01565 }

float compute_pitch_sd2 ( TO_ROW *  row,
STATS *  projection,
INT16  projection_left,
INT16  projection_right,
float  initial_pitch,
INT16 &  occupation,
INT16 &  mid_cuts,
ICOORDELT_LIST *  row_cells,
BOOL8  testing_on,
INT16  start,
INT16  end 
)

Find fp cells.

Use a dynamic-programming (dp) algorithm to fit the character cells and return the standard deviation (sd) of the cell size over the row.

Definition at line 1574 of file topitch.cpp.

References blob_count, TO_ROW::blob_list(), box_next(), check_pitch_sync2(), CORAL, GOLDENROD, TO_ROW::intercept(), STATS::plot(), plot_fp_cells2(), projection, BOX::right(), seg_list, to_win, BOX::top(), tprintf(), and TO_ROW::xheight.

Referenced by compute_pitch_sd().

// Body of compute_pitch_sd2(). Treats the whole row as one unit: counts its
// blobs, calls check_pitch_sync2() once, appends every resulting cut
// position to row_cells and takes mid_cuts from the last cut.
// Returns sqrt(word_sync / occupation) when occupation > 0, otherwise
// initial_pitch * 10. For an empty row, occupation and mid_cuts are set to 0
// and initial_pitch * 10 is returned.
01586                          {
01587                                  //blobs
01588   BLOBNBOX_IT blob_it = row->blob_list ();
01589   BLOBNBOX_IT plot_it;
01590   INT16 blob_count;              //no of blobs
01591   BOX blob_box;                  //bounding box
01592   FPSEGPT_LIST seg_list;         //char cells
01593   FPSEGPT_IT seg_it;             //iterator
01594   INT16 segpos;                  //position of segment
01595                                  //iterator
01596   ICOORDELT_IT cell_it = row_cells;
01597   ICOORDELT *cell;               //new cell
01598   double word_sync;              //result for word
01599 
01600   mid_cuts = 0;
01601   if (blob_it.empty ()) {
01602     occupation = 0;
01603     return initial_pitch * 10;
01604   }
01605 #ifndef GRAPHICS_DISABLED
01606   if (testing_on && to_win > 0) {
01607     projection->plot (to_win, projection_left,
01608       row->intercept (), 1.0f, -1.0f, CORAL);
01609   }
01610 #endif
        // Count box_next() steps around the whole list; blob_box ends up
        // holding the last box returned, which the debug trace reports as
        // the row end.
01611   blob_count = 0;
01612   blob_it.mark_cycle_pt ();
01613   do {
01614                                  //first blob
01615     blob_box = box_next (&blob_it);
01616     blob_count++;
01617   }
01618   while (!blob_it.cycled_list ());
        // plot_it is assigned here but not read again in this body.
01619   plot_it = blob_it;
        // occupation is filled in by this call and used for the return value.
01620   word_sync = check_pitch_sync2 (&blob_it, blob_count, (INT16) initial_pitch,
01621     2, projection, projection_left,
01622     projection_right,
01623     row->xheight * textord_projection_scale,
01624     occupation, &seg_list, start, end);
01625   if (testing_on) {
01626     tprintf ("Row ending at (%d,%d), len=%d, sync rating=%g, ",
01627       blob_box.right (), blob_box.top (),
01628       seg_list.length () - 1, word_sync);
01629     seg_it.set_to_list (&seg_list);
01630     for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01631       if (seg_it.data ()->faked)
01632         tprintf ("(F)");
01633       tprintf ("%d, ", seg_it.data ()->position ());
01634       //     tprintf("C=%g, s=%g, sq=%g\n",
01635       //             seg_it.data()->cost_function(),
01636       //             seg_it.data()->sum(),
01637       //             seg_it.data()->squares());
01638     }
01639     tprintf ("\n");
01640   }
01641 #ifndef GRAPHICS_DISABLED
01642   if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
01643     plot_fp_cells2(to_win, GOLDENROD, row, &seg_list);
01644 #endif
        // Unlike compute_pitch_sd(), every cut becomes its own cell here;
        // no merging or gap filling is done.
01645   seg_it.set_to_list (&seg_list);
01646   for (seg_it.mark_cycle_pt (); !seg_it.cycled_list (); seg_it.forward ()) {
01647     segpos = seg_it.data ()->position ();
01648                                  //make new one
01649     cell = new ICOORDELT (segpos, 0);
01650     cell_it.add_after_then_move (cell);
          // mid_cuts is taken from the final segment only.
01651     if (seg_it.at_last ())
01652       mid_cuts = seg_it.data ()->cheap_cuts ();
01653   }
01654   seg_list.clear ();
01655   return occupation > 0 ? sqrt (word_sync / occupation) : initial_pitch * 10;
01656 }

BOOL8 compute_rows_pitch ( TO_BLOCK *  block,
INT32  block_index,
BOOL8  testing_on 
)

Find line stats.

Decide whether each row is fixed pitch individually.

Definition at line 381 of file topitch.cpp.

References ASSERT_HOST, TO_ROW::compute_vertical_projection(), FALSE, find_row_pitch(), PITCH_DUNNO, row_pitch_stats(), and TO_ROW::xheight.

Referenced by compute_block_pitch().

// Body of compute_rows_pitch(). For every row of the block: builds the
// vertical projection, gathers pitch statistics and attempts to find a
// pitch for the row. Rows for which either step fails are marked
// PITCH_DUNNO with fixed_pitch 0.
// Note: as listed, the function returns FALSE on every path.
00385                           {
00386   INT32 maxwidth;                //of spaces
00387   TO_ROW *row;                   //current row
00388   INT32 row_index;               //row number.
00389   float lower, upper;            //cluster thresholds
00390   TO_ROW_IT row_it = block->get_rows ();
00391 
        // Row numbers handed to find_row_pitch() are 1-based.
00392   row_index = 1;
00393   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00394     row = row_it.data ();
00395     ASSERT_HOST (row->xheight > 0);
00396     row->compute_vertical_projection ();
          // Upper bound on widths considered, rounded up from the xheight.
00397     maxwidth = (INT32) ceil (row->xheight * textord_words_maxspace);
          // find_row_pitch() is only called when row_pitch_stats() succeeds
          // (short-circuit &&).
00398     if (row_pitch_stats (row, maxwidth, testing_on)
00399       && find_row_pitch (row, maxwidth,
00400       textord_dotmatrix_gap + 1, block, block_index,
00401     row_index, testing_on)) {
            // Both steps succeeded but no fixed pitch was set: fall back to
            // the row's pr_space / pr_nonsp values for space and kern sizes.
00402       if (row->fixed_pitch == 0) {
00403         lower = row->pr_nonsp;
00404         upper = row->pr_space;
00405         row->space_size = upper;
00406         row->kern_size = lower;
00407       }
00408     }
00409     else {
00410       row->fixed_pitch = 0.0f;   //insufficient data
00411       row->pitch_decision = PITCH_DUNNO;
00412     }
00413     row_index++;
00414   }
00415   return FALSE;
00416 }

void count_block_votes ( TO_BLOCK *  block,
INT32 &  def_fixed,
INT32 &  def_prop,
INT32 &  maybe_fixed,
INT32 &  maybe_prop,
INT32 &  corr_fixed,
INT32 &  corr_prop,
INT32 &  dunno 
)

Find line stats.

Count the number of rows in the block with each kind of pitch_decision.

Definition at line 694 of file topitch.cpp.

References PITCH_CORR_FIXED, PITCH_CORR_PROP, TO_ROW::pitch_decision, PITCH_DEF_FIXED, PITCH_DEF_PROP, PITCH_DUNNO, PITCH_MAYBE_FIXED, and PITCH_MAYBE_PROP.

Referenced by print_block_counts(), and try_rows_fixed().

// Body of count_block_votes(). Walks the rows of the block and increments
// the counter that matches each row's pitch_decision.
// The counters are only incremented here, never reset, so the values the
// caller passes in are the starting totals.
00702                                      {
00703   TO_ROW *row;                   //current row
00704   TO_ROW_IT row_it = block->get_rows ();
00705 
00706   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00707     row = row_it.data ();
          // There is no default case: a pitch_decision value outside the
          // seven listed here is not counted at all.
00708     switch (row->pitch_decision) {
00709       case PITCH_DUNNO:
00710         dunno++;
00711         break;
00712       case PITCH_DEF_PROP:
00713         def_prop++;
00714         break;
00715       case PITCH_MAYBE_PROP:
00716         maybe_prop++;
00717         break;
00718       case PITCH_DEF_FIXED:
00719         def_fixed++;
00720         break;
00721       case PITCH_MAYBE_FIXED:
00722         maybe_fixed++;
00723         break;
00724       case PITCH_CORR_PROP:
00725         corr_prop++;
00726         break;
00727       case PITCH_CORR_FIXED:
00728         corr_fixed++;
00729         break;
00730     }
00731   }
00732 }

BOOL8 count_pitch_stats ( TO_ROW *  row,
STATS *  gap_stats,
STATS *  pitch_stats,
float  initial_pitch,
float  min_space,
BOOL8  ignore_outsize,
BOOL8  split_outsize,
INT32  dm_gap 
)

Find lines.

Returns:
TRUE if the gap statistics hold a total of at least 3 (see the final return), i.e. there were enough decent values to use.
Count up the gap and pitch stats on the row to see if it is fixed pitch. Blobs with gaps smaller than the lower threshold are assumed to be one. The larger threshold is the word gap threshold.

Definition at line 1098 of file topitch.cpp.

References STATS::add(), TO_ROW::blob_list(), STATS::clear(), FALSE, STATS::get_total(), BOX::left(), NULL, BOX::right(), and BOX::width().

Referenced by find_row_pitch().

// Body of count_pitch_stats(). Clears both STATS objects, then walks the
// row's blobs, merging neighbours into a "joined" box and, each time a
// joined box is complete, recording the gap to the previous box and the
// centre-to-centre distance.
// Returns FALSE for an empty row; otherwise TRUE when gap_stats holds a
// total of at least 3.
01107                          {
01108   BOOL8 prev_valid;              //not word broken
01109   BLOBNBOX *blob;                //current blob
01110                                  //blobs
01111   BLOBNBOX_IT blob_it = row->blob_list ();
01112   INT32 prev_right;              //end of prev blob
01113   INT32 prev_centre;             //centre of previous blob
01114   INT32 x_centre;                //centre of this blob
01115   INT32 blob_width;              //width of blob
01116   INT32 width_units;             //no of widths in blob
01117   float width;                   //blob width
01118   BOX blob_box;                  //bounding box
01119   BOX joined_box;                //of super blob
01120 
01121   gap_stats->clear ();
01122   pitch_stats->clear ();
01123   if (blob_it.empty ())
01124     return FALSE;
01125   prev_valid = FALSE;
01126   prev_centre = 0;
01127   prev_right = 0;                //stop compiler warning
01128   joined_box = blob_it.data ()->bounding_box ();
        // The iterator is advanced before each blob is examined and the loop
        // ends once it is back at the first element, so the first blob is
        // examined last. Presumably forward() wraps around the list; the
        // loop condition relies on that.
01129   do {
01130     blob_it.forward ();
01131     blob = blob_it.data ();
          // Blobs flagged joined_to_prev are skipped entirely.
01132     if (!blob->joined_to_prev ()) {
01133       blob_box = blob->bounding_box ();
            // Merge when the gap is below dm_gap, except on the wrap-around
            // back to the first blob, or when the blob has neither a cblob
            // nor a blob. && binds tighter than ||.
01134       if (blob_box.left () - joined_box.right () < dm_gap
01135         && !blob_it.at_first ()
01136         || blob->cblob () == NULL && blob->blob () == NULL)
01137         joined_box += blob_box;  //merge blobs
01138       else {
01139         blob_width = joined_box.width ();
              // width_units = extra whole pitches covered by the joined box
              // beyond the first; -1 means "leave this box out of the stats".
01140         if (split_outsize) {
01141           width_units =
01142             (INT32) floor ((float) blob_width / initial_pitch + 0.5);
01143           if (width_units < 1)
01144             width_units = 1;
01145           width_units--;
01146         }
01147         else if (ignore_outsize) {
                // Accept only widths within words_default_fixed_limit of
                // one pitch.
01148           width = (float) blob_width / initial_pitch;
01149           width_units = width < 1 + words_default_fixed_limit
01150             && width > 1 - words_default_fixed_limit ? 0 : -1;
01151         }
01152         else
01153           width_units = 0;       //everything in
              // Centre of the box after removing width_units whole pitches
              // from its width.
01154         x_centre = (INT32) (joined_box.left ()
01155           + (blob_width -
01156           width_units * initial_pitch) / 2);
01157         if (prev_valid && width_units >= 0) {
01158           // if (width_units>0)
01159           // {
01160           //   tprintf("wu=%d, width=%d, xc=%d, adding %d\n",
01161           //         width_units,blob_width,x_centre,x_centre-prev_centre);
01162           // }
01163           gap_stats->add (joined_box.left () - prev_right, 1);
01164           pitch_stats->add (x_centre - prev_centre, 1);
01165         }
              // Shift the remembered centre right by width_units pitches.
01166         prev_centre = (INT32) (x_centre + width_units * initial_pitch);
01167         prev_right = joined_box.right ();
              // The next measurement only counts if the gap to the next blob
              // is below min_space and this box was not rejected.
01168         prev_valid = blob_box.left () - joined_box.right () < min_space;
01169         prev_valid = prev_valid && width_units >= 0;
01170         joined_box = blob_box;
01171       }
01172     }
01173   }
01174   while (!blob_it.at_first ());
01175   return gap_stats->get_total () >= 3;
01176 }

void find_repeated_chars ( TO_BLOCK *  block,
BOOL8  testing_on 
)

Search for equal chars.

Find 4 or more adjacent chars which are the same and put them into words in advance of fixed pitch checking and word generation.

Definition at line 1812 of file topitch.cpp.

References TO_ROW::blob_list(), BOX::bottom(), WERD::bounding_box(), compare_blobs(), FALSE, INT_HOLLOW, interior_style, BOX::left(), make_real_word(), MAX_INT16, NULL, perimeter_color_index, rectangle, RED, TO_ROW::rep_words, BOX::right(), WERD::set_flag(), to_win, BOX::top(), tprintf(), TRUE, W_DONT_CHOP, and W_REP_CHAR.

Referenced by compute_block_pitch().

// Body of find_repeated_chars(). For every row of the block, scans the
// blobs for runs of small, evenly sized and evenly spaced blobs that
// compare_blobs() rates below textord_repeat_rating against the first blob
// of the run. Runs with at least textord_repeat_threshold matches are made
// into a word via make_real_word(), flagged W_REP_CHAR and W_DONT_CHOP, and
// appended to the row's rep_words list.
01815                           {
01816   BOOL8 bol;                     //start of line
01817   TO_ROW *row;                   //current row
01818   TO_ROW_IT row_it = block->get_rows ();
01819   ROW *real_row;                 //output row
01820   WERD_IT word_it;               //new words
01821   WERD *word;                    //new word
01822   BLOBNBOX *bblob;               //current blob
01823   BLOBNBOX *nextblob;            //neighbour to compare
01824   BLOBNBOX_IT box_it;            //iterator
01825   BLOBNBOX_IT search_it;         //forward search
01826   INT32 blobcount;               //no of neighbours
01827   INT32 matched_blobcount;       //no of matches
01828   INT32 blobindex;               //in row
01829   INT32 row_length;              //blobs in row
01830   INT32 width_change;            //max width change
01831   INT32 blob_width;              //required blob width
01832   INT32 space_width;             //required gap width
01833   INT32 prev_right;              //right edge of last blob
01834   float rating;                  //match rating
01835   PBLOB *pblob1;                 //polygonal blob
01836   PBLOB *pblob2;                 //second blob
01837   BOX word_box;                  //for plotting
01838 
01839   if (row_it.empty ())
01840     return;                      //empty block
01841   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01842     row = row_it.data ();
01843     box_it.set_to_list (row->blob_list ());
          // row_length is sampled once per row and not refreshed afterwards.
01844     row_length = row->blob_list ()->length ();
01845     blobindex = 0;
01846     word_it.set_to_list (&row->rep_words);
01847     bol = TRUE;
01848     if (!box_it.empty ()) {
            // Temporary ROW passed to compare_blobs(); deleted once the row
            // has been scanned.
01849       real_row = new ROW (row,
01850         (INT16) block->kern_size,
01851         (INT16) block->space_size);
01852       do {
01853         bblob = box_it.data ();
01854         blobcount = 1;
01855         search_it = box_it;
01856         search_it.forward ();
01857         matched_blobcount = 1;
01858         width_change = MAX_INT16;
01859         blob_width = 0;
01860         space_width = 0;
01861         prev_right = bblob->bounding_box ().right ();
              // A run can only start at a blob less than half the xheight
              // tall that is not joined to its predecessor and has a blob
              // or cblob.
01862         if (bblob->bounding_box ().height () * 2 < row->xheight
01863           && !bblob->joined_to_prev ()
01864         && (bblob->blob () != NULL || bblob->cblob () != NULL)) {
                // pblob1 is allocated here only when a cblob exists (and
                // deleted below under the same test); otherwise it points
                // at bblob's own blob.
01865           if (bblob->cblob () != NULL)
01866             pblob1 = new PBLOB (bblob->cblob (), row->xheight);
01867           else
01868             pblob1 = bblob->blob ();
01869 
01870           rating = 0.0f;
                // Keep extending while the last comparison matched, blobs
                // remain in the row, and the next blob has a blob or cblob
                // and is also less than half the xheight tall.
01871           while (rating < textord_repeat_rating
01872             && blobindex + blobcount < row_length
01873             && ((nextblob = search_it.data ())->blob () != NULL
01874             || nextblob->cblob () != NULL)
01875             && nextblob->bounding_box ().height () * 2 <
01876           row->xheight) {
                  // First neighbour: fix the reference gap and width, and
                  // the tolerance (the larger of the two scaled by
                  // textord_repch_width_variance, at least 3).
01877             if (blobcount == 1) {
01878               space_width = nextblob->bounding_box ().left ()
01879                 - bblob->bounding_box ().right ();
01880               blob_width = bblob->bounding_box ().width ();
01881               width_change =
01882                 blob_width >
01883                 space_width ? blob_width : space_width;
01884               width_change =
01885                 (INT32) (width_change *
01886                 textord_repch_width_variance);
01887               if (width_change < 3)
01888                 width_change = 3;
01889             }
                  // Stop the run if the neighbour's width or its gap from
                  // the previous blob is outside the tolerance.
01890             if (nextblob->bounding_box ().width () >
01891               blob_width + width_change
01892               || nextblob->bounding_box ().width () <
01893               blob_width - width_change
01894               || nextblob->bounding_box ().left () - prev_right >
01895               space_width + width_change
01896               || nextblob->bounding_box ().left () - prev_right <
01897             space_width - width_change) {
01898               if (testing_on)
01899                 tprintf
01900                   ("Repch terminated:bw=%d, sw=%d, wc=%d, pr=%d, nb=(%d,%d)\n",
01901                   blob_width, space_width, width_change,
01902                   prev_right, nextblob->bounding_box ().left (),
01903                   nextblob->bounding_box ().right ());
01904               break;             //not good enough
01905             }
                  // Shape comparison against the first blob of the run; a
                  // temporary PBLOB is built when the neighbour has no blob.
01906             if (nextblob->blob () != NULL)
01907               rating = compare_blobs (pblob1, real_row,
01908                 nextblob->blob (), real_row);
01909             else {
01910               pblob2 =
01911                 new PBLOB (nextblob->cblob (), row->xheight);
01912               rating =
01913                 compare_blobs(pblob1, real_row, pblob2, real_row);
01914               delete pblob2;
01915             }
                  // A rating at or above the threshold leaves the counts
                  // untouched, and the while condition then ends the run.
01916             if (rating < textord_repeat_rating) {
01917               //    if (testing_on)
01918               //       tprintf("Blob at (%d,%d)->(%d,%d) had rating %g\n",
01919               //               nextblob->bounding_box().left(),
01920               //               nextblob->bounding_box().bottom(),
01921               //               nextblob->bounding_box().right(),
01922               //               nextblob->bounding_box().top(),
01923               //               rating);
01924               blobcount++;
01925               search_it.forward ();
01926               matched_blobcount++;
                    // Joined fragments extend blobcount but are not counted
                    // as matches.
01927               while (blobindex + blobcount < row_length
01928               && search_it.data ()->joined_to_prev ()) {
01929                 search_it.forward ();
01930                 blobcount++;     //suck in joined bits
01931               }
01932             }
01933             prev_right = nextblob->bounding_box ().right ();
01934           }
01935           if (bblob->cblob () != NULL)
01936             delete pblob1;
01937 
                // Enough matches: turn the run of blobcount blobs into a
                // word.
01938           if (matched_blobcount >= textord_repeat_threshold) {
01939             word =
01940               make_real_word (&box_it, blobcount, bol, FALSE, FALSE,
01941               1);
01942 #ifndef GRAPHICS_DISABLED
01943             if (testing_on) {
01944               word_box = word->bounding_box ();
01945               tprintf
01946                 ("Found repeated word of %d blobs (%d matched) from (%d,%d)->(%d,%d)\n",
01947                 blobcount, matched_blobcount, word_box.left (),
01948                 word_box.bottom (), word_box.right (),
01949                 word_box.top ());
01950               perimeter_color_index(to_win, RED);
01951               interior_style(to_win, INT_HOLLOW, TRUE);
01952               rectangle (to_win, word_box.left (),
01953                 word_box.bottom (), word_box.right (),
01954                 word_box.top ());
01955             }
01956 #endif
01957             word->set_flag (W_REP_CHAR, TRUE);
01958             word->set_flag (W_DONT_CHOP, TRUE);
01959             word_it.add_after_then_move (word);
                  // NOTE(review): blobindex advances by blobcount here and
                  // by one more below; whether that matches where
                  // make_real_word() leaves box_it cannot be told from this
                  // listing - confirm against make_real_word().
01960             blobindex += blobcount;
01961           }
01962         }
01963         bol = FALSE;
01964         box_it.forward ();       //next one
01965         blobindex++;
01966       }
01967                                  //until all done
01968       while (!box_it.at_first ());
01969       delete real_row;
01970     }
01971   }
01972 }

BOOL8 find_row_pitch ( TO_ROW *  row,
INT32  maxwidth,
INT32  dm_gap,
TO_BLOCK *  block,
INT32  block_index,
INT32  row_index,
BOOL8  testing_on 
)

Find lines.

Check to see if this row could be fixed pitch using the given spacings. Blobs with gaps smaller than the lower threshold are assumed to be one. The larger threshold is the word gap threshold.

Definition at line 872 of file topitch.cpp.

References STATS::clear(), count_pitch_stats(), FALSE, TO_ROW::fixed_pitch, TO_ROW::fp_nonsp, TO_ROW::fp_space, STATS::ile(), NULL, pitch, PITCH_DUNNO, PITCH_MAYBE_FIXED, PITCH_MAYBE_PROP, tprintf(), TRUE, and TO_ROW::xheight.

Referenced by compute_rows_pitch().

00880                       {
00881   BOOL8 used_dm_model;           //looks like dot matrix
00882   float min_space;               //estimate threshold
00883   float non_space;               //gap size
00884   float gap_iqr;                 //interquartile range
00885   float pitch_iqr;               //interquartile range of pitch
00886   float dm_gap_iqr;              //interquartile range
00887   float dm_pitch_iqr;            //pitch iqr from the dm_gap pass
00888   float dm_pitch;                //pitch with dm on
00889   float pitch;                   //revised estimate
00890   float initial_pitch;           //guess at pitch
00891   STATS gap_stats (0, maxwidth);
00892                                  //centre-centre
00893   STATS pitch_stats (0, maxwidth);
00894 
00895   row->fixed_pitch = 0.0f;       //reset; only set again on success below
00896   initial_pitch = row->fp_space;
00897   if (initial_pitch > row->xheight * (1 + words_default_fixed_limit))
00898     initial_pitch = row->xheight;//keep pitch decent
00899   non_space = row->fp_nonsp;
00900   if (non_space > initial_pitch)
00901     non_space = initial_pitch;   //non-space gap never exceeds the pitch
00902   min_space = (initial_pitch + non_space) / 2;
00903 
00904   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,   //pass using dm_gap
00905   initial_pitch, min_space, TRUE, FALSE, dm_gap)) {
00906     dm_gap_iqr = 0.0001;
00907     dm_pitch_iqr = maxwidth * 2.0f;   //above maxwidth: counts as no data below
00908     dm_pitch = initial_pitch;
00909   }
00910   else {
00911     dm_gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00912     dm_pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00913     dm_pitch = pitch_stats.ile (0.5);   //median pitch
00914   }
00915   gap_stats.clear ();            //reuse the stats for the pass without dm_gap
00916   pitch_stats.clear ();
00917   if (!count_pitch_stats (row, &gap_stats, &pitch_stats,   //pass with 0 in place of dm_gap
00918   initial_pitch, min_space, TRUE, FALSE, 0)) {
00919     gap_iqr = 0.0001;
00920     pitch_iqr = maxwidth * 3.0f; //above maxwidth: counts as no data below
00921   }
00922   else {
00923     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00924     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00925     if (testing_on)
00926       tprintf
00927         ("First fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00928         initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00929     initial_pitch = pitch_stats.ile (0.5);   //adopt the median as new guess
00930     if (min_space > initial_pitch   //threshold above the new pitch: redo with it lowered
00931       && count_pitch_stats (row, &gap_stats, &pitch_stats,
00932     initial_pitch, initial_pitch, TRUE, FALSE, 0)) {
00933       min_space = initial_pitch;
00934       gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);
00935       pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00936       if (testing_on)
00937         tprintf
00938           ("Revised fp iteration:initial_pitch=%g, gap_iqr=%g, pitch_iqr=%g, pitch=%g\n",
00939           initial_pitch, gap_iqr, pitch_iqr, pitch_stats.ile (0.5));
00940       initial_pitch = pitch_stats.ile (0.5);
00941     }
00942   }
00943   if (textord_debug_pitch_metric)   //last %c: D=no data, S=non-dm chosen, M=dm chosen
00944     tprintf ("Blk=%d:Row=%d:%c:p_iqr=%g:g_iqr=%g:dm_p_iqr=%g:dm_g_iqr=%g:%c:",
00945       block_index, row_index,
00946       block->block->text_region () != NULL ?
00947       (block->block->text_region ()->is_prop ()? 'P' : 'F') : 'X',
00948     pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr,
00949     pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth ? 'D'
00950     : (pitch_iqr * dm_gap_iqr <=
00951     dm_pitch_iqr * gap_iqr ? 'S' : 'M'));
00952   if (pitch_iqr > maxwidth && dm_pitch_iqr > maxwidth) {   //both passes unusable
00953     row->pitch_decision = PITCH_DUNNO;
00954     if (textord_debug_pitch_metric)
00955       tprintf ("\n");
00956     return FALSE;                //insufficient data
00957   }
00958   if (pitch_iqr * dm_gap_iqr <= dm_pitch_iqr * gap_iqr) {   //cross-multiplied compare of pitch_iqr/gap_iqr ratios
00959     if (testing_on)
00960       tprintf
00961         ("Choosing non dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00962         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00963     gap_iqr = gap_stats.ile (0.75) - gap_stats.ile (0.25);   //stats last passed to the calls with final argument 0
00964     pitch_iqr = pitch_stats.ile (0.75) - pitch_stats.ile (0.25);
00965     pitch = pitch_stats.ile (0.5);
00966     used_dm_model = FALSE;
00967   }
00968   else {
00969     if (testing_on)
00970       tprintf
00971         ("Choosing dm version:pitch_iqr=%g, gap_iqr=%g, dm_pitch_iqr=%g, dm_gap_iqr=%g\n",
00972         pitch_iqr, gap_iqr, dm_pitch_iqr, dm_gap_iqr);
00973     gap_iqr = dm_gap_iqr;        //use the values saved from the dm_gap pass
00974     pitch_iqr = dm_pitch_iqr;
00975     pitch = dm_pitch;
00976     used_dm_model = TRUE;
00977   }
00978   if (textord_debug_pitch_metric) {   //iqr_res mirrors the decision test below
00979     tprintf ("rev_p_iqr=%g:rev_g_iqr=%g:pitch=%g:",
00980       pitch_iqr, gap_iqr, pitch);
00981     tprintf ("p_iqr/g=%g:p_iqr/x=%g:iqr_res=%c:",
00982       pitch_iqr / gap_iqr, pitch_iqr / block->xheight,
00983       pitch_iqr < gap_iqr * textord_fpiqr_ratio
00984       && pitch_iqr < block->xheight * textord_max_pitch_iqr
00985       && pitch < block->xheight * textord_words_default_maxspace
00986       ? 'F' : 'P');
00987   }
00988   if (pitch_iqr < gap_iqr * textord_fpiqr_ratio   //all three limits must hold for maybe-fixed
00989     && pitch_iqr < block->xheight * textord_max_pitch_iqr
00990     && pitch < block->xheight * textord_words_default_maxspace)
00991     row->pitch_decision = PITCH_MAYBE_FIXED;
00992   else
00993     row->pitch_decision = PITCH_MAYBE_PROP;
00994   row->fixed_pitch = pitch;
00995   row->kern_size = gap_stats.ile (0.5);
00996   row->min_space = (INT32) (row->fixed_pitch + non_space) / 2;   //NOTE(review): cast binds to the sum, so the division is on the INT32 value
00997   if (row->min_space > row->fixed_pitch)
00998     row->min_space = (INT32) row->fixed_pitch;
00999   row->max_nonspace = row->min_space;
01000   row->space_size = row->fixed_pitch;
01001   row->space_threshold = (row->max_nonspace + row->min_space) / 2;   //both terms are equal here
01002   row->used_dm_model = used_dm_model;
01003   return TRUE;
01004 }

void fix_row_pitch ( TO_ROW bad_row,
TO_BLOCK bad_block,
TO_BLOCK_LIST *  blocks,
INT32  row_target,
INT32  block_target 
)

Get some value.

Get a pitch_decision for this row by voting among similar rows in the block, then similar rows over all the page, or any other rows at all.

Definition at line 149 of file topitch.cpp.

References STATS::add(), TO_ROW::all_caps, TO_ROW::ascrise, TO_ROW::char_cells, FALSE, TO_ROW::fixed_pitch, STATS::ile(), TO_ROW::kern_size, TO_ROW::max_nonspace, TO_ROW::min_space, NULL, PITCH_CORR_FIXED, PITCH_CORR_PROP, TO_ROW::pitch_decision, PITCH_DEF_FIXED, PITCH_DEF_PROP, PITCH_MAYBE_FIXED, PITCH_MAYBE_PROP, TO_ROW::projection, STATS::set_range(), TO_ROW::space_size, TO_ROW::space_threshold, tprintf(), tune_row_pitch(), and TO_ROW::xheight.

Referenced by compute_fixed_pitch().

00155                     {
00156   const char *res_string;        //decision on line
00157   INT16 mid_cuts;                //passed to tune_row_pitch; not read here
00158   int block_votes;               //votes in block
00159   int like_votes;                //votes over page
00160   int other_votes;               //votes of unlike blocks
00161   int block_index;               //number of block
00162   int row_index;                 //number of row
00163   int maxwidth;                  //max pitch
00164   TO_BLOCK_IT block_it = blocks; //block iterator
00165   TO_ROW_IT row_it;
00166   TO_BLOCK *block;               //current block
00167   TO_ROW *row;                   //current row
00168   float sp_sd;                   //space deviation
00169   STATS block_stats;             //pitches in block
00170   STATS like_stats;              //pitches in page
00171 
00172   block_votes = like_votes = other_votes = 0;
00173   maxwidth = (INT32) ceil (bad_row->xheight * textord_words_maxspace);
00174   if (bad_row->pitch_decision != PITCH_DEF_FIXED   //definite decisions are not re-voted
00175   && bad_row->pitch_decision != PITCH_DEF_PROP) {
00176     block_stats.set_range (0, maxwidth);
00177     like_stats.set_range (0, maxwidth);
00178     block_index = 1;             //blocks are counted from 1
00179     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00180     block_it.forward ()) {
00181       block = block_it.data ();
00182       row_index = 1;             //rows are counted from 1
00183       row_it.set_to_list (block->get_rows ());
00184       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00185       row_it.forward ()) {
00186         row = row_it.data ();
00187         if (bad_row->all_caps   //similar row: all-caps compares xheight+ascrise, else xheight
00188           && row->xheight + row->ascrise
00189           <
00190           (bad_row->xheight + bad_row->ascrise) * (1 +
00191           textord_pitch_rowsimilarity)
00192           && row->xheight + row->ascrise >
00193           (bad_row->xheight + bad_row->ascrise) * (1 -
00194           textord_pitch_rowsimilarity)
00195           || !bad_row->all_caps
00196           && row->xheight <
00197           bad_row->xheight * (1 + textord_pitch_rowsimilarity)
00198           && row->xheight >
00199         bad_row->xheight * (1 - textord_pitch_rowsimilarity)) {
00200           if (block_index == block_target) {   //similar row in the target block
00201             if (row->pitch_decision == PITCH_DEF_FIXED) {
00202               block_votes += textord_words_veto_power;
00203               block_stats.add ((INT32) row->fixed_pitch,
00204                 textord_words_veto_power);
00205             }
00206             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00207             || row->pitch_decision == PITCH_CORR_FIXED) {
00208               block_votes++;
00209               block_stats.add ((INT32) row->fixed_pitch, 1);
00210             }
00211             else if (row->pitch_decision == PITCH_DEF_PROP)
00212               block_votes -= textord_words_veto_power;
00213             else if (row->pitch_decision == PITCH_MAYBE_PROP
00214               || row->pitch_decision == PITCH_CORR_PROP)
00215               block_votes--;
00216           }
00217           else {                 //similar row in another block
00218             if (row->pitch_decision == PITCH_DEF_FIXED) {
00219               like_votes += textord_words_veto_power;
00220               like_stats.add ((INT32) row->fixed_pitch,
00221                 textord_words_veto_power);
00222             }
00223             else if (row->pitch_decision == PITCH_MAYBE_FIXED
00224             || row->pitch_decision == PITCH_CORR_FIXED) {
00225               like_votes++;
00226               like_stats.add ((INT32) row->fixed_pitch, 1);
00227             }
00228             else if (row->pitch_decision == PITCH_DEF_PROP)
00229               like_votes -= textord_words_veto_power;
00230             else if (row->pitch_decision == PITCH_MAYBE_PROP
00231               || row->pitch_decision == PITCH_CORR_PROP)
00232               like_votes--;
00233           }
00234         }
00235         else {                   //dissimilar row: votes only, no pitch stats
00236           if (row->pitch_decision == PITCH_DEF_FIXED)
00237             other_votes += textord_words_veto_power;
00238           else if (row->pitch_decision == PITCH_MAYBE_FIXED
00239             || row->pitch_decision == PITCH_CORR_FIXED)
00240             other_votes++;
00241           else if (row->pitch_decision == PITCH_DEF_PROP)
00242             other_votes -= textord_words_veto_power;
00243           else if (row->pitch_decision == PITCH_MAYBE_PROP
00244             || row->pitch_decision == PITCH_CORR_PROP)
00245             other_votes--;
00246         }
00247         row_index++;
00248       }
00249       block_index++;
00250     }
00251     if (block_votes > textord_words_veto_power) {   //block vote wins outright
00252       bad_row->fixed_pitch = block_stats.ile (0.5);
00253       bad_row->pitch_decision = PITCH_CORR_FIXED;
00254     }
00255     else if (block_votes <= textord_words_veto_power && like_votes > 0) {   //first test is always true in this else branch
00256       bad_row->fixed_pitch = like_stats.ile (0.5);
00257       bad_row->pitch_decision = PITCH_CORR_FIXED;
00258     }
00259     else {
00260       bad_row->pitch_decision = PITCH_CORR_PROP;
00261       #ifndef SECURE_NAMES
00262       if (block_votes == 0 && like_votes == 0 && other_votes > 0
00263         && (textord_debug_pitch_test || textord_debug_pitch_metric))
00264         tprintf
00265           ("Warning:row %d of block %d set prop with no like rows against trend\n",
00266           row_target, block_target);
00267       #endif
00268     }
00269   }
00270   if (textord_debug_pitch_metric) {   //res_string: 1st letter C/W/X from text_region, 2nd P/F is the decision
00271     tprintf (":b_votes=%d:l_votes=%d:o_votes=%d",
00272       block_votes, like_votes, other_votes);
00273     if (bad_row->pitch_decision == PITCH_CORR_PROP
00274     || bad_row->pitch_decision == PITCH_DEF_PROP) {
00275       res_string = bad_block->block->text_region () != NULL ?
00276         (bad_block->block->text_region ()->
00277         is_prop ()? "CP" : "WP") : "XP";
00278     }
00279     else {
00280       res_string = bad_block->block->text_region () != NULL ?
00281         (bad_block->block->text_region ()->
00282         is_prop ()? "WF" : "CF") : "XF";
00283     }
00284     tprintf (":Blk=%d:Row=%d:%c:",
00285       block_target, row_target,
00286       bad_block->block->text_region () != NULL ?
00287       (bad_block->block->text_region ()->
00288       is_prop ()? 'P' : 'F') : 'X');
00289     tprintf ("x=%g:asc=%g:corr_res=%s\n", bad_row->xheight,
00290       bad_row->ascrise, res_string);
00291   }
00292   if (textord_pitch_cheat && bad_block->block->text_region () != NULL)   //override the vote with the text_region answer
00293     bad_row->pitch_decision =
00294       bad_block->block->text_region ()->
00295       is_prop ()? PITCH_CORR_PROP : PITCH_CORR_FIXED;
00296   if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
00297     if (bad_row->fixed_pitch < textord_min_xheight) {   //pitch too small: pick a replacement
00298       if (block_votes > 0)
00299         bad_row->fixed_pitch = block_stats.ile (0.5);
00300       else if (block_votes == 0 && like_votes > 0)
00301         bad_row->fixed_pitch = like_stats.ile (0.5);
00302       else {
00303         tprintf
00304           ("Warning:guessing pitch as xheight on row %d, block %d\n",
00305           row_target, block_target);
00306         bad_row->fixed_pitch = bad_row->xheight;
00307       }
00308     }
00309     if (bad_row->fixed_pitch < textord_min_xheight)   //still too small: clamp
00310       bad_row->fixed_pitch = (float) textord_min_xheight;
00311     bad_row->kern_size = bad_row->fixed_pitch / 4;   //spacing values set as fractions of the pitch
00312     bad_row->min_space = (INT32) (bad_row->fixed_pitch * 0.6);
00313     bad_row->max_nonspace = (INT32) (bad_row->fixed_pitch * 0.4);
00314     bad_row->space_threshold =
00315       (bad_row->min_space + bad_row->max_nonspace) / 2;
00316     bad_row->space_size = bad_row->fixed_pitch;
00317     if (bad_row->char_cells.empty ())   //NOTE(review): sp_sd and mid_cuts are uninitialised here; presumably outputs of tune_row_pitch - confirm
00318       tune_row_pitch (bad_row, &bad_row->projection,
00319         bad_row->projection_left, bad_row->projection_right,
00320         (bad_row->fixed_pitch +
00321         bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
00322         sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
00323   }
00324   else if (bad_row->pitch_decision == PITCH_CORR_PROP
00325   || bad_row->pitch_decision == PITCH_DEF_PROP) {
00326     bad_row->fixed_pitch = 0.0f; //proportional rows carry no pitch or cells
00327     bad_row->char_cells.clear ();
00328   }
00329 }

BOOL8 fixed_pitch_row ( TO_ROW row,
INT32  block_index 
)

Find lines.

Check to see if this row could be fixed pitch using the given spacings. Blobs with gaps smaller than the lower threshold are assumed to be one. The larger threshold is the word gap threshold.

Note:
Major changes for v1.03

Definition at line 1015 of file topitch.cpp.

References TO_ROW::all_caps, TO_ROW::char_cells, TO_ROW::fixed_pitch, TO_ROW::fp_nonsp, TO_ROW::pitch_decision, PITCH_DEF_FIXED, PITCH_DEF_PROP, PITCH_DUNNO, PITCH_MAYBE_FIXED, PITCH_MAYBE_PROP, TO_ROW::projection, TO_ROW::projection_left, TO_ROW::projection_right, tprintf(), TRUE, tune_row_pitch(), and TO_ROW::used_dm_model.

Referenced by try_rows_fixed().

01018                        {
01019   const char *res_string;        //pitch result
01020   INT16 mid_cuts;                //no of cheap cuts
01021   float non_space;               //gap size
01022   float pitch_sd;                //error on pitch
01023   float sp_sd;                   //space sd
01024 
01025   non_space = row->fp_nonsp;
01026   if (non_space > row->fixed_pitch)
01027     non_space = row->fixed_pitch;   //non-space gap never exceeds the pitch
01028   if (textord_all_prop) {
01029     // Set the decision to definitely proportional.
01030     pitch_sd = textord_words_def_prop * row->fixed_pitch;
01031     row->pitch_decision = PITCH_DEF_PROP;
01032   } else {
01033     pitch_sd = tune_row_pitch (row, &row->projection, row->projection_left,
01034                                row->projection_right,
01035                                (row->fixed_pitch + non_space * 3) / 4,
01036                                row->fixed_pitch, sp_sd, mid_cuts,
01037                                &row->char_cells,
01038                                block_index == textord_debug_block);
01039     if (pitch_sd < textord_words_pitchsd_threshold * row->fixed_pitch   //sd small enough to be fixed
01040       && ((pitsync_linear_version & 3) < 3
01041       || (pitsync_linear_version & 3) >= 3 && (row->used_dm_model
01042       || sp_sd > 20
01043       || pitch_sd == 0
01044     && sp_sd > 10))) {
01045       if (pitch_sd < textord_words_def_fixed * row->fixed_pitch   //definite needs tighter sd and not all caps
01046         && !row->all_caps
01047         && ((pitsync_linear_version & 3) < 3 || sp_sd > 20))
01048         row->pitch_decision = PITCH_DEF_FIXED;
01049       else
01050         row->pitch_decision = PITCH_MAYBE_FIXED;
01051     }
01052     else if ((pitsync_linear_version & 3) < 3
01053       || sp_sd > 20
01054       || mid_cuts > 0
01055       || pitch_sd >= textord_words_pitchsd_threshold * row->fixed_pitch) {
01056       if (pitch_sd < textord_words_def_prop * row->fixed_pitch)
01057         row->pitch_decision = PITCH_MAYBE_PROP;
01058       else
01059         row->pitch_decision = PITCH_DEF_PROP;
01060     }
01061     else
01062       row->pitch_decision = PITCH_DUNNO;
01063   }
01064 
01065   if (textord_debug_pitch_metric) {
01066     res_string = "??";           //used for any decision not listed below
01067     switch (row->pitch_decision) {
01068       case PITCH_DEF_PROP:
01069         res_string = "DP";
01070         break;
01071       case PITCH_MAYBE_PROP:
01072         res_string = "MP";
01073         break;
01074       case PITCH_DEF_FIXED:
01075         res_string = "DF";
01076         break;
01077       case PITCH_MAYBE_FIXED:
01078         res_string = "MF";
01079         break;                   //was missing: "MF" fell through and printed as "??"
01080       default: break;            //res_string keeps the "??" set above
01081     }
01082     tprintf (":sd/p=%g:occ=%g:init_res=%s\n",
01083       pitch_sd / row->fixed_pitch, sp_sd, res_string);
01084   }
01085   return TRUE;                   //always TRUE; the result is row->pitch_decision
01086 }

void plot_fp_word ( TO_BLOCK block,
float  pitch,
float  nonspace 
)

Draw block of words.

Plot a block of words as if fixed pitch.

Definition at line 1981 of file topitch.cpp.

References TO_ROW::max_nonspace, TO_ROW::min_space, plot_word_decisions(), TO_ROW::space_threshold, and to_win.

01985                    {
01986   TO_ROW *row;                   //current row
01987   TO_ROW_IT row_it = block->get_rows ();
01988 
01989   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01990     row = row_it.data ();
01991     row->min_space = (INT32) ((pitch + nonspace) / 2);   //midway between pitch and nonspace
01992     row->max_nonspace = row->min_space;   //all three thresholds get the same value
01993     row->space_threshold = row->min_space;
01994     plot_word_decisions (to_win, (INT16) pitch, row);   //note: the row's spacing fields stay overwritten
01995   }
01996 }

void print_block_counts ( TO_BLOCK block,
INT32  block_index 
)

Print line stats.

Count up how many rows have what decision and print the results.

Definition at line 650 of file topitch.cpp.

References count_block_votes(), NULL, and tprintf().

Referenced by compute_fixed_pitch(), and try_rows_fixed().

00653                          {
00654   INT32 def_fixed = 0;           //counters
00655   INT32 def_prop = 0;
00656   INT32 maybe_fixed = 0;
00657   INT32 maybe_prop = 0;
00658   INT32 dunno = 0;
00659   INT32 corr_fixed = 0;
00660   INT32 corr_prop = 0;
00661 
00662   count_block_votes(block,       //presumably fills the counters by reference - confirm
00663                     def_fixed,
00664                     def_prop,
00665                     maybe_fixed,
00666                     maybe_prop,
00667                     corr_fixed,
00668                     corr_prop,
00669                     dunno);
00670   tprintf ("Block %d has (%d,%d,%d)",
00671     block_index, def_fixed, maybe_fixed, corr_fixed);
00672   if ((textord_blocksall_prop    //flag fixed rows when the block is expected to be prop
00673     || block->block->text_region () != NULL
00674     && block->block->text_region ()->is_prop ()) && (def_fixed
00675     || maybe_fixed
00676     || corr_fixed))
00677     tprintf (" (Wrongly)");
00678   tprintf (" fixed, (%d,%d,%d)", def_prop, maybe_prop, corr_prop);
00679   if ((textord_blocksall_fixed   //flag prop rows when the block is expected to be fixed
00680     || block->block->text_region () != NULL
00681     && !block->block->text_region ()->is_prop ()) && (def_prop
00682     || maybe_prop
00683     || corr_prop))
00684     tprintf (" (Wrongly)");
00685   tprintf (" prop, %d dunno\n", dunno);
00686 }

void print_pitch_sd ( TO_ROW row,
STATS projection,
INT16  projection_left,
INT16  projection_right,
float  space_size,
float  initial_pitch 
)

Print fp cells.

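Use a dynamic-programming (dp) algorithm to fit the character cells and print the standard deviation (sd) of the cell size over the row. The function returns nothing; the results are only written via tprintf().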

Definition at line 1665 of file topitch.cpp.

References TO_ROW::all_caps, blob_count, TO_ROW::blob_list(), box_next(), check_pitch_sync2(), CORAL, TO_ROW::fixed_pitch, BOX::left(), NO_WINDOW, plot_fp_cells2(), projection, BOX::right(), seg_list, to_win, tprintf(), and TO_ROW::xheight.

Referenced by tune_row_pitch(), and tune_row_pitch2().

01672                      {
01673   const char *res2;              //pitch result
01674   INT16 occupation;              //used cells
01675   float sp_sd;                   //space sd
01676                                  //blobs
01677   BLOBNBOX_IT blob_it = row->blob_list ();
01678   BLOBNBOX_IT start_it;          //start of word
01679   BLOBNBOX_IT row_start;         //start of row
01680   INT16 blob_count;              //no of blobs
01681   INT16 total_blob_count;        //total blobs in line
01682   BOX blob_box;                  //bounding box
01683   BOX prev_box;                  //of super blob
01684   INT32 prev_right;              //of word sync
01685   int scale_factor;              //on scores for big words
01686   INT32 sp_count;                //spaces
01687   FPSEGPT_LIST seg_list;         //char cells
01688   FPSEGPT_IT seg_it;             //iterator
01689   double sqsum;                  //sum of squares
01690   double spsum;                  //of spaces
01691   double sp_var;                 //space error
01692   double word_sync;              //result for word
01693   double total_count;            //total cuts
01694 
01695   if (blob_it.empty ())
01696     return;                      //nothing to report for an empty row
01697   row_start = blob_it;
01698   total_blob_count = 0;
01699 
01700   total_count = 0;
01701   sqsum = 0;
01702   sp_count = 0;
01703   spsum = 0;
01704   prev_right = -1;               //negative means no previous word yet
01705   blob_it = row_start;
01706   start_it = blob_it;
01707   blob_count = 0;
01708   blob_box = box_next (&blob_it);//first blob
01709   blob_it.mark_cycle_pt ();
01710   do {                           //one iteration per word
01711     for (; blob_count > 0; blob_count--)   //advance start_it past the previous word
01712       box_next(&start_it);
01713     do {                         //extend the word while gaps stay below space_size
01714       prev_box = blob_box;
01715       blob_count++;
01716       blob_box = box_next (&blob_it);
01717     }
01718     while (!blob_it.cycled_list ()
01719       && blob_box.left () - prev_box.right () < space_size);
01720     word_sync =
01721       check_pitch_sync2 (&start_it, blob_count, (INT16) initial_pitch, 2,
01722       projection, projection_left, projection_right,
01723       row->xheight * textord_projection_scale,
01724       occupation, &seg_list, 0, 0);
01725     total_blob_count += blob_count;
01726     seg_it.set_to_list (&seg_list);
01727     if (prev_right >= 0) {       //not the first word: measure the inter-word error
01728       sp_var = seg_it.data ()->position () - prev_right;
01729       sp_var -= floor (sp_var / initial_pitch + 0.5) * initial_pitch;   //remainder from nearest multiple of pitch
01730       sp_var *= sp_var;
01731       spsum += sp_var;
01732       sp_count++;
01733     }
01734     seg_it.move_to_last ();
01735     prev_right = seg_it.data ()->position ();
01736     if (textord_pitch_scalebigwords) {
01737       scale_factor = (seg_list.length () - 2) / 2;
01738       if (scale_factor < 1)
01739         scale_factor = 1;        //never scale below 1
01740     }
01741     else
01742       scale_factor = 1;
01743     sqsum += word_sync * scale_factor;
01744     total_count += (seg_list.length () - 1) * scale_factor;
01745     seg_list.clear ();
01746   }
01747   while (!blob_it.cycled_list ());
01748   sp_sd = sp_count > 0 ? sqrt (spsum / sp_count) : 0;
01749   word_sync = total_count > 0 ? sqrt (sqsum / total_count) : space_size * 10;   //fallback value when nothing was counted
01750   tprintf ("new_sd=%g:sd/p=%g:new_sp_sd=%g:res=%c:",
01751     word_sync, word_sync / initial_pitch, sp_sd,
01752     word_sync < textord_words_pitchsd_threshold * initial_pitch
01753     ? 'F' : 'P');
01754 
01755   start_it = row_start;          //second measurement: whole row in one call
01756   blob_it = row_start;
01757   word_sync =
01758     check_pitch_sync2 (&blob_it, total_blob_count, (INT16) initial_pitch, 2,
01759     projection, projection_left, projection_right,
01760     row->xheight * textord_projection_scale, occupation,
01761     &seg_list, 0, 0);
01762   if (occupation > 1)
01763     word_sync /= occupation;
01764   word_sync = sqrt (word_sync);
01765 
01766 #ifndef GRAPHICS_DISABLED
01767   if (textord_show_row_cuts && to_win != NO_WINDOW)
01768     plot_fp_cells2(to_win, CORAL, row, &seg_list);
01769 #endif
01770   seg_list.clear ();
01771   if (word_sync < textord_words_pitchsd_threshold * initial_pitch) {   //res2: DF/MF/MP/DP from the row sd
01772     if (word_sync < textord_words_def_fixed * initial_pitch
01773       && !row->all_caps)
01774       res2 = "DF";
01775     else
01776       res2 = "MF";
01777   }
01778   else
01779     res2 = word_sync < textord_words_def_prop * initial_pitch ? "MP" : "DP";
01780   tprintf
01781     ("row_sd=%g:sd/p=%g:res=%c:N=%d:res2=%s,init pitch=%g, row_pitch=%g, all_caps=%d\n",
01782     word_sync, word_sync / initial_pitch,
01783     word_sync < textord_words_pitchsd_threshold * initial_pitch ? 'F' : 'P',
01784     occupation, res2, initial_pitch, row->fixed_pitch, row->all_caps);
01785 }

BOOL8 row_pitch_stats ( TO_ROW row,
INT32  maxwidth,
BOOL8  testing_on 
)

Find line stats.

Decide whether each row is fixed pitch individually.

Definition at line 740 of file topitch.cpp.

References STATS::add(), TO_ROW::blob_list(), BLOCK_STATS_CLUSTERS, STATS::cluster(), FALSE, TO_ROW::fp_nonsp, TO_ROW::fp_space, STATS::get_total(), BOX::left(), TO_ROW::pr_nonsp, TO_ROW::pr_space, BOX::right(), STATS::smooth(), sort_floats2(), tprintf(), TRUE, and TO_ROW::xheight.

Referenced by compute_rows_pitch().

00744                        {
00745   BLOBNBOX *blob;                //current blob
00746   int gap_index;                 //current gap
00747   INT32 prev_x;                  //end of prev blob
00748   INT32 cluster_count;           //no of clusters
00749   INT32 prev_count;              //of clusters
00750   INT32 smooth_factor;           //for smoothing stats
00751   BOX blob_box;                  //bounding box
00752   float lower, upper;            //cluster thresholds
00753                                  //gap sizes
00754   float gaps[BLOCK_STATS_CLUSTERS];
00755                                  //blobs
00756   BLOBNBOX_IT blob_it = row->blob_list ();
00757   STATS gap_stats (0, maxwidth);
00758   STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
00759   //clusters, read from index 1 upward below
00760 
00761   smooth_factor =
00762     (INT32) (row->xheight * textord_wordstats_smooth_factor + 1.5);
00763   if (!blob_it.empty ()) {
00764     prev_x = blob_it.data ()->bounding_box ().right ();
00765     blob_it.forward ();
00766     while (!blob_it.at_first ()) {   //until the iterator wraps round to the first blob
00767       blob = blob_it.data ();
00768       if (!blob->joined_to_prev ()) {   //joined blobs add no gap and do not move prev_x
00769         blob_box = blob->bounding_box ();
00770         if (blob_box.left () - prev_x < maxwidth)   //gaps of maxwidth or more are left out
00771           gap_stats.add (blob_box.left () - prev_x, 1);
00772         prev_x = blob_box.right ();
00773       }
00774       blob_it.forward ();
00775     }
00776   }
00777   if (gap_stats.get_total () == 0) {
00778     return FALSE;                //no gaps recorded
00779   }
00780   cluster_count = 0;
00781   lower = row->xheight * words_initial_lower;
00782   upper = row->xheight * words_initial_upper;
00783   gap_stats.smooth (smooth_factor);
00784   do {                           //re-cluster while the count grows and is under the limit
00785     prev_count = cluster_count;
00786     cluster_count = gap_stats.cluster (lower, upper,
00787       textord_spacesize_ratioprop,
00788       BLOCK_STATS_CLUSTERS, cluster_stats);
00789   }
00790   while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
00791   if (cluster_count < 1) {
00792     return FALSE;                //no clusters found
00793   }
00794   for (gap_index = 0; gap_index < cluster_count; gap_index++)
00795     gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00796   //get medians
00797   if (testing_on) {
00798     tprintf ("cluster_count=%d:", cluster_count);
00799     for (gap_index = 0; gap_index < cluster_count; gap_index++)
00800       tprintf (" %g(%d)", gaps[gap_index],
00801         cluster_stats[gap_index + 1].get_total ());
00802     tprintf ("\n");
00803   }
00804   qsort (gaps, cluster_count, sizeof (float), sort_floats2);   //ascending order
00805 
00806   //Try to find proportional non-space and space for row.
00807   lower = row->xheight * words_default_prop_nonspace;
00808   upper = row->xheight * textord_words_min_minspace;
00809   for (gap_index = 0; gap_index < cluster_count   //empty body: find first gap >= lower
00810     && gaps[gap_index] < lower; gap_index++);
00811   if (gap_index == 0) {
00812     if (testing_on)
00813       tprintf ("No clusters below nonspace threshold!!\n");
00814     if (cluster_count > 1) {
00815       row->pr_nonsp = gaps[0];   //use the two smallest clusters
00816       row->pr_space = gaps[1];
00817     }
00818     else {
00819       row->pr_nonsp = lower;     //only one cluster: it becomes the space
00820       row->pr_space = gaps[0];
00821     }
00822   }
00823   else {
00824     row->pr_nonsp = gaps[gap_index - 1];   //largest gap below lower
00825     while (gap_index < cluster_count && gaps[gap_index] < upper)
00826       gap_index++;
00827     if (gap_index == cluster_count) {
00828       if (testing_on)
00829         tprintf ("No clusters above nonspace threshold!!\n");
00830       row->pr_space = lower * textord_spacesize_ratioprop;
00831     }
00832     else
00833       row->pr_space = gaps[gap_index];   //first gap at or above upper
00834   }
00835 
00836   //Now try to find the fixed pitch space and non-space.
00837   upper = row->xheight * words_default_fixed_space;
00838   for (gap_index = 0; gap_index < cluster_count   //empty body: find first gap >= upper
00839     && gaps[gap_index] < upper; gap_index++);
00840   if (gap_index == 0) {
00841     if (testing_on)
00842       tprintf ("No clusters below space threshold!!\n");
00843     row->fp_nonsp = upper;
00844     row->fp_space = gaps[0];
00845   }
00846   else {
00847     row->fp_nonsp = gaps[gap_index - 1];   //largest gap below upper
00848     if (gap_index == cluster_count) {
00849       if (testing_on)
00850         tprintf ("No clusters above space threshold!!\n");
00851       row->fp_space = row->xheight;
00852     }
00853     else
00854       row->fp_space = gaps[gap_index];
00855   }
00856   if (testing_on) {
00857     tprintf
00858       ("Initial estimates:pr_nonsp=%g, pr_space=%g, fp_nonsp=%g, fp_space=%g\n",
00859       row->pr_nonsp, row->pr_space, row->fp_nonsp, row->fp_space);
00860   }
00861   return TRUE;                   //computed some stats
00862 }

int sort_floats2 ( const void *  arg1,
const void *  arg2 
)

qsort function to sort 2 floats.

Definition at line 1791 of file topitch.cpp.

Referenced by row_pitch_stats().

01793                                    {  //qsort-style comparator giving ascending order of two floats
01794   float diff;                    //*arg1 minus *arg2
01795   //arg1 and arg2 are read as pointers to float
01796   diff = *((float *) arg1) - *((float *) arg2);
01797   if (diff > 0)
01798     return 1;                    //first value is the larger
01799   else if (diff < 0)
01800     return -1;                   //first value is the smaller
01801   else
01802     return 0;                    //equal; also reached when diff is NaN as both tests fail
01803 }

BOOL8 try_block_fixed ( TO_BLOCK block,
INT32  block_index 
)

Find line stats.

Try to call the entire block fixed.

Definition at line 569 of file topitch.cpp.

References FALSE.

Referenced by compute_fixed_pitch().

00572                        {  //stub: neither block nor block_index is used
00573   return FALSE;          //unconditional result
00574 }

BOOL8 try_doc_fixed ( ICOORD  page_tr,
TO_BLOCK_LIST *  port_blocks,
float  gradient 
)

Determine pitch.

Attempt to call the entire document fixed pitch.

Definition at line 424 of file topitch.cpp.

References STATS::add(), TO_ROW::baseline, TO_ROW::char_cells, CORAL, FALSE, TO_ROW::fixed_pitch, STATS::get_total(), GOLDENROD, STATS::ile(), TO_ROW::intercept(), MAX_ALLOWED_PITCH, MAX_INT16, NO_WINDOW, NULL, STATS::pile_count(), pitch, STATS::plot(), plot_row_cells(), projection, TO_ROW::projection, TO_ROW::projection_left, TO_ROW::projection_right, STATS::set_range(), to_win, tprintf(), tune_row_pitch(), and QSPLINE::y().

Referenced by compute_fixed_pitch().

00428                      {  //merges every row's projection into one page-wide projection and tunes one pitch for it
00429   INT16 master_x;                //x of the reference row's left projection edge
00430   INT16 pitch;                   //median pitch.
00431   int x;                         //profile coord
00432   int prop_blocks;               //correct counts
00433   int fixed_blocks;
00434   int total_row_count;           //total in page
00435                                  //iterator
00436   TO_BLOCK_IT block_it = port_blocks;
00437   TO_BLOCK *block;               //current block;
00438   TO_ROW_IT row_it;              //row iterator
00439   TO_ROW *row;                   //current row
00440   INT16 projection_left;         //edges
00441   INT16 projection_right;
00442   INT16 row_left;                //edges of row
00443   INT16 row_right;
00444   ICOORDELT_LIST *master_cells;  //cells for page
00445   float master_y;                //baseline y of the reference row at master_x
00446   float shift_factor;            //page skew correction
00447   float row_shift;               //shift for row
00448   float final_pitch;             //output pitch
00449   float row_y;                   //baseline
00450   STATS projection;              //entire page
00451   STATS pitches (0, MAX_ALLOWED_PITCH);
00452   //for median
00453   float sp_sd;                   //space sd
00454   INT16 mid_cuts;                //no of cheap cuts
00455   float pitch_sd;                //sync rating
00456   //nothing to do without blocks or when textord_blockndoc_fixed is off
00457   if (block_it.empty ()
00458     //      || block_it.data()==block_it.data_relative(1)
00459     || !textord_blockndoc_fixed)
00460     return FALSE;
00461   shift_factor = gradient / (gradient * gradient + 1);
00462   row_it.set_to_list (block_it.data ()->get_rows ());  //rows of the block block_it currently points at
00463   master_x = row_it.data ()->projection_left;  //NOTE(review): assumes that block has at least one row - confirm
00464   master_y = row_it.data ()->baseline.y (master_x);
00465   projection_left = MAX_INT16;   //start with an empty extent so any row narrows it
00466   projection_right = -MAX_INT16;
00467   prop_blocks = 0;
00468   fixed_blocks = 0;
00469   total_row_count = 0;
00470   //pass 1: count blocks and rows, collect row pitches, find the shifted projection extent
00471   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00472   block_it.forward ()) {
00473     block = block_it.data ();
00474     if (block->block->text_region () != NULL) {
00475       if (block->block->text_region ()->is_prop ())
00476         prop_blocks++;
00477       else
00478         fixed_blocks++;
00479     }
00480     row_it.set_to_list (block->get_rows ());
00481     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00482       row = row_it.data ();
00483       total_row_count++;
00484       if (row->fixed_pitch > 0)
00485         pitches.add ((INT32) (row->fixed_pitch), 1);
00486       //find median
00487       row_y = row->baseline.y (master_x);
00488       row_left =
00489         (INT16) (row->projection_left -
00490         shift_factor * (master_y - row_y));
00491       row_right =
00492         (INT16) (row->projection_right -
00493         shift_factor * (master_y - row_y));
00494       if (row_left < projection_left)
00495         projection_left = row_left;
00496       if (row_right > projection_right)
00497         projection_right = row_right;
00498     }
00499   }
00500   if (pitches.get_total () == 0)
00501     return FALSE;                //pitches holds no samples (presumably no row had fixed_pitch > 0)
00502   projection.set_range (projection_left, projection_right);
00503   //pass 2: add each row's projection into the page projection using the same shift as pass 1
00504   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00505   block_it.forward ()) {
00506     block = block_it.data ();
00507     row_it.set_to_list (block->get_rows ());
00508     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00509       row = row_it.data ();
00510       row_y = row->baseline.y (master_x);
00511       row_left =
00512         (INT16) (row->projection_left -
00513         shift_factor * (master_y - row_y));
00514       for (x = row->projection_left; x < row->projection_right;
00515       x++, row_left++) {
00516         projection.add (row_left, row->projection.pile_count (x));
00517       }
00518     }
00519   }
00520   //the row at block_it's current position receives the tuned cells in its char_cells
00521   row_it.set_to_list (block_it.data ()->get_rows ());
00522   row = row_it.data ();
00523 #ifndef GRAPHICS_DISABLED
00524   if (textord_show_page_cuts && to_win != NO_WINDOW)
00525     projection.plot (to_win, projection_left,
00526       row->intercept (), 1.0f, -1.0f, CORAL);
00527 #endif
00528   final_pitch = pitches.ile (0.5);  //starting pitch for the tuning below
00529   pitch = (INT16) final_pitch;
00530   pitch_sd =
00531     tune_row_pitch (row, &projection, projection_left, projection_right,
00532     pitch * 0.75, final_pitch, sp_sd, mid_cuts,
00533     &row->char_cells, FALSE);
00534   //pitch * 0.75 above is passed as tune_row_pitch's space_size argument
00535   if (textord_debug_pitch_metric)
00536     tprintf
00537       ("try_doc:props=%d:fixed=%d:pitch=%d:final_pitch=%g:pitch_sd=%g:sp_sd=%g:sd/trc=%g:sd/p=%g:sd/trc/p=%g\n",
00538       prop_blocks, fixed_blocks, pitch, final_pitch, pitch_sd, sp_sd,
00539       pitch_sd / total_row_count, pitch_sd / pitch,
00540       pitch_sd / total_row_count / pitch);
00541   //debug display: draw the page-wide cells over every row with that row's shift
00542 #ifndef GRAPHICS_DISABLED
00543   if (textord_show_page_cuts && to_win != NO_WINDOW) {
00544     master_cells = &row->char_cells;
00545     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00546     block_it.forward ()) {
00547       block = block_it.data ();
00548       row_it.set_to_list (block->get_rows ());
00549       for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00550       row_it.forward ()) {
00551         row = row_it.data ();
00552         row_y = row->baseline.y (master_x);
00553         row_shift = shift_factor * (master_y - row_y);
00554         plot_row_cells(to_win, GOLDENROD, row, row_shift, master_cells);
00555       }
00556     }
00557   }
00558 #endif
00559   row->char_cells.clear ();  //NOTE(review): if the plot loop ran, row was reassigned there and may not be the row that master_cells points into - confirm
00560   return FALSE;                  //FALSE on every path, including after tuning
00561 }

BOOL8 try_rows_fixed ( TO_BLOCK *  block,
INT32  block_index,
BOOL8  testing_on 
)

Find line stats.

Decide whether each row is fixed pitch individually.

Definition at line 582 of file topitch.cpp.

References ASSERT_HOST, count_block_votes(), FALSE, TO_ROW::fixed_pitch, fixed_pitch_row(), TO_ROW::kern_size, PITCH_DEF_FIXED, PITCH_DEF_PROP, PITCH_DUNNO, PITCH_MAYBE_FIXED, PITCH_MAYBE_PROP, TO_ROW::pr_nonsp, TO_ROW::pr_space, print_block_counts(), TO_ROW::space_size, tprintf(), and TO_ROW::xheight.

Referenced by compute_fixed_pitch().

00586                       {  //calls fixed_pitch_row on rows with a pitch estimate, then sets block->pitch_decision from vote counts
00587   INT32 maxwidth;                //of spaces
00588   TO_ROW *row;                   //current row
00589   INT32 row_index;               //row number.
00590   INT32 def_fixed = 0;           //counters
00591   INT32 def_prop = 0;
00592   INT32 maybe_fixed = 0;
00593   INT32 maybe_prop = 0;
00594   INT32 dunno = 0;
00595   INT32 corr_fixed = 0;
00596   INT32 corr_prop = 0;
00597   float lower, upper;            //cluster thresholds
00598   TO_ROW_IT row_it = block->get_rows ();
00599   //NOTE(review): maxwidth and row_index are written below but never read in this function
00600   row_index = 1;
00601   for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00602     row = row_it.data ();
00603     ASSERT_HOST (row->xheight > 0);
00604     maxwidth = (INT32) ceil (row->xheight * textord_words_maxspace);
00605     if (row->fixed_pitch > 0 && fixed_pitch_row (row, block_index)) {
00606       if (row->fixed_pitch == 0) {  //presumably fixed_pitch_row can reset fixed_pitch to 0; otherwise unreachable
00607         lower = row->pr_nonsp;
00608         upper = row->pr_space;
00609         row->space_size = upper;   //use the proportional estimates for this row
00610         row->kern_size = lower;
00611       }
00612     }
00613     row_index++;
00614   }
00615   count_block_votes(block,       //presumably fills the seven counters by reference - confirm
00616                     def_fixed,
00617                     def_prop,
00618                     maybe_fixed,
00619                     maybe_prop,
00620                     corr_fixed,
00621                     corr_prop,
00622                     dunno);
00623   if (testing_on
00624     && (textord_debug_pitch_test
00625   || textord_blocksall_prop || textord_blocksall_fixed)) {
00626     tprintf ("Initially:");
00627     print_block_counts(block, block_index);
00628   }
00629   if (def_fixed > def_prop * textord_words_veto_power)  //definite votes are considered first
00630     block->pitch_decision = PITCH_DEF_FIXED;
00631   else if (def_prop > def_fixed * textord_words_veto_power)
00632     block->pitch_decision = PITCH_DEF_PROP;
00633   else if (def_fixed > 0 || def_prop > 0)  //definite votes exist but neither side outweighs the other
00634     block->pitch_decision = PITCH_DUNNO;
00635   else if (maybe_fixed > maybe_prop * textord_words_veto_power)  //no definite votes: use the maybe votes
00636     block->pitch_decision = PITCH_MAYBE_FIXED;
00637   else if (maybe_prop > maybe_fixed * textord_words_veto_power)
00638     block->pitch_decision = PITCH_MAYBE_PROP;
00639   else
00640     block->pitch_decision = PITCH_DUNNO;
00641   return FALSE;                  //FALSE on every path; the result is left in block->pitch_decision
00642 }

float tune_row_pitch ( TO_ROW *  row,
STATS *  projection,
INT16  projection_left,
INT16  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
INT16 &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
BOOL8  testing_on 
)

Find fp cells.

Use a dp algorithm to fit the character cells and return the sd of the cell size over the row.

Definition at line 1185 of file topitch.cpp.

References compute_pitch_sd(), print_pitch_sd(), projection, tprintf(), and tune_row_pitch2().

Referenced by fix_row_pitch(), fixed_pitch_row(), and try_doc_fixed().

01196                       {  //tries integer offsets around initial_pitch and keeps the pitch with the lowest compute_pitch_sd result
01197   int pitch_delta;               //offset pitch
01198   INT16 mid_cuts;                //cheap cuts
01199   float pitch_sd;                //current sd
01200   float best_sd;                 //best result
01201   float best_pitch;              //pitch for best result
01202   float initial_sd;              //starting error
01203   float sp_sd;                   //space sd
01204   ICOORDELT_LIST test_cells;     //row cells
01205   ICOORDELT_IT best_it;          //start of best list
01206   //fast variant requested: hand the whole job to tune_row_pitch2
01207   if (textord_fast_pitch_test)
01208     return tune_row_pitch2 (row, projection, projection_left,
01209       projection_right, space_size, initial_pitch,
01210       best_sp_sd,
01211     //space sd
01212       best_mid_cuts, best_cells, testing_on);
01213   if (textord_disable_pitch_test) {
01214     best_sp_sd = initial_pitch;  //best_mid_cuts and best_cells are left untouched on this path
01215     return initial_pitch;        //the pitch itself is returned in place of an sd
01216   }
01217   initial_sd =
01218     compute_pitch_sd(row,
01219                      projection,
01220                      projection_left,
01221                      projection_right,
01222                      space_size,
01223                      initial_pitch,
01224                      best_sp_sd,
01225                      best_mid_cuts,
01226                      best_cells,
01227                      testing_on);
01228   best_sd = initial_sd;          //the unmodified pitch is the result to beat
01229   best_pitch = initial_pitch;
01230   if (testing_on)
01231     tprintf ("tune_row_pitch:start pitch=%g, sd=%g\n", best_pitch, best_sd);
01232   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {  //search above initial_pitch
01233     pitch_sd =
01234       compute_pitch_sd (row, projection, projection_left, projection_right,
01235       space_size, initial_pitch + pitch_delta, sp_sd,
01236       mid_cuts, &test_cells, testing_on);
01237     if (testing_on)
01238       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch + pitch_delta,
01239         pitch_sd);
01240     if (pitch_sd < best_sd) {
01241       best_sd = pitch_sd;
01242       best_mid_cuts = mid_cuts;
01243       best_sp_sd = sp_sd;
01244       best_pitch = initial_pitch + pitch_delta;
01245       best_cells->clear ();
01246       best_it.set_to_list (best_cells);
01247       best_it.add_list_after (&test_cells);  //presumably transfers the test cells into best_cells - confirm
01248     }
01249     else
01250       test_cells.clear ();       //discard the losing candidate's cells
01251     if (pitch_sd > initial_sd)
01252       break;                     //getting worse
01253   }
01254   for (pitch_delta = 1; pitch_delta <= textord_pitch_range; pitch_delta++) {  //search below initial_pitch
01255     pitch_sd =
01256       compute_pitch_sd (row, projection, projection_left, projection_right,
01257       space_size, initial_pitch - pitch_delta, sp_sd,
01258       mid_cuts, &test_cells, testing_on);
01259     if (testing_on)
01260       tprintf ("testing pitch at %g, sd=%g\n", initial_pitch - pitch_delta,
01261         pitch_sd);
01262     if (pitch_sd < best_sd) {
01263       best_sd = pitch_sd;
01264       best_mid_cuts = mid_cuts;
01265       best_sp_sd = sp_sd;
01266       best_pitch = initial_pitch - pitch_delta;
01267       best_cells->clear ();
01268       best_it.set_to_list (best_cells);
01269       best_it.add_list_after (&test_cells);
01270     }
01271     else
01272       test_cells.clear ();
01273     if (pitch_sd > initial_sd)
01274       break;                     //worse than the starting sd: stop searching this side
01275   }
01276   initial_pitch = best_pitch;    //hand the chosen pitch back through the reference parameter
01277 
01278   if (textord_debug_pitch_metric)
01279     print_pitch_sd(row,
01280                    projection,
01281                    projection_left,
01282                    projection_right,
01283                    space_size,
01284                    best_pitch);
01285 
01286   return best_sd;                //lowest sd seen over all pitches tried
01287 }

float tune_row_pitch2 ( TO_ROW *  row,
STATS *  projection,
INT16  projection_left,
INT16  projection_right,
float  space_size,
float &  initial_pitch,
float &  best_sp_sd,
INT16 &  best_mid_cuts,
ICOORDELT_LIST *  best_cells,
BOOL8  testing_on 
)

Find fp cells.

Use a dp algorithm to fit the character cells and return the sd of the cell size over the row.

Definition at line 1296 of file topitch.cpp.

References compute_pitch_sd(), NULL, STATS::pile_count(), print_pitch_sd(), projection, and tprintf().

Referenced by tune_row_pitch().

01307                        {  //folds the projection modulo each candidate pitch, picks the emptiest column, then scores that pitch
01308   int pitch_delta;               //offset pitch
01309   INT16 pixel;                   //pixel coord
01310   INT16 best_pixel;              //pixel coord
01311   INT16 best_delta;              //best pitch
01312   INT16 best_pitch;              //best pitch
01313   INT16 start;                   //of good range
01314   INT16 end;                     //of good range
01315   INT32 best_count;              //lowest sum
01316   float best_sd;                 //best result
01317   STATS *sum_proj;               //summed projection
01318   //default output for the early returns below
01319   best_sp_sd = initial_pitch;
01320   //every candidate pitch (best_pitch + pitch_delta) is used as a modulus below, so all of them must be > 0
01321   if (textord_disable_pitch_test || (INT32) initial_pitch <= textord_pitch_range) {
01322     return initial_pitch;        //test disabled, or pitch too small to search without dividing by zero
01323   }
01324   sum_proj = new STATS[textord_pitch_range * 2 + 1];  //one histogram per delta, indexed by textord_pitch_range + pitch_delta
01325   if (sum_proj == NULL)
01326     return initial_pitch;
01327   best_pitch = (INT32) initial_pitch;
01328   //size each histogram for its own candidate pitch
01329   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01330     pitch_delta++)
01331   sum_proj[textord_pitch_range + pitch_delta].set_range (0,
01332       best_pitch +
01333       pitch_delta + 1);
01334   for (pixel = projection_left; pixel <= projection_right; pixel++) {  //fold the projection modulo every candidate pitch
01335     for (pitch_delta = -textord_pitch_range;
01336       pitch_delta <= textord_pitch_range; pitch_delta++)
01337     sum_proj[textord_pitch_range +
01338         pitch_delta].add ((pixel - projection_left) % (best_pitch +
01339         pitch_delta),
01340         projection->pile_count (pixel));
01341   }
01342   best_count = sum_proj[textord_pitch_range].pile_count (0);  //start from delta 0, column 0
01343   best_delta = 0;
01344   best_pixel = 0;
01345   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
01346   pitch_delta++) {
01347     for (pixel = 0; pixel < best_pitch + pitch_delta; pixel++) {
01348       if (sum_proj[textord_pitch_range + pitch_delta].pile_count (pixel)
01349       < best_count) {            //strictly lower folded count wins
01350         best_count =
01351           sum_proj[textord_pitch_range +
01352           pitch_delta].pile_count (pixel);
01353         best_delta = pitch_delta;
01354         best_pixel = pixel;
01355       }
01356     }
01357   }
01358   if (testing_on)
01359     tprintf ("tune_row_pitch:start pitch=%g, best_delta=%d, count=%d\n",
01360       initial_pitch, best_delta, best_count);
01361   best_pitch += best_delta;
01362   initial_pitch = best_pitch;    //hand the chosen pitch back through the reference parameter
01363   best_count++;
01364   best_count += best_count;      //threshold becomes 2 * (lowest count + 1)
01365   for (start = best_pixel - 2; start > best_pixel - best_pitch
01366     && sum_proj[textord_pitch_range +
01367     best_delta].pile_count (start % best_pitch) <= best_count;
01368     start--);                    //extend left while the folded count stays within the threshold
01369   for (end = best_pixel + 2;
01370     end < best_pixel + best_pitch
01371     && sum_proj[textord_pitch_range +
01372     best_delta].pile_count (end % best_pitch) <= best_count;
01373     end++);                      //extend right likewise
01374   //score the chosen pitch, passing the start..end range found above
01375   best_sd =
01376     compute_pitch_sd(row,
01377                      projection,
01378                      projection_left,
01379                      projection_right,
01380                      space_size,
01381                      initial_pitch,
01382                      best_sp_sd,
01383                      best_mid_cuts,
01384                      best_cells,
01385                      testing_on,
01386                      start,
01387                      end);
01388   if (testing_on)
01389     tprintf ("tune_row_pitch:output pitch=%g, sd=%g\n", initial_pitch,
01390       best_sd);
01391 
01392   if (textord_debug_pitch_metric)
01393     print_pitch_sd(row,
01394                    projection,
01395                    projection_left,
01396                    projection_right,
01397                    space_size,
01398                    initial_pitch);
01399 
01400   delete[]sum_proj;
01401 
01402   return best_sd;                //sd reported by compute_pitch_sd for the chosen pitch
01403 }


Generated on Wed Feb 28 19:49:26 2007 for Tesseract by  doxygen 1.5.1