00001
00018 #include "mfcpch.h"
00019 #include "tovars.h"
00020 #include "drawtord.h"
00021 #include "tospace.h"
00022 #include "ndminx.h"
00023 #include "statistc.h"
00024
00025 #define EXTERN
00026
00029 EXTERN BOOL_VAR (tosp_old_to_method, FALSE, "Space stats use prechopping?");
00030 EXTERN BOOL_VAR (tosp_only_use_prop_rows, TRUE,
00031 "Block stats to use fixed pitch rows?");
00032 EXTERN BOOL_VAR (tosp_use_pre_chopping, FALSE,
00033 "Space stats use prechopping?");
00034 EXTERN BOOL_VAR (tosp_old_to_bug_fix, FALSE, "Fix suspected bug in old code");
00035 EXTERN BOOL_VAR (tosp_block_use_cert_spaces, TRUE,
00036 "Only stat OBVIOUS spaces");
00037 EXTERN BOOL_VAR (tosp_row_use_cert_spaces, TRUE, "Only stat OBVIOUS spaces");
00038 EXTERN BOOL_VAR (tosp_narrow_blobs_not_cert, TRUE,
00039 "Only stat OBVIOUS spaces");
00040 EXTERN BOOL_VAR (tosp_row_use_cert_spaces1, TRUE, "Only stat OBVIOUS spaces");
00041 EXTERN BOOL_VAR (tosp_recovery_isolated_row_stats, TRUE,
00042 "Use row alone when inadequate cert spaces");
00043 EXTERN BOOL_VAR (tosp_only_small_gaps_for_kern, FALSE, "Better guess");
00044 EXTERN BOOL_VAR (tosp_all_flips_fuzzy, FALSE, "Pass ANY flip to context?");
00045 EXTERN BOOL_VAR (tosp_fuzzy_limit_all, TRUE,
00046 "Dont restrict kn->sp fuzzy limit to tables");
00047 EXTERN BOOL_VAR (tosp_stats_use_xht_gaps, TRUE,
00048 "Use within xht gap for wd breaks");
00049 EXTERN BOOL_VAR (tosp_use_xht_gaps, TRUE, "Use within xht gap for wd breaks");
00050 EXTERN BOOL_VAR (tosp_only_use_xht_gaps, FALSE,
00051 "Only use within xht gap for wd breaks");
00052 EXTERN BOOL_VAR (tosp_rule_9_test_punct, FALSE,
00053 "Dont chng kn to space next to punct");
00054 EXTERN BOOL_VAR (tosp_flip_fuzz_kn_to_sp, TRUE, "Default flip");
00055 EXTERN BOOL_VAR (tosp_flip_fuzz_sp_to_kn, TRUE, "Default flip");
00056 EXTERN BOOL_VAR (tosp_improve_thresh, FALSE, "Enable improvement heuristic");
00057 EXTERN INT_VAR (tosp_debug_level, 0, "Debug data");
00058 EXTERN INT_VAR (tosp_enough_space_samples_for_median, 3,
00059 "or should we use mean");
00060 EXTERN INT_VAR (tosp_redo_kern_limit, 10,
00061 "No.samples reqd to reestimate for row");
00062 EXTERN INT_VAR (tosp_few_samples, 40,
00063 "No.gaps reqd with 1 large gap to treat as a table");
00064 EXTERN INT_VAR (tosp_short_row, 20,
00065 "No.gaps reqd with few cert spaces to use certs");
00066 EXTERN INT_VAR (tosp_sanity_method, 1, "How to avoid being silly");
00067 EXTERN double_VAR (tosp_threshold_bias1, 0,
00068 "how far between kern and space?");
00069 EXTERN double_VAR (tosp_threshold_bias2, 0,
00070 "how far between kern and space?");
00071 EXTERN double_VAR (tosp_narrow_fraction, 0.3, "Fract of xheight for narrow");
00072 EXTERN double_VAR (tosp_narrow_aspect_ratio, 0.48,
00073 "narrow if w/h less than this");
00074 EXTERN double_VAR (tosp_wide_fraction, 0.52, "Fract of xheight for wide");
00075 EXTERN double_VAR (tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this");
00076 EXTERN double_VAR (tosp_fuzzy_space_factor, 0.6,
00077 "Fract of xheight for fuzz sp");
00078 EXTERN double_VAR (tosp_fuzzy_space_factor1, 0.5,
00079 "Fract of xheight for fuzz sp");
00080 EXTERN double_VAR (tosp_fuzzy_space_factor2, 0.72,
00081 "Fract of xheight for fuzz sp");
00082 EXTERN double_VAR (tosp_gap_factor, 0.83, "gap ratio to flip sp->kern");
00083 EXTERN double_VAR (tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp");
00084 EXTERN double_VAR (tosp_kern_gap_factor2, 1.3, "gap ratio to flip kern->sp");
00085 EXTERN double_VAR (tosp_kern_gap_factor3, 2.5, "gap ratio to flip kern->sp");
00086 EXTERN double_VAR (tosp_ignore_big_gaps, -1, "xht multiplier");
00087 EXTERN double_VAR (tosp_ignore_very_big_gaps, 3.5, "xht multiplier");
00088 EXTERN double_VAR (tosp_rep_space, 1.6, "rep gap multiplier for space");
00089 EXTERN double_VAR (tosp_enough_small_gaps, 0.65,
00090 "Fract of kerns reqd for isolated row stats");
00091 EXTERN double_VAR (tosp_table_kn_sp_ratio, 2.25,
00092 "Min difference of kn & sp in table");
00093 EXTERN double_VAR (tosp_table_xht_sp_ratio, 0.33,
00094 "Expect spaces bigger than this");
00095 EXTERN double_VAR (tosp_table_fuzzy_kn_sp_ratio, 3.0,
00096 "Fuzzy if less than this");
00097 EXTERN double_VAR (tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg");
00098 EXTERN double_VAR (tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg");
00099 EXTERN double_VAR (tosp_min_sane_kn_sp, 1.5,
00100 "Dont trust spaces less than this time kn");
00101 EXTERN double_VAR (tosp_init_guess_kn_mult, 2.2,
00102 "Thresh guess - mult kn by this");
00103 EXTERN double_VAR (tosp_init_guess_xht_mult, 0.28,
00104 "Thresh guess - mult xht by this");
00105 EXTERN double_VAR (tosp_max_sane_kn_thresh, 5.0,
00106 "Multiplier on kn to limit thresh");
00107 EXTERN double_VAR (tosp_flip_caution, 0.0,
00108 "Dont autoflip kn to sp when large separation");
00109
00110 EXTERN double_VAR (tosp_large_kerning, 0.19,
00111 "Limit use of xht gap with large kns");
00112 EXTERN double_VAR (tosp_dont_fool_with_small_kerns, -1,
00113 "Limit use of xht gap with odd small kns");
00114 EXTERN double_VAR (tosp_near_lh_edge, 0,
00115 "Dont reduce box if the top left is non blank");
00116 EXTERN double_VAR (tosp_silly_kn_sp_gap, 0.2,
00117 "Dont let sp minus kn get too small");
00118 EXTERN double_VAR (tosp_pass_wide_fuzz_sp_to_context, 0.75,
00119 "How wide fuzzies need context");
00123 #define MAXSPACING 128
00124
00138 void to_spacing(
00139 ICOORD page_tr,
00140 TO_BLOCK_LIST *blocks
00141 ) {
00142 TO_BLOCK_IT block_it;
00143 TO_BLOCK *block;
00144 TO_ROW_IT row_it;
00145 TO_ROW *row;
00146 int block_index;
00147 int row_index;
00148 INT16 block_space_gap_width;
00149
00150 INT16 block_non_space_gap_width;
00151
00152 BOOL8 old_text_ord_proportional;
00153 GAPMAP *gapmap = NULL;
00154
00155 block_it.set_to_list (blocks);
00156 block_index = 1;
00157 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00158 block_it.forward ()) {
00159 block = block_it.data ();
00160 gapmap = new GAPMAP (block);
00161 block_spacing_stats(block,
00162 gapmap,
00163 old_text_ord_proportional,
00164 block_space_gap_width,
00165 block_non_space_gap_width);
00166 row_it.set_to_list (block->get_rows ());
00167 row_index = 1;
00168 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00169 row = row_it.data ();
00170 if ((row->pitch_decision == PITCH_DEF_PROP) ||
00171 (row->pitch_decision == PITCH_CORR_PROP)) {
00172 if ((tosp_debug_level > 0) && !old_text_ord_proportional)
00173 tprintf ("Block %d Row %d: Now Proportional\n",
00174 block_index, row_index);
00175 row_spacing_stats(row,
00176 gapmap,
00177 block_index,
00178 row_index,
00179 block_space_gap_width,
00180 block_non_space_gap_width);
00181 }
00182 else {
00183 if ((tosp_debug_level > 0) && old_text_ord_proportional)
00184 tprintf
00185 ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
00186 block_index, row_index, row->pitch_decision,
00187 row->fixed_pitch);
00188 }
00189 #ifndef GRAPHICS_DISABLED
00190 if (textord_show_initial_words)
00191 plot_word_decisions (to_win, (INT16) row->fixed_pitch, row);
00192 #endif
00193 row_index++;
00194 }
00195 delete gapmap;
00196 block_index++;
00197 }
00198 }
00199
00200
00206 void block_spacing_stats(
00207 TO_BLOCK *block,
00208 GAPMAP *gapmap,
00209 BOOL8 &old_text_ord_proportional,
00210 INT16 &block_space_gap_width,
00211 INT16 &block_non_space_gap_width
00212 ) {
00213 TO_ROW_IT row_it;
00214 TO_ROW *row;
00215 BLOBNBOX_IT blob_it;
00216
00217 STATS centre_to_centre_stats (0, MAXSPACING);
00218
00219 STATS all_gap_stats (0, MAXSPACING);
00220 STATS space_gap_stats (0, MAXSPACING);
00221 INT16 minwidth = MAX_INT16;
00222 BOX blob_box;
00223 BOX prev_blob_box;
00224 INT16 centre_to_centre;
00225 INT16 gap_width;
00226 float real_space_threshold;
00227 float iqr_centre_to_centre;
00228 float iqr_all_gap_stats;
00229 INT32 end_of_row;
00230 INT32 row_length;
00231
00232 row_it.set_to_list (block->get_rows ());
00233 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00234 row = row_it.data ();
00235 if (!row->blob_list ()->empty () &&
00236 (!tosp_only_use_prop_rows ||
00237 (row->pitch_decision == PITCH_DEF_PROP) ||
00238 (row->pitch_decision == PITCH_CORR_PROP))) {
00239 blob_it.set_to_list (row->blob_list ());
00240 blob_it.mark_cycle_pt ();
00241 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00242 if (tosp_use_pre_chopping)
00243 blob_box = box_next_pre_chopped (&blob_it);
00244 else if (tosp_stats_use_xht_gaps)
00245 blob_box = reduced_box_next (row, &blob_it);
00246 else
00247 blob_box = box_next (&blob_it);
00248 row_length = end_of_row - blob_box.left ();
00249 if (blob_box.width () < minwidth)
00250 minwidth = blob_box.width ();
00251 prev_blob_box = blob_box;
00252 while (!blob_it.cycled_list ()) {
00253 if (tosp_use_pre_chopping)
00254 blob_box = box_next_pre_chopped (&blob_it);
00255 else if (tosp_stats_use_xht_gaps)
00256 blob_box = reduced_box_next (row, &blob_it);
00257 else
00258 blob_box = box_next (&blob_it);
00259 if (blob_box.width () < minwidth)
00260 minwidth = blob_box.width ();
00261 gap_width = blob_box.left () - prev_blob_box.right ();
00262 if (!ignore_big_gap (row, row_length, gapmap,
00263 prev_blob_box.right (), blob_box.left ())) {
00264 all_gap_stats.add (gap_width, 1);
00265
00266 centre_to_centre = (blob_box.left () + blob_box.right () -
00267 (prev_blob_box.left () +
00268 prev_blob_box.right ())) / 2;
00269
00270 centre_to_centre_stats.add (centre_to_centre, 1);
00271
00272 }
00273 prev_blob_box = blob_box;
00274 }
00275 }
00276 }
00277
00278
00279 if (all_gap_stats.get_total () <= 1) {
00280 block_non_space_gap_width = minwidth;
00281 block_space_gap_width = -1;
00282
00283 old_text_ord_proportional = TRUE;
00284 }
00285 else {
00286
00287 iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
00288 centre_to_centre_stats.ile (0.25);
00289 iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
00290 old_text_ord_proportional =
00291 iqr_centre_to_centre * 2 > iqr_all_gap_stats;
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303 block_non_space_gap_width = (INT16) floor (all_gap_stats.median ());
00304
00305
00306 row_it.set_to_list (block->get_rows ());
00307 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00308 row = row_it.data ();
00309 if (!row->blob_list ()->empty () &&
00310 (!tosp_only_use_prop_rows ||
00311 (row->pitch_decision == PITCH_DEF_PROP) ||
00312 (row->pitch_decision == PITCH_CORR_PROP))) {
00313 real_space_threshold =
00314 MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
00315 tosp_init_guess_xht_mult * row->xheight);
00316 blob_it.set_to_list (row->blob_list ());
00317 blob_it.mark_cycle_pt ();
00318 end_of_row =
00319 blob_it.data_relative (-1)->bounding_box ().right ();
00320 if (tosp_use_pre_chopping)
00321 blob_box = box_next_pre_chopped (&blob_it);
00322 else if (tosp_stats_use_xht_gaps)
00323 blob_box = reduced_box_next (row, &blob_it);
00324 else
00325 blob_box = box_next (&blob_it);
00326 row_length = blob_box.left () - end_of_row;
00327 prev_blob_box = blob_box;
00328 while (!blob_it.cycled_list ()) {
00329 if (tosp_use_pre_chopping)
00330 blob_box = box_next_pre_chopped (&blob_it);
00331 else if (tosp_stats_use_xht_gaps)
00332 blob_box = reduced_box_next (row, &blob_it);
00333 else
00334 blob_box = box_next (&blob_it);
00335 gap_width = blob_box.left () - prev_blob_box.right ();
00336 if ((gap_width > real_space_threshold) &&
00337 !ignore_big_gap (row, row_length, gapmap,
00338 prev_blob_box.right (),
00339 blob_box.left ())) {
00340
00341
00342
00343
00344
00345
00346 if (!tosp_block_use_cert_spaces ||
00347 (gap_width >
00348 tosp_fuzzy_space_factor2 * row->xheight)
00349 ||
00350 ((gap_width >
00351 tosp_fuzzy_space_factor1 * row->xheight)
00352 && (!tosp_narrow_blobs_not_cert
00353 || (!narrow_blob (row, prev_blob_box)
00354 && !narrow_blob (row, blob_box))))
00355 || (wide_blob (row, prev_blob_box)
00356 && wide_blob (row, blob_box)))
00357 space_gap_stats.add (gap_width, 1);
00358 }
00359 prev_blob_box = blob_box;
00360 }
00361 }
00362 }
00363
00364 if (space_gap_stats.get_total () <= 2)
00365 block_space_gap_width = -1;
00366 else
00367 block_space_gap_width =
00368 MAX ((INT16) floor (space_gap_stats.median ()),
00369 3 * block_non_space_gap_width);
00370 }
00371 }
00372
00373
00381 void row_spacing_stats(
00382 TO_ROW *row,
00383 GAPMAP *gapmap,
00384 INT16 block_idx,
00385 INT16 row_idx,
00386 INT16 block_space_gap_width,
00387 INT16 block_non_space_gap_width
00388 ) {
00389
00390 BLOBNBOX_IT blob_it = row->blob_list ();
00391 STATS all_gap_stats (0, MAXSPACING);
00392 STATS cert_space_gap_stats (0, MAXSPACING);
00393 STATS all_space_gap_stats (0, MAXSPACING);
00394 STATS small_gap_stats (0, MAXSPACING);
00395 BOX blob_box;
00396 BOX prev_blob_box;
00397 INT16 gap_width;
00398 INT16 real_space_threshold = 0;
00399 INT16 max = 0;
00400 INT16 index;
00401 INT16 large_gap_count = 0;
00402 BOOL8 suspected_table;
00403 INT32 max_max_nonspace;
00404 BOOL8 good_block_space_estimate = block_space_gap_width > 0;
00405 INT32 end_of_row;
00406 INT32 row_length = 0;
00407 float sane_space;
00408 INT32 sane_threshold;
00409
00410
00411
00412 if (!good_block_space_estimate)
00413 block_space_gap_width = INT16 (floor (row->xheight / 2));
00414 if (!row->blob_list ()->empty ()) {
00415 if (tosp_threshold_bias1 > 0)
00416 real_space_threshold =
00417 block_non_space_gap_width +
00418 INT16 (floor (0.5 +
00419 tosp_threshold_bias1 * (block_space_gap_width -
00420 block_non_space_gap_width)));
00421 else
00422 real_space_threshold =
00423 (block_space_gap_width + block_non_space_gap_width) / 2;
00424 blob_it.set_to_list (row->blob_list ());
00425 blob_it.mark_cycle_pt ();
00426 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00427 if (tosp_use_pre_chopping)
00428 blob_box = box_next_pre_chopped (&blob_it);
00429 else if (tosp_stats_use_xht_gaps)
00430 blob_box = reduced_box_next (row, &blob_it);
00431 else
00432 blob_box = box_next (&blob_it);
00433 row_length = end_of_row - blob_box.left ();
00434 prev_blob_box = blob_box;
00435 while (!blob_it.cycled_list ()) {
00436 if (tosp_use_pre_chopping)
00437 blob_box = box_next_pre_chopped (&blob_it);
00438 else if (tosp_stats_use_xht_gaps)
00439 blob_box = reduced_box_next (row, &blob_it);
00440 else
00441 blob_box = box_next (&blob_it);
00442 gap_width = blob_box.left () - prev_blob_box.right ();
00443 if (ignore_big_gap (row, row_length, gapmap,
00444 prev_blob_box.right (), blob_box.left ()))
00445 large_gap_count++;
00446 else {
00447 if (gap_width >= real_space_threshold) {
00448 if (!tosp_row_use_cert_spaces ||
00449 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00450 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
00451 && (!tosp_narrow_blobs_not_cert
00452 || (!narrow_blob (row, prev_blob_box)
00453 && !narrow_blob (row, blob_box))))
00454 || (wide_blob (row, prev_blob_box)
00455 && wide_blob (row, blob_box)))
00456 cert_space_gap_stats.add (gap_width, 1);
00457 all_space_gap_stats.add (gap_width, 1);
00458 }
00459 else
00460 small_gap_stats.add (gap_width, 1);
00461 all_gap_stats.add (gap_width, 1);
00462 }
00463 prev_blob_box = blob_box;
00464 }
00465 }
00466 suspected_table = (large_gap_count > 1) ||
00467 ((large_gap_count > 0) &&
00468 (all_gap_stats.get_total () <= tosp_few_samples));
00469
00470
00471
00472 if ((cert_space_gap_stats.get_total () >=
00473 tosp_enough_space_samples_for_median) ||
00474 ((suspected_table ||
00475 all_gap_stats.get_total () <= tosp_short_row) &&
00476 cert_space_gap_stats.get_total () > 0))
00477 old_to_method(row,
00478 &all_gap_stats,
00479 &cert_space_gap_stats,
00480 &small_gap_stats,
00481 block_space_gap_width,
00482 block_non_space_gap_width);
00483 else {
00484 if (!tosp_recovery_isolated_row_stats ||
00485 !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
00486 block_idx, row_idx)) {
00487 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5))
00488 tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
00489 block_idx, row_idx);
00490 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
00491
00492 row->space_size = block_space_gap_width;
00493 if (all_gap_stats.get_total () > tosp_redo_kern_limit)
00494 row->kern_size = all_gap_stats.median ();
00495 else
00496 row->kern_size = block_non_space_gap_width;
00497 row->space_threshold =
00498 INT32 (floor ((row->space_size + row->kern_size) / 2));
00499 }
00500 else
00501 old_to_method(row,
00502 &all_gap_stats,
00503 &all_space_gap_stats,
00504 &small_gap_stats,
00505 block_space_gap_width,
00506 block_non_space_gap_width);
00507 }
00508 }
00509
00510 if (tosp_improve_thresh && !suspected_table)
00511 improve_row_threshold(row, &all_gap_stats);
00512
00513
00514
00515 if (tosp_sanity_method == 0) {
00516 if (suspected_table &&
00517 (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
00518 if (tosp_debug_level > 0)
00519 tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
00520 block_idx, row_idx,
00521 row->kern_size, row->space_threshold, row->space_size);
00522 row->space_threshold =
00523 (INT32) (tosp_table_kn_sp_ratio * row->kern_size);
00524 row->space_size = MAX (row->space_threshold + 1, row->xheight);
00525 }
00526 }
00527 else if (tosp_sanity_method == 1) {
00528 sane_space = row->space_size;
00529
00530 if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
00531 || ((row->space_size - row->kern_size) <
00532 (tosp_silly_kn_sp_gap * row->xheight))) {
00533 if (good_block_space_estimate &&
00534 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
00535 sane_space = block_space_gap_width;
00536 else
00537 sane_space =
00538 MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
00539 row->xheight / 2);
00540 if (tosp_debug_level > 0)
00541 tprintf
00542 ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
00543 block_idx, row_idx, row->kern_size, row->space_threshold,
00544 row->space_size, sane_space);
00545 row->space_size = sane_space;
00546 row->space_threshold =
00547 INT32 (floor ((row->space_size + row->kern_size) / 2));
00548 }
00549
00550 sane_threshold = INT32 (floor (tosp_max_sane_kn_thresh *
00551 MAX (row->kern_size, 2.5)));
00552 if (row->space_threshold > sane_threshold) {
00553 if (tosp_debug_level > 0)
00554 tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
00555 block_idx, row_idx,
00556 row->kern_size,
00557 row->space_threshold, row->space_size, sane_threshold);
00558 row->space_threshold = sane_threshold;
00559 if (row->space_size <= sane_threshold)
00560 row->space_size = row->space_threshold + 1.0f;
00561 }
00562
00563 if (suspected_table) {
00564 sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
00565 tosp_table_xht_sp_ratio * row->xheight);
00566 sane_threshold = INT32 (floor ((sane_space + row->kern_size) / 2));
00567
00568 if ((row->space_size < sane_space) ||
00569 (row->space_threshold < sane_threshold)) {
00570 if (tosp_debug_level > 0)
00571 tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
00572 block_idx, row_idx,
00573 row->kern_size,
00574 row->space_threshold, row->space_size);
00575
00576 row->space_threshold = (INT32) sane_space;
00577 row->space_size = MAX (row->space_threshold + 1, row->xheight);
00578 }
00579 }
00580 }
00581
00582
00583
00584 if (tosp_old_to_method) {
00585
00586
00587 row->max_nonspace = row->space_threshold;
00588
00589 row->min_space = row->space_threshold + 1;
00590 }
00591 else {
00592
00593 row->min_space =
00594 MIN (INT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
00595 INT32 (row->space_size));
00596 if (row->min_space <= row->space_threshold)
00597
00598 row->min_space = row->space_threshold + 1;
00599
00600
00601
00602
00603
00604
00605
00606
00607
00608
00609
00610
00611
00612
00613
00614 max_max_nonspace = INT32 ((row->space_threshold + row->kern_size) / 2);
00615
00616
00617 row->max_nonspace = max_max_nonspace;
00618 for (index = 0; index <= max_max_nonspace; index++) {
00619 if (all_gap_stats.pile_count (index) > max)
00620 max = all_gap_stats.pile_count (index);
00621 if ((index > row->kern_size) &&
00622 (all_gap_stats.pile_count (index) < 0.1 * max)) {
00623 row->max_nonspace = index;
00624 break;
00625 }
00626 }
00627 }
00628
00629
00630
00631
00632 if ((tosp_fuzzy_sp_fraction > 0) &&
00633 (row->space_size > row->space_threshold))
00634 row->min_space = MAX (row->min_space,
00635 (INT32) ceil (row->space_threshold +
00636 tosp_fuzzy_sp_fraction *
00637 (row->space_size -
00638 row->space_threshold)));
00639
00640
00641
00642
00643
00644
00645
00646
00647
00648
00649
00650
00651 if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
00652 (suspected_table || tosp_fuzzy_limit_all))
00653 row->min_space = MAX (row->min_space,
00654 (INT32) ceil (tosp_table_fuzzy_kn_sp_ratio *
00655 row->kern_size));
00656
00657 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold))
00658 row->max_nonspace = (INT32) floor (0.5 + row->kern_size +
00659 tosp_fuzzy_kn_fraction *
00660 (row->space_threshold -
00661 row->kern_size));
00662
00663 if (row->max_nonspace > row->space_threshold)
00664
00665 row->max_nonspace = row->space_threshold;
00666
00667 if (tosp_debug_level > 5)
00668 tprintf
00669 ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
00670 block_idx, row_idx, row_length, block_non_space_gap_width,
00671 block_space_gap_width, real_space_threshold, row->kern_size,
00672 row->max_nonspace, row->space_threshold, row->min_space,
00673 row->space_size);
00674 }
00675
00676
00680 void old_to_method(
00681 TO_ROW *row,
00682 STATS *all_gap_stats,
00683 STATS *space_gap_stats,
00684 STATS *small_gap_stats,
00685 INT16 block_space_gap_width,
00686 INT16 block_non_space_gap_width
00687 ) {
00688
00689 if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
00690
00691
00692 row->space_size = space_gap_stats->median ();
00693 if (row->space_size > block_space_gap_width * 1.5) {
00694 if (tosp_old_to_bug_fix)
00695 row->space_size = block_space_gap_width * 1.5;
00696 else
00697
00698 row->space_size = block_space_gap_width;
00699 }
00700 if (row->space_size < (block_non_space_gap_width * 2) + 1)
00701 row->space_size = (block_non_space_gap_width * 2) + 1;
00702 }
00703
00704 else if (space_gap_stats->get_total () >= 1) {
00705
00706 row->space_size = space_gap_stats->mean ();
00707 if (row->space_size > block_space_gap_width * 1.5) {
00708 if (tosp_old_to_bug_fix)
00709 row->space_size = block_space_gap_width * 1.5;
00710 else
00711
00712 row->space_size = block_space_gap_width;
00713 }
00714 if (row->space_size < (block_non_space_gap_width * 3) + 1)
00715 row->space_size = (block_non_space_gap_width * 3) + 1;
00716 }
00717 else
00718
00719 row->space_size = block_space_gap_width;
00720
00721 if ((tosp_only_small_gaps_for_kern) &&
00722 (small_gap_stats->get_total () > tosp_redo_kern_limit))
00723 row->kern_size = small_gap_stats->median ();
00724 else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
00725 row->kern_size = all_gap_stats->median ();
00726 else
00727
00728 row->kern_size = block_non_space_gap_width;
00729
00730 if (tosp_threshold_bias2 > 0)
00731 row->space_threshold =
00732 INT32 (floor (0.5 + row->kern_size +
00733 tosp_threshold_bias2 * (row->space_size -
00734 row->kern_size)));
00735 else
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746 row->space_threshold =
00747 INT32 (floor ((row->space_size + row->kern_size) / 2));
00748 }
00749
00750
00754 BOOL8 isolated_row_stats(TO_ROW *row,
00755 GAPMAP *gapmap,
00756 STATS *all_gap_stats,
00757 BOOL8 suspected_table,
00758 INT16 block_idx,
00759 INT16 row_idx) {
00760 float kern_estimate;
00761 float crude_threshold_estimate;
00762 INT16 small_gaps_count;
00763 INT16 total;
00764
00765 BLOBNBOX_IT blob_it = row->blob_list ();
00766 STATS cert_space_gap_stats (0, MAXSPACING);
00767 STATS all_space_gap_stats (0, MAXSPACING);
00768 STATS small_gap_stats (0, MAXSPACING);
00769 BOX blob_box;
00770 BOX prev_blob_box;
00771 INT16 gap_width;
00772 INT32 end_of_row;
00773 INT32 row_length;
00774
00775 kern_estimate = all_gap_stats->median ();
00776 crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
00777 tosp_init_guess_xht_mult * row->xheight);
00778 small_gaps_count = stats_count_under (all_gap_stats,
00779 (INT16)
00780 ceil (crude_threshold_estimate));
00781 total = all_gap_stats->get_total ();
00782
00783 if ((total <= tosp_redo_kern_limit) ||
00784 ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
00785 (total - small_gaps_count < 1)) {
00786 if (tosp_debug_level > 5)
00787 tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
00788 block_idx, row_idx);
00789 return FALSE;
00790 }
00791 blob_it.set_to_list (row->blob_list ());
00792 blob_it.mark_cycle_pt ();
00793 end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
00794 if (tosp_use_pre_chopping)
00795 blob_box = box_next_pre_chopped (&blob_it);
00796 else if (tosp_stats_use_xht_gaps)
00797 blob_box = reduced_box_next (row, &blob_it);
00798 else
00799 blob_box = box_next (&blob_it);
00800 row_length = end_of_row - blob_box.left ();
00801 prev_blob_box = blob_box;
00802 while (!blob_it.cycled_list ()) {
00803 if (tosp_use_pre_chopping)
00804 blob_box = box_next_pre_chopped (&blob_it);
00805 else if (tosp_stats_use_xht_gaps)
00806 blob_box = reduced_box_next (row, &blob_it);
00807 else
00808 blob_box = box_next (&blob_it);
00809 gap_width = blob_box.left () - prev_blob_box.right ();
00810 if (!ignore_big_gap (row, row_length, gapmap,
00811 prev_blob_box.right (), blob_box.left ()) &&
00812 (gap_width > crude_threshold_estimate)) {
00813 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
00814 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
00815 (!tosp_narrow_blobs_not_cert ||
00816 (!narrow_blob (row, prev_blob_box) &&
00817 !narrow_blob (row, blob_box)))) ||
00818 (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
00819 cert_space_gap_stats.add (gap_width, 1);
00820 all_space_gap_stats.add (gap_width, 1);
00821 }
00822 if (gap_width < crude_threshold_estimate)
00823 small_gap_stats.add (gap_width, 1);
00824
00825 prev_blob_box = blob_box;
00826 }
00827 if (cert_space_gap_stats.get_total () >=
00828 tosp_enough_space_samples_for_median)
00829
00830 row->space_size = cert_space_gap_stats.median ();
00831 else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
00832
00833 row->space_size = cert_space_gap_stats.mean ();
00834
00835 else if (all_space_gap_stats.get_total () >=
00836 tosp_enough_space_samples_for_median)
00837
00838 row->space_size = all_space_gap_stats.median ();
00839 else
00840 row->space_size = all_space_gap_stats.mean ();
00841
00842 if (tosp_only_small_gaps_for_kern)
00843 row->kern_size = small_gap_stats.median ();
00844 else
00845 row->kern_size = all_gap_stats->median ();
00846 row->space_threshold =
00847 INT32 (floor ((row->space_size + row->kern_size) / 2));
00848
00849 if ((row->kern_size >= row->space_threshold) ||
00850 (row->space_threshold >= row->space_size) ||
00851 (row->space_threshold <= 0)) {
00852 if (tosp_debug_level > 0)
00853 tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
00854 block_idx, row_idx,
00855 row->kern_size, row->space_threshold, row->space_size);
00856 row->kern_size = 0.0f;
00857 row->space_threshold = 0;
00858 row->space_size = 0.0f;
00859 return FALSE;
00860 }
00861
00862 if (tosp_debug_level > 5)
00863 tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
00864 block_idx, row_idx,
00865 row->kern_size, row->space_threshold, row->space_size);
00866 return TRUE;
00867 }
00868
00869
00873 INT16 stats_count_under(STATS *stats, INT16 threshold) {
00874 INT16 index;
00875 INT16 total = 0;
00876
00877 for (index = 0; index < threshold; index++)
00878 total += stats->pile_count (index);
00879 return total;
00880 }
00881
00882
00905 void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
00906 float sp = row->space_size;
00907 float kn = row->kern_size;
00908 INT16 reqd_zero_width = 0;
00909 INT16 zero_width = 0;
00910 INT16 zero_start = 0;
00911 INT16 index = 0;
00912
00913 if (tosp_debug_level > 10)
00914 tprintf ("Improve row threshold 0");
00915 if ((all_gap_stats->get_total () <= 25) ||
00916 (sp <= 10) ||
00917 (sp <= 3 * kn) ||
00918 (stats_count_under (all_gap_stats,
00919 (INT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
00920 (0.75 * all_gap_stats->get_total ())))
00921 return;
00922 if (tosp_debug_level > 10)
00923 tprintf (" 1");
00924
00925
00926
00927
00928
00929 reqd_zero_width = (INT16) floor ((sp - kn) / 3 + 0.5);
00930 if (reqd_zero_width < 3)
00931 reqd_zero_width = 3;
00932
00933 for (index = INT16 (ceil (kn)); index < INT16 (floor (sp)); index++) {
00934 if (all_gap_stats->pile_count (index) == 0) {
00935 if (zero_width == 0)
00936 zero_start = index;
00937 zero_width++;
00938 }
00939 else {
00940 if (zero_width >= reqd_zero_width)
00941 break;
00942 else {
00943 zero_width = 0;
00944 }
00945 }
00946 }
00947 index--;
00948 if (tosp_debug_level > 10)
00949 tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
00950 reqd_zero_width, zero_width, zero_start, row->space_threshold);
00951 if ((zero_width < reqd_zero_width) ||
00952 ((row->space_threshold >= zero_start) &&
00953 (row->space_threshold <= index)))
00954 return;
00955 if (tosp_debug_level > 10)
00956 tprintf (" 2");
00957 if (row->space_threshold < zero_start) {
00958 if (tosp_debug_level > 5)
00959 tprintf
00960 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
00961 kn, sp, zero_start, index, row->space_threshold, zero_start);
00962 row->space_threshold = zero_start;
00963 }
00964 if (row->space_threshold > index) {
00965 if (tosp_debug_level > 5)
00966 tprintf
00967 ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
00968 kn, sp, zero_start, index, row->space_threshold, index);
00969 row->space_threshold = index;
00970 }
00971 }
00972
00973
00983 ROW *make_prop_words(
00984 TO_ROW *row,
00985 FCOORD rotation
00986 ) {
00987 BOOL8 bol;
00993 BOOL8 prev_fuzzy_sp;
00994 BOOL8 prev_fuzzy_non;
00995 UINT8 prev_blanks;
00996 BOOL8 fuzzy_sp;
00997 BOOL8 fuzzy_non;
00998 UINT8 blanks;
00999 ROW *real_row;
01000 OUTLINE_IT out_it;
01001 C_OUTLINE_IT cout_it;
01002 PBLOB_LIST blobs;
01003 C_BLOB_LIST cblobs;
01004 PBLOB_IT blob_it = &blobs;
01005 C_BLOB_IT cblob_it = &cblobs;
01006 WERD_LIST words;
01007 WERD_IT word_it;
01008 WERD *word;
01009 WERD_IT rep_char_it;
01010 INT32 next_rep_char_word_right = MAX_INT32;
01011 float repetition_spacing;
01012 INT32 xstarts[2];
01013 double coeffs[3];
01014 INT32 prev_x;
01015 BLOBNBOX *bblob;
01016 BOX blob_box;
01017 BLOBNBOX_IT box_it;
01018 BOX prev_blob_box;
01019 BOX next_blob_box;
01020 INT16 prev_gap = MAX_INT16;
01021 INT16 current_gap = MAX_INT16;
01022 INT16 next_gap = MAX_INT16;
01023 INT16 prev_within_xht_gap = MAX_INT16;
01024 INT16 current_within_xht_gap = MAX_INT16;
01025 INT16 next_within_xht_gap = MAX_INT16;
01026 INT16 word_count = 0;
01027 static INT16 row_count = 0;
01028
01029 row_count++;
01030 rep_char_it.set_to_list (&(row->rep_words));
01031 if (!rep_char_it.empty ()) {
01032 next_rep_char_word_right =
01033 rep_char_it.data ()->bounding_box ().right ();
01034 }
01035
01036 prev_x = -MAX_INT16;
01037 blob_it.set_to_list (&blobs);
01038 cblob_it.set_to_list (&cblobs);
01039 box_it.set_to_list (row->blob_list ());
01040 word_it.set_to_list (&words);
01041 bol = TRUE;
01042 prev_blanks = 0;
01043 prev_fuzzy_sp = FALSE;
01044 prev_fuzzy_non = FALSE;
01045 if (!box_it.empty ()) {
01046 xstarts[0] = box_it.data ()->bounding_box ().left ();
01047 if (xstarts[0] > next_rep_char_word_right) {
01048
01049 word = rep_char_it.extract ();
01050 word_it.add_after_then_move (word);
01051
01052 word->set_flag (W_BOL, TRUE);
01053 bol = FALSE;
01054 word->set_blanks (0);
01055
01056 word->set_flag (W_FUZZY_SP, FALSE);
01057 word->set_flag (W_FUZZY_NON, FALSE);
01058 xstarts[0] = word->bounding_box ().left ();
01059
01060 repetition_spacing = find_mean_blob_spacing (word);
01061 current_gap = box_it.data ()->bounding_box ().left () -
01062 next_rep_char_word_right;
01063 current_within_xht_gap = current_gap;
01064 if (current_gap > tosp_rep_space * repetition_spacing) {
01065 prev_blanks = (UINT8) floor (current_gap / row->space_size);
01066 if (prev_blanks < 1)
01067 prev_blanks = 1;
01068 }
01069 else
01070 prev_blanks = 0;
01071 if (tosp_debug_level > 5)
01072 tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
01073 box_it.data ()->bounding_box ().left (),
01074 box_it.data ()->bounding_box ().bottom (),
01075 repetition_spacing, current_gap);
01076 prev_fuzzy_sp = FALSE;
01077 prev_fuzzy_non = FALSE;
01078 if (rep_char_it.empty ()) {
01079 next_rep_char_word_right = MAX_INT32;
01080 }
01081 else {
01082 rep_char_it.forward ();
01083 next_rep_char_word_right =
01084 rep_char_it.data ()->bounding_box ().right ();
01085 }
01086 }
01087
01088 peek_at_next_gap(row,
01089 box_it,
01090 next_blob_box,
01091 next_gap,
01092 next_within_xht_gap);
01093 do {
01094 bblob = box_it.data ();
01095 blob_box = bblob->bounding_box ();
01096 if (bblob->joined_to_prev ()) {
01097 if (bblob->blob () != NULL) {
01098 out_it.set_to_list (blob_it.data ()->out_list ());
01099 out_it.move_to_last ();
01100 out_it.add_list_after (bblob->blob ()->out_list ());
01101 delete bblob->blob ();
01102 }
01103 else if (bblob->cblob () != NULL) {
01104 cout_it.set_to_list (cblob_it.data ()->out_list ());
01105 cout_it.move_to_last ();
01106 cout_it.add_list_after (bblob->cblob ()->out_list ());
01107 delete bblob->cblob ();
01108 }
01109 }
01110 else {
01111 if (bblob->blob () != NULL)
01112 blob_it.add_after_then_move (bblob->blob ());
01113 else if (bblob->cblob () != NULL)
01114 cblob_it.add_after_then_move (bblob->cblob ());
01115 prev_x = blob_box.right ();
01116 }
01117 box_it.forward ();
01118 bblob = box_it.data ();
01119 blob_box = bblob->bounding_box ();
01120
01121 if (!bblob->joined_to_prev () &&
01122 (bblob->blob () != NULL || bblob->cblob () != NULL)) {
01123
01124 prev_gap = current_gap;
01125 prev_within_xht_gap = current_within_xht_gap;
01126 prev_blob_box = next_blob_box;
01127 current_gap = next_gap;
01128 current_within_xht_gap = next_within_xht_gap;
01129 peek_at_next_gap(row,
01130 box_it,
01131 next_blob_box,
01132 next_gap,
01133 next_within_xht_gap);
01134
01135 if ((blob_box.left () > next_rep_char_word_right) ||
01136 (!tosp_only_use_xht_gaps &&
01137 make_a_word_break (row, blob_box, prev_gap, prev_blob_box,
01138 current_gap, current_within_xht_gap,
01139 next_blob_box, next_gap,
01140 blanks, fuzzy_sp, fuzzy_non)) ||
01141 (tosp_only_use_xht_gaps &&
01142 make_a_word_break (row, blob_box, prev_within_xht_gap,
01143 prev_blob_box,
01144 current_gap, current_within_xht_gap,
01145 next_blob_box, next_within_xht_gap,
01146 blanks, fuzzy_sp, fuzzy_non)) ||
01147 box_it.at_first ()) {
01148
01149 if (!blob_it.empty ()) {
01150 word = new WERD (&blobs, prev_blanks, NULL);
01151
01152 word_count++;
01153 }
01154 else {
01155 word = new WERD (&cblobs, prev_blanks, NULL);
01156 word_count++;
01157 }
01158 word_it.add_after_then_move (word);
01159 if (bol) {
01160 word->set_flag (W_BOL, TRUE);
01161 bol = FALSE;
01162 }
01163 if (prev_fuzzy_sp)
01164
01165 word->set_flag (W_FUZZY_SP, TRUE);
01166 else if (prev_fuzzy_non)
01167 word->set_flag (W_FUZZY_NON, TRUE);
01168
01169
01170 if (blob_box.left () > next_rep_char_word_right) {
01171
01172 word = rep_char_it.extract ();
01173 word_it.add_after_then_move (word);
01174
01175
01176 repetition_spacing = find_mean_blob_spacing (word);
01177 current_gap = word->bounding_box ().left () - prev_x;
01178 current_within_xht_gap = current_gap;
01179 if (current_gap > tosp_rep_space * repetition_spacing) {
01180 blanks =
01181 (UINT8) floor (current_gap / row->space_size);
01182 if (blanks < 1)
01183 blanks = 1;
01184 }
01185 else
01186 blanks = 0;
01187 if (tosp_debug_level > 5)
01188 tprintf
01189 ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
01190 word->bounding_box ().left (),
01191 word->bounding_box ().bottom (),
01192 repetition_spacing, current_gap, blanks);
01193 word->set_blanks (blanks);
01194
01195 word->set_flag (W_FUZZY_SP, FALSE);
01196 word->set_flag (W_FUZZY_NON, FALSE);
01197
01198
01199 current_gap =
01200 blob_box.left () - next_rep_char_word_right;
01201 if (current_gap > tosp_rep_space * repetition_spacing) {
01202 blanks = (UINT8) (current_gap / row->space_size);
01203 if (blanks < 1)
01204 blanks = 1;
01205 }
01206 else
01207 blanks = 0;
01208 if (tosp_debug_level > 5)
01209 tprintf (" Rgap:%d (%d blanks)\n",
01210 current_gap, blanks);
01211 fuzzy_sp = FALSE;
01212 fuzzy_non = FALSE;
01213
01214 if (rep_char_it.empty ()) {
01215 next_rep_char_word_right = MAX_INT32;
01216 }
01217 else {
01218 rep_char_it.forward ();
01219 next_rep_char_word_right =
01220 rep_char_it.data ()->bounding_box ().right ();
01221 }
01222 }
01223
01224 if (box_it.at_first () && rep_char_it.empty ()) {
01225
01226 word->set_flag (W_EOL, TRUE);
01227 xstarts[1] = prev_x;
01228 }
01229 else {
01230 prev_blanks = blanks;
01231 prev_fuzzy_sp = fuzzy_sp;
01232 prev_fuzzy_non = fuzzy_non;
01233 }
01234 }
01235 }
01236 }
01237 while (!box_it.at_first ());
01238
01239
01240 while (!rep_char_it.empty ()) {
01241 word = rep_char_it.extract ();
01242 word_it.add_after_then_move (word);
01243
01244
01245 repetition_spacing = find_mean_blob_spacing (word);
01246 current_gap = word->bounding_box ().left () - prev_x;
01247 if (current_gap > tosp_rep_space * repetition_spacing) {
01248 blanks = (UINT8) floor (current_gap / row->space_size);
01249 if (blanks < 1)
01250 blanks = 1;
01251 }
01252 else
01253 blanks = 0;
01254 if (tosp_debug_level > 5)
01255 tprintf
01256 ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
01257 word->bounding_box ().left (), word->bounding_box ().bottom (),
01258 repetition_spacing, current_gap, blanks);
01259 word->set_blanks (blanks);
01260
01261 word->set_flag (W_FUZZY_SP, FALSE);
01262 word->set_flag (W_FUZZY_NON, FALSE);
01263 prev_x = word->bounding_box ().right ();
01264 if (rep_char_it.empty ()) {
01265
01266 word->set_flag (W_EOL, TRUE);
01267 xstarts[1] = prev_x;
01268 }
01269 else {
01270 rep_char_it.forward ();
01271 }
01272 }
01273 coeffs[0] = 0;
01274 coeffs[1] = row->line_m ();
01275 coeffs[2] = row->line_c ();
01276 real_row = new ROW (row,
01277 (INT16) row->kern_size, (INT16) row->space_size);
01278 word_it.set_to_list (real_row->word_list ());
01279
01280 word_it.add_list_after (&words);
01281 real_row->recalc_bounding_box ();
01282 if (tosp_debug_level > 9) {
01283 tprintf ("Row %d Made %d words in row ((%d,%d)(%d,%d))\n",
01284 row_count,
01285 word_count,
01286 real_row->bounding_box ().left (),
01287 real_row->bounding_box ().bottom (),
01288 real_row->bounding_box ().right (),
01289 real_row->bounding_box ().top ());
01290 }
01291 return real_row;
01292 }
01293 return NULL;
01294 }
01295
01296
01319 BOOL8 make_a_word_break(
01320 TO_ROW *row,
01321 BOX blob_box,
01322 INT16 prev_gap,
01323 BOX prev_blob_box,
01324 INT16 real_current_gap,
01325 INT16 within_xht_current_gap,
01326 BOX next_blob_box,
01327 INT16 next_gap,
01328 UINT8 &blanks,
01329 BOOL8 &fuzzy_sp,
01330 BOOL8 &fuzzy_non) {
01331 static BOOL8 prev_gap_was_a_space;
01332 BOOL8 space;
01333 INT16 current_gap;
01334 float fuzzy_sp_to_kn_limit;
01335
01336
01337
01338
01339 if ((row->kern_size > tosp_large_kerning * row->xheight) ||
01340 ((tosp_dont_fool_with_small_kerns >= 0) &&
01341 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
01342
01343 within_xht_current_gap = real_current_gap;
01344
01345 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps)
01346 current_gap = within_xht_current_gap;
01347 else
01348 current_gap = real_current_gap;
01349
01350 if (tosp_old_to_method) {
01351
01352 space = current_gap > row->max_nonspace;
01353 if (space && (current_gap < MAX_INT16)) {
01354 if (current_gap < row->min_space) {
01355 if (current_gap > row->space_threshold) {
01356 blanks = 1;
01357 fuzzy_sp = TRUE;
01358 fuzzy_non = FALSE;
01359 }
01360 else {
01361 blanks = 0;
01362 fuzzy_sp = FALSE;
01363 fuzzy_non = TRUE;
01364 }
01365 }
01366 else {
01367 blanks = (UINT8) (current_gap / row->space_size);
01368 if (blanks < 1)
01369 blanks = 1;
01370 fuzzy_sp = FALSE;
01371 fuzzy_non = FALSE;
01372 }
01373 }
01374 return space;
01375 }
01376 else {
01377
01378 if (prev_blob_box.null_box ())
01379
01380 prev_gap_was_a_space = TRUE;
01381
01382
01383 space = current_gap > row->space_threshold;
01384
01385
01386
01387
01388
01389
01390 blanks = (UINT8) (current_gap / row->space_size);
01391 if (blanks < 1)
01392 blanks = 1;
01393 fuzzy_sp = FALSE;
01394 fuzzy_non = FALSE;
01395
01396
01397
01398
01399
01400 if (tosp_use_xht_gaps &&
01401 (real_current_gap <= row->max_nonspace) &&
01402 (within_xht_current_gap > row->max_nonspace)) {
01403 space = TRUE;
01404 fuzzy_non = TRUE;
01405 #ifndef GRAPHICS_DISABLED
01406 mark_gap (blob_box, 20,
01407 prev_gap, prev_blob_box.width (),
01408 current_gap, next_blob_box.width (), next_gap);
01409 #endif
01410 }
01411 else if (tosp_use_xht_gaps &&
01412 (real_current_gap <= row->space_threshold) &&
01413 (within_xht_current_gap > row->space_threshold)) {
01414 space = TRUE;
01415 if (tosp_flip_fuzz_kn_to_sp)
01416 fuzzy_sp = TRUE;
01417 else
01418 fuzzy_non = TRUE;
01419 #ifndef GRAPHICS_DISABLED
01420 mark_gap (blob_box, 21,
01421 prev_gap, prev_blob_box.width (),
01422 current_gap, next_blob_box.width (), next_gap);
01423 #endif
01424 }
01425 else if (tosp_use_xht_gaps &&
01426 (real_current_gap < row->min_space) &&
01427 (within_xht_current_gap >= row->min_space)) {
01428 space = TRUE;
01429 #ifndef GRAPHICS_DISABLED
01430 mark_gap (blob_box, 22,
01431 prev_gap, prev_blob_box.width (),
01432 current_gap, next_blob_box.width (), next_gap);
01433 #endif
01434 }
01435
01436 else if ((current_gap < row->min_space) &&
01437 (current_gap > row->space_threshold)) {
01438
01439 if (tosp_pass_wide_fuzz_sp_to_context > 0)
01440 fuzzy_sp_to_kn_limit = row->kern_size +
01441 tosp_pass_wide_fuzz_sp_to_context *
01442 (row->space_size - row->kern_size);
01443 else
01444 fuzzy_sp_to_kn_limit = 99999.0f;
01445
01446
01447
01448 if ((prev_blob_box.width () > 0) &&
01449 narrow_blob (row, prev_blob_box) &&
01450 prev_gap_was_a_space &&
01451 (current_gap <= tosp_gap_factor * prev_gap)) {
01452 if ((tosp_all_flips_fuzzy) ||
01453 (current_gap > fuzzy_sp_to_kn_limit)) {
01454 if (tosp_flip_fuzz_sp_to_kn)
01455 fuzzy_non = TRUE;
01456 else
01457 fuzzy_sp = TRUE;
01458 }
01459 else
01460 space = FALSE;
01461 #ifndef GRAPHICS_DISABLED
01462 mark_gap (blob_box, 1,
01463 prev_gap, prev_blob_box.width (),
01464 current_gap, next_blob_box.width (), next_gap);
01465 #endif
01466 }
01467
01468
01469 else if ((prev_blob_box.width () > 0) &&
01470 narrow_blob (row, prev_blob_box) &&
01471 !prev_gap_was_a_space &&
01472 (current_gap * tosp_gap_factor <= prev_gap)) {
01473 if ((tosp_all_flips_fuzzy) ||
01474 (current_gap > fuzzy_sp_to_kn_limit)) {
01475 if (tosp_flip_fuzz_sp_to_kn)
01476 fuzzy_non = TRUE;
01477 else
01478 fuzzy_sp = TRUE;
01479 }
01480 else
01481 space = FALSE;
01482 #ifndef GRAPHICS_DISABLED
01483 mark_gap (blob_box, 2,
01484 prev_gap, prev_blob_box.width (),
01485 current_gap, next_blob_box.width (), next_gap);
01486 #endif
01487 }
01488 else if ((next_blob_box.width () > 0) &&
01489 narrow_blob (row, next_blob_box) &&
01490 (next_gap > row->space_threshold) &&
01491 (current_gap <= tosp_gap_factor * next_gap)) {
01492 if ((tosp_all_flips_fuzzy) ||
01493 (current_gap > fuzzy_sp_to_kn_limit)) {
01494 if (tosp_flip_fuzz_sp_to_kn)
01495 fuzzy_non = TRUE;
01496 else
01497 fuzzy_sp = TRUE;
01498 }
01499 else
01500 space = FALSE;
01501 #ifndef GRAPHICS_DISABLED
01502 mark_gap (blob_box, 3,
01503 prev_gap, prev_blob_box.width (),
01504 current_gap, next_blob_box.width (), next_gap);
01505 #endif
01506 }
01507 else if ((next_blob_box.width () > 0) &&
01508 narrow_blob (row, next_blob_box) &&
01509 (next_gap <= row->space_threshold) &&
01510 (current_gap * tosp_gap_factor <= next_gap)) {
01511 if ((tosp_all_flips_fuzzy) ||
01512 (current_gap > fuzzy_sp_to_kn_limit)) {
01513 if (tosp_flip_fuzz_sp_to_kn)
01514 fuzzy_non = TRUE;
01515 else
01516 fuzzy_sp = TRUE;
01517 }
01518 else
01519 space = FALSE;
01520 #ifndef GRAPHICS_DISABLED
01521 mark_gap (blob_box, 4,
01522 prev_gap, prev_blob_box.width (),
01523 current_gap, next_blob_box.width (), next_gap);
01524 #endif
01525 }
01526 else if ((((next_blob_box.width () > 0) &&
01527 narrow_blob (row, next_blob_box)) ||
01528 ((prev_blob_box.width () > 0) &&
01529 narrow_blob (row, prev_blob_box)))) {
01530 fuzzy_sp = TRUE;
01531 #ifndef GRAPHICS_DISABLED
01532 mark_gap (blob_box, 6,
01533 prev_gap, prev_blob_box.width (),
01534 current_gap, next_blob_box.width (), next_gap);
01535 #endif
01536 }
01537 }
01538 else if ((current_gap > row->max_nonspace) &&
01539 (current_gap <= row->space_threshold)) {
01540
01541
01542
01543
01544
01545
01546
01547
01548 if ((prev_blob_box.width () > 0) &&
01549 (next_blob_box.width () > 0) &&
01550 (current_gap >=
01551 tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
01552 wide_blob (row, prev_blob_box) &&
01553 wide_blob (row, next_blob_box)) {
01554
01555 space = TRUE;
01556
01557
01558
01559
01560
01561 if ((tosp_flip_fuzz_kn_to_sp) &&
01562 ((tosp_flip_caution <= 0) ||
01563 (tosp_flip_caution * row->kern_size > row->space_size)))
01564 fuzzy_sp = TRUE;
01565 else
01566 fuzzy_non = TRUE;
01567 #ifndef GRAPHICS_DISABLED
01568 mark_gap (blob_box, 7,
01569 prev_gap, prev_blob_box.width (),
01570 current_gap, next_blob_box.width (), next_gap);
01571 #endif
01572 }
01573 else if ((prev_blob_box.width () > 0) &&
01574 (next_blob_box.width () > 0) &&
01575 (current_gap >=
01576 tosp_kern_gap_factor2 * MAX (prev_gap, next_gap)) &&
01577 !(narrow_blob (row, prev_blob_box) ||
01578 suspected_punct_blob (row, prev_blob_box)) &&
01579 !(narrow_blob (row, next_blob_box) ||
01580 suspected_punct_blob (row, next_blob_box))) {
01581 space = TRUE;
01582 fuzzy_non = TRUE;
01583 #ifndef GRAPHICS_DISABLED
01584 mark_gap (blob_box, 8,
01585 prev_gap, prev_blob_box.width (),
01586 current_gap, next_blob_box.width (), next_gap);
01587 #endif
01588 }
01589 else if ((tosp_kern_gap_factor3 > 0) &&
01590 (prev_blob_box.width () > 0) &&
01591 (next_blob_box.width () > 0) &&
01592 (current_gap >=
01593 tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
01594 (!tosp_rule_9_test_punct ||
01595 (!suspected_punct_blob (row, prev_blob_box) &&
01596 !suspected_punct_blob (row, next_blob_box)))) {
01597 space = TRUE;
01598 fuzzy_non = TRUE;
01599 #ifndef GRAPHICS_DISABLED
01600 mark_gap (blob_box, 9,
01601 prev_gap, prev_blob_box.width (),
01602 current_gap, next_blob_box.width (), next_gap);
01603 #endif
01604 }
01605 }
01606 prev_gap_was_a_space = space && !(fuzzy_non);
01607 return space;
01608 }
01609 }
01610
01611
01619 BOOL8 narrow_blob(TO_ROW *row, BOX blob_box) {
01620 BOOL8 result;
01621
01622 result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
01623 (((float) blob_box.width () / blob_box.height ()) <=
01624 tosp_narrow_aspect_ratio));
01625 return result;
01626 }
01627
01628
01636 BOOL8 wide_blob(TO_ROW *row, BOX blob_box) {
01637 BOOL8 result;
01638
01639 if (tosp_wide_fraction > 0) {
01640 if (tosp_wide_aspect_ratio > 0)
01641 result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
01642 (((float) blob_box.width () / blob_box.height ()) >
01643 tosp_wide_aspect_ratio));
01644 else
01645 result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
01646 }
01647 else
01648 result = !narrow_blob (row, blob_box);
01649 return result;
01650 }
01651
01652
01660 BOOL8 suspected_punct_blob(TO_ROW *row, BOX box) {
01661 BOOL8 result;
01662 float baseline;
01663 float blob_x_centre;
01664
01665
01666
01667 blob_x_centre = (box.right () + box.left ()) / 2.0;
01668 baseline = row->baseline.y (blob_x_centre);
01669
01670 result = (box.height () <= 0.66 * row->xheight) ||
01671 (box.top () < baseline + row->xheight / 2.0) ||
01672 (box.bottom () > baseline + row->xheight / 2.0);
01673 return result;
01674 }
01675
01676
01680 void peek_at_next_gap(
01681 TO_ROW *row,
01682 BLOBNBOX_IT box_it,
01683 BOX &next_blob_box,
01684 INT16 &next_gap,
01685 INT16 &next_within_xht_gap) {
01686 BOX next_reduced_blob_box;
01687 BOX bit_beyond;
01688 BLOBNBOX_IT reduced_box_it = box_it;
01689
01690 next_blob_box = box_next (&box_it);
01691 next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
01692 if (box_it.at_first ()) {
01693 next_gap = MAX_INT16;
01694 next_within_xht_gap = MAX_INT16;
01695 }
01696 else {
01697 bit_beyond = box_it.data ()->bounding_box ();
01698 next_gap = bit_beyond.left () - next_blob_box.right ();
01699 bit_beyond = reduced_box_next (row, &reduced_box_it);
01700 next_within_xht_gap =
01701 bit_beyond.left () - next_reduced_blob_box.right ();
01702 }
01703 }
01704
01705
01709 #ifndef GRAPHICS_DISABLED
01710 void mark_gap(
01711 BOX blob,
01712 INT16 rule,
01713 INT16 prev_gap,
01714 INT16 prev_blob_width,
01715 INT16 current_gap,
01716 INT16 next_blob_width,
01717 INT16 next_gap) {
01718 COLOUR col;
01719
01720 switch (rule) {
01721 case 1:
01722 col = RED;
01723 break;
01724 case 2:
01725 col = CYAN;
01726 break;
01727 case 3:
01728 col = GREEN;
01729 break;
01730 case 4:
01731 col = BLACK;
01732 break;
01733 case 5:
01734 col = MAGENTA;
01735 break;
01736 case 6:
01737 col = BLUE;
01738 break;
01739
01740 case 7:
01741 col = WHITE;
01742 break;
01743 case 8:
01744 col = YELLOW;
01745 break;
01746 case 9:
01747 col = BLACK;
01748 break;
01749
01750 case 20:
01751 col = CYAN;
01752 break;
01753 case 21:
01754 col = GREEN;
01755 break;
01756 case 22:
01757 col = MAGENTA;
01758 break;
01759 default:
01760 col = BLACK;
01761 }
01762 if (textord_show_initial_words) {
01763 fill_color_index(to_win, col);
01764 perimeter_color_index(to_win, col);
01765 if (rule < 20)
01766 interior_style(to_win, INT_SOLID, FALSE);
01767 else
01768 interior_style(to_win, INT_HOLLOW, TRUE);
01769
01770 ellipse (to_win, current_gap / 2.0f,
01771 blob.height () / 2.0f,
01772
01773 blob.left () - current_gap / 2.0f,
01774
01775 blob.bottom () + blob.height () / 2.0f,
01776 0.0f);
01777 }
01778 if (tosp_debug_level > 0)
01779 tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
01780 blob.left () - current_gap / 2, blob.bottom (), rule,
01781 prev_gap, prev_blob_width, current_gap,
01782 next_blob_width, next_gap);
01783 }
01784 #endif
01785
01786
01790 float find_mean_blob_spacing(WERD *word) {
01791 PBLOB_IT blob_it;
01792 C_BLOB_IT cblob_it;
01793 BOX blob_box;
01794 INT32 gap_sum = 0;
01795 INT16 gap_count = 0;
01796 INT16 prev_right;
01797
01798 if (word->flag (W_POLYGON)) {
01799 blob_it.set_to_list (word->blob_list ());
01800 if (!blob_it.empty ()) {
01801 blob_it.mark_cycle_pt ();
01802 prev_right = blob_it.data ()->bounding_box ().right ();
01803
01804 blob_it.forward ();
01805 for (; !blob_it.cycled_list (); blob_it.forward ()) {
01806 blob_box = blob_it.data ()->bounding_box ();
01807 gap_sum += blob_box.left () - prev_right;
01808 gap_count++;
01809 prev_right = blob_box.right ();
01810 }
01811 }
01812 }
01813 else {
01814 cblob_it.set_to_list (word->cblob_list ());
01815 if (!cblob_it.empty ()) {
01816 cblob_it.mark_cycle_pt ();
01817 prev_right = cblob_it.data ()->bounding_box ().right ();
01818
01819 cblob_it.forward ();
01820 for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
01821 blob_box = cblob_it.data ()->bounding_box ();
01822 gap_sum += blob_box.left () - prev_right;
01823 gap_count++;
01824 prev_right = blob_box.right ();
01825 }
01826 }
01827 }
01828 if (gap_count > 0)
01829 return (gap_sum / (float) gap_count);
01830 else
01831 return 0.0f;
01832 }
01833
01834
01838 BOOL8 ignore_big_gap(TO_ROW *row,
01839 INT32 row_length,
01840 GAPMAP *gapmap,
01841 INT16 left,
01842 INT16 right) {
01843 INT16 gap = right - left + 1;
01844
01845 if (tosp_ignore_big_gaps > 999)
01846 return FALSE;
01847 if (tosp_ignore_big_gaps > 0)
01848 return (gap > tosp_ignore_big_gaps * row->xheight);
01849 if (gap > tosp_ignore_very_big_gaps * row->xheight)
01850 return TRUE;
01851 if (tosp_ignore_big_gaps == 0) {
01852 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
01853 return TRUE;
01854 if ((gap > 1.75 * row->xheight) &&
01855 ((row_length > 35 * row->xheight) ||
01856 gapmap->table_gap (left, right)))
01857 return TRUE;
01858 }
01859 else {
01860
01861 if ((gap > gapmap_big_gaps * row->xheight) &&
01862 gapmap->table_gap (left, right))
01863 return TRUE;
01864 }
01865 return FALSE;
01866 }
01867
01868
01878 BOX reduced_box_next(
01879 TO_ROW *row,
01880 BLOBNBOX_IT *it
01881 ) {
01882 BLOBNBOX *blob;
01883 BLOBNBOX *head_blob;
01884 BOX full_box;
01885 BOX reduced_box;
01886 INT16 left_above_xht;
01887 INT16 new_left_above_xht;
01888
01889 blob = it->data ();
01890 if (blob->red_box_set ()) {
01891 reduced_box = blob->reduced_box ();
01892 do {
01893 it->forward ();
01894 blob = it->data ();
01895 }
01896
01897 while (blob->blob () == NULL
01898 && blob->cblob () == NULL || blob->joined_to_prev ());
01899 return reduced_box;
01900 }
01901 head_blob = blob;
01902 full_box = blob->bounding_box ();
01903 reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
01904 do {
01905 it->forward ();
01906 blob = it->data ();
01907 if (blob->blob () == NULL && blob->cblob () == NULL)
01908
01909 full_box += blob->bounding_box ();
01910 else if (blob->joined_to_prev ()) {
01911 reduced_box +=
01912 reduced_box_for_blob(blob, row, &new_left_above_xht);
01913 left_above_xht = MIN (left_above_xht, new_left_above_xht);
01914 }
01915 }
01916
01917 while (blob->blob () == NULL
01918 && blob->cblob () == NULL || blob->joined_to_prev ());
01919
01920 if ((reduced_box.width () > 0) &&
01921 ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
01922 < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
01923 #ifndef GRAPHICS_DISABLED
01924 if (textord_show_initial_words)
01925 reduced_box.plot (to_win, INT_HOLLOW, TRUE, YELLOW, YELLOW);
01926 #endif
01927 }
01928 else
01929 reduced_box = full_box;
01930 head_blob->set_reduced_box (reduced_box);
01931 return reduced_box;
01932 }
01933
01934
01958 BOX reduced_box_for_blob(
01959 BLOBNBOX *blob,
01960 TO_ROW *row,
01961 INT16 *left_above_xht) {
01962 float baseline;
01963 float blob_x_centre;
01964 float left_limit;
01965 float right_limit;
01966 float junk;
01967 BOX blob_box;
01968
01969
01970
01971 blob_box = blob->bounding_box ();
01972 blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
01973 baseline = row->baseline.y (blob_x_centre);
01974
01975
01976
01977
01978
01979 left_limit = (float) MAX_INT32;
01980 junk = (float) -MAX_INT32;
01981 if (blob->blob () != NULL)
01982
01983 find_blob_limits (blob->blob (),
01984 (float) -MAX_INT16,
01985 -(baseline + 1.1 * row->xheight),
01986
01987 FCOORD (0.0, 1.0),
01988 left_limit, junk);
01989 else
01990
01991 find_cblob_hlimits (blob->cblob (),
01992
01993 (baseline + 1.1 * row->xheight), (float) MAX_INT16,
01994
01995
01996 left_limit, junk);
01997 if (left_limit > junk)
01998 *left_above_xht = MAX_INT16;
01999 else
02000 *left_above_xht = (INT16) floor (left_limit);
02001
02002
02003
02004
02005 left_limit = (float) MAX_INT32;
02006 junk = (float) -MAX_INT32;
02007 if (blob->blob () != NULL)
02008
02009 find_blob_limits (blob->blob (),
02010 (float) -MAX_INT16,
02011 -baseline,
02012 FCOORD (0.0, 1.0),
02013 left_limit, junk);
02014 else
02015
02016 find_cblob_hlimits (blob->cblob (),
02017 baseline,
02018 (float) MAX_INT16,
02019
02020 left_limit, junk);
02021
02022 if (left_limit > junk)
02023 return BOX ();
02024
02025
02026
02027 junk = (float) MAX_INT32;
02028 right_limit = (float) -MAX_INT32;
02029 if (blob->blob () != NULL)
02030
02031 find_blob_limits (blob->blob (),
02032 -(baseline + row->xheight),
02033
02034 (float) MAX_INT16,
02035 FCOORD (0.0, 1.0),
02036 junk, right_limit);
02037 else
02038
02039 find_cblob_hlimits (blob->cblob (),
02040 (float) -MAX_INT16,
02041 (baseline + row->xheight),
02042
02043
02044 junk, right_limit);
02045 if (junk > right_limit)
02046 return BOX ();
02047
02048 return BOX (ICOORD ((INT16) floor (left_limit), blob_box.bottom ()),
02049 ICOORD ((INT16) ceil (right_limit), blob_box.top ()));
02050 }