00001
00020 #include "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include <assert.h>
00023 #endif
00024 #include "stderr.h"
00025 #include "blobbox.h"
00026 #include "ocrclass.h"
00027 #include "lmedsq.h"
00028 #include "statistc.h"
00029 #include "drawtord.h"
00030 #include "makerow.h"
00031 #include "pitsync1.h"
00032 #ifdef TEXT_VERBOSE
00033 #include "../cutil/callcpp.h"
00034 #endif
00035 #include "blobcmpl.h"
00036 #include "tovars.h"
00037 #include "topitch.h"
00038 #include "tospace.h"
00039 #include "fpchop.h"
00040 #include "wordseg.h"
00041
// EXTERN is defined empty here, so the EXTERN-prefixed declaration below
// expands with no extra prefix in this file.
00042 #define EXTERN
00043
// Boolean variable textord_fp_chopping: default TRUE, description string
// "Do fixed pitch chopping".
00046 EXTERN BOOL_VAR (textord_fp_chopping, TRUE, "Do fixed pitch chopping");
// Declared here, defined elsewhere; make_words() updates its ocr_alive and
// progress fields when the pointer is non-NULL.
00049 extern ETEXT_DESC *global_monitor;
00050
// NOTE(review): not referenced in the visible part of this file - confirm
// whether it is still needed.
00051 #define FIXED_WIDTH_MULTIPLE 5
// Maximum number of clusters requested from gap_stats.cluster() in
// row_words2(); also sizes its gaps[] and cluster_stats[] arrays.
00052 #define BLOCK_STATS_CLUSTERS 10
00053
00059 void make_words(
00060 ICOORD page_tr,
00061 float gradient,
00062 BLOCK_LIST *blocks,
00063 TO_BLOCK_LIST *land_blocks,
00064 TO_BLOCK_LIST *port_blocks
00065 ) {
00066 TO_BLOCK_IT block_it;
00067 TO_BLOCK *block;
00068
00069 #ifdef TEXT_VERBOSE
00070
00071 cprintf("j");
00072 #endif
00073 compute_fixed_pitch (page_tr, port_blocks, gradient, FCOORD (0.0f, -1.0f),
00074 !(BOOL8) textord_test_landscape);
00075 if (global_monitor != NULL) {
00076 global_monitor->ocr_alive = TRUE;
00077 global_monitor->progress = 25;
00078 }
00079 to_spacing(page_tr, port_blocks);
00080 block_it.set_to_list (port_blocks);
00081 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00082 block_it.forward ()) {
00083 block = block_it.data ();
00084
00085
00086 make_real_words (block, FCOORD (1.0f, 0.0f));
00087 }
00088 }
00089
00090
00097 void set_row_spaces(
00098 TO_BLOCK *block,
00099 FCOORD rotation,
00100 BOOL8 testing_on
00101 ) {
00102 INT32 maxwidth;
00103 TO_ROW *row;
00104 TO_ROW_IT row_it = block->get_rows ();
00105
00106 if (row_it.empty ())
00107 return;
00108 maxwidth = (INT32) ceil (block->xheight * textord_words_maxspace);
00109 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00110 row = row_it.data ();
00111 if (row->fixed_pitch == 0) {
00112
00113
00114
00115
00116
00117 row->min_space =
00118 (INT32) ceil (row->pr_space -
00119 (row->pr_space -
00120 row->pr_nonsp) * textord_words_definite_spread);
00121 row->max_nonspace =
00122 (INT32) floor (row->pr_nonsp +
00123 (row->pr_space -
00124 row->pr_nonsp) * textord_words_definite_spread);
00125 if (testing_on && textord_show_initial_words) {
00126 tprintf ("Assigning defaults %d non, %d space to row at %g\n",
00127 row->max_nonspace, row->min_space, row->intercept ());
00128 }
00129 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00130 row->space_size = row->pr_space;
00131 row->kern_size = row->pr_nonsp;
00132
00133 }
00134 #ifndef GRAPHICS_DISABLED
00135 if (textord_show_initial_words && testing_on) {
00136 plot_word_decisions (to_win, (INT16) row->fixed_pitch, row);
00137 }
00138 #endif
00139 }
00140 }
00141
00142
00148 INT32 row_words(
00149 TO_BLOCK *block,
00150 TO_ROW *row,
00151 INT32 maxwidth,
00152 FCOORD rotation,
00153 BOOL8 testing_on
00154 ) {
00155 BOOL8 testing_row;
00156 BOOL8 prev_valid;
00157 BOOL8 this_valid;
00158 INT32 prev_x;
00159 INT32 min_gap;
00160 INT32 cluster_count;
00161 INT32 gap_index;
00162 INT32 smooth_factor;
00163 BLOBNBOX *blob;
00164 float lower, upper;
00165 float gaps[3];
00166 ICOORD testpt;
00167 BOX blob_box;
00168
00169 BLOBNBOX_IT blob_it = row->blob_list ();
00170 STATS gap_stats (0, maxwidth);
00171 STATS cluster_stats[4];
00172
00173 testpt = ICOORD (textord_test_x, textord_test_y);
00174 smooth_factor =
00175 (INT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
00176
00177
00178 prev_valid = FALSE;
00179 prev_x = -MAX_INT32;
00180 testing_row = FALSE;
00181 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00182 blob = blob_it.data ();
00183 blob_box = blob->bounding_box ();
00184 if (blob_box.contains (testpt))
00185 testing_row = TRUE;
00186 gap_stats.add (blob_box.width (), 1);
00187 }
00188 min_gap = (INT32) floor (gap_stats.ile (textord_words_width_ile));
00189 gap_stats.clear ();
00190 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00191 blob = blob_it.data ();
00192 if (!blob->joined_to_prev ()) {
00193 blob_box = blob->bounding_box ();
00194
00195 this_valid = TRUE;
00196 if (this_valid && prev_valid
00197 && blob_box.left () - prev_x < maxwidth) {
00198 gap_stats.add (blob_box.left () - prev_x, 1);
00199 }
00200 prev_x = blob_box.right ();
00201 prev_valid = this_valid;
00202 }
00203 }
00204 if (gap_stats.get_total () == 0) {
00205 row->min_space = 0;
00206 row->max_nonspace = 0;
00207 return 0;
00208 }
00209 gap_stats.smooth (smooth_factor);
00210 lower = row->xheight * textord_words_initial_lower;
00211 upper = row->xheight * textord_words_initial_upper;
00212 cluster_count = gap_stats.cluster (lower, upper,
00213 textord_spacesize_ratioprop, 3,
00214 cluster_stats);
00215 while (cluster_count < 2 && ceil (lower) < floor (upper)) {
00216
00217 upper = (upper * 3 + lower) / 4;
00218 lower = (lower * 3 + upper) / 4;
00219 cluster_count = gap_stats.cluster (lower, upper,
00220 textord_spacesize_ratioprop, 3,
00221 cluster_stats);
00222 }
00223 if (cluster_count < 2) {
00224 row->min_space = 0;
00225 row->max_nonspace = 0;
00226 return 0;
00227 }
00228 for (gap_index = 0; gap_index < cluster_count; gap_index++)
00229 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00230
00231 if (cluster_count > 2) {
00232 if (testing_on && textord_show_initial_words) {
00233 tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
00234 row->intercept (),
00235 cluster_stats[1].ile (0.5),
00236 cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
00237 }
00238 lower = gaps[0];
00239 if (gaps[1] > lower) {
00240 upper = gaps[1];
00241 if (upper < block->xheight * textord_words_min_minspace
00242 && gaps[2] > gaps[1]) {
00243 upper = gaps[2];
00244 }
00245 }
00246 else if (gaps[2] > lower
00247 && gaps[2] >= block->xheight * textord_words_min_minspace)
00248 upper = gaps[2];
00249 else if (lower >= block->xheight * textord_words_min_minspace) {
00250 upper = lower;
00251 lower = gaps[1];
00252 if (testing_on && textord_show_initial_words) {
00253 tprintf ("Had to switch most common from lower to upper!!\n");
00254 gap_stats.print (stdout, TRUE);
00255 }
00256 }
00257 else {
00258 row->min_space = 0;
00259 row->max_nonspace = 0;
00260 return 0;
00261 }
00262 }
00263 else {
00264 if (gaps[1] < gaps[0]) {
00265 if (testing_on && textord_show_initial_words) {
00266 tprintf ("Had to switch most common from lower to upper!!\n");
00267 gap_stats.print (stdout, TRUE);
00268 }
00269 lower = gaps[1];
00270 upper = gaps[0];
00271 }
00272 else {
00273 upper = gaps[1];
00274 lower = gaps[0];
00275 }
00276 }
00277 if (upper < block->xheight * textord_words_min_minspace) {
00278 row->min_space = 0;
00279 row->max_nonspace = 0;
00280 return 0;
00281 }
00282 if (upper * 3 < block->min_space * 2 + block->max_nonspace
00283 || lower * 3 > block->min_space * 2 + block->max_nonspace) {
00284 if (testing_on && textord_show_initial_words) {
00285 tprintf ("Disagreement between block and row at %g!!\n",
00286 row->intercept ());
00287 tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
00288 gap_stats.print (stdout, TRUE);
00289 }
00290 }
00291 row->min_space =
00292 (INT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
00293 row->max_nonspace =
00294 (INT32) floor (lower + (upper - lower) * textord_words_definite_spread);
00295 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00296 row->space_size = upper;
00297 row->kern_size = lower;
00298 if (testing_on && textord_show_initial_words) {
00299 if (testing_row) {
00300 tprintf ("GAP STATS\n");
00301 gap_stats.print (stdout, TRUE);
00302 tprintf ("SPACE stats\n");
00303 cluster_stats[2].print (stdout, FALSE);
00304 tprintf ("NONSPACE stats\n");
00305 cluster_stats[1].print (stdout, FALSE);
00306 }
00307 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
00308 row->intercept (), row->min_space, upper,
00309 row->max_nonspace, lower);
00310 }
00311 return cluster_stats[2].get_total ();
00312 }
00313
00314
00320 INT32 row_words2(
00321 TO_BLOCK *block,
00322 TO_ROW *row,
00323 INT32 maxwidth,
00324 FCOORD rotation,
00325 BOOL8 testing_on
00326 ) {
00327 BOOL8 testing_row;
00328 BOOL8 prev_valid;
00329 BOOL8 this_valid;
00330 INT32 prev_x;
00331 INT32 min_width;
00332 INT32 valid_count;
00333 INT32 total_count;
00334 INT32 cluster_count;
00335 INT32 prev_count;
00336 INT32 gap_index;
00337 INT32 smooth_factor;
00338 BLOBNBOX *blob;
00339 float lower, upper;
00340 ICOORD testpt;
00341 BOX blob_box;
00342
00343 BLOBNBOX_IT blob_it = row->blob_list ();
00344 STATS gap_stats (0, maxwidth);
00345
00346 float gaps[BLOCK_STATS_CLUSTERS];
00347 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
00348
00349
00350 testpt = ICOORD (textord_test_x, textord_test_y);
00351 smooth_factor =
00352 (INT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
00353
00354
00355 prev_valid = FALSE;
00356 prev_x = -MAX_INT16;
00357 testing_row = FALSE;
00358
00359 min_width = (INT32) block->pr_space;
00360 total_count = 0;
00361 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00362 blob = blob_it.data ();
00363 if (!blob->joined_to_prev ()) {
00364 blob_box = blob->bounding_box ();
00365 this_valid = blob_box.width () >= min_width;
00366 this_valid = TRUE;
00367 if (this_valid && prev_valid
00368 && blob_box.left () - prev_x < maxwidth) {
00369 gap_stats.add (blob_box.left () - prev_x, 1);
00370 }
00371 total_count++;
00372 prev_x = blob_box.right ();
00373 prev_valid = this_valid;
00374 }
00375 }
00376 valid_count = gap_stats.get_total ();
00377 if (valid_count < total_count * textord_words_minlarge) {
00378 gap_stats.clear ();
00379 prev_x = -MAX_INT16;
00380 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00381 blob_it.forward ()) {
00382 blob = blob_it.data ();
00383 if (!blob->joined_to_prev ()) {
00384 blob_box = blob->bounding_box ();
00385 if (blob_box.left () - prev_x < maxwidth) {
00386 gap_stats.add (blob_box.left () - prev_x, 1);
00387 }
00388 prev_x = blob_box.right ();
00389 }
00390 }
00391 }
00392 if (gap_stats.get_total () == 0) {
00393 row->min_space = 0;
00394 row->max_nonspace = 0;
00395 return 0;
00396 }
00397
00398 cluster_count = 0;
00399 lower = block->xheight * words_initial_lower;
00400 upper = block->xheight * words_initial_upper;
00401 gap_stats.smooth (smooth_factor);
00402 do {
00403 prev_count = cluster_count;
00404 cluster_count = gap_stats.cluster (lower, upper,
00405 textord_spacesize_ratioprop,
00406 BLOCK_STATS_CLUSTERS, cluster_stats);
00407 }
00408 while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
00409 if (cluster_count < 1) {
00410 row->min_space = 0;
00411 row->max_nonspace = 0;
00412 return 0;
00413 }
00414 for (gap_index = 0; gap_index < cluster_count; gap_index++)
00415 gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
00416
00417 if (testing_on) {
00418 tprintf ("cluster_count=%d:", cluster_count);
00419 for (gap_index = 0; gap_index < cluster_count; gap_index++)
00420 tprintf (" %g(%d)", gaps[gap_index],
00421 cluster_stats[gap_index + 1].get_total ());
00422 tprintf ("\n");
00423 }
00424
00425
00426 for (gap_index = 0; gap_index < cluster_count
00427 && gaps[gap_index] > block->max_nonspace; gap_index++);
00428 if (gap_index < cluster_count)
00429 lower = gaps[gap_index];
00430 else {
00431 if (testing_on)
00432 tprintf ("No cluster below block threshold!, using default=%g\n",
00433 block->pr_nonsp);
00434 lower = block->pr_nonsp;
00435 }
00436 for (gap_index = 0; gap_index < cluster_count
00437 && gaps[gap_index] <= block->max_nonspace; gap_index++);
00438 if (gap_index < cluster_count)
00439 upper = gaps[gap_index];
00440 else {
00441 if (testing_on)
00442 tprintf ("No cluster above block threshold!, using default=%g\n",
00443 block->pr_space);
00444 upper = block->pr_space;
00445 }
00446 row->min_space =
00447 (INT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
00448 row->max_nonspace =
00449 (INT32) floor (lower + (upper - lower) * textord_words_definite_spread);
00450 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
00451 row->space_size = upper;
00452 row->kern_size = lower;
00453 if (testing_on) {
00454 if (testing_row) {
00455 tprintf ("GAP STATS\n");
00456 gap_stats.print (stdout, TRUE);
00457 tprintf ("SPACE stats\n");
00458 cluster_stats[2].print (stdout, FALSE);
00459 tprintf ("NONSPACE stats\n");
00460 cluster_stats[1].print (stdout, FALSE);
00461 }
00462 tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
00463 row->intercept (), row->min_space, upper,
00464 row->max_nonspace, lower);
00465 }
00466 return 1;
00467 }
00468
00469
00475 void make_real_words(
00476 TO_BLOCK *block,
00477 FCOORD rotation
00478 ) {
00479 TO_ROW *row;
00480 TO_ROW_IT row_it = block->get_rows ();
00481 ROW *real_row = NULL;
00482 ROW_IT real_row_it = block->block->row_list ();
00483
00484 if (row_it.empty ())
00485 return;
00486 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00487 row = row_it.data ();
00488 if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
00489 real_row = make_rep_words (row, block);
00490 }
00491 else if (!row->blob_list ()->empty ()) {
00492
00493 if (row->pitch_decision == PITCH_DEF_FIXED
00494 || row->pitch_decision == PITCH_CORR_FIXED)
00495 real_row = fixed_pitch_words (row, rotation);
00496 else if (row->pitch_decision == PITCH_DEF_PROP
00497 || row->pitch_decision == PITCH_CORR_PROP)
00498 real_row = make_prop_words (row, rotation);
00499 else
00500 ASSERT_HOST(FALSE);
00501 }
00502 if (real_row != NULL) {
00503
00504 real_row_it.add_after_then_move (real_row);
00505 }
00506 }
00507 block->block->set_stats (block->fixed_pitch == 0, (INT16) block->kern_size,
00508 (INT16) block->space_size,
00509 (INT16) block->fixed_pitch);
00510 block->block->check_pitch ();
00511 }
00512
00513
00520 ROW *make_rep_words(
00521 TO_ROW *row,
00522 TO_BLOCK *block
00523 ) {
00524 INT32 xstarts[2];
00525 ROW *real_row;
00526 BOX word_box;
00527 double coeffs[3];
00528
00529 WERD_IT word_it = &row->rep_words;
00530
00531 if (word_it.empty ())
00532 return NULL;
00533 word_box = word_it.data ()->bounding_box ();
00534 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
00535 word_box += word_it.data ()->bounding_box ();
00536 xstarts[0] = word_box.left ();
00537 xstarts[1] = word_box.right ();
00538 coeffs[0] = 0;
00539 coeffs[1] = row->line_m ();
00540 coeffs[2] = row->line_c ();
00541 row->xheight = block->xheight;
00542 real_row = new ROW (row,
00543 (INT16) block->kern_size, (INT16) block->space_size);
00544 word_it.set_to_list (real_row->word_list ());
00545
00546 word_it.add_list_after (&row->rep_words);
00547 real_row->recalc_bounding_box ();
00548 return real_row;
00549 }
00550
00551
00566 WERD *make_real_word(
00567 BLOBNBOX_IT *box_it,
00568 INT32 blobcount,
00569 BOOL8 bol,
00570 BOOL8 fuzzy_sp,
00571 BOOL8 fuzzy_non,
00572 UINT8 blanks
00573 ) {
00574 OUTLINE_IT out_it;
00575 C_OUTLINE_IT cout_it;
00576 PBLOB_LIST blobs;
00577 C_BLOB_LIST cblobs;
00578 PBLOB_IT blob_it = &blobs;
00579 C_BLOB_IT cblob_it = &cblobs;
00580 WERD *word;
00581 BLOBNBOX *bblob;
00582 INT32 blobindex;
00583
00584 for (blobindex = 0; blobindex < blobcount; blobindex++) {
00585 bblob = box_it->extract ();
00586 if (bblob->joined_to_prev ()) {
00587 if (bblob->blob () != NULL) {
00588 out_it.set_to_list (blob_it.data ()->out_list ());
00589 out_it.move_to_last ();
00590 out_it.add_list_after (bblob->blob ()->out_list ());
00591 delete bblob->blob ();
00592 }
00593 else if (bblob->cblob () != NULL) {
00594 cout_it.set_to_list (cblob_it.data ()->out_list ());
00595 cout_it.move_to_last ();
00596 cout_it.add_list_after (bblob->cblob ()->out_list ());
00597 delete bblob->cblob ();
00598 }
00599 }
00600 else {
00601 if (bblob->blob () != NULL)
00602 blob_it.add_after_then_move (bblob->blob ());
00603 else if (bblob->cblob () != NULL)
00604 cblob_it.add_after_then_move (bblob->cblob ());
00605 }
00606 delete bblob;
00607 box_it->forward ();
00608 }
00609
00610 if (blanks < 1)
00611 blanks = 1;
00612 if (!blob_it.empty ()) {
00613
00614 word = new WERD (&blobs, blanks, NULL);
00615 }
00616 else {
00617 word = new WERD (&cblobs, blanks, NULL);
00618 }
00619 if (bol) {
00620 word->set_flag (W_BOL, TRUE);
00621 }
00622 if (fuzzy_sp)
00623
00624 word->set_flag (W_FUZZY_SP, TRUE);
00625 else if (fuzzy_non)
00626
00627 word->set_flag (W_FUZZY_NON, TRUE);
00628 if (box_it->at_first ()) {
00629 word->set_flag (W_EOL, TRUE);
00630 }
00631 return word;
00632 }