00001
00020 #include "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include <assert.h>
00023 #endif
00024 #include "stderr.h"
00025 #include "blobbox.h"
00026 #include "lmedsq.h"
00027 #include "statistc.h"
00028 #include "drawtord.h"
00029 #include "blkocc.h"
00030 #ifdef TEXT_VERBOSE
00031 #include "../cutil/callcpp.h"
00032 #endif
00033 #include "sortflts.h"
00034 #include "oldbasel.h"
00035 #include "tordmain.h"
00036 #include "underlin.h"
00037 #include "makerow.h"
00038 #include "tprintf.h"
00039
// EXTERN is defined as empty here, so each *_VAR line below appears in this
// file without any storage qualifier in front of it.
00040 #define EXTERN
00041
// Tunable parameters used by the row-finding code. Each entry supplies a
// name, a default value and a help string; the BOOL_VAR / INT_VAR /
// double_VAR macros themselves are defined elsewhere.
//
// Boolean switches.
00044 EXTERN BOOL_VAR (textord_heavy_nr, FALSE, "Vigorously remove noise");
00045 EXTERN BOOL_VAR (textord_show_initial_rows, FALSE,
00046 "Display row accumulation");
00047 EXTERN BOOL_VAR (textord_show_parallel_rows, FALSE,
00048 "Display page correlated rows");
00049 EXTERN BOOL_VAR (textord_show_expanded_rows, FALSE,
00050 "Display rows after expanding");
00051 EXTERN BOOL_VAR (textord_show_final_rows, FALSE,
00052 "Display rows after final fittin");
00053 EXTERN BOOL_VAR (textord_show_final_blobs, FALSE,
00054 "Display blob bounds after pre-ass");
00055 EXTERN BOOL_VAR (textord_test_landscape, FALSE, "Tests refer to land/port");
00056 EXTERN BOOL_VAR (textord_parallel_baselines, TRUE,
00057 "Force parallel baselines");
00058 EXTERN BOOL_VAR (textord_straight_baselines, FALSE,
00059 "Force straight baselines");
00060 EXTERN BOOL_VAR (textord_quadratic_baselines, FALSE, "Use quadratic splines");
00061 EXTERN BOOL_VAR (textord_old_baselines, TRUE, "Use old baseline algorithm");
00062 EXTERN BOOL_VAR (textord_old_xheight, TRUE, "Use old xheight algorithm");
00063 EXTERN BOOL_VAR (textord_fix_xheight_bug, TRUE, "Use spline baseline");
00064 EXTERN BOOL_VAR (textord_fix_makerow_bug, TRUE, "Prevent multiple baselines");
00065 EXTERN BOOL_VAR (textord_row_xheights, FALSE, "Use row height policy");
00066 EXTERN BOOL_VAR (textord_block_xheights, TRUE, "Use block height policy");
00067 EXTERN BOOL_VAR (textord_xheight_tweak, FALSE, "New min condition on height");
00068 EXTERN BOOL_VAR (textord_cblob_blockocc, TRUE,
00069 "Use new projection for underlines");
00070 EXTERN BOOL_VAR (textord_debug_xheights, FALSE, "Test xheight algorithms");
00071 EXTERN BOOL_VAR (textord_biased_skewcalc, TRUE,
00072 "Bias skew estimates with line length");
00073 EXTERN BOOL_VAR (textord_interpolating_skew, TRUE, "Interpolate across gaps");
// Integer parameters.
00074 EXTERN INT_VAR (textord_skewsmooth_offset, 2, "For smooth factor");
00075 EXTERN INT_VAR (textord_test_x, 0, "coord of test pt");
00076 EXTERN INT_VAR (textord_test_y, 0, "coord of test pt");
00077 EXTERN INT_VAR (textord_min_blobs_in_row, 4,
00078 "Min blobs before gradient counted");
00079 EXTERN INT_VAR (textord_spline_minblobs, 8,
00080 "Min blobs in each spline segment");
00081 EXTERN INT_VAR (textord_spline_medianwin, 6,
00082 "Size of window for spline segmentation");
00083 EXTERN INT_VAR (textord_min_xheight, 10, "Min credible pixel xheight");
// Floating-point parameters.
00084 EXTERN double_VAR (textord_spline_shift_fraction, 0.02,
00085 "Fraction of line spacing for quad");
00086 EXTERN double_VAR (textord_spline_outlier_fraction, 0.1,
00087 "Fraction of line spacing for outlier");
00088 EXTERN double_VAR (textord_skew_ile, 0.5, "Ile of gradients for page skew");
00089 EXTERN double_VAR (textord_skew_lag, 0.01,
00090 "Lag for skew on row accumulation");
00091 EXTERN double_VAR (textord_linespace_iqrlimit, 0.2,
00092 "Max iqr/median for linespace");
00093 EXTERN double_VAR (textord_width_limit, 8, "Max width of blobs to make rows");
00094 EXTERN double_VAR (textord_chop_width, 1.5, "Max width before chopping");
00095 EXTERN double_VAR (textord_merge_desc, 0.25,
00096 "Fraction of linespace for desc drop");
00097 EXTERN double_VAR (textord_merge_x, 0.5,
00098 "Fraction of linespace for x height");
00099 EXTERN double_VAR (textord_merge_asc, 0.25,
00100 "Fraction of linespace for asc height");
00101 EXTERN double_VAR (textord_minxh, 0.25,
00102 "fraction of linesize for min xheight");
00103 EXTERN double_VAR (textord_min_linesize, 1.25,
00104 "* blob height for initial linesize");
00105 EXTERN double_VAR (textord_excess_blobsize, 1.3,
00106 "New row made if blob makes row this big");
00107 EXTERN double_VAR (textord_occupancy_threshold, 0.4,
00108 "Fraction of neighbourhood");
00109 EXTERN double_VAR (textord_underline_width, 2.0,
00110 "Multiple of line_size for underline");
00111 EXTERN double_VAR (textord_xheight_mode_fraction, 0.4,
00112 "Min pile height to make xheight");
00113 EXTERN double_VAR (textord_ascheight_mode_fraction, 0.15,
00114 "Min pile height to make ascheight");
00115 EXTERN double_VAR (textord_ascx_ratio_min, 1.2, "Min cap/xheight");
00116 EXTERN double_VAR (textord_ascx_ratio_max, 1.7, "Max cap/xheight");
00117 EXTERN double_VAR (textord_descx_ratio_min, 0.15, "Min desc/xheight");
00118 EXTERN double_VAR (textord_descx_ratio_max, 0.6, "Max desc/xheight");
00119 EXTERN double_VAR (textord_xheight_error_margin, 0.1, "Accepted variation");
// Not referenced in the visible part of this file; presumably an upper bound
// on the number of height modes considered by later code - TODO confirm.
00123 #define MAX_HEIGHT_MODES 12
00124
00130 float make_rows(
00131 ICOORD page_tr,
00132 BLOCK_LIST *blocks,
00133 TO_BLOCK_LIST *land_blocks,
00134 TO_BLOCK_LIST *port_blocks
00135 ) {
00136 float port_m;
00137 float port_err;
00138
00139
00140 TO_BLOCK_IT block_it;
00141
00142
00143
00144
00145
00146
00147
00148 #ifdef TEXT_VERBOSE
00149
00150 cprintf("r\n");
00151 #endif
00152 block_it.set_to_list (port_blocks);
00153 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00154 block_it.forward ())
00155 make_initial_textrows (page_tr, block_it.data (), FCOORD (1.0f, 0.0f),
00156 !(BOOL8) textord_test_landscape);
00157
00158 compute_page_skew(port_blocks, port_m, port_err);
00159
00160
00161
00162
00163
00164 block_it.set_to_list (port_blocks);
00165 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00166 block_it.forward ()) {
00167 cleanup_rows (page_tr, block_it.data (), port_m, FCOORD (1.0f, 0.0f),
00168 block_it.data ()->block->bounding_box ().left (),
00169 !(BOOL8) textord_test_landscape);
00170 }
00171 block_it.set_to_list (land_blocks);
00172
00173
00174
00175
00176
00177
00178 return port_m;
00179 }
00180
00181
00187 void make_initial_textrows(
00188 ICOORD page_tr,
00189 TO_BLOCK *block,
00190 FCOORD rotation,
00191 BOOL8 testing_on
00192 ) {
00193 TO_ROW_IT row_it = block->get_rows ();
00194
00195 #ifndef GRAPHICS_DISABLED
00196 COLOUR colour;
00197
00198 if (textord_show_initial_rows && testing_on) {
00199 if (to_win == NO_WINDOW)
00200 create_to_win(page_tr);
00201 }
00202 #endif
00203
00204 assign_blobs_to_rows (block, NULL, 0, TRUE, TRUE, textord_show_initial_rows && testing_on);
00205 row_it.move_to_first ();
00206 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00207 fit_lms_line (row_it.data ());
00208 #ifndef GRAPHICS_DISABLED
00209 if (textord_show_initial_rows && testing_on) {
00210 colour = RED;
00211 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00212 plot_to_row (row_it.data (), colour, rotation);
00213 colour = (COLOUR) (colour + 1);
00214 if (colour > MAGENTA)
00215 colour = RED;
00216 }
00217 }
00218 #endif
00219 }
00220
00221
00227 void fit_lms_line(
00228 TO_ROW *row
00229 ) {
00230 float m, c;
00231 BOX box;
00232 LMS lms (row->blob_list ()->length ());
00233
00234 BLOBNBOX_IT blob_it = row->blob_list ();
00235
00236 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00237 box = blob_it.data ()->bounding_box ();
00238 lms.add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
00239 }
00240 lms.fit (m, c);
00241 row->set_line (m, c, lms.error ());
00242 }
00243
00244
00251 void compute_page_skew(
00252 TO_BLOCK_LIST *blocks,
00253 float &page_m,
00254 float &page_err
00255 ) {
00256 INT32 row_count;
00257 INT32 blob_count;
00258 INT32 row_err;
00259 float *gradients;
00260 float *errors;
00261 INT32 row_index;
00262 TO_ROW *row;
00263 TO_BLOCK_IT block_it = blocks;
00264 TO_ROW_IT row_it;
00265
00266 row_count = 0;
00267 blob_count = 0;
00268
00269 #ifdef TEXT_VERBOSE
00270
00271 cprintf("q");
00272 #endif
00273 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00274 block_it.forward ()) {
00275 row_count += block_it.data ()->get_rows ()->length ();
00276
00277 row_it.set_to_list (block_it.data ()->get_rows ());
00278 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00279 blob_count += row_it.data ()->blob_list ()->length ();
00280 }
00281 if (row_count == 0) {
00282 page_m = 0.0f;
00283 page_err = 0.0f;
00284 return;
00285 }
00286 gradients = (float *) alloc_mem (blob_count * sizeof (float));
00287
00288 errors = (float *) alloc_mem (blob_count * sizeof (float));
00289 if (gradients == NULL || errors == NULL)
00290 MEMORY_OUT.error ("compute_page_skew", ABORT, NULL);
00291
00292 row_index = 0;
00293 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00294 block_it.forward ()) {
00295 row_it.set_to_list (block_it.data ()->get_rows ());
00296 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00297 row = row_it.data ();
00298 blob_count = row->blob_list ()->length ();
00299 row_err = (INT32) ceil (row->line_error ());
00300 if (row_err <= 0)
00301 row_err = 1;
00302 if (textord_biased_skewcalc) {
00303 blob_count /= row_err;
00304 for (blob_count /= row_err; blob_count > 0; blob_count--) {
00305 gradients[row_index] = row->line_m ();
00306 errors[row_index] = row->line_error ();
00307 row_index++;
00308 }
00309 }
00310 else if (blob_count >= textord_min_blobs_in_row) {
00311
00312 gradients[row_index] = row->line_m ();
00313 errors[row_index] = row->line_error ();
00314 row_index++;
00315 }
00316 }
00317 }
00318 if (row_index == 0) {
00319
00320 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00321 block_it.forward ()) {
00322 row_it.set_to_list (block_it.data ()->get_rows ());
00323 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
00324 row_it.forward ()) {
00325 row = row_it.data ();
00326 gradients[row_index] = row->line_m ();
00327 errors[row_index] = row->line_error ();
00328 row_index++;
00329 }
00330 }
00331 }
00332 row_count = row_index;
00333 row_index = choose_nth_item ((INT32) (row_count * textord_skew_ile),
00334 gradients, row_count);
00335 page_m = gradients[row_index];
00336 row_index = choose_nth_item ((INT32) (row_count * textord_skew_ile),
00337 errors, row_count);
00338 page_err = errors[row_index];
00339 free_mem(gradients);
00340 free_mem(errors);
00341 }
00342
// In vigorous_noise_removal(), a blob whose height is below kNoiseSize times
// the row's median blob height is a candidate for deletion.
00344 const double kNoiseSize = 0.5;
// In vigorous_noise_removal(), blobs shorter than kMinSize are left out of
// the height statistics used to compute that median.
00346 const int kMinSize = 8;
00347
/**
 * dot_of_i
 *
 * Decide whether the small blob `dot` should be kept because it appears to
 * belong with the neighbouring blob `i`.
 *
 * Returns false if `i` is not more than twice as tall as `dot`, or if their
 * horizontal overlap is both less than half the width of `i` and less than
 * the width of `dot`. Returns true if `i` is more than twice as tall as it
 * is wide. Otherwise the outlines of i's cblob are walked and true is
 * returned once a sufficiently tall run of outline has been seen on both
 * the left and the right of the dot's centre; false if that never happens.
 *
 * @param dot  the small candidate blob
 * @param i    the neighbouring blob it might belong to
 * @param row  row supplying line_m()/line_c() for the height target
 */
00360 static bool dot_of_i(BLOBNBOX* dot, BLOBNBOX* i, TO_ROW* row) {
00361 const BOX& ibox = i->bounding_box();
00362 const BOX& dotbox = dot->bounding_box();
00363
00364
// Horizontal overlap of the two boxes (negative if they do not overlap).
00365 int overlap = MIN(dotbox.right(), ibox.right()) -
00366 MAX(dotbox.left(), ibox.left());
// Reject if i is not tall enough relative to the dot, or the overlap is small
// compared with both widths.
00367 if (ibox.height() <= 2 * dotbox.height() ||
00368 (overlap * 2 < ibox.width() && overlap < dotbox.width()))
00369 return false;
00370
00371
// A tall, narrow i is accepted without inspecting its outline.
00372 if (ibox.height() > ibox.width() * 2)
00373 return true;
00374
00375
00376
00377
// Height that a vertical run of outline must exceed: kHeightFraction of the
// distance from the line (line_m * x + line_c at the dot's left edge) up to
// the lower of the dot's bottom and i's top.
00378 const double kHeightFraction = 0.6;
00379 double target_height = MIN(dotbox.bottom(), ibox.top());
00380 target_height -= row->line_m()*dotbox.left() + row->line_c();
00381 target_height *= kHeightFraction;
// Search zones: one dot-width to the left and right of the dot, split at the
// dot's horizontal centre.
00382 int left_min = dotbox.left() - dotbox.width();
00383 int middle = (dotbox.left() + dotbox.right())/2;
00384 int right_max = dotbox.right() + dotbox.width();
// y extent of the outline run currently being tracked in each zone.
00385 int left_miny = 0;
00386 int left_maxy = 0;
00387 int right_miny = 0;
00388 int right_maxy = 0;
// found_*: a tall enough run has been seen in that zone.
// in_*: the walk is currently inside that zone.
00389 bool found_left = false;
00390 bool found_right = false;
00391 bool in_left = false;
00392 bool in_right = false;
00393 C_BLOB* blob = i->cblob();
00394 C_OUTLINE_IT o_it = blob->out_list();
00395 for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
00396 C_OUTLINE* outline = o_it.data();
00397 int length = outline->pathlength();
00398 ICOORD pos = outline->start_pos();
// Walk the outline one step at a time; pos is advanced by the step that is
// being consumed as the index is incremented.
00399 for (int step = 0; step < length; pos += outline->step(step++)) {
00400 int x = pos.x();
00401 int y = pos.y();
00402 if (x >= left_min && x < middle && !found_left) {
00403
// Inside the left zone: extend the tracked y range, or start a new run.
00404 if (in_left) {
00405 if (y > left_maxy) left_maxy = y;
00406 if (y < left_miny) left_miny = y;
00407 } else {
00408 left_maxy = left_miny = y;
00409 in_left = true;
00410 }
00411 } else if (in_left) {
00412
// Just left the left zone: test the height of the run that ended.
00413 if (left_maxy - left_miny > target_height) {
00414 if (found_right)
00415 return true;
00416 found_left = true;
00417 }
00418 in_left = false;
00419 }
00420 if (x <= right_max && x > middle && !found_right) {
00421
// Inside the right zone: same bookkeeping as for the left zone.
00422 if (in_right) {
00423 if (y > right_maxy) right_maxy = y;
00424 if (y < right_miny) right_miny = y;
00425 } else {
00426 right_maxy = right_miny = y;
00427 in_right = true;
00428 }
00429 } else if (in_right) {
00430
// Just left the right zone: test the height of the run that ended.
00431 if (right_maxy - right_miny > target_height) {
00432 if (found_left)
00433 return true;
00434 found_right = true;
00435 }
00436 in_right = false;
00437 }
00438 }
00439 }
// Never found a tall run on both sides.
00440 return false;
00441 }
00442
00453 static void vigorous_noise_removal(TO_BLOCK* block) {
00454 TO_ROW_IT row_it = block->get_rows ();
00455 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00456 TO_ROW* row = row_it.data();
00457 BLOBNBOX_IT b_it = row->blob_list();
00458
00459 int max_height = 0;
00460 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00461 BLOBNBOX* blob = b_it.data();
00462 if (blob->bounding_box().height() > max_height)
00463 max_height = blob->bounding_box().height();
00464 }
00465 STATS hstats(0, max_height + 1);
00466 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00467 BLOBNBOX* blob = b_it.data();
00468 int height = blob->bounding_box().height();
00469 if (height >= kMinSize)
00470 hstats.add(blob->bounding_box().height(), 1);
00471 }
00472 float xheight = hstats.median();
00473
00474 BLOBNBOX* prev = NULL;
00475 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
00476 BLOBNBOX* blob = b_it.data();
00477 const BOX& box = blob->bounding_box();
00478 if (box.height() < kNoiseSize * xheight) {
00479
00480 if (prev != NULL) {
00481 if (dot_of_i(blob, prev, row))
00482 continue;
00483 }
00484 if (!b_it.at_last()) {
00485 BLOBNBOX* next = b_it.data_relative(1);
00486 if (dot_of_i(blob, next, row))
00487 continue;
00488 }
00489
00490 if (blob->blob() != NULL)
00491 delete blob->blob();
00492 if (blob->cblob() != NULL)
00493 delete blob->cblob();
00494 delete b_it.extract();
00495 } else {
00496 prev = blob;
00497 }
00498 }
00499 }
00500 }
00501
/**
 * cleanup_rows
 *
 * Run the full row clean-up sequence on one block: fit parallel rows, delete
 * rows that fail the dropout test, expand the survivors, reassign blobs to
 * rows in three passes, optionally remove noise, separate underlines,
 * pre-associate blobs, fit spline rows and, depending on switches, compute
 * the block xheight and restore underlined blobs. The steps are strictly
 * ordered; each works on the lists left by the previous one.
 *
 * @param page_tr     passed to create_to_win(), expand_rows() and
 *                    pre_associate_blobs()
 * @param block       block being processed
 * @param gradient    gradient passed to all the fitting helpers
 * @param rotation    passed through to the helpers (used for display)
 * @param block_edge  passed through to the helpers
 * @param testing_on  enables debug display together with the
 *                    textord_show_* switches
 */
00507 void cleanup_rows(
00508 ICOORD page_tr,
00509 TO_BLOCK *block,
00510 float gradient,
00511 FCOORD rotation,
00512 INT32 block_edge,
00513 BOOL8 testing_on
00514 ) {
00515
00516 BLOBNBOX_IT blob_it = &block->blobs;
00517 TO_ROW_IT row_it = block->get_rows ();
00518
00519 #ifndef GRAPHICS_DISABLED
00520 if (textord_show_parallel_rows && testing_on) {
00521 if (to_win == NO_WINDOW)
00522 create_to_win(page_tr);
00523 }
00524 #endif
00525
// Fit the rows with the common gradient, drop the rows rejected by the
// dropout test, then expand the remaining rows.
00526 fit_parallel_rows(block,
00527 gradient,
00528 rotation,
00529 block_edge,
00530 textord_show_parallel_rows &&testing_on);
00531 delete_non_dropout_rows(block,
00532 gradient,
00533 rotation,
00534 block_edge,
00535 textord_show_parallel_rows &&testing_on);
00536 expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
// Hand every row's blob list back to block->blobs, then reassign (pass 1).
00537 blob_it.set_to_list (&block->blobs);
00538 row_it.set_to_list (block->get_rows ());
00539 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00540 blob_it.add_list_after (row_it.data ()->blob_list ());
00541
00542 assign_blobs_to_rows (block, &gradient, 1, TRUE, TRUE, FALSE);
00543
// Add the large blobs to the pool and reassign (pass 2).
00544 blob_it.set_to_list (&block->blobs);
00545 blob_it.add_list_after (&block->large_blobs);
00546 assign_blobs_to_rows (block, &gradient, 2, TRUE, TRUE, FALSE);
00547
// Add the noise and small blobs and reassign (pass 3); note the two flags
// are FALSE for this pass, unlike passes 1 and 2.
00548 blob_it.set_to_list (&block->blobs);
00549
00550 blob_it.add_list_after (&block->noise_blobs);
00551 blob_it.add_list_after (&block->small_blobs);
00552 assign_blobs_to_rows (block, &gradient, 3, FALSE, FALSE, FALSE);
00553
// Sort each row's blobs with blob_x_order and refit.
00554 row_it.set_to_list (block->get_rows ());
00555 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
00556 row_it.data ()->blob_list ()->sort (blob_x_order);
00557 fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
00558
// Optional aggressive noise removal, controlled by textord_heavy_nr.
00559 if (textord_heavy_nr) {
00560 vigorous_noise_removal(block);
00561 }
00562 separate_underlines(block, gradient, rotation, testing_on);
00563 pre_associate_blobs(page_tr, block, rotation, testing_on);
00564
00565 #ifndef GRAPHICS_DISABLED
00566 if (textord_show_final_rows && testing_on) {
00567 if (to_win == NO_WINDOW)
00568 create_to_win(page_tr);
00569 }
00570 #endif
00571
// Final parallel fit followed by the spline fit.
00572 fit_parallel_rows(block, gradient, rotation, block_edge, FALSE);
00573
00574 make_spline_rows(block,
00575 gradient,
00576 rotation,
00577 block_edge,
00578 textord_show_final_rows &&testing_on);
// The block xheight is computed here only when either the old xheight or
// the old baseline algorithm is switched off.
00579 if (!textord_old_xheight || !textord_old_baselines)
00580 compute_block_xheight(block, gradient);
// textord_restore_underlines is not defined in the visible part of this file.
00581 if (textord_restore_underlines)
00582
00583 restore_underlined_blobs(block);
00584 #ifndef GRAPHICS_DISABLED
// Debug display: whatever is still in block->blobs at this point is reported
// below as discarded noise.
00585 if (textord_show_final_rows && testing_on) {
00586 plot_blob_list (to_win, &block->blobs, MAGENTA, WHITE);
00587
00588 plot_blob_list (to_win, &block->underlines, YELLOW, CORAL);
00589 }
00590 if (textord_show_final_rows && testing_on && block->blobs.length () > 0)
00591 tprintf ("%d blobs discarded as noise\n", block->blobs.length ());
00592 if (textord_show_final_rows && testing_on) {
00593 draw_meanlines(block, gradient, block_edge, WHITE, rotation);
00594 }
00595 #endif
00596 }
00597
00598
00604 void delete_non_dropout_rows(
00605 TO_BLOCK *block,
00606 float gradient,
00607 FCOORD rotation,
00608 INT32 block_edge,
00609 BOOL8 testing_on
00610 ) {
00611 BOX block_box;
00612 INT32 *deltas;
00613 INT32 *occupation;
00614 INT32 max_y;
00615 INT32 min_y;
00616 INT32 line_index;
00617 INT32 line_count;
00618 INT32 distance;
00619 INT32 xleft;
00620 INT32 ybottom;
00621 TO_ROW *row;
00622 TO_ROW_IT row_it = block->get_rows ();
00623 BLOBNBOX_IT blob_it = &block->blobs;
00624
00625 #ifdef TEXT_VERBOSE
00626
00627 cprintf("n");
00628 #endif
00629 if (row_it.length () == 0)
00630 return;
00631 block_box = deskew_block_coords (block, gradient);
00632 xleft = block->block->bounding_box ().left ();
00633 ybottom = block->block->bounding_box ().bottom ();
00634 min_y = block_box.bottom () - 1;
00635 max_y = block_box.top () + 1;
00636 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00637 line_index = (INT32) floor (row_it.data ()->intercept ());
00638 if (line_index <= min_y)
00639 min_y = line_index - 1;
00640 if (line_index >= max_y)
00641 max_y = line_index + 1;
00642 }
00643 line_count = max_y - min_y + 1;
00644 if (line_count <= 0)
00645 return;
00646 deltas = (INT32 *) alloc_mem (line_count * sizeof (INT32));
00647 occupation = (INT32 *) alloc_mem (line_count * sizeof (INT32));
00648 if (deltas == NULL || occupation == NULL)
00649 MEMORY_OUT.error ("compute_line_spacing", ABORT, NULL);
00650
00651 compute_line_occupation(block, gradient, min_y, max_y, occupation, deltas);
00652 compute_occupation_threshold ((INT32)
00653 ceil (block->line_spacing *
00654 (textord_merge_desc +
00655 textord_merge_asc)),
00656 (INT32) ceil (block->line_spacing *
00657 (textord_merge_x +
00658 textord_merge_asc)),
00659 max_y - min_y + 1, occupation, deltas);
00660 #ifndef GRAPHICS_DISABLED
00661 if (testing_on) {
00662 draw_occupation(xleft, ybottom, min_y, max_y, occupation, deltas);
00663 }
00664 #endif
00665 compute_dropout_distances(occupation, deltas, line_count);
00666 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00667 row = row_it.data ();
00668 line_index = (INT32) floor (row->intercept ());
00669 distance = deltas[line_index - min_y];
00670 if (find_best_dropout_row (row, distance, block->line_spacing / 2,
00671 line_index, &row_it, testing_on)) {
00672 #ifndef GRAPHICS_DISABLED
00673 if (testing_on)
00674 plot_parallel_row(row, gradient, block_edge, WHITE, rotation);
00675 #endif
00676 blob_it.add_list_after (row_it.data ()->blob_list ());
00677 delete row_it.extract ();
00678 }
00679 }
00680 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00681 blob_it.add_list_after (row_it.data ()->blob_list ());
00682 }
00683
00684 free_mem(deltas);
00685 free_mem(occupation);
00686 }
00687
00688
/**
 * find_best_dropout_row
 *
 * Decide whether `row` should be deleted. The caller in this file deletes
 * the row when TRUE is returned.
 *
 * TRUE is returned when the absolute dropout distance exceeds dist_limit,
 * when a neighbouring row lies strictly between this row and the mirror
 * position line_index + 2 * distance, or when a neighbour at exactly this
 * line or exactly the mirror position is at least as believable as this
 * row. Otherwise FALSE is returned.
 *
 * @param row         row being tested
 * @param distance    signed dropout distance for the row's line
 * @param dist_limit  largest acceptable absolute distance
 * @param line_index  floor of the row's intercept
 * @param row_it      iterator positioned at `row`; used to look at neighbours
 * @param testing_on  print a trace of the decision
 * @return TRUE if the row should be deleted
 */
00697 BOOL8 find_best_dropout_row(
00698 TO_ROW *row,
00699 INT32 distance,
00700 float dist_limit,
00701 INT32 line_index,
00702 TO_ROW_IT *row_it,
00703 BOOL8 testing_on
00704 ) {
00705 INT32 next_index;
00706 INT32 row_offset;
00707 INT32 abs_dist;
00708 INT8 row_inc;
00709 TO_ROW *next_row;
00710
00711 if (testing_on)
00712 tprintf ("Row at %g(%g), dropout dist=%d,",
00713 row->intercept (), row->parallel_c (), distance);
// A negative distance searches forwards through the list, a non-negative
// one searches backwards.
00714 if (distance < 0) {
00715 row_inc = 1;
00716 abs_dist = -distance;
00717 }
00718 else {
00719 row_inc = -1;
00720 abs_dist = distance;
00721 }
00722 if (abs_dist > dist_limit) {
00723 if (testing_on) {
00724 tprintf (" too far - deleting\n");
00725 }
00726 return TRUE;
00727 }
// Only look at neighbours if there is one in the search direction.
// NOTE(review): when there is none and testing_on is set, the trace line
// started above is never terminated with a newline.
00728 if (distance < 0 && !row_it->at_last ()
00729 || distance >= 0 && !row_it->at_first ()) {
00730 row_offset = row_inc;
00731 do {
00732 next_row = row_it->data_relative (row_offset);
00733 next_index = (INT32) floor (next_row->intercept ());
// Neighbour strictly between this row and the mirror position
// line_index + 2 * distance: this row loses.
00734 if (distance < 0
00735 && next_index < line_index
00736 && next_index > line_index + distance + distance
00737 || distance >= 0
00738 && next_index > line_index
00739 && next_index < line_index + distance + distance) {
00740 if (testing_on) {
00741 tprintf (" nearer neighbour (%d) at %g\n",
00742 line_index + distance - next_index,
00743 next_row->intercept ());
00744 }
00745 return TRUE;
00746 }
// Neighbour exactly at this line or at the mirror position: the tie goes to
// the neighbour unless this row is strictly more believable.
00747 else if (next_index == line_index
00748 || next_index == line_index + distance + distance) {
00749 if (row->believability () <= next_row->believability ()) {
00750 if (testing_on) {
00751 tprintf (" equal but more believable at %g (%g/%g)\n",
00752 next_row->intercept (),
00753 row->believability (),
00754 next_row->believability ());
00755 }
00756 return TRUE;
00757 }
00758 }
00759 row_offset += row_inc;
00760 }
// Keep going only while neighbours tie with this row.
// NOTE(review): row_offset is negative when searching backwards, so the
// `row_offset < length()` bound cannot stop that direction - confirm that
// data_relative() tolerates this.
00761 while ((next_index == line_index
00762 || next_index == line_index + distance + distance)
00763 && row_offset < row_it->length ());
00764 if (testing_on)
00765 tprintf (" keeping\n");
00766 }
00767 return FALSE;
00768 }
00769
00770
00777 BOX deskew_block_coords(
00778 TO_BLOCK *block,
00779 float gradient
00780 ) {
00781 BOX result;
00782 BOX blob_box;
00783 FCOORD rotation;
00784 float length;
00785 TO_ROW_IT row_it = block->get_rows ();
00786 TO_ROW *row;
00787 BLOBNBOX *blob;
00788 BLOBNBOX_IT blob_it;
00789
00790 length = sqrt (gradient * gradient + 1);
00791 rotation = FCOORD (1 / length, -gradient / length);
00792 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00793 row = row_it.data ();
00794 blob_it.set_to_list (row->blob_list ());
00795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00796 blob_it.forward ()) {
00797 blob = blob_it.data ();
00798 blob_box = blob->bounding_box ();
00799 blob_box.rotate (rotation);
00800 result += blob_box;
00801 }
00802 }
00803 return result;
00804 }
00805
00806
00813 void compute_line_occupation(
00814 TO_BLOCK *block,
00815 float gradient,
00816 INT32 min_y,
00817 INT32 max_y,
00818 INT32 *occupation,
00819 INT32 *deltas
00820 ) {
00821 INT32 line_count;
00822 INT32 line_index;
00823 float top, bottom;
00824 INT32 width;
00825 INT32 idx;
00826 TO_ROW *row;
00827 TO_ROW_IT row_it = block->get_rows ();
00828 BLOBNBOX *blob;
00829 BLOBNBOX_IT blob_it;
00830 float length;
00831 BOX blob_box;
00832 FCOORD rotation;
00833
00834 line_count = max_y - min_y + 1;
00835 length = sqrt (gradient * gradient + 1);
00836 rotation = FCOORD (1 / length, -gradient / length);
00837 for (line_index = 0; line_index < line_count; line_index++)
00838 deltas[line_index] = 0;
00839 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00840 row = row_it.data ();
00841 blob_it.set_to_list (row->blob_list ());
00842 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00843 blob_it.forward ()) {
00844 blob = blob_it.data ();
00845 blob_box = blob->bounding_box ();
00846 blob_box.rotate (rotation);
00847 top = blob_box.top ();
00848 bottom = blob_box.bottom ();
00849 width =
00850 (INT32) floor ((FLOAT32) (blob_box.right () - blob_box.left ()));
00851 if ((INT32) floor (bottom) < min_y
00852 || (INT32) floor (bottom) - min_y >= line_count)
00853 fprintf (stderr,
00854 "Bad y coord of bottom, " INT32FORMAT "(" INT32FORMAT ","
00855 INT32FORMAT ")\n", (INT32) floor (bottom), min_y, max_y);
00856
00857
00858
00859 idx = (INT32) floor (bottom) - min_y;
00860 deltas[idx] += width;
00861 if ((INT32) floor (top) < min_y
00862 || (INT32) floor (top) - min_y >= line_count)
00863 fprintf (stderr,
00864 "Bad y coord of top, " INT32FORMAT "(" INT32FORMAT ","
00865 INT32FORMAT ")\n", (INT32) floor (top), min_y, max_y);
00866
00867 idx = (INT32) floor (top) - min_y;
00868 deltas[idx] -= width;
00869 }
00870 }
00871 occupation[0] = deltas[0];
00872 for (line_index = 1; line_index < line_count; line_index++)
00873 occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
00874 }
00875
00876
/**
 * compute_occupation_threshold
 *
 * Fill thresholds[0..line_count-1] from the occupation profile. Every
 * threshold has the form (sum - min_occ) / divisor + min_occ, where sum and
 * min_occ are the sum and minimum of occupation[] over a window and
 * divisor is ceil((low_window + high_window) / textord_occupancy_threshold).
 *
 * If the two windows together are shorter than line_count the window slides
 * along the profile; otherwise one sum and minimum over the whole profile
 * is used for every line.
 *
 * NOTE(review): divisor is 0 if low_window + high_window is 0, which would
 * make the divisions below divide by zero - confirm callers never pass that.
 *
 * @param low_window   number of lines in the lower part of the window
 * @param high_window  number of lines in the upper part of the window
 * @param line_count   number of entries in both arrays
 * @param occupation   input profile
 * @param thresholds   output thresholds
 */
00882 void compute_occupation_threshold(
00883 INT32 low_window,
00884 INT32 high_window,
00885 INT32 line_count,
00886 INT32 *occupation,
00887 INT32 *thresholds
00888 ) {
00889 INT32 line_index;
00890 INT32 low_index;
00891 INT32 high_index;
00892 INT32 sum;
00893 INT32 divisor;
00894 INT32 min_index;
00895 INT32 min_occ;
00896 INT32 test_index;
00897
00898 divisor =
00899 (INT32) ceil ((low_window + high_window) / textord_occupancy_threshold);
00900 if (low_window + high_window < line_count) {
// Prime the window: sum the first low_window + high_window entries;
// high_index ends up one past the window.
00901 for (sum = 0, high_index = 0; high_index < low_window; high_index++)
00902 sum += occupation[high_index];
00903 for (low_index = 0; low_index < high_window; low_index++, high_index++)
00904 sum += occupation[high_index];
// Minimum over the initial window; ties go to the later index.
00905 min_occ = occupation[0];
00906 min_index = 0;
00907 for (test_index = 1; test_index < high_index; test_index++) {
00908 if (occupation[test_index] <= min_occ) {
00909 min_occ = occupation[test_index];
00910 min_index = test_index;
00911 }
00912 }
// The first low_window lines all share the initial window's threshold.
00913 for (line_index = 0; line_index < low_window; line_index++)
00914 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00915
// Slide the window one line at a time, dropping occupation[low_index] and
// adding occupation[high_index].
00916 for (low_index = 0; high_index < line_count; low_index++, high_index++) {
00917 sum -= occupation[low_index];
00918 sum += occupation[high_index];
00919 if (occupation[high_index] <= min_occ) {
00920
00921 min_occ = occupation[high_index];
00922 min_index = high_index;
00923 }
00924
// The tracked minimum has slid out of the window: rescan
// low_index + 1 .. high_index for a new one.
00925 if (min_index <= low_index) {
00926 min_occ = occupation[low_index + 1];
00927 min_index = low_index + 1;
00928 for (test_index = low_index + 2; test_index <= high_index;
00929 test_index++) {
00930 if (occupation[test_index] <= min_occ) {
00931 min_occ = occupation[test_index];
00932
00933 min_index = test_index;
00934 }
00935 }
00936 }
00937 thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
00938 }
00939 }
00940 else {
// Window covers everything: one sum and one minimum for all lines.
00941 min_occ = occupation[0];
00942 min_index = 0;
00943 for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
00944 if (occupation[low_index] < min_occ) {
00945 min_occ = occupation[low_index];
00946 min_index = low_index;
00947 }
00948 sum += occupation[low_index];
00949 }
00950 line_index = 0;
00951 }
// Fill whatever lines remain with the last sum and minimum.
00952 for (; line_index < line_count; line_index++)
00953 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
00954
00955 }
00956
00957
/**
 * compute_dropout_distances
 *
 * Overwrite thresholds[] in place with a signed distance for every line.
 * Walking forward, each line first receives a decreasing negative count.
 * Whenever a line is reached whose occupation is at or above its threshold
 * while the previous line's occupation was below its (original) threshold,
 * the lines just before it are rewritten with positive distances 1, 2, ...
 * for as long as those are smaller than the magnitude of the negative
 * distance already stored, and the count restarts.
 *
 * The outer loop is a do-while, so thresholds[0] is always read and
 * written; line_count is expected to be at least 1.
 *
 * @param occupation  occupation of each line (read only)
 * @param thresholds  in: threshold per line; out: signed distance per line
 * @param line_count  number of entries in both arrays
 */
00963 void compute_dropout_distances(
00964 INT32 *occupation,
00965 INT32 *thresholds,
00966 INT32 line_count
00967 ) {
00968 INT32 line_index;
00969 INT32 distance;
00970 INT32 next_dist;
00971 INT32 back_index;
00972 INT32 prev_threshold;
00973
// Start far enough negative that the first back-fill is limited only by the
// start of the array.
00974 distance = -line_count;
00975 line_index = 0;
00976 do {
00977 do {
00978 distance--;
// Remember the original threshold before overwriting it: the loop test
// below needs it for the previous line.
00979 prev_threshold = thresholds[line_index];
00980
00981 thresholds[line_index] = distance;
00982 line_index++;
00983 }
// Stop at a line that is at/above threshold when the previous line was below.
00984 while (line_index < line_count
00985 && (occupation[line_index] < thresholds[line_index]
00986 || occupation[line_index - 1] >= prev_threshold));
00987 if (line_index < line_count) {
// Walk backwards writing positive distances; distance is raised in step so
// the back-fill stops where the two counts meet.
00988 back_index = line_index - 1;
00989 next_dist = 1;
00990 while (next_dist < -distance && back_index >= 0) {
00991 thresholds[back_index] = next_dist;
00992 back_index--;
00993 next_dist++;
00994 distance++;
00995 }
// Restart the count; the decrement at the top of the inner loop makes the
// transition line itself distance 0.
00996 distance = 1;
00997 }
00998 }
00999 while (line_index < line_count);
01000 }
01001
01002
/**
 * expand_rows
 *
 * Widen the y limits of each row towards a size derived from
 * block->line_size, merging neighbouring rows that fall entirely inside the
 * widened limits and stopping short of neighbours that only partly overlap.
 *
 * Before the expansion the row limits are reset with adjust_row_limits(),
 * blobs are reassigned (pass 4), the rows are refitted and row statistics
 * are computed (before or after the reassignment depending on
 * textord_new_initial_xheight, which is defined elsewhere). The function
 * returns early if the block has no rows.
 *
 * @param page_tr     passed to create_to_win() if a window must be created
 * @param block       block whose rows are expanded
 * @param gradient    gradient passed to the fitting and plotting helpers
 * @param rotation    used for debug plotting
 * @param block_edge  passed to the fitting and plotting helpers
 * @param testing_on  enables debug display together with
 *                    textord_show_expanded_rows
 */
01015 void expand_rows(
01016 ICOORD page_tr,
01017 TO_BLOCK *block,
01018 float gradient,
01019 FCOORD rotation,
01020 INT32 block_edge,
01021 BOOL8 testing_on
01022 ) {
01023 BOOL8 swallowed_row;
01024 float y_max, y_min;
01025 float y_bottom, y_top;
01026 TO_ROW *test_row;
01027 TO_ROW *row;
01028
01029 BLOBNBOX_IT blob_it = &block->blobs;
01030 TO_ROW_IT row_it = block->get_rows ();
01031
01032 #ifdef TEXT_VERBOSE
01033
01034 cprintf("x");
01035 #endif
01036 #ifndef GRAPHICS_DISABLED
01037 if (textord_show_expanded_rows && testing_on) {
01038 if (to_win == NO_WINDOW)
01039 create_to_win(page_tr);
01040 }
01041 #endif
01042
01043 adjust_row_limits(block);
01044 if (textord_new_initial_xheight) {
01045 if (block->get_rows ()->length () == 0)
01046 return;
01047 compute_row_stats(block, textord_show_expanded_rows &&testing_on);
01048 }
01049 assign_blobs_to_rows (block, &gradient, 4, TRUE, FALSE, FALSE);
01050
01051 if (block->get_rows ()->length () == 0)
01052 return;
01053 fit_parallel_rows(block,
01054 gradient,
01055 rotation,
01056 block_edge,
01057 textord_show_expanded_rows &&testing_on);
01058 if (!textord_new_initial_xheight)
01059 compute_row_stats(block, textord_show_expanded_rows &&testing_on);
// Visit the rows from the last one backwards until the iterator is at the
// last row again.
01060 row_it.move_to_last ();
01061 do {
01062 row = row_it.data ();
// Current limits of the row and the limits it would like to have.
01063 y_max = row->max_y ();
01064 y_min = row->min_y ();
01065 y_bottom = row->intercept () - block->line_size * textord_merge_desc;
01066 y_top = row->intercept () + block->line_size
01067 * (textord_merge_x + textord_merge_asc);
// Expand downwards if the row does not yet reach its desired bottom.
01068 if (y_min > y_bottom) {
01069
01070 swallowed_row = TRUE;
01071 while (swallowed_row && !row_it.at_last ()) {
01072 swallowed_row = FALSE;
01073
// Look at the next row in the list.
01074 test_row = row_it.data_relative (1);
01075
01076 if (test_row->max_y () > y_bottom) {
01077 if (test_row->min_y () > y_bottom) {
// The neighbour lies entirely above the desired bottom: take its blobs,
// delete it, step back onto the current row and try the next neighbour.
01078 row_it.forward ();
01079 #ifndef GRAPHICS_DISABLED
01080 if (textord_show_expanded_rows && testing_on)
01081 plot_parallel_row(test_row,
01082 gradient,
01083 block_edge,
01084 WHITE,
01085 rotation);
01086 #endif
01087 blob_it.set_to_list (row->blob_list ());
01088 blob_it.add_list_after (test_row->blob_list ());
01089
01090 delete row_it.extract ();
01091 row_it.backward ();
01092 swallowed_row = TRUE;
01093 }
// Partial overlap: stop at the neighbour's top if that is below this row's
// current bottom, otherwise do not expand downwards at all.
01094 else if (test_row->max_y () < y_min)
01095
01096 y_bottom = test_row->max_y ();
01097 else
01098 y_bottom = y_min;
01099 }
01100 }
01101 y_min = y_bottom;
01102 }
// Expand upwards in the same way, using the previous row in the list.
01103 if (y_max < y_top) {
01104 swallowed_row = TRUE;
01105 while (swallowed_row && !row_it.at_first ()) {
01106 swallowed_row = FALSE;
01107
01108 test_row = row_it.data_relative (-1);
01109 if (test_row->min_y () < y_top) {
01110 if (test_row->max_y () < y_top) {
// The neighbour lies entirely below the desired top: take its blobs,
// delete it and step forward onto the current row again.
01111 row_it.backward ();
01112 blob_it.set_to_list (row->blob_list ());
01113 #ifndef GRAPHICS_DISABLED
01114 if (textord_show_expanded_rows && testing_on)
01115 plot_parallel_row(test_row,
01116 gradient,
01117 block_edge,
01118 WHITE,
01119 rotation);
01120 #endif
01121 blob_it.add_list_after (test_row->blob_list ());
01122
01123 delete row_it.extract ();
01124 row_it.forward ();
01125 swallowed_row = TRUE;
01126 }
// NOTE(review): this test compares with `<`, whereas the matching test in
// the downward case compares the neighbour against this row's own limit the
// other way round; it may have been meant as `>` - confirm before changing.
01127 else if (test_row->min_y () < y_max)
01128
01129 y_top = test_row->min_y ();
01130 else
01131 y_top = y_max;
01132
01133 }
01134 }
01135 y_max = y_top;
01136 }
01137
// Store the (possibly widened) limits and move to the previous row.
01138 row->set_limits (y_min, y_max);
01139 row_it.backward ();
01140 }
01141 while (!row_it.at_last ());
01142 }
01143
01144
01150 void adjust_row_limits(
01151 TO_BLOCK *block
01152 ) {
01153 TO_ROW *row;
01154 float size;
01155 float ymax;
01156 float ymin;
01157 TO_ROW_IT row_it = block->get_rows ();
01158
01159 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01160 row = row_it.data ();
01161 size = row->max_y () - row->min_y ();
01162 size /= textord_merge_x + textord_merge_asc + textord_merge_desc;
01163 ymax = size * (textord_merge_x + textord_merge_asc);
01164 ymin = -size * textord_merge_desc;
01165 row->set_limits (row->intercept () + ymin, row->intercept () + ymax);
01166 row->merged = FALSE;
01167 }
01168 }
01169
01170
01176 void compute_row_stats(
01177 TO_BLOCK *block,
01178 BOOL8 testing_on
01179 ) {
01180 INT32 row_index;
01181 TO_ROW *row;
01182 TO_ROW *prev_row;
01183 float iqr;
01184 TO_ROW_IT row_it = block->get_rows ();
01185
01186 INT16 rowcount = row_it.length ();
01187 TO_ROW **rows;
01188
01189 rows = (TO_ROW **) alloc_mem (rowcount * sizeof (TO_ROW *));
01190 if (rows == NULL)
01191 MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
01192 rowcount = 0;
01193 prev_row = NULL;
01194 row_it.move_to_last ();
01195 do {
01196 row = row_it.data ();
01197 if (prev_row != NULL) {
01198 rows[rowcount++] = prev_row;
01199 prev_row->spacing = row->intercept () - prev_row->intercept ();
01200 if (testing_on)
01201 tprintf ("Row at %g yields spacing of %g\n",
01202 row->intercept (), prev_row->spacing);
01203 }
01204 prev_row = row;
01205 row_it.backward ();
01206 }
01207 while (!row_it.at_last ());
01208 block->key_row = prev_row;
01209 block->baseline_offset =
01210 fmod (prev_row->parallel_c (), block->line_spacing);
01211 if (testing_on)
01212 tprintf ("Blob based spacing=(%g,%g), offset=%g",
01213 block->line_size, block->line_spacing, block->baseline_offset);
01214 if (rowcount > 0) {
01215 row_index = choose_nth_item (rowcount * 3 / 4, rows, rowcount,
01216 sizeof (TO_ROW *), row_spacing_order);
01217 iqr = rows[row_index]->spacing;
01218 row_index = choose_nth_item (rowcount / 4, rows, rowcount,
01219 sizeof (TO_ROW *), row_spacing_order);
01220 iqr -= rows[row_index]->spacing;
01221 row_index = choose_nth_item (rowcount / 2, rows, rowcount,
01222 sizeof (TO_ROW *), row_spacing_order);
01223 block->key_row = rows[row_index];
01224 if (testing_on)
01225 tprintf (" row based=%g(%g)", rows[row_index]->spacing, iqr);
01226 if (rowcount > 2
01227 && iqr < rows[row_index]->spacing * textord_linespace_iqrlimit) {
01228 if (!textord_new_initial_xheight) {
01229 if (rows[row_index]->spacing < block->line_spacing
01230 && rows[row_index]->spacing > block->line_size)
01231
01232 block->line_size = rows[row_index]->spacing;
01233
01234 else if (rows[row_index]->spacing > block->line_spacing)
01235 block->line_size = block->line_spacing;
01236
01237 }
01238 else {
01239 if (rows[row_index]->spacing < block->line_spacing)
01240 block->line_size = rows[row_index]->spacing;
01241 else
01242 block->line_size = block->line_spacing;
01243
01244 }
01245 if (block->line_size < textord_min_xheight)
01246 block->line_size = (float) textord_min_xheight;
01247 block->line_spacing = rows[row_index]->spacing;
01248 block->max_blob_size =
01249 block->line_spacing * textord_excess_blobsize;
01250 }
01251 block->baseline_offset = fmod (rows[row_index]->intercept (),
01252 block->line_spacing);
01253 }
01254 if (testing_on)
01255 tprintf ("\nEstimate line size=%g, spacing=%g, offset=%g\n",
01256 block->line_size, block->line_spacing, block->baseline_offset);
01257 free_mem(rows);
01258 }
01259
01260
01267 void compute_block_xheight(
01268 TO_BLOCK *block,
01269 float gradient
01270 ) {
01271 TO_ROW *row;
01272 int xh_count, desc_count;
01273 float block_median;
01274 int asc_count, cap_count;
01275 INT32 min_size, max_size;
01276 INT32 evidence;
01277 float xh_sum, desc_sum;
01278 float asc_sum, cap_sum;
01279 TO_ROW_IT row_it = block->get_rows ();
01280 STATS row_heights;
01281
01282 if (row_it.empty ())
01283 return;
01284 block_median = median_block_xheight (block, gradient);
01285 block_median *= 2;
01286 if (block_median < block->line_size)
01287 block_median = block->line_size;
01288
01289
01290 max_size = (INT32) ceil (block_median);
01291 min_size = (INT32) floor (block_median * textord_minxh);
01292 row_heights.set_range (min_size, max_size + 1);
01293 xh_count = desc_count = asc_count = cap_count = 0;
01294 xh_sum = desc_sum = asc_sum = cap_sum = 0.0f;
01295 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01296 row = row_it.data ();
01297 evidence = compute_row_xheight (row, min_size, max_size, gradient);
01298 if (row->xheight > 0 && row->ascrise > 0) {
01299 row_heights.add ((INT32) row->xheight, evidence);
01300 xh_count += evidence;
01301 asc_sum += row->ascrise;
01302 asc_count++;
01303 }
01304 else if (row->xheight > 0) {
01305 cap_sum += row->xheight;
01306 cap_count++;
01307 }
01308 if (row->descdrop != 0) {
01309 desc_sum += row->descdrop;
01310 desc_count++;
01311 }
01312 }
01313 if (xh_count > 0) {
01314
01315 xh_sum = row_heights.ile (0.5);
01316 asc_sum /= asc_count;
01317 }
01318 else if (cap_count > 0) {
01319 cap_sum /= cap_count;
01320 xh_sum =
01321 cap_sum * textord_merge_x / (textord_merge_x + textord_merge_asc);
01322 asc_sum =
01323 cap_sum * textord_merge_asc / (textord_merge_x + textord_merge_asc);
01324 }
01325 else {
01326
01327 xh_sum = block_median * textord_merge_x;
01328 asc_sum = block_median * textord_merge_asc;
01329 }
01330 if (desc_count > 0) {
01331 desc_sum /= desc_count;
01332 }
01333 else {
01334 desc_sum = xh_sum * textord_merge_desc / textord_merge_x;
01335 }
01336
01337
01338
01339 if (xh_sum < textord_min_xheight)
01340 xh_sum = (float) textord_min_xheight;
01341 block->xheight = xh_sum;
01342 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01343 correct_row_xheight (row_it.data (), xh_sum, asc_sum, desc_sum);
01344 }
01345 }
01346
01347
01353 float median_block_xheight(
01354 TO_BLOCK *block,
01355 float gradient
01356 ) {
01357 TO_ROW *row;
01358 float result;
01359 float xcentre;
01360 TO_ROW_IT row_it = block->get_rows ();
01361 BLOBNBOX_IT blob_it;
01362 BLOBNBOX *blob;
01363 float *heights;
01364 INT32 blob_count;
01365 INT32 blob_index;
01366
01367 blob_count = 0;
01368 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ())
01369 blob_count += row_it.data ()->blob_list ()->length ();
01370 heights = (float *) alloc_mem (blob_count * sizeof (float));
01371 if (heights == NULL)
01372 MEMORY_OUT.error ("compute_row_stats", ABORT, NULL);
01373
01374 blob_index = 0;
01375 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01376 row = row_it.data ();
01377 blob_it.set_to_list (row->blob_list ());
01378 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01379 blob_it.forward ()) {
01380 blob = blob_it.data ();
01381 if (!blob->joined_to_prev ()) {
01382 xcentre =
01383 (blob->bounding_box ().left () +
01384 blob->bounding_box ().right ()) / 2.0f;
01385 heights[blob_index] =
01386 blob->bounding_box ().top () - gradient * xcentre -
01387 row->parallel_c ();
01388 if (heights[blob_index] > 0)
01389 blob_index++;
01390 }
01391 }
01392 }
01393 ASSERT_HOST (blob_index > 0);
01394 blob_count = blob_index;
01395 blob_index = choose_nth_item (blob_count / 2, heights, blob_count);
01396 result = heights[blob_index];
01397 free_mem(heights);
01398 return result;
01399 }
01400
01401
01408 INT32 compute_row_xheight(
01409 TO_ROW *row,
01410 INT32 min_height,
01411 INT32 max_height,
01412 float gradient
01413 ) {
01414 BOOL8 in_best_pile;
01415 INT32 prev_size;
01416 float xcentre;
01417 float height;
01418 BLOBNBOX_IT blob_it = row->blob_list ();
01419 BLOBNBOX *blob;
01420 INT32 blob_count;
01421 INT32 x;
01422 INT32 asc;
01423 INT32 blob_index;
01424 INT32 mode_count;
01425 INT32 best_count;
01426 float ratio;
01427 INT32 modes[MAX_HEIGHT_MODES];
01428 STATS heights (min_height, max_height + 1);
01429
01430 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01431 blob = blob_it.data ();
01432 if (!blob->joined_to_prev ()) {
01433 xcentre =
01434 (blob->bounding_box ().left () +
01435 blob->bounding_box ().right ()) / 2.0f;
01436 height = blob->bounding_box ().top ();
01437 if (textord_fix_xheight_bug)
01438 height -= row->baseline.y (xcentre);
01439 else
01440 height -= gradient * xcentre + row->parallel_c ();
01441 if (height >= min_height && height <= max_height
01442 && (!textord_xheight_tweak || height > textord_min_xheight))
01443 heights.add ((INT32) floor (height + 0.5), 1);
01444 }
01445 }
01446 blob_index = heights.mode ();
01447
01448 blob_count = heights.pile_count (blob_index);
01449 if (textord_debug_xheights)
01450 tprintf ("min_height=%d, max_height=%d, mode=%d, count=%d, total=%d,%d\n",
01451 min_height, max_height, blob_index, blob_count,
01452 heights.get_total (), row->blob_list ()->length ());
01453 row->ascrise = 0.0f;
01454 row->xheight = 0.0f;
01455 row->descdrop = 0.0f;
01456 in_best_pile = FALSE;
01457 prev_size = -MAX_INT32;
01458 best_count = 0;
01459 if (blob_count > 0) {
01460
01461 mode_count = compute_height_modes (&heights, min_height, max_height, modes, MAX_HEIGHT_MODES);
01462 for (x = 0; x < mode_count - 1; x++) {
01463 if (modes[x] != prev_size + 1)
01464 in_best_pile = FALSE;
01465 if (heights.pile_count (modes[x])
01466 >= blob_count * textord_xheight_mode_fraction
01467 && (in_best_pile || heights.pile_count (modes[x]) > best_count)) {
01468 for (asc = x + 1; asc < mode_count; asc++) {
01469 ratio = (float) modes[asc] / modes[x];
01470 if (textord_ascx_ratio_min < ratio
01471 && ratio < textord_ascx_ratio_max
01472 && heights.pile_count (modes[asc])
01473 >= blob_count * textord_ascheight_mode_fraction) {
01474 if (heights.pile_count (modes[x]) > best_count) {
01475 in_best_pile = TRUE;
01476 best_count = heights.pile_count (modes[x]);
01477 }
01478
01479
01480
01481
01482 prev_size = modes[x];
01483 row->xheight = (float) modes[x];
01484 row->ascrise = (float) (modes[asc] - modes[x]);
01485 }
01486 }
01487 }
01488 }
01489 if (row->xheight == 0) {
01490
01491 row->xheight = (float) blob_index;
01492 row->ascrise = 0.0f;
01493 if (textord_debug_xheights)
01494 tprintf ("Single mode xheight set to %g\n", row->xheight);
01495 }
01496 else if (textord_debug_xheights)
01497 tprintf ("Multi-mode xheight set to %g, asc=%g\n",
01498 row->xheight, row->ascrise);
01499 row->descdrop = (float) compute_row_descdrop (row, gradient);
01500
01501 }
01502 return best_count;
01503 }
01504
01505
01511 INT32 compute_row_descdrop(
01512 TO_ROW *row,
01513 float gradient
01514 ) {
01515 INT32 min_height = (INT32) floor (row->xheight * textord_descx_ratio_min);
01516 INT32 max_height = (INT32) floor (row->xheight * textord_descx_ratio_max);
01517 float xcentre;
01518 float height;
01519 BLOBNBOX_IT blob_it = row->blob_list ();
01520 BLOBNBOX *blob;
01521 INT32 blob_count;
01522 INT32 blob_index;
01523 STATS heights (min_height, max_height + 1);
01524
01525 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01526 blob = blob_it.data ();
01527 if (!blob->joined_to_prev ()) {
01528 xcentre =
01529 (blob->bounding_box ().left () +
01530 blob->bounding_box ().right ()) / 2.0f;
01531 height =
01532 gradient * xcentre + row->parallel_c () -
01533 blob->bounding_box ().bottom ();
01534 if (height >= min_height && height <= max_height)
01535 heights.add ((INT32) floor (height + 0.5), 1);
01536 }
01537 }
01538 blob_index = heights.mode ();
01539
01540 blob_count = heights.pile_count (blob_index);
01541 return blob_count > 0 ? -blob_index : 0;
01542 }
01543
01544
01557 INT32 compute_height_modes(
01558 STATS *heights,
01559 INT32 min_height,
01560 INT32 max_height,
01561 INT32 *modes,
01562 INT32 maxmodes
01563 ) {
01564 INT32 pile_count;
01565 INT32 src_count;
01566 INT32 src_index;
01567 INT32 least_count;
01568 INT32 least_index;
01569 INT32 dest_count;
01570
01571 src_count = max_height + 1 - min_height;
01572 dest_count = 0;
01573 least_count = MAX_INT32;
01574 least_index = -1;
01575 for (src_index = 0; src_index < src_count; src_index++) {
01576 pile_count = heights->pile_count (min_height + src_index);
01577 if (pile_count > 0) {
01578 if (dest_count < maxmodes) {
01579 if (pile_count < least_count) {
01580
01581 least_count = pile_count;
01582 least_index = dest_count;
01583 }
01584 modes[dest_count++] = min_height + src_index;
01585 }
01586 else if (pile_count >= least_count) {
01587 while (least_index < maxmodes - 1) {
01588 modes[least_index] = modes[least_index + 1];
01589
01590 least_index++;
01591 }
01592
01593 modes[maxmodes - 1] = min_height + src_index;
01594 if (pile_count == least_count) {
01595
01596 least_index = maxmodes - 1;
01597 }
01598 else {
01599 least_count = heights->pile_count (modes[0]);
01600 least_index = 0;
01601 for (dest_count = 1; dest_count < maxmodes; dest_count++) {
01602 pile_count = heights->pile_count (modes[dest_count]);
01603 if (pile_count < least_count) {
01604
01605 least_count = pile_count;
01606 least_index = dest_count;
01607 }
01608 }
01609 }
01610 }
01611 }
01612 }
01613 return dest_count;
01614 }
01615
01616
01623 void correct_row_xheight(
01624 TO_ROW *row,
01625 float xheight,
01626 float ascrise,
01627 float descdrop) {
01628 if (textord_row_xheights) {
01629 if (row->xheight <= 0)
01630 row->xheight = xheight;
01631 if (row->ascrise < row->xheight * (textord_ascx_ratio_min - 1)) {
01632 if (row->xheight >= xheight * (1 - textord_xheight_error_margin)
01633 && row->xheight <= xheight * (1 + textord_xheight_error_margin)) {
01634 row->all_caps = FALSE;
01635 row->ascrise = ascrise;
01636 }
01637 else if (row->xheight >=
01638 (xheight + ascrise) * (1 - textord_xheight_error_margin)
01639 && row->xheight <=
01640 (xheight + ascrise) * (1 + textord_xheight_error_margin)) {
01641 row->all_caps = TRUE;
01642
01643 row->ascrise = row->xheight - xheight;
01644 row->xheight = xheight;
01645 }
01646 else {
01647 row->all_caps = TRUE;
01648 row->ascrise = row->xheight * ascrise / (xheight + ascrise);
01649 row->xheight -= row->ascrise;
01650 }
01651 }
01652 else
01653 row->all_caps = FALSE;
01654 row->ascrise = ascrise;
01655 if (row->descdrop >= -row->xheight * (textord_ascx_ratio_min - 1))
01656 row->descdrop = descdrop;
01657 }
01658 else {
01659 if (row->xheight < xheight * (1 - textord_xheight_error_margin)
01660 || row->xheight > xheight * (1 + textord_xheight_error_margin))
01661 row->xheight = xheight;
01662 row->all_caps = row->ascrise <= 0;
01663 if (row->ascrise < ascrise * (1 - textord_xheight_error_margin)
01664 || row->ascrise > ascrise * (1 + textord_xheight_error_margin))
01665 row->ascrise = ascrise;
01666 if (row->descdrop < descdrop * (1 - textord_xheight_error_margin)
01667 || row->descdrop > descdrop * (1 + textord_xheight_error_margin))
01668 row->descdrop = descdrop;
01669 }
01670 }
01671
01672
01679 void separate_underlines(
01680 TO_BLOCK *block,
01681 float gradient,
01682 FCOORD rotation,
01683 BOOL8 testing_on
01684 ) {
01685 BLOBNBOX *blob;
01686 PBLOB *poly_blob;
01687 C_BLOB *rotated_blob;
01688 TO_ROW *row;
01689 float length;
01690 BOX blob_box;
01691 FCOORD blob_rotation;
01692 FCOORD g_vec;
01693 BLOBNBOX_IT blob_it;
01694
01695 BLOBNBOX_IT under_it = &block->underlines;
01696 TO_ROW_IT row_it = block->get_rows ();
01697
01698 #ifdef TEXT_VERBOSE
01699
01700 cprintf("u");
01701 #endif
01702
01703 length = sqrt (1 + gradient * gradient);
01704 g_vec = FCOORD (1 / length, -gradient / length);
01705 blob_rotation = FCOORD (rotation.x (), -rotation.y ());
01706 blob_rotation.rotate (g_vec);
01707 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01708 row = row_it.data ();
01709
01710 blob_it.set_to_list (row->blob_list ());
01711 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01712 blob_it.forward ()) {
01713 blob = blob_it.data ();
01714 blob_box = blob->bounding_box ();
01715 if (blob_box.width () > block->line_size * textord_underline_width) {
01716 if (textord_cblob_blockocc && blob->cblob () != NULL) {
01717 rotated_blob = crotate_cblob (blob->cblob (),
01718 blob_rotation);
01719 if (test_underline (testing_on && textord_show_final_rows,
01720 rotated_blob, (INT16) row->intercept (),
01721 (INT16) (block->line_size *
01722 (textord_merge_x +
01723 textord_merge_asc / 2.0f)))) {
01724 under_it.add_after_then_move (blob_it.extract ());
01725 if (testing_on && textord_show_final_rows) {
01726 tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
01727 rotated_blob->bounding_box ().left (),
01728 rotated_blob->bounding_box ().bottom (),
01729 rotated_blob->bounding_box ().right (),
01730 rotated_blob->bounding_box ().top ());
01731 tprintf ("(Was (%d,%d)->(%d,%d))\n",
01732 blob_box.left (), blob_box.bottom (),
01733 blob_box.right (), blob_box.top ());
01734 }
01735 }
01736 delete rotated_blob;
01737 }
01738 else {
01739 if (blob->blob () != NULL) {
01740
01741
01742
01743 poly_blob = rotate_blob (blob->blob (), blob_rotation);
01744 }
01745 else
01746 poly_blob = rotate_cblob (blob->cblob (),
01747 block->line_size,
01748 blob_rotation);
01749 if (test_underline
01750 (testing_on
01751 && textord_show_final_rows, poly_blob,
01752 row->intercept (),
01753 block->line_size * (textord_merge_x +
01754 textord_merge_asc / 2))) {
01755 if (testing_on && textord_show_final_rows) {
01756 tprintf ("Underlined blob at (%d,%d)->(%d,%d) ",
01757 poly_blob->bounding_box ().left (),
01758 poly_blob->bounding_box ().bottom (),
01759 poly_blob->bounding_box ().right (),
01760 poly_blob->bounding_box ().top ());
01761 tprintf ("(Was (%d,%d)->(%d,%d))\n",
01762 blob_box.left (), blob_box.bottom (),
01763 blob_box.right (), blob_box.top ());
01764 }
01765 under_it.add_after_then_move (blob_it.extract ());
01766 }
01767 delete poly_blob;
01768 }
01769 }
01770 }
01771 }
01772 }
01773
01774
01780 void pre_associate_blobs(
01781 ICOORD page_tr,
01782 TO_BLOCK *block,
01783 FCOORD rotation,
01784 BOOL8 testing_on
01785 ) {
01786 #ifndef GRAPHICS_DISABLED
01787 COLOUR colour;
01788 #endif
01789 INT16 overlap;
01790 BLOBNBOX *blob;
01791 BLOBNBOX *nextblob;
01792 BOX blob_box;
01793 BOX next_box;
01794 FCOORD blob_rotation;
01795 BLOBNBOX_IT blob_it;
01796 BLOBNBOX_IT start_it;
01797 TO_ROW_IT row_it = block->get_rows ();
01798
01799 #ifndef GRAPHICS_DISABLED
01800 colour = RED;
01801 #endif
01802
01803 blob_rotation = FCOORD (rotation.x (), -rotation.y ());
01804 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01805
01806 blob_it.set_to_list (row_it.data ()->blob_list ());
01807 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01808 blob_it.forward ()) {
01809 blob = blob_it.data ();
01810 blob_box = blob->bounding_box ();
01811 start_it = blob_it;
01812
01813
01814
01815
01816
01817
01818
01819 do {
01820 if (!blob_it.at_last ()) {
01821 nextblob = blob_it.data_relative (1);
01822 next_box = nextblob->bounding_box ();
01823 overlap = next_box.width ();
01824 if (blob_box.left () > next_box.left ())
01825 overlap -= blob_box.left () - next_box.left ();
01826 if (blob_box.right () < next_box.right ())
01827 overlap -= next_box.right () - blob_box.right ();
01828 if (overlap >= next_box.width () / 2
01829 || overlap >= blob_box.width () / 2) {
01830
01831 blob->merge (nextblob);
01832
01833 blob_box = blob->bounding_box ();
01834 blob_it.forward ();
01835 }
01836 else
01837 overlap = -1;
01838 }
01839 else
01840 overlap = -1;
01841 }
01842 while (overlap >= 0);
01843 blob->chop (&start_it, &blob_it,
01844 blob_rotation,
01845 block->line_size * textord_merge_x *
01846 textord_chop_width);
01847
01848 }
01849 #ifndef GRAPHICS_DISABLED
01850 if (testing_on && textord_show_final_blobs) {
01851 if (to_win == NO_WINDOW)
01852 create_to_win(page_tr);
01853 perimeter_color_index(to_win, colour);
01854 interior_style(to_win, INT_HOLLOW, TRUE);
01855 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
01856 blob_it.forward ()) {
01857 blob = blob_it.data ();
01858 blob_box = blob->bounding_box ();
01859 blob_box.rotate (rotation);
01860 if (!blob->joined_to_prev ()) {
01861 rectangle (to_win, blob_box.left (), blob_box.bottom (),
01862 blob_box.right (), blob_box.top ());
01863 }
01864 }
01865 colour = (COLOUR) (colour + 1);
01866 if (colour > MAGENTA)
01867 colour = RED;
01868 }
01869 #endif
01870 }
01871 }
01872
01873
01879 void fit_parallel_rows(
01880 TO_BLOCK *block,
01881 float gradient,
01882 FCOORD rotation,
01883 INT32 block_edge,
01884 BOOL8 testing_on
01885 ) {
01886 #ifndef GRAPHICS_DISABLED
01887 COLOUR colour;
01888 #endif
01889 TO_ROW_IT row_it = block->get_rows ();
01890
01891 row_it.move_to_first ();
01892 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01893 if (row_it.data ()->blob_list ()->empty ())
01894 delete row_it.extract ();
01895 else
01896 fit_parallel_lms (gradient, row_it.data ());
01897 }
01898 #ifndef GRAPHICS_DISABLED
01899 if (testing_on) {
01900 colour = RED;
01901 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01902 plot_parallel_row (row_it.data (), gradient,
01903 block_edge, colour, rotation);
01904 colour = (COLOUR) (colour + 1);
01905 if (colour > MAGENTA)
01906 colour = RED;
01907 }
01908 }
01909 #endif
01910 row_it.sort (row_y_order);
01911 }
01912
01913
01920 void fit_parallel_lms(
01921 float gradient,
01922 TO_ROW *row
01923 ) {
01924 float c;
01925 int blobcount;
01926 BOX box;
01927 LMS lms (row->blob_list ()->length ());
01928
01929 BLOBNBOX_IT blob_it = row->blob_list ();
01930
01931 #ifdef TEXT_VERBOSE
01932
01933 cprintf("m");
01934 #endif
01935 blobcount = 0;
01936 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01937 if (!blob_it.data ()->joined_to_prev ()) {
01938 box = blob_it.data ()->bounding_box ();
01939 lms.
01940 add (FCOORD ((box.left () + box.right ()) / 2.0, box.bottom ()));
01941 blobcount++;
01942 }
01943 }
01944 lms.constrained_fit (gradient, c);
01945 row->set_parallel_line (gradient, c, lms.error ());
01946 if (textord_straight_baselines && blobcount > lms_line_trials) {
01947 lms.fit (gradient, c);
01948 }
01949
01950 row->set_line (gradient, c, lms.error ());
01951 }
01952
01953
01959 void make_spline_rows(
01960 TO_BLOCK *block,
01961 float gradient,
01962 FCOORD rotation,
01963 INT32 block_edge,
01964 BOOL8 testing_on
01965 ) {
01966 COLOUR colour;
01967 TO_ROW_IT row_it = block->get_rows ();
01968
01969 row_it.move_to_first ();
01970 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01971 if (row_it.data ()->blob_list ()->empty ())
01972 delete row_it.extract ();
01973 else
01974 make_baseline_spline (row_it.data (), block);
01975 }
01976 if (textord_old_baselines) {
01977 #ifndef GRAPHICS_DISABLED
01978 if (testing_on) {
01979 colour = RED;
01980 for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
01981 row_it.forward ()) {
01982 row_it.data ()->baseline.plot (to_win, colour);
01983 colour = (COLOUR) (colour + 1);
01984 if (colour > MAGENTA)
01985 colour = RED;
01986 }
01987 }
01988 #endif
01989 make_old_baselines(block, testing_on);
01990 }
01991 #ifndef GRAPHICS_DISABLED
01992 if (testing_on) {
01993 colour = RED;
01994 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
01995 row_it.data ()->baseline.plot (to_win, colour);
01996 colour = (COLOUR) (colour + 1);
01997 if (colour > MAGENTA)
01998 colour = RED;
01999 }
02000 }
02001 #endif
02002 }
02003
02004
02011 void make_baseline_spline(
02012 TO_ROW *row,
02013 TO_BLOCK *block
02014 ) {
02015 float b, c;
02016 float middle;
02017 BOX box;
02018 LMS lms (row->blob_list ()->length ());
02019
02020 BLOBNBOX_IT blob_it = row->blob_list ();
02021 INT32 *xstarts;
02022 double *coeffs;
02023 INT32 segments;
02024 INT32 segment;
02025
02026 xstarts =
02027 (INT32 *) alloc_mem ((row->blob_list ()->length () + 1) * sizeof (INT32));
02028 if (segment_baseline (row, block, segments, xstarts)
02029 && !textord_straight_baselines && !textord_parallel_baselines) {
02030 if (textord_quadratic_baselines) {
02031 coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
02032 for (segment = 0; segment < segments; segment++) {
02033 lms.clear ();
02034 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
02035 blob_it.forward ()) {
02036 if (!blob_it.data ()->joined_to_prev ()) {
02037 box = blob_it.data ()->bounding_box ();
02038 middle = (box.left () + box.right ()) / 2.0;
02039 if (middle >= xstarts[segment]
02040 && middle < xstarts[segment + 1]) {
02041 lms.add (FCOORD (middle, box.bottom ()));
02042 }
02043 }
02044 }
02045 if (textord_quadratic_baselines)
02046 lms.fit_quadratic (block->line_size *
02047 textord_spline_outlier_fraction,
02048 coeffs[segment * 3], b, c);
02049 else {
02050 lms.fit (b, c);
02051 coeffs[segment * 3] = 0;
02052 }
02053 coeffs[segment * 3 + 1] = b;
02054 coeffs[segment * 3 + 2] = c;
02055 }
02056 }
02057 else
02058 coeffs = linear_spline_baseline (row, block, segments, xstarts);
02059 }
02060 else {
02061 xstarts[1] = xstarts[segments];
02062 segments = 1;
02063 coeffs = (double *) alloc_mem (3 * sizeof (double));
02064 coeffs[0] = 0;
02065 coeffs[1] = row->line_m ();
02066 coeffs[2] = row->line_c ();
02067 }
02068 row->baseline = QSPLINE (segments, xstarts, coeffs);
02069 free_mem(coeffs);
02070 free_mem(xstarts);
02071 }
02072
02073
02082 BOOL8
02083 segment_baseline (
02084 TO_ROW * row,
02085 TO_BLOCK * block,
02086 INT32 & segments,
02087 INT32 xstarts[]
02088 ) {
02089 BOOL8 needs_curve;
02090 int blobcount;
02091 int blobindex;
02092 int last_state;
02093 int state;
02094 float yshift;
02095 BOX box;
02096 BOX new_box;
02097 float middle;
02098
02099 BLOBNBOX_IT blob_it = row->blob_list ();
02100 BLOBNBOX_IT new_it = blob_it;
02101 SORTED_FLOATS yshifts;
02102
02103 needs_curve = FALSE;
02104 box = box_next_pre_chopped (&blob_it);
02105 xstarts[0] = box.left ();
02106 segments = 1;
02107 blobcount = row->blob_list ()->length ();
02108 if (textord_oldbl_debug)
02109 tprintf ("Segmenting baseline of %d blobs at (%d,%d)\n",
02110 blobcount, box.left (), box.bottom ());
02111 if (blobcount <= textord_spline_medianwin
02112 || blobcount < textord_spline_minblobs) {
02113 blob_it.move_to_last ();
02114 box = blob_it.data ()->bounding_box ();
02115 xstarts[1] = box.right ();
02116 return FALSE;
02117 }
02118 last_state = 0;
02119 new_it.mark_cycle_pt ();
02120 for (blobindex = 0; blobindex < textord_spline_medianwin; blobindex++) {
02121 new_box = box_next_pre_chopped (&new_it);
02122 middle = (new_box.left () + new_box.right ()) / 2.0;
02123 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02124
02125 yshifts.add (yshift, blobindex);
02126 if (new_it.cycled_list ()) {
02127 xstarts[1] = new_box.right ();
02128 return FALSE;
02129 }
02130 }
02131 for (blobcount = 0; blobcount < textord_spline_medianwin / 2; blobcount++)
02132 box = box_next_pre_chopped (&blob_it);
02133 do {
02134 new_box = box_next_pre_chopped (&new_it);
02135
02136 yshift = yshifts[textord_spline_medianwin / 2];
02137 if (yshift > textord_spline_shift_fraction * block->line_size)
02138 state = 1;
02139 else if (-yshift > textord_spline_shift_fraction * block->line_size)
02140 state = -1;
02141 else
02142 state = 0;
02143 if (state != 0)
02144 needs_curve = TRUE;
02145
02146
02147 if (state != last_state && blobcount > textord_spline_minblobs) {
02148 xstarts[segments++] = box.left ();
02149 blobcount = 0;
02150 }
02151 last_state = state;
02152 yshifts.remove (blobindex - textord_spline_medianwin);
02153 box = box_next_pre_chopped (&blob_it);
02154 middle = (new_box.left () + new_box.right ()) / 2.0;
02155 yshift = new_box.bottom () - row->line_m () * middle - row->line_c ();
02156 yshifts.add (yshift, blobindex);
02157 blobindex++;
02158 blobcount++;
02159 }
02160 while (!new_it.cycled_list ());
02161 if (blobcount > textord_spline_minblobs || segments == 1) {
02162 xstarts[segments] = new_box.right ();
02163 }
02164 else {
02165 xstarts[--segments] = new_box.right ();
02166 }
02167 if (textord_oldbl_debug)
02168 tprintf ("Made %d segments on row at (%d,%d)\n",
02169 segments, box.right (), box.bottom ());
02170 return needs_curve;
02171 }
02172
02173
02182 double *
02183 linear_spline_baseline (
02184 TO_ROW * row,
02185 TO_BLOCK * block,
02186 INT32 & segments,
02187 INT32 xstarts[]
02188 ) {
02189 int blobcount;
02190 int blobindex;
02191 int index1, index2;
02192 int blobs_per_segment;
02193 BOX box;
02194 BOX new_box;
02195 float middle;
02196
02197 BLOBNBOX_IT blob_it = row->blob_list ();
02198 BLOBNBOX_IT new_it = blob_it;
02199 float b, c;
02200 LMS lms (row->blob_list ()->length ());
02201 double *coeffs;
02202 INT32 segment;
02203
02204 box = box_next_pre_chopped (&blob_it);
02205 xstarts[0] = box.left ();
02206 blobcount = 1;
02207 while (!blob_it.at_first ()) {
02208 blobcount++;
02209 box = box_next_pre_chopped (&blob_it);
02210 }
02211 segments = blobcount / textord_spline_medianwin;
02212 if (segments < 1)
02213 segments = 1;
02214 blobs_per_segment = blobcount / segments;
02215 coeffs = (double *) alloc_mem (segments * 3 * sizeof (double));
02216 if (textord_oldbl_debug)
02217 tprintf
02218 ("Linear splining baseline of %d blobs at (%d,%d), into %d segments of %d blobs\n",
02219 blobcount, box.left (), box.bottom (), segments, blobs_per_segment);
02220 segment = 1;
02221 for (index2 = 0; index2 < blobs_per_segment / 2; index2++)
02222 box_next_pre_chopped(&new_it);
02223 index1 = 0;
02224 blobindex = index2;
02225 do {
02226 blobindex += blobs_per_segment;
02227 lms.clear ();
02228 while (index1 < blobindex || segment == segments && index1 < blobcount) {
02229 box = box_next_pre_chopped (&blob_it);
02230 middle = (box.left () + box.right ()) / 2.0;
02231 lms.add (FCOORD (middle, box.bottom ()));
02232 index1++;
02233 if (index1 == blobindex - blobs_per_segment / 2
02234 || index1 == blobcount - 1) {
02235 xstarts[segment] = box.left ();
02236 }
02237 }
02238 lms.fit (b, c);
02239 coeffs[segment * 3 - 3] = 0;
02240 coeffs[segment * 3 - 2] = b;
02241 coeffs[segment * 3 - 1] = c;
02242 segment++;
02243 if (segment > segments)
02244 break;
02245
02246 blobindex += blobs_per_segment;
02247 lms.clear ();
02248 while (index2 < blobindex || segment == segments && index2 < blobcount) {
02249 new_box = box_next_pre_chopped (&new_it);
02250 middle = (new_box.left () + new_box.right ()) / 2.0;
02251 lms.add (FCOORD (middle, new_box.bottom ()));
02252 index2++;
02253 if (index2 == blobindex - blobs_per_segment / 2
02254 || index2 == blobcount - 1) {
02255 xstarts[segment] = new_box.left ();
02256 }
02257 }
02258 lms.fit (b, c);
02259 coeffs[segment * 3 - 3] = 0;
02260 coeffs[segment * 3 - 2] = b;
02261 coeffs[segment * 3 - 1] = c;
02262 segment++;
02263 }
02264 while (segment <= segments);
02265 return coeffs;
02266 }
02267
02268
02275 void assign_blobs_to_rows(
02276 TO_BLOCK *block,
02277 float *gradient,
02278 int pass,
02279 BOOL8 reject_misses,
02280 BOOL8 make_new_rows,
02281 BOOL8 drawing_skew
02282 ) {
02283 OVERLAP_STATE overlap_result;
02284 float ycoord;
02285 float top, bottom;
02286 float g_length = 1.0f;
02287 INT16 row_count;
02288 INT16 left_x;
02289 INT16 last_x;
02290 float block_skew;
02291 float smooth_factor;
02292 float near_dist;
02293 ICOORD testpt;
02294 BLOBNBOX *blob;
02295 TO_ROW *row;
02296 TO_ROW *dest_row;
02297
02298 BLOBNBOX_IT blob_it = &block->blobs;
02299 TO_ROW_IT row_it = block->get_rows ();
02300
02301 #ifdef TEXT_VERBOSE
02302
02303 cprintf("l");
02304 #endif
02305 ycoord =
02306 (block->block->bounding_box ().bottom () +
02307 block->block->bounding_box ().top ()) / 2.0f;
02308 if (gradient != NULL)
02309 g_length = sqrt (1 + *gradient * *gradient);
02310 #ifndef GRAPHICS_DISABLED
02311 if (drawing_skew)
02312 move2d (to_win, block->block->bounding_box ().left (), ycoord);
02313 #endif
02314 testpt = ICOORD (textord_test_x, textord_test_y);
02315 blob_it.sort (blob_x_order);
02316 smooth_factor = 1.0;
02317 block_skew = 0.0f;
02318 row_count = row_it.length ();
02319 if (!blob_it.empty ()) {
02320 left_x = blob_it.data ()->bounding_box ().left ();
02321 }
02322 else {
02323 left_x = block->block->bounding_box ().left ();
02324 }
02325 last_x = left_x;
02326 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
02327 blob = blob_it.data ();
02328 if (gradient != NULL) {
02329 block_skew = (1 - 1 / g_length) * blob->bounding_box ().bottom ()
02330 + *gradient / g_length * blob->bounding_box ().left ();
02331 }
02332 else if (blob->bounding_box ().left () - last_x > block->line_size / 2
02333 && last_x - left_x > block->line_size * 2
02334 && textord_interpolating_skew) {
02335
02336 block_skew *= (float) (blob->bounding_box ().left () - left_x)
02337 / (last_x - left_x);
02338
02339 }
02340 last_x = blob->bounding_box ().left ();
02341 top = blob->bounding_box ().top () - block_skew;
02342 bottom = blob->bounding_box ().bottom () - block_skew;
02343 #ifndef GRAPHICS_DISABLED
02344 if (drawing_skew)
02345 draw2d (to_win, blob->bounding_box ().left (), ycoord + block_skew);
02346 #endif
02347 if (!row_it.empty ()) {
02348 for (row_it.move_to_first ();
02349 !row_it.at_last () && row_it.data ()->min_y () > top;
02350 row_it.forward ());
02351 row = row_it.data ();
02352 if (row->min_y () <= top && row->max_y () >= bottom) {
02353
02354 dest_row = row;
02355 overlap_result = most_overlapping_row (&row_it, dest_row,
02356 top, bottom,
02357 block->line_size,
02358 blob->bounding_box ().
02359 contains (testpt));
02360 if (overlap_result == NEW_ROW && !reject_misses)
02361 overlap_result = ASSIGN;
02362 }
02363 else {
02364 overlap_result = NEW_ROW;
02365 if (!make_new_rows) {
02366 near_dist = row_it.data_relative (-1)->min_y () - top;
02367
02368 if (bottom < row->min_y ()) {
02369 if (row->min_y () - bottom <=
02370 (block->line_spacing -
02371 block->line_size) * textord_merge_desc) {
02372
02373 overlap_result = ASSIGN;
02374 dest_row = row;
02375 }
02376 }
02377 else if (near_dist > 0
02378 && near_dist < bottom - row->max_y ()) {
02379 row_it.backward ();
02380 dest_row = row_it.data ();
02381 if (dest_row->min_y () - bottom <=
02382 (block->line_spacing -
02383 block->line_size) * textord_merge_desc) {
02384
02385 overlap_result = ASSIGN;
02386 }
02387 }
02388 else {
02389 if (top - row->max_y () <=
02390 (block->line_spacing -
02391 block->line_size) * (textord_merge_x +
02392 textord_merge_asc)) {
02393
02394 overlap_result = ASSIGN;
02395 dest_row = row;
02396 }
02397 }
02398 }
02399 }
02400 if (overlap_result == ASSIGN)
02401 dest_row->add_blob (blob_it.extract (), top, bottom,
02402 block->line_size);
02403 if (overlap_result == NEW_ROW) {
02404 if (make_new_rows && top - bottom < block->max_blob_size) {
02405 dest_row =
02406 new TO_ROW (blob_it.extract (), top, bottom,
02407 block->line_size);
02408 row_count++;
02409 if (bottom > row_it.data ()->min_y ())
02410 row_it.add_before_then_move (dest_row);
02411
02412 else
02413 row_it.add_after_then_move (dest_row);
02414 smooth_factor =
02415 1.0 / (row_count * textord_skew_lag +
02416 textord_skewsmooth_offset);
02417 }
02418 else
02419 overlap_result = REJECT;
02420 }
02421 }
02422 else if (make_new_rows && top - bottom < block->max_blob_size) {
02423 overlap_result = NEW_ROW;
02424 dest_row =
02425 new TO_ROW (blob_it.extract (), top, bottom, block->line_size);
02426 row_count++;
02427 row_it.add_after_then_move (dest_row);
02428 smooth_factor = 1.0 / (row_count * textord_skew_lag + 1);
02429 }
02430 else
02431 overlap_result = REJECT;
02432 if (blob->bounding_box ().contains (testpt)) {
02433 if (overlap_result != REJECT) {
02434 tprintf ("Test blob assigned to row at (%g,%g) on pass %d\n",
02435 dest_row->min_y (), dest_row->max_y (), pass);
02436 }
02437 else {
02438 tprintf ("Test blob assigned to no row on pass %d\n", pass);
02439 }
02440 }
02441 if (overlap_result != REJECT) {
02442 while (!row_it.at_first ()
02443 && row_it.data ()->min_y () >
02444 row_it.data_relative (-1)->min_y ()) {
02445 row = row_it.extract ();
02446 row_it.backward ();
02447 row_it.add_before_then_move (row);
02448 }
02449 while (!row_it.at_last ()
02450 && row_it.data ()->min_y () <
02451 row_it.data_relative (1)->min_y ()) {
02452 row = row_it.extract ();
02453 row_it.forward ();
02454
02455 row_it.add_after_then_move (row);
02456 }
02457 block_skew = (1 - smooth_factor) * block_skew
02458 + smooth_factor * (blob->bounding_box ().bottom () -
02459 dest_row->initial_min_y ());
02460 }
02461 }
02462 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
02463 if (row_it.data ()->blob_list ()->empty ())
02464 delete row_it.extract ();
02465 }
02466 }
02467
02468
02482 OVERLAP_STATE most_overlapping_row(
02483 TO_ROW_IT *row_it,
02484 TO_ROW *&best_row,
02485 float top,
02486 float bottom,
02487 float rowsize,
02488 BOOL8 testing_blob
02489 ) {
02490 OVERLAP_STATE result;
02491 float overlap;
02492 float bestover;
02493 float merge_top, merge_bottom;
02494 ICOORD testpt;
02495 TO_ROW *row;
02496 TO_ROW *test_row;
02497 BLOBNBOX_IT blob_it;
02498
02499 result = ASSIGN;
02500 row = row_it->data ();
02501 bestover = top - bottom;
02502 if (top > row->max_y ())
02503 bestover -= top - row->max_y ();
02504 if (bottom < row->min_y ())
02505
02506 bestover -= row->min_y () - bottom;
02507 if (testing_blob) {
02508 tprintf ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f\n",
02509 bottom, top, row->min_y (), row->max_y (), bestover);
02510 }
02511 test_row = row;
02512 do {
02513 if (!row_it->at_last ()) {
02514 row_it->forward ();
02515 test_row = row_it->data ();
02516 if (test_row->min_y () <= top && test_row->max_y () >= bottom) {
02517 merge_top =
02518 test_row->max_y () >
02519 row->max_y ()? test_row->max_y () : row->max_y ();
02520 merge_bottom =
02521 test_row->min_y () <
02522 row->min_y ()? test_row->min_y () : row->min_y ();
02523 if (merge_top - merge_bottom <= rowsize) {
02524 if (testing_blob) {
02525 tprintf ("Merging rows at (%g,%g), (%g,%g)\n",
02526 row->min_y (), row->max_y (),
02527 test_row->min_y (), test_row->max_y ());
02528 }
02529 test_row->set_limits (merge_bottom, merge_top);
02530 blob_it.set_to_list (test_row->blob_list ());
02531 blob_it.add_list_after (row->blob_list ());
02532 blob_it.sort (blob_x_order);
02533 row_it->backward ();
02534 delete row_it->extract ();
02535 row_it->forward ();
02536 bestover = -1.0f;
02537 }
02538 overlap = top - bottom;
02539 if (top > test_row->max_y ())
02540 overlap -= top - test_row->max_y ();
02541 if (bottom < test_row->min_y ())
02542 overlap -= test_row->min_y () - bottom;
02543 if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
02544 result = REJECT;
02545 }
02546 if (overlap > bestover) {
02547 bestover = overlap;
02548 row = test_row;
02549 }
02550 if (testing_blob) {
02551 tprintf
02552 ("Test blob y=(%g,%g), row=(%f,%f), overlap=%f->%f\n",
02553 bottom, top, test_row->min_y (), test_row->max_y (),
02554 overlap, bestover);
02555 }
02556 }
02557 }
02558 }
02559 while (!row_it->at_last ()
02560 && test_row->min_y () <= top && test_row->max_y () >= bottom);
02561 while (row_it->data () != row)
02562 row_it->backward ();
02563
02564
02565 if (top - bottom - bestover > rowsize * textord_merge_x
02566 && (!textord_fix_makerow_bug || bestover < rowsize * textord_merge_x)
02567 && result == ASSIGN)
02568 result = NEW_ROW;
02569 best_row = row;
02570 return result;
02571 }
02572
02573
02579 int blob_x_order(
02580 const void *item1,
02581 const void *item2) {
02582
02583 BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
02584
02585 BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
02586
02587 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
02588 return -1;
02589 else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ())
02590 return 1;
02591 else
02592 return 0;
02593 }
02594
02595
02601 int row_y_order(
02602 const void *item1,
02603 const void *item2) {
02604
02605 TO_ROW *row1 = *(TO_ROW **) item1;
02606
02607 TO_ROW *row2 = *(TO_ROW **) item2;
02608
02609 if (row1->parallel_c () > row2->parallel_c ())
02610 return -1;
02611 else if (row1->parallel_c () < row2->parallel_c ())
02612 return 1;
02613 else
02614 return 0;
02615 }
02616
02617
02623 int row_spacing_order(
02624 const void *item1,
02625 const void *item2) {
02626
02627 TO_ROW *row1 = *(TO_ROW **) item1;
02628
02629 TO_ROW *row2 = *(TO_ROW **) item2;
02630
02631 if (row1->spacing < row2->spacing)
02632 return -1;
02633 else if (row1->spacing > row2->spacing)
02634 return 1;
02635 else
02636 return 0;
02637 }