00001
00020 #include "mfcpch.h"
00021 #ifdef __UNIX__
00022 #include <assert.h>
00023 #endif
00024 #include "stderr.h"
00025 #include "globaloc.h"
00026 #include "tessout.h"
00027 #include "blread.h"
00028 #include "blobbox.h"
00029
00030 #include "edgblob.h"
00031
00032 #include "drawtord.h"
00033 #include "makerow.h"
00034 #include "wordseg.h"
00035 #include "ocrclass.h"
00036 #include "genblob.h"
00037 #include "imgs.h"
00038
00039 #include "tordmain.h"
00040 #include "secname.h"
00041
00042 #ifdef TEXT_VERBOSE
00043 #include "../cutil/callcpp.h"
00044 #endif
00045
00046
00047 const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
00048
00049 #ifdef GRAPHICS_DISABLED
00050 ETEXT_DESC *global_monitor = NULL;
00051 #endif
00052
00053 #define EXTERN
00054
00057 EXTERN BOOL_VAR (textord_show_blobs, FALSE, "Display unsorted blobs");
00058 EXTERN BOOL_VAR (textord_show_boxes, FALSE, "Display unsorted blobs");
00059 EXTERN BOOL_VAR (textord_new_initial_xheight, TRUE,
00060 "Use test xheight mechanism");
00061 EXTERN BOOL_VAR (textord_exit_after, FALSE, "Exit after completing textord");
00062 EXTERN INT_VAR (textord_max_noise_size, 7, "Pixel size of noise");
00063 EXTERN double_VAR (textord_blob_size_bigile, 95,
00064 "Percentile for large blobs");
00065 EXTERN double_VAR (textord_noise_area_ratio, 0.7,
00066 "Fraction of bounding box for noise");
00067 EXTERN double_VAR (textord_blob_size_smallile, 20,
00068 "Percentile for small blobs");
00069 EXTERN double_VAR (textord_initialx_ile, 0.75,
00070 "Ile of sizes for xheight guess");
00071 EXTERN double_VAR (textord_initialasc_ile, 0.90,
00072 "Ile of sizes for xheight guess");
00073 EXTERN INT_VAR (textord_noise_sizefraction, 10,
00074 "Fraction of size for maxima");
00075 EXTERN double_VAR (textord_noise_sizelimit, 0.5,
00076 "Fraction of x for big t count");
00077 EXTERN INT_VAR (textord_noise_translimit, 16, "Transitions for normal blob");
00078 EXTERN double_VAR (textord_noise_normratio, 2.0,
00079 "Dot to norm ratio for deletion");
00080 EXTERN BOOL_VAR (textord_noise_rejwords, TRUE, "Reject noise-like words");
00081 EXTERN BOOL_VAR (textord_noise_rejrows, TRUE, "Reject noise-like rows");
00082 EXTERN double_VAR (textord_noise_syfract, 0.2,
00083 "xh fract error for norm blobs");
00084 EXTERN double_VAR (textord_noise_sxfract, 0.4,
00085 "xh fract width error for norm blobs");
00086 EXTERN INT_VAR (textord_noise_sncount, 1, "super norm blobs to save row");
00087 EXTERN double_VAR (textord_noise_rowratio, 6.0,
00088 "Dot to norm ratio for deletion");
00089
00090 EXTERN BOOL_VAR (textord_noise_debug, FALSE, "Debug row garbage detector");
00091 EXTERN double_VAR (textord_blshift_maxshift, 0.00, "Max baseline shift");
00092 EXTERN double_VAR (textord_blshift_xfraction, 9.99,
00093 "Min size of baseline shift");
00094 EXTERN STRING_EVAR (tessedit_image_ext, ".tif", "Externsion for image file");
00095 extern BOOL_VAR_H (polygon_tess_approximation, TRUE,
00096 "Do tess poly instead of grey scale");
00097 extern BOOL_VAR_H (interactive_mode, TRUE, "Run interactively?");
00098
00099 #ifndef EMBEDDED
00100 EXTERN clock_t previous_cpu;
00101 #endif
00102
00104 #define MAX_NEAREST_DIST 600
00105
00106 #define MAX_BLOB_TRANSITIONS 100
00107
00109 extern IMAGE page_image;
00111 extern ETEXT_DESC *global_monitor;
00126 void read_and_textord(
00127 const char *filename,
00128 BLOCK_LIST *blocks) {
00129 int c;
00130 FILE *infp;
00131 BLOCK *block;
00132 BOX page_box;
00133 BLOCK_IT block_it = blocks;
00134
00135 TO_BLOCK_LIST land_blocks, port_blocks;
00136
00137 infp = fopen (filename, "r");
00138 if (infp == NULL)
00139 CANTOPENFILE.error ("read_and_textord", EXIT, filename);
00140
00141 while (((c = fgetc (infp)) != EOF) && (ungetc (c, infp) != EOF)) {
00142
00143 block = BLOCK::de_serialise (infp);
00144
00145 block_it.add_after_then_move (block);
00146
00147 page_box += block->bounding_box ();
00148 }
00149 fclose(infp);
00150
00151 assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks);
00152 filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape);
00153 filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape);
00154 textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks);
00155 }
00156
00157
00173 void edges_and_textord(
00174 const char *filename,
00175 BLOCK_LIST *blocks) {
00176 BLOCK *block;
00177 char *lastdot;
00178 STRING name = filename;
00179 ICOORD page_tr;
00180 BOX page_box;
00181 PDBLK_CLIST pd_blocks;
00182 BLOCK_IT block_it = blocks;
00183 PDBLK_C_IT pd_it = &pd_blocks;
00184
00185 TO_BLOCK_LIST land_blocks, port_blocks;
00186 IMAGE thresh_image;
00187
00188 lastdot = strrchr (name.string (), '.');
00189 if (lastdot != NULL)
00190 *lastdot = '\0';
00191 if (page_image.get_bpp () == 0) {
00192 name += tessedit_image_ext;
00193 if (page_image.read_header (name.string ()))
00194 CANTOPENFILE.error ("edges_and_textord", EXIT, name.string ());
00195 if (page_image.read (0))
00196 READFAILED.error ("edges_and_textord", EXIT, name.string ());
00197 name = filename;
00198 lastdot = strrchr (name.string (), '.');
00199 if (lastdot != NULL)
00200 *lastdot = '\0';
00201 }
00202 page_tr = ICOORD (page_image.get_xsize (), page_image.get_ysize ());
00203 read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (),
00204 blocks);
00205 block_it.set_to_list (blocks);
00206 if (global_monitor != NULL)
00207 global_monitor->ocr_alive = TRUE;
00208
00209 if (page_image.get_bpp () > 1) {
00210 set_global_loc_code(LOC_ADAPTIVE);
00211 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00212 block_it.forward ()) {
00213 block = block_it.data ();
00214 pd_it.add_after_then_move (block);
00215 }
00216
00217 set_global_loc_code(LOC_EDGE_PROG);
00218 #ifndef EMBEDDED
00219 previous_cpu = clock ();
00220 #endif
00221 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00222 block_it.forward ()) {
00223 block = block_it.data ();
00224 if (!polygon_tess_approximation)
00225 invert_image(&page_image);
00226 #ifndef GRAPHICS_DISABLED
00227 extract_edges(NO_WINDOW, &page_image, &thresh_image, page_tr, block);
00228 #else
00229 extract_edges(&page_image, &thresh_image, page_tr, block);
00230 #endif
00231 page_box += block->bounding_box ();
00232 }
00233 page_image = thresh_image;
00234 }
00235 else {
00236 set_global_loc_code(LOC_EDGE_PROG);
00237 if (!page_image.white_high ())
00238 invert_image(&page_image);
00239
00240 #ifndef EMBEDDED
00241 previous_cpu = clock ();
00242 #endif
00243
00244 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00245 block_it.forward ()) {
00246 block = block_it.data ();
00247 #ifndef GRAPHICS_DISABLED
00248 extract_edges(NO_WINDOW, &page_image, &page_image, page_tr, block);
00249 #else
00250 extract_edges(&page_image, &page_image, page_tr, block);
00251 #endif
00252 page_box += block->bounding_box ();
00253 }
00254 }
00255 if (global_monitor != NULL) {
00256 global_monitor->ocr_alive = TRUE;
00257 global_monitor->progress = 10;
00258 }
00259
00260 assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks);
00261 if (global_monitor != NULL)
00262 global_monitor->ocr_alive = TRUE;
00263 filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape);
00264 #ifndef EMBEDDED
00265 previous_cpu = clock ();
00266 #endif
00267 filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape);
00268 if (global_monitor != NULL)
00269 global_monitor->ocr_alive = TRUE;
00270 textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks);
00271 }
00272
00273
00274
00275
00286 void assign_blobs_to_blocks2(
00287 BLOCK_LIST *blocks,
00288 TO_BLOCK_LIST *land_blocks,
00289 TO_BLOCK_LIST *port_blocks
00290 ) {
00291 BLOCK *block;
00292 BLOBNBOX *newblob;
00293 C_BLOB *blob;
00294 BLOCK_IT block_it = blocks;
00295 C_BLOB_IT blob_it;
00296 BLOBNBOX_IT port_box_it;
00297
00298 TO_BLOCK_IT port_block_it = port_blocks;
00299 TO_BLOCK *port_block;
00300
00301 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00302 block_it.forward ()) {
00303 block = block_it.data ();
00304 blob_it.set_to_list (block->blob_list ());
00305
00306 port_block = new TO_BLOCK (block);
00307
00308 port_box_it.set_to_list (&port_block->blobs);
00309 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00310 blob_it.forward ()) {
00311 blob = blob_it.extract ();
00312
00313 newblob = new BLOBNBOX (blob);
00314
00315 port_box_it.add_after_then_move (newblob);
00316
00317 }
00318 port_block_it.add_after_then_move (port_block);
00319 }
00320 }
00321
00322
00338 void filter_blobs(
00339 ICOORD page_tr,
00340 TO_BLOCK_LIST *blocks,
00341 BOOL8 testing_on
00342 ) {
00343 TO_BLOCK_IT block_it = blocks;
00344 TO_BLOCK *block;
00345
00346 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00347 block_it.forward ()) {
00348 block = block_it.data ();
00349 block->line_size = filter_noise_blobs (&block->blobs,
00350 &block->noise_blobs,
00351 &block->small_blobs,
00352 &block->large_blobs);
00353 block->line_spacing =
00354 block->line_size * (textord_merge_desc + textord_merge_x +
00355 textord_merge_asc +
00356 textord_merge_asc) / textord_merge_x;
00357 block->line_size *= textord_min_linesize;
00358 block->max_blob_size = block->line_size * textord_excess_blobsize;
00359 #ifndef GRAPHICS_DISABLED
00360 if (textord_show_blobs && testing_on) {
00361 if (to_win == NO_WINDOW)
00362 create_to_win(page_tr);
00363 plot_blob_list (to_win, &block->noise_blobs, CORAL, BLUE);
00364 plot_blob_list (to_win, &block->small_blobs, GOLDENROD, YELLOW);
00365 plot_blob_list (to_win, &block->large_blobs, DARK_GREEN, YELLOW);
00366 plot_blob_list (to_win, &block->blobs, WHITE, BROWN);
00367 }
00368 if (textord_show_boxes && testing_on) {
00369 if (to_win == NO_WINDOW)
00370 create_to_win(page_tr);
00371 plot_box_list (to_win, &block->noise_blobs, WHITE);
00372 plot_box_list (to_win, &block->small_blobs, WHITE);
00373 plot_box_list (to_win, &block->large_blobs, WHITE);
00374 plot_box_list (to_win, &block->blobs, WHITE);
00375 }
00376 #endif
00377 }
00378 }
00379
00380
00404 float filter_noise_blobs(
00405 BLOBNBOX_LIST *src_list,
00406 BLOBNBOX_LIST *noise_list,
00407 BLOBNBOX_LIST *small_list,
00408 BLOBNBOX_LIST *large_list
00409 ) {
00410 INT16 height;
00411 INT16 width;
00412 BLOBNBOX_IT src_it = src_list;
00413 BLOBNBOX_IT noise_it = noise_list;
00414 BLOBNBOX_IT small_it = small_list;
00415 BLOBNBOX_IT large_it = large_list;
00416 STATS size_stats (0, MAX_NEAREST_DIST);
00417
00418 if (textord_new_initial_xheight) {
00419 return filter_noise_blobs2 (src_list, noise_list, small_list, large_list);
00420 }
00421 float min_y;
00422 float max_y;
00423 float max_x;
00424
00425 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00426 if (src_it.data ()->bounding_box ().height () < textord_max_noise_size) {
00427 noise_it.add_after_then_move (src_it.extract ());
00428 }
00429 }
00430 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00431 size_stats.add (src_it.data ()->bounding_box ().height (), 1);
00432 }
00433 min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0));
00434 max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0));
00435 max_x = ceil (size_stats.ile (0.5) * textord_width_limit);
00436 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00437 height = src_it.data ()->bounding_box ().height ();
00438 width = src_it.data ()->bounding_box ().width ();
00439 if (height < min_y) {
00440 small_it.add_after_then_move (src_it.extract ());
00441 }
00442 else if (height > max_y || width > max_x) {
00443 large_it.add_after_then_move (src_it.extract ());
00444 }
00445 }
00446 return size_stats.ile (textord_initialx_ile);
00447 }
00448
00449
00498 float filter_noise_blobs2(
00499 BLOBNBOX_LIST *src_list,
00500 BLOBNBOX_LIST *noise_list,
00501 BLOBNBOX_LIST *small_list,
00502 BLOBNBOX_LIST *large_list
00503 ) {
00504 INT16 height;
00505 INT16 width;
00506 BLOBNBOX *blob;
00507 float initial_x;
00508 BLOBNBOX_IT src_it = src_list;
00509 BLOBNBOX_IT noise_it = noise_list;
00510 BLOBNBOX_IT small_it = small_list;
00511 BLOBNBOX_IT large_it = large_list;
00512 STATS size_stats (0, MAX_NEAREST_DIST);
00513
00514 float min_y;
00515 float max_y;
00516 float max_x;
00517 float max_height;
00518
00519 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00520 blob = src_it.data ();
00521 if (blob->bounding_box ().height () < textord_max_noise_size)
00522 noise_it.add_after_then_move (src_it.extract ());
00523 else if (blob->enclosed_area () >= blob->bounding_box ().height ()
00524 * blob->bounding_box ().width () * textord_noise_area_ratio)
00525 small_it.add_after_then_move (src_it.extract ());
00526 }
00527 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00528 size_stats.add (src_it.data ()->bounding_box ().height (), 1);
00529 }
00530 initial_x = size_stats.ile (textord_initialx_ile);
00531 max_y =
00532 ceil (initial_x *
00533 (textord_merge_desc + textord_merge_x +
00534 2 * textord_merge_asc) / textord_merge_x);
00535 min_y = floor (initial_x / 2);
00536 max_x = ceil (initial_x * textord_width_limit);
00537 small_it.move_to_first ();
00538 for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
00539 small_it.forward ()) {
00540 height = small_it.data ()->bounding_box ().height ();
00541 if (height >= min_y)
00542 large_it.add_after_then_move (small_it.extract ());
00543 }
00544 size_stats.clear ();
00545 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00546 height = src_it.data ()->bounding_box ().height ();
00547 width = src_it.data ()->bounding_box ().width ();
00548 if (height < min_y) {
00549 small_it.add_after_then_move (src_it.extract ());
00550 }
00551 else if (height > max_y || width > max_x) {
00552 large_it.add_after_then_move (src_it.extract ());
00553 }
00554 else {
00555 size_stats.add (height, 1);
00556 }
00557 }
00558 max_height = size_stats.ile (textord_initialasc_ile);
00559
00560
00561 max_height *= textord_merge_x / (textord_merge_x + textord_merge_asc);
00562 if (max_height > initial_x)
00563 initial_x = max_height;
00564
00565 return initial_x;
00566 }
00567
00568
00582 void textord_page(
00583 ICOORD page_tr,
00584 BLOCK_LIST *blocks,
00585 TO_BLOCK_LIST *land_blocks,
00586 TO_BLOCK_LIST *port_blocks
00587 ) {
00588 float gradient;
00589
00590 set_global_loc_code(LOC_TEXT_ORD_ROWS);
00591 gradient = make_rows (page_tr, blocks, land_blocks, port_blocks);
00592 if (global_monitor != NULL) {
00593 global_monitor->ocr_alive = TRUE;
00594 global_monitor->progress = 20;
00595 }
00596 set_global_loc_code(LOC_TEXT_ORD_WORDS);
00597 make_words(page_tr, gradient, blocks, land_blocks, port_blocks);
00598 if (global_monitor != NULL) {
00599 global_monitor->ocr_alive = TRUE;
00600 global_monitor->progress = 30;
00601 }
00602 cleanup_blocks(blocks);
00603 #ifndef GRAPHICS_DISABLED
00604 close_to_win();
00605 #endif
00606 if (textord_exit_after && !interactive_mode)
00607 exit (0);
00608 }
00609
00610
00619 void cleanup_blocks(
00620 BLOCK_LIST *blocks
00621 ) {
00622 BLOCK_IT block_it = blocks;
00623 ROW_IT row_it;
00624
00625 for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00626 block_it.forward ()) {
00627 row_it.set_to_list (block_it.data ()->row_list ());
00628 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00629 if (textord_noise_rejrows
00630 && !row_it.data ()->word_list ()->empty ()
00631 && clean_noise_from_row (row_it.data ())
00632 || row_it.data ()->word_list ()->empty ())
00633 delete row_it.extract ();
00634 else {
00635 if (textord_noise_rejwords)
00636 clean_noise_from_words (row_it.data ());
00637 if (textord_blshift_maxshift >= 0)
00638 tweak_row_baseline (row_it.data ());
00639 }
00640 }
00641 if (block_it.data ()->row_list ()->empty ()) {
00642 delete block_it.extract ();
00643 }
00644 }
00645 }
00646
00647
00653 BOOL8 clean_noise_from_row(
00654 ROW *row
00655 ) {
00656 BOOL8 testing_on;
00657 BOX blob_box;
00658 C_BLOB *blob;
00659 C_OUTLINE *outline;
00660 WERD *word;
00661 INT32 blob_size;
00662 INT32 trans_count = 0;
00663 INT32 trans_threshold;
00664 INT32 dot_count;
00665 INT32 norm_count;
00666 INT32 super_norm_count;
00667
00668 WERD_IT word_it = row->word_list ();
00669 C_BLOB_IT blob_it;
00670 C_OUTLINE_IT out_it;
00671
00672 if (textord_test_y > row->base_line (textord_test_x)
00673 && textord_show_blobs
00674 && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
00675 testing_on = TRUE;
00676 else
00677 testing_on = FALSE;
00678 dot_count = 0;
00679 norm_count = 0;
00680 super_norm_count = 0;
00681 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00682 word = word_it.data ();
00683
00684 blob_it.set_to_list (word->cblob_list ());
00685 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00686 blob_it.forward ()) {
00687 blob = blob_it.data ();
00688 if (!word->flag (W_DONT_CHOP)) {
00689
00690 out_it.set_to_list (blob->out_list ());
00691 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00692 out_it.forward ()) {
00693 outline = out_it.data ();
00694 blob_box = outline->bounding_box ();
00695 blob_size =
00696 blob_box.width () >
00697 blob_box.height ()? blob_box.width () : blob_box.
00698 height();
00699 if (blob_size < textord_noise_sizelimit * row->x_height ())
00700 dot_count++;
00701 if (!outline->child ()->empty ()
00702 && blob_box.height () <
00703 (1 + textord_noise_syfract) * row->x_height ()
00704 && blob_box.height () >
00705 (1 - textord_noise_syfract) * row->x_height ()
00706 && blob_box.width () <
00707 (1 + textord_noise_sxfract) * row->x_height ()
00708 && blob_box.width () >
00709 (1 - textord_noise_sxfract) * row->x_height ())
00710 super_norm_count++;
00711 }
00712 }
00713 else
00714 super_norm_count++;
00715 blob_box = blob->bounding_box ();
00716 blob_size =
00717 blob_box.width () >
00718 blob_box.height ()? blob_box.width () : blob_box.height ();
00719 if (blob_size >= textord_noise_sizelimit * row->x_height ()
00720 && blob_size < row->x_height () * 2) {
00721 trans_threshold = blob_size / textord_noise_sizefraction;
00722 trans_count = blob->count_transitions (trans_threshold);
00723 if (trans_count < textord_noise_translimit)
00724 norm_count++;
00725 }
00726 else if (blob_box.height () > row->x_height () * 2
00727 && (!word_it.at_first () || !blob_it.at_first ()))
00728 dot_count += 2;
00729 #ifndef SECURE_NAMES
00730 if (testing_on) {
00731 tprintf
00732 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
00733 blob_box.left (), blob_box.bottom (), blob_box.right (),
00734 blob_box.top (), blob->out_list ()->length (), trans_count,
00735 blob_box.bottom () - row->base_line (blob_box.left ()));
00736 }
00737 #endif
00738 }
00739 }
00740 #ifndef SECURE_NAMES
00741 if (textord_noise_debug) {
00742 tprintf ("Row ending at (%d,%g):",
00743 blob_box.right (), row->base_line (blob_box.right ()));
00744 tprintf (" R=%g, dc=%d, nc=%d, %s\n",
00745 norm_count > 0 ? (float) dot_count / norm_count : 9999,
00746 dot_count, norm_count,
00747 dot_count > norm_count * textord_noise_normratio
00748 && dot_count > 2 ? "REJECTED" : "ACCEPTED");
00749 }
00750 #endif
00751 return super_norm_count < textord_noise_sncount
00752 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
00753 }
00754
00755
00761 void clean_noise_from_words(
00762 ROW *row
00763 ) {
00764 BOX blob_box;
00765 INT8 *word_dud;
00766 C_BLOB *blob;
00767 C_OUTLINE *outline;
00768 WERD *word;
00769 INT32 blob_size;
00770 INT32 trans_count;
00771 INT32 trans_threshold;
00772 INT32 dot_count;
00773 INT32 norm_count;
00774 INT32 dud_words;
00775 INT32 ok_words;
00776 INT32 word_index;
00777
00778 WERD_IT word_it = row->word_list ();
00779 C_BLOB_IT blob_it;
00780 C_OUTLINE_IT out_it;
00781
00782 ok_words = word_it.length ();
00783 if (ok_words == 0)
00784 return;
00785 word_dud = (INT8 *) alloc_mem (ok_words * sizeof (INT8));
00786 dud_words = 0;
00787 ok_words = 0;
00788 word_index = 0;
00789 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00790 word = word_it.data ();
00791 dot_count = 0;
00792 norm_count = 0;
00793
00794 blob_it.set_to_list (word->cblob_list ());
00795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00796 blob_it.forward ()) {
00797 blob = blob_it.data ();
00798 if (!word->flag (W_DONT_CHOP)) {
00799
00800 out_it.set_to_list (blob->out_list ());
00801 for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00802 out_it.forward ()) {
00803 outline = out_it.data ();
00804 blob_box = outline->bounding_box ();
00805 blob_size =
00806 blob_box.width () >
00807 blob_box.height ()? blob_box.width () : blob_box.
00808 height();
00809 if (blob_size < textord_noise_sizelimit * row->x_height ())
00810 dot_count++;
00811 if (!outline->child ()->empty ()
00812 && blob_box.height () <
00813 (1 + textord_noise_syfract) * row->x_height ()
00814 && blob_box.height () >
00815 (1 - textord_noise_syfract) * row->x_height ()
00816 && blob_box.width () <
00817 (1 + textord_noise_sxfract) * row->x_height ()
00818 && blob_box.width () >
00819 (1 - textord_noise_sxfract) * row->x_height ())
00820 norm_count++;
00821 }
00822 }
00823 else
00824 norm_count++;
00825 blob_box = blob->bounding_box ();
00826 blob_size =
00827 blob_box.width () >
00828 blob_box.height ()? blob_box.width () : blob_box.height ();
00829 if (blob_size >= textord_noise_sizelimit * row->x_height ()
00830 && blob_size < row->x_height () * 2) {
00831 trans_threshold = blob_size / textord_noise_sizefraction;
00832 trans_count = blob->count_transitions (trans_threshold);
00833 if (trans_count < textord_noise_translimit)
00834 norm_count++;
00835 }
00836 else if (blob_box.height () > row->x_height () * 2
00837 && (!word_it.at_first () || !blob_it.at_first ()))
00838 dot_count += 2;
00839 }
00840 if (dot_count > 2) {
00841 if (dot_count > norm_count * textord_noise_normratio * 2)
00842 word_dud[word_index] = 2;
00843 else if (dot_count > norm_count * textord_noise_normratio)
00844 word_dud[word_index] = 1;
00845 else
00846 word_dud[word_index] = 0;
00847 }
00848 else
00849 word_dud[word_index] = 0;
00850 if (word_dud[word_index] == 2)
00851 dud_words++;
00852 else
00853 ok_words++;
00854 word_index++;
00855 }
00856
00857 word_index = 0;
00858 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00859 if (word_dud[word_index] == 2
00860 || word_dud[word_index] == 1 && dud_words > ok_words) {
00861 word = word_it.data ();
00862
00863 blob_it.set_to_list (word->rej_cblob_list ());
00864
00865 blob_it.add_list_after (word->cblob_list ());
00866 }
00867 word_index++;
00868 }
00869 free_mem(word_dud);
00870 }
00871
00872
00886 void tweak_row_baseline(
00887 ROW *row
00888 ) {
00889 BOX blob_box;
00890 C_BLOB *blob;
00891 WERD *word;
00892 INT32 blob_count;
00893 INT32 src_index;
00894 INT32 dest_index;
00895 INT32 *xstarts;
00896 double *coeffs;
00897 float ydiff;
00898 float x_centre;
00899
00900 WERD_IT word_it = row->word_list ();
00901 C_BLOB_IT blob_it;
00902
00903 blob_count = 0;
00904 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00905 word = word_it.data ();
00906
00907 blob_count += word->cblob_list ()->length ();
00908 }
00909 if (blob_count == 0)
00910 return;
00911 xstarts =
00912 (INT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
00913 sizeof (INT32));
00914 coeffs =
00915 (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
00916 sizeof (double));
00917
00918 src_index = 0;
00919 dest_index = 0;
00920 xstarts[0] = row->baseline.xcoords[0];
00921 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00922 word = word_it.data ();
00923
00924 blob_it.set_to_list (word->cblob_list ());
00925 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00926 blob_it.forward ()) {
00927 blob = blob_it.data ();
00928 blob_box = blob->bounding_box ();
00929 x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
00930 ydiff = blob_box.bottom () - row->base_line (x_centre);
00931 if (ydiff < 0)
00932 ydiff = -ydiff / row->x_height ();
00933 else
00934 ydiff = ydiff / row->x_height ();
00935 if (ydiff < textord_blshift_maxshift
00936 && blob_box.height () / row->x_height () >
00937 textord_blshift_xfraction) {
00938 if (xstarts[dest_index] >= x_centre)
00939 xstarts[dest_index] = blob_box.left ();
00940 coeffs[dest_index * 3] = 0;
00941 coeffs[dest_index * 3 + 1] = 0;
00942 coeffs[dest_index * 3 + 2] = blob_box.bottom ();
00943
00944 dest_index++;
00945 xstarts[dest_index] = blob_box.right () + 1;
00946 }
00947 else {
00948 if (xstarts[dest_index] <= x_centre) {
00949 while (row->baseline.xcoords[src_index + 1] <= x_centre
00950 && src_index < row->baseline.segments - 1) {
00951 if (row->baseline.xcoords[src_index + 1] >
00952 xstarts[dest_index]) {
00953 coeffs[dest_index * 3] =
00954 row->baseline.quadratics[src_index].a;
00955 coeffs[dest_index * 3 + 1] =
00956 row->baseline.quadratics[src_index].b;
00957 coeffs[dest_index * 3 + 2] =
00958 row->baseline.quadratics[src_index].c;
00959 dest_index++;
00960 xstarts[dest_index] =
00961 row->baseline.xcoords[src_index + 1];
00962 }
00963 src_index++;
00964 }
00965 coeffs[dest_index * 3] =
00966 row->baseline.quadratics[src_index].a;
00967 coeffs[dest_index * 3 + 1] =
00968 row->baseline.quadratics[src_index].b;
00969 coeffs[dest_index * 3 + 2] =
00970 row->baseline.quadratics[src_index].c;
00971 dest_index++;
00972 xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
00973 }
00974 }
00975 }
00976 }
00977 while (src_index < row->baseline.segments
00978 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
00979 src_index++;
00980 while (src_index < row->baseline.segments) {
00981 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
00982 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
00983 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
00984 dest_index++;
00985 src_index++;
00986 xstarts[dest_index] = row->baseline.xcoords[src_index];
00987 }
00988
00989 row->baseline = QSPLINE (dest_index, xstarts, coeffs);
00990 free_mem(xstarts);
00991 free_mem(coeffs);
00992 }
00993
00994
01000 INT32 blob_y_order(
01001 void *item1,
01002 void *item2) {
01003
01004 BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
01005
01006 BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
01007
01008 if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
01009 return -1;
01010 else if (blob1->bounding_box ().bottom () <
01011 blob2->bounding_box ().bottom ())
01012 return 1;
01013 else {
01014 if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
01015 return -1;
01016 else if (blob1->bounding_box ().left () >
01017 blob2->bounding_box ().left ())
01018 return 1;
01019 else
01020 return 0;
01021 }
01022 }