#include "mfcpch.h"
#include "stderr.h"
#include "globaloc.h"
#include "tessout.h"
#include "blread.h"
#include "blobbox.h"
#include "edgblob.h"
#include "drawtord.h"
#include "makerow.h"
#include "wordseg.h"
#include "ocrclass.h"
#include "genblob.h"
#include "imgs.h"
#include "tordmain.h"
#include "secname.h"
Go to the source code of this file.
#define EXTERN |
Definition at line 53 of file tordmain.cpp.
void assign_blobs_to_blocks2 | ( | BLOCK_LIST * | blocks, | |
TO_BLOCK_LIST * | land_blocks, | |||
TO_BLOCK_LIST * | port_blocks | |||
) |
Split into groups.
blocks | Blocks to process | |
land_blocks | Rotated for landscape | |
port_blocks | Output list |
Definition at line 286 of file tordmain.cpp.
References newblob().
Referenced by edges_and_textord(), and read_and_textord().
00290 { 00291 BLOCK *block; //current block 00292 BLOBNBOX *newblob; //created blob 00293 C_BLOB *blob; //current blob 00294 BLOCK_IT block_it = blocks; 00295 C_BLOB_IT blob_it; //iterator 00296 BLOBNBOX_IT port_box_it; //iterator 00297 //destination iterator 00298 TO_BLOCK_IT port_block_it = port_blocks; 00299 TO_BLOCK *port_block; //created block 00300 00301 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00302 block_it.forward ()) { 00303 block = block_it.data (); 00304 blob_it.set_to_list (block->blob_list ()); 00305 //make one 00306 port_block = new TO_BLOCK (block); 00307 //make one 00308 port_box_it.set_to_list (&port_block->blobs); 00309 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00310 blob_it.forward ()) { 00311 blob = blob_it.extract (); 00312 //convert blob 00313 newblob = new BLOBNBOX (blob); 00314 //add to list 00315 port_box_it.add_after_then_move (newblob); 00316 //convert blob 00317 } 00318 port_block_it.add_after_then_move (port_block); 00319 } 00320 }
/**
 * Sort function.
 *
 * Comparator over two BLOBNBOX pointers passed by address. Blobs are
 * ordered by the bottom edge of their bounding box, larger bottom first;
 * blobs with equal bottoms are ordered by left edge, smaller left first.
 *
 * @param item1 Address of the first BLOBNBOX pointer
 * @param item2 Address of the second BLOBNBOX pointer
 * @return -1 if item1 sorts first, 1 if item2 sorts first, 0 if both the
 *         bottom and the left edges are equal
 */
INT32 blob_y_order(void *item1, void *item2) {
  BLOBNBOX *first = *static_cast<BLOBNBOX **>(item1);
  BLOBNBOX *second = *static_cast<BLOBNBOX **>(item2);

  //primary key: bottom edge, descending
  if (first->bounding_box ().bottom () > second->bounding_box ().bottom ())
    return -1;
  if (first->bounding_box ().bottom () < second->bounding_box ().bottom ())
    return 1;

  //secondary key: left edge, ascending
  if (first->bounding_box ().left () < second->bounding_box ().left ())
    return -1;
  if (first->bounding_box ().left () > second->bounding_box ().left ())
    return 1;

  return 0;
}
Remove empties.
Move blobs of words from rows of garbage into the reject blobs list.
Definition at line 653 of file tordmain.cpp.
References ROW::base_line(), BOX::bottom(), C_BLOB::bounding_box(), WERD::cblob_list(), C_BLOB::count_transitions(), FALSE, WERD::flag(), BOX::height(), BOX::left(), C_BLOB::out_list(), BOX::right(), BOX::top(), tprintf(), TRUE, W_DONT_CHOP, BOX::width(), ROW::word_list(), and ROW::x_height().
Referenced by cleanup_blocks().
00655 { 00656 BOOL8 testing_on; 00657 BOX blob_box; //bounding box 00658 C_BLOB *blob; //current blob 00659 C_OUTLINE *outline; //current outline 00660 WERD *word; //current word 00661 INT32 blob_size; //biggest size 00662 INT32 trans_count = 0; //no of transitions 00663 INT32 trans_threshold; //noise tolerance 00664 INT32 dot_count; //small objects 00665 INT32 norm_count; //normal objects 00666 INT32 super_norm_count; //real char-like 00667 //words of row 00668 WERD_IT word_it = row->word_list (); 00669 C_BLOB_IT blob_it; //blob iterator 00670 C_OUTLINE_IT out_it; //outline iterator 00671 00672 if (textord_test_y > row->base_line (textord_test_x) 00673 && textord_show_blobs 00674 && textord_test_y < row->base_line (textord_test_x) + row->x_height ()) 00675 testing_on = TRUE; 00676 else 00677 testing_on = FALSE; 00678 dot_count = 0; 00679 norm_count = 0; 00680 super_norm_count = 0; 00681 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00682 word = word_it.data (); //current word 00683 //blobs in word 00684 blob_it.set_to_list (word->cblob_list ()); 00685 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00686 blob_it.forward ()) { 00687 blob = blob_it.data (); 00688 if (!word->flag (W_DONT_CHOP)) { 00689 //get outlines 00690 out_it.set_to_list (blob->out_list ()); 00691 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00692 out_it.forward ()) { 00693 outline = out_it.data (); 00694 blob_box = outline->bounding_box (); 00695 blob_size = 00696 blob_box.width () > 00697 blob_box.height ()? blob_box.width () : blob_box. 
00698 height(); 00699 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00700 dot_count++; //count smal outlines 00701 if (!outline->child ()->empty () 00702 && blob_box.height () < 00703 (1 + textord_noise_syfract) * row->x_height () 00704 && blob_box.height () > 00705 (1 - textord_noise_syfract) * row->x_height () 00706 && blob_box.width () < 00707 (1 + textord_noise_sxfract) * row->x_height () 00708 && blob_box.width () > 00709 (1 - textord_noise_sxfract) * row->x_height ()) 00710 super_norm_count++; //count smal outlines 00711 } 00712 } 00713 else 00714 super_norm_count++; 00715 blob_box = blob->bounding_box (); 00716 blob_size = 00717 blob_box.width () > 00718 blob_box.height ()? blob_box.width () : blob_box.height (); 00719 if (blob_size >= textord_noise_sizelimit * row->x_height () 00720 && blob_size < row->x_height () * 2) { 00721 trans_threshold = blob_size / textord_noise_sizefraction; 00722 trans_count = blob->count_transitions (trans_threshold); 00723 if (trans_count < textord_noise_translimit) 00724 norm_count++; 00725 } 00726 else if (blob_box.height () > row->x_height () * 2 00727 && (!word_it.at_first () || !blob_it.at_first ())) 00728 dot_count += 2; 00729 #ifndef SECURE_NAMES 00730 if (testing_on) { 00731 tprintf 00732 ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", 00733 blob_box.left (), blob_box.bottom (), blob_box.right (), 00734 blob_box.top (), blob->out_list ()->length (), trans_count, 00735 blob_box.bottom () - row->base_line (blob_box.left ())); 00736 } 00737 #endif 00738 } 00739 } 00740 #ifndef SECURE_NAMES 00741 if (textord_noise_debug) { 00742 tprintf ("Row ending at (%d,%g):", 00743 blob_box.right (), row->base_line (blob_box.right ())); 00744 tprintf (" R=%g, dc=%d, nc=%d, %s\n", 00745 norm_count > 0 ? (float) dot_count / norm_count : 9999, 00746 dot_count, norm_count, 00747 dot_count > norm_count * textord_noise_normratio 00748 && dot_count > 2 ? 
"REJECTED" : "ACCEPTED"); 00749 } 00750 #endif 00751 return super_norm_count < textord_noise_sncount 00752 && dot_count > norm_count * textord_noise_rowratio && dot_count > 2; 00753 }
void clean_noise_from_words | ( | ROW * | row | ) |
Remove empties.
Move blobs of words from rows of garbage into the reject blobs list.
Definition at line 761 of file tordmain.cpp.
References alloc_mem(), C_BLOB::bounding_box(), WERD::cblob_list(), C_BLOB::count_transitions(), WERD::flag(), free_mem(), BOX::height(), C_BLOB::out_list(), WERD::rej_cblob_list(), W_DONT_CHOP, BOX::width(), ROW::word_list(), and ROW::x_height().
Referenced by cleanup_blocks().
00763 { 00764 BOX blob_box; //bounding box 00765 INT8 *word_dud; //was it chucked 00766 C_BLOB *blob; //current blob 00767 C_OUTLINE *outline; //current outline 00768 WERD *word; //current word 00769 INT32 blob_size; //biggest size 00770 INT32 trans_count; //no of transitions 00771 INT32 trans_threshold; //noise tolerance 00772 INT32 dot_count; //small objects 00773 INT32 norm_count; //normal objects 00774 INT32 dud_words; //number discarded 00775 INT32 ok_words; //number remaining 00776 INT32 word_index; //current word 00777 //words of row 00778 WERD_IT word_it = row->word_list (); 00779 C_BLOB_IT blob_it; //blob iterator 00780 C_OUTLINE_IT out_it; //outline iterator 00781 00782 ok_words = word_it.length (); 00783 if (ok_words == 0) 00784 return; 00785 word_dud = (INT8 *) alloc_mem (ok_words * sizeof (INT8)); 00786 dud_words = 0; 00787 ok_words = 0; 00788 word_index = 0; 00789 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00790 word = word_it.data (); //current word 00791 dot_count = 0; 00792 norm_count = 0; 00793 //blobs in word 00794 blob_it.set_to_list (word->cblob_list ()); 00795 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00796 blob_it.forward ()) { 00797 blob = blob_it.data (); 00798 if (!word->flag (W_DONT_CHOP)) { 00799 //get outlines 00800 out_it.set_to_list (blob->out_list ()); 00801 for (out_it.mark_cycle_pt (); !out_it.cycled_list (); 00802 out_it.forward ()) { 00803 outline = out_it.data (); 00804 blob_box = outline->bounding_box (); 00805 blob_size = 00806 blob_box.width () > 00807 blob_box.height ()? blob_box.width () : blob_box. 
00808 height(); 00809 if (blob_size < textord_noise_sizelimit * row->x_height ()) 00810 dot_count++; //count smal outlines 00811 if (!outline->child ()->empty () 00812 && blob_box.height () < 00813 (1 + textord_noise_syfract) * row->x_height () 00814 && blob_box.height () > 00815 (1 - textord_noise_syfract) * row->x_height () 00816 && blob_box.width () < 00817 (1 + textord_noise_sxfract) * row->x_height () 00818 && blob_box.width () > 00819 (1 - textord_noise_sxfract) * row->x_height ()) 00820 norm_count++; //count smal outlines 00821 } 00822 } 00823 else 00824 norm_count++; 00825 blob_box = blob->bounding_box (); 00826 blob_size = 00827 blob_box.width () > 00828 blob_box.height ()? blob_box.width () : blob_box.height (); 00829 if (blob_size >= textord_noise_sizelimit * row->x_height () 00830 && blob_size < row->x_height () * 2) { 00831 trans_threshold = blob_size / textord_noise_sizefraction; 00832 trans_count = blob->count_transitions (trans_threshold); 00833 if (trans_count < textord_noise_translimit) 00834 norm_count++; 00835 } 00836 else if (blob_box.height () > row->x_height () * 2 00837 && (!word_it.at_first () || !blob_it.at_first ())) 00838 dot_count += 2; 00839 } 00840 if (dot_count > 2) { 00841 if (dot_count > norm_count * textord_noise_normratio * 2) 00842 word_dud[word_index] = 2; 00843 else if (dot_count > norm_count * textord_noise_normratio) 00844 word_dud[word_index] = 1; 00845 else 00846 word_dud[word_index] = 0; 00847 } 00848 else 00849 word_dud[word_index] = 0; 00850 if (word_dud[word_index] == 2) 00851 dud_words++; 00852 else 00853 ok_words++; 00854 word_index++; 00855 } 00856 00857 word_index = 0; 00858 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00859 if (word_dud[word_index] == 2 00860 || word_dud[word_index] == 1 && dud_words > ok_words) { 00861 word = word_it.data (); //current word 00862 //rejected blobs 00863 blob_it.set_to_list (word->rej_cblob_list ()); 00864 //move from blobs 00865 
blob_it.add_list_after (word->cblob_list ()); 00866 } 00867 word_index++; 00868 } 00869 free_mem(word_dud); 00870 }
void cleanup_blocks | ( | BLOCK_LIST * | blocks | ) |
Remove empties.
Definition at line 619 of file tordmain.cpp.
References clean_noise_from_row(), clean_noise_from_words(), and tweak_row_baseline().
Referenced by textord_page().
00621 { 00622 BLOCK_IT block_it = blocks; //iterator 00623 ROW_IT row_it; //row iterator 00624 00625 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00626 block_it.forward ()) { 00627 row_it.set_to_list (block_it.data ()->row_list ()); 00628 for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { 00629 if (textord_noise_rejrows 00630 && !row_it.data ()->word_list ()->empty () 00631 && clean_noise_from_row (row_it.data ()) 00632 || row_it.data ()->word_list ()->empty ()) 00633 delete row_it.extract ();//lose empty row 00634 else { 00635 if (textord_noise_rejwords) 00636 clean_noise_from_words (row_it.data ()); 00637 if (textord_blshift_maxshift >= 0) 00638 tweak_row_baseline (row_it.data ()); 00639 } 00640 } 00641 if (block_it.data ()->row_list ()->empty ()) { 00642 delete block_it.extract ();//lose empty block 00643 } 00644 } 00645 }
void edges_and_textord | ( | const char * | filename, | |
BLOCK_LIST * | blocks | |||
) |
Read a .pb file.
filename | Filename of .pb (source image) | |
blocks | Pointer to LIST of blocks found (modified) |
A ".pb file" seems to be a synonym for the image passed to tesseract on the command-line because this IS the main entry-point for textord (either that or the comment is wrong :-)
Definition at line 173 of file tordmain.cpp.
References assign_blobs_to_blocks2(), CANTOPENFILE, ERRCODE::error(), EXIT, extract_edges(), filter_blobs(), IMAGE::get_bpp(), IMAGE::get_xsize(), IMAGE::get_ysize(), global_monitor, invert_image(), LOC_ADAPTIVE, LOC_EDGE_PROG, NO_WINDOW, NULL, ETEXT_DESC::ocr_alive, page_image, previous_cpu, ETEXT_DESC::progress, IMAGE::read(), IMAGE::read_header(), read_pd_file(), READFAILED, set_global_loc_code(), STRING::string(), textord_page(), BOX::topright(), TRUE, and IMAGE::white_high().
Referenced by pgeditor_read_file().
00175 { 00176 BLOCK *block; //current block 00177 char *lastdot; //of name 00178 STRING name = filename; //truncated name 00179 ICOORD page_tr; 00180 BOX page_box; //bounding_box 00181 PDBLK_CLIST pd_blocks; //copy of list 00182 BLOCK_IT block_it = blocks; //iterator 00183 PDBLK_C_IT pd_it = &pd_blocks; //iterator 00184 //different orientations 00185 TO_BLOCK_LIST land_blocks, port_blocks; 00186 IMAGE thresh_image; //thresholded 00187 00188 lastdot = strrchr (name.string (), '.'); 00189 if (lastdot != NULL) 00190 *lastdot = '\0'; 00191 if (page_image.get_bpp () == 0) { 00192 name += tessedit_image_ext; 00193 if (page_image.read_header (name.string ())) 00194 CANTOPENFILE.error ("edges_and_textord", EXIT, name.string ()); 00195 if (page_image.read (0)) 00196 READFAILED.error ("edges_and_textord", EXIT, name.string ()); 00197 name = filename; 00198 lastdot = strrchr (name.string (), '.'); 00199 if (lastdot != NULL) 00200 *lastdot = '\0'; 00201 } 00202 page_tr = ICOORD (page_image.get_xsize (), page_image.get_ysize ()); 00203 read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (), 00204 blocks); 00205 block_it.set_to_list (blocks); 00206 if (global_monitor != NULL) 00207 global_monitor->ocr_alive = TRUE; 00208 00209 if (page_image.get_bpp () > 1) { 00210 set_global_loc_code(LOC_ADAPTIVE); 00211 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00212 block_it.forward ()) { 00213 block = block_it.data (); 00214 pd_it.add_after_then_move (block); 00215 } 00216 // adaptive_threshold(&page_image,&pd_blocks,&thresh_image); 00217 set_global_loc_code(LOC_EDGE_PROG); 00218 #ifndef EMBEDDED 00219 previous_cpu = clock (); 00220 #endif 00221 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00222 block_it.forward ()) { 00223 block = block_it.data (); 00224 if (!polygon_tess_approximation) 00225 invert_image(&page_image); 00226 #ifndef GRAPHICS_DISABLED 00227 extract_edges(NO_WINDOW, &page_image, &thresh_image, page_tr, block); 00228 #else 00229 
extract_edges(&page_image, &thresh_image, page_tr, block); 00230 #endif 00231 page_box += block->bounding_box (); 00232 } 00233 page_image = thresh_image; //everyone else gets it 00234 } 00235 else { 00236 set_global_loc_code(LOC_EDGE_PROG); 00237 if (!page_image.white_high ()) 00238 invert_image(&page_image); 00239 00240 #ifndef EMBEDDED 00241 previous_cpu = clock (); 00242 #endif 00243 00244 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00245 block_it.forward ()) { 00246 block = block_it.data (); 00247 #ifndef GRAPHICS_DISABLED 00248 extract_edges(NO_WINDOW, &page_image, &page_image, page_tr, block); 00249 #else 00250 extract_edges(&page_image, &page_image, page_tr, block); 00251 #endif 00252 page_box += block->bounding_box (); 00253 } 00254 } 00255 if (global_monitor != NULL) { 00256 global_monitor->ocr_alive = TRUE; 00257 global_monitor->progress = 10; 00258 } 00259 00260 assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks); 00261 if (global_monitor != NULL) 00262 global_monitor->ocr_alive = TRUE; 00263 filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape); 00264 #ifndef EMBEDDED 00265 previous_cpu = clock (); 00266 #endif 00267 filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape); 00268 if (global_monitor != NULL) 00269 global_monitor->ocr_alive = TRUE; 00270 textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks); 00271 }//edges_and_textord
Split into groups.
page_tr | Top right | |
blocks | Output list | |
testing_on | For plotting |
(textord_merge_desc + textord_merge_x + 2*textord_merge_asc) block->line_size * ------------------------------------------------------------ textord_merge_x
Definition at line 338 of file tordmain.cpp.
References BLUE, BROWN, CORAL, create_to_win(), DARK_GREEN, filter_noise_blobs(), GOLDENROD, NO_WINDOW, plot_blob_list(), plot_box_list(), to_win, WHITE, and YELLOW.
Referenced by edges_and_textord(), and read_and_textord().
00342 { 00343 TO_BLOCK_IT block_it = blocks; //destination iterator 00344 TO_BLOCK *block; //created block 00345 00346 for (block_it.mark_cycle_pt (); !block_it.cycled_list (); 00347 block_it.forward ()) { 00348 block = block_it.data (); 00349 block->line_size = filter_noise_blobs (&block->blobs, 00350 &block->noise_blobs, 00351 &block->small_blobs, 00352 &block->large_blobs); 00353 block->line_spacing = 00354 block->line_size * (textord_merge_desc + textord_merge_x + 00355 textord_merge_asc + 00356 textord_merge_asc) / textord_merge_x; 00357 block->line_size *= textord_min_linesize; 00358 block->max_blob_size = block->line_size * textord_excess_blobsize; 00359 #ifndef GRAPHICS_DISABLED 00360 if (textord_show_blobs && testing_on) { 00361 if (to_win == NO_WINDOW) 00362 create_to_win(page_tr); 00363 plot_blob_list (to_win, &block->noise_blobs, CORAL, BLUE); 00364 plot_blob_list (to_win, &block->small_blobs, GOLDENROD, YELLOW); 00365 plot_blob_list (to_win, &block->large_blobs, DARK_GREEN, YELLOW); 00366 plot_blob_list (to_win, &block->blobs, WHITE, BROWN); 00367 } 00368 if (textord_show_boxes && testing_on) { 00369 if (to_win == NO_WINDOW) 00370 create_to_win(page_tr); 00371 plot_box_list (to_win, &block->noise_blobs, WHITE); 00372 plot_box_list (to_win, &block->small_blobs, WHITE); 00373 plot_box_list (to_win, &block->large_blobs, WHITE); 00374 plot_box_list (to_win, &block->blobs, WHITE); 00375 } 00376 #endif 00377 } 00378 }
float filter_noise_blobs | ( | BLOBNBOX_LIST * | src_list, | |
BLOBNBOX_LIST * | noise_list, | |||
BLOBNBOX_LIST * | small_list, | |||
BLOBNBOX_LIST * | large_list | |||
) |
Separate noise.
src_list | Origonal list | |
noise_list | Noise list | |
small_list | Small blobs | |
large_list | Large blobs |
textord_new_initial_xheight is TRUE by default so processing is ALWAYS done by filter_noise_blobs2().
Definition at line 404 of file tordmain.cpp.
References STATS::add(), filter_noise_blobs2(), and STATS::ile().
Referenced by filter_blobs().
00409 { 00410 INT16 height; //height of blob 00411 INT16 width; //of blob 00412 BLOBNBOX_IT src_it = src_list; //iterators 00413 BLOBNBOX_IT noise_it = noise_list; 00414 BLOBNBOX_IT small_it = small_list; 00415 BLOBNBOX_IT large_it = large_list; 00416 STATS size_stats (0, MAX_NEAREST_DIST); 00417 //blob heights 00418 if (textord_new_initial_xheight) { 00419 return filter_noise_blobs2 (src_list, noise_list, small_list, large_list); 00420 } 00421 float min_y; //size limits 00422 float max_y; 00423 float max_x; 00424 00425 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00426 if (src_it.data ()->bounding_box ().height () < textord_max_noise_size) { 00427 noise_it.add_after_then_move (src_it.extract ()); 00428 } 00429 } 00430 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00431 size_stats.add (src_it.data ()->bounding_box ().height (), 1); 00432 } 00433 min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0)); 00434 max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0)); 00435 max_x = ceil (size_stats.ile (0.5) * textord_width_limit); 00436 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00437 height = src_it.data ()->bounding_box ().height (); 00438 width = src_it.data ()->bounding_box ().width (); 00439 if (height < min_y) { 00440 small_it.add_after_then_move (src_it.extract ()); 00441 } 00442 else if (height > max_y || width > max_x) { 00443 large_it.add_after_then_move (src_it.extract ()); 00444 } 00445 } 00446 return size_stats.ile (textord_initialx_ile); 00447 }
float filter_noise_blobs2 | ( | BLOBNBOX_LIST * | src_list, | |
BLOBNBOX_LIST * | noise_list, | |||
BLOBNBOX_LIST * | small_list, | |||
BLOBNBOX_LIST * | large_list | |||
) |
Separate noise.
src_list | Origonal list | |
noise_list | Noise list | |
small_list | Small blobs | |
large_list | Large blobs |
Definition at line 498 of file tordmain.cpp.
References STATS::add(), STATS::clear(), and STATS::ile().
Referenced by filter_noise_blobs().
00503 { 00504 INT16 height; //height of blob 00505 INT16 width; //of blob 00506 BLOBNBOX *blob; //current blob 00507 float initial_x; //first guess 00508 BLOBNBOX_IT src_it = src_list; //iterators 00509 BLOBNBOX_IT noise_it = noise_list; 00510 BLOBNBOX_IT small_it = small_list; 00511 BLOBNBOX_IT large_it = large_list; 00512 STATS size_stats (0, MAX_NEAREST_DIST); 00513 //blob heights 00514 float min_y; //size limits 00515 float max_y; 00516 float max_x; 00517 float max_height; //of good blobs 00518 00519 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00520 blob = src_it.data (); 00521 if (blob->bounding_box ().height () < textord_max_noise_size) 00522 noise_it.add_after_then_move (src_it.extract ()); 00523 else if (blob->enclosed_area () >= blob->bounding_box ().height () 00524 * blob->bounding_box ().width () * textord_noise_area_ratio) 00525 small_it.add_after_then_move (src_it.extract ()); 00526 } 00527 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00528 size_stats.add (src_it.data ()->bounding_box ().height (), 1); 00529 } 00530 initial_x = size_stats.ile (textord_initialx_ile); 00531 max_y = 00532 ceil (initial_x * 00533 (textord_merge_desc + textord_merge_x + 00534 2 * textord_merge_asc) / textord_merge_x); 00535 min_y = floor (initial_x / 2); 00536 max_x = ceil (initial_x * textord_width_limit); 00537 small_it.move_to_first (); 00538 for (small_it.mark_cycle_pt (); !small_it.cycled_list (); 00539 small_it.forward ()) { 00540 height = small_it.data ()->bounding_box ().height (); 00541 if (height >= min_y) 00542 large_it.add_after_then_move (small_it.extract ()); 00543 } 00544 size_stats.clear (); 00545 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00546 height = src_it.data ()->bounding_box ().height (); 00547 width = src_it.data ()->bounding_box ().width (); 00548 if (height < min_y) { 00549 small_it.add_after_then_move (src_it.extract ()); 00550 } 00551 else if (height > 
max_y || width > max_x) { 00552 large_it.add_after_then_move (src_it.extract ()); 00553 } 00554 else { 00555 size_stats.add (height, 1); 00556 } 00557 } 00558 max_height = size_stats.ile (textord_initialasc_ile); 00559 // printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,", 00560 // max_y,min_y,initial_x,max_height); 00561 max_height *= textord_merge_x / (textord_merge_x + textord_merge_asc); 00562 if (max_height > initial_x) 00563 initial_x = max_height; 00564 // printf(" ret=%g\n",initial_x); 00565 return initial_x; 00566 }
void read_and_textord | ( | const char * | filename, | |
BLOCK_LIST * | blocks | |||
) |
Read a .pb file.
filename | Filename of .pb (source image) | |
blocks | Pointer to LIST of blocks found (modified) |
See edges_and_textord() for main 'usual' entry-point into textord... THIS seems to be for testing/development only.
Definition at line 126 of file tordmain.cpp.
References assign_blobs_to_blocks2(), CANTOPENFILE, ERRCODE::error(), EXIT, filter_blobs(), NULL, textord_page(), and BOX::topright().
Referenced by pgeditor_read_file().
00128 { 00129 int c; //input character 00130 FILE *infp; //input file 00131 BLOCK *block; //current block 00132 BOX page_box; //bounding_box 00133 BLOCK_IT block_it = blocks; //iterator 00134 //different orientations 00135 TO_BLOCK_LIST land_blocks, port_blocks; 00136 00137 infp = fopen (filename, "r"); 00138 if (infp == NULL) 00139 CANTOPENFILE.error ("read_and_textord", EXIT, filename); 00140 00141 while (((c = fgetc (infp)) != EOF) && (ungetc (c, infp) != EOF)) { 00142 //get one 00143 block = BLOCK::de_serialise (infp); 00144 //add to list 00145 block_it.add_after_then_move (block); 00146 //find page size 00147 page_box += block->bounding_box (); 00148 } 00149 fclose(infp); 00150 00151 assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks); 00152 filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape); 00153 filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape); 00154 textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks); 00155 }
void textord_page | ( | ICOORD | page_tr, | |
BLOCK_LIST * | blocks, | |||
TO_BLOCK_LIST * | land_blocks, | |||
TO_BLOCK_LIST * | port_blocks | |||
) |
Make rows & words.
page_tr | Top right | |
blocks | Block list | |
land_blocks | Rotated for landscape | |
port_blocks | Output list |
Definition at line 582 of file tordmain.cpp.
References cleanup_blocks(), close_to_win(), global_monitor, LOC_TEXT_ORD_ROWS, LOC_TEXT_ORD_WORDS, make_rows(), make_words(), NULL, ETEXT_DESC::ocr_alive, ETEXT_DESC::progress, set_global_loc_code(), and TRUE.
Referenced by edges_and_textord(), and read_and_textord().
00587 { 00588 float gradient; //global skew 00589 00590 set_global_loc_code(LOC_TEXT_ORD_ROWS); 00591 gradient = make_rows (page_tr, blocks, land_blocks, port_blocks); 00592 if (global_monitor != NULL) { 00593 global_monitor->ocr_alive = TRUE; 00594 global_monitor->progress = 20; 00595 } 00596 set_global_loc_code(LOC_TEXT_ORD_WORDS); 00597 make_words(page_tr, gradient, blocks, land_blocks, port_blocks); 00598 if (global_monitor != NULL) { 00599 global_monitor->ocr_alive = TRUE; 00600 global_monitor->progress = 30; 00601 } 00602 cleanup_blocks(blocks); //remove empties 00603 #ifndef GRAPHICS_DISABLED 00604 close_to_win(); 00605 #endif 00606 if (textord_exit_after && !interactive_mode) 00607 exit (0); 00608 }
void tweak_row_baseline | ( | ROW * | row | ) |
Remove empties.
Shift baseline to fit the blobs more accurately where they are close enough.
Definition at line 886 of file tordmain.cpp.
References QUAD_COEFFS::a, alloc_mem(), QUAD_COEFFS::b, ROW::base_line(), ROW::baseline, baseline, blob_count, BOX::bottom(), C_BLOB::bounding_box(), QUAD_COEFFS::c, WERD::cblob_list(), free_mem(), BOX::height(), BOX::left(), QSPLINE::quadratics, BOX::right(), QSPLINE::segments, ROW::word_list(), ROW::x_height(), and QSPLINE::xcoords.
Referenced by cleanup_blocks().
00888 { 00889 BOX blob_box; //bounding box 00890 C_BLOB *blob; //current blob 00891 WERD *word; //current word 00892 INT32 blob_count; //no of blobs 00893 INT32 src_index; //source segment 00894 INT32 dest_index; //destination segment 00895 INT32 *xstarts; //spline segments 00896 double *coeffs; //spline coeffs 00897 float ydiff; //baseline error 00898 float x_centre; //centre of blob 00899 //words of row 00900 WERD_IT word_it = row->word_list (); 00901 C_BLOB_IT blob_it; //blob iterator 00902 00903 blob_count = 0; 00904 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00905 word = word_it.data (); //current word 00906 //get total blobs 00907 blob_count += word->cblob_list ()->length (); 00908 } 00909 if (blob_count == 0) 00910 return; 00911 xstarts = 00912 (INT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) * 00913 sizeof (INT32)); 00914 coeffs = 00915 (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 * 00916 sizeof (double)); 00917 00918 src_index = 0; 00919 dest_index = 0; 00920 xstarts[0] = row->baseline.xcoords[0]; 00921 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00922 word = word_it.data (); //current word 00923 //blobs in word 00924 blob_it.set_to_list (word->cblob_list ()); 00925 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); 00926 blob_it.forward ()) { 00927 blob = blob_it.data (); 00928 blob_box = blob->bounding_box (); 00929 x_centre = (blob_box.left () + blob_box.right ()) / 2.0; 00930 ydiff = blob_box.bottom () - row->base_line (x_centre); 00931 if (ydiff < 0) 00932 ydiff = -ydiff / row->x_height (); 00933 else 00934 ydiff = ydiff / row->x_height (); 00935 if (ydiff < textord_blshift_maxshift 00936 && blob_box.height () / row->x_height () > 00937 textord_blshift_xfraction) { 00938 if (xstarts[dest_index] >= x_centre) 00939 xstarts[dest_index] = blob_box.left (); 00940 coeffs[dest_index * 3] = 0; 00941 coeffs[dest_index * 3 + 1] = 0; 00942 coeffs[dest_index 
* 3 + 2] = blob_box.bottom (); 00943 //shift it 00944 dest_index++; 00945 xstarts[dest_index] = blob_box.right () + 1; 00946 } 00947 else { 00948 if (xstarts[dest_index] <= x_centre) { 00949 while (row->baseline.xcoords[src_index + 1] <= x_centre 00950 && src_index < row->baseline.segments - 1) { 00951 if (row->baseline.xcoords[src_index + 1] > 00952 xstarts[dest_index]) { 00953 coeffs[dest_index * 3] = 00954 row->baseline.quadratics[src_index].a; 00955 coeffs[dest_index * 3 + 1] = 00956 row->baseline.quadratics[src_index].b; 00957 coeffs[dest_index * 3 + 2] = 00958 row->baseline.quadratics[src_index].c; 00959 dest_index++; 00960 xstarts[dest_index] = 00961 row->baseline.xcoords[src_index + 1]; 00962 } 00963 src_index++; 00964 } 00965 coeffs[dest_index * 3] = 00966 row->baseline.quadratics[src_index].a; 00967 coeffs[dest_index * 3 + 1] = 00968 row->baseline.quadratics[src_index].b; 00969 coeffs[dest_index * 3 + 2] = 00970 row->baseline.quadratics[src_index].c; 00971 dest_index++; 00972 xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; 00973 } 00974 } 00975 } 00976 } 00977 while (src_index < row->baseline.segments 00978 && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) 00979 src_index++; 00980 while (src_index < row->baseline.segments) { 00981 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; 00982 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; 00983 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; 00984 dest_index++; 00985 src_index++; 00986 xstarts[dest_index] = row->baseline.xcoords[src_index]; 00987 } 00988 //turn to spline 00989 row->baseline = QSPLINE (dest_index, xstarts, coeffs); 00990 free_mem(xstarts); 00991 free_mem(coeffs); 00992 }
const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block" |
(C) Copyright 1992, Hewlett-Packard Ltd. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 . Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Definition at line 47 of file tordmain.cpp.