textord/tordmain.cpp File Reference

#include "mfcpch.h"
#include "stderr.h"
#include "globaloc.h"
#include "tessout.h"
#include "blread.h"
#include "blobbox.h"
#include "edgblob.h"
#include "drawtord.h"
#include "makerow.h"
#include "wordseg.h"
#include "ocrclass.h"
#include "genblob.h"
#include "imgs.h"
#include "tordmain.h"
#include "secname.h"

Go to the source code of this file.

Defines

Functions

Variables


Define Documentation

#define EXTERN

Definition at line 53 of file tordmain.cpp.


Function Documentation

void assign_blobs_to_blocks2 ( BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  land_blocks,
TO_BLOCK_LIST *  port_blocks 
)

Split into groups.

Parameters:
blocks Blocks to process
land_blocks Rotated for landscape
port_blocks Output list
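Make a list of TO_BLOCKs for portrait orientation: one TO_BLOCK is created per BLOCK, filled with BLOBNBOXes converted from that block's blobs, and appended to port_blocks. The listing below never touches land_blocks, so no landscape list is built here.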

Definition at line 286 of file tordmain.cpp.

References newblob().

Referenced by edges_and_textord(), and read_and_textord().

00290                               { //NOTE: land_blocks is never used in this body
00291   BLOCK *block;                  //current block
00292   BLOBNBOX *newblob;             //created blob
00293   C_BLOB *blob;                  //current blob
00294   BLOCK_IT block_it = blocks;    //source block iterator
00295   C_BLOB_IT blob_it;             //iterator over the current block's blobs
00296   BLOBNBOX_IT port_box_it;       //iterator over the new block's blobs
00297                                  //destination iterator
00298   TO_BLOCK_IT port_block_it = port_blocks;
00299   TO_BLOCK *port_block;          //created block
00300 
00301   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00302   block_it.forward ()) {
00303     block = block_it.data ();
00304     blob_it.set_to_list (block->blob_list ());
00305                                  //make one TO_BLOCK per BLOCK
00306     port_block = new TO_BLOCK (block);
00307                                  //point the iterator at its blob list
00308     port_box_it.set_to_list (&port_block->blobs);
00309     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00310     blob_it.forward ()) {
00311       blob = blob_it.extract (); //take the blob out of the source block
00312                                  //convert blob
00313       newblob = new BLOBNBOX (blob);
00314                                  //add to list
00315       port_box_it.add_after_then_move (newblob);
00316                                  //next blob
00317     }
00318     port_block_it.add_after_then_move (port_block); //append to output list
00319   }
00320 }

INT32 blob_y_order ( void *  item1,
void *  item2 
)

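Sort function: compares two BLOBNBOXes by the bottom of their bounding boxes (the higher bottom sorts first) and, when the bottoms are equal, by the left edge (the leftmost sorts first).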

Note:
NOT used anywhere in present code (V1.03)

Definition at line 1000 of file tordmain.cpp.

01002                                 {
01003                                  //item1 points to a BLOBNBOX pointer
01004   BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
01005                                  //item2 points to a BLOBNBOX pointer
01006   BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
01007 
01008   if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
01009     return -1;                   //higher bottom sorts first
01010   else if (blob1->bounding_box ().bottom () <
01011     blob2->bounding_box ().bottom ())
01012     return 1;
01013   else {                         //same bottom: order by left edge
01014     if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
01015       return -1;                 //leftmost sorts first
01016     else if (blob1->bounding_box ().left () >
01017       blob2->bounding_box ().left ())
01018       return 1;
01019     else
01020       return 0;                  //same bottom and same left
01021   }
01022 }

BOOL8 clean_noise_from_row ( ROW *  row  ) 

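Test whether a row is noise.

Count the dot-like, normal and character-like outlines and blobs in the words of the row, and return TRUE if the row looks like garbage. This function does not move or delete any blobs itself; the caller, cleanup_blocks(), deletes the row when TRUE is returned.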

Definition at line 653 of file tordmain.cpp.

References ROW::base_line(), BOX::bottom(), C_BLOB::bounding_box(), WERD::cblob_list(), C_BLOB::count_transitions(), FALSE, WERD::flag(), BOX::height(), BOX::left(), C_BLOB::out_list(), BOX::right(), BOX::top(), tprintf(), TRUE, W_DONT_CHOP, BOX::width(), ROW::word_list(), and ROW::x_height().

Referenced by cleanup_blocks().

00655                             {
00656   BOOL8 testing_on;              //print per-blob debug for this row
00657   BOX blob_box;                  //bounding box
00658   C_BLOB *blob;                  //current blob
00659   C_OUTLINE *outline;            //current outline
00660   WERD *word;                    //current word
00661   INT32 blob_size;               //biggest size
00662   INT32 trans_count = 0;         //no of transitions
00663   INT32 trans_threshold;         //noise tolerance
00664   INT32 dot_count;               //small objects
00665   INT32 norm_count;              //normal objects
00666   INT32 super_norm_count;        //real char-like
00667                                  //words of row
00668   WERD_IT word_it = row->word_list ();
00669   C_BLOB_IT blob_it;             //blob iterator
00670   C_OUTLINE_IT out_it;           //outline iterator
00671 
00672   if (textord_test_y > row->base_line (textord_test_x)
00673     && textord_show_blobs
00674     && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
00675     testing_on = TRUE;           //test point lies between baseline and x-height
00676   else
00677     testing_on = FALSE;
00678   dot_count = 0;
00679   norm_count = 0;
00680   super_norm_count = 0;
00681   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00682     word = word_it.data ();      //current word
00683                                  //blobs in word
00684     blob_it.set_to_list (word->cblob_list ());
00685     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00686     blob_it.forward ()) {
00687       blob = blob_it.data ();
00688       if (!word->flag (W_DONT_CHOP)) {
00689                                  //get outlines
00690         out_it.set_to_list (blob->out_list ());
00691         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00692         out_it.forward ()) {
00693           outline = out_it.data ();
00694           blob_box = outline->bounding_box ();
00695           blob_size =
00696             blob_box.width () >
00697             blob_box.height ()? blob_box.width () : blob_box.
00698             height();
00699           if (blob_size < textord_noise_sizelimit * row->x_height ())
00700             dot_count++;         //count small outlines
00701           if (!outline->child ()->empty ()
00702             && blob_box.height () <
00703             (1 + textord_noise_syfract) * row->x_height ()
00704             && blob_box.height () >
00705             (1 - textord_noise_syfract) * row->x_height ()
00706             && blob_box.width () <
00707             (1 + textord_noise_sxfract) * row->x_height ()
00708             && blob_box.width () >
00709             (1 - textord_noise_sxfract) * row->x_height ())
00710             super_norm_count++;  //count x-height-sized outlines with children
00711         }
00712       }
00713       else
00714         super_norm_count++;      //blob of a W_DONT_CHOP word counts as char-like
00715       blob_box = blob->bounding_box ();
00716       blob_size =
00717         blob_box.width () >
00718         blob_box.height ()? blob_box.width () : blob_box.height ();
00719       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00720       && blob_size < row->x_height () * 2) {
00721         trans_threshold = blob_size / textord_noise_sizefraction;
00722         trans_count = blob->count_transitions (trans_threshold);
00723         if (trans_count < textord_noise_translimit)
00724           norm_count++;          //mid-sized blob with few transitions
00725       }
00726       else if (blob_box.height () > row->x_height () * 2
00727         && (!word_it.at_first () || !blob_it.at_first ()))
00728         dot_count += 2;          //over-tall blob, unless first blob of first word
00729       #ifndef SECURE_NAMES
00730       if (testing_on) {
00731         tprintf
00732           ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
00733           blob_box.left (), blob_box.bottom (), blob_box.right (),
00734           blob_box.top (), blob->out_list ()->length (), trans_count,
00735           blob_box.bottom () - row->base_line (blob_box.left ()));
00736       }
00737       #endif
00738     }
00739   }
00740   #ifndef SECURE_NAMES
00741   if (textord_noise_debug) {
00742     tprintf ("Row ending at (%d,%g):",
00743       blob_box.right (), row->base_line (blob_box.right ())); //blob_box = last blob examined
00744     tprintf (" R=%g, dc=%d, nc=%d, %s\n",
00745       norm_count > 0 ? (float) dot_count / norm_count : 9999,
00746       dot_count, norm_count,
00747       dot_count > norm_count * textord_noise_normratio
00748       && dot_count > 2 ? "REJECTED" : "ACCEPTED"); //NOTE: uses normratio; the return below uses rowratio and sncount
00749   }
00750   #endif
00751   return super_norm_count < textord_noise_sncount //TRUE = row is noise
00752     && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
00753 }

void clean_noise_from_words ( ROW *  row  ) 

Reject noisy words.

Classify each word of the row by its counts of dot-like and normal blobs, then move the blobs of the words judged to be garbage into the word's reject blobs list.

Definition at line 761 of file tordmain.cpp.

References alloc_mem(), C_BLOB::bounding_box(), WERD::cblob_list(), C_BLOB::count_transitions(), WERD::flag(), free_mem(), BOX::height(), C_BLOB::out_list(), WERD::rej_cblob_list(), W_DONT_CHOP, BOX::width(), ROW::word_list(), and ROW::x_height().

Referenced by cleanup_blocks().

00763                              {
00764   BOX blob_box;                  //bounding box
00765   INT8 *word_dud;                //per-word verdict: 0=ok, 1=borderline, 2=dud
00766   C_BLOB *blob;                  //current blob
00767   C_OUTLINE *outline;            //current outline
00768   WERD *word;                    //current word
00769   INT32 blob_size;               //biggest size
00770   INT32 trans_count;             //no of transitions
00771   INT32 trans_threshold;         //noise tolerance
00772   INT32 dot_count;               //small objects
00773   INT32 norm_count;              //normal objects
00774   INT32 dud_words;               //number discarded
00775   INT32 ok_words;                //number remaining
00776   INT32 word_index;              //current word
00777                                  //words of row
00778   WERD_IT word_it = row->word_list ();
00779   C_BLOB_IT blob_it;             //blob iterator
00780   C_OUTLINE_IT out_it;           //outline iterator
00781 
00782   ok_words = word_it.length (); //total words; sizes the word_dud array
00783   if (ok_words == 0)
00784     return;
00785   word_dud = (INT8 *) alloc_mem (ok_words * sizeof (INT8));
00786   dud_words = 0;
00787   ok_words = 0;                  //recounted below as words not marked 2
00788   word_index = 0;
00789   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00790     word = word_it.data ();      //current word
00791     dot_count = 0;
00792     norm_count = 0;
00793                                  //blobs in word
00794     blob_it.set_to_list (word->cblob_list ());
00795     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00796     blob_it.forward ()) {
00797       blob = blob_it.data ();
00798       if (!word->flag (W_DONT_CHOP)) {
00799                                  //get outlines
00800         out_it.set_to_list (blob->out_list ());
00801         for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
00802         out_it.forward ()) {
00803           outline = out_it.data ();
00804           blob_box = outline->bounding_box ();
00805           blob_size =
00806             blob_box.width () >
00807             blob_box.height ()? blob_box.width () : blob_box.
00808             height();
00809           if (blob_size < textord_noise_sizelimit * row->x_height ())
00810             dot_count++;         //count small outlines
00811           if (!outline->child ()->empty ()
00812             && blob_box.height () <
00813             (1 + textord_noise_syfract) * row->x_height ()
00814             && blob_box.height () >
00815             (1 - textord_noise_syfract) * row->x_height ()
00816             && blob_box.width () <
00817             (1 + textord_noise_sxfract) * row->x_height ()
00818             && blob_box.width () >
00819             (1 - textord_noise_sxfract) * row->x_height ())
00820             norm_count++;        //count x-height-sized outlines with children
00821         }
00822       }
00823       else
00824         norm_count++;            //blob of a W_DONT_CHOP word counts as normal
00825       blob_box = blob->bounding_box ();
00826       blob_size =
00827         blob_box.width () >
00828         blob_box.height ()? blob_box.width () : blob_box.height ();
00829       if (blob_size >= textord_noise_sizelimit * row->x_height ()
00830       && blob_size < row->x_height () * 2) {
00831         trans_threshold = blob_size / textord_noise_sizefraction;
00832         trans_count = blob->count_transitions (trans_threshold);
00833         if (trans_count < textord_noise_translimit)
00834           norm_count++;          //mid-sized blob with few transitions
00835       }
00836       else if (blob_box.height () > row->x_height () * 2
00837         && (!word_it.at_first () || !blob_it.at_first ()))
00838         dot_count += 2;          //over-tall blob, unless first blob of first word
00839     }
00840     if (dot_count > 2) {
00841       if (dot_count > norm_count * textord_noise_normratio * 2)
00842         word_dud[word_index] = 2; //over twice the ratio: dud
00843       else if (dot_count > norm_count * textord_noise_normratio)
00844         word_dud[word_index] = 1; //over the ratio: borderline
00845       else
00846         word_dud[word_index] = 0;
00847     }
00848     else
00849       word_dud[word_index] = 0;  //too few dots to judge
00850     if (word_dud[word_index] == 2)
00851       dud_words++;
00852     else
00853       ok_words++;                //includes borderline words
00854     word_index++;
00855   }
00856 
00857   word_index = 0;
00858   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00859     if (word_dud[word_index] == 2
00860     || word_dud[word_index] == 1 && dud_words > ok_words) { //borderline goes only if duds outnumber the rest
00861       word = word_it.data ();    //current word
00862                                  //rejected blobs
00863       blob_it.set_to_list (word->rej_cblob_list ());
00864                                  //move from blobs
00865       blob_it.add_list_after (word->cblob_list ());
00866     }
00867     word_index++;
00868   }
00869   free_mem(word_dud);
00870 }

void cleanup_blocks ( BLOCK_LIST *  blocks  ) 

Remove empties.

Note:
Globals:
  • textord_noise_rejrows
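Delete empty blocks and rows from the page. When textord_noise_rejrows is set, rows that clean_noise_from_row() judges to be noise are deleted as well; surviving rows are optionally passed to clean_noise_from_words() and tweak_row_baseline().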

Definition at line 619 of file tordmain.cpp.

References clean_noise_from_row(), clean_noise_from_words(), and tweak_row_baseline().

Referenced by textord_page().

00621                      {
00622   BLOCK_IT block_it = blocks;    //iterator
00623   ROW_IT row_it;                 //row iterator
00624 
00625   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00626   block_it.forward ()) {
00627     row_it.set_to_list (block_it.data ()->row_list ());
00628     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00629       if (textord_noise_rejrows
00630         && !row_it.data ()->word_list ()->empty ()
00631         && clean_noise_from_row (row_it.data ())
00632         || row_it.data ()->word_list ()->empty ())
00633         delete row_it.extract ();//lose empty or noisy row
00634       else {
00635         if (textord_noise_rejwords)
00636           clean_noise_from_words (row_it.data ()); //reject noisy words in kept row
00637         if (textord_blshift_maxshift >= 0)
00638           tweak_row_baseline (row_it.data ()); //negative maxshift disables this
00639       }
00640     }
00641     if (block_it.data ()->row_list ()->empty ()) {
00642       delete block_it.extract ();//lose empty block
00643     }
00644   }
00645 }

void edges_and_textord ( const char *  filename,
BLOCK_LIST *  blocks 
)

Read a .pb file.

Parameters:
filename Filename of .pb (source image)
blocks Pointer to LIST of blocks found (modified)
Note:
Globals:
  • polygon_tess_approximation
Returns:
none
Read a file of blocks and blobs and textord them.

A ".pb file" seems to be a synonym for the image passed to tesseract on the command-line because this IS the main entry-point for textord (either that or the comment is wrong :-)

Definition at line 173 of file tordmain.cpp.

References assign_blobs_to_blocks2(), CANTOPENFILE, ERRCODE::error(), EXIT, extract_edges(), filter_blobs(), IMAGE::get_bpp(), IMAGE::get_xsize(), IMAGE::get_ysize(), global_monitor, invert_image(), LOC_ADAPTIVE, LOC_EDGE_PROG, NO_WINDOW, NULL, ETEXT_DESC::ocr_alive, page_image, previous_cpu, ETEXT_DESC::progress, IMAGE::read(), IMAGE::read_header(), read_pd_file(), READFAILED, set_global_loc_code(), STRING::string(), textord_page(), BOX::topright(), TRUE, and IMAGE::white_high().

Referenced by pgeditor_read_file().

00175                                            {
00176   BLOCK *block;                  //current block
00177   char *lastdot;                 //of name
00178   STRING name = filename;        //truncated name
00179   ICOORD page_tr;                //top right of page image
00180   BOX page_box;                  //bounding_box
00181   PDBLK_CLIST pd_blocks;         //copy of list
00182   BLOCK_IT block_it = blocks;    //iterator
00183   PDBLK_C_IT pd_it = &pd_blocks; //iterator
00184                                  //different orientations
00185   TO_BLOCK_LIST land_blocks, port_blocks;
00186   IMAGE thresh_image;            //thresholded
00187 
00188   lastdot = strrchr (name.string (), '.');
00189   if (lastdot != NULL)
00190     *lastdot = '\0';             //chop the extension off name
00191   if (page_image.get_bpp () == 0) { //presumably: no image loaded yet - TODO confirm
00192     name += tessedit_image_ext;  //build the image file name
00193     if (page_image.read_header (name.string ()))
00194       CANTOPENFILE.error ("edges_and_textord", EXIT, name.string ());
00195     if (page_image.read (0))
00196       READFAILED.error ("edges_and_textord", EXIT, name.string ());
00197     name = filename;             //back to the extension-less name
00198     lastdot = strrchr (name.string (), '.');
00199     if (lastdot != NULL)
00200       *lastdot = '\0';
00201   }
00202   page_tr = ICOORD (page_image.get_xsize (), page_image.get_ysize ());
00203   read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (),
00204     blocks);
00205   block_it.set_to_list (blocks);
00206   if (global_monitor != NULL)
00207     global_monitor->ocr_alive = TRUE;
00208 
00209   if (page_image.get_bpp () > 1) {
00210     set_global_loc_code(LOC_ADAPTIVE);
00211     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00212     block_it.forward ()) {
00213       block = block_it.data ();
00214       pd_it.add_after_then_move (block); //collect block pointers in pd_blocks
00215     }
00216     //  adaptive_threshold(&page_image,&pd_blocks,&thresh_image);
00217     set_global_loc_code(LOC_EDGE_PROG);
00218 #ifndef EMBEDDED
00219     previous_cpu = clock ();
00220 #endif
00221     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00222     block_it.forward ()) {
00223       block = block_it.data ();
00224       if (!polygon_tess_approximation)
00225         invert_image(&page_image); //NOTE(review): inside the loop, so runs once per block - confirm intended
00226 #ifndef GRAPHICS_DISABLED
00227       extract_edges(NO_WINDOW, &page_image, &thresh_image, page_tr, block);
00228 #else
00229       extract_edges(&page_image, &thresh_image, page_tr, block);
00230 #endif
00231       page_box += block->bounding_box (); //grow page box to cover block
00232     }
00233     page_image = thresh_image;   //everyone else gets it
00234   }
00235   else {
00236     set_global_loc_code(LOC_EDGE_PROG);
00237     if (!page_image.white_high ())
00238       invert_image(&page_image); //done once, before the block loop
00239 
00240 #ifndef EMBEDDED
00241     previous_cpu = clock ();
00242 #endif
00243 
00244     for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00245     block_it.forward ()) {
00246       block = block_it.data ();
00247 #ifndef GRAPHICS_DISABLED
00248       extract_edges(NO_WINDOW, &page_image, &page_image, page_tr, block);
00249 #else
00250       extract_edges(&page_image, &page_image, page_tr, block);
00251 #endif
00252       page_box += block->bounding_box (); //grow page box to cover block
00253     }
00254   }
00255   if (global_monitor != NULL) {
00256     global_monitor->ocr_alive = TRUE;
00257     global_monitor->progress = 10;
00258   }
00259 
00260   assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks);
00261   if (global_monitor != NULL)
00262     global_monitor->ocr_alive = TRUE;
00263   filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape);
00264 #ifndef EMBEDDED
00265   previous_cpu = clock ();
00266 #endif
00267   filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape);
00268   if (global_monitor != NULL)
00269     global_monitor->ocr_alive = TRUE;
00270   textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks);
00271 }//edges_and_textord

void filter_blobs ( ICOORD  page_tr,
TO_BLOCK_LIST *  blocks,
BOOL8  testing_on 
)

Split into groups.

Parameters:
page_tr Top right
blocks Output list
testing_on For plotting
Sort the blobs into sizes in all the blocks, computing each line's spacing as follows:
                   (textord_merge_desc + textord_merge_x + 2*textord_merge_asc)
block->line_size * ------------------------------------------------------------
                                      textord_merge_x

Definition at line 338 of file tordmain.cpp.

References BLUE, BROWN, CORAL, create_to_win(), DARK_GREEN, filter_noise_blobs(), GOLDENROD, NO_WINDOW, plot_blob_list(), plot_box_list(), to_win, WHITE, and YELLOW.

Referenced by edges_and_textord(), and read_and_textord().

00342                    {
00343   TO_BLOCK_IT block_it = blocks; //iterator over the blocks
00344   TO_BLOCK *block;               //current block
00345 
00346   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
00347   block_it.forward ()) {
00348     block = block_it.data ();
00349     block->line_size = filter_noise_blobs (&block->blobs,
00350       &block->noise_blobs,
00351       &block->small_blobs,
00352       &block->large_blobs);
00353     block->line_spacing =        //line_size * (desc + x + 2 * asc) / x
00354       block->line_size * (textord_merge_desc + textord_merge_x +
00355       textord_merge_asc +
00356       textord_merge_asc) / textord_merge_x;
00357     block->line_size *= textord_min_linesize; //scaled after line_spacing was computed
00358     block->max_blob_size = block->line_size * textord_excess_blobsize;
00359 #ifndef GRAPHICS_DISABLED
00360     if (textord_show_blobs && testing_on) {
00361       if (to_win == NO_WINDOW)
00362         create_to_win(page_tr);  //create the window on first use
00363       plot_blob_list (to_win, &block->noise_blobs, CORAL, BLUE);
00364       plot_blob_list (to_win, &block->small_blobs, GOLDENROD, YELLOW);
00365       plot_blob_list (to_win, &block->large_blobs, DARK_GREEN, YELLOW);
00366       plot_blob_list (to_win, &block->blobs, WHITE, BROWN);
00367     }
00368     if (textord_show_boxes && testing_on) {
00369       if (to_win == NO_WINDOW)
00370         create_to_win(page_tr);  //create the window on first use
00371       plot_box_list (to_win, &block->noise_blobs, WHITE);
00372       plot_box_list (to_win, &block->small_blobs, WHITE);
00373       plot_box_list (to_win, &block->large_blobs, WHITE);
00374       plot_box_list (to_win, &block->blobs, WHITE);
00375     }
00376 #endif
00377   }
00378 }

float filter_noise_blobs ( BLOBNBOX_LIST *  src_list,
BLOBNBOX_LIST *  noise_list,
BLOBNBOX_LIST *  small_list,
BLOBNBOX_LIST *  large_list 
)

Separate noise.

Parameters:
src_list Original list
noise_list Noise list
small_list Small blobs
large_list Large blobs
Returns:
Max size found
Note:
Globals:
  • textord_new_initial_xheight
  • textord_blob_size_smallile
  • textord_blob_size_bigile
  • textord_width_limit
  • textord_max_noise_size
  • textord_initialx_ile
  • MAX_NEAREST_DIST
Move small blobs to a separate list.

textord_new_initial_xheight is TRUE by default, so unless that variable is changed the processing is delegated to filter_noise_blobs2() and the rest of this function is not executed.

Definition at line 404 of file tordmain.cpp.

References STATS::add(), filter_noise_blobs2(), and STATS::ile().

Referenced by filter_blobs().

00409                           {
00410   INT16 height;                  //height of blob
00411   INT16 width;                   //of blob
00412   BLOBNBOX_IT src_it = src_list; //iterators
00413   BLOBNBOX_IT noise_it = noise_list;
00414   BLOBNBOX_IT small_it = small_list;
00415   BLOBNBOX_IT large_it = large_list;
00416   STATS size_stats (0, MAX_NEAREST_DIST);
00417   //blob heights
00418   if (textord_new_initial_xheight) { //delegate to the newer implementation
00419     return filter_noise_blobs2 (src_list, noise_list, small_list, large_list);
00420   }
00421   float min_y;                   //size limits
00422   float max_y;
00423   float max_x;
00424 
00425   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { //pass 1: pull out noise
00426     if (src_it.data ()->bounding_box ().height () < textord_max_noise_size) {
00427       noise_it.add_after_then_move (src_it.extract ());
00428     }
00429   }
00430   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { //pass 2: histogram of remaining heights
00431     size_stats.add (src_it.data ()->bounding_box ().height (), 1);
00432   }
00433   min_y = floor (size_stats.ile (textord_blob_size_smallile / 100.0));
00434   max_y = ceil (size_stats.ile (textord_blob_size_bigile / 100.0));
00435   max_x = ceil (size_stats.ile (0.5) * textord_width_limit); //0.5 ile of heights times width limit
00436   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { //pass 3: split off small and large
00437     height = src_it.data ()->bounding_box ().height ();
00438     width = src_it.data ()->bounding_box ().width ();
00439     if (height < min_y) {
00440       small_it.add_after_then_move (src_it.extract ());
00441    }
00442     else if (height > max_y || width > max_x) {
00443       large_it.add_after_then_move (src_it.extract ());
00444    }
00445   }
00446   return size_stats.ile (textord_initialx_ile); //from the pass-2 histogram
00447 }

float filter_noise_blobs2 ( BLOBNBOX_LIST *  src_list,
BLOBNBOX_LIST *  noise_list,
BLOBNBOX_LIST *  small_list,
BLOBNBOX_LIST *  large_list 
)

Separate noise.

Parameters:
src_list Original list
noise_list Noise list
small_list Small blobs
large_list Large blobs
Returns:
Max size found
Move small blobs to a separate list. The dimensions of 'normal' blobs are computed and the outliers are sieved out.

Definition at line 498 of file tordmain.cpp.

References STATS::add(), STATS::clear(), and STATS::ile().

Referenced by filter_noise_blobs().

00503                            {
00504   INT16 height;                  //height of blob
00505   INT16 width;                   //of blob
00506   BLOBNBOX *blob;                //current blob
00507   float initial_x;               //first guess
00508   BLOBNBOX_IT src_it = src_list; //iterators
00509   BLOBNBOX_IT noise_it = noise_list;
00510   BLOBNBOX_IT small_it = small_list;
00511   BLOBNBOX_IT large_it = large_list;
00512   STATS size_stats (0, MAX_NEAREST_DIST);
00513   //blob heights
00514   float min_y;                   //size limits
00515   float max_y;
00516   float max_x;
00517   float max_height;              //of good blobs
00518 
00519   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00520     blob = src_it.data ();
00521     if (blob->bounding_box ().height () < textord_max_noise_size)
00522       noise_it.add_after_then_move (src_it.extract ()); //too short: noise
00523     else if (blob->enclosed_area () >= blob->bounding_box ().height ()
00524       * blob->bounding_box ().width () * textord_noise_area_ratio)
00525       small_it.add_after_then_move (src_it.extract ()); //area ratio test: parked on small list
00526   }
00527   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { //histogram of remaining heights
00528     size_stats.add (src_it.data ()->bounding_box ().height (), 1);
00529   }
00530   initial_x = size_stats.ile (textord_initialx_ile);
00531   max_y =                        //initial_x * (desc + x + 2 * asc) / x
00532     ceil (initial_x *
00533     (textord_merge_desc + textord_merge_x +
00534     2 * textord_merge_asc) / textord_merge_x);
00535   min_y = floor (initial_x / 2);
00536   max_x = ceil (initial_x * textord_width_limit);
00537   small_it.move_to_first ();
00538   for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
00539   small_it.forward ()) {         //revisit blobs parked on the small list
00540     height = small_it.data ()->bounding_box ().height ();
00541     if (height >= min_y)
00542       large_it.add_after_then_move (small_it.extract ()); //not small after all: move to large
00543   }
00544   size_stats.clear ();           //restart histogram for in-range blobs only
00545   for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00546     height = src_it.data ()->bounding_box ().height ();
00547     width = src_it.data ()->bounding_box ().width ();
00548     if (height < min_y) {
00549       small_it.add_after_then_move (src_it.extract ());
00550    }
00551     else if (height > max_y || width > max_x) {
00552       large_it.add_after_then_move (src_it.extract ());
00553    }
00554     else {
00555       size_stats.add (height, 1); //blob stays in src_list
00556    }
00557   }
00558   max_height = size_stats.ile (textord_initialasc_ile);
00559   //      printf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
00560   //              max_y,min_y,initial_x,max_height);
00561   max_height *= textord_merge_x / (textord_merge_x + textord_merge_asc); //scale by x / (x + asc)
00562   if (max_height > initial_x)
00563     initial_x = max_height;      //return the larger of the two estimates
00564   //      printf(" ret=%g\n",initial_x);
00565   return initial_x;
00566 }

void read_and_textord ( const char *  filename,
BLOCK_LIST *  blocks 
)

Read a .pb file.

Parameters:
filename Filename of .pb (source image)
blocks Pointer to LIST of blocks found (modified)
Returns:
none
Read a file of blocks and blobs and textord them.

See edges_and_textord() for main 'usual' entry-point into textord... THIS seems to be for testing/development only.

Definition at line 126 of file tordmain.cpp.

References assign_blobs_to_blocks2(), CANTOPENFILE, ERRCODE::error(), EXIT, filter_blobs(), NULL, textord_page(), and BOX::topright().

Referenced by pgeditor_read_file().

00128                                           {
00129   int c;                         //input character
00130   FILE *infp;                    //input file
00131   BLOCK *block;                  //current block
00132   BOX page_box;                  //bounding_box
00133   BLOCK_IT block_it = blocks;    //iterator
00134                                  //different orientations
00135   TO_BLOCK_LIST land_blocks, port_blocks;
00136 
00137   infp = fopen (filename, "r");
00138   if (infp == NULL)
00139     CANTOPENFILE.error ("read_and_textord", EXIT, filename);
00140 
00141   while (((c = fgetc (infp)) != EOF) && (ungetc (c, infp) != EOF)) { //peek one char to detect end of file
00142                                  //get one
00143     block = BLOCK::de_serialise (infp);
00144                                  //add to list
00145     block_it.add_after_then_move (block);
00146                                  //find page size
00147     page_box += block->bounding_box ();
00148   }
00149   fclose(infp);
00150 
00151   assign_blobs_to_blocks2(blocks, &land_blocks, &port_blocks);
00152   filter_blobs (page_box.topright (), &port_blocks, !textord_test_landscape); //portrait first here
00153   filter_blobs (page_box.topright (), &land_blocks, textord_test_landscape);
00154   textord_page (page_box.topright (), blocks, &land_blocks, &port_blocks);
00155 }

void textord_page ( ICOORD  page_tr,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  land_blocks,
TO_BLOCK_LIST *  port_blocks 
)

Make rows & words.

Parameters:
page_tr Top right
blocks Block list
land_blocks Rotated for landscape
port_blocks Output list
Note:
Globals:
  • interactive_mode = 1 if interactive
  • textord_exit_after = if 1, exit() at the end of textord_page() unless running in interactive mode
Textord the list of blobs and return a list of proper blocks.

Definition at line 582 of file tordmain.cpp.

References cleanup_blocks(), close_to_win(), global_monitor, LOC_TEXT_ORD_ROWS, LOC_TEXT_ORD_WORDS, make_rows(), make_words(), NULL, ETEXT_DESC::ocr_alive, ETEXT_DESC::progress, set_global_loc_code(), and TRUE.

Referenced by edges_and_textord(), and read_and_textord().

00587                    {
00588   float gradient;                //global skew
00589 
00590   set_global_loc_code(LOC_TEXT_ORD_ROWS);
00591   gradient = make_rows (page_tr, blocks, land_blocks, port_blocks);
00592   if (global_monitor != NULL) {
00593     global_monitor->ocr_alive = TRUE;
00594     global_monitor->progress = 20; //rows done
00595   }
00596   set_global_loc_code(LOC_TEXT_ORD_WORDS);
00597   make_words(page_tr, gradient, blocks, land_blocks, port_blocks);
00598   if (global_monitor != NULL) {
00599     global_monitor->ocr_alive = TRUE;
00600     global_monitor->progress = 30; //words done
00601   }
00602   cleanup_blocks(blocks);  //remove empties
00603 #ifndef GRAPHICS_DISABLED
00604   close_to_win();
00605 #endif
00606   if (textord_exit_after && !interactive_mode)
00607     exit (0);                    //quits the process, but only when not interactive
00608 }

void tweak_row_baseline ( ROW *  row  ) 

Tweak the baseline of a row.

Shift baseline to fit the blobs more accurately where they are close enough.

Definition at line 886 of file tordmain.cpp.

References QUAD_COEFFS::a, alloc_mem(), QUAD_COEFFS::b, ROW::base_line(), ROW::baseline, baseline, blob_count, BOX::bottom(), C_BLOB::bounding_box(), QUAD_COEFFS::c, WERD::cblob_list(), free_mem(), BOX::height(), BOX::left(), QSPLINE::quadratics, BOX::right(), QSPLINE::segments, ROW::word_list(), ROW::x_height(), and QSPLINE::xcoords.

Referenced by cleanup_blocks().

00888                          {  //rebuild row->baseline, flattening it under blobs that sit close to it
00889   BOX blob_box;                  //bounding box of current blob
00890   C_BLOB *blob;                  //current blob
00891   WERD *word;                    //current word
00892   INT32 blob_count;              //no of blobs in the row
00893   INT32 src_index;               //segment index in the existing baseline
00894   INT32 dest_index;              //segment index in the rebuilt baseline
00895   INT32 *xstarts;                //x boundaries of the rebuilt segments
00896   double *coeffs;                //a, b, c triples of the rebuilt segments
00897   float ydiff;                   //baseline error as a fraction of x_height
00898   float x_centre;                //horizontal centre of blob
00899                                  //words of row
00900   WERD_IT word_it = row->word_list ();
00901   C_BLOB_IT blob_it;             //blob iterator
00902 
00903   blob_count = 0;
00904   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00905     word = word_it.data ();      //current word
00906                                  //get total blobs
00907     blob_count += word->cblob_list ()->length ();
00908   }
00909   if (blob_count == 0)
00910     return;                      //no blobs, baseline left untouched
00911   xstarts =
00912     (INT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
00913     sizeof (INT32));             //blob_count + segments + 1 boundaries
00914   coeffs =
00915     (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
00916     sizeof (double));            //three coefficients per segment
00917 
00918   src_index = 0;
00919   dest_index = 0;
00920   xstarts[0] = row->baseline.xcoords[0];  //rebuilt spline starts at the old first x
00921   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00922     word = word_it.data ();      //current word
00923                                  //blobs in word
00924     blob_it.set_to_list (word->cblob_list ());
00925     for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
00926     blob_it.forward ()) {
00927       blob = blob_it.data ();
00928       blob_box = blob->bounding_box ();
00929       x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
00930       ydiff = blob_box.bottom () - row->base_line (x_centre);
00931       if (ydiff < 0)               //use the magnitude, scaled by x_height
00932         ydiff = -ydiff / row->x_height ();
00933       else
00934         ydiff = ydiff / row->x_height ();
00935       if (ydiff < textord_blshift_maxshift
00936         && blob_box.height () / row->x_height () >
00937       textord_blshift_xfraction) {  //bottom close to baseline and blob tall enough
00938         if (xstarts[dest_index] >= x_centre)
00939           xstarts[dest_index] = blob_box.left ();  //move segment start back to the blob's left
00940         coeffs[dest_index * 3] = 0;
00941         coeffs[dest_index * 3 + 1] = 0;
00942         coeffs[dest_index * 3 + 2] = blob_box.bottom ();  //constant segment at the blob bottom
00943         //shift it
00944         dest_index++;
00945         xstarts[dest_index] = blob_box.right () + 1;  //next segment starts just past the blob
00946       }
00947       else {
00948         if (xstarts[dest_index] <= x_centre) {  //current segment start does not lie beyond the centre
00949           while (row->baseline.xcoords[src_index + 1] <= x_centre
00950           && src_index < row->baseline.segments - 1) {  //step over old segments ending at or before the centre
00951             if (row->baseline.xcoords[src_index + 1] >
00952             xstarts[dest_index]) {  //copy only those ending beyond the current start
00953               coeffs[dest_index * 3] =
00954                 row->baseline.quadratics[src_index].a;
00955               coeffs[dest_index * 3 + 1] =
00956                 row->baseline.quadratics[src_index].b;
00957               coeffs[dest_index * 3 + 2] =
00958                 row->baseline.quadratics[src_index].c;
00959               dest_index++;
00960               xstarts[dest_index] =
00961                 row->baseline.xcoords[src_index + 1];
00962             }
00963             src_index++;
00964           }
00965           coeffs[dest_index * 3] =
00966             row->baseline.quadratics[src_index].a;  //copy the old segment the loop stopped on
00967           coeffs[dest_index * 3 + 1] =
00968             row->baseline.quadratics[src_index].b;
00969           coeffs[dest_index * 3 + 2] =
00970             row->baseline.quadratics[src_index].c;
00971           dest_index++;
00972           xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
00973         }
00974       }
00975     }
00976   }
00977   while (src_index < row->baseline.segments
00978     && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
00979     src_index++;                 //skip old segments ending at or before the current start
00980   while (src_index < row->baseline.segments) {  //append the remaining old segments
00981     coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
00982     coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
00983     coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
00984     dest_index++;
00985     src_index++;
00986     xstarts[dest_index] = row->baseline.xcoords[src_index];
00987   }
00988                                  //turn to spline
00989   row->baseline = QSPLINE (dest_index, xstarts, coeffs);
00990   free_mem(xstarts);             //release the alloc_mem work arrays
00991   free_mem(coeffs);
00992 }


Variable Documentation

const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"

Note:
File: tordmain.cpp (Formerly textordp.c)
C++ top level textord code.
Author:
Ray Smith
Date:
Jul 28 17:12:33 BST 1992
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 47 of file tordmain.cpp.


Generated on Wed Feb 28 19:49:26 2007 for Tesseract by  doxygen 1.5.1