textord/edgblob.cpp

Go to the documentation of this file.
00001 
00020 #include          "mfcpch.h"
00021 //#include        "dirtab.h"
00022 #include          "scanedg.h"
00023 #include          "drawedg.h"
00024 #include          "edgloop.h"
00025 #include          "edgblob.h"
00026 
00027 #ifdef TEXT_VERBOSE
00028 #include           "../cutil/callcpp.h"
00029 #endif
00030 
00031 #define EXTERN
00032 
00035 EXTERN INT_VAR (edges_children_per_grandchild, 10,
00036 "Importance ratio for chucking outlines");
00037 EXTERN INT_VAR (edges_children_count_limit, 45, "Max holes allowed in blob");
00038 EXTERN BOOL_VAR (edges_children_fix, FALSE,
00039 "Remove boxy parents of char-like children");
00040 EXTERN INT_VAR (edges_min_nonhole, 12,
00041 "Min pixels for potential char in box");
00042 EXTERN INT_VAR (edges_patharea_ratio, 40,
00043 "Max lensq/area for acceptable child outline");
00044 EXTERN double_VAR (edges_childarea, 0.5,
00045 "Max area fraction of child outline");
00046 EXTERN double_VAR (edges_boxarea, 0.8,
00047 "Min area fraction of grandchild for box");
00053 OL_BUCKETS::OL_BUCKETS (
00054 ICOORD bleft,                    //corners
00055 ICOORD tright):         bl (bleft), tr (tright) {
00056 
00057   bxdim = (tright.x () - bleft.x ()) / BUCKETSIZE + 1;
00058   bydim = (tright.y () - bleft.y ()) / BUCKETSIZE + 1;
00059                                  //make array
00060   buckets = new C_OUTLINE_LIST[bxdim * bydim];
00061   index = 0;
00062 }
00063 
00064 
00073 C_OUTLINE_LIST *
00074 OL_BUCKETS::operator () (        //array access
00075 INT16 x,                         //image coords
00076 INT16 y) {
00077   return &buckets[(y - bl.y ()) / BUCKETSIZE * bxdim +
00078     (x - bl.x ()) / BUCKETSIZE];
00079 }
00080 
00081 
00104 INT32 OL_BUCKETS::count_children(                     //recursive count
00105                                  C_OUTLINE *outline,  //parent outline
00106                                  INT32 max_count      //max output
00107                                 ) {
00108   BOOL8 parent_box;              //could it be boxy
00109   INT16 xmin, xmax;              //coord limits
00110   INT16 ymin, ymax;
00111   INT16 xindex, yindex;          //current bucket
00112   C_OUTLINE *child;              //current child
00113   INT32 child_count;             //no of children
00114   INT32 grandchild_count;        //no of grandchildren
00115   INT32 parent_area;             //potential box
00116   FLOAT32 max_parent_area;       //potential box
00117   INT32 child_area;              //current child
00118   INT32 child_length;            //current child
00119   BOX olbox;
00120   C_OUTLINE_IT child_it;         //search iterator
00121 
00122   olbox = outline->bounding_box ();
00123   xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
00124   xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
00125   ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
00126   ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
00127 
00128   child_count = 0;
00129   grandchild_count = 0;
00130   parent_area = 0;
00131   max_parent_area = 0;
00132   parent_box = TRUE;
00133 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00134   cprintf("\n1(bb=%d,%d,%d,%d)",xmin,ymin,xmax,ymax); // , see ccmain/tessvars.h
00135 #endif
00136 
00137   for (yindex = ymin; yindex <= ymax; yindex++) {
00138     for (xindex = xmin; xindex <= xmax; xindex++) {
00139       child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
00140       if (child_it.empty ())
00141         continue;
00142       for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
00143       child_it.forward ()) {
00144         child = child_it.data ();
00145         if (child != outline && *child < *outline) {
00146           child_count++;
00147           if (child_count <= max_count)
00148             grandchild_count += count_children (child,
00149               (max_count -
00150               child_count) /
00151               edges_children_per_grandchild)
00152               * edges_children_per_grandchild;
00153           if (child_count + grandchild_count > max_count) {
00154             /* err.log(RESULT_OKAY,E_LOC,ERR_OCR,
00155                  ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
00156                  "Discarding parent with child count=%d, gc=%d",
00157                  child_count,grandchild_count);*/
00158 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00159   cprintf("2(%d,%d)",child_count,grandchild_count); // , see ccmain/tessvars.h
00160 #endif
00161             return child_count + grandchild_count;
00162           }
00163           if (parent_area == 0) {
00164             parent_area = outline->outer_area ();
00165             if (parent_area < 0)
00166               parent_area = -parent_area;
00167             max_parent_area = outline->bounding_box ().width ()
00168               * outline->bounding_box ().height () * edges_boxarea;
00169             if (parent_area < max_parent_area)
00170               parent_box = FALSE;
00171           }
00172           if (parent_box
00173             && (!edges_children_fix
00174             || child->bounding_box ().height () >
00175           edges_min_nonhole) ) {
00176             child_area = child->outer_area ();
00177             if (child_area < 0)
00178               child_area = -child_area;
00179             if (edges_children_fix) {
00180               if (parent_area - child_area < max_parent_area) {
00181                 parent_box = FALSE;
00182                 continue;
00183               }
00184               if (grandchild_count > 0) {
00185 /*
00186    err.log(RESULT_OKAY,E_LOC,ERR_OCR,
00187       ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
00188       "Discarding parent of area %d, child area=%d, max%g with gc=%d",
00189       parent_area,child_area,max_parent_area,grandchild_count);
00190 */
00191 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00192   cprintf("5(%d)",max_count+1); // , see ccmain/tessvars.h
00193 #endif
00194                 return max_count + 1;
00195               }
00196               child_length = child->pathlength ();
00197               if (child_length * child_length >
00198               child_area * edges_patharea_ratio) {
00199 /*
00200    err.log(RESULT_OKAY,E_LOC,ERR_OCR,
00201       ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
00202       "Discarding parent of area %d, child area=%d, max%g with child length=%d",
00203       parent_area,child_area,max_parent_area,child_length);
00204 */
00205 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00206   cprintf("6(%d)",max_count+1); // , see ccmain/tessvars.h
00207 #endif
00208                 return max_count + 1;
00209               }
00210             }
00211             if (child_area < child->bounding_box ().width ()
00212               * child->bounding_box ().height () *
00213             edges_childarea) {
00214 /*
00215    err.log(RESULT_OKAY,E_LOC,ERR_OCR,
00216       ERR_SCROLLING,ERR_CONTINUE,ERR_DEBUG,
00217       "Discarding parent of area %d, child area=%d, max%g with child rect=%d",
00218       parent_area,child_area,max_parent_area,child->bounding_box().width()
00219       *child->bounding_box().height());
00220 */
00221 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00222   cprintf("7(%d)",max_count+1); // , see ccmain/tessvars.h
00223 #endif
00224               return max_count + 1;
00225             }
00226           }
00227         }
00228       }
00229     }
00230   }
00231 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSG)
00232   cprintf("8[%d,%d=%d|%d,%d,%d]",xindex,yindex,child_count,parent_area,grandchild_count,child_length);
00233 #endif
00234   return child_count + grandchild_count;
00235 }
00236 
00237 
00250 void OL_BUCKETS::extract_children(                     //recursive count
00251                                   C_OUTLINE *outline,  //parent outline
00252                                   C_OUTLINE_IT *it     //destination iterator
00253                                  ) {
00254   INT16 xmin, xmax;              //coord limits
00255   INT16 ymin, ymax;
00256   INT16 xindex, yindex;          //current bucket
00257   BOX olbox;
00258   C_OUTLINE_IT child_it;         //search iterator
00259 
00260   olbox = outline->bounding_box ();
00261   xmin = (olbox.left () - bl.x ()) / BUCKETSIZE;
00262   xmax = (olbox.right () - bl.x ()) / BUCKETSIZE;
00263   ymin = (olbox.bottom () - bl.y ()) / BUCKETSIZE;
00264   ymax = (olbox.top () - bl.y ()) / BUCKETSIZE;
00265   for (yindex = ymin; yindex <= ymax; yindex++) {
00266     for (xindex = xmin; xindex <= xmax; xindex++) {
00267       child_it.set_to_list (&buckets[yindex * bxdim + xindex]);
00268       for (child_it.mark_cycle_pt (); !child_it.cycled_list ();
00269       child_it.forward ()) {
00270         if (*child_it.data () < *outline) {
00271           it->add_after_then_move (child_it.extract ());
00272         }
00273       }
00274     }
00275   }
00276 }
00277 
00278 
00289 void extract_edges(
00290 #ifndef GRAPHICS_DISABLED
00291                    WINDOW window,
00292 #endif
00293                    IMAGE *image,
00294                    IMAGE *t_image,
00295                    ICOORD page_tr,
00296                    BLOCK *block 
00297                   ) {
00298   ICOORD bleft;                  //block box
00299   ICOORD tright;
00300   C_OUTLINE_LIST outlines;       //outlines in block
00301                                  //iterator
00302   C_OUTLINE_IT out_it = &outlines;
00303 
00304 #ifndef GRAPHICS_DISABLED
00305   get_outlines (window, image, t_image, page_tr, (PDBLK *) block, &out_it);
00306 #else
00307   get_outlines (image, t_image, page_tr, (PDBLK *) block, &out_it);
00308 #endif
00309                                  //block box
00310   block->bounding_box (bleft, tright);
00311                                  //make blobs
00312   outlines_to_blobs(block, bleft, tright, &outlines); 
00313 
00314 }
00315 
00316 
00328 void outlines_to_blobs(               //find blobs
00329                        BLOCK *block,  //block to scan
00330                        ICOORD bleft,  //block box //outlines in block
00331                        ICOORD tright,
00332                        C_OUTLINE_LIST *outlines) {
00333                                  //make buckets
00334   OL_BUCKETS buckets(bleft, tright); 
00335 
00336   fill_buckets(outlines, &buckets);
00337   empty_buckets(block, &buckets);//Makes blobs WHILE emptying buckets
00338 }
00339 
00340 
00349 void fill_buckets(                           //find blobs
00350                   C_OUTLINE_LIST *outlines,  //outlines in block
00351                   OL_BUCKETS *buckets        //output buckets
00352                  ) {
00353   BOX ol_box;                    //outline box
00354   C_OUTLINE_IT out_it = outlines;//iterator
00355   C_OUTLINE_IT bucket_it;        //iterator in bucket
00356   C_OUTLINE *outline;            //current outline
00357 
00358   int t=0;
00359   for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) {
00360     outline = out_it.extract (); //take off list
00361                                  //get box
00362     ol_box = outline->bounding_box ();
00363     bucket_it.set_to_list ((*buckets) (ol_box.left (), ol_box.bottom ()));
00364     bucket_it.add_to_end (outline);
00365   }
00366 }
00367 
00368 
00377 void empty_buckets(                     //find blobs
00378                    BLOCK *block,        //block to scan
00379                    OL_BUCKETS *buckets  //output buckets
00380                   ) {
00381   BOOL8 good_blob;               //healthy blob
00382   C_OUTLINE_LIST outlines;       //outlines in block
00383                                  //iterator
00384   C_OUTLINE_IT out_it = &outlines;
00385   C_OUTLINE_IT bucket_it = buckets->start_scan ();
00386   C_OUTLINE_IT parent_it;        //parent outline
00387   C_BLOB *blob;                  //new blob
00388   C_BLOB_IT good_blobs = block->blob_list ();
00389   C_BLOB_IT junk_blobs = block->reject_blobs ();
00390 
00391 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00392   cprintf("1");
00393 #endif
00394   while (!bucket_it.empty ()) {
00395     out_it.set_to_list (&outlines);
00396     do {
00397       parent_it = bucket_it;     //find outermost
00398       do
00399          bucket_it.forward ();
00400       while (!bucket_it.at_first ()
00401         && !(*parent_it.data () < *bucket_it.data ()));
00402     }
00403     while (!bucket_it.at_first ());
00404 
00405                                  //move to new list
00406     out_it.add_after_then_move (parent_it.extract ());
00407     good_blob = capture_children (buckets, &junk_blobs, &out_it);
00408     blob = new C_BLOB (&outlines);
00409     if (good_blob) {
00410       good_blobs.add_after_then_move (blob);
00411 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00412   cprintf("2");
00413 #endif
00414     }
00415    else {
00416       junk_blobs.add_after_then_move (blob);
00417 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00418   cprintf("3");
00419 #endif
00420     }
00421 
00422     bucket_it.set_to_list (buckets->scan_next ());
00423   }
00424 }
00425 
00426 
00444 BOOL8 capture_children(                       //find children
00445                        OL_BUCKETS *buckets,   //bucket sort class
00446                        C_BLOB_IT *reject_it,  //dead grandchildren
00447                        C_OUTLINE_IT *blob_it  //output outlines
00448                       ) {
00449   BOOL8 anydone;                 //anything canned
00450   C_OUTLINE *outline;            //master outline
00451   C_OUTLINE *child;              //child under test
00452   C_OUTLINE_IT test_it;          //for grandchildren
00453   INT32 child_count;             //no of children
00454   C_BLOB *blob;                  //reject
00455   C_OUTLINE_LIST r_list;         //rejects
00456   C_OUTLINE_IT r_it;             //iterator
00457 
00458   outline = blob_it->data ();
00459   child_count = buckets->count_children (outline, edges_children_count_limit);
00460 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00461   cprintf("L");
00462 #endif
00463   if (child_count > edges_children_count_limit)
00464     return FALSE;
00465   if (child_count == 0)
00466     return TRUE;
00467                                  //get single level
00468   buckets->extract_children (outline, blob_it);
00469   if (child_count == 1) {
00470 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00471   cprintf("K");
00472 #endif
00473     return TRUE;
00474   }
00475   do {
00476     anydone = FALSE;
00477     blob_it->move_to_first ();
00478     for (blob_it->mark_cycle_pt (); !blob_it->cycled_list ();
00479     blob_it->forward ()) {
00480       child = blob_it->data ();
00481       if (child != outline) {
00482         for (test_it = *blob_it, test_it.mark_cycle_pt ();
00483         !test_it.cycled_list (); test_it.forward ()) {
00484           if (test_it.data () != child && *test_it.data () < *child) {
00485             r_it.set_to_list (&r_list);
00486             r_it.add_after_then_move (test_it.extract ());
00487                                  //turn to blob
00488             blob = new C_BLOB (&r_list);
00489             reject_it->add_after_then_move (blob);
00490             anydone = TRUE;
00491 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00492   cprintf("N");
00493 #endif
00494           }
00495         }
00496         if (anydone) {
00497 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00498   cprintf("M");
00499 #endif
00500           break;                 //got to restart
00501       }
00502 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00503   cprintf("L");
00504 #endif
00505       }
00506     }
00507   }
00508   while (anydone);               //got to restart
00509 #if defined(TEXT_VERBOSE) && defined(TV_FOCUSI)
00510   cprintf("O\n");
00511 #endif
00512   return TRUE;
00513 }//capture_children

Generated on Wed Feb 28 19:49:11 2007 for Tesseract by  doxygen 1.5.1