ccmain/applybox.h File Reference

#include "varable.h"
#include "ocrblock.h"
#include "ocrrow.h"
#include "notdll.h"

Go to the source code of this file.

Functions


Function Documentation

void apply_box_testing ( BLOCK_LIST *  block_list  ) 

Test for a TESS screw-up on a word.

Parameters:
block_list List of blocks
Returns:
none (modifies ??)
Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or zero length string.

Definition at line 789 of file applybox.cpp.

References ASSERT_HOST, WERD::blob_list(), count(), fix_quotes(), WERD::gblob_list(), make_bln_copy(), tess_default_matcher(), tess_segment_pass1(), WERD::text(), tprintf(), word_count, ROW::word_list(), and ROW::x_height().

00789                                                { 
00790   BLOCK_IT block_it(block_list); 
00791   ROW_IT row_it;
00792   ROW *row;
00793   INT16 row_count = 0;
00794   WERD_IT word_it;
00795   WERD *word;
00796   WERD *bln_word;
00797   INT16 word_count = 0;
00798   PBLOB_IT blob_it;
00799   DENORM denorm;
00800   INT16 count = 0;
00801   char ch[2];
00802   WERD *outword;                 //bln best choice
00803   //segmentation
00804   WERD_CHOICE *best_choice;      //tess output
00805   WERD_CHOICE *raw_choice;       //top choice permuter
00806                                  //detailed results
00807   BLOB_CHOICE_LIST_CLIST blob_choices;
00808   INT16 char_count = 0;
00809   INT16 correct_count = 0;
00810   INT16 err_count = 0;
00811   INT16 rej_count = 0;
00812   #ifndef SECURE_NAMES
00813   WERDSTATS wordstats;           //As from newdiff
00814   #endif
00815   char tess_rej_str[3];
00816   char tess_long_str[3];
00817 
00818   ch[1] = '\0';
00819   strcpy (tess_rej_str, "|A");
00820   strcpy (tess_long_str, "|B");
00821 
00822   for (block_it.mark_cycle_pt ();
00823   !block_it.cycled_list (); block_it.forward ()) {
00824     row_it.set_to_list (block_it.data ()->row_list ());
00825     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00826       row = row_it.data ();
00827       row_count++;
00828       word_count = 0;
00829       word_it.set_to_list (row->word_list ());
00830       for (word_it.mark_cycle_pt ();
00831       !word_it.cycled_list (); word_it.forward ()) {
00832         word = word_it.data ();
00833         word_count++;
00834         if ((strlen (word->text ()) == 1) &&
00835           !STRING (applybox_test_exclusions).contains (*word->text ())
00836         && (word->gblob_list ()->length () == 1)) {
00837           /* Here is a word with a single char label and a single blob so test it */
00838           bln_word =
00839             make_bln_copy (word, row, row->x_height (), &denorm);
00840           blob_it.set_to_list (bln_word->blob_list ());
00841           ch[0] = *word->text ();
00842           char_count++;
00843           best_choice = tess_segment_pass1 (bln_word,
00844             &denorm,
00845             tess_default_matcher,
00846             raw_choice,
00847             &blob_choices, outword);
00848 
00849           /* Test for TESS screw up on word. */
00850           if ((best_choice->string ().length () == 0) ||
00851             (strspn (best_choice->string ().string (), " ") ==
00852           best_choice->string ().length ())) {
00853             rej_count++;
00854             tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
00855               row_count, word_count, ch);
00856             #ifndef SECURE_NAMES
00857             wordstats.word (tess_rej_str, 2, ch, 1);
00858             #endif
00859           }
00860           else {
00861             if ((best_choice->string ().length () !=
00862               outword->blob_list ()->length ()) ||
00863               (best_choice->string ().length () !=
00864             blob_choices.length ())) {
00865               tprintf
00866                 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00867                 best_choice->string ().string (),
00868                 best_choice->string ().length (),
00869                 outword->blob_list ()->length (),
00870                 blob_choices.length ());
00871             }
00872             ASSERT_HOST (best_choice->string ().length () ==
00873               outword->blob_list ()->length ());
00874             ASSERT_HOST (best_choice->string ().length () ==
00875               blob_choices.length ());
00876             fix_quotes ((char *) best_choice->string ().string (),
00877                                  //turn to double
00878               outword, &blob_choices);
00879             if (strcmp (best_choice->string ().string (), ch) != 0) {
00880               err_count++;
00881               tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
00882                 row_count, word_count, ch,
00883                 best_choice->string ().string ());
00884             }
00885             else
00886               correct_count++;
00887             #ifndef SECURE_NAMES
00888             if (best_choice->string ().length () > 2)
00889               wordstats.word (tess_long_str, 2, ch, 1);
00890             else
00891               wordstats.word ((char *) best_choice->string ().
00892                 string (),
00893                 best_choice->string ().length (), ch,
00894                 1);
00895             #endif
00896           }
00897           delete bln_word;
00898           delete outword;
00899           delete best_choice;
00900           delete raw_choice;
00901           blob_choices.deep_clear ();
00902           count++;
00903         }
00904       }
00905     }
00906   }
00907   #ifndef SECURE_NAMES
00908   wordstats.print (1, 100.0);
00909   wordstats.conf_matrix ();
00910   tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
00911     char_count, correct_count, rej_count, err_count);
00912   #endif
00913 }

void apply_box_training ( BLOCK_LIST *  block_list  ) 

Generates training data.

Parameters:
block_list List of blocks
Returns:
none (modifies ??)

Definition at line 727 of file applybox.cpp.

References WERD::baseline_denormalise(), WERD::blob_list(), count(), WERD::gblob_list(), make_bln_copy(), NULL, tess_training_tester(), WERD::text(), tprintf(), TRUE, ROW::word_list(), and ROW::x_height().

Referenced by TessBaseAPI::Recognize().

00727                                                 { 
00728   BLOCK_IT block_it(block_list); 
00729   ROW_IT row_it;
00730   ROW *row;
00731   WERD_IT word_it;
00732   WERD *word;
00733   WERD *bln_word;
00734   WERD copy_outword;             // copy to denorm
00735   PBLOB_IT blob_it;
00736   DENORM denorm;
00737   INT16 count = 0;
00738   char ch[2];
00739 
00740   ch[1] = '\0';
00741 
00742   tprintf ("Generating training data\n");
00743   for (block_it.mark_cycle_pt ();
00744   !block_it.cycled_list (); block_it.forward ()) {
00745     row_it.set_to_list (block_it.data ()->row_list ());
00746     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00747       row = row_it.data ();
00748       word_it.set_to_list (row->word_list ());
00749       for (word_it.mark_cycle_pt ();
00750       !word_it.cycled_list (); word_it.forward ()) {
00751         word = word_it.data ();
00752         if ((strlen (word->text ()) == 1) &&
00753         (word->gblob_list ()->length () == 1)) {
00754           /* Here is a word with a single char label and a single blob so train on it */
00755           bln_word =
00756             make_bln_copy (word, row, row->x_height (), &denorm);
00757           blob_it.set_to_list (bln_word->blob_list ());
00758           ch[0] = *word->text ();
00759           tess_training_tester (blob_it.data (),
00760                                  //single blob
00761             &denorm, TRUE,       //correct
00762             ch,                  //correct ASCII char
00763             1,                   //ASCII length
00764             NULL);
00765           copy_outword = *(bln_word);
00766           copy_outword.baseline_denormalise (&denorm);
00767           blob_it.set_to_list (copy_outword.blob_list ());
00768           ch[0] = *word->text ();
00769           delete bln_word;
00770           count++;
00771         }
00772       }
00773     }
00774   }
00775   tprintf ("Generated training data for %d blobs\n", count);
00776 }

void apply_boxes ( BLOCK_LIST *  block_list  ) 

Reassigns outlines to form words each with ONE labelled blob.

Parameters:
block_list Real blocks
Returns:
none
Noise is left in UNLABELLED words. The chars on the page are checked crudely for sensible position relative to baseline and xht. Failed boxes are compensated for by duplicating other believable instances of the character.

The box file is assumed to contain box definitions, one per line, of the following format:

  <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused

The approach taken is to search the WHOLE page for stuff overlapping each box.

A box should only overlap one row.

A warning is given if the box is on the same row as the previous box, but NOT on the same row as the previous blob.

Any OUTLINE which overlaps the box is put into the new word.

Ascender chars must ascend above xht significantly; xht chars must not rise above row xht significantly; bl chars must not descend below baseline significantly; descender chars must descend below baseline significantly.

?? Certain chars are DROPPED - to limit the training data.

Definition at line 100 of file applybox.cpp.

References CANTOPENFILE, clear_any_old_text(), ERRCODE::error(), EXIT, find_row_of_box(), imagefile, BOX::left(), MAX_INT16, NULL, read_next_box(), report_failed_box(), resegment_box(), BOX::right(), STRING::string(), tidy_up(), and tprintf().

Referenced by TessBaseAPI::Recognize().

00101                   {
00102   INT16 boxfile_lineno = 0;
00103   INT16 boxfile_charno = 0;
00104   BOX box;                       //boxfile box
00105   char ch[2];                    //correct ch from boxfile
00106   ROW *row;
00107   ROW *prev_row = NULL;
00108   INT16 prev_box_right = MAX_INT16;
00109   INT16 block_id;
00110   INT16 row_id;
00111   INT16 box_count = 0;
00112   INT16 box_failures = 0;
00113   INT16 labels_ok;
00114   INT16 rows_ok;
00115   INT16 bad_blobs;
00116   INT16 tgt_char_counts[128];    //No. of box samples
00117   // INT16  labelled_char_counts[128];    //No. of unique labelled samples
00118   INT16 i;
00119   INT16 rebalance_count = 0;
00120   char min_char;
00121   INT16 min_samples;
00122   INT16 final_labelled_blob_count;
00123 
00124   for (i = 0; i < 128; i++)
00125     tgt_char_counts[i] = 0;
00126 
00127   FILE* box_file;
00128   STRING filename = imagefile;
00129   filename += ".box";
00130   if (!(box_file = fopen (filename.string(), "r"))) {
00131     CANTOPENFILE.error ("read_next_box", EXIT,
00132       "Cant open box file %s %d",
00133       filename.string(), errno);
00134   }
00135 
00136   ch[1] = '\0';
00137   clear_any_old_text(block_list);
00138   while (read_next_box (box_file, &box, &ch[0])) {
00139     box_count++;
00140     tgt_char_counts[ch[0]]++;
00141     row = find_row_of_box (block_list, box, block_id, row_id);
00142     if (box.left () < prev_box_right) {
00143       boxfile_lineno++;
00144       boxfile_charno = 1;
00145     }
00146     else
00147       boxfile_charno++;
00148 
00149     if (row == NULL) {
00150       box_failures++;
00151       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00152         "FAILURE! box overlaps no blobs or blobs in multiple rows");
00153     }
00154     else {
00155       if ((box.left () >= prev_box_right) && (row != prev_row))
00156         report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00157           "WARNING! false row break");
00158       box_failures += resegment_box (row, box, ch, block_id, row_id,
00159         boxfile_lineno, boxfile_charno);
00160       prev_row = row;
00161     }
00162     prev_box_right = box.right ();
00163   }
00164   tidy_up(block_list,
00165           labels_ok,
00166           rows_ok,
00167           bad_blobs,
00168           tgt_char_counts,
00169           rebalance_count,
00170           min_char,
00171           min_samples,
00172           final_labelled_blob_count);
00173   tprintf ("APPLY_BOXES:\n");
00174   tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
00175   tprintf ("   Initially labelled blobs: %6d in %d rows\n",
00176     labels_ok, rows_ok);
00177   tprintf ("   Box failures detected:     %6d\n", box_failures);
00178   tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
00179   tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
00180   tprintf ("            Total unlabelled words:   %6d\n",
00181     bad_blobs);
00182   tprintf ("            Final labelled words:     %6d\n",
00183     final_labelled_blob_count);
00184 }

void clear_any_old_text ( BLOCK_LIST *  block_list  ) 

Remove correct text

Definition at line 190 of file applybox.cpp.

Referenced by apply_boxes().

00192                          {
00193   BLOCK_IT block_it(block_list);
00194   ROW_IT row_it;
00195   WERD_IT word_it;
00196 
00197   for (block_it.mark_cycle_pt ();
00198   !block_it.cycled_list (); block_it.forward ()) {
00199     row_it.set_to_list (block_it.data ()->row_list ());
00200     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00201       word_it.set_to_list (row_it.data ()->word_list ());
00202       for (word_it.mark_cycle_pt ();
00203       !word_it.cycled_list (); word_it.forward ()) {
00204         word_it.data ()->set_text ("");
00205       }
00206     }
00207   }
00208 }

ROW* find_row_of_box ( BLOCK_LIST *  block_list,
BOX  box,
INT16 &  block_id,
INT16 &  row_id_to_process 
)

Find row to process.

Error if box REALLY overlaps more than one row. That is, if it overlaps blobs in the row, not just overlaps the bounding box of the whole row.

Definition at line 263 of file applybox.cpp.

References WERD::bounding_box(), ROW::bounding_box(), WERD::flag(), gblob_bounding_box(), WERD::gblob_list(), gblob_out_list(), goutline_bounding_box(), NULL, outline_it, BOX::overlap(), W_POLYGON, and ROW::word_list().

Referenced by apply_boxes().

00267                                                {
00268   BLOCK_IT block_it(block_list);
00269   BLOCK *block;
00270   ROW_IT row_it;
00271   ROW *row;
00272   ROW *row_to_process = NULL;
00273   INT16 row_id;
00274   WERD_IT word_it;
00275   WERD *word;
00276   BOOL8 polyg;
00277   PBLOB_IT blob_it;
00278   PBLOB *blob;
00279   OUTLINE_IT outline_it;
00280   OUTLINE *outline;
00281 
00282   /*
00283     Find row to process - error if box REALLY overlaps more than one row. (I.e
00284     it overlaps blobs in the row - not just overlaps the bounding box of the
00285     whole row.)
00286   */
00287 
00288   block_id = 0;
00289   for (block_it.mark_cycle_pt ();
00290   !block_it.cycled_list (); block_it.forward ()) {
00291     block_id++;
00292     row_id = 0;
00293     block = block_it.data ();
00294     if (block->bounding_box ().overlap (box)) {
00295       row_it.set_to_list (block->row_list ());
00296       for (row_it.mark_cycle_pt ();
00297       !row_it.cycled_list (); row_it.forward ()) {
00298         row_id++;
00299         row = row_it.data ();
00300         if (row->bounding_box ().overlap (box)) {
00301           word_it.set_to_list (row->word_list ());
00302           for (word_it.mark_cycle_pt ();
00303           !word_it.cycled_list (); word_it.forward ()) {
00304             word = word_it.data ();
00305             polyg = word->flag (W_POLYGON);
00306             if (word->bounding_box ().overlap (box)) {
00307               blob_it.set_to_list (word->gblob_list ());
00308               for (blob_it.mark_cycle_pt ();
00309               !blob_it.cycled_list (); blob_it.forward ()) {
00310                 blob = blob_it.data ();
00311                 if (gblob_bounding_box (blob, polyg).
00312                 overlap (box)) {
00313                   outline_it.
00314                     set_to_list (gblob_out_list
00315                     (blob, polyg));
00316                   for (outline_it.mark_cycle_pt ();
00317                     !outline_it.cycled_list ();
00318                   outline_it.forward ()) {
00319                     outline = outline_it.data ();
00320                     if (goutline_bounding_box
00321                     (outline, polyg).major_overlap (box)) {
00322                       if ((row_to_process == NULL) ||
00323                       (row_to_process == row)) {
00324                         row_to_process = row;
00325                         row_id_to_process = row_id;
00326                       }
00327                       else
00328                         /* RETURN ERROR Box overlaps blobs in more than one row  */
00329                         return NULL;
00330                     }
00331                   }
00332                 }
00333               }
00334             }
00335           }
00336         }
00337       }
00338     }
00339   }
00340   return row_to_process;
00341 }

BOOL8 read_next_box ( FILE *  box_file,
BOX *  box,
char *  ch 
)

Read another box from file <outputbase>.box

The box file is assumed to contain box definitions, one per line, of the following format:

<Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused

Definition at line 220 of file applybox.cpp.

References count(), FALSE, INT32FORMAT, tprintf(), and TRUE.

Referenced by apply_boxes().

00222                               {
00223   char buff[256];                //boxfile read buffer
00224   char *buffptr = buff;
00225   STRING box_filename;
00226   static INT16 line = 0;
00227   INT32 x_min;
00228   INT32 y_min;
00229   INT32 x_max;
00230   INT32 y_max;
00231   INT32 count = 0;
00232 
00233   while (!feof (box_file)) {
00234     fgets (buff, sizeof (buff) - 1, box_file);
00235     line++;
00236 
00237     /* Check for blank lines in box file */
00238     for (buffptr = buff; isspace (*buffptr); buffptr++)
00239       ;
00240     if (*buffptr != '\0') {
00241       count =
00242         sscanf (buff,
00243         "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
00244         INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
00245       if (count != 5) {
00246         tprintf ("Box file format error on line %i ignored\n", line);
00247       }
00248       else {
00249         *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
00250         return TRUE;             //read a box ok
00251       }
00252     }
00253   }
00254   return FALSE;                  //EOF
00255 }

void report_failed_box ( INT16  boxfile_lineno,
INT16  boxfile_charno,
BOX  box,
char *  box_ch,
const char *  err_msg 
)

Print info on failed box.

If global variable applybox_debug > 4

Definition at line 707 of file applybox.cpp.

References BOX::bottom(), BOX::left(), BOX::right(), BOX::top(), and tprintf().

Referenced by apply_boxes(), and resegment_box().

00711                                             {
00712   if (applybox_debug > 4)
00713     tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
00714       boxfile_lineno,
00715       boxfile_charno,
00716       box_ch,
00717       box.left (), box.bottom (), box.right (), box.top (), err_msg);
00718 }

INT16 resegment_box ( ROW *  row,
BOX  box,
char *  ch,
INT16  block_id,
INT16  row_id,
INT16  boxfile_lineno,
INT16  boxfile_charno 
)

Resegments blobs in box, checking that letters follow prescribed rules.

The error messages show what's going on: box overlaps blob in labelled word; caps-ht char didn't ascend; Odd top char below xht; x-ht char didn't have top near xht; non-ambig BL char didnt have bottom near baseline; Odd bottom char above baseline; Descender doesn't descend; and, finally, Couldn't find any blobs

Definition at line 351 of file applybox.cpp.

References ROW::base_line(), baseline, BOX::bottom(), WERD::bounding_box(), dummy, f, WERD::flag(), gblob_bounding_box(), WERD::gblob_list(), gblob_out_list(), gblob_sort_list(), goutline_bounding_box(), BOX::left(), NULL, outline_it, BOX::overlap(), report_failed_box(), BOX::right(), WERD::set_text(), WERD::shallow_copy(), WERD::text(), BOX::top(), tprintf(), W_POLYGON, ROW::word_list(), and ROW::x_height().

Referenced by apply_boxes().

00358                                           {
00359   WERD_IT word_it;
00360   WERD *word;
00361   WERD *new_word = NULL;
00362   BOOL8 polyg = false;
00363   PBLOB_IT blob_it;
00364   PBLOB_IT new_blob_it;
00365   PBLOB *blob;
00366   PBLOB *new_blob;
00367   OUTLINE_IT outline_it;
00368   OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
00369   OUTLINE_IT new_outline_it = &dummy;
00370   OUTLINE *outline;
00371   BOX new_word_box;
00372   float word_x_centre;
00373   float baseline;
00374   INT16 error_count = 0;         //number of chars lost
00375 
00376   word_it.set_to_list (row->word_list ());
00377   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00378     word = word_it.data ();
00379     polyg = word->flag (W_POLYGON);
00380     if (word->bounding_box ().overlap (box)) {
00381       blob_it.set_to_list (word->gblob_list ());
00382       for (blob_it.mark_cycle_pt ();
00383       !blob_it.cycled_list (); blob_it.forward ()) {
00384         blob = blob_it.data ();
00385         if (gblob_bounding_box (blob, polyg).overlap (box)) {
00386           outline_it.set_to_list (gblob_out_list (blob, polyg));
00387           for (outline_it.mark_cycle_pt ();
00388           !outline_it.cycled_list (); outline_it.forward ()) {
00389             outline = outline_it.data ();
00390             if (goutline_bounding_box (outline, polyg).
00391             major_overlap (box)) {
00392               if (strlen (word->text ()) > 0) {
00393                 if (error_count == 0) {
00394                   error_count = 1;
00395                   if (applybox_debug > 4)
00396                     report_failed_box (boxfile_lineno,
00397                       boxfile_charno,
00398                       box, ch,
00399                       "FAILURE! box overlaps blob in labelled word");
00400                 }
00401                 if (applybox_debug > 4)
00402                   tprintf
00403                     ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
00404                     block_id, row_id,
00405                     word_it.data ()->text ());
00406                 word_it.data ()->set_text ("");
00407                 //UN label it
00408                 error_count++;
00409               }
00410 
00411               if (error_count == 0) {
00412                 if (new_word == NULL) {
00413                                  /* Make a new word with a single blob */
00414                   new_word = word->shallow_copy ();
00415                   new_word->set_text (ch);
00416                   if (polyg)
00417                     new_blob = new PBLOB;
00418                   else
00419                     new_blob = (PBLOB *) new C_BLOB;
00420                   new_blob_it.set_to_list (new_word->
00421                     gblob_list ());
00422                   new_blob_it.add_to_end (new_blob);
00423                   new_outline_it.
00424                     set_to_list (gblob_out_list
00425                     (new_blob, polyg));
00426                 }
00427                 new_outline_it.add_to_end (outline_it.
00428                   extract ());
00429                 //move blob
00430               }
00431             }
00432           }
00433                                  //no outlines in blob
00434           if (outline_it.empty ())
00435                                  //so delete blob
00436             delete blob_it.extract ();
00437         }
00438       }
00439       if (blob_it.empty ())      //no blobs in word
00440                                  //so delete word
00441           delete word_it.extract ();
00442     }
00443   }
00444   if (error_count > 0)
00445     return error_count;
00446 
00447   if (new_word != NULL) {
00448     gblob_sort_list (new_word->gblob_list (), polyg);
00449     word_it.add_to_end (new_word);
00450     new_word_box = new_word->bounding_box ();
00451     word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
00452     baseline = row->base_line (word_x_centre);
00453 
00454     if (STRING (chs_caps_ht).contains (ch[0]) &&
00455       (new_word_box.top () <
00456     baseline + (1 + applybox_error_band) * row->x_height ())) {
00457       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00458         "FAILURE! caps-ht char didn't ascend");
00459       new_word->set_text ("");
00460       return 1;
00461     }
00462     if (STRING (chs_odd_top).contains (ch[0]) &&
00463       (new_word_box.top () <
00464     baseline + (1 - applybox_error_band) * row->x_height ())) {
00465       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00466         "FAILURE! Odd top char below xht");
00467       new_word->set_text ("");
00468       return 1;
00469     }
00470     if (STRING (chs_x_ht).contains (ch[0]) &&
00471       ((new_word_box.top () >
00472       baseline + (1 + applybox_error_band) * row->x_height ()) ||
00473       (new_word_box.top () <
00474     baseline + (1 - applybox_error_band) * row->x_height ()))) {
00475       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00476         "FAILURE! x-ht char didn't have top near xht");
00477       new_word->set_text ("");
00478       return 1;
00479     }
00480     if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
00481       ((new_word_box.bottom () <
00482       baseline - applybox_error_band * row->x_height ()) ||
00483       (new_word_box.bottom () >
00484     baseline + applybox_error_band * row->x_height ()))) {
00485       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00486         "FAILURE! non ambig BL char didnt have bottom near baseline");
00487       new_word->set_text ("");
00488       return 1;
00489     }
00490     if (STRING (chs_odd_bot).contains (ch[0]) &&
00491       (new_word_box.bottom () >
00492     baseline + applybox_error_band * row->x_height ())) {
00493       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00494         "FAILURE! Odd bottom char above baseline");
00495       new_word->set_text ("");
00496       return 1;
00497     }
00498     if (STRING (chs_desc).contains (ch[0]) &&
00499       (new_word_box.bottom () >
00500     baseline - applybox_error_band * row->x_height ())) {
00501       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00502         "FAILURE! Descender doesn't descend");
00503       new_word->set_text ("");
00504       return 1;
00505     }
00506     return 0;
00507   }
00508   else {
00509     report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00510       "FAILURE! Couldn't find any blobs");
00511     return 1;
00512   }
00513 }

void tidy_up ( BLOCK_LIST *  block_list,
INT16 &  ok_char_count,
INT16 &  ok_row_count,
INT16 &  unlabelled_words,
INT16 *  tgt_char_counts,
INT16 &  rebalance_count,
char &  min_char,
INT16 &  min_samples,
INT16 &  final_labelled_blob_count 
)

Tidy up blocks; apply boxes.

Definition at line 525 of file applybox.cpp.

References WERD::bounding_box(), FALSE, WERD::gblob_list(), BOX::left(), NULL, BOX::print(), WERD::text(), tprintf(), TRUE, word_comparator(), and ROW::word_list().

Referenced by apply_boxes().

00534                                                {
00535   BLOCK_IT block_it(block_list);
00536   ROW_IT row_it;
00537   ROW *row;
00538   WERD_IT word_it;
00539   WERD *word;
00540   WERD *duplicate_word;
00541   INT16 block_idx = 0;
00542   INT16 row_idx;
00543   INT16 all_row_idx = 0;
00544   BOOL8 row_ok;
00545   BOOL8 rebalance_needed = FALSE;
00546                                  //No. of unique labelled samples
00547   INT16 labelled_char_counts[128];
00548   INT16 i;
00549   char ch;
00550   char prev_ch = '\0';
00551   BOOL8 at_dupe_of_prev_word;
00552   ROW *prev_row = NULL;
00553   INT16 left;
00554   INT16 prev_left = -1;
00555 
00556   for (i = 0; i < 128; i++)
00557     labelled_char_counts[i] = 0;
00558 
00559   ok_char_count = 0;
00560   ok_row_count = 0;
00561   unlabelled_words = 0;
00562   if ((applybox_debug > 4) && (block_it.length () != 1))
00563 
00564     tprintf ("APPLY_BOXES: More than one block??\n");
00565 
00566   for (block_it.mark_cycle_pt ();
00567   !block_it.cycled_list (); block_it.forward ()) {
00568     block_idx++;
00569     row_idx = 0;
00570     row_ok = FALSE;
00571     row_it.set_to_list (block_it.data ()->row_list ());
00572     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00573       row_idx++;
00574       all_row_idx++;
00575       row = row_it.data ();
00576       word_it.set_to_list (row->word_list ());
00577       word_it.sort (word_comparator);
00578       for (word_it.mark_cycle_pt ();
00579       !word_it.cycled_list (); word_it.forward ()) {
00580         word = word_it.data ();
00581         if (strlen (word->text ()) == 0) {
00582           unlabelled_words++;
00583           if (applybox_debug > 4) {
00584             tprintf
00585               ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
00586               block_idx, row_idx, all_row_idx);
00587           }
00588         }
00589         else {
00590           if (word->gblob_list ()->length () != 1)
00591             tprintf
00592               ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
00593               block_idx, row_idx, all_row_idx);
00594 
00595           ok_char_count++;
00596           labelled_char_counts[*word->text ()]++;
00597           row_ok = TRUE;
00598         }
00599       }
00600       if ((applybox_debug > 4) && (!row_ok)) {
00601         tprintf
00602           ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
00603           block_idx, row_idx, all_row_idx);
00604       }
00605       else
00606         ok_row_count++;
00607     }
00608   }
00609 
00610   min_samples = 9999;
00611   for (i = 0; i < 128; i++) {
00612     if (tgt_char_counts[i] > labelled_char_counts[i]) {
00613       if (labelled_char_counts[i] <= 1) {
00614         tprintf
00615           ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
00616           labelled_char_counts[i], (char) i, tgt_char_counts[i]);
00617       }
00618       else {
00619         rebalance_needed = TRUE;
00620         if (applybox_debug > 0)
00621           tprintf
00622             ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
00623             (char) i, tgt_char_counts[i], labelled_char_counts[i]);
00624       }
00625     }
00626     if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
00627       min_samples = labelled_char_counts[i];
00628       min_char = (char) i;
00629     }
00630   }
00631 
00632   while (applybox_rebalance && rebalance_needed) {
00633     block_it.set_to_list (block_list);
00634     for (block_it.mark_cycle_pt ();
00635     !block_it.cycled_list (); block_it.forward ()) {
00636       row_it.set_to_list (block_it.data ()->row_list ());
00637       for (row_it.mark_cycle_pt ();
00638       !row_it.cycled_list (); row_it.forward ()) {
00639         row = row_it.data ();
00640         word_it.set_to_list (row->word_list ());
00641         for (word_it.mark_cycle_pt ();
00642         !word_it.cycled_list (); word_it.forward ()) {
00643           word = word_it.data ();
00644           left = word->bounding_box ().left ();
00645           ch = *word->text ();
00646           at_dupe_of_prev_word = ((row == prev_row) &&
00647             (left = prev_left) &&
00648             (ch == prev_ch));
00649           if ((ch != '\0') &&
00650             (labelled_char_counts[ch] > 1) &&
00651             (tgt_char_counts[ch] > labelled_char_counts[ch]) &&
00652           (!at_dupe_of_prev_word)) {
00653             /* Duplicate the word to rebalance the labelled samples */
00654             if (applybox_debug > 9) {
00655               tprintf ("Duping \"%c\" from ", ch);
00656               word->bounding_box ().print ();
00657             }
00658             duplicate_word = new WERD;
00659             *duplicate_word = *word;
00660             word_it.add_after_then_move (duplicate_word);
00661             rebalance_count++;
00662             labelled_char_counts[ch]++;
00663           }
00664           prev_row = row;
00665           prev_left = left;
00666           prev_ch = ch;
00667         }
00668       }
00669     }
00670     rebalance_needed = FALSE;
00671     for (i = 0; i < 128; i++) {
00672       if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
00673       (labelled_char_counts[i] > 1)) {
00674         rebalance_needed = TRUE;
00675         break;
00676       }
00677     }
00678   }
00679 
00680   /* Now final check - count labelled blobs */
00681   final_labelled_blob_count = 0;
00682   block_it.set_to_list (block_list);
00683   for (block_it.mark_cycle_pt ();
00684   !block_it.cycled_list (); block_it.forward ()) {
00685     row_it.set_to_list (block_it.data ()->row_list ());
00686     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00687       row = row_it.data ();
00688       word_it.set_to_list (row->word_list ());
00689       word_it.sort (word_comparator);
00690       for (word_it.mark_cycle_pt ();
00691       !word_it.cycled_list (); word_it.forward ()) {
00692         word = word_it.data ();
00693         if ((strlen (word->text ()) == 1) &&
00694           (word->gblob_list ()->length () == 1))
00695           final_labelled_blob_count++;
00696       }
00697     }
00698   }
00699 }


Generated on Wed Feb 28 19:49:13 2007 for Tesseract by  doxygen 1.5.1