ccmain/applybox.cpp

Go to the documentation of this file.
00001 
00020 /*
00021 define SECURE_NAMES for code versions which go to UNLV to stop tessedit
00022 including all the newdiff stuff (which contains lots of text indicating
00023 what measures we are interested in).
00024 */
00025 /* #define SECURE_NAMES done in secnames.h when necessary*/
00026 
00027 #include "mfcpch.h"
00028 #include          "applybox.h"
00029 #include          <ctype.h>
00030 #include          <string.h>
00031 #ifdef __UNIX__
00032 #include          <assert.h>
00033 #include                    <errno.h>
00034 #endif
00035 #include          "mainblk.h"
00036 #include          "genblob.h"
00037 #include          "fixxht.h"
00038 #include          "control.h"
00039 #include          "tessbox.h"
00040 #include          "globals.h"
00041 #include          "secname.h"
00042 #ifdef TEXT_VERBOSE
00043 #include    "callcpp.h"
00044 #endif
00045 
00046 #define SECURE_NAMES
00047 #ifndef SECURE_NAMES
00048 #include          "wordstats.h"
00049 #endif
00050 
// EXTERN is defined as empty here, so it adds nothing to the lines below.
// NOTE(review): presumably other files see these through "extern"
// declarations and this file holds the definitions - confirm against the
// BOOL_VAR/INT_VAR/STRING_VAR/double_VAR macro definitions.
00051 #define EXTERN
00052 
// When TRUE, tidy_up() duplicates labelled words until each char's labelled
// sample count reaches the number of boxes read for that char.
00055 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
// Verbosity of APPLY_BOXES diagnostics; this file tests it against 0, 4 and 9.
00056 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
// Labelled chars contained in this string are skipped by apply_box_testing().
00057 EXTERN STRING_VAR (applybox_test_exclusions, "|",
00058 "Chars ignored for testing");
// Tolerance, as a fraction of the row x-height, used by the top/bottom
// position checks in resegment_box().
00059 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
00100 void apply_boxes(BLOCK_LIST *block_list    //real blocks
00101                 ) {
00102   INT16 boxfile_lineno = 0;
00103   INT16 boxfile_charno = 0;
00104   BOX box;                       //boxfile box
00105   char ch[2];                    //correct ch from boxfile
00106   ROW *row;
00107   ROW *prev_row = NULL;
00108   INT16 prev_box_right = MAX_INT16;
00109   INT16 block_id;
00110   INT16 row_id;
00111   INT16 box_count = 0;
00112   INT16 box_failures = 0;
00113   INT16 labels_ok;
00114   INT16 rows_ok;
00115   INT16 bad_blobs;
00116   INT16 tgt_char_counts[128];    //No. of box samples
00117   // INT16  labelled_char_counts[128];    //No. of unique labelled samples
00118   INT16 i;
00119   INT16 rebalance_count = 0;
00120   char min_char;
00121   INT16 min_samples;
00122   INT16 final_labelled_blob_count;
00123 
00124   for (i = 0; i < 128; i++)
00125     tgt_char_counts[i] = 0;
00126 
00127   FILE* box_file;
00128   STRING filename = imagefile;
00129   filename += ".box";
00130   if (!(box_file = fopen (filename.string(), "r"))) {
00131     CANTOPENFILE.error ("read_next_box", EXIT,
00132       "Cant open box file %s %d",
00133       filename.string(), errno);
00134   }
00135 
00136   ch[1] = '\0';
00137   clear_any_old_text(block_list);
00138   while (read_next_box (box_file, &box, &ch[0])) {
00139     box_count++;
00140     tgt_char_counts[ch[0]]++;
00141     row = find_row_of_box (block_list, box, block_id, row_id);
00142     if (box.left () < prev_box_right) {
00143       boxfile_lineno++;
00144       boxfile_charno = 1;
00145     }
00146     else
00147       boxfile_charno++;
00148 
00149     if (row == NULL) {
00150       box_failures++;
00151       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00152         "FAILURE! box overlaps no blobs or blobs in multiple rows");
00153     }
00154     else {
00155       if ((box.left () >= prev_box_right) && (row != prev_row))
00156         report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00157           "WARNING! false row break");
00158       box_failures += resegment_box (row, box, ch, block_id, row_id,
00159         boxfile_lineno, boxfile_charno);
00160       prev_row = row;
00161     }
00162     prev_box_right = box.right ();
00163   }
00164   tidy_up(block_list,
00165           labels_ok,
00166           rows_ok,
00167           bad_blobs,
00168           tgt_char_counts,
00169           rebalance_count,
00170           min_char,
00171           min_samples,
00172           final_labelled_blob_count);
00173   tprintf ("APPLY_BOXES:\n");
00174   tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
00175   tprintf ("   Initially labelled blobs: %6d in %d rows\n",
00176     labels_ok, rows_ok);
00177   tprintf ("   Box failures detected:     %6d\n", box_failures);
00178   tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
00179   tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
00180   tprintf ("            Total unlabelled words:   %6d\n",
00181     bad_blobs);
00182   tprintf ("            Final labelled words:     %6d\n",
00183     final_labelled_blob_count);
00184 }
00185 
00186 
00190 void clear_any_old_text(
00191                         BLOCK_LIST *block_list  //real blocks
00192                        ) {
00193   BLOCK_IT block_it(block_list);
00194   ROW_IT row_it;
00195   WERD_IT word_it;
00196 
00197   for (block_it.mark_cycle_pt ();
00198   !block_it.cycled_list (); block_it.forward ()) {
00199     row_it.set_to_list (block_it.data ()->row_list ());
00200     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00201       word_it.set_to_list (row_it.data ()->word_list ());
00202       for (word_it.mark_cycle_pt ();
00203       !word_it.cycled_list (); word_it.forward ()) {
00204         word_it.data ()->set_text ("");
00205       }
00206     }
00207   }
00208 }
00209 
00210 
00220 BOOL8 read_next_box(FILE* box_file,  //
00221                     BOX *box,
00222                     char *ch) {
00223   char buff[256];                //boxfile read buffer
00224   char *buffptr = buff;
00225   STRING box_filename;
00226   static INT16 line = 0;
00227   INT32 x_min;
00228   INT32 y_min;
00229   INT32 x_max;
00230   INT32 y_max;
00231   INT32 count = 0;
00232 
00233   while (!feof (box_file)) {
00234     fgets (buff, sizeof (buff) - 1, box_file);
00235     line++;
00236 
00237     /* Check for blank lines in box file */
00238     for (buffptr = buff; isspace (*buffptr); buffptr++)
00239       ;
00240     if (*buffptr != '\0') {
00241       count =
00242         sscanf (buff,
00243         "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
00244         INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
00245       if (count != 5) {
00246         tprintf ("Box file format error on line %i ignored\n", line);
00247       }
00248       else {
00249         *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
00250         return TRUE;             //read a box ok
00251       }
00252     }
00253   }
00254   return FALSE;                  //EOF
00255 }
00256 
00263 ROW *find_row_of_box(                         //
00264                      BLOCK_LIST *block_list,  //real blocks
00265                      BOX box,                 //from boxfile
00266                      INT16 &block_id,
00267                      INT16 &row_id_to_process) {
00268   BLOCK_IT block_it(block_list);
00269   BLOCK *block;
00270   ROW_IT row_it;
00271   ROW *row;
00272   ROW *row_to_process = NULL;
00273   INT16 row_id;
00274   WERD_IT word_it;
00275   WERD *word;
00276   BOOL8 polyg;
00277   PBLOB_IT blob_it;
00278   PBLOB *blob;
00279   OUTLINE_IT outline_it;
00280   OUTLINE *outline;
00281 
00282   /*
00283     Find row to process - error if box REALLY overlaps more than one row. (I.e
00284     it overlaps blobs in the row - not just overlaps the bounding box of the
00285     whole row.)
00286   */
00287 
00288   block_id = 0;
00289   for (block_it.mark_cycle_pt ();
00290   !block_it.cycled_list (); block_it.forward ()) {
00291     block_id++;
00292     row_id = 0;
00293     block = block_it.data ();
00294     if (block->bounding_box ().overlap (box)) {
00295       row_it.set_to_list (block->row_list ());
00296       for (row_it.mark_cycle_pt ();
00297       !row_it.cycled_list (); row_it.forward ()) {
00298         row_id++;
00299         row = row_it.data ();
00300         if (row->bounding_box ().overlap (box)) {
00301           word_it.set_to_list (row->word_list ());
00302           for (word_it.mark_cycle_pt ();
00303           !word_it.cycled_list (); word_it.forward ()) {
00304             word = word_it.data ();
00305             polyg = word->flag (W_POLYGON);
00306             if (word->bounding_box ().overlap (box)) {
00307               blob_it.set_to_list (word->gblob_list ());
00308               for (blob_it.mark_cycle_pt ();
00309               !blob_it.cycled_list (); blob_it.forward ()) {
00310                 blob = blob_it.data ();
00311                 if (gblob_bounding_box (blob, polyg).
00312                 overlap (box)) {
00313                   outline_it.
00314                     set_to_list (gblob_out_list
00315                     (blob, polyg));
00316                   for (outline_it.mark_cycle_pt ();
00317                     !outline_it.cycled_list ();
00318                   outline_it.forward ()) {
00319                     outline = outline_it.data ();
00320                     if (goutline_bounding_box
00321                     (outline, polyg).major_overlap (box)) {
00322                       if ((row_to_process == NULL) ||
00323                       (row_to_process == row)) {
00324                         row_to_process = row;
00325                         row_id_to_process = row_id;
00326                       }
00327                       else
00328                         /* RETURN ERROR Box overlaps blobs in more than one row  */
00329                         return NULL;
00330                     }
00331                   }
00332                 }
00333               }
00334             }
00335           }
00336         }
00337       }
00338     }
00339   }
00340   return row_to_process;
00341 }
00342 
00351 INT16 resegment_box(  //
00352                     ROW *row,
00353                     BOX box,
00354                     char *ch,
00355                     INT16 block_id,
00356                     INT16 row_id,
00357                     INT16 boxfile_lineno,
00358                     INT16 boxfile_charno) {
00359   WERD_IT word_it;
00360   WERD *word;
00361   WERD *new_word = NULL;
00362   BOOL8 polyg = false;
00363   PBLOB_IT blob_it;
00364   PBLOB_IT new_blob_it;
00365   PBLOB *blob;
00366   PBLOB *new_blob;
00367   OUTLINE_IT outline_it;
00368   OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
00369   OUTLINE_IT new_outline_it = &dummy;
00370   OUTLINE *outline;
00371   BOX new_word_box;
00372   float word_x_centre;
00373   float baseline;
00374   INT16 error_count = 0;         //number of chars lost
00375 
00376   word_it.set_to_list (row->word_list ());
00377   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00378     word = word_it.data ();
00379     polyg = word->flag (W_POLYGON);
00380     if (word->bounding_box ().overlap (box)) {
00381       blob_it.set_to_list (word->gblob_list ());
00382       for (blob_it.mark_cycle_pt ();
00383       !blob_it.cycled_list (); blob_it.forward ()) {
00384         blob = blob_it.data ();
00385         if (gblob_bounding_box (blob, polyg).overlap (box)) {
00386           outline_it.set_to_list (gblob_out_list (blob, polyg));
00387           for (outline_it.mark_cycle_pt ();
00388           !outline_it.cycled_list (); outline_it.forward ()) {
00389             outline = outline_it.data ();
00390             if (goutline_bounding_box (outline, polyg).
00391             major_overlap (box)) {
00392               if (strlen (word->text ()) > 0) {
00393                 if (error_count == 0) {
00394                   error_count = 1;
00395                   if (applybox_debug > 4)
00396                     report_failed_box (boxfile_lineno,
00397                       boxfile_charno,
00398                       box, ch,
00399                       "FAILURE! box overlaps blob in labelled word");
00400                 }
00401                 if (applybox_debug > 4)
00402                   tprintf
00403                     ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
00404                     block_id, row_id,
00405                     word_it.data ()->text ());
00406                 word_it.data ()->set_text ("");
00407                 //UN label it
00408                 error_count++;
00409               }
00410 
00411               if (error_count == 0) {
00412                 if (new_word == NULL) {
00413                                  /* Make a new word with a single blob */
00414                   new_word = word->shallow_copy ();
00415                   new_word->set_text (ch);
00416                   if (polyg)
00417                     new_blob = new PBLOB;
00418                   else
00419                     new_blob = (PBLOB *) new C_BLOB;
00420                   new_blob_it.set_to_list (new_word->
00421                     gblob_list ());
00422                   new_blob_it.add_to_end (new_blob);
00423                   new_outline_it.
00424                     set_to_list (gblob_out_list
00425                     (new_blob, polyg));
00426                 }
00427                 new_outline_it.add_to_end (outline_it.
00428                   extract ());
00429                 //move blob
00430               }
00431             }
00432           }
00433                                  //no outlines in blob
00434           if (outline_it.empty ())
00435                                  //so delete blob
00436             delete blob_it.extract ();
00437         }
00438       }
00439       if (blob_it.empty ())      //no blobs in word
00440                                  //so delete word
00441           delete word_it.extract ();
00442     }
00443   }
00444   if (error_count > 0)
00445     return error_count;
00446 
00447   if (new_word != NULL) {
00448     gblob_sort_list (new_word->gblob_list (), polyg);
00449     word_it.add_to_end (new_word);
00450     new_word_box = new_word->bounding_box ();
00451     word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
00452     baseline = row->base_line (word_x_centre);
00453 
00454     if (STRING (chs_caps_ht).contains (ch[0]) &&
00455       (new_word_box.top () <
00456     baseline + (1 + applybox_error_band) * row->x_height ())) {
00457       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00458         "FAILURE! caps-ht char didn't ascend");
00459       new_word->set_text ("");
00460       return 1;
00461     }
00462     if (STRING (chs_odd_top).contains (ch[0]) &&
00463       (new_word_box.top () <
00464     baseline + (1 - applybox_error_band) * row->x_height ())) {
00465       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00466         "FAILURE! Odd top char below xht");
00467       new_word->set_text ("");
00468       return 1;
00469     }
00470     if (STRING (chs_x_ht).contains (ch[0]) &&
00471       ((new_word_box.top () >
00472       baseline + (1 + applybox_error_band) * row->x_height ()) ||
00473       (new_word_box.top () <
00474     baseline + (1 - applybox_error_band) * row->x_height ()))) {
00475       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00476         "FAILURE! x-ht char didn't have top near xht");
00477       new_word->set_text ("");
00478       return 1;
00479     }
00480     if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
00481       ((new_word_box.bottom () <
00482       baseline - applybox_error_band * row->x_height ()) ||
00483       (new_word_box.bottom () >
00484     baseline + applybox_error_band * row->x_height ()))) {
00485       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00486         "FAILURE! non ambig BL char didnt have bottom near baseline");
00487       new_word->set_text ("");
00488       return 1;
00489     }
00490     if (STRING (chs_odd_bot).contains (ch[0]) &&
00491       (new_word_box.bottom () >
00492     baseline + applybox_error_band * row->x_height ())) {
00493       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00494         "FAILURE! Odd bottom char above baseline");
00495       new_word->set_text ("");
00496       return 1;
00497     }
00498     if (STRING (chs_desc).contains (ch[0]) &&
00499       (new_word_box.bottom () >
00500     baseline - applybox_error_band * row->x_height ())) {
00501       report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00502         "FAILURE! Descender doesn't descend");
00503       new_word->set_text ("");
00504       return 1;
00505     }
00506     return 0;
00507   }
00508   else {
00509     report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
00510       "FAILURE! Couldn't find any blobs");
00511     return 1;
00512   }
00513 }
00514 
00515 
00525 void tidy_up(                         //
00526              BLOCK_LIST *block_list,  //real blocks
00527              INT16 &ok_char_count,
00528              INT16 &ok_row_count,
00529              INT16 &unlabelled_words,
00530              INT16 *tgt_char_counts,
00531              INT16 &rebalance_count,
00532              char &min_char,
00533              INT16 &min_samples,
00534              INT16 &final_labelled_blob_count) {
00535   BLOCK_IT block_it(block_list);
00536   ROW_IT row_it;
00537   ROW *row;
00538   WERD_IT word_it;
00539   WERD *word;
00540   WERD *duplicate_word;
00541   INT16 block_idx = 0;
00542   INT16 row_idx;
00543   INT16 all_row_idx = 0;
00544   BOOL8 row_ok;
00545   BOOL8 rebalance_needed = FALSE;
00546                                  //No. of unique labelled samples
00547   INT16 labelled_char_counts[128];
00548   INT16 i;
00549   char ch;
00550   char prev_ch = '\0';
00551   BOOL8 at_dupe_of_prev_word;
00552   ROW *prev_row = NULL;
00553   INT16 left;
00554   INT16 prev_left = -1;
00555 
00556   for (i = 0; i < 128; i++)
00557     labelled_char_counts[i] = 0;
00558 
00559   ok_char_count = 0;
00560   ok_row_count = 0;
00561   unlabelled_words = 0;
00562   if ((applybox_debug > 4) && (block_it.length () != 1))
00563 
00564     tprintf ("APPLY_BOXES: More than one block??\n");
00565 
00566   for (block_it.mark_cycle_pt ();
00567   !block_it.cycled_list (); block_it.forward ()) {
00568     block_idx++;
00569     row_idx = 0;
00570     row_ok = FALSE;
00571     row_it.set_to_list (block_it.data ()->row_list ());
00572     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00573       row_idx++;
00574       all_row_idx++;
00575       row = row_it.data ();
00576       word_it.set_to_list (row->word_list ());
00577       word_it.sort (word_comparator);
00578       for (word_it.mark_cycle_pt ();
00579       !word_it.cycled_list (); word_it.forward ()) {
00580         word = word_it.data ();
00581         if (strlen (word->text ()) == 0) {
00582           unlabelled_words++;
00583           if (applybox_debug > 4) {
00584             tprintf
00585               ("APPLY_BOXES: Unlabelled word blk:%d row:%d allrows:%d\n",
00586               block_idx, row_idx, all_row_idx);
00587           }
00588         }
00589         else {
00590           if (word->gblob_list ()->length () != 1)
00591             tprintf
00592               ("APPLY_BOXES: FATALITY - MULTIBLOB Labelled word blk:%d row:%d allrows:%d\n",
00593               block_idx, row_idx, all_row_idx);
00594 
00595           ok_char_count++;
00596           labelled_char_counts[*word->text ()]++;
00597           row_ok = TRUE;
00598         }
00599       }
00600       if ((applybox_debug > 4) && (!row_ok)) {
00601         tprintf
00602           ("APPLY_BOXES: Row with no labelled words blk:%d row:%d allrows:%d\n",
00603           block_idx, row_idx, all_row_idx);
00604       }
00605       else
00606         ok_row_count++;
00607     }
00608   }
00609 
00610   min_samples = 9999;
00611   for (i = 0; i < 128; i++) {
00612     if (tgt_char_counts[i] > labelled_char_counts[i]) {
00613       if (labelled_char_counts[i] <= 1) {
00614         tprintf
00615           ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
00616           labelled_char_counts[i], (char) i, tgt_char_counts[i]);
00617       }
00618       else {
00619         rebalance_needed = TRUE;
00620         if (applybox_debug > 0)
00621           tprintf
00622             ("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
00623             (char) i, tgt_char_counts[i], labelled_char_counts[i]);
00624       }
00625     }
00626     if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
00627       min_samples = labelled_char_counts[i];
00628       min_char = (char) i;
00629     }
00630   }
00631 
00632   while (applybox_rebalance && rebalance_needed) {
00633     block_it.set_to_list (block_list);
00634     for (block_it.mark_cycle_pt ();
00635     !block_it.cycled_list (); block_it.forward ()) {
00636       row_it.set_to_list (block_it.data ()->row_list ());
00637       for (row_it.mark_cycle_pt ();
00638       !row_it.cycled_list (); row_it.forward ()) {
00639         row = row_it.data ();
00640         word_it.set_to_list (row->word_list ());
00641         for (word_it.mark_cycle_pt ();
00642         !word_it.cycled_list (); word_it.forward ()) {
00643           word = word_it.data ();
00644           left = word->bounding_box ().left ();
00645           ch = *word->text ();
00646           at_dupe_of_prev_word = ((row == prev_row) &&
00647             (left = prev_left) &&
00648             (ch == prev_ch));
00649           if ((ch != '\0') &&
00650             (labelled_char_counts[ch] > 1) &&
00651             (tgt_char_counts[ch] > labelled_char_counts[ch]) &&
00652           (!at_dupe_of_prev_word)) {
00653             /* Duplicate the word to rebalance the labelled samples */
00654             if (applybox_debug > 9) {
00655               tprintf ("Duping \"%c\" from ", ch);
00656               word->bounding_box ().print ();
00657             }
00658             duplicate_word = new WERD;
00659             *duplicate_word = *word;
00660             word_it.add_after_then_move (duplicate_word);
00661             rebalance_count++;
00662             labelled_char_counts[ch]++;
00663           }
00664           prev_row = row;
00665           prev_left = left;
00666           prev_ch = ch;
00667         }
00668       }
00669     }
00670     rebalance_needed = FALSE;
00671     for (i = 0; i < 128; i++) {
00672       if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
00673       (labelled_char_counts[i] > 1)) {
00674         rebalance_needed = TRUE;
00675         break;
00676       }
00677     }
00678   }
00679 
00680   /* Now final check - count labelled blobs */
00681   final_labelled_blob_count = 0;
00682   block_it.set_to_list (block_list);
00683   for (block_it.mark_cycle_pt ();
00684   !block_it.cycled_list (); block_it.forward ()) {
00685     row_it.set_to_list (block_it.data ()->row_list ());
00686     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00687       row = row_it.data ();
00688       word_it.set_to_list (row->word_list ());
00689       word_it.sort (word_comparator);
00690       for (word_it.mark_cycle_pt ();
00691       !word_it.cycled_list (); word_it.forward ()) {
00692         word = word_it.data ();
00693         if ((strlen (word->text ()) == 1) &&
00694           (word->gblob_list ()->length () == 1))
00695           final_labelled_blob_count++;
00696       }
00697     }
00698   }
00699 }
00700 
00701 
00707 void report_failed_box(INT16 boxfile_lineno,
00708                        INT16 boxfile_charno,
00709                        BOX box,
00710                        char *box_ch,
00711                        const char *err_msg) {
00712   if (applybox_debug > 4)
00713     tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
00714       boxfile_lineno,
00715       boxfile_charno,
00716       box_ch,
00717       box.left (), box.bottom (), box.right (), box.top (), err_msg);
00718 }
00719 
00720 
00727 void apply_box_training(BLOCK_LIST *block_list) { 
00728   BLOCK_IT block_it(block_list); 
00729   ROW_IT row_it;
00730   ROW *row;
00731   WERD_IT word_it;
00732   WERD *word;
00733   WERD *bln_word;
00734   WERD copy_outword;             // copy to denorm
00735   PBLOB_IT blob_it;
00736   DENORM denorm;
00737   INT16 count = 0;
00738   char ch[2];
00739 
00740   ch[1] = '\0';
00741 
00742   tprintf ("Generating training data\n");
00743   for (block_it.mark_cycle_pt ();
00744   !block_it.cycled_list (); block_it.forward ()) {
00745     row_it.set_to_list (block_it.data ()->row_list ());
00746     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00747       row = row_it.data ();
00748       word_it.set_to_list (row->word_list ());
00749       for (word_it.mark_cycle_pt ();
00750       !word_it.cycled_list (); word_it.forward ()) {
00751         word = word_it.data ();
00752         if ((strlen (word->text ()) == 1) &&
00753         (word->gblob_list ()->length () == 1)) {
00754           /* Here is a word with a single char label and a single blob so train on it */
00755           bln_word =
00756             make_bln_copy (word, row, row->x_height (), &denorm);
00757           blob_it.set_to_list (bln_word->blob_list ());
00758           ch[0] = *word->text ();
00759           tess_training_tester (blob_it.data (),
00760                                  //single blob
00761             &denorm, TRUE,       //correct
00762             ch,                  //correct ASCII char
00763             1,                   //ASCII length
00764             NULL);
00765           copy_outword = *(bln_word);
00766           copy_outword.baseline_denormalise (&denorm);
00767           blob_it.set_to_list (copy_outword.blob_list ());
00768           ch[0] = *word->text ();
00769           delete bln_word;
00770           count++;
00771         }
00772       }
00773     }
00774   }
00775   tprintf ("Generated training data for %d blobs\n", count);
00776 }
00777 
00778 
00789 void apply_box_testing(BLOCK_LIST *block_list) { 
00790   BLOCK_IT block_it(block_list); 
00791   ROW_IT row_it;
00792   ROW *row;
00793   INT16 row_count = 0;
00794   WERD_IT word_it;
00795   WERD *word;
00796   WERD *bln_word;
00797   INT16 word_count = 0;
00798   PBLOB_IT blob_it;
00799   DENORM denorm;
00800   INT16 count = 0;
00801   char ch[2];
00802   WERD *outword;                 //bln best choice
00803   //segmentation
00804   WERD_CHOICE *best_choice;      //tess output
00805   WERD_CHOICE *raw_choice;       //top choice permuter
00806                                  //detailed results
00807   BLOB_CHOICE_LIST_CLIST blob_choices;
00808   INT16 char_count = 0;
00809   INT16 correct_count = 0;
00810   INT16 err_count = 0;
00811   INT16 rej_count = 0;
00812   #ifndef SECURE_NAMES
00813   WERDSTATS wordstats;           //As from newdiff
00814   #endif
00815   char tess_rej_str[3];
00816   char tess_long_str[3];
00817 
00818   ch[1] = '\0';
00819   strcpy (tess_rej_str, "|A");
00820   strcpy (tess_long_str, "|B");
00821 
00822   for (block_it.mark_cycle_pt ();
00823   !block_it.cycled_list (); block_it.forward ()) {
00824     row_it.set_to_list (block_it.data ()->row_list ());
00825     for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
00826       row = row_it.data ();
00827       row_count++;
00828       word_count = 0;
00829       word_it.set_to_list (row->word_list ());
00830       for (word_it.mark_cycle_pt ();
00831       !word_it.cycled_list (); word_it.forward ()) {
00832         word = word_it.data ();
00833         word_count++;
00834         if ((strlen (word->text ()) == 1) &&
00835           !STRING (applybox_test_exclusions).contains (*word->text ())
00836         && (word->gblob_list ()->length () == 1)) {
00837           /* Here is a word with a single char label and a single blob so test it */
00838           bln_word =
00839             make_bln_copy (word, row, row->x_height (), &denorm);
00840           blob_it.set_to_list (bln_word->blob_list ());
00841           ch[0] = *word->text ();
00842           char_count++;
00843           best_choice = tess_segment_pass1 (bln_word,
00844             &denorm,
00845             tess_default_matcher,
00846             raw_choice,
00847             &blob_choices, outword);
00848 
00849           /* Test for TESS screw up on word. */
00850           if ((best_choice->string ().length () == 0) ||
00851             (strspn (best_choice->string ().string (), " ") ==
00852           best_choice->string ().length ())) {
00853             rej_count++;
00854             tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",
00855               row_count, word_count, ch);
00856             #ifndef SECURE_NAMES
00857             wordstats.word (tess_rej_str, 2, ch, 1);
00858             #endif
00859           }
00860           else {
00861             if ((best_choice->string ().length () !=
00862               outword->blob_list ()->length ()) ||
00863               (best_choice->string ().length () !=
00864             blob_choices.length ())) {
00865               tprintf
00866                 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00867                 best_choice->string ().string (),
00868                 best_choice->string ().length (),
00869                 outword->blob_list ()->length (),
00870                 blob_choices.length ());
00871             }
00872             ASSERT_HOST (best_choice->string ().length () ==
00873               outword->blob_list ()->length ());
00874             ASSERT_HOST (best_choice->string ().length () ==
00875               blob_choices.length ());
00876             fix_quotes ((char *) best_choice->string ().string (),
00877                                  //turn to double
00878               outword, &blob_choices);
00879             if (strcmp (best_choice->string ().string (), ch) != 0) {
00880               err_count++;
00881               tprintf ("%d:%d: \"%s\" -> \"%s\"\n",
00882                 row_count, word_count, ch,
00883                 best_choice->string ().string ());
00884             }
00885             else
00886               correct_count++;
00887             #ifndef SECURE_NAMES
00888             if (best_choice->string ().length () > 2)
00889               wordstats.word (tess_long_str, 2, ch, 1);
00890             else
00891               wordstats.word ((char *) best_choice->string ().
00892                 string (),
00893                 best_choice->string ().length (), ch,
00894                 1);
00895             #endif
00896           }
00897           delete bln_word;
00898           delete outword;
00899           delete best_choice;
00900           delete raw_choice;
00901           blob_choices.deep_clear ();
00902           count++;
00903         }
00904       }
00905     }
00906   }
00907   #ifndef SECURE_NAMES
00908   wordstats.print (1, 100.0);
00909   wordstats.conf_matrix ();
00910   tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",
00911     char_count, correct_count, rej_count, err_count);
00912   #endif
00913 }

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1