wordrec/chopper.cpp

Go to the documentation of this file.
00001 
00021 /*----------------------------------------------------------------------
00022           I n c l u d e s
00023 ----------------------------------------------------------------------*/
00024 #include "chopper.h"
00025 #include "wordclass.h"
00026 #include "makechop.h"
00027 #include "associate.h"
00028 #include "metrics.h"
00029 #include "tordvars.h"
00030 #include "stopper.h"
00031 #include "callcpp.h"
00032 #include "structures.h"
00033 #include "findseam.h"
00034 #include "render.h"
00035 #include "seam.h"
00036 #include "const.h"
00037 #include "freelist.h"
00038 #include "pieces.h"
00039 #include "permute.h"
00040 //#include "tessvars.h"
00041 
00042 #include <math.h>
00043 
00044 extern int blob_skip;  /* declared here, defined elsewhere; a non-zero value ends the retry loops in improve_one_blob and improve_by_chopping */
00047 INT_VAR (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");  /* when non-zero, attempt_blob_chop marks the outlines before a chop and restores them if the chop is rejected */
00048 
00049 //?extern int tessedit_dangambigs_chop;
00050 double_VAR (tessedit_certainty_threshold, -2.25, "Good blob limit");  /* select_blob_to_split only considers blobs whose best certainty is below this value */
00053 /*----------------------------------------------------------------------
00054           M a c r o s
00055 ----------------------------------------------------------------------*/
/*
 * bounds_inside
 *
 * Non-zero when the box described by (inner_tl, inner_br) lies within
 * the box described by (outer_tl, outer_br), edges included.  Each
 * argument must have .x and .y members.  The comparisons treat the
 * top-left corner as having the smaller x and the larger y.
 *
 * Every argument is parenthesized so that expressions such as *ptr can
 * be passed, and the definition no longer ends in a line continuation
 * that would splice the following source line into the macro.
 */
#define bounds_inside(inner_tl,inner_br,outer_tl,outer_br)  \
  (((inner_tl).x >= (outer_tl).x) && \
   ((inner_tl).y <= (outer_tl).y) && \
   ((inner_br).x <= (outer_br).x) && \
   ((inner_br).y >= (outer_br).y))
00065 
00066 
/*
 * set_null_choice
 *
 * Reset a choice record: its string becomes NULL, its probability
 * MAX_FLOAT32 and its certainty -MAX_FLOAT32.  The expansion is a single
 * comma expression, so it can be used wherever an expression is allowed.
 *
 * The definition deliberately does not end in a backslash; a trailing
 * line continuation would splice the next source line into the macro.
 */
#define set_null_choice(choice)                \
  (class_string      (choice) =  NULL,         \
   class_probability (choice) =  MAX_FLOAT32,  \
   class_certainty   (choice) = -MAX_FLOAT32)
00073 
00074 /*----------------------------------------------------------------------
00075           F u n c t i o n s
00076 ----------------------------------------------------------------------*/
00077 /* ================== */
00081 void preserve_outline(EDGEPT *start) { 
00082   EDGEPT *srcpt;
00083 
00084   if (start == NULL)
00085     return;
00086   srcpt = start;
00087   do {
00088     srcpt->flags[1] = 1;
00089     srcpt = srcpt->next;
00090   }
00091   while (srcpt != start);
00092   srcpt->flags[1] = 2;
00093 }
00094 
00095 
00096 /* ================== */
00102 void preserve_outline_tree(TESSLINE *srcline) { 
00103   TESSLINE *outline;
00104 
00105   for (outline = srcline; outline != NULL; outline = outline->next) {
00106     preserve_outline (outline->loop);
00107   }
00108   if (srcline->child != NULL)
00109     preserve_outline_tree (srcline->child);
00110 }
00111 
00112 
00113 /* ================== */
00117 EDGEPT *restore_outline(EDGEPT *start) {
00118   EDGEPT *srcpt;
00119   EDGEPT *real_start;
00120   EDGEPT *deadpt;
00121 
00122   if (start == NULL)
00123     return NULL;
00124   srcpt = start;
00125   do {
00126     if (srcpt->flags[1] == 2)
00127       break;
00128     srcpt = srcpt->next;
00129   }
00130   while (srcpt != start);
00131   real_start = srcpt;
00132   do {
00133     if (srcpt->flags[1] == 0) {
00134       deadpt = srcpt;
00135       srcpt = srcpt->next;
00136       srcpt->prev = deadpt->prev;
00137       deadpt->prev->next = srcpt;
00138       deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
00139       deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
00140       oldedgept(deadpt);
00141     }
00142     else
00143       srcpt = srcpt->next;
00144   }
00145   while (srcpt != real_start);
00146   return real_start;
00147 }
00148 
00149 
00150 /* ================== */
00156 void restore_outline_tree(TESSLINE *srcline) { 
00157   TESSLINE *outline;
00158 
00159   for (outline = srcline; outline != NULL; outline = outline->next) {
00160     outline->loop = restore_outline (outline->loop);
00161     outline->start = outline->loop->pos;
00162   }
00163   if (srcline->child != NULL)
00164     restore_outline_tree (srcline->child);
00165 }
00166 
00167 
00168 /* ================== */
00181 SEAM *attempt_blob_chop(TWERD *word, INT32 blob_number, SEAMS seam_list) { 
00182   TBLOB *blob;
00183   TBLOB *other_blob;
00184   SEAM *seam;
00185   TBLOB *last_blob;
00186   TBLOB *next_blob;
00187   INT16 x;
00188 
00189   if (first_pass)
00190     chops_attempted1++;
00191   else
00192     chops_attempted2++;
00193 
00194   last_blob = NULL;
00195   blob = word->blobs;
00196   for (x = 0; x < blob_number; x++) {
00197     last_blob = blob;
00198     blob = blob->next;
00199   }
00200   next_blob = blob->next;
00201 
00202   if (repair_unchopped_blobs)
00203     preserve_outline_tree (blob->outlines);
00204   other_blob = newblob ();       /* Make new blob */
00205   other_blob->next = blob->next;
00206   other_blob->outlines = NULL;
00207   blob->next = other_blob;
00208 
00209   seam = pick_good_seam (blob);
00210 
00211 #define TEXT_PROGRESS 1
00212 #ifdef TEXT_PROGRESS
00213   #ifndef TEXT_VERBOSE
00214    // a dot per seam, if not debugging seams
00215    if (seam != NULL && !chop_debug)
00216       cprintf(".");
00217   #else
00218    // gets a 's', see ccmain/tesseractmain.dox
00219    if (seam != NULL && !chop_debug)
00220       cprintf("s");
00221   #endif
00222 #endif
00223   if (chop_debug) {
00224     if (seam != NULL) {
00225       print_seam ("Good seam picked=", seam);
00226     }
00227     else
00228       cprintf ("\n** no seam picked *** \n");
00229   }
00230   if (seam) {
00231     apply_seam(blob, other_blob, seam);
00232   }
00233 
00234   if ((seam == NULL) ||
00235     (blob->outlines == NULL) ||
00236     (other_blob->outlines == NULL) ||
00237     total_containment (blob, other_blob) ||
00238     check_blob (other_blob) ||
00239     !(check_seam_order (blob, seam) &&
00240     check_seam_order (other_blob, seam)) ||
00241     any_shared_split_points (seam_list, seam) ||
00242     !test_insert_seam(seam_list, blob_number, blob, word->blobs)) {
00243 
00244     blob->next = next_blob;
00245     if (seam) {
00246       undo_seam(blob, other_blob, seam);
00247       delete_seam(seam);
00248 #ifndef GRAPHICS_DISABLED
00249       if (chop_debug) {
00250         display_blob(blob, Red);
00251         cprintf ("\n** seam being removed ** \n");
00252       }
00253 #endif
00254     }
00255     else {
00256       oldblob(other_blob);
00257     }
00258 
00259     if (repair_unchopped_blobs)
00260       restore_outline_tree (blob->outlines);
00261     return (NULL);
00262   }
00263   return (seam);
00264 }
00265 
00266 
00267 /* ================== */
00271 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
00272   int length;
00273   int index;
00274 
00275   length = array_count (seam_list);
00276   for (index = 0; index < length; index++)
00277     if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
00278       return TRUE;
00279   return FALSE;
00280 }
00281 
00282 
00283 /* ================== */
00287 int check_blob(TBLOB *blob) {
00288   TESSLINE *outline;
00289   EDGEPT *edgept;
00290 
00291   for (outline = blob->outlines; outline != NULL; outline = outline->next) {
00292     edgept = outline->loop;
00293     do {
00294       if (edgept == NULL)
00295         break;
00296       edgept = edgept->next;
00297     }
00298     while (edgept != outline->loop);
00299     if (edgept == NULL)
00300       return 1;
00301   }
00302   return 0;
00303 }
00304 
00305 
00306 /* ================== */
00313 CHOICES_LIST improve_one_blob(TWERD *word,
00314                               CHOICES_LIST char_choices,
00315                               int fx,
00316                               INT32 *blob_number,
00317                               SEAMS *seam_list,
00318                               DANGERR *fixpt,
00319                               STATE *this_state,
00320                               STATE *correct_state,
00321                               INT32 pass) {
00322   TBLOB *pblob;
00323   TBLOB *blob;
00324   INT16 x = 0;
00325   float rating_ceiling = MAX_FLOAT32;
00326   CHOICES answer;
00327   SEAM *seam;
00328 
00329   do {
00330     *blob_number = select_blob_to_split (char_choices, rating_ceiling);
00331     if (*blob_number == -1)
00332       return (NULL);
00333 
00334     seam = attempt_blob_chop (word, *blob_number, *seam_list);
00335     if (seam != NULL)
00336       break;
00337     /* Must split null blobs */
00338     answer = (CHOICES) array_value (char_choices, *blob_number);
00339     if (answer == NIL)
00340       return (NULL);             /* Try different blob */
00341     rating_ceiling = best_probability (answer);
00342   }
00343   while (!blob_skip);
00344   /* Split OK */
00345   for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
00346     pblob = blob;
00347     blob = blob->next;
00348   }
00349 
00350   *seam_list =
00351     insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
00352 
00353   free_choices ((CHOICES) array_value (char_choices, *blob_number));
00354 
00355   answer =
00356     classify_blob (pblob, blob, blob->next, NULL, fx, "improve 1:", Red,
00357     this_state, correct_state, pass, *blob_number);
00358   char_choices = array_insert (char_choices, *blob_number, answer);
00359 
00360   answer =
00361     classify_blob (blob, blob->next, blob->next->next, NULL, fx, "improve 2:",
00362     Yellow, this_state, correct_state, pass, *blob_number + 1);
00363   array_value (char_choices, *blob_number + 1) = (char *) answer;
00364 
00365   return (char_choices);
00366 }
00367 
00368 
00369 /* ================== */
00379 INT16 check_seam_order(TBLOB *blob, SEAM *seam) { 
00380   TESSLINE *outline;
00381   TESSLINE *last_outline;
00382   INT8 found_em[3];
00383 
00384   if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
00385     return (TRUE);
00386 
00387   found_em[0] = found_em[1] = found_em[2] = FALSE;
00388 
00389   for (outline = blob->outlines; outline; outline = outline->next) {
00390     if (!found_em[0] &&
00391       ((seam->split1 == NULL) ||
00392     is_split_outline (outline, seam->split1))) {
00393       found_em[0] = TRUE;
00394     }
00395     if (!found_em[1] &&
00396       ((seam->split2 == NULL) ||
00397     is_split_outline (outline, seam->split2))) {
00398       found_em[1] = TRUE;
00399     }
00400     if (!found_em[2] &&
00401       ((seam->split3 == NULL) ||
00402     is_split_outline (outline, seam->split3))) {
00403       found_em[2] = TRUE;
00404     }
00405     last_outline = outline;
00406   }
00407 
00408   if (!found_em[0] || !found_em[1] || !found_em[2])
00409     return (FALSE);
00410   else
00411     return (TRUE);
00412 }
00413 
00414 
00415 /* ================== */
00437 CHOICES_LIST chop_word_main(register TWERD *word,
00438                             int fx,
00439                             A_CHOICE *best_choice,
00440                             A_CHOICE *raw_choice,
00441                             BOOL8 tester,
00442                             BOOL8 trainer) {
00443   TBLOB *pblob;
00444   TBLOB *blob;
00445   CHOICES_LIST char_choices;
00446   int index;
00447   int did_chopping;
00448   float rating_limit = 1000.0;
00449   STATE state;
00450   SEAMS seam_list = NULL;
00451   CHOICES match_result;
00452   MATRIX ratings = NULL;
00453   DANGERR fixpt;                 /*dangerous ambig */
00454   INT32 state_count;             //no of states
00455   INT32 bit_count;               //no of bits
00456   static STATE best_state;
00457   static STATE chop_states[64];  //in between states
00458 
00459   state_count = 0;
00460   set_null_choice(best_choice);
00461   set_null_choice(raw_choice);
00462 
00463   char_choices = new_choice_list ();
00464 
00465   did_chopping = 0;
00466   for (blob = word->blobs, pblob = NULL, index = 0; blob != NULL;
00467   blob = blob->next, index++) {
00468     match_result =
00469       (CHOICES) classify_blob (pblob, blob, blob->next, NULL, fx,
00470       "chop_word:", Green, &chop_states[0],
00471       &best_state, matcher_pass, index);
00472     char_choices = array_push (char_choices, match_result);
00473     pblob = blob;
00474   }
00475   bit_count = index - 1;
00476   permute_characters(char_choices, rating_limit, best_choice, raw_choice);
00477 
00478   set_n_ones (&state, array_count (char_choices) - 1);
00479   if (matcher_fp != NULL) {
00480     if (matcher_pass == 0) {
00481       bits_in_states = bit_count;
00482       chop_states[state_count] = state;
00483     }
00484     state_count++;
00485   }
00486 
00487   if (!AcceptableChoice (char_choices, best_choice, raw_choice, &fixpt)
00488     || (tester || trainer)
00489   && strcmp (word->correct, class_string (best_choice))) {
00490     did_chopping = 1;
00491     if (first_pass)
00492       words_chopped1++;
00493     else
00494       words_chopped2++;
00495 
00496     seam_list = start_seam_list (word->blobs);
00497 
00498     if (chop_enable)
00499       improve_by_chopping(word,
00500                           &char_choices,
00501                           fx,
00502                           &state,
00503                           best_choice,
00504                           raw_choice,
00505                           &seam_list,
00506                           &fixpt,
00507                           chop_states,
00508                           &state_count,
00509                           &best_state,
00510                           matcher_pass);
00511 
00512     if (chop_debug)
00513       print_seams ("Final seam list:", seam_list);
00514 
00515     if (enable_assoc &&
00516       !AcceptableChoice (char_choices, best_choice, raw_choice, NULL)
00517       || (tester || trainer)
00518     && strcmp (word->correct, class_string (best_choice))) {
00519       ratings = word_associator (word->blobs, seam_list, &state, fx,
00520         best_choice, raw_choice, word->correct,
00521         /*0, */ &fixpt,
00522         &best_state, matcher_pass);
00523     }
00524     bits_in_states = bit_count + state_count - 1;
00525 
00526   }
00527   if (ratings != NULL)
00528     free_matrix(ratings);
00529   if (did_chopping || tester || trainer)
00530     char_choices = rebuild_current_state (word->blobs, seam_list, &state,
00531       char_choices, fx);
00532   if (seam_list != NULL)
00533     free_seam_list(seam_list);
00534   if (matcher_fp != NULL) {
00535     best_state = state;
00536   }
00537   FilterWordChoices();
00538   return char_choices;
00539 }
00540 
00541 
00542 /* ================== */
00568 void improve_by_chopping(register TWERD *word,
00569                          CHOICES_LIST *char_choices,
00570                          int fx,
00571                          STATE *best_state,
00572                          A_CHOICE *best_choice,
00573                          A_CHOICE *raw_choice,
00574                          SEAMS *seam_list,
00575                          DANGERR *fixpt,
00576                          STATE *chop_states,
00577                          INT32 *state_count,
00578                          STATE *correct_state,
00579                          INT32 pass) {
00580   INT32 blob_number;
00581   INT32 index;                   //to states
00582   CHOICES_LIST choices = *char_choices;
00583   float old_best;
00584   int fixpt_valid = 1;
00585   static INT32 old_count;        //from pass1
00586 
00587   do { /* Improvement loop */
00588     if (!fixpt_valid)
00589       fixpt->index = -1;
00590     old_best = class_probability (best_choice);
00591     choices = improve_one_blob (word, *char_choices, fx,
00592       &blob_number, seam_list, fixpt,
00593       chop_states + *state_count, correct_state,
00594       pass);
00595     if (choices != NULL) {
00596       LogNewSplit(blob_number);
00597       permute_characters (choices,
00598         class_probability (best_choice),
00599         best_choice, raw_choice);
00600       *char_choices = choices;
00601 
00602       if (old_best > class_probability (best_choice)) {
00603         set_n_ones (best_state, array_count (*char_choices) - 1);
00604         fixpt_valid = 1;
00605       }
00606       else {
00607         insert_new_chunk (best_state, blob_number,
00608           array_count (*char_choices) - 2);
00609         fixpt_valid = 0;
00610       }
00611       if (*state_count > 0) {
00612         if (pass == 0) {
00613           for (index = 0; index < *state_count; index++)
00614             insert_new_chunk (&chop_states[index], blob_number,
00615               array_count (*char_choices) - 2);
00616           set_n_ones (&chop_states[index],
00617             array_count (*char_choices) - 1);
00618         }
00619         (*state_count)++;
00620       }
00621 
00622       if (chop_debug)
00623         print_state ("best state = ",
00624           best_state, count_blobs (word->blobs) - 1);
00625       if (first_pass)
00626         chops_performed1++;
00627       else
00628         chops_performed2++;
00629 
00630     }
00631   }
00632   while (choices &&
00633     !AcceptableChoice (*char_choices, best_choice, raw_choice, fixpt) &&
00634     !blob_skip && array_count (*char_choices) < MAX_NUM_CHUNKS);
00635   if (pass == 0)
00636     old_count = *state_count;
00637   else {
00638     if (old_count != *state_count)
00639       fprintf (matcher_fp,
00640         "Mis-matched state counts, " INT32FORMAT " pass1, "
00641         INT32FORMAT " pass2\n", old_count, *state_count);
00642   }
00643   if (!fixpt_valid)
00644     fixpt->index = -1;
00645 }
00646 
00647 
00648 /* ================== */
00654 INT16 select_blob_to_split(CHOICES_LIST char_choices, float rating_ceiling) { 
00655   CHOICES this_choice;
00656   int x;
00657   float worst = -MAX_FLOAT32;
00658   int worst_index = -1;
00659 
00660   if (chop_debug)
00661     if (rating_ceiling < MAX_FLOAT32)
00662       cprintf ("rating_ceiling = %8.4f\n", rating_ceiling);
00663   else
00664     cprintf ("rating_ceiling = No Limit\n");
00665 
00666   for_each_choice(char_choices, x) {
00667     this_choice = (CHOICES) array_value (char_choices, x);
00668     if (this_choice == NIL) {
00669       return (x);
00670     }
00671     else {
00672       if (best_probability (this_choice) > worst &&
00673         best_probability (this_choice) < rating_ceiling &&
00674       best_certainty (this_choice) < tessedit_certainty_threshold) {
00675         worst_index = x;
00676         worst = best_probability (this_choice);
00677       }
00678     }
00679   }
00680 
00681   if (chop_debug)
00682     cprintf ("blob_number = %4d\n", worst_index);
00683 
00684   return (worst_index);
00685 }
00686 
00687 
00688 /* ================== */
00700 SEAMS start_seam_list(TBLOB *blobs) { 
00701   TBLOB *blob;
00702   SEAMS seam_list;
00703   TPOINT topleft;
00704   TPOINT botright;
00705   int location;
00706   /* Seam slot per char */
00707   seam_list = new_seam_list ();
00708 
00709   for (blob = blobs; blob->next != NULL; blob = blob->next) {
00710 
00711     blob_bounding_box(blob, &topleft, &botright);
00712     location = botright.x;
00713     blob_bounding_box (blob->next, &topleft, &botright);
00714     location += topleft.x;
00715     location /= 2;
00716 
00717     seam_list = add_seam (seam_list,
00718       new_seam (0.0, location, NULL, NULL, NULL));
00719   }
00720 
00721   return (seam_list);
00722 }
00723 
00724 
00725 /* ================== */
00730 INT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
00731   TPOINT topleft1;
00732   TPOINT botright1;
00733   TPOINT topleft2;
00734   TPOINT botright2;
00735 
00736   blob_bounding_box(blob1, &topleft1, &botright1);
00737   blob_bounding_box(blob2, &topleft2, &botright2);
00738 
00739   return (bounds_inside (topleft1, botright1, topleft2, botright2) ||
00740     bounds_inside (topleft2, botright2, topleft1, botright1));
00741 }
00742 
00743 
00744 /* ================== */
00749 MATRIX word_associator(TBLOB *blobs,
00750                        SEAMS seams,
00751                        STATE *state,
00752                        int fxid,
00753                        A_CHOICE *best_choice,
00754                        A_CHOICE *raw_choice,
00755                        char *correct,
00756                        DANGERR *fixpt,
00757                        STATE *best_state,
00758                        INT32 pass) {
00759   CHUNKS_RECORD chunks_record;
00760   BLOB_WEIGHTS blob_weights;
00761   int x;
00762   int num_chunks;
00763   A_CHOICE *this_choice;
00764 
00765   num_chunks = array_count (seams) + 1;
00766 
00767   chunks_record.chunks = blobs;
00768   chunks_record.splits = seams;
00769   chunks_record.ratings = record_piece_ratings (blobs);
00770   chunks_record.char_widths = blobs_widths (blobs);
00771   chunks_record.chunk_widths = blobs_widths (blobs);
00772   chunks_record.fx = fxid;
00773   /* Save chunk weights */
00774   for (x = 0; x < num_chunks; x++) {
00775     this_choice =
00776       (A_CHOICE *) first (matrix_get (chunks_record.ratings, x, x));
00777     blob_weights[x] = -(INT16) (10 * class_probability (this_choice) /
00778       class_certainty (this_choice));
00779   }
00780   chunks_record.weights = blob_weights;
00781 
00782   if (chop_debug)
00783     print_matrix (chunks_record.ratings);
00784   best_first_search(&chunks_record,
00785                     best_choice,
00786                     raw_choice,
00787                     state,
00788                     fixpt,
00789                     best_state,
00790                     pass);
00791 
00792   free_widths (chunks_record.chunk_widths);
00793   free_widths (chunks_record.char_widths);
00794   return chunks_record.ratings;
00795 }

Generated on Wed Feb 28 19:49:13 2007 for Tesseract by  doxygen 1.5.1