ccmain/adaptions.cpp File Reference

#include "mfcpch.h"
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "mainblk.h"
#include "charcut.h"
#include "imgs.h"
#include "scaleimg.h"
#include "reject.h"
#include "control.h"
#include "adaptions.h"
#include "stopper.h"
#include "charsample.h"
#include "matmatch.h"
#include "secname.h"

Go to the source code of this file.

Defines

Functions

Variables


Define Documentation

#define EXTERN

Definition at line 42 of file adaptions.cpp.


Function Documentation

void adapt_to_good_ems ( WERD_RES *  word,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Keep playing with word that we THINK has m's in it until we're happy?

Parameters:
word Word Results
char_clusters ?? one variable in
See also:
recog_all_words
Parameters:
chars_waiting ?? one variable in
See also:
recog_all_words
Note:
Global: uses
See also:
tessedit_reject_ems,

tessedit_reject_suspect_ems,

tessedit_cluster_debug,

tessedit_em_adaption_mode,

tessedit_process_rns,

tessedit_matrix_match, and

tessedit_demo_adaption

Returns:
none

Definition at line 541 of file adaptions.cpp.

References WERD::baseline_denormalise(), WERD_RES::best_choice, WERD::blob_list(), WERD::bounding_box(), char_clip_word(), CHAR_SAMPLE::character(), clip_sample(), complete_clustering(), demo_word, display_clip_image(), WERD::flag(), IMAGE::get_res(), BOX::height(), imagebasename, CHAR_SAMPLES::match_score(), MAX_INT32, NULL, WERD_RES::outword, page_image, print_em_stats(), reject_all_ems(), reject_suspect_ems(), resolution, SECURE_NAMES, STRING::string(), tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().

Referenced by recog_all_words().

00543                                                         {
        // Re-checks the 'm' characters (and "rn" pairs when tessedit_process_rns
        // is set) of a word that is not adaptable or already has rejects. Each
        // candidate is matched against the labelled clusters in char_clusters and
        // the matching reject_map entries are marked mm_accept or mm_reject.
        //   word          - word result; its best_choice string, outword blobs,
        //                   denorm and reject_map are used
        //   char_clusters - clusters to match against; returns early if empty
        //   chars_waiting - wait list; when non-empty it is passed to
        //                   complete_clustering() before matching
        // Returns nothing; results are written into word->reject_map.
        // NOTE(review): the CHAR_SAMPLE created per character is not deleted
        // here - confirm who owns it.
00544   PBLOB_LIST *blobs = word->outword->blob_list ();
00545   PBLOB_IT blob_it(blobs);
00546   INT16 i;
00547   CHAR_SAMPLE *sample;
00548   CHAR_SAMPLES_IT c_it = char_clusters;
00549   CHAR_SAMPLE_IT cw_it = chars_waiting;
00550   float score;
00551   float best_score;
00552   char best_char;
00553   CHAR_SAMPLES *best_cluster;
00554   PIXROW_LIST *pixrow_list;
00555   PIXROW_IT pixrow_it;
00556   IMAGELINE *imlines;            // lines of the image
00557   BOX pix_box;                   // box of imlines
00558   // extent
00559   WERD copy_outword;             // copy to denorm
00560   BOX b_box;
00561   PBLOB_IT copy_blob_it;
00562   OUTLINE_IT copy_outline_it;
00563   PIXROW *pixrow = NULL;
00564
00565   static INT32 word_number = 0;
00566
00567 #ifndef GRAPHICS_DISABLED
00568   WINDOW demo_win = NULL;
00569 #endif
00570
00571   INT32 resolution = page_image.get_res ();
00572
        // Ignore words taller than a third of the page image resolution
00573   if (word->word->bounding_box ().height () > resolution / 3)
00574     return;
00575
00576   word_number++;
00577
        // Early out: no 'm' in the text and, with tessedit_process_rns set,
        // no "rn" either
00578   if (strchr (word->best_choice->string ().string (), 'm') == NULL
00579     && (tessedit_process_rns
00580     && strstr (word->best_choice->string ().string (), "rn") == NULL))
00581     return;
00582
        // The two reject switches take precedence over cluster matching
00583   if (tessedit_reject_ems)
00584     reject_all_ems(word);
00585   else if (tessedit_reject_suspect_ems)
00586     reject_suspect_ems(word);
00587   else {
00588     if (char_clusters->length () == 0) {
00589       #ifndef SECURE_NAMES
00590       if (tessedit_cluster_debug)
00591         tprintf ("No clusters to use for em adaption\n");
00592       #endif
00593       return;
00594     }
00595
00596     if (!cw_it.empty ()) {
00597       complete_clustering(char_clusters, chars_waiting);
00598       print_em_stats(char_clusters, chars_waiting);
00599     }
00600
          // Only words that are not adaptable (or already contain rejects) and
          // that contain an 'm' (or an "rn" when enabled) are examined
00601     if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
00602       word->reject_map.reject_count () != 0)
00603       && (strchr (word->best_choice->string ().string (), 'm') != NULL
00604       || (tessedit_process_rns
00605       && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00606       if (tessedit_process_rns
00607         && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00608         copy_outword = *(word->outword);
00609         copy_blob_it.set_to_list (copy_outword.blob_list ());
00610         i = 0;
00611         while (word->best_choice->string ()[i] != '\0') {
00612           if (word->best_choice->string ()[i] == 'r'
00613           && word->best_choice->string ()[i + 1] == 'n') {
                  // Move the outlines of the following blob into the current
                  // one, then remove the now redundant following blob from
                  // the copy
00614             copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ());
00615             copy_outline_it.add_list_after (copy_blob_it. data_relative (1)->
00616               out_list ());
00617             copy_blob_it.forward ();
00618             delete (copy_blob_it.extract ());
00619             i++;
00620           }
00621           copy_blob_it.forward ();
00622           i++;
00623         }
00624       }
00625       else
00626         copy_outword = *(word->outword);
00627
            // pixrow_list, imlines and pix_box are set by char_clip_word and
            // released at the end of this block
00628       copy_outword.baseline_denormalise (&word->denorm);
00629       copy_blob_it.set_to_list (copy_outword.blob_list ());
00630       char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00631       pixrow_it.set_to_list (pixrow_list);
00632       pixrow_it.move_to_first ();
00633
00634       // For debugging only
00635       b_box = copy_outword.bounding_box ();
00636       pixrow = pixrow_it.data ();
00637
            // Step through the text, the pixel rows and both blob lists together
00638       blob_it.move_to_first ();
00639       copy_blob_it.move_to_first ();
00640       for (i = 0;
00641         word->best_choice->string ()[i] != '\0';
00642         i++, pixrow_it.forward (), blob_it.forward (),
00643       copy_blob_it.forward ()) {
00644         if ((word->best_choice->string ()[i] == 'm'
00645           || (word->best_choice->string ()[i] == 'r'
00646           && word->best_choice->string ()[i + 1] == 'n'))
00647         && !word->reject_map[i].perm_rejected ()) {
00648           if (tessedit_cluster_debug)
00649             tprintf ("Sample %c to check found in %s, index %d\n",
00650               word->best_choice->string ()[i],
00651               word->best_choice->string ().string (), i);
00652
00653           if (tessedit_demo_adaption)
00654             tprintf
00655               ("Sample %c to check found in %s (%d), index %d\n",
00656               word->best_choice->string ()[i],
00657               word->best_choice->string ().string (), word_number,
00658               i);
00659
                // Build the sample either from the clipped image (matrix
                // match) or directly from the blob
00660           if (tessedit_matrix_match) {
00661             BOX copy_box = copy_blob_it.data ()->bounding_box ();
00662
00663             sample = clip_sample (pixrow_it.data (),
00664               imlines,
00665               pix_box,
00666               copy_outword.flag (W_INVERSE),
00667               word->best_choice->string ()[i]);
00668
00669             //Clip failed
00670             if (sample == NULL) {
00671               tprintf
00672                 ("Unable to clip sample from %s, index %d\n",
00673                 word->best_choice->string ().string (), i);
00674               #ifndef SECURE_NAMES
00675               if (tessedit_cluster_debug)
00676                 tprintf ("Sample rejected (no sample)\n");
00677               #endif
00678               word->reject_map[i].setrej_mm_reject ();
00679               if (word->best_choice->string ()[i] == 'r') {
00680                 word->reject_map[i + 1].setrej_mm_reject ();
00681                 i++;
00682               }
00683               continue;
00684             }
00685           }
00686           else
00687             sample = new CHAR_SAMPLE (blob_it.data (),
00688               &word->denorm,
00689               word->best_choice->
00690               string ()[i]);
00691
00692           best_score = MAX_INT32;
00693           best_char = '\0';
00694           best_cluster = NULL;
00695
                // Lowest match score among clusters whose character () is not
                // '\0'; best_cluster stays NULL if there is no such cluster
00696           for (c_it.mark_cycle_pt ();
00697           !c_it.cycled_list (); c_it.forward ()) {
00698             if (c_it.data ()->character () != '\0') {
00699               score = c_it.data ()->match_score (sample);
00700               if (score < best_score) {
00701                 best_cluster = c_it.data ();
00702                 best_score = score;
00703                 best_char = c_it.data ()->character ();
00704               }
00705             }
00706           }
00707
                // For an "rn" pair the decision is applied to both map entries
00708           if (best_score > tessedit_cluster_t1) {
00709             #ifndef SECURE_NAMES
00710             if (tessedit_cluster_debug)
00711               tprintf ("Sample rejected (score %f)\n", best_score);
00712             if (tessedit_demo_adaption)
00713               tprintf ("Sample rejected (score %f)\n", best_score);
00714             #endif
00715             word->reject_map[i].setrej_mm_reject ();
00716             if (word->best_choice->string ()[i] == 'r')
00717               word->reject_map[i + 1].setrej_mm_reject ();
00718           }
00719           else {
00720             if (word->best_choice->string ()[i] == best_char) {
00721               #ifndef SECURE_NAMES
00722               if (tessedit_cluster_debug)
00723                 tprintf ("Sample accepted (score %f)\n",
00724                   best_score);
00725               if (tessedit_demo_adaption)
00726                 tprintf ("Sample accepted (score %f)\n",
00727                   best_score);
00728               #endif
00729               word->reject_map[i].setrej_mm_accept ();
00730               if (word->best_choice->string ()[i] == 'r')
00731                 word->reject_map[i + 1].setrej_mm_accept ();
00732             }
00733             else {
00734               #ifndef SECURE_NAMES
00735               if (tessedit_cluster_debug)
00736                 tprintf ("Sample rejected (char %c, score %f)\n",
00737                   best_char, best_score);
00738               if (tessedit_demo_adaption)
00739                 tprintf ("Sample rejected (char %c, score %f)\n",
00740                   best_char, best_score);
00741               #endif
00742               word->reject_map[i].setrej_mm_reject ();
00743               if (word->best_choice->string ()[i] == 'r')
00744                 word->reject_map[i + 1].setrej_mm_reject ();
00745             }
00746           }
00747
00748           if (tessedit_demo_adaption) {
00749             if (strcmp (imagebasename.string (),
00750               tessedit_demo_file.string ()) != 0
00751               || word_number == tessedit_demo_word1
00752             || word_number == tessedit_demo_word2) {
00753 #ifndef GRAPHICS_DISABLED
00754               demo_win =
00755                 display_clip_image(&copy_outword,
00756                                    page_image,
00757                                    pixrow_list,
00758                                    pix_box);
00759 #endif
00760               demo_word = word_number;
                    // Guard: no labelled cluster may have been found above
00761               if (best_cluster != NULL) best_cluster->match_score (sample);
00762               demo_word = 0;
00763             }
00764           }
00765           if (word->best_choice->string ()[i] == 'r')
00766             i++;                 // Skip next character
00767         }
00768       }
00769       delete[]imlines;           // Free array of imlines
00770       delete pixrow_list;
00771     }
00772   }
00773 }

void adapt_to_good_samples ( WERD_RES *  word,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

?

Parameters:
word Word in question
char_clusters ?
chars_waiting characters waiting (
See also:
recog_all_words)
Returns:
none

Definition at line 784 of file adaptions.cpp.

References WERD::baseline_denormalise(), WERD::blob_list(), WERD::bounding_box(), PIXROW::bounding_box(), char_clip_word(), CHAR_SAMPLES::character(), clip_sample(), complete_clustering(), debug_fp, demo_word, display_clip_image(), WERD::flag(), IMAGE::get_res(), BOX::height(), imagebasename, CHAR_SAMPLES::match_score(), MAX_INT32, NULL, WERD_RES::outword, page_image, print_em_stats(), resolution, SECURE_NAMES, STRING::string(), tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().

Referenced by classify_word_pass1(), and recog_all_words().

00786                                                             {
        // Re-checks individual characters of a word against the labelled
        // clusters in char_clusters and marks the matching reject_map entries
        // as accepted or mm_reject.
        //   word          - word result; its best_choice string, outword blobs,
        //                   denorm and reject_map are used
        //   char_clusters - clusters to match against; returns early if empty
        //   chars_waiting - wait list; when non-empty it is passed to
        //                   complete_clustering() before matching
        // Returns nothing; results are written into word->reject_map.
        // NOTE(review): the CHAR_SAMPLE returned by clip_sample is not deleted
        // here - confirm who owns it.
00787   PBLOB_LIST *blobs = word->outword->blob_list ();
00788   PBLOB_IT blob_it(blobs);
00789   INT16 i;
00790   CHAR_SAMPLE *sample;
00791   CHAR_SAMPLES_IT c_it = char_clusters;
00792   CHAR_SAMPLE_IT cw_it = chars_waiting;
00793   float score;
00794   float best_score;
00795   char best_char;
00796   CHAR_SAMPLES *best_cluster;
00797   PIXROW_LIST *pixrow_list;
00798   PIXROW_IT pixrow_it;
00799   IMAGELINE *imlines;            // lines of the image
00800   BOX pix_box;                   // box of imlines
00801   // extent
00802   WERD copy_outword;             // copy to denorm
00803   BOX b_box;
00804   PBLOB_IT copy_blob_it;
00805   PIXROW *pixrow = NULL;
00806
00807   static INT32 word_number = 0;
00808
00809 #ifndef GRAPHICS_DISABLED
00810   WINDOW demo_win = NULL;
00811 #endif
00812
00813   INT32 resolution = page_image.get_res ();
00814
00815   word_number++;
00816
00817   if (tessedit_test_cluster_input)
00818     return;
00819
        // Ignore words taller than a third of the page image resolution
00820   if (word->word->bounding_box ().height () > resolution / 3)
00821     return;
00822
00823   if (char_clusters->length () == 0) {
00824     #ifndef SECURE_NAMES
00825     if (tessedit_cluster_debug)
00826       tprintf ("No clusters to use for adaption\n");
00827     #endif
00828     return;
00829   }
00830
00831   if (!cw_it.empty ()) {
00832     complete_clustering(char_clusters, chars_waiting);
00833     print_em_stats(char_clusters, chars_waiting);
00834   }
00835
        // Examine the word when it is not adaptable and has rejects, or
        // whenever tessedit_mm_use_rejmap is set
00836   if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
00837   && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
00838     if (tessedit_cluster_debug) {
00839       tprintf ("\nChecking: \"%s\"  MAP ",
00840         word->best_choice->string ().string ());
00841       word->reject_map.print (debug_fp);
00842       tprintf ("\n");
00843     }
00844
          // pixrow_list, imlines and pix_box are set by char_clip_word and
          // released after the loop
00845     copy_outword = *(word->outword);
00846     copy_outword.baseline_denormalise (&word->denorm);
00847     copy_blob_it.set_to_list (copy_outword.blob_list ());
00848     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
00849     pixrow_it.set_to_list (pixrow_list);
00850     pixrow_it.move_to_first ();
00851
00852                                  // For debugging only
00853     b_box = copy_outword.bounding_box ();
00854     pixrow = pixrow_it.data ();
00855
          // Step through the text, the pixel rows and both blob lists together
00856     blob_it.move_to_first ();
00857     copy_blob_it.move_to_first ();
00858     for (i = 0;
00859       word->best_choice->string ()[i] != '\0';
00860       i++, pixrow_it.forward (), blob_it.forward (),
00861     copy_blob_it.forward ()) {
            // Only recoverable rejects are checked, or every reject when
            // tessedit_mm_all_rejects is set
00862       if (word->reject_map[i].recoverable ()
00863       || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
00864         BOX copy_box = copy_blob_it.data ()->bounding_box ();
00865
00866         if (tessedit_cluster_debug)
00867           tprintf ("Sample %c to check found in %s, index %d\n",
00868             word->best_choice->string ()[i],
00869             word->best_choice->string ().string (), i);
00870
00871         if (tessedit_demo_adaption)
00872           tprintf ("Sample %c to check found in %s (%d), index %d\n",
00873             word->best_choice->string ()[i],
00874             word->best_choice->string ().string (),
00875             word_number, i);
00876
00877         sample = clip_sample (pixrow_it.data (),
00878           imlines,
00879           pix_box,
00880           copy_outword.flag (W_INVERSE),
00881           word->best_choice->string ()[i]);
00882
00883         if (sample == NULL) {    //Clip failed
00884           tprintf ("Unable to clip sample from %s, index %d\n",
00885             word->best_choice->string ().string (), i);
00886           #ifndef SECURE_NAMES
00887           if (tessedit_cluster_debug)
00888             tprintf ("Sample rejected (no sample)\n");
00889           #endif
00890           word->reject_map[i].setrej_mm_reject ();
00891
00892           continue;
00893         }
00894
00895         best_score = MAX_INT32;
00896         best_char = '\0';
00897         best_cluster = NULL;
00898
              // Lowest match score among clusters whose character () is not
              // '\0'; best_cluster stays NULL if there is no such cluster
00899         for (c_it.mark_cycle_pt ();
00900         !c_it.cycled_list (); c_it.forward ()) {
00901           if (c_it.data ()->character () != '\0') {
00902             score = c_it.data ()->match_score (sample);
00903             if (score < best_score) {
00904               best_cluster = c_it.data ();
00905               best_score = score;
00906               best_char = c_it.data ()->character ();
00907             }
00908           }
00909         }
00910
00911         if (best_score > tessedit_cluster_t1) {
00912           #ifndef SECURE_NAMES
00913           if (tessedit_cluster_debug)
00914             tprintf ("Sample rejected (score %f)\n", best_score);
00915           if (tessedit_demo_adaption)
00916             tprintf ("Sample rejected (score %f)\n", best_score);
00917           #endif
00918           word->reject_map[i].setrej_mm_reject ();
00919         }
00920         else {
00921           if (word->best_choice->string ()[i] == best_char) {
00922             #ifndef SECURE_NAMES
00923             if (tessedit_cluster_debug)
00924               tprintf ("Sample accepted (score %f)\n", best_score);
00925             if (tessedit_demo_adaption)
00926               tprintf ("Sample accepted (score %f)\n", best_score);
00927             #endif
                  // tessedit_test_adaption selects which accept flag is set
00928             if (tessedit_test_adaption)
00929               word->reject_map[i].setrej_minimal_rej_accept ();
00930             else
00931               word->reject_map[i].setrej_mm_accept ();
00932           }
00933           else {
00934             #ifndef SECURE_NAMES
00935             if (tessedit_cluster_debug)
00936               tprintf ("Sample rejected (char %c, score %f)\n",
00937                 best_char, best_score);
00938             if (tessedit_demo_adaption)
00939               tprintf ("Sample rejected (char %c, score %f)\n",
00940                 best_char, best_score);
00941             #endif
00942             word->reject_map[i].setrej_mm_reject ();
00943           }
00944         }
00945
00946         if (tessedit_demo_adaption) {
00947           if (strcmp (imagebasename.string (),
00948             tessedit_demo_file.string ()) != 0
00949             || word_number == tessedit_demo_word1
00950           || word_number == tessedit_demo_word2) {
00951 #ifndef GRAPHICS_DISABLED
00952             demo_win =
00953               display_clip_image(&copy_outword,
00954                                  page_image,
00955                                  pixrow_list,
00956                                  pix_box);
00957 #endif
00958             demo_word = word_number;
                  // Guard: no labelled cluster may have been found above
00959             if (best_cluster != NULL) best_cluster->match_score (sample);
00960             demo_word = 0;
00961           }
00962         }
00963       }
00964     }
00965     delete[]imlines;             // Free array of imlines
00966     delete pixrow_list;
00967
00968     if (tessedit_cluster_debug) {
00969       tprintf ("\nFinal: \"%s\"  MAP ",
00970         word->best_choice->string ().string ());
00971       word->reject_map.print (debug_fp);
00972       tprintf ("\n");
00973     }
00974   }
00975 }

void check_wait_list ( CHAR_SAMPLE_LIST *  chars_waiting,
CHAR_SAMPLE *  sample,
CHAR_SAMPLES *  best_cluster 
)

Traverse all characters comparing scores and either add wait sample to existing cluster or drop wait sample when it's already in existing cluster?

Parameters:
chars_waiting characters waiting (
See also:
recog_all_words)
Parameters:
sample sample to use?
best_cluster ? (
See also:
STATS::cluster)
Returns:
none

Definition at line 429 of file adaptions.cpp.

References CHAR_SAMPLES::add_sample(), FALSE, CHAR_SAMPLE::match_sample(), CHAR_SAMPLES::match_score(), and tprintf().

Referenced by cluster_sample(), and complete_clustering().

00431                                                  {
        // Scans chars_waiting for samples that match well enough to join
        // best_cluster. Matching wait samples are first moved onto a local
        // add_list; on each pass of the outer loop one entry is taken off
        // add_list and added to best_cluster, then the wait list is scanned
        // again. The loop ends when add_list is empty after a scan.
        //   chars_waiting - wait list that is scanned and extracted from
        //   sample        - sample used for match_sample when
        //                   tessedit_mm_use_prototypes is not set
        //   best_cluster  - cluster that receives the accepted wait samples
00432   CHAR_SAMPLE *wait_sample;
00433   CHAR_SAMPLE *test_sample = sample;
00434   CHAR_SAMPLE_IT cw_it = chars_waiting;
00435   CHAR_SAMPLE_LIST add_list;     // Samples added to best cluster
00436   CHAR_SAMPLE_IT add_it = &add_list;
00437   float score;
00438
00439   add_list.clear ();
00440
00441   if (!cw_it.empty ()) {
00442     do {
            // Take one pending sample off add_list and add it to the cluster
00443       if (!add_list.empty ()) {
00444         add_it.forward ();
00445         test_sample = add_it.extract ();
00446         best_cluster->add_sample (test_sample);
00447       }
00448
00449       for (cw_it.mark_cycle_pt ();
00450       !cw_it.cycled_list (); cw_it.forward ()) {
00451         wait_sample = cw_it.data ();
              // NOTE(review): the non-prototype path always matches against
              // the original `sample`; test_sample is not used for scoring
00452         if (tessedit_mm_use_prototypes)
00453           score = best_cluster->match_score (wait_sample);
00454         else
00455           score = sample->match_sample (wait_sample, FALSE);
00456         if (score < tessedit_cluster_t1) {
00457           if (score > tessedit_cluster_t3
00458           || tessedit_mm_use_prototypes) {
                  // Move the wait sample from chars_waiting onto add_list
00459             add_it.add_after_stay_put (cw_it.extract ());
00460             #ifndef SECURE_NAMES
00461             if (tessedit_cluster_debug)
00462               tprintf
00463                 ("Wait sample added to an existing cluster\n");
00464             #endif
00465           }
00466           else {
                  // This branch only logs; nothing is extracted from
                  // chars_waiting here
00467             #ifndef SECURE_NAMES
00468             if (tessedit_cluster_debug)
00469               tprintf
00470                 ("Wait sample dropped, good match to an existing cluster\n");
00471             #endif
00472           }
00473         }
00474       }
00475     }
00476     while (!add_list.empty ());
00477   }
00478 }

CHAR_SAMPLE* clip_sample ( PIXROW *  pixrow,
IMAGELINE *  imlines,
BOX  pix_box,
BOOL8  white_on_black,
char  c 
)

Parameters:
pixrow 
imlines array of lines [cut from/] of image
pix_box Box of imlines extent
white_on_black 1 or 0; determines whether the background should be set to white (
See also:
invert_image)
Parameters:
c e.g., word->best_choice->string ()[i] (THE character we're working on)
Returns:
image of clipped sample associated with c ?

Definition at line 1013 of file adaptions.cpp.

References ASSERT_HOST, PIXROW::bounding_box(), PIXROW::char_clip_image(), IMAGE::create(), IMAGE::get_res(), IMAGE::get_xsize(), IMAGE::get_ysize(), BOX::height(), invert_image(), NULL, BOX::null_box(), page_image, resolution, tprintf(), and BOX::width().

Referenced by adapt_to_good_ems(), adapt_to_good_samples(), collect_characters_for_adaption(), and collect_ems_for_adaption().

01018                                  {
        // Clips one character out of the word image lines into a new 1-bit
        // IMAGE sized to the character's bounding box and wraps it in a new
        // CHAR_SAMPLE labelled with c.
        //   pixrow         - pixel row of the character; its bounding box
        //                    gives the image size
        //   imlines        - image lines handed on to PIXROW::char_clip_image
        //   pix_box        - box handed on together with imlines
        //   white_on_black - when false the image is inverted before clipping,
        //                    when true it is inverted after clipping
        //   c              - character stored in the returned sample
        // Returns NULL when the bounding box is null, when it is wider or
        // taller than the page resolution, or when the image cannot be created.
01019   // The char image is allocated below, after the size checks. An earlier
        // allocation here was shadowed by that one and never freed.
01020   BOX b_box = pixrow->bounding_box ();
01021   float baseline_pos = 0; // baseline ht in image
01022   INT32 resolution = page_image.get_res ();
01023
01024   if (!b_box.null_box ()) {
01025     ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
01026       b_box.height () < page_image.get_ysize ());
01027
01028     if (b_box.width () > resolution || b_box.height () > resolution) {
01029       tprintf ("clip sample: sample too big (%d x %d)\n",
01030         b_box.width (), b_box.height ());
01031
01032       return NULL;
01033     }
01034
01035     IMAGE *image = new (IMAGE); // unscaled char image
01036     if (image->create (b_box.width (), b_box.height (), 1) == -1) {
01037       tprintf ("clip sample: create image failed (%d x %d)\n",
01038         b_box.width (), b_box.height ());
01039
01040       delete image;
01041       return NULL;
01042     }
01043
01044     if (!white_on_black)
01045       invert_image(image);  // Set background to white
01046     pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
01047     if (white_on_black)
01048       invert_image(image);  //invert white on black for scaling &NN
01049     return new CHAR_SAMPLE (image, c);
01050   }
01051   else
01052     return NULL;
01053 }

void cluster_sample ( CHAR_SAMPLE *  sample,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Find best score of sample and assign it to cluster, if not already 'there'?

Parameters:
sample sample to use?
char_clusters ?
chars_waiting characters waiting (
See also:
recog_all_words)
Returns:
none

Definition at line 364 of file adaptions.cpp.

References CHAR_SAMPLES::add_sample(), check_wait_list(), MAX_INT32, NULL, and tprintf().

Referenced by collect_characters_for_adaption(), and collect_ems_for_adaption().

00366                                                      {
        // Places a sample according to its best (lowest) match score against
        // the existing clusters:
        //   - no clusters yet: the sample starts the first cluster
        //   - best score < tessedit_cluster_t1: added to the best cluster if
        //     the score is also > tessedit_cluster_t3 or
        //     tessedit_mm_use_prototypes is set; otherwise only a debug
        //     message is printed
        //   - best score > tessedit_cluster_t2: the sample starts a new cluster
        //   - otherwise: the sample is appended to chars_waiting
        //   sample        - sample to place
        //   char_clusters - cluster list searched and possibly extended
        //   chars_waiting - wait list, also passed to check_wait_list()
00367   CHAR_SAMPLES *best_cluster = NULL;
00368   CHAR_SAMPLES_IT c_it = char_clusters;
00369   CHAR_SAMPLE_IT cw_it = chars_waiting;
00370   float score;
00371   float best_score = MAX_INT32;
00372
00373   if (c_it.empty ())
00374     c_it.add_to_end (new CHAR_SAMPLES (sample));
00375   else {
          // Find the cluster with the lowest match score for this sample
00376     for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00377       score = c_it.data ()->match_score (sample);
00378       if (score < best_score) {
00379         best_score = score;
00380         best_cluster = c_it.data ();
00381       }
00382     }
00383
00384     if (tessedit_cluster_debug)
00385       tprintf ("Sample's best score %f\n", best_score);
00386
00387     if (best_score < tessedit_cluster_t1) {
00388       if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
              // Add to the cluster, then let the wait list be re-checked
              // against it
00389         best_cluster->add_sample (sample);
00390         check_wait_list(chars_waiting, sample, best_cluster);
00391         #ifndef SECURE_NAMES
00392         if (tessedit_cluster_debug)
00393           tprintf ("Sample added to an existing cluster\n");
00394         #endif
00395       }
00396       else {
              // This branch only logs; the sample is not stored anywhere here
00397         #ifndef SECURE_NAMES
00398         if (tessedit_cluster_debug)
00399           tprintf
00400             ("Sample dropped, good match to an existing cluster\n");
00401         #endif
00402       }
00403     }
00404     else if (best_score > tessedit_cluster_t2) {
00405       c_it.add_to_end (new CHAR_SAMPLES (sample));
00406       #ifndef SECURE_NAMES
00407       if (tessedit_cluster_debug)
00408         tprintf ("New cluster created for this sample\n");
00409       #endif
00410     }
00411     else {
00412       cw_it.add_to_end (sample);
00413       if (tessedit_cluster_debug)
00414         tprintf ("Sample added to the wait list\n");
00415     }
00416   }
00417 }

void collect_characters_for_adaption ( WERD_RES *  word,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Within bounding box, collect good matches of blobs into characters 'for adaptation'?

Parameters:
word Word Results
char_clusters ?
chars_waiting characters waiting (
See also:
recog_all_words)
Note:
Global:
See also:
tessedit_display_mm,

tessedit_demo_adaption,

tessedit_cluster_adaption_mode,

tessedit_cluster_debug,

tessedit_test_cluster_input, and

tessedit_mm_use_rejmap

Returns:
none

Definition at line 284 of file adaptions.cpp.

References WERD::baseline_denormalise(), WERD::blob_list(), WERD::bounding_box(), char_clip_word(), clip_sample(), cluster_sample(), FALSE, WERD::flag(), IMAGE::get_res(), BOX::height(), NULL, WERD_RES::outword, page_image, resolution, tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().

Referenced by recog_all_words().

00286                                                                       {
        // Clips a sample for each selected character of the word and hands it
        // to cluster_sample(). The word is processed when it is adaptable with
        // no rejects, or whenever tessedit_mm_use_rejmap is set.
        //   word          - word result; its best_choice string, outword,
        //                   denorm and reject_map are used
        //   char_clusters - passed through to cluster_sample()
        //   chars_waiting - passed through to cluster_sample()
00287   PBLOB_LIST *blobs = word->outword->blob_list ();
00288   PBLOB_IT blob_it(blobs);
00289   INT16 i;
00290   CHAR_SAMPLE *sample;
00291   PIXROW_LIST *pixrow_list;
00292   PIXROW_IT pixrow_it;
00293   IMAGELINE *imlines;            // lines of the image
00294   BOX pix_box;                   // box of imlines
00295   // extent
00296   WERD copy_outword;             // copy to denorm
00297   INT32 resolution = page_image.get_res ();
00298
        // Ignore words taller than a third of the page image resolution
00299   if (word->word->bounding_box ().height () > resolution / 3)
00300     return;
00301
00302   if (tessedit_demo_adaption)
00303     tessedit_display_mm.set_value (FALSE); // Make sure not set
00304
00305   if ((word_adaptable (word, tessedit_cluster_adaption_mode)
00306   && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
00307     if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00308       return;                    // Reject map set to acceptable
00309     /* Collect information about good matches */
          // pixrow_list and imlines are set by char_clip_word and released
          // after the loop
00310     copy_outword = *(word->outword);
00311     copy_outword.baseline_denormalise (&word->denorm);
00312     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00313     pixrow_it.set_to_list (pixrow_list);
00314     pixrow_it.move_to_first ();
00315
00316     blob_it.move_to_first ();
00317     for (i = 0;
00318       word->best_choice->string ()[i] != '\0';
00319     i++, pixrow_it.forward (), blob_it.forward ()) {
00320
            // Take the character unless the non-adaption set is in use and
            // contains it; with tessedit_mm_use_rejmap an accepted map entry
            // also qualifies
00321       if (!(tessedit_mm_use_non_adaption_set
00322         && STRING (tessedit_non_adaption_set).contains (word->
00323         best_choice->
00324         string ()[i]))
00325       || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
00326         #ifndef SECURE_NAMES
00327         if (tessedit_cluster_debug)
00328           tprintf ("Sample %c for adaption found in %s, index %d\n",
00329             word->best_choice->string ()[i],
00330             word->best_choice->string ().string (), i);
00331         #endif
00332         sample = clip_sample (pixrow_it.data (),
00333           imlines,
00334           pix_box,
00335           copy_outword.flag (W_INVERSE),
00336           word->best_choice->string ()[i]);
00337
00338         if (sample == NULL) {    // Clip failed
00339           #ifndef SECURE_NAMES
00340           tprintf ("Unable to clip sample from %s, index %d\n",
00341             word->best_choice->string ().string (), i);
00342           #endif
00343           continue;
00344         }
00345         cluster_sample(sample, char_clusters, chars_waiting);
00346       }
00347     }
00348     delete[]imlines;             // Free array of imlines
00349     delete pixrow_list;
00350   }
00351   else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00352     word->reject_map.rej_word_tess_failure (); // Set word to all rejects
00353 }

void collect_ems_for_adaption ( WERD_RES word,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Within the bounding box, collect blobs that might be 'm's for adaptation?

Parameters:
word Word Results
char_clusters ?
chars_waiting characters waiting (see recog_all_words)
Returns:
none

lines of the image

box of imlines

copy to denorm

Make sure not set

Definition at line 163 of file adaptions.cpp.

References WERD::baseline_denormalise(), WERD::blob_list(), WERD::bounding_box(), char_clip_word(), clip_sample(), cluster_sample(), FALSE, WERD::flag(), IMAGE::get_res(), BOX::height(), NULL, WERD_RES::outword, page_image, resolution, tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().

Referenced by recog_all_words().

00165                                                                {
00166   PBLOB_LIST *blobs = word->outword->blob_list ();
00167   PBLOB_IT blob_it(blobs); 
00168   INT16 i;
00169   CHAR_SAMPLE *sample;
00170   PIXROW_LIST *pixrow_list;
00171   PIXROW_IT pixrow_it;
00172   IMAGELINE *imlines;            
00173   BOX pix_box;                   
00174   // extent
00175   WERD copy_outword;             
00176   PBLOB_IT copy_blob_it;
00177   OUTLINE_IT copy_outline_it;
00178   INT32 resolution = page_image.get_res ();
00179 
00180   if (tessedit_reject_ems || tessedit_reject_suspect_ems)
00181     return;                      // Do nothing
00182 
00183   if (word->word->bounding_box ().height () > resolution / 3)
00184     return;
00185 
00186   if (tessedit_demo_adaption)
00187     tessedit_display_mm.set_value (FALSE); 
00188 
00189   if (word_adaptable (word, tessedit_em_adaption_mode)
00190     && word->reject_map.reject_count () == 0
00191     && (strchr (word->best_choice->string ().string (), 'm') != NULL
00192     || (tessedit_process_rns
00193     && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00194     if (tessedit_process_rns
00195     && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00196       copy_outword = *(word->outword);
00197       copy_blob_it.set_to_list (copy_outword.blob_list ());
00198       i = 0;
00199       while (word->best_choice->string ()[i] != '\0') {
00200         if (word->best_choice->string ()[i] == 'r'
00201         && word->best_choice->string ()[i + 1] == 'n') {
00202           copy_outline_it.set_to_list (copy_blob_it.data ()->
00203             out_list ());
00204           copy_outline_it.add_list_after (copy_blob_it.
00205             data_relative (1)->
00206             out_list ());
00207           copy_blob_it.forward ();
00208           delete (copy_blob_it.extract ());
00209           i++;
00210         }
00211         copy_blob_it.forward ();
00212         i++;
00213       }
00214     }
00215     else
00216       copy_outword = *(word->outword);
00217 
00218     copy_outword.baseline_denormalise (&word->denorm);
00219     char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box); 
00220     pixrow_it.set_to_list (pixrow_list);
00221     pixrow_it.move_to_first ();
00222 
00223     blob_it.move_to_first ();
00224     for (i = 0;
00225       word->best_choice->string ()[i] != '\0';
00226     i++, pixrow_it.forward (), blob_it.forward ()) {
00227 
00228       if (word->best_choice->string ()[i] == 'm'
00229         || (word->best_choice->string ()[i] == 'r'
00230       && word->best_choice->string ()[i + 1] == 'n')) {
00231         #ifndef SECURE_NAMES
00232         if (tessedit_cluster_debug)
00233           tprintf ("Sample %c for adaption found in %s, index %d\n",
00234             word->best_choice->string ()[i],
00235             word->best_choice->string ().string (), i);
00236         #endif
00237         if (tessedit_matrix_match) {
00238           sample = clip_sample (pixrow_it.data (),
00239             imlines,
00240             pix_box,
00241             copy_outword.flag (W_INVERSE),
00242             word->best_choice->string ()[i]);
00243 
00244           if (sample == NULL) {  //Clip failed
00245             #ifndef SECURE_NAMES
00246             tprintf ("Unable to clip sample from %s, index %d\n",
00247               word->best_choice->string ().string (), i);
00248             #endif
00249             if (word->best_choice->string ()[i] == 'r')
00250               i++;
00251 
00252             continue;
00253           }
00254         }
00255         else
00256           sample = new CHAR_SAMPLE (blob_it.data (),
00257             &word->denorm,
00258             word->best_choice->string ()[i]);
00259 
00260         cluster_sample(sample, char_clusters, chars_waiting); 
00261 
00262         if (word->best_choice->string ()[i] == 'r')
00263           i++;                   // Skip next character
00264       }
00265     }
00266     delete[]imlines;             // Free array of imlines
00267     delete pixrow_list;
00268   }
00269 }

void complete_clustering ( CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Cluster until chars_waiting is used up (see check_wait_list), then turn guesses into characters?

Parameters:
char_clusters ?? one variable in recog_all_words
chars_waiting ?? one variable in recog_all_words
Note:
Global: uses tessedit_use_best_sample, tessedit_cluster_debug, tessedit_mm_adapt_using_prototypes, and tessedit_demo_adaption
Returns:
none

Definition at line 490 of file adaptions.cpp.

References check_wait_list(), display_cluster_prototypes(), and tprintf().

Referenced by adapt_to_good_ems(), and adapt_to_good_samples().

00491                                                           {
00492   CHAR_SAMPLES *best_cluster;
00493   CHAR_SAMPLES_IT c_it = char_clusters;
00494   CHAR_SAMPLE_IT cw_it = chars_waiting;
00495   CHAR_SAMPLE *sample;
00496   INT32 total_sample_count = 0;
00497 
00498   while (!cw_it.empty ()) {
00499     cw_it.move_to_first ();
00500     sample = cw_it.extract ();
00501     best_cluster = new CHAR_SAMPLES (sample);
00502     c_it.add_to_end (best_cluster);
00503     check_wait_list(chars_waiting, sample, best_cluster);
00504   }
00505 
00506   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00507     c_it.data ()->assign_to_char ();
00508     if (tessedit_use_best_sample)
00509       c_it.data ()->find_best_sample ();
00510     else if (tessedit_mm_adapt_using_prototypes)
00511       c_it.data ()->build_prototype ();
00512 
00513     if (tessedit_cluster_debug)
00514       total_sample_count += c_it.data ()->n_samples ();
00515   }
00516   #ifndef SECURE_NAMES
00517   if (tessedit_cluster_debug)
00518     tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
00519   #endif
00520 
00521 #ifndef GRAPHICS_DISABLED
00522   if (tessedit_demo_adaption)
00523     display_cluster_prototypes(char_clusters);
00524 #endif
00525 
00526 }

void display_cluster_prototypes ( CHAR_SAMPLES_LIST *  char_clusters  ) 

Dump out prototypes.

Parameters:
char_clusters ?? one variable in recog_all_words
Returns:
none

Definition at line 1063 of file adaptions.cpp.

References display_image, FALSE, NULL, tprintf(), and WINDOWNAMESIZE.

Referenced by complete_clustering().

01063                                                                   { 
01064   INT16 proto_number = 0;
01065   CHAR_SAMPLES_IT c_it = char_clusters;
01066   char title[WINDOWNAMESIZE];
01067 
01068   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
01069     proto_number++;
01070 
01071     #ifndef SECURE_NAMES
01072     tprintf ("Displaying proto number %d\n", proto_number);
01073     #endif
01074 
01075     if (c_it.data ()->prototype () != NULL) {
01076       sprintf (title, "Proto - %d", proto_number);
01077       display_image (c_it.data ()->prototype ()->make_image (),
01078         title, (proto_number - 1) * 400, 0, FALSE);
01079     }
01080   }
01081 }

void print_em_stats ( CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Dump out stats.

Parameters:
char_clusters ?? one variable in recog_all_words
chars_waiting ?? one variable in recog_all_words
Returns:
none

Definition at line 985 of file adaptions.cpp.

References debug_fp, and tprintf().

Referenced by adapt_to_good_ems(), adapt_to_good_samples(), and recog_all_words().

00986                                                      {
00987   CHAR_SAMPLES_IT c_it = char_clusters;
00988 
00989   if (!tessedit_cluster_debug)
00990     return;
00991   #ifndef SECURE_NAMES
00992   tprintf ("There are %d clusters and %d samples waiting\n",
00993     char_clusters->length (), chars_waiting->length ());
00994 
00995   for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
00996     c_it.data ()->print (debug_fp);
00997   #endif
00998   tprintf ("\n");
00999 }

void reject_all_ems ( WERD_RES word  ) 

Simplistic routine to test the effect of rejecting ems (i.e., the letter 'm').

Parameters:
word word to be processed
Returns:
none

Definition at line 1090 of file adaptions.cpp.

References WERD_RES::best_choice, and WERD_RES::reject_map.

Referenced by adapt_to_good_ems().

01090                                     { 
01091   INT16 i;
01092 
01093   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01094     if (word->best_choice->string ()[i] == 'm')
01095                                  // reject all ems
01096       word->reject_map[i].setrej_mm_reject ();
01097   }
01098 }

void reject_all_fullstops ( WERD_RES word  ) 

Simplistic routine to test the effect of rejecting fullstops (i.e., '.').

Parameters:
word word to be processed
Returns:
none

Definition at line 1106 of file adaptions.cpp.

References WERD_RES::best_choice, and WERD_RES::reject_map.

Referenced by recog_all_words().

01106                                           { 
01107   INT16 i;
01108 
01109   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01110     if (word->best_choice->string ()[i] == '.')
01111                                  // reject all fullstops
01112       word->reject_map[i].setrej_mm_reject ();
01113   }
01114 }

void reject_suspect_ems ( WERD_RES word  ) 

Reject suspect ems (the letter 'm') if word is not adaptable (see word_adaptable).

Parameters:
word word to be processed
Returns:
none

Definition at line 1123 of file adaptions.cpp.

References suspect_em(), and word_adaptable().

Referenced by adapt_to_good_ems().

01123                                         { 
01124   INT16 i;
01125 
01126   if (!word_adaptable (word, tessedit_cluster_adaption_mode))
01127   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01128     if (word->best_choice->string ()[i] == 'm' && suspect_em (word, i))
01129                                  // reject all ems
01130       word->reject_map[i].setrej_mm_reject ();
01131   }
01132 }

void reject_suspect_fullstops ( WERD_RES word  ) 

Reject suspect fullstops ('.' only; the listing below does not test for ',').

Parameters:
word word to be processed
Returns:
none

Definition at line 1140 of file adaptions.cpp.

References WERD_RES::best_choice, WERD_RES::reject_map, and suspect_fullstop().

Referenced by recog_all_words().

01140                                               { 
01141   INT16 i;
01142 
01143   for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01144     if (word->best_choice->string ()[i] == '.'
01145       && suspect_fullstop (word, i))
01146       word->reject_map[i].setrej_mm_reject (); // reject all commas
01147   }
01148 }

BOOL8 suspect_em ( WERD_RES word,
INT16  index 
)

Suspect that 'm' in word is something else (like 'rn').

Parameters:
word word to be processed
index position of the character in the word; the blob iterator is advanced this many times to reach the corresponding blob
Returns:
TRUE if 'm' suspected

Definition at line 1158 of file adaptions.cpp.

References WERD::blob_list(), and WERD_RES::outword.

Referenced by reject_suspect_ems().

01158                                               { 
01159   PBLOB_LIST *blobs = word->outword->blob_list ();
01160   PBLOB_IT blob_it(blobs);
01161   INT16 j;
01162 
01163   for (j = 0; j < index; j++)
01164     blob_it.forward ();
01165 
01166   return (blob_it.data ()->out_list ()->length () != 1);
01167 }

BOOL8 suspect_fullstop ( WERD_RES word,
INT16  i 
)

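Suspect that the blob at position i is not a fullstop: true when the aspect ratio of its bounding box (longer side over shorter side) exceeds tessed_fullstop_aspect_ratio.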

Parameters:
word word to be processed
i Same as index in suspect_em
Returns:
TRUE if '.'/',' suspected

Definition at line 1177 of file adaptions.cpp.

References WERD::blob_list(), BOX::height(), WERD_RES::outword, and BOX::width().

Referenced by reject_suspect_fullstops().

01177                                                 { 
01178   float aspect_ratio;
01179   PBLOB_LIST *blobs = word->outword->blob_list ();
01180   PBLOB_IT blob_it(blobs);
01181   INT16 j;
01182   BOX box;
01183   INT16 width;
01184   INT16 height;
01185 
01186   for (j = 0; j < i; j++)
01187     blob_it.forward ();
01188 
01189   box = blob_it.data ()->bounding_box ();
01190 
01191   width = box.width ();
01192   height = box.height ();
01193 
01194   aspect_ratio = ((width > height) ? ((float) width) / height :
01195   ((float) height) / width);
01196 
01197   return (aspect_ratio > tessed_fullstop_aspect_ratio);
01198 }

BOOL8 word_adaptable ( WERD_RES word,
UINT16  mode 
)

Determine if word IS adaptable.

Parameters:
word Word Results
mode ex: tessedit_cluster_adaption_mode ("Adaptation decision algorithm for matrix matcher")
Returns:
TRUE or FALSE

Definition at line 100 of file adaptions.cpp.

References ADAPTABLE_WERD, WERD_RES::best_choice, BITS16::bit(), FALSE, FREQ_DAWG_PERM, NoDangerousAmbig(), NULL, NUMBER_PERM, one_ell_conflict(), SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, WERD_RES::tess_would_adapt, and USER_DAWG_PERM.

Referenced by adapt_to_good_ems(), adapt_to_good_samples(), classify_word_pass1(), collect_characters_for_adaption(), collect_ems_for_adaption(), recog_all_words(), and reject_suspect_ems().

00102                                   {
00103   BOOL8 status = FALSE;
00104   BITS16 flags(mode); 
00105 
00106   enum MODES
00107   {
00108     ADAPTABLE_WERD,
00109     ACCEPTABLE_WERD,
00110     CHECK_DAWGS,
00111     CHECK_SPACES,
00112     CHECK_ONE_ELL_CONFLICT,
00113     CHECK_AMBIG_WERD
00114   };
00115 
00116   /*
00117   0: NO adaption
00118   */
00119   if (mode == 0) {
00120     return FALSE;
00121   }
00122 
00123   if (flags.bit (ADAPTABLE_WERD))
00124     status |= word->tess_would_adapt;
00125 
00126   if (flags.bit (ACCEPTABLE_WERD))
00127     status |= word->tess_accepted;
00128 
00129   if (!status)                   // If not set then
00130     return FALSE;                // ignore other checks
00131 
00132   if (flags.bit (CHECK_DAWGS) &&
00133     (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00134     (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00135     (word->best_choice->permuter () != USER_DAWG_PERM) &&
00136     (word->best_choice->permuter () != NUMBER_PERM))
00137     return FALSE;
00138 
00139   if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE))
00140     return FALSE;
00141 
00142   if (flags.bit (CHECK_SPACES) &&
00143     (strchr (word->best_choice->string ().string (), ' ') != NULL))
00144     return FALSE;
00145 
00146 //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
00147   if (flags.bit (CHECK_AMBIG_WERD) &&
00148       !NoDangerousAmbig(word->best_choice->string().string(), NULL))
00149     return FALSE;
00150 
00151   return status;
00152 }


Variable Documentation

INT32 demo_word = 0

Hack for demos.

Note:
File: adaptions.cpp (Formerly adaptions.c)
Functions used to adapt to blobs already confidently identified
Author:
Chris Newton
Date:
Thu Oct 7 10:17:28 BST 1993
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.

Definition at line 40 of file adaptions.cpp.

Referenced by adapt_to_good_ems(), adapt_to_good_samples(), CHAR_PROTO::match(), and CHAR_PROTO::match_sample().


Generated on Wed Feb 28 19:49:13 2007 for Tesseract by  doxygen 1.5.1