00001
00019 #include "mfcpch.h"
00020 #ifdef __UNIX__
00021 #include <assert.h>
00022 #endif
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include "tessbox.h"
00026 #include "tessvars.h"
00027 #include "memry.h"
00028 #include "mainblk.h"
00029 #include "charcut.h"
00030 #include "imgs.h"
00031 #include "scaleimg.h"
00032 #include "reject.h"
00033 #include "control.h"
00034 #include "adaptions.h"
00035 #include "stopper.h"
00036 #include "charsample.h"
00037 #include "matmatch.h"
00038 #include "secname.h"
00039
00040 INT32 demo_word = 0;
00041
00042 #define EXTERN
00043
00045 EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
00046 EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");
00047
00048 EXTERN double_VAR (tessedit_cluster_t1, 0.20,
00049 "t1 threshold for clustering samples");
00050 EXTERN double_VAR (tessedit_cluster_t2, 0.40,
00051 "t2 threshold for clustering samples");
00052 EXTERN double_VAR (tessedit_cluster_t3, 0.12,
00053 "Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
00054 EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
00055 "Largest fraction of characters in cluster for it to be used for adaption");
00056 EXTERN INT_VAR (tessedit_cluster_min_size, 3,
00057 "Smallest number of samples in a cluster for it to be used for adaption");
00058 EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
00059 "Generate and print debug information for adaption by clustering");
00060 EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
00061 "Use best sample from cluster when adapting");
00062 EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
00063 "Set reject map to enable cluster input to be measured");
00064
00065 EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
00066 EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
00067 "Don't try to adapt to characters on this list");
00068 EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
00069 "Characters to be avoided when adapting");
00070 EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
00071 "Use prototypes when adapting");
00072 EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
00073 "Use prototypes as clusters are built");
00074 EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
00075 "Adapt to characters using reject map");
00076 EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
00077 "Adapt to all characters using, matrix matcher");
00078 EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
00079 "Only match samples against clusters for the same character");
00080 EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");
00081
00082 EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
00083 "Display cut images and matrix match for demo purposes");
00084 EXTERN INT_VAR (tessedit_demo_word1, 62,
00085 "Word number of first word to display");
00086 EXTERN INT_VAR (tessedit_demo_word2, 64,
00087 "Word number of second word to display");
00088 EXTERN STRING_VAR (tessedit_demo_file, "academe",
00089 "Name of document containing demo words");
00100 BOOL8 word_adaptable(
00101 WERD_RES *word,
00102 UINT16 mode) {
00103 BOOL8 status = FALSE;
00104 BITS16 flags(mode);
00105
00106 enum MODES
00107 {
00108 ADAPTABLE_WERD,
00109 ACCEPTABLE_WERD,
00110 CHECK_DAWGS,
00111 CHECK_SPACES,
00112 CHECK_ONE_ELL_CONFLICT,
00113 CHECK_AMBIG_WERD
00114 };
00115
00116
00117
00118
00119 if (mode == 0) {
00120 return FALSE;
00121 }
00122
00123 if (flags.bit (ADAPTABLE_WERD))
00124 status |= word->tess_would_adapt;
00125
00126 if (flags.bit (ACCEPTABLE_WERD))
00127 status |= word->tess_accepted;
00128
00129 if (!status)
00130 return FALSE;
00131
00132 if (flags.bit (CHECK_DAWGS) &&
00133 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00134 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00135 (word->best_choice->permuter () != USER_DAWG_PERM) &&
00136 (word->best_choice->permuter () != NUMBER_PERM))
00137 return FALSE;
00138
00139 if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE))
00140 return FALSE;
00141
00142 if (flags.bit (CHECK_SPACES) &&
00143 (strchr (word->best_choice->string ().string (), ' ') != NULL))
00144 return FALSE;
00145
00146
00147 if (flags.bit (CHECK_AMBIG_WERD) &&
00148 !NoDangerousAmbig(word->best_choice->string().string(), NULL))
00149 return FALSE;
00150
00151 return status;
00152 }
00153
00154
00163 void collect_ems_for_adaption(WERD_RES *word,
00164 CHAR_SAMPLES_LIST *char_clusters,
00165 CHAR_SAMPLE_LIST *chars_waiting) {
00166 PBLOB_LIST *blobs = word->outword->blob_list ();
00167 PBLOB_IT blob_it(blobs);
00168 INT16 i;
00169 CHAR_SAMPLE *sample;
00170 PIXROW_LIST *pixrow_list;
00171 PIXROW_IT pixrow_it;
00172 IMAGELINE *imlines;
00173 BOX pix_box;
00174
00175 WERD copy_outword;
00176 PBLOB_IT copy_blob_it;
00177 OUTLINE_IT copy_outline_it;
00178 INT32 resolution = page_image.get_res ();
00179
00180 if (tessedit_reject_ems || tessedit_reject_suspect_ems)
00181 return;
00182
00183 if (word->word->bounding_box ().height () > resolution / 3)
00184 return;
00185
00186 if (tessedit_demo_adaption)
00187 tessedit_display_mm.set_value (FALSE);
00188
00189 if (word_adaptable (word, tessedit_em_adaption_mode)
00190 && word->reject_map.reject_count () == 0
00191 && (strchr (word->best_choice->string ().string (), 'm') != NULL
00192 || (tessedit_process_rns
00193 && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00194 if (tessedit_process_rns
00195 && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00196 copy_outword = *(word->outword);
00197 copy_blob_it.set_to_list (copy_outword.blob_list ());
00198 i = 0;
00199 while (word->best_choice->string ()[i] != '\0') {
00200 if (word->best_choice->string ()[i] == 'r'
00201 && word->best_choice->string ()[i + 1] == 'n') {
00202 copy_outline_it.set_to_list (copy_blob_it.data ()->
00203 out_list ());
00204 copy_outline_it.add_list_after (copy_blob_it.
00205 data_relative (1)->
00206 out_list ());
00207 copy_blob_it.forward ();
00208 delete (copy_blob_it.extract ());
00209 i++;
00210 }
00211 copy_blob_it.forward ();
00212 i++;
00213 }
00214 }
00215 else
00216 copy_outword = *(word->outword);
00217
00218 copy_outword.baseline_denormalise (&word->denorm);
00219 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
00220 pixrow_it.set_to_list (pixrow_list);
00221 pixrow_it.move_to_first ();
00222
00223 blob_it.move_to_first ();
00224 for (i = 0;
00225 word->best_choice->string ()[i] != '\0';
00226 i++, pixrow_it.forward (), blob_it.forward ()) {
00227
00228 if (word->best_choice->string ()[i] == 'm'
00229 || (word->best_choice->string ()[i] == 'r'
00230 && word->best_choice->string ()[i + 1] == 'n')) {
00231 #ifndef SECURE_NAMES
00232 if (tessedit_cluster_debug)
00233 tprintf ("Sample %c for adaption found in %s, index %d\n",
00234 word->best_choice->string ()[i],
00235 word->best_choice->string ().string (), i);
00236 #endif
00237 if (tessedit_matrix_match) {
00238 sample = clip_sample (pixrow_it.data (),
00239 imlines,
00240 pix_box,
00241 copy_outword.flag (W_INVERSE),
00242 word->best_choice->string ()[i]);
00243
00244 if (sample == NULL) {
00245 #ifndef SECURE_NAMES
00246 tprintf ("Unable to clip sample from %s, index %d\n",
00247 word->best_choice->string ().string (), i);
00248 #endif
00249 if (word->best_choice->string ()[i] == 'r')
00250 i++;
00251
00252 continue;
00253 }
00254 }
00255 else
00256 sample = new CHAR_SAMPLE (blob_it.data (),
00257 &word->denorm,
00258 word->best_choice->string ()[i]);
00259
00260 cluster_sample(sample, char_clusters, chars_waiting);
00261
00262 if (word->best_choice->string ()[i] == 'r')
00263 i++;
00264 }
00265 }
00266 delete[]imlines;
00267 delete pixrow_list;
00268 }
00269 }
00270
00271
00284 void collect_characters_for_adaption(WERD_RES *word,
00285 CHAR_SAMPLES_LIST *char_clusters,
00286 CHAR_SAMPLE_LIST *chars_waiting) {
00287 PBLOB_LIST *blobs = word->outword->blob_list ();
00288 PBLOB_IT blob_it(blobs);
00289 INT16 i;
00290 CHAR_SAMPLE *sample;
00291 PIXROW_LIST *pixrow_list;
00292 PIXROW_IT pixrow_it;
00293 IMAGELINE *imlines;
00294 BOX pix_box;
00295
00296 WERD copy_outword;
00297 INT32 resolution = page_image.get_res ();
00298
00299 if (word->word->bounding_box ().height () > resolution / 3)
00300 return;
00301
00302 if (tessedit_demo_adaption)
00303 tessedit_display_mm.set_value (FALSE);
00304
00305 if ((word_adaptable (word, tessedit_cluster_adaption_mode)
00306 && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
00307 if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00308 return;
00309
00310 copy_outword = *(word->outword);
00311 copy_outword.baseline_denormalise (&word->denorm);
00312 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
00313 pixrow_it.set_to_list (pixrow_list);
00314 pixrow_it.move_to_first ();
00315
00316 blob_it.move_to_first ();
00317 for (i = 0;
00318 word->best_choice->string ()[i] != '\0';
00319 i++, pixrow_it.forward (), blob_it.forward ()) {
00320
00321 if (!(tessedit_mm_use_non_adaption_set
00322 && STRING (tessedit_non_adaption_set).contains (word->
00323 best_choice->
00324 string ()[i]))
00325 || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
00326 #ifndef SECURE_NAMES
00327 if (tessedit_cluster_debug)
00328 tprintf ("Sample %c for adaption found in %s, index %d\n",
00329 word->best_choice->string ()[i],
00330 word->best_choice->string ().string (), i);
00331 #endif
00332 sample = clip_sample (pixrow_it.data (),
00333 imlines,
00334 pix_box,
00335 copy_outword.flag (W_INVERSE),
00336 word->best_choice->string ()[i]);
00337
00338 if (sample == NULL) {
00339 #ifndef SECURE_NAMES
00340 tprintf ("Unable to clip sample from %s, index %d\n",
00341 word->best_choice->string ().string (), i);
00342 #endif
00343 continue;
00344 }
00345 cluster_sample(sample, char_clusters, chars_waiting);
00346 }
00347 }
00348 delete[]imlines;
00349 delete pixrow_list;
00350 }
00351 else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
00352 word->reject_map.rej_word_tess_failure ();
00353 }
00354
00355
00364 void cluster_sample(CHAR_SAMPLE *sample,
00365 CHAR_SAMPLES_LIST *char_clusters,
00366 CHAR_SAMPLE_LIST *chars_waiting) {
00367 CHAR_SAMPLES *best_cluster = NULL;
00368 CHAR_SAMPLES_IT c_it = char_clusters;
00369 CHAR_SAMPLE_IT cw_it = chars_waiting;
00370 float score;
00371 float best_score = MAX_INT32;
00372
00373 if (c_it.empty ())
00374 c_it.add_to_end (new CHAR_SAMPLES (sample));
00375 else {
00376 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00377 score = c_it.data ()->match_score (sample);
00378 if (score < best_score) {
00379 best_score = score;
00380 best_cluster = c_it.data ();
00381 }
00382 }
00383
00384 if (tessedit_cluster_debug)
00385 tprintf ("Sample's best score %f\n", best_score);
00386
00387 if (best_score < tessedit_cluster_t1) {
00388 if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
00389 best_cluster->add_sample (sample);
00390 check_wait_list(chars_waiting, sample, best_cluster);
00391 #ifndef SECURE_NAMES
00392 if (tessedit_cluster_debug)
00393 tprintf ("Sample added to an existing cluster\n");
00394 #endif
00395 }
00396 else {
00397 #ifndef SECURE_NAMES
00398 if (tessedit_cluster_debug)
00399 tprintf
00400 ("Sample dropped, good match to an existing cluster\n");
00401 #endif
00402 }
00403 }
00404 else if (best_score > tessedit_cluster_t2) {
00405 c_it.add_to_end (new CHAR_SAMPLES (sample));
00406 #ifndef SECURE_NAMES
00407 if (tessedit_cluster_debug)
00408 tprintf ("New cluster created for this sample\n");
00409 #endif
00410 }
00411 else {
00412 cw_it.add_to_end (sample);
00413 if (tessedit_cluster_debug)
00414 tprintf ("Sample added to the wait list\n");
00415 }
00416 }
00417 }
00418
00419
00429 void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting,
00430 CHAR_SAMPLE *sample,
00431 CHAR_SAMPLES *best_cluster) {
00432 CHAR_SAMPLE *wait_sample;
00433 CHAR_SAMPLE *test_sample = sample;
00434 CHAR_SAMPLE_IT cw_it = chars_waiting;
00435 CHAR_SAMPLE_LIST add_list;
00436 CHAR_SAMPLE_IT add_it = &add_list;
00437 float score;
00438
00439 add_list.clear ();
00440
00441 if (!cw_it.empty ()) {
00442 do {
00443 if (!add_list.empty ()) {
00444 add_it.forward ();
00445 test_sample = add_it.extract ();
00446 best_cluster->add_sample (test_sample);
00447 }
00448
00449 for (cw_it.mark_cycle_pt ();
00450 !cw_it.cycled_list (); cw_it.forward ()) {
00451 wait_sample = cw_it.data ();
00452 if (tessedit_mm_use_prototypes)
00453 score = best_cluster->match_score (wait_sample);
00454 else
00455 score = sample->match_sample (wait_sample, FALSE);
00456 if (score < tessedit_cluster_t1) {
00457 if (score > tessedit_cluster_t3
00458 || tessedit_mm_use_prototypes) {
00459 add_it.add_after_stay_put (cw_it.extract ());
00460 #ifndef SECURE_NAMES
00461 if (tessedit_cluster_debug)
00462 tprintf
00463 ("Wait sample added to an existing cluster\n");
00464 #endif
00465 }
00466 else {
00467 #ifndef SECURE_NAMES
00468 if (tessedit_cluster_debug)
00469 tprintf
00470 ("Wait sample dropped, good match to an existing cluster\n");
00471 #endif
00472 }
00473 }
00474 }
00475 }
00476 while (!add_list.empty ());
00477 }
00478 }
00479
00490 void complete_clustering(CHAR_SAMPLES_LIST *char_clusters,
00491 CHAR_SAMPLE_LIST *chars_waiting) {
00492 CHAR_SAMPLES *best_cluster;
00493 CHAR_SAMPLES_IT c_it = char_clusters;
00494 CHAR_SAMPLE_IT cw_it = chars_waiting;
00495 CHAR_SAMPLE *sample;
00496 INT32 total_sample_count = 0;
00497
00498 while (!cw_it.empty ()) {
00499 cw_it.move_to_first ();
00500 sample = cw_it.extract ();
00501 best_cluster = new CHAR_SAMPLES (sample);
00502 c_it.add_to_end (best_cluster);
00503 check_wait_list(chars_waiting, sample, best_cluster);
00504 }
00505
00506 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
00507 c_it.data ()->assign_to_char ();
00508 if (tessedit_use_best_sample)
00509 c_it.data ()->find_best_sample ();
00510 else if (tessedit_mm_adapt_using_prototypes)
00511 c_it.data ()->build_prototype ();
00512
00513 if (tessedit_cluster_debug)
00514 total_sample_count += c_it.data ()->n_samples ();
00515 }
00516 #ifndef SECURE_NAMES
00517 if (tessedit_cluster_debug)
00518 tprintf ("Clustering completed, %d samples in all\n", total_sample_count);
00519 #endif
00520
00521 #ifndef GRAPHICS_DISABLED
00522 if (tessedit_demo_adaption)
00523 display_cluster_prototypes(char_clusters);
00524 #endif
00525
00526 }
00527
00528
00541 void adapt_to_good_ems(WERD_RES *word,
00542 CHAR_SAMPLES_LIST *char_clusters,
00543 CHAR_SAMPLE_LIST *chars_waiting) {
00544 PBLOB_LIST *blobs = word->outword->blob_list ();
00545 PBLOB_IT blob_it(blobs);
00546 INT16 i;
00547 CHAR_SAMPLE *sample;
00548 CHAR_SAMPLES_IT c_it = char_clusters;
00549 CHAR_SAMPLE_IT cw_it = chars_waiting;
00550 float score;
00551 float best_score;
00552 char best_char;
00553 CHAR_SAMPLES *best_cluster;
00554 PIXROW_LIST *pixrow_list;
00555 PIXROW_IT pixrow_it;
00556 IMAGELINE *imlines;
00557 BOX pix_box;
00558
00559 WERD copy_outword;
00560 BOX b_box;
00561 PBLOB_IT copy_blob_it;
00562 OUTLINE_IT copy_outline_it;
00563 PIXROW *pixrow = NULL;
00564
00565 static INT32 word_number = 0;
00566
00567 #ifndef GRAPHICS_DISABLED
00568 WINDOW demo_win = NULL;
00569 #endif
00570
00571 INT32 resolution = page_image.get_res ();
00572
00573 if (word->word->bounding_box ().height () > resolution / 3)
00574 return;
00575
00576 word_number++;
00577
00578 if (strchr (word->best_choice->string ().string (), 'm') == NULL
00579 && (tessedit_process_rns
00580 && strstr (word->best_choice->string ().string (), "rn") == NULL))
00581 return;
00582
00583 if (tessedit_reject_ems)
00584 reject_all_ems(word);
00585 else if (tessedit_reject_suspect_ems)
00586 reject_suspect_ems(word);
00587 else {
00588 if (char_clusters->length () == 0) {
00589 #ifndef SECURE_NAMES
00590 if (tessedit_cluster_debug)
00591 tprintf ("No clusters to use for em adaption\n");
00592 #endif
00593 return;
00594 }
00595
00596 if (!cw_it.empty ()) {
00597 complete_clustering(char_clusters, chars_waiting);
00598 print_em_stats(char_clusters, chars_waiting);
00599 }
00600
00601 if ((!word_adaptable (word, tessedit_em_adaption_mode) ||
00602 word->reject_map.reject_count () != 0)
00603 && (strchr (word->best_choice->string ().string (), 'm') != NULL
00604 || (tessedit_process_rns
00605 && strstr (word->best_choice->string ().string (), "rn") != NULL))) {
00606 if (tessedit_process_rns
00607 && strstr (word->best_choice->string ().string (), "rn") != NULL) {
00608 copy_outword = *(word->outword);
00609 copy_blob_it.set_to_list (copy_outword.blob_list ());
00610 i = 0;
00611 while (word->best_choice->string ()[i] != '\0') {
00612 if (word->best_choice->string ()[i] == 'r'
00613 && word->best_choice->string ()[i + 1] == 'n') {
00614 copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ());
00615 copy_outline_it.add_list_after (copy_blob_it. data_relative (1)->
00616 out_list ());
00617 copy_blob_it.forward ();
00618 delete (copy_blob_it.extract ());
00619 i++;
00620 }
00621 copy_blob_it.forward ();
00622 i++;
00623 }
00624 }
00625 else
00626 copy_outword = *(word->outword);
00627
00628 copy_outword.baseline_denormalise (&word->denorm);
00629 copy_blob_it.set_to_list (copy_outword.blob_list ());
00630 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
00631 pixrow_it.set_to_list (pixrow_list);
00632 pixrow_it.move_to_first ();
00633
00634
00635 b_box = copy_outword.bounding_box ();
00636 pixrow = pixrow_it.data ();
00637
00638 blob_it.move_to_first ();
00639 copy_blob_it.move_to_first ();
00640 for (i = 0;
00641 word->best_choice->string ()[i] != '\0';
00642 i++, pixrow_it.forward (), blob_it.forward (),
00643 copy_blob_it.forward ()) {
00644 if ((word->best_choice->string ()[i] == 'm'
00645 || (word->best_choice->string ()[i] == 'r'
00646 && word->best_choice->string ()[i + 1] == 'n'))
00647 && !word->reject_map[i].perm_rejected ()) {
00648 if (tessedit_cluster_debug)
00649 tprintf ("Sample %c to check found in %s, index %d\n",
00650 word->best_choice->string ()[i],
00651 word->best_choice->string ().string (), i);
00652
00653 if (tessedit_demo_adaption)
00654 tprintf
00655 ("Sample %c to check found in %s (%d), index %d\n",
00656 word->best_choice->string ()[i],
00657 word->best_choice->string ().string (), word_number,
00658 i);
00659
00660 if (tessedit_matrix_match) {
00661 BOX copy_box = copy_blob_it.data ()->bounding_box ();
00662
00663 sample = clip_sample (pixrow_it.data (),
00664 imlines,
00665 pix_box,
00666 copy_outword.flag (W_INVERSE),
00667 word->best_choice->string ()[i]);
00668
00669
00670 if (sample == NULL) {
00671 tprintf
00672 ("Unable to clip sample from %s, index %d\n",
00673 word->best_choice->string ().string (), i);
00674 #ifndef SECURE_NAMES
00675 if (tessedit_cluster_debug)
00676 tprintf ("Sample rejected (no sample)\n");
00677 #endif
00678 word->reject_map[i].setrej_mm_reject ();
00679 if (word->best_choice->string ()[i] == 'r') {
00680 word->reject_map[i + 1].setrej_mm_reject ();
00681 i++;
00682 }
00683 continue;
00684 }
00685 }
00686 else
00687 sample = new CHAR_SAMPLE (blob_it.data (),
00688 &word->denorm,
00689 word->best_choice->
00690 string ()[i]);
00691
00692 best_score = MAX_INT32;
00693 best_char = '\0';
00694 best_cluster = NULL;
00695
00696 for (c_it.mark_cycle_pt ();
00697 !c_it.cycled_list (); c_it.forward ()) {
00698 if (c_it.data ()->character () != '\0') {
00699 score = c_it.data ()->match_score (sample);
00700 if (score < best_score) {
00701 best_cluster = c_it.data ();
00702 best_score = score;
00703 best_char = c_it.data ()->character ();
00704 }
00705 }
00706 }
00707
00708 if (best_score > tessedit_cluster_t1) {
00709 #ifndef SECURE_NAMES
00710 if (tessedit_cluster_debug)
00711 tprintf ("Sample rejected (score %f)\n", best_score);
00712 if (tessedit_demo_adaption)
00713 tprintf ("Sample rejected (score %f)\n", best_score);
00714 #endif
00715 word->reject_map[i].setrej_mm_reject ();
00716 if (word->best_choice->string ()[i] == 'r')
00717 word->reject_map[i + 1].setrej_mm_reject ();
00718 }
00719 else {
00720 if (word->best_choice->string ()[i] == best_char) {
00721 #ifndef SECURE_NAMES
00722 if (tessedit_cluster_debug)
00723 tprintf ("Sample accepted (score %f)\n",
00724 best_score);
00725 if (tessedit_demo_adaption)
00726 tprintf ("Sample accepted (score %f)\n",
00727 best_score);
00728 #endif
00729 word->reject_map[i].setrej_mm_accept ();
00730 if (word->best_choice->string ()[i] == 'r')
00731 word->reject_map[i + 1].setrej_mm_accept ();
00732 }
00733 else {
00734 #ifndef SECURE_NAMES
00735 if (tessedit_cluster_debug)
00736 tprintf ("Sample rejected (char %c, score %f)\n",
00737 best_char, best_score);
00738 if (tessedit_demo_adaption)
00739 tprintf ("Sample rejected (char %c, score %f)\n",
00740 best_char, best_score);
00741 #endif
00742 word->reject_map[i].setrej_mm_reject ();
00743 if (word->best_choice->string ()[i] == 'r')
00744 word->reject_map[i + 1].setrej_mm_reject ();
00745 }
00746 }
00747
00748 if (tessedit_demo_adaption) {
00749 if (strcmp (imagebasename.string (),
00750 tessedit_demo_file.string ()) != 0
00751 || word_number == tessedit_demo_word1
00752 || word_number == tessedit_demo_word2) {
00753 #ifndef GRAPHICS_DISABLED
00754 demo_win =
00755 display_clip_image(©_outword,
00756 page_image,
00757 pixrow_list,
00758 pix_box);
00759 #endif
00760 demo_word = word_number;
00761 best_cluster->match_score (sample);
00762 demo_word = 0;
00763 }
00764 }
00765 if (word->best_choice->string ()[i] == 'r')
00766 i++;
00767 }
00768 }
00769 delete[]imlines;
00770 delete pixrow_list;
00771 }
00772 }
00773 }
00774
00775
00784 void adapt_to_good_samples(WERD_RES *word,
00785 CHAR_SAMPLES_LIST *char_clusters,
00786 CHAR_SAMPLE_LIST *chars_waiting) {
00787 PBLOB_LIST *blobs = word->outword->blob_list ();
00788 PBLOB_IT blob_it(blobs);
00789 INT16 i;
00790 CHAR_SAMPLE *sample;
00791 CHAR_SAMPLES_IT c_it = char_clusters;
00792 CHAR_SAMPLE_IT cw_it = chars_waiting;
00793 float score;
00794 float best_score;
00795 char best_char;
00796 CHAR_SAMPLES *best_cluster;
00797 PIXROW_LIST *pixrow_list;
00798 PIXROW_IT pixrow_it;
00799 IMAGELINE *imlines;
00800 BOX pix_box;
00801
00802 WERD copy_outword;
00803 BOX b_box;
00804 PBLOB_IT copy_blob_it;
00805 PIXROW *pixrow = NULL;
00806
00807 static INT32 word_number = 0;
00808
00809 #ifndef GRAPHICS_DISABLED
00810 WINDOW demo_win = NULL;
00811 #endif
00812
00813 INT32 resolution = page_image.get_res ();
00814
00815 word_number++;
00816
00817 if (tessedit_test_cluster_input)
00818 return;
00819
00820 if (word->word->bounding_box ().height () > resolution / 3)
00821 return;
00822
00823 if (char_clusters->length () == 0) {
00824 #ifndef SECURE_NAMES
00825 if (tessedit_cluster_debug)
00826 tprintf ("No clusters to use for adaption\n");
00827 #endif
00828 return;
00829 }
00830
00831 if (!cw_it.empty ()) {
00832 complete_clustering(char_clusters, chars_waiting);
00833 print_em_stats(char_clusters, chars_waiting);
00834 }
00835
00836 if ((!word_adaptable (word, tessedit_cluster_adaption_mode)
00837 && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) {
00838 if (tessedit_cluster_debug) {
00839 tprintf ("\nChecking: \"%s\" MAP ",
00840 word->best_choice->string ().string ());
00841 word->reject_map.print (debug_fp);
00842 tprintf ("\n");
00843 }
00844
00845 copy_outword = *(word->outword);
00846 copy_outword.baseline_denormalise (&word->denorm);
00847 copy_blob_it.set_to_list (copy_outword.blob_list ());
00848 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
00849 pixrow_it.set_to_list (pixrow_list);
00850 pixrow_it.move_to_first ();
00851
00852
00853 b_box = copy_outword.bounding_box ();
00854 pixrow = pixrow_it.data ();
00855
00856 blob_it.move_to_first ();
00857 copy_blob_it.move_to_first ();
00858 for (i = 0;
00859 word->best_choice->string ()[i] != '\0';
00860 i++, pixrow_it.forward (), blob_it.forward (),
00861 copy_blob_it.forward ()) {
00862 if (word->reject_map[i].recoverable ()
00863 || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) {
00864 BOX copy_box = copy_blob_it.data ()->bounding_box ();
00865
00866 if (tessedit_cluster_debug)
00867 tprintf ("Sample %c to check found in %s, index %d\n",
00868 word->best_choice->string ()[i],
00869 word->best_choice->string ().string (), i);
00870
00871 if (tessedit_demo_adaption)
00872 tprintf ("Sample %c to check found in %s (%d), index %d\n",
00873 word->best_choice->string ()[i],
00874 word->best_choice->string ().string (),
00875 word_number, i);
00876
00877 sample = clip_sample (pixrow_it.data (),
00878 imlines,
00879 pix_box,
00880 copy_outword.flag (W_INVERSE),
00881 word->best_choice->string ()[i]);
00882
00883 if (sample == NULL) {
00884 tprintf ("Unable to clip sample from %s, index %d\n",
00885 word->best_choice->string ().string (), i);
00886 #ifndef SECURE_NAMES
00887 if (tessedit_cluster_debug)
00888 tprintf ("Sample rejected (no sample)\n");
00889 #endif
00890 word->reject_map[i].setrej_mm_reject ();
00891
00892 continue;
00893 }
00894
00895 best_score = MAX_INT32;
00896 best_char = '\0';
00897 best_cluster = NULL;
00898
00899 for (c_it.mark_cycle_pt ();
00900 !c_it.cycled_list (); c_it.forward ()) {
00901 if (c_it.data ()->character () != '\0') {
00902 score = c_it.data ()->match_score (sample);
00903 if (score < best_score) {
00904 best_cluster = c_it.data ();
00905 best_score = score;
00906 best_char = c_it.data ()->character ();
00907 }
00908 }
00909 }
00910
00911 if (best_score > tessedit_cluster_t1) {
00912 #ifndef SECURE_NAMES
00913 if (tessedit_cluster_debug)
00914 tprintf ("Sample rejected (score %f)\n", best_score);
00915 if (tessedit_demo_adaption)
00916 tprintf ("Sample rejected (score %f)\n", best_score);
00917 #endif
00918 word->reject_map[i].setrej_mm_reject ();
00919 }
00920 else {
00921 if (word->best_choice->string ()[i] == best_char) {
00922 #ifndef SECURE_NAMES
00923 if (tessedit_cluster_debug)
00924 tprintf ("Sample accepted (score %f)\n", best_score);
00925 if (tessedit_demo_adaption)
00926 tprintf ("Sample accepted (score %f)\n", best_score);
00927 #endif
00928 if (tessedit_test_adaption)
00929 word->reject_map[i].setrej_minimal_rej_accept ();
00930 else
00931 word->reject_map[i].setrej_mm_accept ();
00932 }
00933 else {
00934 #ifndef SECURE_NAMES
00935 if (tessedit_cluster_debug)
00936 tprintf ("Sample rejected (char %c, score %f)\n",
00937 best_char, best_score);
00938 if (tessedit_demo_adaption)
00939 tprintf ("Sample rejected (char %c, score %f)\n",
00940 best_char, best_score);
00941 #endif
00942 word->reject_map[i].setrej_mm_reject ();
00943 }
00944 }
00945
00946 if (tessedit_demo_adaption) {
00947 if (strcmp (imagebasename.string (),
00948 tessedit_demo_file.string ()) != 0
00949 || word_number == tessedit_demo_word1
00950 || word_number == tessedit_demo_word2) {
00951 #ifndef GRAPHICS_DISABLED
00952 demo_win =
00953 display_clip_image(©_outword,
00954 page_image,
00955 pixrow_list,
00956 pix_box);
00957 #endif
00958 demo_word = word_number;
00959 best_cluster->match_score (sample);
00960 demo_word = 0;
00961 }
00962 }
00963 }
00964 }
00965 delete[]imlines;
00966 delete pixrow_list;
00967
00968 if (tessedit_cluster_debug) {
00969 tprintf ("\nFinal: \"%s\" MAP ",
00970 word->best_choice->string ().string ());
00971 word->reject_map.print (debug_fp);
00972 tprintf ("\n");
00973 }
00974 }
00975 }
00976
00977
00985 void print_em_stats(CHAR_SAMPLES_LIST *char_clusters,
00986 CHAR_SAMPLE_LIST *chars_waiting) {
00987 CHAR_SAMPLES_IT c_it = char_clusters;
00988
00989 if (!tessedit_cluster_debug)
00990 return;
00991 #ifndef SECURE_NAMES
00992 tprintf ("There are %d clusters and %d samples waiting\n",
00993 char_clusters->length (), chars_waiting->length ());
00994
00995 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ())
00996 c_it.data ()->print (debug_fp);
00997 #endif
00998 tprintf ("\n");
00999 }
01000
01001
01013 CHAR_SAMPLE *clip_sample(
01014 PIXROW *pixrow,
01015 IMAGELINE *imlines,
01016 BOX pix_box,
01017 BOOL8 white_on_black,
01018 char c) {
01019 IMAGE *image = new (IMAGE);
01020 BOX b_box = pixrow->bounding_box ();
01021 float baseline_pos = 0;
01022 INT32 resolution = page_image.get_res ();
01023
01024 if (!b_box.null_box ()) {
01025 ASSERT_HOST (b_box.width () < page_image.get_xsize () &&
01026 b_box.height () < page_image.get_ysize ());
01027
01028 if (b_box.width () > resolution || b_box.height () > resolution) {
01029 tprintf ("clip sample: sample too big (%d x %d)\n",
01030 b_box.width (), b_box.height ());
01031
01032 return NULL;
01033 }
01034
01035 IMAGE *image = new (IMAGE);
01036 if (image->create (b_box.width (), b_box.height (), 1) == -1) {
01037 tprintf ("clip sample: create image failed (%d x %d)\n",
01038 b_box.width (), b_box.height ());
01039
01040 delete image;
01041 return NULL;
01042 }
01043
01044 if (!white_on_black)
01045 invert_image(image);
01046 pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos);
01047 if (white_on_black)
01048 invert_image(image);
01049 return new CHAR_SAMPLE (image, c);
01050 }
01051 else
01052 return NULL;
01053 }
01054
01055
01056 #ifndef GRAPHICS_DISABLED
01057
01063 void display_cluster_prototypes(CHAR_SAMPLES_LIST *char_clusters) {
01064 INT16 proto_number = 0;
01065 CHAR_SAMPLES_IT c_it = char_clusters;
01066 char title[WINDOWNAMESIZE];
01067
01068 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
01069 proto_number++;
01070
01071 #ifndef SECURE_NAMES
01072 tprintf ("Displaying proto number %d\n", proto_number);
01073 #endif
01074
01075 if (c_it.data ()->prototype () != NULL) {
01076 sprintf (title, "Proto - %d", proto_number);
01077 display_image (c_it.data ()->prototype ()->make_image (),
01078 title, (proto_number - 1) * 400, 0, FALSE);
01079 }
01080 }
01081 }
01082 #endif
01083
01090 void reject_all_ems(WERD_RES *word) {
01091 INT16 i;
01092
01093 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01094 if (word->best_choice->string ()[i] == 'm')
01095
01096 word->reject_map[i].setrej_mm_reject ();
01097 }
01098 }
01099
01106 void reject_all_fullstops(WERD_RES *word) {
01107 INT16 i;
01108
01109 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01110 if (word->best_choice->string ()[i] == '.')
01111
01112 word->reject_map[i].setrej_mm_reject ();
01113 }
01114 }
01115
01116
01123 void reject_suspect_ems(WERD_RES *word) {
01124 INT16 i;
01125
01126 if (!word_adaptable (word, tessedit_cluster_adaption_mode))
01127 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01128 if (word->best_choice->string ()[i] == 'm' && suspect_em (word, i))
01129
01130 word->reject_map[i].setrej_mm_reject ();
01131 }
01132 }
01133
01140 void reject_suspect_fullstops(WERD_RES *word) {
01141 INT16 i;
01142
01143 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01144 if (word->best_choice->string ()[i] == '.'
01145 && suspect_fullstop (word, i))
01146 word->reject_map[i].setrej_mm_reject ();
01147 }
01148 }
01149
01150
01158 BOOL8 suspect_em(WERD_RES *word, INT16 index) {
01159 PBLOB_LIST *blobs = word->outword->blob_list ();
01160 PBLOB_IT blob_it(blobs);
01161 INT16 j;
01162
01163 for (j = 0; j < index; j++)
01164 blob_it.forward ();
01165
01166 return (blob_it.data ()->out_list ()->length () != 1);
01167 }
01168
01169
01177 BOOL8 suspect_fullstop(WERD_RES *word, INT16 i) {
01178 float aspect_ratio;
01179 PBLOB_LIST *blobs = word->outword->blob_list ();
01180 PBLOB_IT blob_it(blobs);
01181 INT16 j;
01182 BOX box;
01183 INT16 width;
01184 INT16 height;
01185
01186 for (j = 0; j < i; j++)
01187 blob_it.forward ();
01188
01189 box = blob_it.data ()->bounding_box ();
01190
01191 width = box.width ();
01192 height = box.height ();
01193
01194 aspect_ratio = ((width > height) ? ((float) width) / height :
01195 ((float) height) / width);
01196
01197 return (aspect_ratio > tessed_fullstop_aspect_ratio);
01198 }