#include "charsample.h"
#include "charcut.h"
#include "notdll.h"
Go to the source code of this file.
void adapt_to_good_ems | ( | WERD_RES * | word, | |
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
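Re-check a word that is believed to contain 'm' characters (or "rn" pairs, when tessedit_process_rns is set): each such character is matched against the character clusters and then marked as accepted or rejected in the word's reject map.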
word | Word Results | |
char_clusters | list of character clusters that samples are matched against |
chars_waiting | list of samples still waiting to be assigned to a cluster |
tessedit_reject_suspect_ems,
tessedit_cluster_debug,
tessedit_em_adaption_mode,
tessedit_process_rns,
tessedit_matrix_match, and
tessedit_demo_adaption
Definition at line 541 of file adaptions.cpp.
References WERD::baseline_denormalise(), WERD_RES::best_choice, WERD::blob_list(), WERD::bounding_box(), char_clip_word(), CHAR_SAMPLE::character(), clip_sample(), complete_clustering(), demo_word, display_clip_image(), WERD::flag(), IMAGE::get_res(), BOX::height(), imagebasename, CHAR_SAMPLES::match_score(), MAX_INT32, NULL, WERD_RES::outword, page_image, print_em_stats(), reject_all_ems(), reject_suspect_ems(), resolution, SECURE_NAMES, STRING::string(), tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().
Referenced by recog_all_words().
00543 { 00544 PBLOB_LIST *blobs = word->outword->blob_list (); 00545 PBLOB_IT blob_it(blobs); 00546 INT16 i; 00547 CHAR_SAMPLE *sample; 00548 CHAR_SAMPLES_IT c_it = char_clusters; 00549 CHAR_SAMPLE_IT cw_it = chars_waiting; 00550 float score; 00551 float best_score; 00552 char best_char; 00553 CHAR_SAMPLES *best_cluster; 00554 PIXROW_LIST *pixrow_list; 00555 PIXROW_IT pixrow_it; 00556 IMAGELINE *imlines; // lines of the image 00557 BOX pix_box; // box of imlines 00558 // extent 00559 WERD copy_outword; // copy to denorm 00560 BOX b_box; 00561 PBLOB_IT copy_blob_it; 00562 OUTLINE_IT copy_outline_it; 00563 PIXROW *pixrow = NULL; 00564 00565 static INT32 word_number = 0; 00566 00567 #ifndef GRAPHICS_DISABLED 00568 WINDOW demo_win = NULL; 00569 #endif 00570 00571 INT32 resolution = page_image.get_res (); 00572 00573 if (word->word->bounding_box ().height () > resolution / 3) 00574 return; 00575 00576 word_number++; 00577 00578 if (strchr (word->best_choice->string ().string (), 'm') == NULL 00579 && (tessedit_process_rns 00580 && strstr (word->best_choice->string ().string (), "rn") == NULL)) 00581 return; 00582 00583 if (tessedit_reject_ems) 00584 reject_all_ems(word); 00585 else if (tessedit_reject_suspect_ems) 00586 reject_suspect_ems(word); 00587 else { 00588 if (char_clusters->length () == 0) { 00589 #ifndef SECURE_NAMES 00590 if (tessedit_cluster_debug) 00591 tprintf ("No clusters to use for em adaption\n"); 00592 #endif 00593 return; 00594 } 00595 00596 if (!cw_it.empty ()) { 00597 complete_clustering(char_clusters, chars_waiting); 00598 print_em_stats(char_clusters, chars_waiting); 00599 } 00600 00601 if ((!word_adaptable (word, tessedit_em_adaption_mode) || 00602 word->reject_map.reject_count () != 0) 00603 && (strchr (word->best_choice->string ().string (), 'm') != NULL 00604 || (tessedit_process_rns 00605 && strstr (word->best_choice->string ().string (), "rn") != NULL))) { 00606 if (tessedit_process_rns 00607 && strstr (word->best_choice->string ().string 
(), "rn") != NULL) { 00608 copy_outword = *(word->outword); 00609 copy_blob_it.set_to_list (copy_outword.blob_list ()); 00610 i = 0; 00611 while (word->best_choice->string ()[i] != '\0') { 00612 if (word->best_choice->string ()[i] == 'r' 00613 && word->best_choice->string ()[i + 1] == 'n') { 00614 copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ()); 00615 copy_outline_it.add_list_after (copy_blob_it. data_relative (1)-> 00616 out_list ()); 00617 copy_blob_it.forward (); 00618 delete (copy_blob_it.extract ()); 00619 i++; 00620 } 00621 copy_blob_it.forward (); 00622 i++; 00623 } 00624 } 00625 else 00626 copy_outword = *(word->outword); 00627 00628 copy_outword.baseline_denormalise (&word->denorm); 00629 copy_blob_it.set_to_list (copy_outword.blob_list ()); 00630 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); 00631 pixrow_it.set_to_list (pixrow_list); 00632 pixrow_it.move_to_first (); 00633 00634 // For debugging only 00635 b_box = copy_outword.bounding_box (); 00636 pixrow = pixrow_it.data (); 00637 00638 blob_it.move_to_first (); 00639 copy_blob_it.move_to_first (); 00640 for (i = 0; 00641 word->best_choice->string ()[i] != '\0'; 00642 i++, pixrow_it.forward (), blob_it.forward (), 00643 copy_blob_it.forward ()) { 00644 if ((word->best_choice->string ()[i] == 'm' 00645 || (word->best_choice->string ()[i] == 'r' 00646 && word->best_choice->string ()[i + 1] == 'n')) 00647 && !word->reject_map[i].perm_rejected ()) { 00648 if (tessedit_cluster_debug) 00649 tprintf ("Sample %c to check found in %s, index %d\n", 00650 word->best_choice->string ()[i], 00651 word->best_choice->string ().string (), i); 00652 00653 if (tessedit_demo_adaption) 00654 tprintf 00655 ("Sample %c to check found in %s (%d), index %d\n", 00656 word->best_choice->string ()[i], 00657 word->best_choice->string ().string (), word_number, 00658 i); 00659 00660 if (tessedit_matrix_match) { 00661 BOX copy_box = copy_blob_it.data ()->bounding_box (); 00662 00663 sample = 
clip_sample (pixrow_it.data (), 00664 imlines, 00665 pix_box, 00666 copy_outword.flag (W_INVERSE), 00667 word->best_choice->string ()[i]); 00668 00669 //Clip failed 00670 if (sample == NULL) { 00671 tprintf 00672 ("Unable to clip sample from %s, index %d\n", 00673 word->best_choice->string ().string (), i); 00674 #ifndef SECURE_NAMES 00675 if (tessedit_cluster_debug) 00676 tprintf ("Sample rejected (no sample)\n"); 00677 #endif 00678 word->reject_map[i].setrej_mm_reject (); 00679 if (word->best_choice->string ()[i] == 'r') { 00680 word->reject_map[i + 1].setrej_mm_reject (); 00681 i++; 00682 } 00683 continue; 00684 } 00685 } 00686 else 00687 sample = new CHAR_SAMPLE (blob_it.data (), 00688 &word->denorm, 00689 word->best_choice-> 00690 string ()[i]); 00691 00692 best_score = MAX_INT32; 00693 best_char = '\0'; 00694 best_cluster = NULL; 00695 00696 for (c_it.mark_cycle_pt (); 00697 !c_it.cycled_list (); c_it.forward ()) { 00698 if (c_it.data ()->character () != '\0') { 00699 score = c_it.data ()->match_score (sample); 00700 if (score < best_score) { 00701 best_cluster = c_it.data (); 00702 best_score = score; 00703 best_char = c_it.data ()->character (); 00704 } 00705 } 00706 } 00707 00708 if (best_score > tessedit_cluster_t1) { 00709 #ifndef SECURE_NAMES 00710 if (tessedit_cluster_debug) 00711 tprintf ("Sample rejected (score %f)\n", best_score); 00712 if (tessedit_demo_adaption) 00713 tprintf ("Sample rejected (score %f)\n", best_score); 00714 #endif 00715 word->reject_map[i].setrej_mm_reject (); 00716 if (word->best_choice->string ()[i] == 'r') 00717 word->reject_map[i + 1].setrej_mm_reject (); 00718 } 00719 else { 00720 if (word->best_choice->string ()[i] == best_char) { 00721 #ifndef SECURE_NAMES 00722 if (tessedit_cluster_debug) 00723 tprintf ("Sample accepted (score %f)\n", 00724 best_score); 00725 if (tessedit_demo_adaption) 00726 tprintf ("Sample accepted (score %f)\n", 00727 best_score); 00728 #endif 00729 word->reject_map[i].setrej_mm_accept (); 00730 if 
(word->best_choice->string ()[i] == 'r') 00731 word->reject_map[i + 1].setrej_mm_accept (); 00732 } 00733 else { 00734 #ifndef SECURE_NAMES 00735 if (tessedit_cluster_debug) 00736 tprintf ("Sample rejected (char %c, score %f)\n", 00737 best_char, best_score); 00738 if (tessedit_demo_adaption) 00739 tprintf ("Sample rejected (char %c, score %f)\n", 00740 best_char, best_score); 00741 #endif 00742 word->reject_map[i].setrej_mm_reject (); 00743 if (word->best_choice->string ()[i] == 'r') 00744 word->reject_map[i + 1].setrej_mm_reject (); 00745 } 00746 } 00747 00748 if (tessedit_demo_adaption) { 00749 if (strcmp (imagebasename.string (), 00750 tessedit_demo_file.string ()) != 0 00751 || word_number == tessedit_demo_word1 00752 || word_number == tessedit_demo_word2) { 00753 #ifndef GRAPHICS_DISABLED 00754 demo_win = 00755 display_clip_image(©_outword, 00756 page_image, 00757 pixrow_list, 00758 pix_box); 00759 #endif 00760 demo_word = word_number; 00761 best_cluster->match_score (sample); 00762 demo_word = 0; 00763 } 00764 } 00765 if (word->best_choice->string ()[i] == 'r') 00766 i++; // Skip next character 00767 } 00768 } 00769 delete[]imlines; // Free array of imlines 00770 delete pixrow_list; 00771 } 00772 } 00773 }
void adapt_to_good_samples | ( | WERD_RES * | word, | |
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
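Re-check the doubtful characters of a word: each character whose reject-map entry is recoverable (or rejected, when tessedit_mm_all_rejects is set) is clipped from the page image, matched against the character clusters, and then marked as accepted or rejected in the word's reject map.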
word | Word in question | |
char_clusters | list of character clusters that samples are matched against | |
chars_waiting | list of samples still waiting to be assigned to a cluster |
Definition at line 784 of file adaptions.cpp.
References WERD::baseline_denormalise(), WERD::blob_list(), PIXROW::bounding_box(), WERD::bounding_box(), char_clip_word(), CHAR_SAMPLES::character(), clip_sample(), complete_clustering(), debug_fp, demo_word, display_clip_image(), WERD::flag(), IMAGE::get_res(), BOX::height(), imagebasename, CHAR_SAMPLES::match_score(), MAX_INT32, NULL, WERD_RES::outword, page_image, print_em_stats(), resolution, SECURE_NAMES, STRING::string(), tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().
Referenced by classify_word_pass1(), and recog_all_words().
00786 { 00787 PBLOB_LIST *blobs = word->outword->blob_list (); 00788 PBLOB_IT blob_it(blobs); 00789 INT16 i; 00790 CHAR_SAMPLE *sample; 00791 CHAR_SAMPLES_IT c_it = char_clusters; 00792 CHAR_SAMPLE_IT cw_it = chars_waiting; 00793 float score; 00794 float best_score; 00795 char best_char; 00796 CHAR_SAMPLES *best_cluster; 00797 PIXROW_LIST *pixrow_list; 00798 PIXROW_IT pixrow_it; 00799 IMAGELINE *imlines; // lines of the image 00800 BOX pix_box; // box of imlines 00801 // extent 00802 WERD copy_outword; // copy to denorm 00803 BOX b_box; 00804 PBLOB_IT copy_blob_it; 00805 PIXROW *pixrow = NULL; 00806 00807 static INT32 word_number = 0; 00808 00809 #ifndef GRAPHICS_DISABLED 00810 WINDOW demo_win = NULL; 00811 #endif 00812 00813 INT32 resolution = page_image.get_res (); 00814 00815 word_number++; 00816 00817 if (tessedit_test_cluster_input) 00818 return; 00819 00820 if (word->word->bounding_box ().height () > resolution / 3) 00821 return; 00822 00823 if (char_clusters->length () == 0) { 00824 #ifndef SECURE_NAMES 00825 if (tessedit_cluster_debug) 00826 tprintf ("No clusters to use for adaption\n"); 00827 #endif 00828 return; 00829 } 00830 00831 if (!cw_it.empty ()) { 00832 complete_clustering(char_clusters, chars_waiting); 00833 print_em_stats(char_clusters, chars_waiting); 00834 } 00835 00836 if ((!word_adaptable (word, tessedit_cluster_adaption_mode) 00837 && word->reject_map.reject_count () != 0) || tessedit_mm_use_rejmap) { 00838 if (tessedit_cluster_debug) { 00839 tprintf ("\nChecking: \"%s\" MAP ", 00840 word->best_choice->string ().string ()); 00841 word->reject_map.print (debug_fp); 00842 tprintf ("\n"); 00843 } 00844 00845 copy_outword = *(word->outword); 00846 copy_outword.baseline_denormalise (&word->denorm); 00847 copy_blob_it.set_to_list (copy_outword.blob_list ()); 00848 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); 00849 pixrow_it.set_to_list (pixrow_list); 00850 pixrow_it.move_to_first (); 00851 00852 // For debugging only 00853 
b_box = copy_outword.bounding_box (); 00854 pixrow = pixrow_it.data (); 00855 00856 blob_it.move_to_first (); 00857 copy_blob_it.move_to_first (); 00858 for (i = 0; 00859 word->best_choice->string ()[i] != '\0'; 00860 i++, pixrow_it.forward (), blob_it.forward (), 00861 copy_blob_it.forward ()) { 00862 if (word->reject_map[i].recoverable () 00863 || (tessedit_mm_all_rejects && word->reject_map[i].rejected ())) { 00864 BOX copy_box = copy_blob_it.data ()->bounding_box (); 00865 00866 if (tessedit_cluster_debug) 00867 tprintf ("Sample %c to check found in %s, index %d\n", 00868 word->best_choice->string ()[i], 00869 word->best_choice->string ().string (), i); 00870 00871 if (tessedit_demo_adaption) 00872 tprintf ("Sample %c to check found in %s (%d), index %d\n", 00873 word->best_choice->string ()[i], 00874 word->best_choice->string ().string (), 00875 word_number, i); 00876 00877 sample = clip_sample (pixrow_it.data (), 00878 imlines, 00879 pix_box, 00880 copy_outword.flag (W_INVERSE), 00881 word->best_choice->string ()[i]); 00882 00883 if (sample == NULL) { //Clip failed 00884 tprintf ("Unable to clip sample from %s, index %d\n", 00885 word->best_choice->string ().string (), i); 00886 #ifndef SECURE_NAMES 00887 if (tessedit_cluster_debug) 00888 tprintf ("Sample rejected (no sample)\n"); 00889 #endif 00890 word->reject_map[i].setrej_mm_reject (); 00891 00892 continue; 00893 } 00894 00895 best_score = MAX_INT32; 00896 best_char = '\0'; 00897 best_cluster = NULL; 00898 00899 for (c_it.mark_cycle_pt (); 00900 !c_it.cycled_list (); c_it.forward ()) { 00901 if (c_it.data ()->character () != '\0') { 00902 score = c_it.data ()->match_score (sample); 00903 if (score < best_score) { 00904 best_cluster = c_it.data (); 00905 best_score = score; 00906 best_char = c_it.data ()->character (); 00907 } 00908 } 00909 } 00910 00911 if (best_score > tessedit_cluster_t1) { 00912 #ifndef SECURE_NAMES 00913 if (tessedit_cluster_debug) 00914 tprintf ("Sample rejected (score %f)\n", 
best_score); 00915 if (tessedit_demo_adaption) 00916 tprintf ("Sample rejected (score %f)\n", best_score); 00917 #endif 00918 word->reject_map[i].setrej_mm_reject (); 00919 } 00920 else { 00921 if (word->best_choice->string ()[i] == best_char) { 00922 #ifndef SECURE_NAMES 00923 if (tessedit_cluster_debug) 00924 tprintf ("Sample accepted (score %f)\n", best_score); 00925 if (tessedit_demo_adaption) 00926 tprintf ("Sample accepted (score %f)\n", best_score); 00927 #endif 00928 if (tessedit_test_adaption) 00929 word->reject_map[i].setrej_minimal_rej_accept (); 00930 else 00931 word->reject_map[i].setrej_mm_accept (); 00932 } 00933 else { 00934 #ifndef SECURE_NAMES 00935 if (tessedit_cluster_debug) 00936 tprintf ("Sample rejected (char %c, score %f)\n", 00937 best_char, best_score); 00938 if (tessedit_demo_adaption) 00939 tprintf ("Sample rejected (char %c, score %f)\n", 00940 best_char, best_score); 00941 #endif 00942 word->reject_map[i].setrej_mm_reject (); 00943 } 00944 } 00945 00946 if (tessedit_demo_adaption) { 00947 if (strcmp (imagebasename.string (), 00948 tessedit_demo_file.string ()) != 0 00949 || word_number == tessedit_demo_word1 00950 || word_number == tessedit_demo_word2) { 00951 #ifndef GRAPHICS_DISABLED 00952 demo_win = 00953 display_clip_image(©_outword, 00954 page_image, 00955 pixrow_list, 00956 pix_box); 00957 #endif 00958 demo_word = word_number; 00959 best_cluster->match_score (sample); 00960 demo_word = 0; 00961 } 00962 } 00963 } 00964 } 00965 delete[]imlines; // Free array of imlines 00966 delete pixrow_list; 00967 00968 if (tessedit_cluster_debug) { 00969 tprintf ("\nFinal: \"%s\" MAP ", 00970 word->best_choice->string ().string ()); 00971 word->reject_map.print (debug_fp); 00972 tprintf ("\n"); 00973 } 00974 } 00975 }
void check_wait_list | ( | CHAR_SAMPLE_LIST * | chars_waiting, | |
CHAR_SAMPLE * | sample, | |||
CHAR_SAMPLES * | best_cluster | |||
) |
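Repeatedly traverse the wait list, scoring each waiting sample; a waiting sample that scores below tessedit_cluster_t1 is either moved into best_cluster or, when it is already a very good match (score not above tessedit_cluster_t3 and prototypes not in use), only reported as dropped. The traversal repeats until a pass moves no further sample.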
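chars_waiting | list of samples still waiting to be assigned to a cluster |
sample | sample the waiting samples are compared with when prototypes are not in use | |
best_cluster | cluster that matching waiting samples are added to |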
Definition at line 429 of file adaptions.cpp.
References CHAR_SAMPLES::add_sample(), FALSE, CHAR_SAMPLE::match_sample(), CHAR_SAMPLES::match_score(), and tprintf().
Referenced by cluster_sample(), and complete_clustering().
00431 { 00432 CHAR_SAMPLE *wait_sample; 00433 CHAR_SAMPLE *test_sample = sample; 00434 CHAR_SAMPLE_IT cw_it = chars_waiting; 00435 CHAR_SAMPLE_LIST add_list; // Samples added to best cluster 00436 CHAR_SAMPLE_IT add_it = &add_list; 00437 float score; 00438 00439 add_list.clear (); 00440 00441 if (!cw_it.empty ()) { 00442 do { 00443 if (!add_list.empty ()) { 00444 add_it.forward (); 00445 test_sample = add_it.extract (); 00446 best_cluster->add_sample (test_sample); 00447 } 00448 00449 for (cw_it.mark_cycle_pt (); 00450 !cw_it.cycled_list (); cw_it.forward ()) { 00451 wait_sample = cw_it.data (); 00452 if (tessedit_mm_use_prototypes) 00453 score = best_cluster->match_score (wait_sample); 00454 else 00455 score = sample->match_sample (wait_sample, FALSE); 00456 if (score < tessedit_cluster_t1) { 00457 if (score > tessedit_cluster_t3 00458 || tessedit_mm_use_prototypes) { 00459 add_it.add_after_stay_put (cw_it.extract ()); 00460 #ifndef SECURE_NAMES 00461 if (tessedit_cluster_debug) 00462 tprintf 00463 ("Wait sample added to an existing cluster\n"); 00464 #endif 00465 } 00466 else { 00467 #ifndef SECURE_NAMES 00468 if (tessedit_cluster_debug) 00469 tprintf 00470 ("Wait sample dropped, good match to an existing cluster\n"); 00471 #endif 00472 } 00473 } 00474 } 00475 } 00476 while (!add_list.empty ()); 00477 } 00478 }
CHAR_SAMPLE* clip_sample | ( | PIXROW * | pixrow, | |
IMAGELINE * | imlines, | |||
BOX | pix_box, | |||
BOOL8 | white_on_black, | |||
char | c | |||
) |
pixrow | ||
imlines | array of lines [cut from/] of image | |
pix_box | Box of imlines extent | |
white_on_black | 1 or 0; determines whether the clipped image is inverted after clipping (1) or has its background set to white before clipping (0) |
c | e.g., word->best_choice->string ()[i] (THE character we're working on) |
Definition at line 1013 of file adaptions.cpp.
References ASSERT_HOST, PIXROW::bounding_box(), PIXROW::char_clip_image(), IMAGE::create(), IMAGE::get_res(), IMAGE::get_xsize(), IMAGE::get_ysize(), BOX::height(), invert_image(), NULL, BOX::null_box(), page_image, resolution, tprintf(), and BOX::width().
Referenced by adapt_to_good_ems(), adapt_to_good_samples(), collect_characters_for_adaption(), and collect_ems_for_adaption().
01018 { 01019 IMAGE *image = new (IMAGE); // unscaled char image 01020 BOX b_box = pixrow->bounding_box (); 01021 float baseline_pos = 0; // baseline ht in image 01022 INT32 resolution = page_image.get_res (); 01023 01024 if (!b_box.null_box ()) { 01025 ASSERT_HOST (b_box.width () < page_image.get_xsize () && 01026 b_box.height () < page_image.get_ysize ()); 01027 01028 if (b_box.width () > resolution || b_box.height () > resolution) { 01029 tprintf ("clip sample: sample too big (%d x %d)\n", 01030 b_box.width (), b_box.height ()); 01031 01032 return NULL; 01033 } 01034 01035 IMAGE *image = new (IMAGE); 01036 if (image->create (b_box.width (), b_box.height (), 1) == -1) { 01037 tprintf ("clip sample: create image failed (%d x %d)\n", 01038 b_box.width (), b_box.height ()); 01039 01040 delete image; 01041 return NULL; 01042 } 01043 01044 if (!white_on_black) 01045 invert_image(image); // Set background to white 01046 pixrow->char_clip_image (imlines, pix_box, NULL, *image, baseline_pos); 01047 if (white_on_black) 01048 invert_image(image); //invert white on black for scaling &NN 01049 return new CHAR_SAMPLE (image, c); 01050 } 01051 else 01052 return NULL; 01053 }
void cluster_sample | ( | CHAR_SAMPLE * | sample, | |
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
Find best score of sample and assign it to cluster, if not already 'there'?
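sample | sample to be placed in a cluster or on the wait list | |
char_clusters | list of character clusters the sample is scored against | |
chars_waiting | list of samples still waiting to be assigned to a cluster |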
Definition at line 364 of file adaptions.cpp.
References CHAR_SAMPLES::add_sample(), check_wait_list(), MAX_INT32, NULL, and tprintf().
Referenced by collect_characters_for_adaption(), and collect_ems_for_adaption().
00366 { 00367 CHAR_SAMPLES *best_cluster = NULL; 00368 CHAR_SAMPLES_IT c_it = char_clusters; 00369 CHAR_SAMPLE_IT cw_it = chars_waiting; 00370 float score; 00371 float best_score = MAX_INT32; 00372 00373 if (c_it.empty ()) 00374 c_it.add_to_end (new CHAR_SAMPLES (sample)); 00375 else { 00376 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { 00377 score = c_it.data ()->match_score (sample); 00378 if (score < best_score) { 00379 best_score = score; 00380 best_cluster = c_it.data (); 00381 } 00382 } 00383 00384 if (tessedit_cluster_debug) 00385 tprintf ("Sample's best score %f\n", best_score); 00386 00387 if (best_score < tessedit_cluster_t1) { 00388 if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) { 00389 best_cluster->add_sample (sample); 00390 check_wait_list(chars_waiting, sample, best_cluster); 00391 #ifndef SECURE_NAMES 00392 if (tessedit_cluster_debug) 00393 tprintf ("Sample added to an existing cluster\n"); 00394 #endif 00395 } 00396 else { 00397 #ifndef SECURE_NAMES 00398 if (tessedit_cluster_debug) 00399 tprintf 00400 ("Sample dropped, good match to an existing cluster\n"); 00401 #endif 00402 } 00403 } 00404 else if (best_score > tessedit_cluster_t2) { 00405 c_it.add_to_end (new CHAR_SAMPLES (sample)); 00406 #ifndef SECURE_NAMES 00407 if (tessedit_cluster_debug) 00408 tprintf ("New cluster created for this sample\n"); 00409 #endif 00410 } 00411 else { 00412 cw_it.add_to_end (sample); 00413 if (tessedit_cluster_debug) 00414 tprintf ("Sample added to the wait list\n"); 00415 } 00416 } 00417 }
void collect_characters_for_adaption | ( | WERD_RES * | word, | |
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
Within bounding box, collect good matches of blobs into characters 'for adaptation'?
word | Word Results | |
char_clusters | list of character clusters the collected samples are added to | |
chars_waiting | list of samples still waiting to be assigned to a cluster |
tessedit_demo_adaption,
tessedit_cluster_adaption_mode,
tessedit_cluster_debug,
tessedit_test_cluster_input, and
tessedit_mm_use_rejmap
Definition at line 284 of file adaptions.cpp.
References WERD::baseline_denormalise(), WERD::blob_list(), WERD::bounding_box(), char_clip_word(), clip_sample(), cluster_sample(), FALSE, WERD::flag(), IMAGE::get_res(), BOX::height(), NULL, WERD_RES::outword, page_image, resolution, tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().
Referenced by recog_all_words().
00286 { 00287 PBLOB_LIST *blobs = word->outword->blob_list (); 00288 PBLOB_IT blob_it(blobs); 00289 INT16 i; 00290 CHAR_SAMPLE *sample; 00291 PIXROW_LIST *pixrow_list; 00292 PIXROW_IT pixrow_it; 00293 IMAGELINE *imlines; // lines of the image 00294 BOX pix_box; // box of imlines 00295 // extent 00296 WERD copy_outword; // copy to denorm 00297 INT32 resolution = page_image.get_res (); 00298 00299 if (word->word->bounding_box ().height () > resolution / 3) 00300 return; 00301 00302 if (tessedit_demo_adaption) 00303 tessedit_display_mm.set_value (FALSE); // Make sure not set 00304 00305 if ((word_adaptable (word, tessedit_cluster_adaption_mode) 00306 && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) { 00307 if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) 00308 return; // Reject map set to acceptable 00309 /* Collect information about good matches */ 00310 copy_outword = *(word->outword); 00311 copy_outword.baseline_denormalise (&word->denorm); 00312 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); 00313 pixrow_it.set_to_list (pixrow_list); 00314 pixrow_it.move_to_first (); 00315 00316 blob_it.move_to_first (); 00317 for (i = 0; 00318 word->best_choice->string ()[i] != '\0'; 00319 i++, pixrow_it.forward (), blob_it.forward ()) { 00320 00321 if (!(tessedit_mm_use_non_adaption_set 00322 && STRING (tessedit_non_adaption_set).contains (word-> 00323 best_choice-> 00324 string ()[i])) 00325 || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) { 00326 #ifndef SECURE_NAMES 00327 if (tessedit_cluster_debug) 00328 tprintf ("Sample %c for adaption found in %s, index %d\n", 00329 word->best_choice->string ()[i], 00330 word->best_choice->string ().string (), i); 00331 #endif 00332 sample = clip_sample (pixrow_it.data (), 00333 imlines, 00334 pix_box, 00335 copy_outword.flag (W_INVERSE), 00336 word->best_choice->string ()[i]); 00337 00338 if (sample == NULL) { // Clip failed 00339 #ifndef SECURE_NAMES 00340 tprintf 
("Unable to clip sample from %s, index %d\n", 00341 word->best_choice->string ().string (), i); 00342 #endif 00343 continue; 00344 } 00345 cluster_sample(sample, char_clusters, chars_waiting); 00346 } 00347 } 00348 delete[]imlines; // Free array of imlines 00349 delete pixrow_list; 00350 } 00351 else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) 00352 word->reject_map.rej_word_tess_failure (); // Set word to all rejects 00353 }
void collect_ems_for_adaption | ( | WERD_RES * | word, | |
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
Within bounding box, collect blobs that might be 'm's for adaptation'?
word | Word Results | |
char_clusters | list of character clusters the collected samples are added to | |
chars_waiting | list of samples still waiting to be assigned to a cluster |
lines of the image
box of imlines
copy to denorm
Make sure not set
Definition at line 163 of file adaptions.cpp.
References WERD::baseline_denormalise(), WERD::blob_list(), WERD::bounding_box(), char_clip_word(), clip_sample(), cluster_sample(), FALSE, WERD::flag(), IMAGE::get_res(), BOX::height(), NULL, WERD_RES::outword, page_image, resolution, tprintf(), W_INVERSE, WERD_RES::word, and word_adaptable().
Referenced by recog_all_words().
00165 { 00166 PBLOB_LIST *blobs = word->outword->blob_list (); 00167 PBLOB_IT blob_it(blobs); 00168 INT16 i; 00169 CHAR_SAMPLE *sample; 00170 PIXROW_LIST *pixrow_list; 00171 PIXROW_IT pixrow_it; 00172 IMAGELINE *imlines; 00173 BOX pix_box; 00174 // extent 00175 WERD copy_outword; 00176 PBLOB_IT copy_blob_it; 00177 OUTLINE_IT copy_outline_it; 00178 INT32 resolution = page_image.get_res (); 00179 00180 if (tessedit_reject_ems || tessedit_reject_suspect_ems) 00181 return; // Do nothing 00182 00183 if (word->word->bounding_box ().height () > resolution / 3) 00184 return; 00185 00186 if (tessedit_demo_adaption) 00187 tessedit_display_mm.set_value (FALSE); 00188 00189 if (word_adaptable (word, tessedit_em_adaption_mode) 00190 && word->reject_map.reject_count () == 0 00191 && (strchr (word->best_choice->string ().string (), 'm') != NULL 00192 || (tessedit_process_rns 00193 && strstr (word->best_choice->string ().string (), "rn") != NULL))) { 00194 if (tessedit_process_rns 00195 && strstr (word->best_choice->string ().string (), "rn") != NULL) { 00196 copy_outword = *(word->outword); 00197 copy_blob_it.set_to_list (copy_outword.blob_list ()); 00198 i = 0; 00199 while (word->best_choice->string ()[i] != '\0') { 00200 if (word->best_choice->string ()[i] == 'r' 00201 && word->best_choice->string ()[i + 1] == 'n') { 00202 copy_outline_it.set_to_list (copy_blob_it.data ()-> 00203 out_list ()); 00204 copy_outline_it.add_list_after (copy_blob_it. 
00205 data_relative (1)-> 00206 out_list ()); 00207 copy_blob_it.forward (); 00208 delete (copy_blob_it.extract ()); 00209 i++; 00210 } 00211 copy_blob_it.forward (); 00212 i++; 00213 } 00214 } 00215 else 00216 copy_outword = *(word->outword); 00217 00218 copy_outword.baseline_denormalise (&word->denorm); 00219 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); 00220 pixrow_it.set_to_list (pixrow_list); 00221 pixrow_it.move_to_first (); 00222 00223 blob_it.move_to_first (); 00224 for (i = 0; 00225 word->best_choice->string ()[i] != '\0'; 00226 i++, pixrow_it.forward (), blob_it.forward ()) { 00227 00228 if (word->best_choice->string ()[i] == 'm' 00229 || (word->best_choice->string ()[i] == 'r' 00230 && word->best_choice->string ()[i + 1] == 'n')) { 00231 #ifndef SECURE_NAMES 00232 if (tessedit_cluster_debug) 00233 tprintf ("Sample %c for adaption found in %s, index %d\n", 00234 word->best_choice->string ()[i], 00235 word->best_choice->string ().string (), i); 00236 #endif 00237 if (tessedit_matrix_match) { 00238 sample = clip_sample (pixrow_it.data (), 00239 imlines, 00240 pix_box, 00241 copy_outword.flag (W_INVERSE), 00242 word->best_choice->string ()[i]); 00243 00244 if (sample == NULL) { //Clip failed 00245 #ifndef SECURE_NAMES 00246 tprintf ("Unable to clip sample from %s, index %d\n", 00247 word->best_choice->string ().string (), i); 00248 #endif 00249 if (word->best_choice->string ()[i] == 'r') 00250 i++; 00251 00252 continue; 00253 } 00254 } 00255 else 00256 sample = new CHAR_SAMPLE (blob_it.data (), 00257 &word->denorm, 00258 word->best_choice->string ()[i]); 00259 00260 cluster_sample(sample, char_clusters, chars_waiting); 00261 00262 if (word->best_choice->string ()[i] == 'r') 00263 i++; // Skip next character 00264 } 00265 } 00266 delete[]imlines; // Free array of imlines 00267 delete pixrow_list; 00268 } 00269 }
void complete_clustering | ( | CHAR_SAMPLES_LIST * | char_clusters, | |
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
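Cluster until chars_waiting is used up: each remaining waiting sample starts a new cluster, and every cluster is then assigned to a character.
char_clusters | list of character clusters; new clusters are appended to it |
chars_waiting | list of samples still waiting to be assigned to a cluster; empty on return |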
tessedit_cluster_debug,
tessedit_mm_adapt_using_prototypes, and
tessedit_demo_adaption
Definition at line 490 of file adaptions.cpp.
References check_wait_list(), display_cluster_prototypes(), and tprintf().
Referenced by adapt_to_good_ems(), and adapt_to_good_samples().
00491 { 00492 CHAR_SAMPLES *best_cluster; 00493 CHAR_SAMPLES_IT c_it = char_clusters; 00494 CHAR_SAMPLE_IT cw_it = chars_waiting; 00495 CHAR_SAMPLE *sample; 00496 INT32 total_sample_count = 0; 00497 00498 while (!cw_it.empty ()) { 00499 cw_it.move_to_first (); 00500 sample = cw_it.extract (); 00501 best_cluster = new CHAR_SAMPLES (sample); 00502 c_it.add_to_end (best_cluster); 00503 check_wait_list(chars_waiting, sample, best_cluster); 00504 } 00505 00506 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { 00507 c_it.data ()->assign_to_char (); 00508 if (tessedit_use_best_sample) 00509 c_it.data ()->find_best_sample (); 00510 else if (tessedit_mm_adapt_using_prototypes) 00511 c_it.data ()->build_prototype (); 00512 00513 if (tessedit_cluster_debug) 00514 total_sample_count += c_it.data ()->n_samples (); 00515 } 00516 #ifndef SECURE_NAMES 00517 if (tessedit_cluster_debug) 00518 tprintf ("Clustering completed, %d samples in all\n", total_sample_count); 00519 #endif 00520 00521 #ifndef GRAPHICS_DISABLED 00522 if (tessedit_demo_adaption) 00523 display_cluster_prototypes(char_clusters); 00524 #endif 00525 00526 }
void display_cluster_prototypes | ( | CHAR_SAMPLES_LIST * | char_clusters | ) |
Dump out prototypes.
char_clusters | list of character clusters whose prototypes are displayed |
Definition at line 1063 of file adaptions.cpp.
References display_image, FALSE, NULL, tprintf(), and WINDOWNAMESIZE.
Referenced by complete_clustering().
01063 { 01064 INT16 proto_number = 0; 01065 CHAR_SAMPLES_IT c_it = char_clusters; 01066 char title[WINDOWNAMESIZE]; 01067 01068 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { 01069 proto_number++; 01070 01071 #ifndef SECURE_NAMES 01072 tprintf ("Displaying proto number %d\n", proto_number); 01073 #endif 01074 01075 if (c_it.data ()->prototype () != NULL) { 01076 sprintf (title, "Proto - %d", proto_number); 01077 display_image (c_it.data ()->prototype ()->make_image (), 01078 title, (proto_number - 1) * 400, 0, FALSE); 01079 } 01080 } 01081 }
void print_em_stats | ( | CHAR_SAMPLES_LIST * | char_clusters, | |
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
Dump out stats.
char_clusters | list of character clusters to report on |
chars_waiting | list of samples still waiting to be assigned to a cluster |
Definition at line 985 of file adaptions.cpp.
References debug_fp, and tprintf().
Referenced by adapt_to_good_ems(), adapt_to_good_samples(), and recog_all_words().
00986 { 00987 CHAR_SAMPLES_IT c_it = char_clusters; 00988 00989 if (!tessedit_cluster_debug) 00990 return; 00991 #ifndef SECURE_NAMES 00992 tprintf ("There are %d clusters and %d samples waiting\n", 00993 char_clusters->length (), chars_waiting->length ()); 00994 00995 for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) 00996 c_it.data ()->print (debug_fp); 00997 #endif 00998 tprintf ("\n"); 00999 }
void reject_all_ems | ( | WERD_RES * | word | ) |
Simplistic routine to test the effect of rejecting ems (i.e., the letter 'm').
word | word to be processed |
Definition at line 1090 of file adaptions.cpp.
References WERD_RES::best_choice, and WERD_RES::reject_map.
Referenced by adapt_to_good_ems().
01090 { 01091 INT16 i; 01092 01093 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01094 if (word->best_choice->string ()[i] == 'm') 01095 // reject all ems 01096 word->reject_map[i].setrej_mm_reject (); 01097 } 01098 }
void reject_all_fullstops | ( | WERD_RES * | word | ) |
Simplistic routine to test the effect of rejecting fullstops (i.e., '.').
word | word to be processed |
Definition at line 1106 of file adaptions.cpp.
References WERD_RES::best_choice, and WERD_RES::reject_map.
Referenced by recog_all_words().
01106 { 01107 INT16 i; 01108 01109 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01110 if (word->best_choice->string ()[i] == '.') 01111 // reject all fullstops 01112 word->reject_map[i].setrej_mm_reject (); 01113 } 01114 }
void reject_suspect_ems | ( | WERD_RES * | word | ) |
Reject suspect ems (each letter 'm' that suspect_em() flags) if word is not adaptable.
word | word to be processed |
Definition at line 1123 of file adaptions.cpp.
References suspect_em(), and word_adaptable().
Referenced by adapt_to_good_ems().
01123 { 01124 INT16 i; 01125 01126 if (!word_adaptable (word, tessedit_cluster_adaption_mode)) 01127 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01128 if (word->best_choice->string ()[i] == 'm' && suspect_em (word, i)) 01129 // reject all ems 01130 word->reject_map[i].setrej_mm_reject (); 01131 } 01132 }
void reject_suspect_fullstops | ( | WERD_RES * | word | ) |
Reject suspect fullstops (each '.' that suspect_fullstop() flags). Only '.' is examined; commas are not tested by this routine.
word | word to be processed |
Definition at line 1140 of file adaptions.cpp.
References WERD_RES::best_choice, WERD_RES::reject_map, and suspect_fullstop().
Referenced by recog_all_words().
01140 { 01141 INT16 i; 01142 01143 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01144 if (word->best_choice->string ()[i] == '.' 01145 && suspect_fullstop (word, i)) 01146 word->reject_map[i].setrej_mm_reject (); // reject all commas 01147 } 01148 }
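Suspect that 'm' in word is something else (like 'rn'). Returns true when the blob at the given position does not consist of exactly one outline.
word | word to be processed | |
index | position of the character in the word's best choice string; the blob iterator is advanced this many steps to reach the blob that is tested |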
Definition at line 1158 of file adaptions.cpp.
References WERD::blob_list(), and WERD_RES::outword.
Referenced by reject_suspect_ems().
01158 { 01159 PBLOB_LIST *blobs = word->outword->blob_list (); 01160 PBLOB_IT blob_it(blobs); 01161 INT16 j; 01162 01163 for (j = 0; j < index; j++) 01164 blob_it.forward (); 01165 01166 return (blob_it.data ()->out_list ()->length () != 1); 01167 }
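Suspect that '.' in word is something else. Returns true when the aspect ratio (longer side divided by shorter side) of the blob's bounding box exceeds tessed_fullstop_aspect_ratio.
word | word to be processed | |
i | Same as index in suspect_em(): position of the character whose blob is tested |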
Definition at line 1177 of file adaptions.cpp.
References WERD::blob_list(), BOX::height(), WERD_RES::outword, and BOX::width().
Referenced by reject_suspect_fullstops().
01177 { 01178 float aspect_ratio; 01179 PBLOB_LIST *blobs = word->outword->blob_list (); 01180 PBLOB_IT blob_it(blobs); 01181 INT16 j; 01182 BOX box; 01183 INT16 width; 01184 INT16 height; 01185 01186 for (j = 0; j < i; j++) 01187 blob_it.forward (); 01188 01189 box = blob_it.data ()->bounding_box (); 01190 01191 width = box.width (); 01192 height = box.height (); 01193 01194 aspect_ratio = ((width > height) ? ((float) width) / height : 01195 ((float) height) / width); 01196 01197 return (aspect_ratio > tessed_fullstop_aspect_ratio); 01198 }
Determine if word IS adaptable.
word | Word Results | |
mode | ex: tessedit_cluster_adaption_mode ("Adaptation decision algorithm for matrix matcher") |
Definition at line 100 of file adaptions.cpp.
References ADAPTABLE_WERD, WERD_RES::best_choice, BITS16::bit(), FALSE, FREQ_DAWG_PERM, NoDangerousAmbig(), NULL, NUMBER_PERM, one_ell_conflict(), SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, WERD_RES::tess_would_adapt, and USER_DAWG_PERM.
Referenced by adapt_to_good_ems(), adapt_to_good_samples(), classify_word_pass1(), collect_characters_for_adaption(), collect_ems_for_adaption(), recog_all_words(), and reject_suspect_ems().
00102 { 00103 BOOL8 status = FALSE; 00104 BITS16 flags(mode); 00105 00106 enum MODES 00107 { 00108 ADAPTABLE_WERD, 00109 ACCEPTABLE_WERD, 00110 CHECK_DAWGS, 00111 CHECK_SPACES, 00112 CHECK_ONE_ELL_CONFLICT, 00113 CHECK_AMBIG_WERD 00114 }; 00115 00116 /* 00117 0: NO adaption 00118 */ 00119 if (mode == 0) { 00120 return FALSE; 00121 } 00122 00123 if (flags.bit (ADAPTABLE_WERD)) 00124 status |= word->tess_would_adapt; 00125 00126 if (flags.bit (ACCEPTABLE_WERD)) 00127 status |= word->tess_accepted; 00128 00129 if (!status) // If not set then 00130 return FALSE; // ignore other checks 00131 00132 if (flags.bit (CHECK_DAWGS) && 00133 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) && 00134 (word->best_choice->permuter () != FREQ_DAWG_PERM) && 00135 (word->best_choice->permuter () != USER_DAWG_PERM) && 00136 (word->best_choice->permuter () != NUMBER_PERM)) 00137 return FALSE; 00138 00139 if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) 00140 return FALSE; 00141 00142 if (flags.bit (CHECK_SPACES) && 00143 (strchr (word->best_choice->string ().string (), ' ') != NULL)) 00144 return FALSE; 00145 00146 // if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word)) 00147 if (flags.bit (CHECK_AMBIG_WERD) && 00148 !NoDangerousAmbig(word->best_choice->string().string(), NULL)) 00149 return FALSE; 00150 00151 return status; 00152 }