#include "control.h"
#include "notdll.h"
Go to the source code of this file.
enum GARBAGE_LEVEL |
Type of document quality metrics
Definition at line 31 of file docqual.h.
/**
 * Verdict produced by garbage_word() on how junk-like a recognised word is.
 * tilde_crunch() feeds the value to terrible_word_crunch() and
 * potential_word_crunch().
 */
enum GARBAGE_LEVEL
{
  G_NEVER_CRUNCH,                //looks like a real string; tilde_crunch() skips both crunch tests
  G_OK,                          //no strong evidence of garbage
  G_DODGY,                       //too many rejects / bad / isolated chars for its length
  G_TERRIBLE                     //rejects outnumber good chars, or rejects + bad chars exceed half the word
};
void convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
Converts all '~' to '-' & '^' to ' ' in this word.
word_res | Word results |
Definition at line 1106 of file docqual.cpp.
References WERD_RES::best_choice, REJMAP::length(), and WERD_RES::reject_map.
Referenced by tilde_crunch().
01107 { 01108 char *ptr; //string ptr 01109 int i; 01110 01111 ptr = (char *) word_res->best_choice->string ().string (); 01112 for (i = 0; i < word_res->reject_map.length (); i++) { 01113 if (ptr[i] == '~') { 01114 ptr[i] = '-'; 01115 if (word_res->reject_map[i].accepted ()) 01116 word_res->reject_map[i].setrej_unlv_rej (); 01117 } 01118 if (ptr[i] == '^') { 01119 ptr[i] = ' '; 01120 if (word_res->reject_map[i].accepted ()) 01121 word_res->reject_map[i].setrej_unlv_rej (); 01122 } 01123 } 01124 }
Test if character is in known-odd categories.
c | character | |
outline_count | outlines we have |
Definition at line 475 of file docqual.cpp.
Referenced by unrej_good_chs(), word_char_quality(), and word_outline_errs().
00475 { 00476 int expected_outline_count; 00477 00478 if (STRING (outlines_odd).contains (c)) 00479 return 0; //Dont use this char 00480 else if (STRING (outlines_2).contains (c)) 00481 expected_outline_count = 2; 00482 else 00483 expected_outline_count = 1; 00484 return abs (outline_count - expected_outline_count); 00485 }
Compares two blobs' bounding boxes & lengths of outlines.
blob1 | first blob | |
blob2 | second blob |
Definition at line 227 of file docqual.cpp.
References PBLOB::bounding_box(), BOX::contains(), FALSE, PBLOB::out_list(), and TRUE.
Referenced by unrej_good_chs(), word_blob_quality(), and word_char_quality().
00227 { 00228 BOX box1 = blob1->bounding_box (); 00229 BOX box2 = blob2->bounding_box (); 00230 00231 if (box1.contains (box2) && 00232 box2.contains (box1) && 00233 (blob1->out_list ()->length () == blob1->out_list ()->length ())) 00234 return TRUE; 00235 else 00236 return FALSE; 00237 }
void doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, | |
BOOL8 | good_quality_doc | |||
) |
Reject big chunks.
page_res_it | pointer to page in question ? | |
good_quality_doc | 1 or 0, document is good |
If the page has too many rejects, reject all of it. Otherwise, if any block has too many rejects, reject the words in that block; within the remaining blocks, rows with too many rejects are handled the same way.
Definition at line 608 of file docqual.cpp.
References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, BLOCK_RES::block, PAGE_RES_IT::block(), ROW_RES::char_count, BLOCK_RES::char_count, FALSE, PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, PAGE_RES_IT::prev_row(), ROW_RES::rej_count, BLOCK_RES::rej_count, REJMAP::rej_word_block_rej(), REJMAP::reject_count(), WERD_RES::reject_map, WERD_RES::reject_spaces, reject_whole_page(), PAGE_RES_IT::restart_page(), ROW_RES::row, PAGE_RES_IT::row(), SECURE_NAMES, WERD::space(), tprintf(), TRUE, ROW_RES::whole_word_rej_count, WERD_RES::word, PAGE_RES_IT::word(), and word_char_quality().
Referenced by quality_based_rejection().
/**
 * Reject text in large units when the reject ratio is too high.
 *
 * Order of tests:
 *   1. Whole page: if rejects exceed tessedit_reject_doc_percent of the
 *      characters, reject_whole_page() is called and nothing else is done.
 *   2. Per block: if the block's reject percentage exceeds
 *      tessedit_reject_block_percent, its words are block-rejected.
 *   3. Per row (only in blocks that were not rejected): a row is rejected
 *      when its reject percentage is too high AND the share of those rejects
 *      coming from whole-word rejects is below a limit.
 *
 * Several tessedit_preserve_... / tessedit_dont_... controls let individual words
 * survive a block or row rejection.
 *
 * @param page_res_it       iterator over the page results; it is restarted
 *                          and walked to the end of the page
 * @param good_quality_doc  non-zero if the document was judged good; only
 *                          consulted for row rejection
 */
void doc_and_block_rejection(PAGE_RES_IT &page_res_it,
                             BOOL8 good_quality_doc) {
  INT16 block_no = 0;
  INT16 row_no = 0;
  BLOCK_RES *current_block;
  ROW_RES *current_row;

  BOOL8 rej_word;
  BOOL8 prev_word_rejected;
  INT16 char_quality;
  INT16 accepted_char_quality;   // filled by word_char_quality(), not read here

  // NOTE(review): char_count is not tested for zero here, unlike the block
  // and row tests below; the division is done in floating point.
  if ((page_res_it.page_res->rej_count * 100.0 /
       page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
    reject_whole_page(page_res_it);
    #ifndef SECURE_NAMES
    if (tessedit_debug_doc_rejection) {
      tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
    }
    #endif
  }
  else {
    #ifndef SECURE_NAMES
    if (tessedit_debug_doc_rejection)
      tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
    #endif

    /* Walk blocks testing for block rejection */

    page_res_it.restart_page ();
    while (page_res_it.word () != NULL) {
      current_block = page_res_it.block ();
      // block_no is only used in the debug messages below.
      if (current_block->block->text_region () != NULL)
        block_no = current_block->block->text_region ()->id_no ();
      else
        block_no = -1;
      if ((page_res_it.block ()->char_count > 0) &&
          ((page_res_it.block ()->rej_count * 100.0 /
            page_res_it.block ()->char_count) >
           tessedit_reject_block_percent)) {
        #ifndef SECURE_NAMES
        if (tessedit_debug_block_rejection)
          tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
                   block_no,
                   page_res_it.block ()->char_count,
                   page_res_it.block ()->rej_count);
        #endif
        prev_word_rejected = FALSE;
        // Consume every word of this block, deciding word by word.
        while ((page_res_it.word () != NULL) &&
               (page_res_it.block () == current_block)) {
          if (tessedit_preserve_blk_rej_perfect_wds) {
            // Reject only words that already contain a reject or are
            // shorter than the minimum preserved length ...
            rej_word =
              (page_res_it.word ()->reject_map.reject_count () > 0)
              || (page_res_it.word ()->reject_map.length () <
                  tessedit_preserve_min_wd_len);
            // ... and give long-enough, acceptable-looking words a second
            // chance: keep them if every character passes the quality test.
            if (rej_word && tessedit_dont_blkrej_good_wds
                && !(page_res_it.word ()->reject_map.length () <
                     tessedit_preserve_min_wd_len)
                &&
                (acceptable_word_string
                 (page_res_it.word ()->best_choice->string ().
                  string ()) != AC_UNACCEPTABLE)) {
              word_char_quality (page_res_it.word (),
                                 page_res_it.row ()->row,
                                 &char_quality,
                                 &accepted_char_quality);
              rej_word = char_quality !=
                page_res_it.word ()->reject_map.length ();
            }
          }
          else
            rej_word = TRUE;
          if (rej_word) {
            /*
              Reject spacing if both current and prev words are rejected.
              NOTE - this is NOT restricted to FUZZY spaces.
              When tried this generated more space errors.
            */
            if (tessedit_use_reject_spaces &&
                prev_word_rejected &&
                (page_res_it.prev_row () == page_res_it.row ()) &&
                (page_res_it.word ()->word->space () == 1))
              page_res_it.word ()->reject_spaces = TRUE;
            page_res_it.word ()->reject_map.rej_word_block_rej ();
          }
          prev_word_rejected = rej_word;
          page_res_it.forward ();
        }
      }
      else {
        #ifndef SECURE_NAMES
        if (tessedit_debug_block_rejection)
          tprintf
            ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
             block_no, page_res_it.block ()->char_count,
             page_res_it.block ()->rej_count);
        #endif

        /* Walk rows in block testing for row rejection */
        row_no = 0;
        while ((page_res_it.word () != NULL) &&
               (page_res_it.block () == current_block)) {
          current_row = page_res_it.row ();
          row_no++;              // 1-based row number within the block, for debug output
          /*
            Reject whole row if:
            fraction of chars on row which are rejected exceed a limit AND
            fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit
          */
          if ((page_res_it.row ()->char_count > 0) &&
              ((page_res_it.row ()->rej_count * 100.0 /
                page_res_it.row ()->char_count) >
               tessedit_reject_row_percent) &&
              ((page_res_it.row ()->whole_word_rej_count * 100.0 /
                page_res_it.row ()->rej_count) <
               tessedit_whole_wd_rej_row_percent)) {
            #ifndef SECURE_NAMES
            if (tessedit_debug_block_rejection)
              tprintf
                ("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
                 row_no, page_res_it.row ()->char_count,
                 page_res_it.row ()->rej_count);
            #endif
            prev_word_rejected = FALSE;
            // Consume every word of this row, deciding word by word.
            while ((page_res_it.word () != NULL) &&
                   (page_res_it.row () == current_row)) {
              /* Preserve words on good docs unless they are mostly rejected*/
              if (!tessedit_row_rej_good_docs && good_quality_doc) {
                rej_word =
                  page_res_it.word ()->reject_map.
                  reject_count () /
                  (float) page_res_it.word ()->reject_map.
                  length () > tessedit_good_doc_still_rowrej_wd;
              }

              /* Preserve perfect words anyway */
              else if (tessedit_preserve_row_rej_perfect_wds) {
                rej_word =
                  (page_res_it.word ()->reject_map.
                   reject_count () > 0)
                  || (page_res_it.word ()->reject_map.
                      length () < tessedit_preserve_min_wd_len);
                // Same second-chance test as in the block case above.
                if (rej_word && tessedit_dont_rowrej_good_wds
                    && !(page_res_it.word ()->reject_map.
                         length () <
                         tessedit_preserve_min_wd_len)
                    &&
                    (acceptable_word_string
                     (page_res_it.word ()->best_choice->
                      string ().string ()) != AC_UNACCEPTABLE)) {
                  word_char_quality (page_res_it.word (),
                                     page_res_it.row ()->row,
                                     &char_quality,
                                     &accepted_char_quality);
                  rej_word = char_quality !=
                    page_res_it.word ()->reject_map.length ();
                }
              }
              else
                rej_word = TRUE;
              if (rej_word) {
                /*
                  Reject spacing if both current and prev words are rejected.
                  NOTE - this is NOT restricted to FUZZY spaces.
                  When tried this generated more space errors.
                */
                if (tessedit_use_reject_spaces &&
                    prev_word_rejected &&
                    (page_res_it.prev_row () ==
                     page_res_it.row ())
                    && (page_res_it.word ()->word->space () ==
                        1))
                  page_res_it.word ()->reject_spaces = TRUE;
                page_res_it.word ()->reject_map.
                  rej_word_row_rej();
              }
              prev_word_rejected = rej_word;
              page_res_it.forward ();
            }
          }
          else {
            #ifndef SECURE_NAMES
            if (tessedit_debug_block_rejection)
              tprintf
                ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
                 row_no, page_res_it.row ()->char_count,
                 page_res_it.row ()->rej_count);
            #endif
            // Row is kept: just skip over its words.
            while ((page_res_it.word () != NULL) &&
                   (page_res_it.row () == current_row))
              page_res_it.forward ();
          }
        }
      }
    }
  }
}
Count up all the blanks (' ') in word's best_choice string.
word | Word |
Definition at line 1484 of file docqual.cpp.
References WERD_RES::best_choice.
Referenced by word_deletable().
01484 { 01485 char *str = (char *) word->best_choice->string ().string (); 01486 int tess_rejs = 0; 01487 01488 for (; *str != '\0'; str++) { 01489 if (*str == ' ') 01490 tess_rejs++; 01491 } 01492 return tess_rejs; 01493 }
GARBAGE_LEVEL garbage_word | ( | WERD_RES * | word, | |
BOOL8 | ok_dict_word | |||
) |
Determine how likely it is that the word is garbage, using many heuristics/rules; the verdict is returned as a GARBAGE_LEVEL.
word | Word in question | |
ok_dict_word | 0 or 1, 1 if word in dictionary |
Definition at line 1181 of file docqual.cpp.
References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, FREQ_DAWG_PERM, G_DODGY, G_NEVER_CRUNCH, G_OK, G_TERRIBLE, REJMAP::length(), NULL, NUMBER_PERM, WERD_RES::reject_map, SYSTEM_DAWG_PERM, tprintf(), and USER_DAWG_PERM.
Referenced by tilde_crunch().
01181 { 01182 enum STATES 01183 { 01184 JUNK, 01185 FIRST_UPPER, 01186 FIRST_LOWER, 01187 FIRST_NUM, 01188 SUBSEQUENT_UPPER, 01189 SUBSEQUENT_LOWER, 01190 SUBSEQUENT_NUM 01191 }; 01192 char *str = (char *) word->best_choice->string ().string (); 01193 STATES state = JUNK; 01194 int len = 0; 01195 int isolated_digits = 0; 01196 int isolated_alphas = 0; 01197 int bad_char_count = 0; 01198 int tess_rejs = 0; 01199 int dodgy_chars = 0; 01200 int ok_chars; 01201 char last_char = ' '; 01202 int alpha_repetition_count = 0; 01203 int longest_alpha_repetition_count = 0; 01204 int longest_lower_run_len = 0; 01205 int lower_string_count = 0; 01206 int longest_upper_run_len = 0; 01207 int upper_string_count = 0; 01208 int total_alpha_count = 0; 01209 int total_digit_count = 0; 01210 01211 /* Step 1: Scan letters of word and set up a bunch of variables 01212 Working at the level of individual letters */ 01213 for (; *str != '\0'; str++) { 01214 len++; 01215 if (isupper (*str)) { 01216 total_alpha_count++; 01217 switch (state) { 01218 case SUBSEQUENT_UPPER: 01219 case FIRST_UPPER: 01220 state = SUBSEQUENT_UPPER; 01221 upper_string_count++; 01222 if (longest_upper_run_len < upper_string_count) 01223 longest_upper_run_len = upper_string_count; 01224 if (last_char == *str) { 01225 alpha_repetition_count++; 01226 if (longest_alpha_repetition_count < alpha_repetition_count) { 01227 longest_alpha_repetition_count = alpha_repetition_count; 01228 } 01229 } 01230 else { 01231 last_char = *str; 01232 alpha_repetition_count = 1; 01233 } 01234 break; 01235 case FIRST_NUM: 01236 isolated_digits++; 01237 default: 01238 state = FIRST_UPPER; 01239 last_char = *str; 01240 alpha_repetition_count = 1; 01241 upper_string_count = 1; 01242 break; 01243 } 01244 } 01245 else if (islower (*str)) { 01246 total_alpha_count++; 01247 switch (state) { 01248 case SUBSEQUENT_LOWER: 01249 case FIRST_LOWER: 01250 state = SUBSEQUENT_LOWER; 01251 lower_string_count++; 01252 if (longest_lower_run_len < 
lower_string_count) 01253 longest_lower_run_len = lower_string_count; 01254 if (last_char == *str) { 01255 alpha_repetition_count++; 01256 if (longest_alpha_repetition_count < alpha_repetition_count) { 01257 longest_alpha_repetition_count = alpha_repetition_count; 01258 } 01259 } 01260 else { 01261 last_char = *str; 01262 alpha_repetition_count = 1; 01263 } 01264 break; 01265 case FIRST_NUM: 01266 isolated_digits++; 01267 default: 01268 state = FIRST_LOWER; 01269 last_char = *str; 01270 alpha_repetition_count = 1; 01271 lower_string_count = 1; 01272 break; 01273 } 01274 } 01275 else if (isdigit (*str)) { 01276 total_digit_count++; 01277 switch (state) { 01278 case FIRST_NUM: 01279 state = SUBSEQUENT_NUM; 01280 case SUBSEQUENT_NUM: 01281 break; 01282 case FIRST_UPPER: 01283 case FIRST_LOWER: 01284 isolated_alphas++; 01285 default: 01286 state = FIRST_NUM; 01287 break; 01288 } 01289 } 01290 else { 01291 if (*str == ' ') 01292 tess_rejs++; 01293 else 01294 bad_char_count++; 01295 switch (state) { 01296 case FIRST_NUM: 01297 isolated_digits++; 01298 break; 01299 case FIRST_UPPER: 01300 case FIRST_LOWER: 01301 isolated_alphas++; 01302 default: 01303 break; 01304 } 01305 state = JUNK; 01306 } 01307 } 01308 01309 /* Step 2: Combine result of Step 1 with heuristics to determine 01310 whether word is garbage. 
Working at the level of whole word */ 01311 switch (state) { 01312 case FIRST_NUM: 01313 isolated_digits++; 01314 break; 01315 case FIRST_UPPER: 01316 case FIRST_LOWER: 01317 isolated_alphas++; 01318 default: 01319 break; 01320 } 01321 01322 if (crunch_include_numerals) { 01323 total_alpha_count += total_digit_count - isolated_digits; 01324 } 01325 01326 if (crunch_leave_ok_strings && 01327 (len >= 4) && 01328 (2 * (total_alpha_count - isolated_alphas) > len) && 01329 (longest_alpha_repetition_count < crunch_long_repetitions)) { 01330 if ((crunch_accept_ok && 01331 (acceptable_word_string (str) != AC_UNACCEPTABLE)) || 01332 (longest_lower_run_len > crunch_leave_lc_strings) || 01333 (longest_upper_run_len > crunch_leave_uc_strings)) 01334 return G_NEVER_CRUNCH; 01335 } 01336 if ((word->reject_map.length () > 1) && 01337 (strpbrk (str, " ") == NULL) && 01338 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || 01339 (word->best_choice->permuter () == FREQ_DAWG_PERM) || 01340 (word->best_choice->permuter () == USER_DAWG_PERM) || 01341 (word->best_choice->permuter () == NUMBER_PERM) || 01342 (acceptable_word_string (str) != AC_UNACCEPTABLE) || ok_dict_word)) 01343 return G_OK; 01344 01345 ok_chars = len - bad_char_count - isolated_digits - 01346 isolated_alphas - tess_rejs; 01347 01348 if (crunch_debug > 3) { 01349 tprintf ("garbage_word: \"%s\"\n", 01350 word->best_choice->string ().string ()); 01351 tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", 01352 len, 01353 bad_char_count, isolated_digits, isolated_alphas, tess_rejs); 01354 } 01355 if ((bad_char_count == 0) && 01356 (tess_rejs == 0) && 01357 ((len > isolated_digits + isolated_alphas) || (len <= 2))) 01358 return G_OK; 01359 01360 if ((tess_rejs > ok_chars) || 01361 ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len))) 01362 return G_TERRIBLE; 01363 01364 if (len > 4) { 01365 dodgy_chars = 2 * tess_rejs + bad_char_count + 01366 isolated_digits + isolated_alphas; 01367 if ((dodgy_chars > 5) 
|| ((dodgy_chars / (float) len) > 0.5)) 01368 return G_DODGY; 01369 else 01370 return G_OK; 01371 } 01372 else { 01373 dodgy_chars = 2 * tess_rejs + bad_char_count; 01374 if (((len == 4) && (dodgy_chars > 2)) || 01375 ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len)) 01376 return G_DODGY; 01377 else 01378 return G_OK; 01379 } 01380 }
void insert_rej_cblobs | ( | WERD_RES * | word | ) |
Put rejected word blobs back into the outword.
word | Word |
Definition at line 1542 of file docqual.cpp.
References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), gblob_sort_list(), REJMAP::initialise(), REJMAP::length(), STRING::length(), WERD_RES::outword, WERD::rej_blob_list(), WERD_RES::reject_map, and TRUE.
Referenced by quality_based_rejection().
/**
 * Move the word's rejected blobs back into the outword's blob list, and
 * insert a matching ' ' into the best-choice string and a rej_cblob entry
 * into the reject map for each of them.
 *
 * The reject blobs are sorted first, then merged into the normal blob list
 * by comparing the left edges of the bounding boxes.  On exit the string,
 * the reject map and the blob list all have the same length (asserted).
 *
 * Nothing is changed when there are no rejected blobs, or when the combined
 * length would not fit the 512-byte scratch buffer.
 *
 * @param word  word to update
 */
void insert_rej_cblobs(WERD_RES *word) {
  PBLOB_IT blob_it;              // blob iterator
  PBLOB_IT rej_blob_it;
  const STRING *wordstr;
  int old_len;
  int rej_len;
  char new_str[512];
  REJMAP new_map;
  int i = 0;                     //new_str index
  int j = 0;                     //old_str index
  int new_len;

  gblob_sort_list (word->outword->rej_blob_list (), TRUE);
  rej_blob_it.set_to_list (word->outword->rej_blob_list ());
  if (rej_blob_it.empty ())
    return;
  rej_len = rej_blob_it.length ();
  blob_it.set_to_list (word->outword->blob_list ());
  wordstr = &(word->best_choice->string ());
  old_len = wordstr->length ();
  ASSERT_HOST (word->reject_map.length () == old_len);
  ASSERT_HOST (blob_it.length () == old_len);
  if ((old_len + rej_len) > 511)
    return;                      //Word is garbage anyway prevent abort
  new_map.initialise (old_len + rej_len);

  // Merge: i indexes the new string/map, j the old string/map.
  while (!rej_blob_it.empty ()) {
    if ((j >= old_len) ||
        (rej_blob_it.data ()->bounding_box ().left () <=
         blob_it.data ()->bounding_box ().left ())) {
      /* Insert reject blob */
      if (j >= old_len)
        blob_it.add_to_end (rej_blob_it.extract ());
      else
        blob_it.add_before_stay_put (rej_blob_it.extract ());
      if (!rej_blob_it.empty ())
        rej_blob_it.forward ();
      new_str[i] = ' ';
      new_map[i].setrej_rej_cblob ();
      i++;
    }
    else {
      // Normal blob comes first: copy its char and map entry across.
      new_str[i] = (*wordstr)[j];
      new_map[i] = word->reject_map[j];
      i++;
      j++;
      blob_it.forward ();
    }
  }
  /* Add any extra normal blobs to strings */
  while (j < wordstr->length ()) {
    new_str[i] = (*wordstr)[j];
    new_map[i] = word->reject_map[j];
    i++;
    j++;
  }
  new_str[i] = '\0';
  /*
    tprintf(
    "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n",
    old_len, i, new_str, new_map );
  */
  ASSERT_HOST (i == blob_it.length ());
  ASSERT_HOST (i == old_len + rej_len);
  word->reject_map = new_map;
  // wordstr points at the best choice's own (const) string; the cast lets
  // it be overwritten in place.
  *((STRING *) wordstr) = new_str;
  new_len = strlen (word->best_choice->string ().string ());
  ASSERT_HOST (word->reject_map.length () == new_len);
  ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
}
void merge_tess_fails | ( | WERD_RES * | word_res | ) |
Change pairs of tess failures to a single one (merge/collapse).
word_res | Results on word in question |
Definition at line 1133 of file docqual.cpp.
References ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), REJMAP::length(), merge_blobs(), WERD_RES::outword, WERD_RES::reject_map, and REJMAP::remove_pos().
Referenced by tilde_crunch(), and tilde_delete().
01134 { 01135 char *ptr; //string ptr 01136 PBLOB_IT blob_it; //blobs 01137 int i = 0; 01138 int len; 01139 01140 len = strlen (word_res->best_choice->string ().string ()); 01141 ASSERT_HOST (word_res->reject_map.length () == len); 01142 ASSERT_HOST (word_res->outword->blob_list ()->length () == len); 01143 01144 ptr = (char *) word_res->best_choice->string ().string (); 01145 blob_it = word_res->outword->blob_list (); 01146 while (*ptr != '\0') { 01147 if ((*ptr == ' ') && (*(ptr + 1) == ' ')) { 01148 strcpy (ptr + 1, ptr + 2); //shuffle up 01149 word_res->reject_map.remove_pos (i); 01150 merge_blobs (blob_it.data_relative (1), blob_it.data ()); 01151 delete blob_it.extract (); //get rid of spare 01152 } 01153 else { 01154 i++; 01155 ptr++; 01156 } 01157 blob_it.forward (); 01158 } 01159 len = strlen (word_res->best_choice->string ().string ()); 01160 ASSERT_HOST (word_res->reject_map.length () == len); 01161 ASSERT_HOST (word_res->outword->blob_list ()->length () == len); 01162 }
Determine whether the word is pure noise: true when every outline in the word is smaller than the small-outline size limit.
word | Word |
Definition at line 1507 of file docqual.cpp.
References WERD::blob_list(), BOX::height(), outline_it, and BOX::width().
Referenced by word_deletable().
01507 { 01508 PBLOB_IT blob_it; 01509 OUTLINE_IT outline_it; 01510 BOX box; // BB of outline 01511 INT16 outline_count = 0; // regular count 01512 INT16 small_outline_count = 0; // tiny/noise count 01513 INT16 max_dimension; // the larger of the BB dims 01514 float small_limit = bln_x_height * crunch_small_outlines_size; 01515 01516 blob_it.set_to_list (word->blob_list ()); 01517 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { 01518 outline_it.set_to_list (blob_it.data ()->out_list ()); 01519 for (outline_it.mark_cycle_pt(); !outline_it.cycled_list(); outline_it.forward()) { 01520 outline_count++; 01521 box = outline_it.data ()->bounding_box (); 01522 if (box.height () > box.width ()) 01523 max_dimension = box.height (); 01524 else 01525 max_dimension = box.width (); 01526 if (max_dimension < small_limit) 01527 small_outline_count++; 01528 } 01529 } 01530 return (small_outline_count >= outline_count); 01531 }
BOOL8 potential_word_crunch | ( | WERD_RES * | word, | |
GARBAGE_LEVEL | garbage_level, | |||
BOOL8 | ok_dict_word | |||
) |
Determine if word could be garbage or otherwise questionable.
word | Word | |
garbage_level | G_NEVER_CRUNCH, G_TERRIBLE, G_DODGY, or G_OK | |
ok_dict_word | 0 or 1, 1 if word in dictionary |
Definition at line 974 of file docqual.cpp.
References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, G_OK, REJMAP::length(), WERD_RES::reject_map, and tprintf().
Referenced by tilde_crunch().
00976 { 00977 float rating_per_ch; 00978 int adjusted_len; 00979 char *str = (char *) word->best_choice->string ().string (); // word's string 00980 BOOL8 word_crunchable; 00981 int poor_indicator_count = 0; 00982 00983 word_crunchable = 00984 !crunch_leave_accept_strings || 00985 (word->reject_map.length () < 3) || 00986 ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word); 00987 00988 adjusted_len = word->reject_map.length (); 00989 if (adjusted_len > 10) 00990 adjusted_len = 10; 00991 rating_per_ch = word->best_choice->rating () / adjusted_len; 00992 00993 if (rating_per_ch > crunch_pot_poor_rate) { 00994 if (crunch_debug > 2) { 00995 tprintf ("Potential poor rating on \"%s\"\n", 00996 word->best_choice->string ().string ()); 00997 } 00998 poor_indicator_count++; 00999 } 01000 01001 if (word_crunchable && 01002 (word->best_choice->certainty () < crunch_pot_poor_cert)) { 01003 if (crunch_debug > 2) { 01004 tprintf ("Potential poor cert on \"%s\"\n", 01005 word->best_choice->string ().string ()); 01006 } 01007 poor_indicator_count++; 01008 } 01009 01010 if (garbage_level != G_OK) { 01011 if (crunch_debug > 2) { 01012 tprintf ("Potential garbage on \"%s\"\n", 01013 word->best_choice->string ().string ()); 01014 } 01015 poor_indicator_count++; 01016 } 01017 return (poor_indicator_count >= crunch_pot_indicators); 01018 }
void print_boxes | ( | WERD * | word | ) |
Print all bounding boxes for blobs in word.
word | Word in question |
Definition at line 447 of file docqual.cpp.
References WERD::blob_list(), and BOX::print().
00447 { 00448 PBLOB_IT it; 00449 BOX box; // bounding box 00450 00451 it.set_to_list (word->blob_list ()); 00452 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { 00453 box = it.data ()->bounding_box (); 00454 box.print (); 00455 } 00456 }
void quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, | |
BOOL8 | good_quality_doc | |||
) |
Deal with rejected blobs and clean up errant tildes.
page_res_it | pointer to page in question ? | |
good_quality_doc | 0 or 1, which pass ? |
Definition at line 496 of file docqual.cpp.
References doc_and_block_rejection(), insert_rej_cblobs(), NULL, tilde_crunch(), tilde_delete(), and unrej_good_quality_words().
Referenced by recog_all_words().
00497 { 00498 if ((tessedit_good_quality_unrej && good_quality_doc)) 00499 unrej_good_quality_words(page_res_it); 00500 doc_and_block_rejection(page_res_it, good_quality_doc); 00501 00502 page_res_it.restart_page (); 00503 while (page_res_it.word () != NULL) { 00504 insert_rej_cblobs (page_res_it.word ()); 00505 page_res_it.forward (); 00506 } 00507 00508 if (unlv_tilde_crunching) { 00509 tilde_crunch(page_res_it); 00510 tilde_delete(page_res_it); 00511 } 00512 }
void reject_whole_page | ( | PAGE_RES_IT & | page_res_it | ) |
Don't believe any of it; set map to 00..00 for all words.
page_res_it |
Definition at line 822 of file docqual.cpp.
References PAGE_RES_IT::forward(), NULL, PAGE_RES_IT::page_res, REJMAP::rej_word_doc_rej(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), TRUE, and PAGE_RES_IT::word().
Referenced by doc_and_block_rejection().
00822 { 00823 page_res_it.restart_page (); 00824 while (page_res_it.word () != NULL) { 00825 page_res_it.word ()->reject_map.rej_word_doc_rej (); 00826 page_res_it.forward (); 00827 } 00828 page_res_it.page_res->rejected = TRUE; // whole page is rejected 00829 }
BOOL8 terrible_word_crunch | ( | WERD_RES * | word, | |
GARBAGE_LEVEL | garbage_level | |||
) |
Determine if word very likely is utter garbage, tuned with several globals.
word | Word | |
garbage_level | G_NEVER_CRUNCH, G_TERRIBLE, G_DODGY, or G_OK |
Definition at line 926 of file docqual.cpp.
References WERD_RES::best_choice, FALSE, G_OK, G_TERRIBLE, REJMAP::length(), WERD_RES::reject_map, tprintf(), and TRUE.
Referenced by tilde_crunch().
00926 { 00927 float rating_per_ch; 00928 int adjusted_len; 00929 int crunch_mode = 0; 00930 00931 if ((word->best_choice->string ().length () == 0) || 00932 (strspn (word->best_choice->string ().string (), " ") == 00933 word->best_choice->string ().length ())) 00934 crunch_mode = 1; 00935 else { 00936 adjusted_len = word->reject_map.length (); 00937 if (adjusted_len > crunch_rating_max) 00938 adjusted_len = crunch_rating_max; 00939 rating_per_ch = word->best_choice->rating () / adjusted_len; 00940 00941 if (rating_per_ch > crunch_terrible_rating) 00942 crunch_mode = 2; 00943 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) 00944 crunch_mode = 3; 00945 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && 00946 (garbage_level != G_OK)) 00947 crunch_mode = 4; 00948 else if ((rating_per_ch > crunch_poor_garbage_rate) && 00949 (garbage_level != G_OK)) 00950 crunch_mode = 5; 00951 } 00952 if (crunch_mode > 0) { 00953 if (crunch_debug > 2) { 00954 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", 00955 crunch_mode, word->best_choice->string ().string ()); 00956 } 00957 return TRUE; 00958 } 00959 else 00960 return FALSE; 00961 }
void tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 834 of file docqual.cpp.
References WERD_RES::best_choice, convert_bad_unlv_chs(), CR_KEEP_SPACE, dict_word(), DOC_DAWG_PERM, FALSE, PAGE_RES_IT::forward(), G_NEVER_CRUNCH, garbage_word(), merge_tess_fails(), NULL, potential_word_crunch(), PAGE_RES_IT::restart_page(), terrible_word_crunch(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, and PAGE_RES_IT::word().
Referenced by quality_based_rejection().
/**
 * Walk the page and mark garbage words for crunching by setting their
 * unlv_crunch_mode to CR_KEEP_SPACE.
 *
 * Only words with no accepted characters are considered.  A word judged
 * "terrible" is always crunched.  A word judged only "potential" is crunched
 * when it is adjacent to a terrible one: either it follows a terrible word
 * (P2), or it belongs to a run of potential words that is immediately
 * followed by a terrible word (P1).  Any word that is neither resets both
 * pieces of state.
 *
 * @param page_res_it  iterator over the page results; restarted and walked
 *                     to the end of the page
 */
void tilde_crunch(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  GARBAGE_LEVEL garbage_level;
  PAGE_RES_IT copy_it;           // start of the current run of potential crunches
  BOOL8 prev_potential_marked = FALSE;   // copy_it is valid
  BOOL8 found_terrible_word = FALSE;     // previous candidate word was crunched
  int dict_type;
  BOOL8 ok_dict_word;

  page_res_it.restart_page ();
  while (page_res_it.word () != NULL) {
    word = page_res_it.word ();

    if (crunch_early_convert_bad_unlv_chs)
      convert_bad_unlv_chs(word);

    if (crunch_early_merge_tess_fails)
      merge_tess_fails(word);

    if (word->reject_map.accept_count () != 0) {
      // Word has accepted characters: never a crunch candidate.
      found_terrible_word = FALSE;
      prev_potential_marked = FALSE;     // Forget earlier potential crunches
    }
    else {
      dict_type = dict_word (word->best_choice->string ().string ());
      // A hit that comes only from DOC_DAWG_PERM does not count as a
      // dictionary word here.
      ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
      garbage_level = garbage_word (word, ok_dict_word);

      if ((garbage_level != G_NEVER_CRUNCH) &&
          (terrible_word_crunch (word, garbage_level))) {
        if (crunch_debug > 0) {
          tprintf ("T CRUNCHING: \"%s\"\n",
                   word->best_choice->string ().string ());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
        if (prev_potential_marked) {
          // Crunch the pending run of potential words that led up to this
          // terrible word.
          while (copy_it.word () != word) {
            if (crunch_debug > 0) {
              tprintf ("P1 CRUNCHING: \"%s\"\n",
                       copy_it.word ()->best_choice->string ().
                       string ());
            }
            copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
            copy_it.forward ();
          }
          prev_potential_marked = FALSE;
        }
        found_terrible_word = TRUE;
      }
      else if ((garbage_level != G_NEVER_CRUNCH) &&
               (potential_word_crunch (word,
                                       garbage_level, ok_dict_word))) {
        if (found_terrible_word) {
          // Potential word directly after a crunched one: crunch it too.
          if (crunch_debug > 0) {
            tprintf ("P2 CRUNCHING: \"%s\"\n",
                     word->best_choice->string ().string ());
          }
          word->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        else if (!prev_potential_marked) {
          // First potential word of a run: remember where the run starts.
          copy_it = page_res_it;
          prev_potential_marked = TRUE;
          if (crunch_debug > 1) {
            tprintf ("P3 CRUNCHING: \"%s\"\n",
                     word->best_choice->string ().string ());
          }
        }
      }
      else {
        found_terrible_word = FALSE;
        prev_potential_marked = FALSE;   // Forget earlier potential crunches
        if (crunch_debug > 2) {
          tprintf ("NO CRUNCH: \"%s\"\n",
                   word->best_choice->string ().string ());
        }
      }
    }
    page_res_it.forward ();
  }
}
void tilde_delete(PAGE_RES_IT &page_res_it)
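Marks runs of deletable words at the start and end of each line for deletion. Every word on the page is tested with word_deletable(); deletable words that start at a beginning-of-line word, or that run up to an end-of-line word, have their unlv_crunch_mode set to the delete mode returned by word_deletable(). Deletable words in the middle of a line that are followed by a non-deletable word are left unchanged. Unless crunch_early_merge_tess_fails is set, merge_tess_fails() is applied to each word afterwards, because the tess fails are needed to decide whether a word is deletable.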
page_res_it | Results of page |
Definition at line 1030 of file docqual.cpp.
References WERD_RES::best_choice, CR_NONE, FALSE, WERD::flag(), PAGE_RES_IT::forward(), merge_tess_fails(), NULL, PAGE_RES_IT::restart_page(), tprintf(), TRUE, WERD_RES::unlv_crunch_mode, W_BOL, W_EOL, WERD_RES::word, PAGE_RES_IT::word(), and word_deletable().
Referenced by quality_based_rejection().
01030 { 01031 WERD_RES *word; 01032 PAGE_RES_IT copy_it; 01033 BOOL8 deleting_from_bol = FALSE; 01034 BOOL8 marked_delete_point = FALSE; 01035 INT16 debug_delete_mode; 01036 CRUNCH_MODE delete_mode; 01037 INT16 x_debug_delete_mode; 01038 CRUNCH_MODE x_delete_mode; 01039 01040 page_res_it.restart_page (); 01041 while (page_res_it.word () != NULL) { 01042 word = page_res_it.word (); 01043 01044 delete_mode = word_deletable (word, debug_delete_mode); 01045 if (delete_mode != CR_NONE) { 01046 if (word->word->flag (W_BOL) || deleting_from_bol) { 01047 if (crunch_debug > 0) { 01048 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", 01049 debug_delete_mode, 01050 word->best_choice->string ().string ()); 01051 } 01052 word->unlv_crunch_mode = delete_mode; 01053 deleting_from_bol = TRUE; 01054 } 01055 else if (word->word->flag (W_EOL)) { 01056 if (marked_delete_point) { 01057 while (copy_it.word () != word) { 01058 x_delete_mode = word_deletable (copy_it.word (), 01059 x_debug_delete_mode); 01060 if (crunch_debug > 0) { 01061 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 01062 x_debug_delete_mode, 01063 copy_it.word ()->best_choice->string (). 01064 string ()); 01065 } 01066 copy_it.word ()->unlv_crunch_mode = x_delete_mode; 01067 copy_it.forward (); 01068 } 01069 } 01070 if (crunch_debug > 0) { 01071 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", 01072 debug_delete_mode, 01073 word->best_choice->string ().string ()); 01074 } 01075 word->unlv_crunch_mode = delete_mode; 01076 deleting_from_bol = FALSE; 01077 marked_delete_point = FALSE; 01078 } 01079 else { 01080 if (!marked_delete_point) { 01081 copy_it = page_res_it; 01082 marked_delete_point = TRUE; 01083 } 01084 } 01085 } 01086 else { 01087 deleting_from_bol = FALSE; 01088 marked_delete_point = FALSE; // Forget earlier potential crunches 01089 } 01090 /* The following step has been left till now as the tess fails are used to 01091 determine if the word is deletable. 
01092 */ 01093 if (!crunch_early_merge_tess_fails) 01094 merge_tess_fails(word); 01095 page_res_it.forward (); 01096 } 01097 }
Unreject POTENTIAL rejects if the blob passes the blob and outline checks.
word | Word in question | |
row | Row containing the word |
Definition at line 371 of file docqual.cpp.
References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.
Referenced by unrej_good_quality_words().
00371 { 00372 WERD *bln_word; //BL norm init word 00373 TWERD *tessword; //tess format 00374 WERD *init_word; //BL norm init word 00375 PBLOB_IT outword_it; 00376 PBLOB_IT initial_it; 00377 INT16 i; 00378 INT16 init_blobs_left; 00379 BOOL8 matched; 00380 BOX out_box; 00381 PBLOB *test_blob; 00382 DENORM denorm; 00383 float bln_xht; 00384 INT16 j = 0; 00385 00386 if (word->word->gblob_list ()->empty ()) 00387 return; 00388 00389 bln_xht = bln_x_height / word->denorm.scale (); // xht used for blnorm 00390 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); 00391 /* NOTE: Need to convert to tess format and back again to ensure that the 00392 same float -> int rounding of coords is done to source wd as out wd before 00393 comparison 00394 */ 00395 tessword = make_tess_word (bln_word, NULL); 00396 init_word = make_ed_word (tessword, bln_word); // convert word 00397 delete bln_word; 00398 delete_word(tessword); // get rid of it 00399 00400 initial_it.set_to_list (init_word->blob_list ()); 00401 init_blobs_left = initial_it.length (); 00402 outword_it.set_to_list (word->outword->blob_list ()); 00403 00404 for (outword_it.mark_cycle_pt (); 00405 !outword_it.cycled_list (); outword_it.forward ()) { 00406 out_box = outword_it.data ()->bounding_box (); 00407 00408 /* Skip any initial blobs LEFT of current outword blob */ 00409 while (!initial_it.at_last () && 00410 (initial_it.data ()->bounding_box ().left () < out_box.left ())) { 00411 initial_it.forward (); 00412 init_blobs_left--; 00413 } 00414 00415 /* See if current outword blob matches any initial blob with the same left 00416 coord. 
(Normally only one but possibly more - in unknown order) */ 00417 i = 0; 00418 matched = FALSE; 00419 do { 00420 test_blob = initial_it.data_relative (i++); 00421 matched = crude_match_blobs (test_blob, outword_it.data ()); 00422 if (matched && 00423 (word->reject_map[j].accept_if_good_quality ()) && 00424 (docqual_excuse_outline_errs || 00425 (count_outline_errs (word->best_choice->string ()[j], 00426 outword_it.data ()->out_list ()-> 00427 length ()) == 0))) 00428 word->reject_map[j].setrej_quality_accept (); 00429 } 00430 while (!matched && 00431 (init_blobs_left - i > 0) && 00432 (i < 129) && 00433 !initial_it.at_last () && 00434 test_blob->bounding_box ().left () == out_box.left ()); 00435 j++; 00436 } 00437 delete init_word; 00438 }
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Unreject potential rejects in words that meet the conditions listed below.
page_res_it | Iterator over the page results (passed by reference) |
A word's potential rejects are unrejected if:
- the word contains a potential reject;
- the word looks like a sensible alpha word;
- the word segmentation is the same as in the original image;
- all characters have the expected number of outlines.
NOTE: the rejection counts are recalculated after unrejection. This can't be done in a single pass without a bit of fiddling, so the implementation is kept simple but inefficient.
Definition at line 533 of file docqual.cpp.
References AC_UNACCEPTABLE, acceptable_word_string(), WERD_RES::best_choice, PAGE_RES_IT::block(), BLOCK_RES::char_count, ROW_RES::char_count, check_debug_pt(), PAGE_RES_IT::forward(), REJMAP::length(), NULL, PAGE_RES_IT::page_res, REJMAP::quality_recoverable_rejects(), BLOCK_RES::rej_count, ROW_RES::rej_count, PAGE_RES_IT::rej_stat_word(), WERD_RES::reject_map, PAGE_RES_IT::restart_page(), ROW_RES::row, PAGE_RES_IT::row(), unrej_good_chs(), ROW_RES::whole_word_rej_count, and PAGE_RES_IT::word().
Referenced by quality_based_rejection().
00534 { 00535 WERD_RES *word; 00536 ROW_RES *current_row; 00537 BLOCK_RES *current_block; 00538 int i; 00539 00540 page_res_it.restart_page (); 00541 while (page_res_it.word () != NULL) { 00542 check_debug_pt (page_res_it.word (), 100); 00543 if (bland_unrej) { 00544 word = page_res_it.word (); 00545 for (i = 0; i < word->reject_map.length (); i++) { 00546 if (word->reject_map[i].accept_if_good_quality ()) 00547 word->reject_map[i].setrej_quality_accept (); 00548 } 00549 page_res_it.forward (); 00550 } 00551 else if ((page_res_it.row ()->char_count > 0) && 00552 ((page_res_it.row ()->rej_count / 00553 (float) page_res_it.row ()->char_count) <= 00554 quality_rowrej_pc)) { 00555 word = page_res_it.word (); 00556 if (word->reject_map.quality_recoverable_rejects () && 00557 (tessedit_unrej_any_wd || 00558 acceptable_word_string (word->best_choice->string ().string ()) 00559 != AC_UNACCEPTABLE)) { 00560 unrej_good_chs (word, page_res_it.row ()->row); 00561 } 00562 page_res_it.forward (); 00563 } 00564 else { 00565 /* Skip to end of dodgy row */ 00566 current_row = page_res_it.row (); 00567 while ((page_res_it.word () != NULL) && 00568 (page_res_it.row () == current_row)) 00569 page_res_it.forward (); 00570 } 00571 check_debug_pt (page_res_it.word (), 110); 00572 } 00573 page_res_it.restart_page (); 00574 page_res_it.page_res->char_count = 0; 00575 page_res_it.page_res->rej_count = 0; 00576 current_block = NULL; 00577 current_row = NULL; 00578 while (page_res_it.word () != NULL) { 00579 if (current_block != page_res_it.block ()) { 00580 current_block = page_res_it.block (); 00581 current_block->char_count = 0; 00582 current_block->rej_count = 0; 00583 } 00584 if (current_row != page_res_it.row ()) { 00585 current_row = page_res_it.row (); 00586 current_row->char_count = 0; 00587 current_row->rej_count = 0; 00588 current_row->whole_word_rej_count = 0; 00589 } 00590 page_res_it.rej_stat_word (); 00591 page_res_it.forward (); 00592 } 00593 }
Find number of blobs in outword that are identical to those of inword.
word | Word in question | |
row | Row the word came from |
Definition at line 139 of file docqual.cpp.
References WERD::blob_list(), PBLOB::bounding_box(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, DENORM::scale(), and WERD_RES::word.
Referenced by recog_all_words(), and recog_interactive().
00141 { 00142 WERD *bln_word; //BL norm init word 00143 TWERD *tessword; //tess format 00144 WERD *init_word; //BL norm init word 00145 PBLOB_IT outword_it; 00146 PBLOB_IT initial_it; 00147 INT16 i; 00148 INT16 init_blobs_left; 00149 INT16 match_count = 0; 00150 BOOL8 matched; 00151 BOX out_box; 00152 PBLOB *test_blob; 00153 DENORM denorm; 00154 float bln_xht; 00155 00156 #ifdef TEXT_VERBOSE 00157 // gets a 'v', see ccmain/tesseractmain.dox 00158 cprintf("v"); 00159 #endif 00160 if (word->word->gblob_list ()->empty ()) 00161 return 0; 00162 bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm 00163 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); 00164 00165 /* NOTE: Need to convert to tess format and back again to ensure that the 00166 same float -> int rounding of coords is done to source wd as out wd before 00167 comparison 00168 */ 00169 00170 // if (!bln_word->flag(W_POLYGON)) 00171 // tprintf( "NON POLYGON BLN WERD\n"); 00172 tessword = make_tess_word (bln_word, NULL); 00173 init_word = make_ed_word (tessword, bln_word); // convert word 00174 // if (!init_word->flag(W_POLYGON)) 00175 // tprintf( "NON POLYGON INIT WERD\n"); 00176 // tprintf( "SOURCE BLOBS-AFTER TESS:\n"); 00177 // print_boxes( init_word ); 00178 // tprintf( "OUTPUT BLOBS:\n"); 00179 // print_boxes( word->outword ); 00180 00181 initial_it.set_to_list (init_word->blob_list ()); 00182 init_blobs_left = initial_it.length (); 00183 outword_it.set_to_list (word->outword->blob_list ()); 00184 delete bln_word; 00185 delete_word(tessword); //get rid of it 00186 00187 for (outword_it.mark_cycle_pt (); 00188 !outword_it.cycled_list (); outword_it.forward ()) { 00189 out_box = outword_it.data ()->bounding_box (); 00190 00191 /* Skip any initial blobs LEFT of current outword blob */ 00192 while (!initial_it.at_last () && 00193 (initial_it.data ()->bounding_box ().left () < out_box.left ())) { 00194 initial_it.forward (); 00195 init_blobs_left--; 00196 } 00197 00198 /* See if 
current outword blob matches any initial blob with the same left 00199 coord. (Normally only one but possibly more - in unknown order) */ 00200 00201 i = 0; 00202 matched = FALSE; 00203 do { 00204 test_blob = initial_it.data_relative (i++); 00205 matched = crude_match_blobs (test_blob, outword_it.data ()); 00206 if (matched) 00207 match_count++; 00208 } 00209 while (!matched && 00210 (init_blobs_left - i > 0) && 00211 (i < 129) && 00212 !initial_it.at_last () && 00213 test_blob->bounding_box ().left () == out_box.left ()); 00214 } 00215 delete init_word; 00216 return match_count; 00217 }
void word_char_quality(WERD_RES *word, ROW *row, INT16 *match_count, INT16 *accepted_match_count)
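Counts the blobs of the output word that match a blob of the original word and have the expected number of outlines, and how many of those correspond to accepted characters.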
word | Word in question | |
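row | Row the word belongs to (passed to make_bln_copy()) | |
match_count | Output: number of output blobs that match an original blob and have no outline errors | |
accepted_match_count | Output: number of those matching blobs whose character is accepted in the reject map |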
Definition at line 277 of file docqual.cpp.
References WERD_RES::best_choice, WERD::blob_list(), PBLOB::bounding_box(), count_outline_errs(), cprintf(), crude_match_blobs(), delete_word(), WERD_RES::denorm, FALSE, WERD::gblob_list(), BOX::left(), make_bln_copy(), make_ed_word(), make_tess_word(), NULL, WERD_RES::outword, WERD_RES::reject_map, DENORM::scale(), and WERD_RES::word.
Referenced by classify_word_pass2(), doc_and_block_rejection(), recog_all_words(), and recog_interactive().
00281 { 00282 WERD *bln_word; //BL norm init word 00283 TWERD *tessword; //tess format 00284 WERD *init_word; //BL norm init word 00285 PBLOB_IT outword_it; 00286 PBLOB_IT initial_it; 00287 INT16 i; 00288 INT16 init_blobs_left; 00289 BOOL8 matched; 00290 BOX out_box; 00291 PBLOB *test_blob; 00292 DENORM denorm; 00293 float bln_xht; 00294 INT16 j = 0; 00295 00296 #ifdef TEXT_VERBOSE 00297 // gets a 'y', see ccmain/tessvars.doxfg 00298 cprintf("y"); 00299 #endif 00300 *match_count = 0; 00301 *accepted_match_count = 0; 00302 if (word->word->gblob_list ()->empty ()) 00303 return; 00304 00305 bln_xht = bln_x_height / word->denorm.scale (); //xht used for blnorm 00306 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); 00307 /* NOTE: Need to convert to tess format and back again to ensure that the 00308 same float -> int rounding of coords is done to source wd as out wd before 00309 comparison 00310 */ 00311 tessword = make_tess_word (bln_word, NULL); 00312 init_word = make_ed_word (tessword, bln_word); //convert word 00313 delete bln_word; 00314 delete_word(tessword); //get rid of it 00315 // tprintf( "SOURCE BLOBS-AFTER TESS:\n"); 00316 // print_boxes( init_word ); 00317 // tprintf( "OUTPUT BLOBS:\n"); 00318 // print_boxes( word->outword ); 00319 00320 initial_it.set_to_list (init_word->blob_list ()); 00321 init_blobs_left = initial_it.length (); 00322 outword_it.set_to_list (word->outword->blob_list ()); 00323 00324 for (outword_it.mark_cycle_pt (); 00325 !outword_it.cycled_list (); outword_it.forward ()) { 00326 out_box = outword_it.data ()->bounding_box (); 00327 00328 /* Skip any initial blobs LEFT of current outword blob */ 00329 while (!initial_it.at_last () && 00330 (initial_it.data ()->bounding_box ().left () < out_box.left ())) { 00331 initial_it.forward (); 00332 init_blobs_left--; 00333 } 00334 00335 /* See if current outword blob matches any initial blob with the same left 00336 coord. 
(Normally only one but possibly more - in unknown order) */ 00337 00338 i = 0; 00339 matched = FALSE; 00340 do { 00341 test_blob = initial_it.data_relative (i++); 00342 matched = crude_match_blobs (test_blob, outword_it.data ()); 00343 if (matched && 00344 (count_outline_errs (word->best_choice->string ()[j], 00345 outword_it.data ()->out_list ()->length ()) == 0)) { 00346 (*match_count)++; 00347 if (word->reject_map[j].accepted ()) 00348 (*accepted_match_count)++; 00349 } 00350 } 00351 while (!matched && 00352 (init_blobs_left - i > 0) && 00353 (i < 129) && 00354 !initial_it.at_last () && 00355 test_blob->bounding_box ().left () == out_box.left ()); 00356 j++; 00357 } 00358 delete init_word; 00359 }
CRUNCH_MODE word_deletable(WERD_RES *word, INT16 &delete_mode)
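Decides whether a crunched word is a candidate for deletion. Returns CR_NONE if the word is not crunched or none of the constraints is met; otherwise returns CR_DELETE or CR_LOOSE_SPACE, depending on which constraint is met. The word itself is not modified here.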
word | Word in question | |
delete_mode | Output: code (0..11) identifying which constraint was met, 0 meaning none; tilde_delete() prints it in its debug messages |
DELETE WERDS AT ENDS OF ROWS IF Word is crunched AND ( string length = 0 OR > 50% of chars are "|" (before merging) OR certainty < -10 OR rating /char > 60 OR TOP of word is more than 0.5 xht BELOW baseline OR BOTTOM of word is more than 0.5 xht ABOVE xht OR length of word < 3xht OR height of word < 0.7 xht OR height of word > 3.0 xht OR >75% of the outline BBs have longest dimension < 0.5xht )
Definition at line 1409 of file docqual.cpp.
References BOX::bottom(), WERD::bounding_box(), CR_DELETE, CR_LOOSE_SPACE, CR_NONE, failure_count(), BOX::height(), REJMAP::length(), noise_outlines(), WERD_RES::outword, WERD_RES::reject_map, BOX::top(), WERD_RES::unlv_crunch_mode, and BOX::width().
Referenced by tilde_delete().
01409 { 01410 int word_len = word->reject_map.length (); 01411 float rating_per_ch; 01412 BOX box; // BB of word 01413 01414 if (word->unlv_crunch_mode == CR_NONE) { 01415 delete_mode = 0; 01416 return CR_NONE; 01417 } 01418 01419 if (word_len == 0) { 01420 delete_mode = 1; 01421 return CR_DELETE; 01422 } 01423 01424 box = word->outword->bounding_box (); 01425 if (box.height () < crunch_del_min_ht * bln_x_height) { 01426 delete_mode = 4; 01427 return CR_DELETE; 01428 } 01429 01430 if (noise_outlines (word->outword)) { 01431 delete_mode = 5; 01432 return CR_DELETE; 01433 } 01434 01435 if ((failure_count (word) * 1.5) > word_len) { 01436 delete_mode = 2; 01437 return CR_LOOSE_SPACE; 01438 } 01439 01440 if (word->best_choice->certainty () < crunch_del_cert) { 01441 delete_mode = 7; 01442 return CR_LOOSE_SPACE; 01443 } 01444 01445 rating_per_ch = word->best_choice->rating () / word_len; 01446 01447 if (rating_per_ch > crunch_del_rating) { 01448 delete_mode = 8; 01449 return CR_LOOSE_SPACE; 01450 } 01451 01452 if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) { 01453 delete_mode = 9; 01454 return CR_LOOSE_SPACE; 01455 } 01456 01457 if (box.bottom () > 01458 bln_baseline_offset + crunch_del_high_word * bln_x_height) { 01459 delete_mode = 10; 01460 return CR_LOOSE_SPACE; 01461 } 01462 01463 if (box.height () > crunch_del_max_ht * bln_x_height) { 01464 delete_mode = 11; 01465 return CR_LOOSE_SPACE; 01466 } 01467 01468 if (box.width () < crunch_del_min_width * bln_x_height) { 01469 delete_mode = 3; 01470 return CR_LOOSE_SPACE; 01471 } 01472 01473 delete_mode = 0; 01474 return CR_NONE; 01475 }
Count errors in word's outlines using count_outline_errs().
word | Word |
Definition at line 246 of file docqual.cpp.
References WERD_RES::best_choice, WERD::blob_list(), count_outline_errs(), and WERD_RES::outword.
Referenced by recog_all_words(), and recog_interactive().
00247 { 00248 PBLOB_IT outword_it; 00249 INT16 i = 0; 00250 INT16 err_count = 0; 00251 00252 outword_it.set_to_list (word->outword->blob_list ()); 00253 00254 for (outword_it.mark_cycle_pt (); 00255 !outword_it.cycled_list (); outword_it.forward ()) { 00256 err_count += count_outline_errs (word->best_choice->string ()[i], 00257 outword_it.data ()->out_list ()-> length ()); 00258 i++; 00259 } 00260 return err_count; 00261 }