#include "mfcpch.h"
#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "genblob.h"
#include "control.h"
#include "fixspace.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"
Go to the source code of this file.
#define MAXSPACING 128 |
Max expected spacing in pix
Definition at line 66 of file fixspace.cpp.
Referenced by block_space_stat(), block_spacing_stats(), isolated_row_stats(), row_space_stat(), row_spacing_stats(), and uniformly_spaced().
#define PERFECT_WERDS 999 |
Definition at line 64 of file fixspace.cpp.
Referenced by eval_word_spacing(), fix_fuzzy_space_list(), and fix_noisy_space_list().
float blob_noise_score | ( | PBLOB * | blob | ) |
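Compute the largest dimension (height or width) of any outline in the blob; when fixsp_noise_score_fixing is set, the value is doubled for blobs with more than 5 outlines and halved for blobs lying high above or low against the baseline.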
blob | Blob |
Definition at line 963 of file fixspace.cpp.
References BOX::bottom(), PBLOB::bounding_box(), BOX::height(), PBLOB::out_list(), outline_it, BOX::top(), and BOX::width().
Referenced by fp_eval_word_spacing(), and worst_noise_blob().
00963 { 00964 OUTLINE_IT outline_it; 00965 BOX box; //BB of outline 00966 INT16 outline_count = 0; 00967 INT16 max_dimension; 00968 INT16 largest_outline_dimension = 0; 00969 00970 outline_it.set_to_list (blob->out_list ()); 00971 for (outline_it.mark_cycle_pt (); 00972 !outline_it.cycled_list (); outline_it.forward ()) { 00973 outline_count++; 00974 box = outline_it.data ()->bounding_box (); 00975 if (box.height () > box.width ()) 00976 max_dimension = box.height (); 00977 else 00978 max_dimension = box.width (); 00979 00980 if (largest_outline_dimension < max_dimension) 00981 largest_outline_dimension = max_dimension; 00982 } 00983 00984 if (fixsp_noise_score_fixing) { 00985 if (outline_count > 5) 00986 largest_outline_dimension *= 2; //penalise LOTS of blobs 00987 00988 box = blob->bounding_box (); 00989 00990 if ((box.bottom () > bln_baseline_offset * 4) || 00991 (box.top () < bln_baseline_offset / 2)) 00992 largest_outline_dimension /= 2; //Lax blob is if high or low 00993 } 00994 return largest_outline_dimension; 00995 }
void break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
Find the word with the blob which looks like the worst noise.
words | Words |
Definition at line 800 of file fixspace.cpp.
References WERD_RES::best_choice, WERD::cblob_list(), WERD_RES::done, FALSE, NULL, WERD_RES::outword, WERD_RES::raw_choice, WERD::rej_cblob_list(), WERD::set_blanks(), WERD::set_flag(), W_BOL, W_EOL, WERD_RES::word, and worst_noise_blob().
Referenced by fix_noisy_space_list().
// Body of break_noisiest_blob_word(): finds the word in `words` whose worst
// noise blob has the lowest noise score, deletes that blob and splits the word
// in two at that point. If no word yields a candidate blob, `words` is cleared
// so that the caller's loop terminates.
{
  WERD_RES_IT word_it(&words);
  WERD_RES_IT worst_word_it;
  float worst_noise_score = 9999;
  int worst_blob_index = -1;     //noisiest blb of noisiest wd
  int blob_index;                //of wds noisiest blb
  float noise_score;             //of wds noisiest blb
  WERD_RES *word_res;
  C_BLOB_IT blob_it;
  C_BLOB_IT rej_cblob_it;
  C_BLOB_LIST new_blob_list;
  C_BLOB_IT new_blob_it;
  C_BLOB_IT new_rej_cblob_it;
  WERD *new_word;
  INT16 start_of_noise_blob;
  INT16 i;

  /* Pick the word whose worst blob has the smallest score (lower = noisier). */
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    blob_index = worst_noise_blob (word_it.data (), &noise_score);
    if ((blob_index > -1) && (worst_noise_score > noise_score)) {
      worst_noise_score = noise_score;
      worst_blob_index = blob_index;
      worst_word_it = word_it;
    }
  }
  if (worst_blob_index < 0) {
    words.clear ();              //signal termination
    return;
  }

  /* Now split the worst_word_it */
  word_res = worst_word_it.data ();

  /* Move blobs before noise blob to a new bloblist */
  new_blob_it.set_to_list (&new_blob_list);
  blob_it.set_to_list (word_res->word->cblob_list ());
  for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
    new_blob_it.add_after_then_move (blob_it.extract ());
  }
  /* Remember where the noise blob started before discarding it; rejected
     blobs are divided between the two halves using this x position. */
  start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
  delete blob_it.extract ();     //throw out noise blb

  /* The new word takes the leading blobs; the original keeps the rest. */
  new_word = new WERD (&new_blob_list, word_res->word);
  new_word->set_flag (W_EOL, FALSE);
  word_res->word->set_flag (W_BOL, FALSE);
  word_res->word->set_blanks (1);//After break

  /* Move rejected blobs lying left of the noise blob over to the new word. */
  new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
  rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
  for (;
       (!rej_cblob_it.empty () &&
        (rej_cblob_it.data ()->bounding_box ().left () <
         start_of_noise_blob)); rej_cblob_it.forward ()) {
    new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
  }

  /* Insert the new leading word in front of the word that was split. */
  worst_word_it.add_before_then_move (new WERD_RES (new_word));

  /* The remaining word has changed, so drop any previous results for it. */
  word_res->done = FALSE;
  if (word_res->outword != NULL) {
    delete word_res->outword;
    delete word_res->best_choice;
    delete word_res->raw_choice;
    word_res->outword = NULL;
    word_res->best_choice = NULL;
    word_res->raw_choice = NULL;
  }
}
Check if ch in word is a digit or digit-compatible punctuation.
word | ||
ch |
Definition at line 411 of file fixspace.cpp.
References WERD_RES::best_choice, and NUMBER_PERM.
Referenced by eval_word_spacing().
00411 { 00412 return (isdigit (ch) || 00413 (fixsp_numeric_fix && 00414 (word->best_choice->permuter () == NUMBER_PERM) && 00415 STRING (numeric_punctuation).contains (ch))); 00416 }
Dump out the words recognized & stats.
perm | ? | |
score | score | |
mode | mode | |
improved | 0 or 1 |
Definition at line 526 of file fixspace.cpp.
References STRING::string(), and tprintf().
Referenced by fix_fuzzy_space_list(), and fix_noisy_space_list().
00526 { 00527 WERD_RES_IT word_res_it(&perm); 00528 static STRING initial_str; 00529 00530 if (debug_fix_space_level > 0) { 00531 if (mode == 1) { 00532 initial_str = ""; 00533 for (word_res_it.mark_cycle_pt (); 00534 !word_res_it.cycled_list (); word_res_it.forward ()) { 00535 if (!word_res_it.data ()->part_of_combo) { 00536 initial_str += word_res_it.data ()->best_choice->string (); 00537 initial_str += ' '; 00538 } 00539 } 00540 } 00541 00542 #ifndef SECURE_NAMES 00543 if (debug_fix_space_level > 1) { 00544 switch (mode) { 00545 case 1: 00546 tprintf ("EXTRACTED (%d): \"", score); 00547 break; 00548 case 2: 00549 tprintf ("TESTED (%d): \"", score); 00550 break; 00551 case 3: 00552 tprintf ("RETURNED (%d): \"", score); 00553 break; 00554 } 00555 00556 for (word_res_it.mark_cycle_pt (); 00557 !word_res_it.cycled_list (); word_res_it.forward ()) { 00558 if (!word_res_it.data ()->part_of_combo) 00559 tprintf ("%s/%1d ", 00560 word_res_it.data ()->best_choice->string (). 00561 string (), 00562 (int) word_res_it.data ()->best_choice->permuter ()); 00563 } 00564 tprintf ("\"\n"); 00565 } 00566 else if (improved) { 00567 tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ()); 00568 for (word_res_it.mark_cycle_pt (); 00569 !word_res_it.cycled_list (); word_res_it.forward ()) { 00570 if (!word_res_it.data ()->part_of_combo) 00571 tprintf ("%s/%1d ", 00572 word_res_it.data ()->best_choice->string (). 00573 string (), 00574 (int) word_res_it.data ()->best_choice->permuter ()); 00575 } 00576 tprintf ("\"\n"); 00577 } 00578 #endif 00579 } 00580 }
INT16 eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Basic measure is number of characters in contextually confirmed words = the word is done.
word_res_list | Input word list |
Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.
The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character the other side of the space.
Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.
The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
Can we add the prev word score and potentially count this word? Yes, IF it didn't end in a 1 when the first char of this word is a digit, AND it didn't end in a digit when the first char of this word is a 1.
Definition at line 285 of file fixspace.cpp.
References WERD_RES::best_choice, STRING::contains(), cprintf(), digit_or_numeric_punct(), FALSE, fixspace_thinks_word_done(), REJMAP::length(), NUMBER_PERM, PERFECT_WERDS, WERD_RES::reject_map, WERD_RES::tess_failed, TRUE, uniformly_spaced(), and word_count.
Referenced by fix_fuzzy_space_list(), and fp_eval_word_spacing().
// Body of eval_word_spacing(): scores a word list for spacing quality.
// Returns PERFECT_WERDS when every counted word ends up "done", otherwise the
// accumulated score. A word's length is only added to the total once the next
// word has been examined, so that a suspicious "1"/digit pairing across the
// space can cancel it.
{
  WERD_RES_IT word_res_it(&word_res_list);
  INT16 total_score = 0;
  INT16 word_count = 0;
  INT16 done_word_count = 0;
  INT16 word_len;
  INT16 i;
  WERD_RES *word;                //current word
  INT16 prev_word_score = 0;     //pending score of the previous word
  BOOL8 prev_word_done = FALSE;
  BOOL8 prev_char_1 = FALSE;     //prev ch a "1/I/l"?
  BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
  BOOL8 current_char_1 = FALSE;
  BOOL8 current_word_ok_so_far;
  STRING punct_chars = "!\"`',.:;";
  BOOL8 prev_char_punct = FALSE;
  BOOL8 current_char_punct = FALSE;
  BOOL8 word_done = FALSE;

#ifdef TEXT_VERBOSE
  // gets a 'z', see ccmain/tesseractmain.dox
  cprintf("z");
#endif
  do {
    word = word_res_it.data ();
    word_done = fixspace_thinks_word_done (word);
    word_count++;
    if (word->tess_failed) {
      /* A failed word scores nothing itself, but the previous word's
         pending score is committed and the character context is reset. */
      total_score += prev_word_score;
      if (prev_word_done)
        done_word_count++;
      prev_word_score = 0;
      prev_char_1 = FALSE;
      prev_char_digit = FALSE;
      prev_word_done = FALSE;
    }
    else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didn't end in a 1 when the first char of this word is a digit
        AND it didn't end in a digit when the first char of this word is a 1
      */
      word_len = word->reject_map.length ();
      current_word_ok_so_far = FALSE;
      if (!((prev_char_1 &&
             digit_or_numeric_punct (word,
                                     word->best_choice->string ()[0])) ||
            (prev_char_digit &&
             ((word_done &&
               (word->best_choice->string ()[0] == '1')) ||
              (!word_done &&
               STRING (conflict_set_I_l_1).contains (word->best_choice->
                                                     string ()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
        current_word_ok_so_far = word_done;
      }

      /* The word's own score stays pending until the next word is seen. */
      if ((current_word_ok_so_far) &&
          (!tessedit_test_uniform_wd_spacing ||
           ((word->best_choice->permuter ()==NUMBER_PERM)||uniformly_spaced (word)))) {
        prev_word_done = TRUE;
        prev_word_score = word_len;
      }
      else {
        prev_word_done = FALSE;
        prev_word_score = 0;
      }

      if (fixsp_prefer_joined_1s) {
        /* Add 1 to total score for every joined 1 regardless of context and rejtn */

        for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
          current_char_1 = word->best_choice->string ()[i] == '1';
          if (prev_char_1 || (current_char_1 && (i > 0)))
            total_score++;
          prev_char_1 = current_char_1;
        }
      }

      /* Add 1 to total score for every joined punctuation regardless of context
         and rejtn */
      if (tessedit_prefer_joined_punct) {
        for (i = 0, prev_char_punct = FALSE; i < word_len; i++) {
          current_char_punct =
            punct_chars.contains (word->best_choice->string ()[i]);
          if (prev_char_punct || (current_char_punct && (i > 0)))
            total_score++;
          prev_char_punct = current_char_punct;
        }
      }
      /* Record how this word ends, for the test against the next word.
         NOTE(review): indexes [word_len - 1]; presumably word_len > 0 for any
         word that has not tess_failed - confirm. */
      prev_char_digit = digit_or_numeric_punct (word,
                                                word->best_choice->
                                                string ()[word_len - 1]);
      prev_char_1 =
        ((word_done
          && (word->best_choice->string ()[word_len - 1] == '1'))
         || (!word_done
             && STRING (conflict_set_I_l_1).contains (word->best_choice->
                                                      string ()[word_len -
                                                                1])));
    }
    /* Find next word */
    do
      word_res_it.forward ();
    while (word_res_it.data ()->part_of_combo);
  }
  while (!word_res_it.at_first ());
  /* Commit the pending score of the final word. */
  total_score += prev_word_score;
  if (prev_word_done)
    done_word_count++;
  if (done_word_count == word_count)
    return PERFECT_WERDS;
  else
    return total_score;
}
void fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, | |
ROW * | row | |||
) |
Explore spaces in list.
best_perm | for eval_word_spacing() | |
row | Row |
Definition at line 178 of file fixspace.cpp.
References dump_words(), eval_word_spacing(), FALSE, initialise_search(), match_current_words(), PERFECT_WERDS, transform_to_next_perm(), and TRUE.
Referenced by fix_fuzzy_spaces().
00180 { 00181 INT16 best_score; 00182 WERD_RES_LIST current_perm; 00183 INT16 current_score; 00184 BOOL8 improved = FALSE; 00185 00186 00187 best_score = eval_word_spacing (best_perm); //default score 00188 00189 dump_words (best_perm, best_score, 1, improved); 00190 00191 if (best_score != PERFECT_WERDS) 00192 initialise_search(best_perm, current_perm); 00193 00194 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) { 00195 match_current_words(current_perm, row); 00196 current_score = eval_word_spacing (current_perm); 00197 dump_words (current_perm, current_score, 2, improved); 00198 if (current_score > best_score) { 00199 best_perm.clear (); 00200 best_perm.deep_copy (¤t_perm); 00201 best_score = current_score; 00202 improved = TRUE; 00203 } 00204 if (current_score < PERFECT_WERDS) 00205 transform_to_next_perm(current_perm); 00206 } 00207 dump_words (best_perm, best_score, 3, improved); 00208 }
void fix_fuzzy_spaces | ( | volatile ETEXT_DESC * | monitor, | |
INT32 | word_count, | |||
PAGE_RES * | page_res | |||
) |
Walk over the page, finding sequences of words joined by fuzzy spaces.
monitor | progress monitor | |
word_count | count of words in doc | |
page_res |
Definition at line 79 of file fixspace.cpp.
References check_debug_pt(), WERD_RES::combination, fix_fuzzy_space_list(), fix_sp_fp_word(), WERD::gblob_list(), NULL, TRUE, W_FUZZY_NON, W_FUZZY_SP, and WERD_RES::word.
Referenced by recog_all_words().
// Body of fix_fuzzy_spaces(): walks every block, row and word of `page_res`.
// Words not followed by a fuzzy space are passed individually to
// fix_sp_fp_word(); runs of words joined by fuzzy spaces (W_FUZZY_NON or
// W_FUZZY_SP on the following word) are lifted out as a sublist, processed by
// fix_fuzzy_space_list() and spliced back in. Progress is reported through
// `monitor` when it is non-NULL.
{
  BLOCK_RES_IT block_res_it;     //iterators
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;  //start of the current fuzzy run
  WERD_RES_IT word_res_it_to;    //end of the current fuzzy run
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  INT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
  INT32 word_index;              //current word

  block_res_it.set_to_list (&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt ();
       !block_res_it.cycled_list (); block_res_it.forward ()) {
    row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
    for (row_res_it.mark_cycle_pt ();
         !row_res_it.cycled_list (); row_res_it.forward ()) {
      word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
      while (!word_res_it_from.at_last ()) {
        word_res = word_res_it_from.data ();
        /* Skip forward over words that do not start a fuzzy run. */
        while (!word_res_it_from.at_last () &&
               !(word_res->combination ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_NON) ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_SP))) {
          fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
          word_res = word_res_it_from.forward ();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            /* NOTE(review): divides by word_count; presumably non-zero
               whenever the page has words - confirm against caller. */
            monitor->progress = 90 + 5 * word_index / word_count;
          }
        }

        if (!word_res_it_from.at_last ()) {
          /* Extend word_res_it_to to the last word of the fuzzy run, noting
             whether any word in the run has no blobs. */
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->gblob_list ()->empty ();
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          word_res_it_to.forward ();
          word_index++;
          if (monitor != NULL) {
            monitor->ocr_alive = TRUE;
            monitor->progress = 90 + 5 * word_index / word_count;
          }
          while (!word_res_it_to.at_last () &&
                 (word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_NON) ||
                  word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_SP))) {
            if (check_debug_pt (word_res, 60))
              debug_fix_space_level.set_value (10);
            if (word_res->word->gblob_list ()->empty ())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward ();
          }
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          if (word_res->word->gblob_list ()->empty ())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp)
            /* Leave runs containing a blobless word untouched. */
            word_res_it_from = word_res_it_to;
          else {
            /* Detach the run, fix it, then splice the result back in and
               step past the words that were re-inserted. */
            fuzzy_space_words.assign_to_sublist (&word_res_it_from,
                                                 &word_res_it_to);
            fix_fuzzy_space_list (fuzzy_space_words,
                                  row_res_it.data ()->row);
            new_length = fuzzy_space_words.length ();
            word_res_it_from.add_list_before (&fuzzy_space_words);
            for (;
                 (!word_res_it_from.at_last () &&
                  (new_length > 0)); new_length--) {
              word_res_it_from.forward ();
            }
          }
          /* Restore the debug level raised by check_debug_pt above. */
          if (test_pt)
            debug_fix_space_level.set_value (0);
        }
        fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
        //Last word in row
      }
    }
  }
}
void fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, | |
ROW * | row | |||
) |
Go down and fix spaces, starting with the worst/biggest.
best_perm | to get scores | |
row | Row |
Definition at line 751 of file fixspace.cpp.
References break_noisiest_blob_word(), WERD_RES::combination, dump_words(), FALSE, fp_eval_word_spacing(), match_current_words(), PERFECT_WERDS, and TRUE.
Referenced by fix_sp_fp_word().
00751 { 00752 INT16 best_score; 00753 WERD_RES_IT best_perm_it(&best_perm); 00754 WERD_RES_LIST current_perm; 00755 WERD_RES_IT current_perm_it(¤t_perm); 00756 WERD_RES *old_word_res; 00757 WERD_RES *new_word_res; 00758 INT16 current_score; 00759 BOOL8 improved = FALSE; 00760 00761 best_score = fp_eval_word_spacing (best_perm); //default score 00762 00763 dump_words (best_perm, best_score, 1, improved); 00764 00765 new_word_res = new WERD_RES; 00766 old_word_res = best_perm_it.data (); 00767 old_word_res->combination = TRUE; //Kludge to force deep copy 00768 *new_word_res = *old_word_res; //deep copy 00769 old_word_res->combination = FALSE; //Undo kludge 00770 new_word_res->combination = FALSE; //Undo kludge 00771 current_perm_it.add_to_end (new_word_res); 00772 00773 break_noisiest_blob_word(current_perm); 00774 00775 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) { 00776 match_current_words(current_perm, row); 00777 current_score = fp_eval_word_spacing (current_perm); 00778 dump_words (current_perm, current_score, 2, improved); 00779 if (current_score > best_score) { 00780 best_perm.clear (); 00781 best_perm.deep_copy (¤t_perm); 00782 best_score = current_score; 00783 improved = TRUE; 00784 } 00785 if (current_score < PERFECT_WERDS) 00786 break_noisiest_blob_word(current_perm); 00787 } 00788 dump_words (best_perm, best_score, 3, improved); 00789 }
void fix_sp_fp_word | ( | WERD_RES_IT & | word_res_it, | |
ROW * | row | |||
) |
Test the current word to see if it can be split by deleting noise blobs.
word_res_it | Word results? | |
row |
Definition at line 708 of file fixspace.cpp.
References WERD_RES::best_choice, WERD_RES::combination, FALSE, fix_noisy_space_list(), WERD::flag(), gblob_sort_list(), WERD_RES::part_of_combo, WERD::rej_cblob_list(), tprintf(), W_DONT_CHOP, W_REP_CHAR, WERD_RES::word, and worst_noise_blob().
Referenced by fix_fuzzy_spaces().
00708 { 00709 WERD_RES *word_res; 00710 WERD_RES_LIST sub_word_list; 00711 WERD_RES_IT sub_word_list_it(&sub_word_list); 00712 INT16 blob_index; 00713 INT16 new_length; 00714 float junk; 00715 00716 word_res = word_res_it.data (); 00717 if (!fixsp_check_for_fp_noise_space || 00718 word_res->word->flag (W_REP_CHAR) || 00719 word_res->combination || 00720 word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP)) 00721 return; 00722 00723 blob_index = worst_noise_blob (word_res, &junk); 00724 if (blob_index < 0) 00725 return; 00726 00727 #ifndef SECURE_NAMES 00728 if (debug_fix_space_level > 1) { 00729 tprintf ("FP fixspace working on \"%s\"\n", 00730 word_res->best_choice->string ().string ()); 00731 } 00732 #endif 00733 gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE); 00734 sub_word_list_it.add_after_stay_put (word_res_it.extract ()); 00735 fix_noisy_space_list(sub_word_list, row); 00736 new_length = sub_word_list.length (); 00737 word_res_it.add_list_before (&sub_word_list); 00738 for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) { 00739 word_res_it.forward (); 00740 } 00741 }
void fixspace_dbg | ( | WERD_RES * | word | ) |
Dump out info for fixing spaces functions.
word | Word in question |
Definition at line 1006 of file fixspace.cpp.
References WERD_RES::best_choice, WERD::bounding_box(), debug_fp, WERD_RES::done, FALSE, REJMAP::full_print(), WERD::gblob_list(), WERD_RES::outword, BOX::print(), REJMAP::print(), WERD::rej_blob_list(), WERD_RES::reject_map, WERD_RES::tess_accepted, tprintf(), and WERD_RES::word.
01006 { 01007 BOX box = word->word->bounding_box (); 01008 BOOL8 show_map_detail = FALSE; 01009 INT16 i; 01010 01011 box.print (); 01012 #ifndef SECURE_NAMES 01013 tprintf (" \"%s\" ", word->best_choice->string ().string ()); 01014 tprintf ("Blob count: %d (word); %d/%d (outword)\n", 01015 word->word->gblob_list ()->length (), 01016 word->outword->gblob_list ()->length (), 01017 word->outword->rej_blob_list ()->length ()); 01018 word->reject_map.print (debug_fp); 01019 tprintf ("\n"); 01020 if (show_map_detail) { 01021 tprintf ("\"%s\"\n", word->best_choice->string ().string ()); 01022 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01023 tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]); 01024 word->reject_map[i].full_print (debug_fp); 01025 } 01026 } 01027 01028 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 01029 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 01030 #endif 01031 }
Use all the standard pass 2 conditions for mode 5 in set_done() BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT CARE WHETHER WE HAVE of/at on/an etc.
word | Word working on |
Definition at line 675 of file fixspace.cpp.
References WERD_RES::best_choice, WERD_RES::done, FALSE, FREQ_DAWG_PERM, NULL, NUMBER_PERM, REJMAP::reject_count(), WERD_RES::reject_map, SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, TRUE, and USER_DAWG_PERM.
Referenced by eval_word_spacing().
00675 { 00676 if (word->done) 00677 return TRUE; 00678 00679 if ((fixsp_done_mode > 0) && 00680 (word->tess_accepted || 00681 ((fixsp_done_mode == 2) && 00682 (word->reject_map.reject_count () == 0)) || 00683 (fixsp_done_mode == 3)) && 00684 (strchr (word->best_choice->string ().string (), ' ') == NULL) && 00685 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || 00686 (word->best_choice->permuter () == FREQ_DAWG_PERM) || 00687 (word->best_choice->permuter () == USER_DAWG_PERM) || 00688 (word->best_choice->permuter () == NUMBER_PERM))) 00689 return TRUE; 00690 else 00691 return FALSE; 00692 }
INT16 fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Evaluation function for fixed pitch word lists.
word_res_list | list of word results |
Definition at line 1045 of file fixspace.cpp.
References WERD_RES::best_choice, WERD::blob_list(), blob_noise_score(), WERD_RES::done, eval_word_spacing(), FREQ_DAWG_PERM, REJMAP::length(), WERD_RES::outword, WERD_RES::reject_map, safe_dict_word(), SYSTEM_DAWG_PERM, WERD_RES::tess_accepted, and USER_DAWG_PERM.
Referenced by fix_noisy_space_list().
01045 { 01046 WERD_RES_IT word_it(&word_res_list); 01047 WERD_RES *word; 01048 PBLOB_IT blob_it; 01049 INT16 word_length; 01050 INT16 score = 0; 01051 INT16 i; 01052 const char *chs; 01053 float small_limit = bln_x_height * fixsp_small_outlines_size; 01054 01055 if (!fixsp_fp_eval) 01056 return (eval_word_spacing (word_res_list)); 01057 01058 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 01059 word = word_it.data (); 01060 word_length = word->reject_map.length (); 01061 chs = word->best_choice->string ().string (); 01062 if ((word->done || 01063 word->tess_accepted) || 01064 (word->best_choice->permuter () == SYSTEM_DAWG_PERM) || 01065 (word->best_choice->permuter () == FREQ_DAWG_PERM) || 01066 (word->best_choice->permuter () == USER_DAWG_PERM) || 01067 (safe_dict_word (chs) > 0)) { 01068 blob_it.set_to_list (word->outword->blob_list ()); 01069 for (i = 0; i < word_length; i++, blob_it.forward ()) { 01070 if ((chs[i] == ' ') || 01071 (blob_noise_score (blob_it.data ()) < small_limit)) 01072 score -= 1; //penalise possibly erroneous non-space 01073 01074 else if (word->reject_map[i].accepted ()) 01075 score++; 01076 } 01077 } 01078 } 01079 if (score < 0) 01080 score = 0; 01081 return score; 01082 }
void initialise_search | ( | WERD_RES_LIST & | src_list, | |
WERD_RES_LIST & | new_list | |||
) |
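Copy every word of src_list that is not a combination into new_list, clearing the combination and part_of_combo flags on each copy.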
src_list | source list | |
new_list | new list |
Definition at line 218 of file fixspace.cpp.
References WERD_RES::combination, FALSE, and WERD_RES::part_of_combo.
Referenced by fix_fuzzy_space_list().
00218 { 00219 WERD_RES_IT src_it(&src_list); 00220 WERD_RES_IT new_it(&new_list); 00221 WERD_RES *src_wd; 00222 WERD_RES *new_wd; 00223 00224 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) { 00225 src_wd = src_it.data (); 00226 if (!src_wd->combination) { 00227 new_wd = new WERD_RES (*src_wd); 00228 new_wd->combination = FALSE; 00229 new_wd->part_of_combo = FALSE; 00230 new_it.add_after_then_move (new_wd); 00231 } 00232 } 00233 }
void match_current_words | ( | WERD_RES_LIST & | words, | |
ROW * | row | |||
) |
Runs classify_word_pass2() on every word in the list that is not part of a combo and does not yet have an outword.
words | list of words | |
row | Row |
Definition at line 244 of file fixspace.cpp.
References classify_word_pass2(), NULL, WERD_RES::outword, and WERD_RES::part_of_combo.
Referenced by fix_fuzzy_space_list(), and fix_noisy_space_list().
00244 { 00245 WERD_RES_IT word_it(&words); 00246 WERD_RES *word; 00247 00248 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 00249 word = word_it.data (); 00250 if ((!word->part_of_combo) && (word->outword == NULL)) 00251 classify_word_pass2(word, row); 00252 } 00253 }
void transform_to_next_perm | ( | WERD_RES_LIST & | words | ) |
Checks current word list to find smallest word gap size.
words | list of words |
The routine COULD be limited to stop it building words longer than N blobs.
If there are no more gaps then it DELETES the entire list and returns the empty list to cause termination.
Definition at line 433 of file fixspace.cpp.
References WERD_RES::best_choice, WERD::bounding_box(), WERD_RES::combination, WERD_RES::copy_on(), WERD_RES::done, FALSE, WERD::flag(), WERD::join_on(), BOX::left(), MAX_INT16, NULL, WERD_RES::outword, WERD_RES::part_of_combo, WERD_RES::raw_choice, BOX::right(), WERD::set_flag(), TRUE, W_EOL, and WERD_RES::word.
Referenced by fix_fuzzy_space_list().
// Body of transform_to_next_perm(): finds the smallest horizontal gap between
// consecutive words that are not part of a combo, then joins every pair of
// words separated by a gap of at most that size into a combination word.
// If no gap exists at all the list is cleared to signal termination.
{
  WERD_RES_IT word_it(&words);
  WERD_RES_IT prev_word_it(&words);
  WERD_RES *word;
  WERD_RES *prev_word;
  WERD_RES *combo;
  WERD *copy_word;
  INT16 prev_right = -1;         //right edge of previous word; -1 = none yet
  BOX box;
  INT16 gap;
  INT16 min_gap = MAX_INT16;

  /* Pass 1: find the smallest gap between successive non-combo-part words. */
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    if (!word->part_of_combo) {
      box = word->word->bounding_box ();
      if (prev_right >= 0) {
        gap = box.left () - prev_right;
        if (gap < min_gap)
          min_gap = gap;
      }
      prev_right = box.right ();
    }
  }
  if (min_gap < MAX_INT16) {
    /* Pass 2: join the words on either side of each minimal gap. */
    prev_right = -1;             //back to start
    word_it.set_to_list (&words);
    //cant use cycle pt due to inserted combos at start of list
    for (;
         (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
      word = word_it.data ();
      if (!word->part_of_combo) {
        box = word->word->bounding_box ();
        if (prev_right >= 0) {
          gap = box.left () - prev_right;
          if (gap <= min_gap) {
            prev_word = prev_word_it.data ();
            if (prev_word->combination)
              combo = prev_word; //extend the existing combination
            else {
              /* Make a new combination and insert before the first word being joined */
              copy_word = new WERD;
              *copy_word = *(prev_word->word);
              //deep copy
              combo = new WERD_RES (copy_word);
              combo->combination = TRUE;
              prev_word->part_of_combo = TRUE;
              prev_word_it.add_before_then_move (combo);
            }
            /* The combo takes its end-of-line flag from the word joined on. */
            combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
            if (word->combination) {
              combo->word->join_on (word->word);
              //Move blbs to combo
              //old combo no longer needed
              delete word_it.extract ();
            }
            else {
              //Copy current wd to combo
              combo->copy_on (word);
              word->part_of_combo = TRUE;
            }
            /* The combo has changed, so drop any previous results for it. */
            combo->done = FALSE;
            if (combo->outword != NULL) {
              delete combo->outword;
              delete combo->best_choice;
              delete combo->raw_choice;
              combo->outword = NULL;
              combo->best_choice = NULL;
              combo->raw_choice = NULL;
            }
          }
          else
            //catch up
            prev_word_it = word_it;
        }
        prev_right = box.right ();
      }
    }
  }
  else
    words.clear ();              //signal termination
}
Test if spacing of word is uniform/sensible.
word | Word of interest |
Definition at line 597 of file fixspace.cpp.
References STATS::add(), WERD_RES::best_choice, WERD::blob_list(), STRING::contains(), WERD_RES::denorm, STATS::get_total(), ROW::kern(), BOX::left(), MAX_INT16, MAXSPACING, STATS::mean(), STATS::median(), WERD_RES::outword, BOX::right(), DENORM::row(), ROW::space(), tprintf(), and ROW::x_height().
Referenced by eval_word_spacing().
// Body of uniformly_spaced(): decides whether the gaps between the blobs of
// word->outword look sensible. The largest gap is tracked separately from
// the rest (which go into gap_stats); the spacing is accepted when there are
// no other gaps, or the largest gap is within the row's normalised non-space
// limit, or it is at most twice the median (more than 2 other gaps) or twice
// the mean (2 or fewer) of the other gaps.
{
  PBLOB_IT blob_it;
  BOX box;
  INT16 prev_right = -MAX_INT16; //-MAX_INT16 = no previous blob yet
  INT16 gap;
  INT16 max_gap = -MAX_INT16;
  INT16 max_gap_count = 0;       //how many gaps equal max_gap
  STATS gap_stats (0, MAXSPACING);
  BOOL8 result;
  const ROW *row = word->denorm.row ();
  float max_non_space;
  float normalised_max_nonspace;
  INT16 i = 0;                   //index of current blob / character
  STRING punct_chars = "\"`',.:;";

  blob_it.set_to_list (word->outword->blob_list ());

  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
    box = blob_it.data ()->bounding_box ();
    /* Optionally ignore gaps that have punctuation on either side. */
    if ((prev_right > -MAX_INT16) &&
        (!fixsp_ignore_punct ||
         (!punct_chars.contains (word->best_choice->string ()[i - 1]) &&
          !punct_chars.contains (word->best_choice->string ()[i])))) {
      gap = box.left () - prev_right;
      if (gap < max_gap)
        gap_stats.add (gap, 1);
      else if (gap == max_gap)
        max_gap_count++;
      else {
        /* New maximum: the old maximum gaps become ordinary samples. */
        if (max_gap_count > 0)
          gap_stats.add (max_gap, max_gap_count);
        max_gap = gap;
        max_gap_count = 1;
      }
    }
    prev_right = box.right ();
    i++;
  }

  /* Limit weighted 3:1 towards the row's kern, rescaled by bln_x_height
     over the row's x-height. */
  max_non_space = (row->space () + 3 * row->kern ()) / 4;
  normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();

  result = ((gap_stats.get_total () == 0) ||
            (max_gap <= normalised_max_nonspace) ||
            ((gap_stats.get_total () > 2) &&
             (max_gap <= 2 * gap_stats.median ())) ||
            ((gap_stats.get_total () <= 2) &&
             (max_gap <= 2 * gap_stats.mean ())));
#ifndef SECURE_NAMES
  if ((debug_fix_space_level > 1)) {
    if (result)
      tprintf
        ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
         word->best_choice->string ().string (), normalised_max_nonspace,
         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
         gap_stats.median ());
    else
      tprintf
        ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
         word->best_choice->string ().string (), normalised_max_nonspace,
         max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
         gap_stats.median ());
  }
#endif

  return result;
}
Examine the blobs of a word and return the index of the worst (noisiest) one, or -1 if no blob qualifies.
word_res | Word results | |
worst_noise_score | Modified by this function |
Definition at line 880 of file fixspace.cpp.
References ASSERT_HOST, WERD_RES::best_choice, blob_count, WERD::blob_list(), blob_noise_score(), WERD_RES::outword, WERD_RES::reject_map, tprintf(), and worst_noise_blob().
Referenced by break_noisiest_blob_word(), fix_sp_fp_word(), and worst_noise_blob().
// Body of worst_noise_blob(): scores every blob of word_res->outword and
// returns the index of the lowest-scoring blob that lies between the first
// and last fixsp_non_noise_limit non-noise blobs and scores below
// small_limit. Returns -1 when the word has fewer than 5 blobs, when there
// are too few non-noise blobs at either end, or when no blob is small enough.
// *worst_noise_score is only written once the final selection stage is
// reached; it is left untouched on the earlier -1 returns.
{
  PBLOB_IT blob_it;
  INT16 blob_count;
  float noise_score[512];        //one score per blob; bound asserted below
  int i;
  int min_noise_blob;            //1st contender
  int max_noise_blob;            //last contender
  int non_noise_count;
  int worst_noise_blob;          //Worst blob
  float small_limit = bln_x_height * fixsp_small_outlines_size;
  float non_noise_limit = bln_x_height * 0.8;

  blob_it.set_to_list (word_res->outword->blob_list ());
  //normalised
  blob_count = blob_it.length ();
  ASSERT_HOST (blob_count <= 512);
  if (blob_count < 5)
    return -1;                   //too short to split
  /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf ("FP fixspace Noise metrics for \"%s\": ",
             word_res->best_choice->string ().string ());
#endif

  for (i = 0; i < blob_count; i++, blob_it.forward ()) {
    /* Accepted characters are given the non-noise score outright. */
    if (word_res->reject_map[i].accepted ())
      noise_score[i] = non_noise_limit;
    else
      noise_score[i] = blob_noise_score (blob_it.data ());

    if (debug_fix_space_level > 5)
      tprintf ("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf ("\n");

  /* Now find the worst one which is far enough away from the end of the word */

  /* Scan from the left until enough non-noise blobs have been passed. */
  non_noise_count = 0;
  for (i = 0;
       (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  min_noise_blob = i;

  /* Same again from the right. */
  non_noise_count = 0;
  for (i = blob_count - 1;
       (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  max_noise_blob = i;

  if (min_noise_blob > max_noise_blob)
    return -1;

  /* Pick the lowest score in range; only scores below small_limit qualify. */
  *worst_noise_score = small_limit;
  worst_noise_blob = -1;
  for (i = min_noise_blob; i <= max_noise_blob; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_noise_blob = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_noise_blob;
}