00001
00023 #include "mfcpch.h"
00024 #include <ctype.h>
00025 #include "reject.h"
00026 #include "statistc.h"
00027 #include "genblob.h"
00028 #include "control.h"
00029 #include "fixspace.h"
00030 #include "tessvars.h"
00031 #include "tessbox.h"
00032 #include "secname.h"
00033 #ifdef TEXT_VERBOSE
00034 #include "callcpp.h"
00035 #endif
00036
00037
00041 #define EXTERN
00042
00043 EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
00044 "Try turning noise to space in fixed pitch");
00045 EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
00046 EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
00047 EXTERN INT_VAR (fixsp_non_noise_limit, 1,
00048 "How many non-noise blbs either side?");
00049 EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
00050
00051 EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
00052 EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
00053 EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
00054 EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
00055 "Limit context word spacing");
00056 EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
00057 "Reward punctation joins");
00058 EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
00059 EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
00060 EXTERN STRING_VAR (numeric_punctuation, ".,",
00061 "Punct. chs expected WITHIN numbers");
00064 #define PERFECT_WERDS 999
00065
00066 #define MAXSPACING 128
00067
00079 void fix_fuzzy_spaces(
00080 volatile ETEXT_DESC *monitor,
00081 INT32 word_count,
00082 PAGE_RES *page_res) {
00083 BLOCK_RES_IT block_res_it;
00084 ROW_RES_IT row_res_it;
00085 WERD_RES_IT word_res_it_from;
00086 WERD_RES_IT word_res_it_to;
00087 WERD_RES *word_res;
00088 WERD_RES_LIST fuzzy_space_words;
00089 INT16 new_length;
00090 BOOL8 prevent_null_wd_fixsp;
00091 INT32 word_index;
00092
00093 block_res_it.set_to_list (&page_res->block_res_list);
00094 word_index = 0;
00095 for (block_res_it.mark_cycle_pt ();
00096 !block_res_it.cycled_list (); block_res_it.forward ()) {
00097 row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
00098 for (row_res_it.mark_cycle_pt ();
00099 !row_res_it.cycled_list (); row_res_it.forward ()) {
00100 word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
00101 while (!word_res_it_from.at_last ()) {
00102 word_res = word_res_it_from.data ();
00103 while (!word_res_it_from.at_last () &&
00104 !(word_res->combination ||
00105 word_res_it_from.data_relative (1)->
00106 word->flag (W_FUZZY_NON) ||
00107 word_res_it_from.data_relative (1)->
00108 word->flag (W_FUZZY_SP))) {
00109 fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00110 word_res = word_res_it_from.forward ();
00111 word_index++;
00112 if (monitor != NULL) {
00113 monitor->ocr_alive = TRUE;
00114 monitor->progress = 90 + 5 * word_index / word_count;
00115 }
00116 }
00117
00118 if (!word_res_it_from.at_last ()) {
00119 word_res_it_to = word_res_it_from;
00120 prevent_null_wd_fixsp =
00121 word_res->word->gblob_list ()->empty ();
00122 if (check_debug_pt (word_res, 60))
00123 debug_fix_space_level.set_value (10);
00124 word_res_it_to.forward ();
00125 word_index++;
00126 if (monitor != NULL) {
00127 monitor->ocr_alive = TRUE;
00128 monitor->progress = 90 + 5 * word_index / word_count;
00129 }
00130 while (!word_res_it_to.at_last () &&
00131 (word_res_it_to.data_relative (1)->
00132 word->flag (W_FUZZY_NON) ||
00133 word_res_it_to.data_relative (1)->
00134 word->flag (W_FUZZY_SP))) {
00135 if (check_debug_pt (word_res, 60))
00136 debug_fix_space_level.set_value (10);
00137 if (word_res->word->gblob_list ()->empty ())
00138 prevent_null_wd_fixsp = TRUE;
00139 word_res = word_res_it_to.forward ();
00140 }
00141 if (check_debug_pt (word_res, 60))
00142 debug_fix_space_level.set_value (10);
00143 if (word_res->word->gblob_list ()->empty ())
00144 prevent_null_wd_fixsp = TRUE;
00145 if (prevent_null_wd_fixsp)
00146 word_res_it_from = word_res_it_to;
00147 else {
00148 fuzzy_space_words.assign_to_sublist (&word_res_it_from,
00149 &word_res_it_to);
00150 fix_fuzzy_space_list (fuzzy_space_words,
00151 row_res_it.data ()->row);
00152 new_length = fuzzy_space_words.length ();
00153 word_res_it_from.add_list_before (&fuzzy_space_words);
00154 for (;
00155 (!word_res_it_from.at_last () &&
00156 (new_length > 0)); new_length--) {
00157 word_res_it_from.forward ();
00158 }
00159 }
00160 if (test_pt)
00161 debug_fix_space_level.set_value (0);
00162 }
00163 fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
00164
00165 }
00166 }
00167 }
00168 }
00169
00170
00178 void fix_fuzzy_space_list(
00179 WERD_RES_LIST &best_perm,
00180 ROW *row) {
00181 INT16 best_score;
00182 WERD_RES_LIST current_perm;
00183 INT16 current_score;
00184 BOOL8 improved = FALSE;
00185
00186
00187 best_score = eval_word_spacing (best_perm);
00188
00189 dump_words (best_perm, best_score, 1, improved);
00190
00191 if (best_score != PERFECT_WERDS)
00192 initialise_search(best_perm, current_perm);
00193
00194 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00195 match_current_words(current_perm, row);
00196 current_score = eval_word_spacing (current_perm);
00197 dump_words (current_perm, current_score, 2, improved);
00198 if (current_score > best_score) {
00199 best_perm.clear ();
00200 best_perm.deep_copy (¤t_perm);
00201 best_score = current_score;
00202 improved = TRUE;
00203 }
00204 if (current_score < PERFECT_WERDS)
00205 transform_to_next_perm(current_perm);
00206 }
00207 dump_words (best_perm, best_score, 3, improved);
00208 }
00209
00210
00218 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
00219 WERD_RES_IT src_it(&src_list);
00220 WERD_RES_IT new_it(&new_list);
00221 WERD_RES *src_wd;
00222 WERD_RES *new_wd;
00223
00224 for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
00225 src_wd = src_it.data ();
00226 if (!src_wd->combination) {
00227 new_wd = new WERD_RES (*src_wd);
00228 new_wd->combination = FALSE;
00229 new_wd->part_of_combo = FALSE;
00230 new_it.add_after_then_move (new_wd);
00231 }
00232 }
00233 }
00234
00235
00244 void match_current_words(WERD_RES_LIST &words, ROW *row) {
00245 WERD_RES_IT word_it(&words);
00246 WERD_RES *word;
00247
00248 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00249 word = word_it.data ();
00250 if ((!word->part_of_combo) && (word->outword == NULL))
00251 classify_word_pass2(word, row);
00252 }
00253 }
00254
00255
00285 INT16 eval_word_spacing(WERD_RES_LIST &word_res_list) {
00286 WERD_RES_IT word_res_it(&word_res_list);
00287 INT16 total_score = 0;
00288 INT16 word_count = 0;
00289 INT16 done_word_count = 0;
00290 INT16 word_len;
00291 INT16 i;
00292 WERD_RES *word;
00293 INT16 prev_word_score = 0;
00294 BOOL8 prev_word_done = FALSE;
00295 BOOL8 prev_char_1 = FALSE;
00296 BOOL8 prev_char_digit = FALSE;
00297 BOOL8 current_char_1 = FALSE;
00298 BOOL8 current_word_ok_so_far;
00299 STRING punct_chars = "!\"`',.:;";
00300 BOOL8 prev_char_punct = FALSE;
00301 BOOL8 current_char_punct = FALSE;
00302 BOOL8 word_done = FALSE;
00303
00304 #ifdef TEXT_VERBOSE
00305
00306 cprintf("z");
00307 #endif
00308 do {
00309 word = word_res_it.data ();
00310 word_done = fixspace_thinks_word_done (word);
00311 word_count++;
00312 if (word->tess_failed) {
00313 total_score += prev_word_score;
00314 if (prev_word_done)
00315 done_word_count++;
00316 prev_word_score = 0;
00317 prev_char_1 = FALSE;
00318 prev_char_digit = FALSE;
00319 prev_word_done = FALSE;
00320 }
00321 else {
00327 word_len = word->reject_map.length ();
00328 current_word_ok_so_far = FALSE;
00329 if (!((prev_char_1 &&
00330 digit_or_numeric_punct (word,
00331 word->best_choice->string ()[0])) ||
00332 (prev_char_digit &&
00333 ((word_done &&
00334 (word->best_choice->string ()[0] == '1')) ||
00335 (!word_done &&
00336 STRING (conflict_set_I_l_1).contains (word->best_choice->
00337 string ()[0])))))) {
00338 total_score += prev_word_score;
00339 if (prev_word_done)
00340 done_word_count++;
00341 current_word_ok_so_far = word_done;
00342 }
00343
00344 if ((current_word_ok_so_far) &&
00345 (!tessedit_test_uniform_wd_spacing ||
00346 ((word->best_choice->permuter ()==NUMBER_PERM)||uniformly_spaced (word)))) {
00347 prev_word_done = TRUE;
00348 prev_word_score = word_len;
00349 }
00350 else {
00351 prev_word_done = FALSE;
00352 prev_word_score = 0;
00353 }
00354
00355 if (fixsp_prefer_joined_1s) {
00356
00357
00358 for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
00359 current_char_1 = word->best_choice->string ()[i] == '1';
00360 if (prev_char_1 || (current_char_1 && (i > 0)))
00361 total_score++;
00362 prev_char_1 = current_char_1;
00363 }
00364 }
00365
00366
00367
00368 if (tessedit_prefer_joined_punct) {
00369 for (i = 0, prev_char_punct = FALSE; i < word_len; i++) {
00370 current_char_punct =
00371 punct_chars.contains (word->best_choice->string ()[i]);
00372 if (prev_char_punct || (current_char_punct && (i > 0)))
00373 total_score++;
00374 prev_char_punct = current_char_punct;
00375 }
00376 }
00377 prev_char_digit = digit_or_numeric_punct (word,
00378 word->best_choice->
00379 string ()[word_len - 1]);
00380 prev_char_1 =
00381 ((word_done
00382 && (word->best_choice->string ()[word_len - 1] == '1'))
00383 || (!word_done
00384 && STRING (conflict_set_I_l_1).contains (word->best_choice->
00385 string ()[word_len -
00386 1])));
00387 }
00388
00389 do
00390 word_res_it.forward ();
00391 while (word_res_it.data ()->part_of_combo);
00392 }
00393 while (!word_res_it.at_first ());
00394 total_score += prev_word_score;
00395 if (prev_word_done)
00396 done_word_count++;
00397 if (done_word_count == word_count)
00398 return PERFECT_WERDS;
00399 else
00400 return total_score;
00401 }
00402
00403
00411 BOOL8 digit_or_numeric_punct(WERD_RES *word, char ch) {
00412 return (isdigit (ch) ||
00413 (fixsp_numeric_fix &&
00414 (word->best_choice->permuter () == NUMBER_PERM) &&
00415 STRING (numeric_punctuation).contains (ch)));
00416 }
00417
00418
00433 void transform_to_next_perm(WERD_RES_LIST &words) {
00434 WERD_RES_IT word_it(&words);
00435 WERD_RES_IT prev_word_it(&words);
00436 WERD_RES *word;
00437 WERD_RES *prev_word;
00438 WERD_RES *combo;
00439 WERD *copy_word;
00440 INT16 prev_right = -1;
00441 BOX box;
00442 INT16 gap;
00443 INT16 min_gap = MAX_INT16;
00444
00445 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00446 word = word_it.data ();
00447 if (!word->part_of_combo) {
00448 box = word->word->bounding_box ();
00449 if (prev_right >= 0) {
00450 gap = box.left () - prev_right;
00451 if (gap < min_gap)
00452 min_gap = gap;
00453 }
00454 prev_right = box.right ();
00455 }
00456 }
00457 if (min_gap < MAX_INT16) {
00458 prev_right = -1;
00459 word_it.set_to_list (&words);
00460 for (;
00461 (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
00462 word = word_it.data ();
00463 if (!word->part_of_combo) {
00464 box = word->word->bounding_box ();
00465 if (prev_right >= 0) {
00466 gap = box.left () - prev_right;
00467 if (gap <= min_gap) {
00468 prev_word = prev_word_it.data ();
00469 if (prev_word->combination)
00470 combo = prev_word;
00471 else {
00472
00473 copy_word = new WERD;
00474 *copy_word = *(prev_word->word);
00475
00476 combo = new WERD_RES (copy_word);
00477 combo->combination = TRUE;
00478 prev_word->part_of_combo = TRUE;
00479 prev_word_it.add_before_then_move (combo);
00480 }
00481 combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
00482 if (word->combination) {
00483 combo->word->join_on (word->word);
00484
00485
00486 delete word_it.extract ();
00487 }
00488 else {
00489
00490 combo->copy_on (word);
00491 word->part_of_combo = TRUE;
00492 }
00493 combo->done = FALSE;
00494 if (combo->outword != NULL) {
00495 delete combo->outword;
00496 delete combo->best_choice;
00497 delete combo->raw_choice;
00498 combo->outword = NULL;
00499 combo->best_choice = NULL;
00500 combo->raw_choice = NULL;
00501 }
00502 }
00503 else
00504
00505 prev_word_it = word_it;
00506 }
00507 prev_right = box.right ();
00508 }
00509 }
00510 }
00511 else
00512 words.clear ();
00513 }
00514
00515
00526 void dump_words(WERD_RES_LIST &perm, INT16 score, INT16 mode, BOOL8 improved) {
00527 WERD_RES_IT word_res_it(&perm);
00528 static STRING initial_str;
00529
00530 if (debug_fix_space_level > 0) {
00531 if (mode == 1) {
00532 initial_str = "";
00533 for (word_res_it.mark_cycle_pt ();
00534 !word_res_it.cycled_list (); word_res_it.forward ()) {
00535 if (!word_res_it.data ()->part_of_combo) {
00536 initial_str += word_res_it.data ()->best_choice->string ();
00537 initial_str += ' ';
00538 }
00539 }
00540 }
00541
00542 #ifndef SECURE_NAMES
00543 if (debug_fix_space_level > 1) {
00544 switch (mode) {
00545 case 1:
00546 tprintf ("EXTRACTED (%d): \"", score);
00547 break;
00548 case 2:
00549 tprintf ("TESTED (%d): \"", score);
00550 break;
00551 case 3:
00552 tprintf ("RETURNED (%d): \"", score);
00553 break;
00554 }
00555
00556 for (word_res_it.mark_cycle_pt ();
00557 !word_res_it.cycled_list (); word_res_it.forward ()) {
00558 if (!word_res_it.data ()->part_of_combo)
00559 tprintf ("%s/%1d ",
00560 word_res_it.data ()->best_choice->string ().
00561 string (),
00562 (int) word_res_it.data ()->best_choice->permuter ());
00563 }
00564 tprintf ("\"\n");
00565 }
00566 else if (improved) {
00567 tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
00568 for (word_res_it.mark_cycle_pt ();
00569 !word_res_it.cycled_list (); word_res_it.forward ()) {
00570 if (!word_res_it.data ()->part_of_combo)
00571 tprintf ("%s/%1d ",
00572 word_res_it.data ()->best_choice->string ().
00573 string (),
00574 (int) word_res_it.data ()->best_choice->permuter ());
00575 }
00576 tprintf ("\"\n");
00577 }
00578 #endif
00579 }
00580 }
00581
00582
00597 BOOL8 uniformly_spaced(
00598 WERD_RES *word) {
00599 PBLOB_IT blob_it;
00600 BOX box;
00601 INT16 prev_right = -MAX_INT16;
00602 INT16 gap;
00603 INT16 max_gap = -MAX_INT16;
00604 INT16 max_gap_count = 0;
00605 STATS gap_stats (0, MAXSPACING);
00606 BOOL8 result;
00607 const ROW *row = word->denorm.row ();
00608 float max_non_space;
00609 float normalised_max_nonspace;
00610 INT16 i = 0;
00611 STRING punct_chars = "\"`',.:;";
00612
00613 blob_it.set_to_list (word->outword->blob_list ());
00614
00615 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
00616 box = blob_it.data ()->bounding_box ();
00617 if ((prev_right > -MAX_INT16) &&
00618 (!fixsp_ignore_punct ||
00619 (!punct_chars.contains (word->best_choice->string ()[i - 1]) &&
00620 !punct_chars.contains (word->best_choice->string ()[i])))) {
00621 gap = box.left () - prev_right;
00622 if (gap < max_gap)
00623 gap_stats.add (gap, 1);
00624 else if (gap == max_gap)
00625 max_gap_count++;
00626 else {
00627 if (max_gap_count > 0)
00628 gap_stats.add (max_gap, max_gap_count);
00629 max_gap = gap;
00630 max_gap_count = 1;
00631 }
00632 }
00633 prev_right = box.right ();
00634 i++;
00635 }
00636
00637 max_non_space = (row->space () + 3 * row->kern ()) / 4;
00638 normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
00639
00640 result = ((gap_stats.get_total () == 0) ||
00641 (max_gap <= normalised_max_nonspace) ||
00642 ((gap_stats.get_total () > 2) &&
00643 (max_gap <= 2 * gap_stats.median ())) ||
00644 ((gap_stats.get_total () <= 2) &&
00645 (max_gap <= 2 * gap_stats.mean ())));
00646 #ifndef SECURE_NAMES
00647 if ((debug_fix_space_level > 1)) {
00648 if (result)
00649 tprintf
00650 ("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00651 word->best_choice->string ().string (), normalised_max_nonspace,
00652 max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00653 gap_stats.median ());
00654 else
00655 tprintf
00656 ("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
00657 word->best_choice->string ().string (), normalised_max_nonspace,
00658 max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
00659 gap_stats.median ());
00660 }
00661 #endif
00662
00663 return result;
00664 }
00665
00666
00675 BOOL8 fixspace_thinks_word_done(WERD_RES *word) {
00676 if (word->done)
00677 return TRUE;
00678
00679 if ((fixsp_done_mode > 0) &&
00680 (word->tess_accepted ||
00681 ((fixsp_done_mode == 2) &&
00682 (word->reject_map.reject_count () == 0)) ||
00683 (fixsp_done_mode == 3)) &&
00684 (strchr (word->best_choice->string ().string (), ' ') == NULL) &&
00685 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00686 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00687 (word->best_choice->permuter () == USER_DAWG_PERM) ||
00688 (word->best_choice->permuter () == NUMBER_PERM)))
00689 return TRUE;
00690 else
00691 return FALSE;
00692 }
00693
00694
00708 void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row) {
00709 WERD_RES *word_res;
00710 WERD_RES_LIST sub_word_list;
00711 WERD_RES_IT sub_word_list_it(&sub_word_list);
00712 INT16 blob_index;
00713 INT16 new_length;
00714 float junk;
00715
00716 word_res = word_res_it.data ();
00717 if (!fixsp_check_for_fp_noise_space ||
00718 word_res->word->flag (W_REP_CHAR) ||
00719 word_res->combination ||
00720 word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
00721 return;
00722
00723 blob_index = worst_noise_blob (word_res, &junk);
00724 if (blob_index < 0)
00725 return;
00726
00727 #ifndef SECURE_NAMES
00728 if (debug_fix_space_level > 1) {
00729 tprintf ("FP fixspace working on \"%s\"\n",
00730 word_res->best_choice->string ().string ());
00731 }
00732 #endif
00733 gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
00734 sub_word_list_it.add_after_stay_put (word_res_it.extract ());
00735 fix_noisy_space_list(sub_word_list, row);
00736 new_length = sub_word_list.length ();
00737 word_res_it.add_list_before (&sub_word_list);
00738 for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
00739 word_res_it.forward ();
00740 }
00741 }
00742
00743
00751 void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row) {
00752 INT16 best_score;
00753 WERD_RES_IT best_perm_it(&best_perm);
00754 WERD_RES_LIST current_perm;
00755 WERD_RES_IT current_perm_it(¤t_perm);
00756 WERD_RES *old_word_res;
00757 WERD_RES *new_word_res;
00758 INT16 current_score;
00759 BOOL8 improved = FALSE;
00760
00761 best_score = fp_eval_word_spacing (best_perm);
00762
00763 dump_words (best_perm, best_score, 1, improved);
00764
00765 new_word_res = new WERD_RES;
00766 old_word_res = best_perm_it.data ();
00767 old_word_res->combination = TRUE;
00768 *new_word_res = *old_word_res;
00769 old_word_res->combination = FALSE;
00770 new_word_res->combination = FALSE;
00771 current_perm_it.add_to_end (new_word_res);
00772
00773 break_noisiest_blob_word(current_perm);
00774
00775 while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
00776 match_current_words(current_perm, row);
00777 current_score = fp_eval_word_spacing (current_perm);
00778 dump_words (current_perm, current_score, 2, improved);
00779 if (current_score > best_score) {
00780 best_perm.clear ();
00781 best_perm.deep_copy (¤t_perm);
00782 best_score = current_score;
00783 improved = TRUE;
00784 }
00785 if (current_score < PERFECT_WERDS)
00786 break_noisiest_blob_word(current_perm);
00787 }
00788 dump_words (best_perm, best_score, 3, improved);
00789 }
00790
00791
00800 void break_noisiest_blob_word(WERD_RES_LIST &words) {
00801 WERD_RES_IT word_it(&words);
00802 WERD_RES_IT worst_word_it;
00803 float worst_noise_score = 9999;
00804 int worst_blob_index = -1;
00805 int blob_index;
00806 float noise_score;
00807 WERD_RES *word_res;
00808 C_BLOB_IT blob_it;
00809 C_BLOB_IT rej_cblob_it;
00810 C_BLOB_LIST new_blob_list;
00811 C_BLOB_IT new_blob_it;
00812 C_BLOB_IT new_rej_cblob_it;
00813 WERD *new_word;
00814 INT16 start_of_noise_blob;
00815 INT16 i;
00816
00817 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
00818 blob_index = worst_noise_blob (word_it.data (), &noise_score);
00819 if ((blob_index > -1) && (worst_noise_score > noise_score)) {
00820 worst_noise_score = noise_score;
00821 worst_blob_index = blob_index;
00822 worst_word_it = word_it;
00823 }
00824 }
00825 if (worst_blob_index < 0) {
00826 words.clear ();
00827 return;
00828 }
00829
00830
00831 word_res = worst_word_it.data ();
00832
00833
00834 new_blob_it.set_to_list (&new_blob_list);
00835 blob_it.set_to_list (word_res->word->cblob_list ());
00836 for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
00837 new_blob_it.add_after_then_move (blob_it.extract ());
00838 }
00839 start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
00840 delete blob_it.extract ();
00841
00842 new_word = new WERD (&new_blob_list, word_res->word);
00843 new_word->set_flag (W_EOL, FALSE);
00844 word_res->word->set_flag (W_BOL, FALSE);
00845 word_res->word->set_blanks (1);
00846
00847 new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
00848 rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
00849 for (;
00850 (!rej_cblob_it.empty () &&
00851 (rej_cblob_it.data ()->bounding_box ().left () <
00852 start_of_noise_blob)); rej_cblob_it.forward ()) {
00853 new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
00854 }
00855
00856 worst_word_it.add_before_then_move (new WERD_RES (new_word));
00857
00858 word_res->done = FALSE;
00859 if (word_res->outword != NULL) {
00860 delete word_res->outword;
00861 delete word_res->best_choice;
00862 delete word_res->raw_choice;
00863 word_res->outword = NULL;
00864 word_res->best_choice = NULL;
00865 word_res->raw_choice = NULL;
00866 }
00867 }
00868
00869
00880 INT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
00881 PBLOB_IT blob_it;
00882 INT16 blob_count;
00883 float noise_score[512];
00884 int i;
00885 int min_noise_blob;
00886 int max_noise_blob;
00887 int non_noise_count;
00888 int worst_noise_blob;
00889 float small_limit = bln_x_height * fixsp_small_outlines_size;
00890 float non_noise_limit = bln_x_height * 0.8;
00891
00892 blob_it.set_to_list (word_res->outword->blob_list ());
00893
00894 blob_count = blob_it.length ();
00895 ASSERT_HOST (blob_count <= 512);
00896 if (blob_count < 5)
00897 return -1;
00898
00899
00900 #ifndef SECURE_NAMES
00901 if (debug_fix_space_level > 5)
00902 tprintf ("FP fixspace Noise metrics for \"%s\": ",
00903 word_res->best_choice->string ().string ());
00904 #endif
00905
00906 for (i = 0; i < blob_count; i++, blob_it.forward ()) {
00907 if (word_res->reject_map[i].accepted ())
00908 noise_score[i] = non_noise_limit;
00909 else
00910 noise_score[i] = blob_noise_score (blob_it.data ());
00911
00912 if (debug_fix_space_level > 5)
00913 tprintf ("%1.1f ", noise_score[i]);
00914 }
00915 if (debug_fix_space_level > 5)
00916 tprintf ("\n");
00917
00918
00919
00920 non_noise_count = 0;
00921 for (i = 0;
00922 (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) {
00923 if (noise_score[i] >= non_noise_limit)
00924 non_noise_count++;
00925 }
00926 if (non_noise_count < fixsp_non_noise_limit)
00927 return -1;
00928 min_noise_blob = i;
00929
00930 non_noise_count = 0;
00931 for (i = blob_count - 1;
00932 (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) {
00933 if (noise_score[i] >= non_noise_limit)
00934 non_noise_count++;
00935 }
00936 if (non_noise_count < fixsp_non_noise_limit)
00937 return -1;
00938 max_noise_blob = i;
00939
00940 if (min_noise_blob > max_noise_blob)
00941 return -1;
00942
00943 *worst_noise_score = small_limit;
00944 worst_noise_blob = -1;
00945 for (i = min_noise_blob; i <= max_noise_blob; i++) {
00946 if (noise_score[i] < *worst_noise_score) {
00947 worst_noise_blob = i;
00948 *worst_noise_score = noise_score[i];
00949 }
00950 }
00951 return worst_noise_blob;
00952 }
00953
00954
00963 float blob_noise_score(PBLOB *blob) {
00964 OUTLINE_IT outline_it;
00965 BOX box;
00966 INT16 outline_count = 0;
00967 INT16 max_dimension;
00968 INT16 largest_outline_dimension = 0;
00969
00970 outline_it.set_to_list (blob->out_list ());
00971 for (outline_it.mark_cycle_pt ();
00972 !outline_it.cycled_list (); outline_it.forward ()) {
00973 outline_count++;
00974 box = outline_it.data ()->bounding_box ();
00975 if (box.height () > box.width ())
00976 max_dimension = box.height ();
00977 else
00978 max_dimension = box.width ();
00979
00980 if (largest_outline_dimension < max_dimension)
00981 largest_outline_dimension = max_dimension;
00982 }
00983
00984 if (fixsp_noise_score_fixing) {
00985 if (outline_count > 5)
00986 largest_outline_dimension *= 2;
00987
00988 box = blob->bounding_box ();
00989
00990 if ((box.bottom () > bln_baseline_offset * 4) ||
00991 (box.top () < bln_baseline_offset / 2))
00992 largest_outline_dimension /= 2;
00993 }
00994 return largest_outline_dimension;
00995 }
00996
00997
01006 void fixspace_dbg(WERD_RES *word) {
01007 BOX box = word->word->bounding_box ();
01008 BOOL8 show_map_detail = FALSE;
01009 INT16 i;
01010
01011 box.print ();
01012 #ifndef SECURE_NAMES
01013 tprintf (" \"%s\" ", word->best_choice->string ().string ());
01014 tprintf ("Blob count: %d (word); %d/%d (outword)\n",
01015 word->word->gblob_list ()->length (),
01016 word->outword->gblob_list ()->length (),
01017 word->outword->rej_blob_list ()->length ());
01018 word->reject_map.print (debug_fp);
01019 tprintf ("\n");
01020 if (show_map_detail) {
01021 tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01022 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01023 tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01024 word->reject_map[i].full_print (debug_fp);
01025 }
01026 }
01027
01028 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01029 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01030 #endif
01031 }
01032
01033
01045 INT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
01046 WERD_RES_IT word_it(&word_res_list);
01047 WERD_RES *word;
01048 PBLOB_IT blob_it;
01049 INT16 word_length;
01050 INT16 score = 0;
01051 INT16 i;
01052 const char *chs;
01053 float small_limit = bln_x_height * fixsp_small_outlines_size;
01054
01055 if (!fixsp_fp_eval)
01056 return (eval_word_spacing (word_res_list));
01057
01058 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01059 word = word_it.data ();
01060 word_length = word->reject_map.length ();
01061 chs = word->best_choice->string ().string ();
01062 if ((word->done ||
01063 word->tess_accepted) ||
01064 (word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01065 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01066 (word->best_choice->permuter () == USER_DAWG_PERM) ||
01067 (safe_dict_word (chs) > 0)) {
01068 blob_it.set_to_list (word->outword->blob_list ());
01069 for (i = 0; i < word_length; i++, blob_it.forward ()) {
01070 if ((chs[i] == ' ') ||
01071 (blob_noise_score (blob_it.data ()) < small_limit))
01072 score -= 1;
01073
01074 else if (word->reject_map[i].accepted ())
01075 score++;
01076 }
01077 }
01078 }
01079 if (score < 0)
01080 score = 0;
01081 return score;
01082 }