00001
00029 #include "mfcpch.h"
00030 #include "tessvars.h"
00031 #ifdef __UNIX__
00032 #include <assert.h>
00033 #include <errno.h>
00034 #endif
00035 #include "scanutils.h"
00036 #include <ctype.h>
00037 #include <string.h>
00038
00039 #include "memry.h"
00040 #include "reject.h"
00041 #include "tfacep.h"
00042 #include "mainblk.h"
00043 #include "charcut.h"
00044 #include "imgs.h"
00045 #include "scaleimg.h"
00046 #include "control.h"
00047 #include "docqual.h"
00048 #include "secname.h"
00049
00050
00051
00052
00053 #include "callnet.h"
00054
00055
00056 #include "notdll.h"
00057
00058 CLISTIZEH (STRING) CLISTIZE (STRING)
00059 #define EXTERN
00060
00062 EXTERN
00063 INT_VAR (tessedit_reject_mode, 0, "Rejection algorithm");
00064 EXTERN
00065 INT_VAR (tessedit_ok_mode, 5, "Acceptance decision algorithm");
00066 EXTERN
00067 BOOL_VAR (tessedit_use_nn, FALSE, "");
00068 EXTERN
00069 BOOL_VAR (tessedit_rejection_debug, FALSE, "Adaption debug");
00070 EXTERN
00071 BOOL_VAR (tessedit_rejection_stats, FALSE, "Show NN stats");
00072 EXTERN
00073 BOOL_VAR (tessedit_flip_0O, TRUE, "Contextual 0O O0 flips");
00074 EXTERN
00075 double_VAR (tessedit_lower_flip_hyphen, 1.5,
00076 "Aspect ratio dot/hyphen test");
00077 EXTERN
00078 double_VAR (tessedit_upper_flip_hyphen, 1.8,
00079 "Aspect ratio dot/hyphen test");
00080
00081 EXTERN
00082 BOOL_VAR (rej_trust_doc_dawg, FALSE,
00083 "Use DOC dawg in 11l conf. detector");
00084 EXTERN
00085 BOOL_VAR (rej_1Il_use_dict_word, FALSE, "Use dictword test");
00086 EXTERN
00087 BOOL_VAR (rej_1Il_trust_permuter_type, TRUE, "Dont double check");
00088
00089 EXTERN
00090 BOOL_VAR (one_ell_conflict_default, TRUE, "one_ell_conflict default");
00091 EXTERN
00092 BOOL_VAR (show_char_clipping, FALSE, "Show clip image window?");
00093 EXTERN
00094 BOOL_VAR (nn_debug, FALSE, "NN DEBUGGING?");
00095 EXTERN
00096 BOOL_VAR (nn_reject_debug, FALSE, "NN DEBUG each char?");
00097 EXTERN
00098 BOOL_VAR (nn_lax, FALSE, "Use 2nd rate matches");
00099 EXTERN
00100 BOOL_VAR (nn_double_check_dict, FALSE, "Double check");
00101 EXTERN
00102 BOOL_VAR (nn_conf_double_check_dict, TRUE,
00103 "Double check for confusions");
00104 EXTERN
00105 BOOL_VAR (nn_conf_1Il, TRUE, "NN use 1Il conflicts");
00106 EXTERN
00107 BOOL_VAR (nn_conf_Ss, TRUE, "NN use Ss conflicts");
00108 EXTERN
00109 BOOL_VAR (nn_conf_hyphen, TRUE, "NN hyphen conflicts");
00110 EXTERN
00111 BOOL_VAR (nn_conf_test_good_qual, FALSE, "NN dodgy 1Il cross check");
00112 EXTERN
00113 BOOL_VAR (nn_conf_test_dict, TRUE, "NN dodgy 1Il cross check");
00114 EXTERN
00115 BOOL_VAR (nn_conf_test_sensible, TRUE, "NN dodgy 1Il cross check");
00116 EXTERN
00117 BOOL_VAR (nn_conf_strict_on_dodgy_chs, TRUE,
00118 "Require stronger NN match");
00119 EXTERN
00120 double_VAR (nn_dodgy_char_threshold, 0.99, "min accept score");
00121 EXTERN
00122 INT_VAR (nn_conf_accept_level, 4, "NN accept dodgy 1Il matches? ");
00123 EXTERN
00124 INT_VAR (nn_conf_initial_i_level, 3,
00125 "NN accept initial Ii match level ");
00126
00127 EXTERN
00128 BOOL_VAR (no_unrej_dubious_chars, TRUE, "Dubious chars next to reject?");
00129 EXTERN
00130 BOOL_VAR (no_unrej_no_alphanum_wds, TRUE, "Stop unrej of non A/N wds?");
00131 EXTERN
00132 BOOL_VAR (no_unrej_1Il, FALSE, "Stop unrej of 1Ilchars?");
00133 EXTERN
00134 BOOL_VAR (rej_use_tess_accepted, TRUE, "Individual rejection control");
00135 EXTERN
00136 BOOL_VAR (rej_use_tess_blanks, TRUE, "Individual rejection control");
00137 EXTERN
00138 BOOL_VAR (rej_use_good_perm, TRUE, "Individual rejection control");
00139 EXTERN
00140 BOOL_VAR (rej_use_sensible_wd, FALSE, "Extend permuter check");
00141 EXTERN
00142 BOOL_VAR (rej_alphas_in_number_perm, FALSE, "Extend permuter check");
00143
00144 EXTERN
00145 double_VAR (rej_whole_of_mostly_reject_word_fract, 0.85,
00146 "if >this fract");
00147 EXTERN
00148 INT_VAR (rej_mostly_reject_mode, 1,
00149 "0-never, 1-afterNN, 2-after new xht");
00150 EXTERN
00151 double_VAR (tessed_fullstop_aspect_ratio, 1.2,
00152 "if >this fract then reject");
00153
00154 EXTERN
00155 INT_VAR (net_image_width, 40, "NN input image width");
00156 EXTERN
00157 INT_VAR (net_image_height, 36, "NN input image height");
00158 EXTERN
00159 INT_VAR (net_image_x_height, 22, "NN input image x_height");
00160 EXTERN
00161 INT_VAR (tessedit_image_border, 2, "Rej blbs near image edge limit");
00162
00163 EXTERN
00164 INT_VAR (net_bl_nodes, 20, "Number of baseline nodes");
00165
00166 EXTERN
00167 double_VAR (nn_reject_threshold, 0.5, "NN min accept score");
00168 EXTERN
00169 double_VAR (nn_reject_head_and_shoulders, 0.6, "top scores sep factor");
00170
00171
00172 EXTERN
00173 STRING_VAR (ok_single_ch_non_alphanum_wds, "-?\075",
00174 "Allow NN to unrej");
00175 EXTERN
00176 STRING_VAR (ok_repeated_ch_non_alphanum_wds, "-?*\075",
00177 "Allow NN to unrej");
00178 EXTERN
00179 STRING_VAR (conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
00180 EXTERN
00181 STRING_VAR (conflict_set_S_s, "Ss$", "Ss conflict set");
00182 EXTERN
00183 STRING_VAR (conflict_set_hyphen, "-_~", "hyphen conflict set");
00184 EXTERN
00185 STRING_VAR (dubious_chars_left_of_reject, "!'+`()-./\\<>;:^_,~\"",
00186 "Unreliable chars");
00187 EXTERN
00188 STRING_VAR (dubious_chars_right_of_reject, "!'+`()-./\\<>;:^_,~\"",
00189 "Unreliable chars");
00190
00191 EXTERN
00192 INT_VAR (min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
00213 void set_done(
00214 WERD_RES *word,
00215 INT16 pass) {
00216
00217
00218
00219 if (tessedit_ok_mode == 0) {
00220
00221 word->done = word->tess_accepted;
00222 }
00223
00224
00225
00226 else if (tessedit_ok_mode == 1) {
00227 word->done = word->tess_accepted &&
00228 (strchr (word->best_choice->string ().string (), ' ') == NULL);
00229
00230 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00231 word->done = FALSE;
00232 }
00233
00234
00235
00236 else if (tessedit_ok_mode == 2) {
00237 word->done = word->tess_accepted &&
00238 (strchr (word->best_choice->string ().string (), ' ') == NULL);
00239
00240 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00241 word->done = FALSE;
00242
00243 if (word->done &&
00244 (pass == 1) &&
00245 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00246 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00247 (word->best_choice->permuter () != USER_DAWG_PERM) &&
00248 (word->best_choice->permuter () != NUMBER_PERM)) {
00249 #ifndef SECURE_NAMES
00250 if (tessedit_rejection_debug)
00251 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00252 word->best_choice->string ().string ());
00253 #endif
00254 word->done = FALSE;
00255 }
00256 }
00257
00258
00259
00260 else if (tessedit_ok_mode == 3) {
00261 word->done = word->tess_accepted &&
00262 (strchr (word->best_choice->string ().string (), ' ') == NULL);
00263
00264 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00265 word->done = FALSE;
00266
00267 if (word->done &&
00268 (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00269 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00270 (word->best_choice->permuter () != USER_DAWG_PERM) &&
00271 (word->best_choice->permuter () != NUMBER_PERM)) {
00272 #ifndef SECURE_NAMES
00273 if (tessedit_rejection_debug)
00274 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00275 word->best_choice->string ().string ());
00276 #endif
00277 word->done = FALSE;
00278 }
00279 }
00280
00281
00282
00283 else if (tessedit_ok_mode == 4) {
00284 word->done = word->tess_accepted &&
00285 (strchr (word->best_choice->string ().string (), ' ') == NULL);
00286
00287 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00288 word->done = FALSE;
00289
00290 if (word->done &&
00291 (pass == 1) &&
00292 ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00293 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00294 (word->best_choice->permuter () != USER_DAWG_PERM) &&
00295 (word->best_choice->permuter () != NUMBER_PERM)) ||
00296 (test_ambig_word (word))) {
00297 #ifndef SECURE_NAMES
00298 if (tessedit_rejection_debug)
00299 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00300 word->best_choice->string ().string ());
00301 #endif
00302 word->done = FALSE;
00303 }
00304 }
00305
00306
00307
00308 else if (tessedit_ok_mode == 5) {
00309 word->done = word->tess_accepted &&
00310 (strchr (word->best_choice->string ().string (), ' ') == NULL);
00311
00312 if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
00313 word->done = FALSE;
00314
00315 if (word->done &&
00316 ((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
00317 (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
00318 (word->best_choice->permuter () != USER_DAWG_PERM) &&
00319 (word->best_choice->permuter () != NUMBER_PERM)) ||
00320 (test_ambig_word (word))) {
00321 #ifndef SECURE_NAMES
00322 if (tessedit_rejection_debug)
00323 tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
00324 word->best_choice->string ().string ());
00325 #endif
00326 word->done = FALSE;
00327 }
00328 }
00329
00330 else {
00331 tprintf ("BAD tessedit_ok_mode\n");
00332 err_exit();
00333 }
00334 }
00335
00336
00357 void make_reject_map(
00358 WERD_RES *word,
00359 BLOB_CHOICE_LIST_CLIST *blob_choices,
00360 ROW *row,
00361 INT16 pass
00362 ) {
00363 INT16 i;
00364
00365 flip_0O(word);
00366 check_debug_pt (word, -1);
00367 set_done(word, pass);
00368 word->reject_map.initialise (word->best_choice->string ().length ());
00369 reject_blanks(word);
00370
00371
00372
00373 if (tessedit_reject_mode == 0) {
00374 if (!word->done)
00375 reject_poor_matches(word, blob_choices);
00376 }
00377
00378
00379
00380
00381
00382 else if (tessedit_reject_mode == 5) {
00383 if (bln_x_height / word->denorm.scale () <= min_sane_x_ht_pixels)
00384 word->reject_map.rej_word_small_xht ();
00385 else {
00386 one_ell_conflict(word, TRUE);
00387
00388
00389
00390
00391
00392
00393 if (rej_use_tess_accepted && !word->tess_accepted)
00394 word->reject_map.rej_word_not_tess_accepted ();
00395
00396 if (rej_use_tess_blanks &&
00397 (strchr (word->best_choice->string ().string (), ' ') != NULL))
00398 word->reject_map.rej_word_contains_blanks ();
00399
00400 if (rej_use_good_perm) {
00401 if (((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00402 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00403 (word->best_choice->permuter () == USER_DAWG_PERM)) &&
00404 (!rej_use_sensible_wd ||
00405 (acceptable_word_string
00406 (word->best_choice->string ().string ()) != AC_UNACCEPTABLE))) {
00407
00408 }
00409 else if (word->best_choice->permuter () == NUMBER_PERM) {
00410 if (rej_alphas_in_number_perm) {
00411 for (i = 0; word->best_choice->string ()[i] != '\0';
00412 i++) {
00413 if (word->reject_map[i].accepted () &&
00414 isalpha (word->best_choice->string ()[i]))
00415 word->reject_map[i].setrej_bad_permuter ();
00416
00417 }
00418 }
00419 }
00420 else {
00421 word->reject_map.rej_word_bad_permuter ();
00422 }
00423 }
00424
00425
00426
00427 }
00428 }
00429 else {
00430 tprintf ("BAD tessedit_reject_mode\n");
00431 err_exit();
00432 }
00433
00434 if (tessedit_image_border > -1)
00435 reject_edge_blobs(word);
00436
00437 check_debug_pt (word, 10);
00438 if (tessedit_rejection_debug) {
00439 tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());
00440 tprintf ("Certainty: %f Rating: %f\n",
00441 word->best_choice->certainty (), word->best_choice->rating ());
00442 tprintf ("Dict word: %d\n",
00443 dict_word (word->best_choice->string ().string ()));
00444 }
00445
00446
00447
00448 if (tessedit_use_nn && (pass == 2) &&
00449 word->reject_map.recoverable_rejects ())
00450 nn_recover_rejects(word, row);
00451 flip_hyphens(word);
00452 check_debug_pt (word, 20);
00453 }
00454
00455
00463 void reject_blanks(WERD_RES *word) {
00464 INT16 i;
00465
00466 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00467 if (word->best_choice->string ()[i] == ' ')
00468 word->reject_map[i].setrej_tess_failure ();
00469 }
00470 }
00471
00472
00481 void reject_I_1_L(WERD_RES *word) {
00482 INT16 i;
00483
00484 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
00485 if (STRING (conflict_set_I_l_1).
00486 contains (word->best_choice->string ()[i])) {
00487 word->reject_map[i].setrej_1Il_conflict ();
00488 }
00489 }
00490 }
00491
00492
00501 void reject_poor_matches(
00502 WERD_RES *word,
00503 BLOB_CHOICE_LIST_CLIST *blob_choices) {
00504 float threshold;
00505 INT16 i = 0;
00506
00507 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00508 BLOB_CHOICE_IT choice_it;
00509
00510 #ifndef SECURE_NAMES
00511 if (strlen (word->best_choice->string ().string ()) != list_it.length ()) {
00512 tprintf
00513 ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
00514 word->best_choice->string ().string (),
00515 strlen (word->best_choice->string ().string ()), list_it.length (),
00516 word->outword->blob_list ()->length ());
00517 }
00518 #endif
00519 ASSERT_HOST (strlen (word->best_choice->string ().string ()) ==
00520 list_it.length ());
00521 ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());
00522 threshold = compute_reject_threshold (blob_choices);
00523
00524 for (list_it.mark_cycle_pt ();
00525 !list_it.cycled_list (); list_it.forward (), i++) {
00526
00527
00528
00529
00530 choice_it.set_to_list (list_it.data ());
00531 if ((word->best_choice->string ()[i] == ' ') ||
00532 (choice_it.length () == 0))
00533 word->reject_map[i].setrej_tess_failure ();
00534 else if (choice_it.data ()->certainty () < threshold)
00535 word->reject_map[i].setrej_poor_match ();
00536 }
00537 }
00538
00539
00549 float compute_reject_threshold(
00550 BLOB_CHOICE_LIST_CLIST *blob_choices) {
00551 INT16 index;
00552 INT16 blob_count;
00553 INT16 ok_blob_count = 0;
00554 float *ratings;
00555 float threshold;
00556 float bestgap;
00557 float gapstart;
00558
00559 BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
00560 BLOB_CHOICE_IT choice_it;
00561
00562 blob_count = blob_choices->length ();
00563 ratings = (float *) alloc_mem (blob_count * sizeof (float));
00564 for (list_it.mark_cycle_pt (), index = 0;
00565 !list_it.cycled_list (); list_it.forward (), index++) {
00566 choice_it.set_to_list (list_it.data ());
00567 if (choice_it.length () > 0) {
00568 ratings[ok_blob_count] = choice_it.data ()->certainty ();
00569
00570
00571
00572
00573 ok_blob_count++;
00574 }
00575 }
00576 ASSERT_HOST (index == blob_count);
00577 qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
00578
00579 bestgap = 0;
00580 gapstart = ratings[0] - 1;
00581 if (ok_blob_count >= 3) {
00582 for (index = 0; index < ok_blob_count - 1; index++) {
00583 if (ratings[index + 1] - ratings[index] > bestgap) {
00584 bestgap = ratings[index + 1] - ratings[index];
00585
00586 gapstart = ratings[index];
00587 }
00588 }
00589 }
00590 threshold = gapstart + bestgap / 2;
00591
00592
00593
00594 free_mem(ratings);
00595 return threshold;
00596 }
00597
00598
/*************************************************************************
 * sort_floats()
 *
 * qsort() comparison function giving ascending order of floats.
 *
 * arg1, arg2  pointers to the two floats to compare
 * returns     1 if *arg1 > *arg2, -1 if *arg1 < *arg2, otherwise 0
 *************************************************************************/
int sort_floats(
                const void *arg1,
                const void *arg2) {
  /* Read through const pointers; the arguments are never modified. */
  const float lhs = *(const float *) arg1;
  const float rhs = *(const float *) arg2;

  if (lhs > rhs)
    return 1;
  if (lhs < rhs)
    return -1;
  return 0;
}
00619
00620
00631 void reject_edge_blobs(WERD_RES *word) {
00632 BOX word_box = word->word->bounding_box ();
00633 BOX blob_box;
00634 PBLOB_IT blob_it = word->outword->blob_list ();
00635
00636 int blobindex = 0;
00637 float centre;
00638
00639 if ((word_box.left () < tessedit_image_border) ||
00640 (word_box.bottom () < tessedit_image_border) ||
00641 (word_box.right () + tessedit_image_border >
00642 page_image.get_xsize () - 1) ||
00643 (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {
00644 ASSERT_HOST (word->reject_map.length () == blob_it.length ());
00645 for (blobindex = 0, blob_it.mark_cycle_pt ();
00646 !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
00647 blob_box = blob_it.data ()->bounding_box ();
00648 centre = (blob_box.left () + blob_box.right ()) / 2.0;
00649 if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||
00650 (word->denorm.y (blob_box.bottom (), centre) <
00651 tessedit_image_border) ||
00652 (word->denorm.x (blob_box.right ()) + tessedit_image_border >
00653 page_image.get_xsize () - 1) ||
00654 (word->denorm.y (blob_box.top (), centre)
00655 + tessedit_image_border > page_image.get_ysize () - 1)) {
00656 word->reject_map[blobindex].setrej_edge_char ();
00657 }
00658 }
00659 }
00660 }
00661
00662
00684 BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
00685 const char *word;
00686 INT16 word_len;
00687 INT16 first_alphanum_idx;
00688 INT16 i;
00689 BOOL8 non_conflict_set_char;
00690 BOOL8 conflict = FALSE;
00691 BOOL8 allow_1s;
00692 ACCEPTABLE_WERD_TYPE word_type;
00693 BOOL8 dict_perm_type;
00694 BOOL8 dict_word_ok;
00695 int dict_word_type;
00696
00697 word = word_res->best_choice->string ().string ();
00698 word_len = strlen (word);
00699
00700
00701
00702
00703 if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
00704 return FALSE;
00705
00706
00707
00708
00709
00710
00711 for (i = 0, non_conflict_set_char = FALSE;
00712 (i < word_len) && !non_conflict_set_char; i++)
00713 non_conflict_set_char = isalnum (word[i]) &&
00714 !STRING (conflict_set_I_l_1).contains (word[i]);
00715 if (!non_conflict_set_char) {
00716 if (update_map)
00717 reject_I_1_L(word_res);
00718 return TRUE;
00719 }
00720
00721
00722
00723
00724
00725
00726
00727 dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00728 (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
00729 (rej_trust_doc_dawg &&
00730 (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
00731 (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
00732 dict_word_type = dict_word (word);
00733 dict_word_ok = (dict_word_type > 0) &&
00734 (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
00735
00736 if ((rej_1Il_use_dict_word && dict_word_ok) ||
00737 (rej_1Il_trust_permuter_type && dict_perm_type) ||
00738 (dict_perm_type && dict_word_ok)) {
00739 first_alphanum_idx = first_alphanum_pos (word);
00740 if (word[first_alphanum_idx] == 'I') {
00741 word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00742 if (safe_dict_word (word) > 0) {
00743 word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00744 if (update_map)
00745 word_res->reject_map[first_alphanum_idx].
00746 setrej_1Il_conflict();
00747 return TRUE;
00748 }
00749 else {
00750 word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00751 return FALSE;
00752 }
00753 }
00754
00755 if (word[first_alphanum_idx] == 'l') {
00756 word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00757 if (safe_dict_word (word) > 0) {
00758 word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00759 if (update_map)
00760 word_res->reject_map[first_alphanum_idx].
00761 setrej_1Il_conflict();
00762 return TRUE;
00763 }
00764 else {
00765 word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00766 return FALSE;
00767 }
00768 }
00769 return FALSE;
00770 }
00771
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781
00782
00783
00784
00785
00786 first_alphanum_idx = first_alphanum_pos (word);
00787 if (word[first_alphanum_idx] == 'l') {
00788 word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00789 if (safe_dict_word (word) > 0)
00790 return FALSE;
00791 else
00792 word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00793 }
00794 else if (word[first_alphanum_idx] == 'I') {
00795 word_res->best_choice->string ()[first_alphanum_idx] = 'l';
00796 if (safe_dict_word (word) > 0)
00797 return FALSE;
00798 else
00799 word_res->best_choice->string ()[first_alphanum_idx] = 'I';
00800 }
00801
00802
00803
00804
00805
00806
00807 if (word_contains_non_1_digit (word)) {
00808 allow_1s = (alpha_count (word) == 0) ||
00809 (word_res->best_choice->permuter () == NUMBER_PERM);
00810
00811 conflict = FALSE;
00812 for (i = 0; i < word_len; i++) {
00813 if ((!allow_1s || (word[i] != '1')) &&
00814 STRING (conflict_set_I_l_1).contains (word[i])) {
00815 if (update_map)
00816 word_res->reject_map[i].setrej_1Il_conflict ();
00817 conflict = TRUE;
00818 }
00819 }
00820 return conflict;
00821 }
00822
00823
00824
00825
00826 word_type = acceptable_word_string (word);
00827 if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
00828 first_alphanum_idx = first_alphanum_pos (word);
00829 if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_idx])) {
00830 if (update_map)
00831 word_res->reject_map[first_alphanum_idx].setrej_1Il_conflict ();
00832 return TRUE;
00833 }
00834 else
00835 return FALSE;
00836 }
00837 else if (word_type == AC_UPPER_CASE) {
00838 return FALSE;
00839 }
00840 else {
00841 if (update_map)
00842 reject_I_1_L(word_res);
00843 return TRUE;
00844 }
00845 }
00846
00847
00851 INT16 first_alphanum_pos(const char *word) {
00852 INT16 i;
00853
00854 for (i = 0; word[i] != '\0'; i++) {
00855 if (isalnum (word[i]))
00856 return i;
00857 }
00858 return -1;
00859 }
00860
00861
00865 INT16 alpha_count(const char *word) {
00866 INT16 i;
00867 INT16 count = 0;
00868
00869 for (i = 0; word[i] != '\0'; i++) {
00870 if (isalpha (word[i]))
00871 count++;
00872 }
00873 return count;
00874 }
00875
00876
00880 BOOL8 word_contains_non_1_digit(const char *word) {
00881 INT16 i;
00882
00883 for (i = 0; word[i] != '\0'; i++) {
00884 if (isdigit (word[i]) && word[i] != '1')
00885 return TRUE;
00886 }
00887 return FALSE;
00888 }
00889
00890
00896 BOOL8 test_ambig_word(
00897 WERD_RES *word) {
00898 BOOL8 ambig = FALSE;
00899
00900 if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
00901 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
00902 (word->best_choice->permuter () == USER_DAWG_PERM)) {
00903 ambig = !NoDangerousAmbig(word->best_choice->string().string(), NULL);
00904 }
00905 return ambig;
00906 }
00907
00908
00919 BOOL8 ambig_word(
00920 const char *start_word,
00921 char *temp_word,
00922 INT16 test_char_pos
00923 ) {
00924 const char *ambigs;
00925
00926 if (*(temp_word + test_char_pos) == '\0') {
00927 if (safe_dict_word (temp_word)) {
00928 if (strcmp (start_word, temp_word) == 0)
00929 return FALSE;
00930 else
00931 return TRUE;
00932 }
00933 else
00934 return FALSE;
00935 }
00936 else {
00937 ambigs = char_ambiguities (*(temp_word + test_char_pos));
00938 if (ambigs == NULL)
00939 return ambig_word (start_word, temp_word, test_char_pos + 1);
00940 else {
00941 while (*ambigs != '\0') {
00942 *(temp_word + test_char_pos) = *ambigs++;
00943
00944 if (ambig_word (start_word, temp_word, test_char_pos + 1))
00945 return TRUE;
00946 }
00947 return FALSE;
00948 }
00949 }
00950 }
00951
00952
00971 const char *char_ambiguities(char c) {
00972 static STRING_CLIST conflict_sets;
00973 static BOOL8 read_conflict_sets = FALSE;
00974 STRING_C_IT cs_it(&conflict_sets);
00975 const char *cs;
00976 STRING cs_file_name;
00977 FILE *cs_file;
00978 char buff[1024];
00979
00980 if (!read_conflict_sets) {
00981 cs_file_name = datadir + "confsets";
00982 if (!(cs_file = fopen (cs_file_name.string (), "r"))) {
00983 CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",
00984 cs_file_name.string (), errno);
00985 }
00986 while (fscanf (cs_file, "%s", buff) == 1) {
00987 cs_it.add_after_then_move (new STRING (buff));
00988 }
00989 read_conflict_sets = TRUE;
00990 cs_it.move_to_first ();
00991 if (tessedit_rejection_debug) {
00992 for (cs_it.mark_cycle_pt ();
00993 !cs_it.cycled_list (); cs_it.forward ()) {
00994 tprintf ("\"%s\"\n", cs_it.data ()->string ());
00995 }
00996 }
00997 }
00998
00999 cs_it.move_to_first ();
01000 for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {
01001 cs = cs_it.data ()->string ();
01002 if (strchr (cs, c) != NULL)
01003 return cs;
01004 }
01005 return NULL;
01006 }
01007
01008 #ifndef EMBEDDED
01009
01018 void test_ambigs(const char *word) {
01019 char orig_word[80];
01020 char temp_word[80];
01021
01022 if (strlen (word) > 80)
01023 tprintf ("Ridiculously long word \"%s\"\n", word);
01024 else {
01025 strcpy(orig_word, word);
01026 while (strlen (orig_word) > 0) {
01027 strcpy(temp_word, orig_word);
01028
01029 #ifndef SECURE_NAMES
01030 if (ambig_word (orig_word, temp_word, 0))
01031 tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);
01032 else
01033 tprintf ("NO Ambiguities for \"%s\"\n", orig_word);
01034 tprintf ("Next Word > ");
01035 #endif
01036 scanf ("%s", orig_word);
01037 }
01038 }
01039 }
01040 #endif
01041
01042
01054 void nn_recover_rejects(WERD_RES *word, ROW *row) {
01055 REJMAP old_map = word->reject_map;
01056
01057
01058
01059
01060
01061
01062
01063
01064 set_global_subsubloc_code(SUBSUBLOC_NN);
01065 nn_match_word(word, row);
01066
01067 if (no_unrej_1Il)
01068 dont_allow_1Il(word);
01069 if (no_unrej_dubious_chars)
01070 dont_allow_dubious_chars(word);
01071
01072 if (rej_mostly_reject_mode == 1)
01073 reject_mostly_rejects(word);
01074
01075
01076
01077
01078
01079
01080 if (no_unrej_no_alphanum_wds &&
01081 (count_alphanums (word) < 1) &&
01082 !((word->best_choice->string ().length () == 1) &&
01083 STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->
01084 string ()[0]))
01085 && !repeated_nonalphanum_wd (word, row))
01086
01087 word->reject_map.rej_word_no_alphanums ();
01088
01089 #ifndef SECURE_NAMES
01090
01091 if (nn_debug) {
01092 tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());
01093 old_map.print (stdout);
01094 tprintf ("->");
01095 word->reject_map.print (stdout);
01096 tprintf ("\n");
01097 }
01098 #endif
01099 set_global_subsubloc_code(SUBSUBLOC_OTHER);
01100 }
01101
01102
01121 void nn_match_word(
01122 WERD_RES *word,
01123 ROW *row) {
01124 PIXROW_LIST *pixrow_list;
01125 PIXROW_IT pixrow_it;
01126 IMAGELINE *imlines;
01127 BOX pix_box;
01128 #ifndef GRAPHICS_DISABLED
01129 WINDOW win = NULL;
01130 #endif
01131 IMAGE clip_image;
01132 IMAGE scaled_image;
01133 float baseline_pos;
01134 INT16 net_image_size;
01135 INT16 clip_image_size;
01136 WERD copy_outword;
01137 INT16 i;
01138
01139 const char *word_string;
01140 BOOL8 word_in_dict;
01141 BOOL8 checked_dict_word;
01142 BOOL8 sensible_word;
01143 BOOL8 centre;
01144 BOOL8 good_quality_word;
01145 INT16 char_quality;
01146 INT16 accepted_char_quality;
01147
01148 INT16 conf_level;
01149 INT16 first_alphanum_idx;
01150
01151 word_string = word->best_choice->string ().string ();
01152 first_alphanum_idx = first_alphanum_pos (word_string);
01153 word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01154 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01155 (word->best_choice->permuter () == USER_DAWG_PERM));
01156 checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);
01157 sensible_word = acceptable_word_string (word_string) != AC_UNACCEPTABLE;
01158
01159 word_char_quality(word, row, &char_quality, &accepted_char_quality);
01160 good_quality_word = word->best_choice->string ().length () == char_quality;
01161
01162 #ifndef SECURE_NAMES
01163 if (nn_reject_debug) {
01164 tprintf ("Dict: %c Checked Dict: %c Sensible: %c Quality: %c\n",
01165 word_in_dict ? 'T' : 'F',
01166 checked_dict_word ? 'T' : 'F',
01167 sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');
01168 }
01169 #endif
01170
01171 if (word->best_choice->string ().length () !=
01172 word->outword->blob_list ()->length ()) {
01173 #ifndef SECURE_NAMES
01174 tprintf ("nn_match_word ASSERT FAIL String:\"%s\"; #Blobs=%d\n",
01175 word->best_choice->string ().string (),
01176 word->outword->blob_list ()->length ());
01177 #endif
01178 err_exit();
01179 }
01180
01181 copy_outword = *(word->outword);
01182 copy_outword.baseline_denormalise (&word->denorm);
01183
01184
01185
01186
01187
01188
01189 net_image_size = (net_image_width > net_image_height) ?
01190 net_image_width : net_image_height;
01191 clip_image_size = (INT16) floor (0.5 +
01192 net_image_size * word->x_height /
01193 net_image_x_height);
01194 if ((clip_image_size <= 1) || (net_image_size <= 1)) {
01195 return;
01196 }
01197
01198
01199
01200
01201 char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box);
01202 #ifndef GRAPHICS_DISABLED
01203 if (show_char_clipping) {
01204 win = display_clip_image (©_outword, page_image,
01205 pixrow_list, pix_box);
01206 }
01207 #endif
01208 pixrow_it.set_to_list (pixrow_list);
01209 pixrow_it.move_to_first ();
01210 for (pixrow_it.mark_cycle_pt (), i = 0;
01211 !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {
01212 if (pixrow_it.data ()->
01213 bad_box (page_image.get_xsize (), page_image.get_ysize ()))
01214 continue;
01215 clip_image.create (clip_image_size, clip_image_size, 1);
01216
01217 if (!copy_outword.flag (W_INVERSE))
01218 invert_image(&clip_image);
01219 pixrow_it.data ()->char_clip_image (imlines, pix_box, row,
01220 clip_image, baseline_pos);
01221 if (copy_outword.flag (W_INVERSE))
01222 invert_image(&clip_image);
01223 scaled_image.create (net_image_size, net_image_size, 1);
01224 scale_image(clip_image, scaled_image);
01225 baseline_pos *= net_image_size / clip_image_size;
01226
01227 centre = !pixrow_it.at_first () && !pixrow_it.at_last ();
01228
01229 conf_level = nn_match_char (scaled_image, baseline_pos,
01230 word_in_dict, checked_dict_word,
01231 sensible_word, centre,
01232 good_quality_word, word_string[i]);
01233 if (word->reject_map[i].recoverable ()) {
01234 if ((i == first_alphanum_idx) &&
01235 ((word_string[i] == 'I') || (word_string[i] == 'i'))) {
01236 if (conf_level >= nn_conf_initial_i_level)
01237 word->reject_map[i].setrej_nn_accept ();
01238 }
01239 else if (conf_level > 0)
01240 word->reject_map[i].setrej_nn_accept ();
01241 }
01242 #ifndef GRAPHICS_DISABLED
01243 if (show_char_clipping)
01244 display_images(clip_image, scaled_image);
01245 #endif
01246 clip_image.destroy ();
01247 scaled_image.destroy ();
01248 }
01249
01250 delete[]imlines;
01251 delete pixrow_list;
01252
01253 #ifndef GRAPHICS_DISABLED
01254 if (show_char_clipping) {
01255 destroy_window(win);
01256 }
01257 #endif
01258 }
01259
01260
01276 INT16 nn_match_char(
01277 IMAGE &scaled_image,
01278 float baseline_pos,
01279 BOOL8 dict_word,
01280 BOOL8 checked_dict_word,
01281 BOOL8 sensible_word,
01282 BOOL8 centre,
01283 BOOL8 good_quality_word,
01284 char tess_ch
01285 ) {
01286 INT16 conf_level;
01287 INT32 row;
01288 INT32 col;
01289 INT32 y_size = scaled_image.get_ysize ();
01290 INT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;
01291 INT32 end_y = start_y - net_image_height + 1;
01292 IMAGELINE imline;
01293 float *input_vector;
01294 float *input_vec_ptr;
01295 char top;
01296 float top_score;
01297 char next;
01298 float next_score;
01299 INT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;
01300 INT16 j;
01301
01302 input_vector = (float *) alloc_mem (input_nodes * sizeof (float));
01303 input_vec_ptr = input_vector;
01304
01305 invert_image(&scaled_image);
01306 for (row = start_y; row >= end_y; row--) {
01307 scaled_image.fast_get_line (0, row, net_image_width, &imline);
01308 for (col = 0; col < net_image_width; col++)
01309 *input_vec_ptr++ = imline.pixels[col];
01310 }
01311
01312
01313
01314
01315 baseline_pos -= (y_size - net_image_height) / 2.0;
01316
01317
01318
01319
01320
01321
01322
01323 if (baseline_pos < 0)
01324 baseline_pos = 0;
01325 else if (baseline_pos >= net_image_height)
01326 baseline_pos = net_image_height + 1;
01327 else
01328 baseline_pos = baseline_pos + 1;
01329 baseline_pos = baseline_pos / (net_image_height + 1);
01330
01331 if (net_bl_nodes > 0) {
01332 baseline_pos *= 1.7;
01333 if (net_bl_nodes > 1) {
01334
01335 for (j = 0; j < net_bl_nodes; j++) {
01336 if (baseline_pos > ((float) j / net_bl_nodes))
01337 *input_vec_ptr++ = 1.0;
01338 else
01339 *input_vec_ptr++ = 0.0;
01340 }
01341 }
01342 else {
01343
01344 *input_vec_ptr++ = baseline_pos;
01345 }
01346 }
01347
01348 callnet(input_vector, &top, &top_score, &next, &next_score);
01349 conf_level = evaluate_net_match (top, top_score, next, next_score,
01350 tess_ch, dict_word, checked_dict_word,
01351 sensible_word, centre, good_quality_word);
01352 #ifndef SECURE_NAMES
01353 if (nn_reject_debug) {
01354 tprintf ("top:\"%c\" %4.2f next:\"%c\" %4.2f TESS:\"%c\" Conf: %d\n",
01355 top, top_score, next, next_score, tess_ch, conf_level);
01356 }
01357 #endif
01358 free_mem(input_vector);
01359 return conf_level;
01360 }
01361
01386 INT16 evaluate_net_match(char top,
01387 float top_score,
01388 char next,
01389 float next_score,
01390 char tess_ch,
01391 BOOL8 dict_word,
01392 BOOL8 checked_dict_word,
01393 BOOL8 sensible_word,
01394 BOOL8 centre,
01395 BOOL8 good_quality_word) {
01396 INT16 accept_level;
01397 BOOL8 good_top_choice;
01398 BOOL8 excellent_top_choice;
01399 BOOL8 confusion_match = FALSE;
01400 BOOL8 dodgy_char = !isalnum (tess_ch);
01401
01402 good_top_choice = (top_score > nn_reject_threshold) &&
01403 (nn_reject_head_and_shoulders * top_score > next_score);
01404
01405 excellent_top_choice = good_top_choice &&
01406 (top_score > nn_dodgy_char_threshold);
01407
01408 if (top == tess_ch) {
01409 if (excellent_top_choice)
01410 accept_level = 0;
01411 else if (good_top_choice)
01412 accept_level = 1;
01413 else
01414 accept_level = 2;
01415 }
01416 else if ((nn_conf_1Il &&
01417 STRING (conflict_set_I_l_1).contains (tess_ch) &&
01418 STRING (conflict_set_I_l_1).contains (top)) ||
01419 (nn_conf_hyphen &&
01420 STRING (conflict_set_hyphen).contains (tess_ch) &&
01421 STRING (conflict_set_hyphen).contains (top)) ||
01422 (nn_conf_Ss &&
01423 STRING (conflict_set_S_s).contains (tess_ch) &&
01424 STRING (conflict_set_S_s).contains (top))) {
01425 confusion_match = TRUE;
01426 if (good_top_choice)
01427 accept_level = 1;
01428 else
01429 accept_level = 2;
01430 }
01431 else if ((nn_conf_1Il &&
01432 STRING (conflict_set_I_l_1).contains (tess_ch) &&
01433 STRING (conflict_set_I_l_1).contains (next)) ||
01434 (nn_conf_hyphen &&
01435 STRING (conflict_set_hyphen).contains (tess_ch) &&
01436 STRING (conflict_set_hyphen).contains (next)) ||
01437 (nn_conf_Ss &&
01438 STRING (conflict_set_S_s).contains (tess_ch) &&
01439 STRING (conflict_set_S_s).contains (next))) {
01440 confusion_match = TRUE;
01441 if (!good_top_choice)
01442 accept_level = 3;
01443 else
01444 accept_level = 4;
01445 }
01446 else if (next == tess_ch) {
01447 if (!good_top_choice)
01448 accept_level = 3;
01449 else
01450 accept_level = 4;
01451 }
01452 else
01453 accept_level = 5;
01454
01455
01456
01457
01458
01459
01460 if ((accept_level == 0) && !confusion_match)
01461 return 3;
01462
01463 if ((accept_level <= 1) &&
01464 (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)
01465 return 3;
01466
01467 if ((accept_level == 2) &&
01468 !confusion_match && !dodgy_char &&
01469 good_quality_word &&
01470 dict_word &&
01471 (checked_dict_word || !nn_double_check_dict) && sensible_word)
01472 return 2;
01473
01474 if (confusion_match &&
01475 (accept_level <= nn_conf_accept_level) &&
01476 (good_quality_word ||
01477 (!nn_conf_test_good_qual &&
01478 !STRING (conflict_set_I_l_1).contains (tess_ch))) &&
01479 (dict_word || !nn_conf_test_dict) &&
01480 (checked_dict_word || !nn_conf_double_check_dict) &&
01481 (sensible_word || !nn_conf_test_sensible))
01482 return 1;
01483
01484 if (!confusion_match &&
01485 nn_lax &&
01486 (accept_level == 3) &&
01487 (good_quality_word || !nn_conf_test_good_qual) &&
01488 (dict_word || !nn_conf_test_dict) &&
01489 (sensible_word || !nn_conf_test_sensible))
01490 return 1;
01491 else
01492 return 0;
01493 }
01494
01495
01505 void dont_allow_dubious_chars(WERD_RES *word) {
01506 int i = 0;
01507 int rej_pos;
01508 int word_len = word->reject_map.length ();
01509
01510 while (i < word_len) {
01511
01512
01513 while ((i < word_len) && (word->reject_map[i].accepted ()))
01514 i++;
01515
01516 if (i < word_len) {
01517 rej_pos = i;
01518
01519
01520 i--;
01521 while ((i >= 0) &&
01522 STRING (dubious_chars_left_of_reject).contains (word->
01523 best_choice->
01524 string ()
01525 [i])) {
01526 word->reject_map[i--].setrej_dubious ();
01527 }
01528
01529
01530
01531 for (i = rej_pos;
01532 (i < word_len) && (word->reject_map[i].rejected ()); i++);
01533
01534
01535
01536 while ((i < word_len) &&
01537 STRING (dubious_chars_right_of_reject).contains (word->
01538 best_choice->
01539 string ()
01540 [i])) {
01541 word->reject_map[i++].setrej_dubious ();
01542 }
01543 }
01544 }
01545 }
01546
01547
01555 void dont_allow_1Il(WERD_RES *word) {
01556 int i = 0;
01557 int word_len = word->reject_map.length ();
01558 const char *s = word->best_choice->string ().string ();
01559 BOOL8 accepted_1Il = FALSE;
01560
01561 for (i = 0; i < word_len; i++) {
01562 if (word->reject_map[i].accepted ()) {
01563 if (STRING (conflict_set_I_l_1).contains (s[i]))
01564 accepted_1Il = TRUE;
01565 else {
01566 if (isalnum (s[i]))
01567 return;
01568 }
01569 }
01570 }
01571 if (!accepted_1Il)
01572 return;
01573
01574 for (i = 0; i < word_len; i++) {
01575 if (STRING (conflict_set_I_l_1).contains (s[i]) &&
01576 word->reject_map[i].accepted ())
01577 word->reject_map[i].setrej_postNN_1Il ();
01578 }
01579 }
01580
01581
01585 INT16 count_alphanums(
01586 WERD_RES *word) {
01587 int count = 0;
01588 int i;
01589
01590 for (i = 0; i < word->reject_map.length (); i++) {
01591 if ((word->reject_map[i].accepted ()) &&
01592 (isalnum (word->best_choice->string ()[i])))
01593 count++;
01594 }
01595 return count;
01596 }
01597
01606 void reject_mostly_rejects(
01607 WERD_RES *word) {
01608
01609 if ((float) word->reject_map.reject_count () / word->reject_map.length () >=
01610 rej_whole_of_mostly_reject_word_fract)
01611 word->reject_map.rej_word_mostly_rej ();
01612 }
01613
01614
01618 BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
01619 INT16 char_quality;
01620 INT16 accepted_char_quality;
01621
01622 if (word->best_choice->string ().length () <= 1)
01623 return FALSE;
01624
01625 if (!STRING (ok_repeated_ch_non_alphanum_wds).
01626 contains (word->best_choice->string ()[0]))
01627 return FALSE;
01628
01629 if (!repeated_ch_string (word->best_choice->string ().string ()))
01630 return FALSE;
01631
01632 word_char_quality(word, row, &char_quality, &accepted_char_quality);
01633
01634 if ((word->best_choice->string ().length () == char_quality) &&
01635 (char_quality == accepted_char_quality))
01636 return TRUE;
01637 else
01638 return FALSE;
01639 }
01640
01641
01645 BOOL8 repeated_ch_string(const char *rep_ch_str) {
01646 char c;
01647
01648 if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {
01649 return FALSE;
01650 }
01651
01652 c = *rep_ch_str;
01653 rep_ch_str++;
01654 while (*rep_ch_str == c) {
01655 rep_ch_str++;
01656 }
01657 if (*rep_ch_str == '\0')
01658 return TRUE;
01659 return FALSE;
01660 }
01661
01662
01666 INT16 safe_dict_word(const char *s) {
01667 int dict_word_type;
01668
01669 dict_word_type = dict_word (s);
01670 if (dict_word_type == DOC_DAWG_PERM)
01671 return 0;
01672 else
01673 return dict_word_type;
01674 }
01675
01676
01686 void flip_hyphens(WERD_RES *word) {
01687 char *str = (char *) word->best_choice->string ().string ();
01688 int i = 0;
01689 PBLOB_IT outword_it;
01690 int prev_right = -9999;
01691 int next_left;
01692 BOX out_box;
01693 float aspect_ratio;
01694
01695 if (tessedit_lower_flip_hyphen <= 1)
01696 return;
01697
01698 outword_it.set_to_list (word->outword->blob_list ());
01699
01700 for (outword_it.mark_cycle_pt ();
01701 !outword_it.cycled_list (); outword_it.forward (), i++) {
01702 out_box = outword_it.data ()->bounding_box ();
01703 if (outword_it.at_last ())
01704 next_left = 9999;
01705 else
01706 next_left = outword_it.data_relative (1)->bounding_box ().left ();
01707
01708
01709
01710 if ((out_box.width () > 8 * word->denorm.scale ()) &&
01711 (out_box.left () > prev_right) && (out_box.right () < next_left)) {
01712 aspect_ratio = out_box.width () / (float) out_box.height ();
01713 if (str[i] == '.') {
01714 if (aspect_ratio >= tessedit_upper_flip_hyphen) {
01715 str[i] = '-';
01716 if (word->reject_map[i].rejected ())
01717 word->reject_map[i].setrej_hyphen_accept ();
01718 }
01719 if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
01720 word->reject_map[i].accepted ())
01721 word->reject_map[i].setrej_hyphen ();
01722 }
01723 else if (str[i] == '-') {
01724 if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
01725 (word->reject_map[i].rejected ()))
01726 word->reject_map[i].setrej_hyphen_accept ();
01727
01728 if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
01729 (word->reject_map[i].accepted ()))
01730 word->reject_map[i].setrej_hyphen ();
01731 }
01732 }
01733 prev_right = out_box.right ();
01734 }
01735 }
01736
01737
01746 void flip_0O(WERD_RES *word) {
01747 char *str = (char *) word->best_choice->string ().string ();
01748 int i;
01749 PBLOB_IT outword_it;
01750 BOX out_box;
01751
01752 if (!tessedit_flip_0O)
01753 return;
01754
01755 outword_it.set_to_list (word->outword->blob_list ());
01756
01757 for (i = 0, outword_it.mark_cycle_pt ();
01758 !outword_it.cycled_list (); i++, outword_it.forward ()) {
01759 if (isupper (str[i]) || isdigit (str[i])) {
01760 out_box = outword_it.data ()->bounding_box ();
01761 if ((out_box.top () < bln_baseline_offset + bln_x_height) ||
01762 (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))
01763 return;
01764 }
01765 }
01766
01767 for (i = 1; str[i] != '\0'; i++, outword_it.forward ()) {
01768 if ((str[i] == '0') || (str[i] == 'O')) {
01769
01770 if (non_O_upper (str[i - 1]) && non_O_upper (str[i + 1])) {
01771 str[i] = 'O';
01772 }
01773
01774 if (non_O_upper (str[i - 1])
01775 && ((str[i + 1] == '0') || (str[i + 1] == 'O'))
01776 && non_O_upper (str[i + 2])) {
01777 str[i] = 'O';
01778 str[i + 1] = 'O';
01779 i++;
01780 }
01781
01782 if ((i > 1) &&
01783 non_O_upper (str[i - 2]) &&
01784 non_O_upper (str[i - 1]) &&
01785 !isdigit (str[i + 1]) &&
01786 (str[i + 1] != 'l') && (str[i + 1] != 'I')) {
01787 str[i] = 'O';
01788 }
01789
01790 if (non_0_digit (str[i - 1]) && non_0_digit (str[i + 1])) {
01791 str[i] = '0';
01792 }
01793
01794 if (non_0_digit (str[i - 1]) &&
01795 ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01796 ((str[i + 2] == '0') || (str[i + 2] == 'O'))) {
01797 str[i] = '0';
01798 str[i + 1] = '0';
01799 str[i + 2] = '0';
01800 i += 2;
01801 }
01802
01803 if (non_0_digit (str[i - 1]) &&
01804 ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&
01805 !isupper (str[i + 2])) {
01806 str[i] = '0';
01807 str[i + 1] = '0';
01808 i++;
01809 }
01810
01811 if (non_0_digit (str[i - 1]) && !isupper (str[i + 1])) {
01812 str[i] = '0';
01813 }
01814
01815 if ((i > 1) &&
01816 ((str[i - 1] == '.') || (str[i - 1] == ',')) &&
01817 (isdigit (str[i - 2]) || (str[i - 2] == 'O'))) {
01818 if (str[i - 2] == 'O')
01819 str[i - 2] = '0';
01820 while ((str[i] == 'O') || (str[i] == '0')) {
01821 str[i++] = '0';
01822 }
01823 i--;
01824 }
01825 }
01826 }
01827 }
01828
01829
01833 BOOL8 non_O_upper(char c) {
01834 return isupper (c) && (c != 'O');
01835 }
01836
01837
01841 BOOL8 non_0_digit(char c) {
01842 return isdigit (c) && (c != '0');
01843 }