00001
00020 #include "mfcpch.h"
00021 #include <ctype.h>
00022 #include "docqual.h"
00023 #include "tstruct.h"
00024 #include "tfacep.h"
00025 #include "reject.h"
00026 #include "tessvars.h"
00027 #include "genblob.h"
00028 #include "secname.h"
00029 #ifdef TEXT_VERBOSE
00030 #include "callcpp.h"
00031 #endif
00032
00033 #define EXTERN
00034
00036 EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
00037 EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
00038 "Non standard number of outlines");
00039 EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
00040 "Allow outline errs in unrejection?");
00041 EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
00042 "Reduce rejection on good docs");
00043 EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
00044 EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
00045 "%rej allowed before rej whole doc");
00046 EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
00047 "%rej allowed before rej whole block");
00048 EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
00049 "%rej allowed before rej whole row");
00050 EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
00051 "%of row rejects in whole word rejects which prevents whole row rejection");
00052 EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
00053 "Only rej partially rejected words in block rejection");
00054 EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
00055 "Only rej partially rejected words in row rejection");
00056 EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
00057 "Use word segmentation quality metric");
00058 EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
00059 "Use word segmentation quality metric");
00060 EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
00061 "Only preserve wds longer than this");
00062 EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
00063 "Apply row rejection to good docs");
00064 EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
00065 "rej good doc wd if more than this fraction rejected");
00066 EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
00067 "Reject all bad quality wds");
00068 EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
00069 EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
00070 "Output data to debug file");
00071 EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
00072 EXTERN double_VAR (quality_rowrej_pc, 1.1,
00073 "good_quality_doc gte good char limit");
00074
00075 EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
00076 "Mark v.bad words for tilde crunch");
00077 EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
00078 EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
00079 "Take out ~^ early?");
00080
00081 EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
00082 EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
00083 EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
00084 "crunch garbage cert lt this");
00085 EXTERN double_VAR (crunch_poor_garbage_rate, 60,
00086 "crunch garbage rating lt this");
00087
00088 EXTERN double_VAR (crunch_pot_poor_rate, 40,
00089 "POTENTIAL crunch rating lt this");
00090 EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
00091 "POTENTIAL crunch cert lt this");
00092 EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
00093
00094 EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
00095 EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
00096 EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
00097 EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
00098 EXTERN double_VAR (crunch_del_min_width, 3.0,
00099 "Del if word width lt xht x this");
00100 EXTERN double_VAR (crunch_del_high_word, 1.5,
00101 "Del if word gt xht x this above bl");
00102 EXTERN double_VAR (crunch_del_low_word, 0.5,
00103 "Del if word gt xht x this below bl");
00104 EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
00105
00106 EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
00107 EXTERN INT_VAR (crunch_pot_indicators, 1,
00108 "How many potential indicators needed");
00109
00110 EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
00111 "Dont touch sensible strings");
00112 EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
00113 EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
00114 "Dont pot crunch sensible strings");
00115 EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
00116 EXTERN INT_VAR (crunch_leave_lc_strings, 4,
00117 "Dont crunch words with long lower case strings");
00118 EXTERN INT_VAR (crunch_leave_uc_strings, 4,
00119 "Dont crunch words with long lower case strings");
00120 EXTERN INT_VAR (crunch_long_repetitions, 3,
00121 "Crunch words with long repetitions");
00122
00123 EXTERN INT_VAR (crunch_debug, 0, "As it says");
00139 INT16 word_blob_quality(
00140 WERD_RES *word,
00141 ROW *row) {
00142 WERD *bln_word;
00143 TWERD *tessword;
00144 WERD *init_word;
00145 PBLOB_IT outword_it;
00146 PBLOB_IT initial_it;
00147 INT16 i;
00148 INT16 init_blobs_left;
00149 INT16 match_count = 0;
00150 BOOL8 matched;
00151 BOX out_box;
00152 PBLOB *test_blob;
00153 DENORM denorm;
00154 float bln_xht;
00155
00156 #ifdef TEXT_VERBOSE
00157
00158 cprintf("v");
00159 #endif
00160 if (word->word->gblob_list ()->empty ())
00161 return 0;
00162 bln_xht = bln_x_height / word->denorm.scale ();
00163 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00164
00165
00166
00167
00168
00169
00170
00171
00172 tessword = make_tess_word (bln_word, NULL);
00173 init_word = make_ed_word (tessword, bln_word);
00174
00175
00176
00177
00178
00179
00180
00181 initial_it.set_to_list (init_word->blob_list ());
00182 init_blobs_left = initial_it.length ();
00183 outword_it.set_to_list (word->outword->blob_list ());
00184 delete bln_word;
00185 delete_word(tessword);
00186
00187 for (outword_it.mark_cycle_pt ();
00188 !outword_it.cycled_list (); outword_it.forward ()) {
00189 out_box = outword_it.data ()->bounding_box ();
00190
00191
00192 while (!initial_it.at_last () &&
00193 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00194 initial_it.forward ();
00195 init_blobs_left--;
00196 }
00197
00198
00199
00200
00201 i = 0;
00202 matched = FALSE;
00203 do {
00204 test_blob = initial_it.data_relative (i++);
00205 matched = crude_match_blobs (test_blob, outword_it.data ());
00206 if (matched)
00207 match_count++;
00208 }
00209 while (!matched &&
00210 (init_blobs_left - i > 0) &&
00211 (i < 129) &&
00212 !initial_it.at_last () &&
00213 test_blob->bounding_box ().left () == out_box.left ());
00214 }
00215 delete init_word;
00216 return match_count;
00217 }
00218
00219
00227 BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {
00228 BOX box1 = blob1->bounding_box ();
00229 BOX box2 = blob2->bounding_box ();
00230
00231 if (box1.contains (box2) &&
00232 box2.contains (box1) &&
00233 (blob1->out_list ()->length () == blob1->out_list ()->length ()))
00234 return TRUE;
00235 else
00236 return FALSE;
00237 }
00238
00239
00246 INT16 word_outline_errs(
00247 WERD_RES *word) {
00248 PBLOB_IT outword_it;
00249 INT16 i = 0;
00250 INT16 err_count = 0;
00251
00252 outword_it.set_to_list (word->outword->blob_list ());
00253
00254 for (outword_it.mark_cycle_pt ();
00255 !outword_it.cycled_list (); outword_it.forward ()) {
00256 err_count += count_outline_errs (word->best_choice->string ()[i],
00257 outword_it.data ()->out_list ()-> length ());
00258 i++;
00259 }
00260 return err_count;
00261 }
00262
00263
00277 void word_char_quality(
00278 WERD_RES *word,
00279 ROW *row,
00280 INT16 *match_count,
00281 INT16 *accepted_match_count) {
00282 WERD *bln_word;
00283 TWERD *tessword;
00284 WERD *init_word;
00285 PBLOB_IT outword_it;
00286 PBLOB_IT initial_it;
00287 INT16 i;
00288 INT16 init_blobs_left;
00289 BOOL8 matched;
00290 BOX out_box;
00291 PBLOB *test_blob;
00292 DENORM denorm;
00293 float bln_xht;
00294 INT16 j = 0;
00295
00296 #ifdef TEXT_VERBOSE
00297
00298 cprintf("y");
00299 #endif
00300 *match_count = 0;
00301 *accepted_match_count = 0;
00302 if (word->word->gblob_list ()->empty ())
00303 return;
00304
00305 bln_xht = bln_x_height / word->denorm.scale ();
00306 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00307
00308
00309
00310
00311 tessword = make_tess_word (bln_word, NULL);
00312 init_word = make_ed_word (tessword, bln_word);
00313 delete bln_word;
00314 delete_word(tessword);
00315
00316
00317
00318
00319
00320 initial_it.set_to_list (init_word->blob_list ());
00321 init_blobs_left = initial_it.length ();
00322 outword_it.set_to_list (word->outword->blob_list ());
00323
00324 for (outword_it.mark_cycle_pt ();
00325 !outword_it.cycled_list (); outword_it.forward ()) {
00326 out_box = outword_it.data ()->bounding_box ();
00327
00328
00329 while (!initial_it.at_last () &&
00330 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00331 initial_it.forward ();
00332 init_blobs_left--;
00333 }
00334
00335
00336
00337
00338 i = 0;
00339 matched = FALSE;
00340 do {
00341 test_blob = initial_it.data_relative (i++);
00342 matched = crude_match_blobs (test_blob, outword_it.data ());
00343 if (matched &&
00344 (count_outline_errs (word->best_choice->string ()[j],
00345 outword_it.data ()->out_list ()->length ()) == 0)) {
00346 (*match_count)++;
00347 if (word->reject_map[j].accepted ())
00348 (*accepted_match_count)++;
00349 }
00350 }
00351 while (!matched &&
00352 (init_blobs_left - i > 0) &&
00353 (i < 129) &&
00354 !initial_it.at_last () &&
00355 test_blob->bounding_box ().left () == out_box.left ());
00356 j++;
00357 }
00358 delete init_word;
00359 }
00360
00361
00371 void unrej_good_chs(WERD_RES *word, ROW *row) {
00372 WERD *bln_word;
00373 TWERD *tessword;
00374 WERD *init_word;
00375 PBLOB_IT outword_it;
00376 PBLOB_IT initial_it;
00377 INT16 i;
00378 INT16 init_blobs_left;
00379 BOOL8 matched;
00380 BOX out_box;
00381 PBLOB *test_blob;
00382 DENORM denorm;
00383 float bln_xht;
00384 INT16 j = 0;
00385
00386 if (word->word->gblob_list ()->empty ())
00387 return;
00388
00389 bln_xht = bln_x_height / word->denorm.scale ();
00390 bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);
00391
00392
00393
00394
00395 tessword = make_tess_word (bln_word, NULL);
00396 init_word = make_ed_word (tessword, bln_word);
00397 delete bln_word;
00398 delete_word(tessword);
00399
00400 initial_it.set_to_list (init_word->blob_list ());
00401 init_blobs_left = initial_it.length ();
00402 outword_it.set_to_list (word->outword->blob_list ());
00403
00404 for (outword_it.mark_cycle_pt ();
00405 !outword_it.cycled_list (); outword_it.forward ()) {
00406 out_box = outword_it.data ()->bounding_box ();
00407
00408
00409 while (!initial_it.at_last () &&
00410 (initial_it.data ()->bounding_box ().left () < out_box.left ())) {
00411 initial_it.forward ();
00412 init_blobs_left--;
00413 }
00414
00415
00416
00417 i = 0;
00418 matched = FALSE;
00419 do {
00420 test_blob = initial_it.data_relative (i++);
00421 matched = crude_match_blobs (test_blob, outword_it.data ());
00422 if (matched &&
00423 (word->reject_map[j].accept_if_good_quality ()) &&
00424 (docqual_excuse_outline_errs ||
00425 (count_outline_errs (word->best_choice->string ()[j],
00426 outword_it.data ()->out_list ()->
00427 length ()) == 0)))
00428 word->reject_map[j].setrej_quality_accept ();
00429 }
00430 while (!matched &&
00431 (init_blobs_left - i > 0) &&
00432 (i < 129) &&
00433 !initial_it.at_last () &&
00434 test_blob->bounding_box ().left () == out_box.left ());
00435 j++;
00436 }
00437 delete init_word;
00438 }
00439
00440
00447 void print_boxes(WERD *word) {
00448 PBLOB_IT it;
00449 BOX box;
00450
00451 it.set_to_list (word->blob_list ());
00452 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
00453 box = it.data ()->bounding_box ();
00454 box.print ();
00455 }
00456 }
00457
00458
00475 INT16 count_outline_errs(char c, INT16 outline_count) {
00476 int expected_outline_count;
00477
00478 if (STRING (outlines_odd).contains (c))
00479 return 0;
00480 else if (STRING (outlines_2).contains (c))
00481 expected_outline_count = 2;
00482 else
00483 expected_outline_count = 1;
00484 return abs (outline_count - expected_outline_count);
00485 }
00486
00487
00496 void quality_based_rejection(PAGE_RES_IT &page_res_it,
00497 BOOL8 good_quality_doc) {
00498 if ((tessedit_good_quality_unrej && good_quality_doc))
00499 unrej_good_quality_words(page_res_it);
00500 doc_and_block_rejection(page_res_it, good_quality_doc);
00501
00502 page_res_it.restart_page ();
00503 while (page_res_it.word () != NULL) {
00504 insert_rej_cblobs (page_res_it.word ());
00505 page_res_it.forward ();
00506 }
00507
00508 if (unlv_tilde_crunching) {
00509 tilde_crunch(page_res_it);
00510 tilde_delete(page_res_it);
00511 }
00512 }
00513
00514
00533 void unrej_good_quality_words(
00534 PAGE_RES_IT &page_res_it) {
00535 WERD_RES *word;
00536 ROW_RES *current_row;
00537 BLOCK_RES *current_block;
00538 int i;
00539
00540 page_res_it.restart_page ();
00541 while (page_res_it.word () != NULL) {
00542 check_debug_pt (page_res_it.word (), 100);
00543 if (bland_unrej) {
00544 word = page_res_it.word ();
00545 for (i = 0; i < word->reject_map.length (); i++) {
00546 if (word->reject_map[i].accept_if_good_quality ())
00547 word->reject_map[i].setrej_quality_accept ();
00548 }
00549 page_res_it.forward ();
00550 }
00551 else if ((page_res_it.row ()->char_count > 0) &&
00552 ((page_res_it.row ()->rej_count /
00553 (float) page_res_it.row ()->char_count) <=
00554 quality_rowrej_pc)) {
00555 word = page_res_it.word ();
00556 if (word->reject_map.quality_recoverable_rejects () &&
00557 (tessedit_unrej_any_wd ||
00558 acceptable_word_string (word->best_choice->string ().string ())
00559 != AC_UNACCEPTABLE)) {
00560 unrej_good_chs (word, page_res_it.row ()->row);
00561 }
00562 page_res_it.forward ();
00563 }
00564 else {
00565
00566 current_row = page_res_it.row ();
00567 while ((page_res_it.word () != NULL) &&
00568 (page_res_it.row () == current_row))
00569 page_res_it.forward ();
00570 }
00571 check_debug_pt (page_res_it.word (), 110);
00572 }
00573 page_res_it.restart_page ();
00574 page_res_it.page_res->char_count = 0;
00575 page_res_it.page_res->rej_count = 0;
00576 current_block = NULL;
00577 current_row = NULL;
00578 while (page_res_it.word () != NULL) {
00579 if (current_block != page_res_it.block ()) {
00580 current_block = page_res_it.block ();
00581 current_block->char_count = 0;
00582 current_block->rej_count = 0;
00583 }
00584 if (current_row != page_res_it.row ()) {
00585 current_row = page_res_it.row ();
00586 current_row->char_count = 0;
00587 current_row->rej_count = 0;
00588 current_row->whole_word_rej_count = 0;
00589 }
00590 page_res_it.rej_stat_word ();
00591 page_res_it.forward ();
00592 }
00593 }
00594
00595
00608 void doc_and_block_rejection(
00609 PAGE_RES_IT &page_res_it,
00610 BOOL8 good_quality_doc) {
00611 INT16 block_no = 0;
00612 INT16 row_no = 0;
00613 BLOCK_RES *current_block;
00614 ROW_RES *current_row;
00615
00616 BOOL8 rej_word;
00617 BOOL8 prev_word_rejected;
00618 INT16 char_quality;
00619 INT16 accepted_char_quality;
00620
00621 if ((page_res_it.page_res->rej_count * 100.0 /
00622 page_res_it.page_res->char_count) > tessedit_reject_doc_percent) {
00623 reject_whole_page(page_res_it);
00624 #ifndef SECURE_NAMES
00625 if (tessedit_debug_doc_rejection) {
00626 tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n",
00627 page_res_it.page_res->char_count,
00628 page_res_it.page_res->rej_count);
00629 }
00630 #endif
00631 }
00632 else {
00633 #ifndef SECURE_NAMES
00634 if (tessedit_debug_doc_rejection)
00635 tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
00636 page_res_it.page_res->char_count,
00637 page_res_it.page_res->rej_count);
00638 #endif
00639
00640
00641
00642 page_res_it.restart_page ();
00643 while (page_res_it.word () != NULL) {
00644 current_block = page_res_it.block ();
00645 if (current_block->block->text_region () != NULL)
00646 block_no = current_block->block->text_region ()->id_no ();
00647 else
00648 block_no = -1;
00649 if ((page_res_it.block ()->char_count > 0) &&
00650 ((page_res_it.block ()->rej_count * 100.0 /
00651 page_res_it.block ()->char_count) >
00652 tessedit_reject_block_percent)) {
00653 #ifndef SECURE_NAMES
00654 if (tessedit_debug_block_rejection)
00655 tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
00656 block_no,
00657 page_res_it.block ()->char_count,
00658 page_res_it.block ()->rej_count);
00659 #endif
00660 prev_word_rejected = FALSE;
00661 while ((page_res_it.word () != NULL) &&
00662 (page_res_it.block () == current_block)) {
00663 if (tessedit_preserve_blk_rej_perfect_wds) {
00664 rej_word =
00665 (page_res_it.word ()->reject_map.reject_count () > 0)
00666 || (page_res_it.word ()->reject_map.length () <
00667 tessedit_preserve_min_wd_len);
00668 if (rej_word && tessedit_dont_blkrej_good_wds
00669 && !(page_res_it.word ()->reject_map.length () <
00670 tessedit_preserve_min_wd_len)
00671 &&
00672 (acceptable_word_string
00673 (page_res_it.word ()->best_choice->string ().
00674 string ()) != AC_UNACCEPTABLE)) {
00675 word_char_quality (page_res_it.word (),
00676 page_res_it.row ()->row,
00677 &char_quality,
00678 &accepted_char_quality);
00679 rej_word = char_quality !=
00680 page_res_it.word ()->reject_map.length ();
00681 }
00682 }
00683 else
00684 rej_word = TRUE;
00685 if (rej_word) {
00686
00687
00688
00689
00690
00691 if (tessedit_use_reject_spaces &&
00692 prev_word_rejected &&
00693 (page_res_it.prev_row () == page_res_it.row ()) &&
00694 (page_res_it.word ()->word->space () == 1))
00695 page_res_it.word ()->reject_spaces = TRUE;
00696 page_res_it.word ()->reject_map.rej_word_block_rej ();
00697 }
00698 prev_word_rejected = rej_word;
00699 page_res_it.forward ();
00700 }
00701 }
00702 else {
00703 #ifndef SECURE_NAMES
00704 if (tessedit_debug_block_rejection)
00705 tprintf
00706 ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
00707 block_no, page_res_it.block ()->char_count,
00708 page_res_it.block ()->rej_count);
00709 #endif
00710
00711
00712 row_no = 0;
00713 while ((page_res_it.word () != NULL) &&
00714 (page_res_it.block () == current_block)) {
00715 current_row = page_res_it.row ();
00716 row_no++;
00717
00718
00719
00720
00721
00722 if ((page_res_it.row ()->char_count > 0) &&
00723 ((page_res_it.row ()->rej_count * 100.0 /
00724 page_res_it.row ()->char_count) >
00725 tessedit_reject_row_percent) &&
00726 ((page_res_it.row ()->whole_word_rej_count * 100.0 /
00727 page_res_it.row ()->rej_count) <
00728 tessedit_whole_wd_rej_row_percent)) {
00729 #ifndef SECURE_NAMES
00730 if (tessedit_debug_block_rejection)
00731 tprintf
00732 ("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
00733 row_no, page_res_it.row ()->char_count,
00734 page_res_it.row ()->rej_count);
00735 #endif
00736 prev_word_rejected = FALSE;
00737 while ((page_res_it.word () != NULL) &&
00738 (page_res_it.row () == current_row)) {
00739
00740 if (!tessedit_row_rej_good_docs && good_quality_doc) {
00741 rej_word =
00742 page_res_it.word ()->reject_map.
00743 reject_count () /
00744 (float) page_res_it.word ()->reject_map.
00745 length () > tessedit_good_doc_still_rowrej_wd;
00746 }
00747
00748
00749 else if (tessedit_preserve_row_rej_perfect_wds) {
00750 rej_word =
00751 (page_res_it.word ()->reject_map.
00752 reject_count () > 0)
00753 || (page_res_it.word ()->reject_map.
00754 length () < tessedit_preserve_min_wd_len);
00755 if (rej_word && tessedit_dont_rowrej_good_wds
00756 && !(page_res_it.word ()->reject_map.
00757 length () <
00758 tessedit_preserve_min_wd_len)
00759 &&
00760 (acceptable_word_string
00761 (page_res_it.word ()->best_choice->
00762 string ().string ()) != AC_UNACCEPTABLE)) {
00763 word_char_quality (page_res_it.word (),
00764 page_res_it.row ()->row,
00765 &char_quality,
00766 &accepted_char_quality);
00767 rej_word = char_quality !=
00768 page_res_it.word ()->reject_map.length ();
00769 }
00770 }
00771 else
00772 rej_word = TRUE;
00773 if (rej_word) {
00774
00775
00776
00777
00778
00779 if (tessedit_use_reject_spaces &&
00780 prev_word_rejected &&
00781 (page_res_it.prev_row () ==
00782 page_res_it.row ())
00783 && (page_res_it.word ()->word->space () ==
00784 1))
00785 page_res_it.word ()->reject_spaces = TRUE;
00786 page_res_it.word ()->reject_map.
00787 rej_word_row_rej();
00788 }
00789 prev_word_rejected = rej_word;
00790 page_res_it.forward ();
00791 }
00792 }
00793 else {
00794 #ifndef SECURE_NAMES
00795 if (tessedit_debug_block_rejection)
00796 tprintf
00797 ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
00798 row_no, page_res_it.row ()->char_count,
00799 page_res_it.row ()->rej_count);
00800 #endif
00801 while ((page_res_it.word () != NULL) &&
00802 (page_res_it.row () == current_row))
00803 page_res_it.forward ();
00804 }
00805 }
00806 }
00807 }
00808 }
00809 }
00810
00811
00822 void reject_whole_page(PAGE_RES_IT &page_res_it) {
00823 page_res_it.restart_page ();
00824 while (page_res_it.word () != NULL) {
00825 page_res_it.word ()->reject_map.rej_word_doc_rej ();
00826 page_res_it.forward ();
00827 }
00828 page_res_it.page_res->rejected = TRUE;
00829 }
00830
00831
00834 void tilde_crunch(PAGE_RES_IT &page_res_it) {
00835 WERD_RES *word;
00836 GARBAGE_LEVEL garbage_level;
00837 PAGE_RES_IT copy_it;
00838 BOOL8 prev_potential_marked = FALSE;
00839 BOOL8 found_terrible_word = FALSE;
00840 int dict_type;
00841 BOOL8 ok_dict_word;
00842
00843 page_res_it.restart_page ();
00844 while (page_res_it.word () != NULL) {
00845 word = page_res_it.word ();
00846
00847 if (crunch_early_convert_bad_unlv_chs)
00848 convert_bad_unlv_chs(word);
00849
00850 if (crunch_early_merge_tess_fails)
00851 merge_tess_fails(word);
00852
00853 if (word->reject_map.accept_count () != 0) {
00854 found_terrible_word = FALSE;
00855 prev_potential_marked = FALSE;
00856 }
00857 else {
00858 dict_type = dict_word (word->best_choice->string ().string ());
00859 ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);
00860 garbage_level = garbage_word (word, ok_dict_word);
00861
00862 if ((garbage_level != G_NEVER_CRUNCH) &&
00863 (terrible_word_crunch (word, garbage_level))) {
00864 if (crunch_debug > 0) {
00865 tprintf ("T CRUNCHING: \"%s\"\n",
00866 word->best_choice->string ().string ());
00867 }
00868 word->unlv_crunch_mode = CR_KEEP_SPACE;
00869 if (prev_potential_marked) {
00870 while (copy_it.word () != word) {
00871 if (crunch_debug > 0) {
00872 tprintf ("P1 CRUNCHING: \"%s\"\n",
00873 copy_it.word ()->best_choice->string ().
00874 string ());
00875 }
00876 copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
00877 copy_it.forward ();
00878 }
00879 prev_potential_marked = FALSE;
00880 }
00881 found_terrible_word = TRUE;
00882 }
00883 else if ((garbage_level != G_NEVER_CRUNCH) &&
00884 (potential_word_crunch (word,
00885 garbage_level, ok_dict_word))) {
00886 if (found_terrible_word) {
00887 if (crunch_debug > 0) {
00888 tprintf ("P2 CRUNCHING: \"%s\"\n",
00889 word->best_choice->string ().string ());
00890 }
00891 word->unlv_crunch_mode = CR_KEEP_SPACE;
00892 }
00893 else if (!prev_potential_marked) {
00894 copy_it = page_res_it;
00895 prev_potential_marked = TRUE;
00896 if (crunch_debug > 1) {
00897 tprintf ("P3 CRUNCHING: \"%s\"\n",
00898 word->best_choice->string ().string ());
00899 }
00900 }
00901 }
00902 else {
00903 found_terrible_word = FALSE;
00904 prev_potential_marked = FALSE;
00905 if (crunch_debug > 2) {
00906 tprintf ("NO CRUNCH: \"%s\"\n",
00907 word->best_choice->string ().string ());
00908 }
00909 }
00910 }
00911 page_res_it.forward ();
00912 }
00913 }
00914
00915
00926 BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
00927 float rating_per_ch;
00928 int adjusted_len;
00929 int crunch_mode = 0;
00930
00931 if ((word->best_choice->string ().length () == 0) ||
00932 (strspn (word->best_choice->string ().string (), " ") ==
00933 word->best_choice->string ().length ()))
00934 crunch_mode = 1;
00935 else {
00936 adjusted_len = word->reject_map.length ();
00937 if (adjusted_len > crunch_rating_max)
00938 adjusted_len = crunch_rating_max;
00939 rating_per_ch = word->best_choice->rating () / adjusted_len;
00940
00941 if (rating_per_ch > crunch_terrible_rating)
00942 crunch_mode = 2;
00943 else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
00944 crunch_mode = 3;
00945 else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
00946 (garbage_level != G_OK))
00947 crunch_mode = 4;
00948 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
00949 (garbage_level != G_OK))
00950 crunch_mode = 5;
00951 }
00952 if (crunch_mode > 0) {
00953 if (crunch_debug > 2) {
00954 tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
00955 crunch_mode, word->best_choice->string ().string ());
00956 }
00957 return TRUE;
00958 }
00959 else
00960 return FALSE;
00961 }
00962
00963
00974 BOOL8 potential_word_crunch(WERD_RES *word,
00975 GARBAGE_LEVEL garbage_level,
00976 BOOL8 ok_dict_word) {
00977 float rating_per_ch;
00978 int adjusted_len;
00979 char *str = (char *) word->best_choice->string ().string ();
00980 BOOL8 word_crunchable;
00981 int poor_indicator_count = 0;
00982
00983 word_crunchable =
00984 !crunch_leave_accept_strings ||
00985 (word->reject_map.length () < 3) ||
00986 ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word);
00987
00988 adjusted_len = word->reject_map.length ();
00989 if (adjusted_len > 10)
00990 adjusted_len = 10;
00991 rating_per_ch = word->best_choice->rating () / adjusted_len;
00992
00993 if (rating_per_ch > crunch_pot_poor_rate) {
00994 if (crunch_debug > 2) {
00995 tprintf ("Potential poor rating on \"%s\"\n",
00996 word->best_choice->string ().string ());
00997 }
00998 poor_indicator_count++;
00999 }
01000
01001 if (word_crunchable &&
01002 (word->best_choice->certainty () < crunch_pot_poor_cert)) {
01003 if (crunch_debug > 2) {
01004 tprintf ("Potential poor cert on \"%s\"\n",
01005 word->best_choice->string ().string ());
01006 }
01007 poor_indicator_count++;
01008 }
01009
01010 if (garbage_level != G_OK) {
01011 if (crunch_debug > 2) {
01012 tprintf ("Potential garbage on \"%s\"\n",
01013 word->best_choice->string ().string ());
01014 }
01015 poor_indicator_count++;
01016 }
01017 return (poor_indicator_count >= crunch_pot_indicators);
01018 }
01019
01020
01030 void tilde_delete(PAGE_RES_IT &page_res_it) {
01031 WERD_RES *word;
01032 PAGE_RES_IT copy_it;
01033 BOOL8 deleting_from_bol = FALSE;
01034 BOOL8 marked_delete_point = FALSE;
01035 INT16 debug_delete_mode;
01036 CRUNCH_MODE delete_mode;
01037 INT16 x_debug_delete_mode;
01038 CRUNCH_MODE x_delete_mode;
01039
01040 page_res_it.restart_page ();
01041 while (page_res_it.word () != NULL) {
01042 word = page_res_it.word ();
01043
01044 delete_mode = word_deletable (word, debug_delete_mode);
01045 if (delete_mode != CR_NONE) {
01046 if (word->word->flag (W_BOL) || deleting_from_bol) {
01047 if (crunch_debug > 0) {
01048 tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
01049 debug_delete_mode,
01050 word->best_choice->string ().string ());
01051 }
01052 word->unlv_crunch_mode = delete_mode;
01053 deleting_from_bol = TRUE;
01054 }
01055 else if (word->word->flag (W_EOL)) {
01056 if (marked_delete_point) {
01057 while (copy_it.word () != word) {
01058 x_delete_mode = word_deletable (copy_it.word (),
01059 x_debug_delete_mode);
01060 if (crunch_debug > 0) {
01061 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01062 x_debug_delete_mode,
01063 copy_it.word ()->best_choice->string ().
01064 string ());
01065 }
01066 copy_it.word ()->unlv_crunch_mode = x_delete_mode;
01067 copy_it.forward ();
01068 }
01069 }
01070 if (crunch_debug > 0) {
01071 tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
01072 debug_delete_mode,
01073 word->best_choice->string ().string ());
01074 }
01075 word->unlv_crunch_mode = delete_mode;
01076 deleting_from_bol = FALSE;
01077 marked_delete_point = FALSE;
01078 }
01079 else {
01080 if (!marked_delete_point) {
01081 copy_it = page_res_it;
01082 marked_delete_point = TRUE;
01083 }
01084 }
01085 }
01086 else {
01087 deleting_from_bol = FALSE;
01088 marked_delete_point = FALSE;
01089 }
01090
01091
01092
01093 if (!crunch_early_merge_tess_fails)
01094 merge_tess_fails(word);
01095 page_res_it.forward ();
01096 }
01097 }
01098
01099
01106 void convert_bad_unlv_chs(
01107 WERD_RES *word_res) {
01108 char *ptr;
01109 int i;
01110
01111 ptr = (char *) word_res->best_choice->string ().string ();
01112 for (i = 0; i < word_res->reject_map.length (); i++) {
01113 if (ptr[i] == '~') {
01114 ptr[i] = '-';
01115 if (word_res->reject_map[i].accepted ())
01116 word_res->reject_map[i].setrej_unlv_rej ();
01117 }
01118 if (ptr[i] == '^') {
01119 ptr[i] = ' ';
01120 if (word_res->reject_map[i].accepted ())
01121 word_res->reject_map[i].setrej_unlv_rej ();
01122 }
01123 }
01124 }
01125
01126
01133 void merge_tess_fails(
01134 WERD_RES *word_res) {
01135 char *ptr;
01136 PBLOB_IT blob_it;
01137 int i = 0;
01138 int len;
01139
01140 len = strlen (word_res->best_choice->string ().string ());
01141 ASSERT_HOST (word_res->reject_map.length () == len);
01142 ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01143
01144 ptr = (char *) word_res->best_choice->string ().string ();
01145 blob_it = word_res->outword->blob_list ();
01146 while (*ptr != '\0') {
01147 if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {
01148 strcpy (ptr + 1, ptr + 2);
01149 word_res->reject_map.remove_pos (i);
01150 merge_blobs (blob_it.data_relative (1), blob_it.data ());
01151 delete blob_it.extract ();
01152 }
01153 else {
01154 i++;
01155 ptr++;
01156 }
01157 blob_it.forward ();
01158 }
01159 len = strlen (word_res->best_choice->string ().string ());
01160 ASSERT_HOST (word_res->reject_map.length () == len);
01161 ASSERT_HOST (word_res->outword->blob_list ()->length () == len);
01162 }
01163
01164
01181 GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
01182 enum STATES
01183 {
01184 JUNK,
01185 FIRST_UPPER,
01186 FIRST_LOWER,
01187 FIRST_NUM,
01188 SUBSEQUENT_UPPER,
01189 SUBSEQUENT_LOWER,
01190 SUBSEQUENT_NUM
01191 };
01192 char *str = (char *) word->best_choice->string ().string ();
01193 STATES state = JUNK;
01194 int len = 0;
01195 int isolated_digits = 0;
01196 int isolated_alphas = 0;
01197 int bad_char_count = 0;
01198 int tess_rejs = 0;
01199 int dodgy_chars = 0;
01200 int ok_chars;
01201 char last_char = ' ';
01202 int alpha_repetition_count = 0;
01203 int longest_alpha_repetition_count = 0;
01204 int longest_lower_run_len = 0;
01205 int lower_string_count = 0;
01206 int longest_upper_run_len = 0;
01207 int upper_string_count = 0;
01208 int total_alpha_count = 0;
01209 int total_digit_count = 0;
01210
01211
01212
01213 for (; *str != '\0'; str++) {
01214 len++;
01215 if (isupper (*str)) {
01216 total_alpha_count++;
01217 switch (state) {
01218 case SUBSEQUENT_UPPER:
01219 case FIRST_UPPER:
01220 state = SUBSEQUENT_UPPER;
01221 upper_string_count++;
01222 if (longest_upper_run_len < upper_string_count)
01223 longest_upper_run_len = upper_string_count;
01224 if (last_char == *str) {
01225 alpha_repetition_count++;
01226 if (longest_alpha_repetition_count < alpha_repetition_count) {
01227 longest_alpha_repetition_count = alpha_repetition_count;
01228 }
01229 }
01230 else {
01231 last_char = *str;
01232 alpha_repetition_count = 1;
01233 }
01234 break;
01235 case FIRST_NUM:
01236 isolated_digits++;
01237 default:
01238 state = FIRST_UPPER;
01239 last_char = *str;
01240 alpha_repetition_count = 1;
01241 upper_string_count = 1;
01242 break;
01243 }
01244 }
01245 else if (islower (*str)) {
01246 total_alpha_count++;
01247 switch (state) {
01248 case SUBSEQUENT_LOWER:
01249 case FIRST_LOWER:
01250 state = SUBSEQUENT_LOWER;
01251 lower_string_count++;
01252 if (longest_lower_run_len < lower_string_count)
01253 longest_lower_run_len = lower_string_count;
01254 if (last_char == *str) {
01255 alpha_repetition_count++;
01256 if (longest_alpha_repetition_count < alpha_repetition_count) {
01257 longest_alpha_repetition_count = alpha_repetition_count;
01258 }
01259 }
01260 else {
01261 last_char = *str;
01262 alpha_repetition_count = 1;
01263 }
01264 break;
01265 case FIRST_NUM:
01266 isolated_digits++;
01267 default:
01268 state = FIRST_LOWER;
01269 last_char = *str;
01270 alpha_repetition_count = 1;
01271 lower_string_count = 1;
01272 break;
01273 }
01274 }
01275 else if (isdigit (*str)) {
01276 total_digit_count++;
01277 switch (state) {
01278 case FIRST_NUM:
01279 state = SUBSEQUENT_NUM;
01280 case SUBSEQUENT_NUM:
01281 break;
01282 case FIRST_UPPER:
01283 case FIRST_LOWER:
01284 isolated_alphas++;
01285 default:
01286 state = FIRST_NUM;
01287 break;
01288 }
01289 }
01290 else {
01291 if (*str == ' ')
01292 tess_rejs++;
01293 else
01294 bad_char_count++;
01295 switch (state) {
01296 case FIRST_NUM:
01297 isolated_digits++;
01298 break;
01299 case FIRST_UPPER:
01300 case FIRST_LOWER:
01301 isolated_alphas++;
01302 default:
01303 break;
01304 }
01305 state = JUNK;
01306 }
01307 }
01308
01309
01310
01311 switch (state) {
01312 case FIRST_NUM:
01313 isolated_digits++;
01314 break;
01315 case FIRST_UPPER:
01316 case FIRST_LOWER:
01317 isolated_alphas++;
01318 default:
01319 break;
01320 }
01321
01322 if (crunch_include_numerals) {
01323 total_alpha_count += total_digit_count - isolated_digits;
01324 }
01325
01326 if (crunch_leave_ok_strings &&
01327 (len >= 4) &&
01328 (2 * (total_alpha_count - isolated_alphas) > len) &&
01329 (longest_alpha_repetition_count < crunch_long_repetitions)) {
01330 if ((crunch_accept_ok &&
01331 (acceptable_word_string (str) != AC_UNACCEPTABLE)) ||
01332 (longest_lower_run_len > crunch_leave_lc_strings) ||
01333 (longest_upper_run_len > crunch_leave_uc_strings))
01334 return G_NEVER_CRUNCH;
01335 }
01336 if ((word->reject_map.length () > 1) &&
01337 (strpbrk (str, " ") == NULL) &&
01338 ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
01339 (word->best_choice->permuter () == FREQ_DAWG_PERM) ||
01340 (word->best_choice->permuter () == USER_DAWG_PERM) ||
01341 (word->best_choice->permuter () == NUMBER_PERM) ||
01342 (acceptable_word_string (str) != AC_UNACCEPTABLE) || ok_dict_word))
01343 return G_OK;
01344
01345 ok_chars = len - bad_char_count - isolated_digits -
01346 isolated_alphas - tess_rejs;
01347
01348 if (crunch_debug > 3) {
01349 tprintf ("garbage_word: \"%s\"\n",
01350 word->best_choice->string ().string ());
01351 tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
01352 len,
01353 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
01354 }
01355 if ((bad_char_count == 0) &&
01356 (tess_rejs == 0) &&
01357 ((len > isolated_digits + isolated_alphas) || (len <= 2)))
01358 return G_OK;
01359
01360 if ((tess_rejs > ok_chars) ||
01361 ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len)))
01362 return G_TERRIBLE;
01363
01364 if (len > 4) {
01365 dodgy_chars = 2 * tess_rejs + bad_char_count +
01366 isolated_digits + isolated_alphas;
01367 if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5))
01368 return G_DODGY;
01369 else
01370 return G_OK;
01371 }
01372 else {
01373 dodgy_chars = 2 * tess_rejs + bad_char_count;
01374 if (((len == 4) && (dodgy_chars > 2)) ||
01375 ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len))
01376 return G_DODGY;
01377 else
01378 return G_OK;
01379 }
01380 }
01381
01382
01409 CRUNCH_MODE word_deletable(WERD_RES *word, INT16 &delete_mode) {
01410 int word_len = word->reject_map.length ();
01411 float rating_per_ch;
01412 BOX box;
01413
01414 if (word->unlv_crunch_mode == CR_NONE) {
01415 delete_mode = 0;
01416 return CR_NONE;
01417 }
01418
01419 if (word_len == 0) {
01420 delete_mode = 1;
01421 return CR_DELETE;
01422 }
01423
01424 box = word->outword->bounding_box ();
01425 if (box.height () < crunch_del_min_ht * bln_x_height) {
01426 delete_mode = 4;
01427 return CR_DELETE;
01428 }
01429
01430 if (noise_outlines (word->outword)) {
01431 delete_mode = 5;
01432 return CR_DELETE;
01433 }
01434
01435 if ((failure_count (word) * 1.5) > word_len) {
01436 delete_mode = 2;
01437 return CR_LOOSE_SPACE;
01438 }
01439
01440 if (word->best_choice->certainty () < crunch_del_cert) {
01441 delete_mode = 7;
01442 return CR_LOOSE_SPACE;
01443 }
01444
01445 rating_per_ch = word->best_choice->rating () / word_len;
01446
01447 if (rating_per_ch > crunch_del_rating) {
01448 delete_mode = 8;
01449 return CR_LOOSE_SPACE;
01450 }
01451
01452 if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) {
01453 delete_mode = 9;
01454 return CR_LOOSE_SPACE;
01455 }
01456
01457 if (box.bottom () >
01458 bln_baseline_offset + crunch_del_high_word * bln_x_height) {
01459 delete_mode = 10;
01460 return CR_LOOSE_SPACE;
01461 }
01462
01463 if (box.height () > crunch_del_max_ht * bln_x_height) {
01464 delete_mode = 11;
01465 return CR_LOOSE_SPACE;
01466 }
01467
01468 if (box.width () < crunch_del_min_width * bln_x_height) {
01469 delete_mode = 3;
01470 return CR_LOOSE_SPACE;
01471 }
01472
01473 delete_mode = 0;
01474 return CR_NONE;
01475 }
01476
01477
01484 INT16 failure_count(WERD_RES *word) {
01485 char *str = (char *) word->best_choice->string ().string ();
01486 int tess_rejs = 0;
01487
01488 for (; *str != '\0'; str++) {
01489 if (*str == ' ')
01490 tess_rejs++;
01491 }
01492 return tess_rejs;
01493 }
01494
01495
01507 BOOL8 noise_outlines(WERD *word) {
01508 PBLOB_IT blob_it;
01509 OUTLINE_IT outline_it;
01510 BOX box;
01511 INT16 outline_count = 0;
01512 INT16 small_outline_count = 0;
01513 INT16 max_dimension;
01514 float small_limit = bln_x_height * crunch_small_outlines_size;
01515
01516 blob_it.set_to_list (word->blob_list ());
01517 for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
01518 outline_it.set_to_list (blob_it.data ()->out_list ());
01519 for (outline_it.mark_cycle_pt(); !outline_it.cycled_list(); outline_it.forward()) {
01520 outline_count++;
01521 box = outline_it.data ()->bounding_box ();
01522 if (box.height () > box.width ())
01523 max_dimension = box.height ();
01524 else
01525 max_dimension = box.width ();
01526 if (max_dimension < small_limit)
01527 small_outline_count++;
01528 }
01529 }
01530 return (small_outline_count >= outline_count);
01531 }
01532
01533
01542 void insert_rej_cblobs(
01543 WERD_RES *word) {
01544 PBLOB_IT blob_it;
01545 PBLOB_IT rej_blob_it;
01546 const STRING *wordstr;
01547 int old_len;
01548 int rej_len;
01549 char new_str[512];
01550 REJMAP new_map;
01551 int i = 0;
01552 int j = 0;
01553 int new_len;
01554
01555 gblob_sort_list (word->outword->rej_blob_list (), TRUE);
01556 rej_blob_it.set_to_list (word->outword->rej_blob_list ());
01557 if (rej_blob_it.empty ())
01558 return;
01559 rej_len = rej_blob_it.length ();
01560 blob_it.set_to_list (word->outword->blob_list ());
01561 wordstr = &(word->best_choice->string ());
01562 old_len = wordstr->length ();
01563 ASSERT_HOST (word->reject_map.length () == old_len);
01564 ASSERT_HOST (blob_it.length () == old_len);
01565 if ((old_len + rej_len) > 511)
01566 return;
01567 new_map.initialise (old_len + rej_len);
01568
01569 while (!rej_blob_it.empty ()) {
01570 if ((j >= old_len) ||
01571 (rej_blob_it.data ()->bounding_box ().left () <=
01572 blob_it.data ()->bounding_box ().left ())) {
01573
01574 if (j >= old_len)
01575 blob_it.add_to_end (rej_blob_it.extract ());
01576 else
01577 blob_it.add_before_stay_put (rej_blob_it.extract ());
01578 if (!rej_blob_it.empty ())
01579 rej_blob_it.forward ();
01580 new_str[i] = ' ';
01581 new_map[i].setrej_rej_cblob ();
01582 i++;
01583 }
01584 else {
01585 new_str[i] = (*wordstr)[j];
01586 new_map[i] = word->reject_map[j];
01587 i++;
01588 j++;
01589 blob_it.forward ();
01590 }
01591 }
01592
01593 while (j < wordstr->length ()) {
01594 new_str[i] = (*wordstr)[j];
01595 new_map[i] = word->reject_map[j];
01596 i++;
01597 j++;
01598 }
01599 new_str[i] = '\0';
01600
01601
01602
01603
01604
01605 ASSERT_HOST (i == blob_it.length ());
01606 ASSERT_HOST (i == old_len + rej_len);
01607 word->reject_map = new_map;
01608 *((STRING *) wordstr) = new_str;
01609 new_len = strlen (word->best_choice->string ().string ());
01610 ASSERT_HOST (word->reject_map.length () == new_len);
01611 ASSERT_HOST (word->outword->blob_list ()->length () == new_len);
01612 }