#include "mfcpch.h"
#include "mainblk.h"
#include <string.h>
#include <math.h>
#include <ctype.h>
#include "ocrclass.h"
#include "werdit.h"
#include "drawfx.h"
#include "tfacep.h"
#include "tessbox.h"
#include "tessvars.h"
#include "pgedit.h"
#include "reject.h"
#include "adaptions.h"
#include "charcut.h"
#include "fixxht.h"
#include "fixspace.h"
#include "genblob.h"
#include "docqual.h"
#include "control.h"
#include "secname.h"
#include "output.h"
#include "callcpp.h"
#include "notdll.h"
#include "tordvars.h"
#include "adaptmatch.h"
Go to the source code of this file.
#define EXTERN |
Definition at line 56 of file control.cpp.
#define MAX_XHEIGHT_DIFF 3 |
Definition at line 55 of file control.cpp.
#define MIN_FONT_ROW_COUNT 8 |
(C) Copyright 1992, Hewlett-Packard Ltd. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Definition at line 54 of file control.cpp.
ACCEPTABLE_WERD_TYPE acceptable_word_string | ( | const char * | s | ) |
Determine if word is acceptable.
Iterate through word and apply heuristics
Definition at line 1222 of file control.cpp.
References AC_INITIAL_CAP, AC_LC_ABBREV, AC_LOWER_CASE, AC_UC_ABBREV, AC_UNACCEPTABLE, and AC_UPPER_CASE.
Referenced by doc_and_block_rejection(), garbage_word(), potential_word_crunch(), set_unlv_suspects(), and unrej_good_quality_words().
01222 { 01223 int i = 0; 01224 int leading_punct_count; 01225 int upper_count = 0; 01226 int hyphen_pos = -1; 01227 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; 01228 01229 if (strlen (s) > 20) 01230 return word_type; 01231 01232 /* Single Leading punctuation char*/ 01233 01234 if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i]))) 01235 i++; 01236 leading_punct_count = i; 01237 01238 /* Initial cap */ 01239 while (isupper (s[i])) { 01240 i++; 01241 upper_count++; 01242 } 01243 if (upper_count > 1) 01244 word_type = AC_UPPER_CASE; 01245 else { 01246 /* Lower case word, possibly with an initial cap */ 01247 while (islower (s[i])) { 01248 i++; 01249 } 01250 if (i - leading_punct_count < quality_min_initial_alphas_reqd) 01251 goto not_a_word; 01252 /* 01253 Allow a single hyphen in a lower case word 01254 - dont trust upper case - I've seen several cases of "H" -> "I-I" 01255 */ 01256 if (s[i] == '-') { 01257 hyphen_pos = i++; 01258 if (s[i] != '\0') { 01259 while (islower (s[i])) { 01260 i++; 01261 } 01262 if (i < hyphen_pos + 3) 01263 goto not_a_word; 01264 } 01265 } 01266 else { 01267 /* Allow "'s" in NON hyphenated lower case words */ 01268 if ((s[i] == '\'') && (s[i + 1] == 's')) 01269 i += 2; 01270 } 01271 if (upper_count > 0) 01272 word_type = AC_INITIAL_CAP; 01273 else 01274 word_type = AC_LOWER_CASE; 01275 } 01276 01277 /* Up to two different, constrained trailing punctuation chars */ 01278 if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i]))) 01279 i++; 01280 if ((s[i] != '\0') && 01281 (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i]))) 01282 i++; 01283 01284 if (s[i] != '\0') 01285 word_type = AC_UNACCEPTABLE; 01286 01287 not_a_word: 01288 01289 if (word_type == AC_UNACCEPTABLE) { 01290 /* Look for abbreviation string */ 01291 i = 0; 01292 if (isupper (s[0])) { 01293 word_type = AC_UC_ABBREV; 01294 while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.')) 01295 i += 2; 01296 } 01297 else if (islower 
(s[0])) { 01298 word_type = AC_LC_ABBREV; 01299 while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.')) 01300 i += 2; 01301 } 01302 if (s[i] != '\0') 01303 word_type = AC_UNACCEPTABLE; 01304 } 01305 01306 return word_type; 01307 }
Add into the stats for one row.
Definition at line 1628 of file control.cpp.
References STATS::add(), WERD_RES::bold, WERD_RES::font1, WERD_RES::font1_count, WERD_RES::font2, WERD_RES::font2_count, WERD_RES::italic, and ROW_RES::word_res_list.
01633 { 01634 WERD_RES *word; //current word 01635 WERD_RES_IT word_it = &row->word_res_list; 01636 01637 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { 01638 word = word_it.data (); 01639 *italic += word->italic; 01640 *bold += word->bold; 01641 if (word->font1_count > 0) 01642 fonts->add (word->font1, word->font1_count); 01643 if (word->font2_count > 0) 01644 fonts->add (word->font2, word->font2_count); 01645 01646 } 01647 }
DEBUGGING ROUTINE.
Definition at line 1314 of file control.cpp.
References WERD_RES::best_choice, WERD::bounding_box(), BOX::contains(), debug_fp, WERD_RES::done, FALSE, REJMAP::full_print(), WERD::print(), REJMAP::print(), WERD_RES::reject_map, WERD_RES::tess_accepted, tprintf(), TRUE, and WERD_RES::word.
Referenced by classify_word_pass1(), classify_word_pass2(), fix_fuzzy_spaces(), output_pass(), recog_all_words(), unrej_good_quality_words(), and write_results().
01314 { 01315 BOOL8 show_map_detail = FALSE; 01316 INT16 i; 01317 01318 #ifndef SECURE_NAMES 01319 if (!test_pt) 01320 return FALSE; 01321 01322 tessedit_rejection_debug.set_value (FALSE); 01323 debug_x_ht_level.set_value (0); 01324 tessedit_cluster_debug.set_value (FALSE); 01325 nn_debug.set_value (FALSE); 01326 nn_reject_debug.set_value (FALSE); 01327 01328 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) { 01329 if (location < 0) 01330 return TRUE; //For breakpoint use 01331 tessedit_rejection_debug.set_value (TRUE); 01332 debug_x_ht_level.set_value (20); 01333 tessedit_cluster_debug.set_value (TRUE); 01334 nn_debug.set_value (TRUE); 01335 nn_reject_debug.set_value (TRUE); 01336 tprintf ("\n\nTESTWD::"); 01337 switch (location) { 01338 case 0: 01339 tprintf ("classify_word_pass1 start\n"); 01340 word->word->print (debug_fp); 01341 break; 01342 case 10: 01343 tprintf ("make_reject_map: initial map"); 01344 break; 01345 case 20: 01346 tprintf ("make_reject_map: after NN"); 01347 break; 01348 case 30: 01349 tprintf ("classify_word_pass2 - START"); 01350 break; 01351 case 40: 01352 tprintf ("classify_word_pass2 - Pre Xht"); 01353 break; 01354 case 50: 01355 tprintf ("classify_word_pass2 - END"); 01356 show_map_detail = TRUE; 01357 break; 01358 case 60: 01359 tprintf ("fixspace"); 01360 break; 01361 case 70: 01362 tprintf ("MM pass START"); 01363 break; 01364 case 80: 01365 tprintf ("MM pass END"); 01366 break; 01367 case 90: 01368 tprintf ("After Poor quality rejection"); 01369 break; 01370 case 100: 01371 tprintf ("unrej_good_quality_words - START"); 01372 break; 01373 case 110: 01374 tprintf ("unrej_good_quality_words - END"); 01375 break; 01376 case 120: 01377 tprintf ("Write results pass"); 01378 show_map_detail = TRUE; 01379 break; 01380 } 01381 tprintf (" \"%s\" ", word->best_choice->string ().string ()); 01382 word->reject_map.print (debug_fp); 01383 tprintf ("\n"); 01384 if (show_map_detail) { 01385 tprintf ("\"%s\"\n", 
word->best_choice->string ().string ()); 01386 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { 01387 tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]); 01388 word->reject_map[i].full_print (debug_fp); 01389 } 01390 } 01391 01392 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); 01393 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); 01394 return TRUE; 01395 } 01396 else 01397 #endif 01398 return FALSE; 01399 }
void choice_dump_tester | ( | PBLOB * | , | |
DENORM * | , | |||
BOOL8 | correct, | |||
char * | text, | |||
INT32 | count, | |||
BLOB_CHOICE_LIST * | ratings | |||
) |
Matcher tester function which generates .chc file entries.
Called via test_segment_pass2 for every blob tested by tess in a word. (But only for words for which a correct segmentation could be found.)
Definition at line 1134 of file control.cpp.
References CANTOPENFILE, BLOB_CHOICE::certainty(), BLOB_CHOICE::char_class(), choice_file, ERRCODE::error(), EXIT, imagebasename, NULL, BLOB_CHOICE::rating(), and STRING::string().
Referenced by match_word_pass2().
01141 { 01142 STRING choice_file_name; 01143 BLOB_CHOICE *blob_choice; 01144 BLOB_CHOICE_IT it; 01145 char source_chars[20]; 01146 char correct_char[3]; 01147 01148 if (choice_file == NULL) { 01149 choice_file_name = imagebasename + ".chc"; 01150 if (!(choice_file = fopen (choice_file_name.string (), "w"))) { 01151 CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d", 01152 choice_file_name.string (), errno); 01153 } 01154 } 01155 01156 if ((count == 0) || (text == NULL) || (text[0] == '\0')) { 01157 strcpy (source_chars, "$$"); 01158 strcpy (correct_char, "$$"); 01159 } 01160 else { 01161 strncpy(source_chars, text, count); 01162 source_chars[count] = '\0'; 01163 if (correct) { 01164 correct_char[0] = text[0]; 01165 correct_char[1] = '\0'; 01166 } 01167 else { 01168 strcpy (correct_char, "$$"); 01169 } 01170 } 01171 fprintf (choice_file, "%s\t%s", source_chars, correct_char); 01172 01173 it.set_to_list (ratings); 01174 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { 01175 blob_choice = it.data (); 01176 if ((blob_choice->char_class () >= '!') && 01177 (blob_choice->char_class () <= '~')) 01178 fprintf (choice_file, "\t%c\t%f\t%f", 01179 blob_choice->char_class (), 01180 blob_choice->rating (), blob_choice->certainty ()); 01181 } 01182 fprintf (choice_file, "\n"); 01183 }
void classify_word_pass1 | ( | WERD_RES * | word, | |
ROW * | row, | |||
BOOL8 | cluster_adapt, | |||
CHAR_SAMPLES_LIST * | char_clusters, | |||
CHAR_SAMPLE_LIST * | chars_waiting | |||
) |
Recognize one word.
Baseline normalize the word and pass it to Tess.
Test for a Tess failure on the word. Recog_word has already ensured that the choice list, the outword blob list and the best_choice string are all the same length. A Tess failure is indicated by a blank-filled or zero-length string.
Definition at line 517 of file control.cpp.
References adapt_to_good_samples(), ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), check_debug_pt(), correct_fp, WERD_RES::denorm, WERD_RES::done, FALSE, fix_hyphens(), fix_quotes(), fix_rep_char(), WERD::flag(), REJMAP::initialise(), make_bln_copy(), make_reject_map(), matcher_fp, matcher_pass, NULL, WERD_RES::outword, WERD_RES::raw_choice, record_certainty(), REJMAP::rej_word_tess_failure(), WERD_RES::reject_map, WERD::set_text(), set_word_fonts(), STRING::string(), tess_acceptable_word(), WERD_RES::tess_accepted, tess_adaptable_word(), tess_adapter(), tess_add_doc_word(), tess_default_matcher(), WERD_RES::tess_failed, tess_segment_pass1(), WERD_RES::tess_would_adapt, WERD::text(), tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, word_adaptable(), word_answer, write_cooked_text(), and ROW::x_height().
Referenced by recog_all_words().
00522 { 00523 WERD *bln_word; //baseline norm copy 00524 //detailed results 00525 BLOB_CHOICE_LIST_CLIST blob_choices; 00526 BOOL8 adapt_ok; 00527 const char *rejmap; 00528 INT16 index; 00529 STRING mapstr = ""; 00530 char *match_string; 00531 char word_string[1024]; 00532 00533 if (matcher_fp != NULL) { 00534 fgets (word_string, 1023, correct_fp); 00535 if ((match_string = strchr (word_string, '\r')) != NULL) 00536 *match_string = '\0'; 00537 if ((match_string = strchr (word_string, '\n')) != NULL) 00538 *match_string = '\0'; 00539 if (word_string[0] != '\0') { 00540 word->word->set_text (word_string); 00541 word_answer = (char *) word->word->text (); 00542 } 00543 else 00544 word_answer = NULL; 00545 } 00546 00547 check_debug_pt (word, 0); 00548 matcher_pass = 0; 00549 bln_word = make_bln_copy (word->word, row, row->x_height (), &word->denorm); 00550 00551 word->best_choice = tess_segment_pass1 (bln_word, &word->denorm, 00552 tess_default_matcher, 00553 word->raw_choice, &blob_choices, 00554 word->outword); 00555 00561 if ((word->best_choice->string ().length () == 0) || 00562 (strspn (word->best_choice->string ().string (), " ") == 00563 word->best_choice->string ().length ())) { 00564 word->done = FALSE; //Try again on pass2 - adaption may help 00565 word->tess_failed = TRUE; 00566 word->reject_map.initialise (word->best_choice->string ().length ()); 00567 word->reject_map.rej_word_tess_failure (); 00568 } 00569 else { 00570 word->tess_failed = FALSE; 00571 if ((word->best_choice->string ().length () != 00572 word->outword->blob_list ()->length ()) || 00573 (word->best_choice->string ().length () != blob_choices.length ())) { 00574 tprintf 00575 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", 00576 word->best_choice->string ().string (), 00577 word->best_choice->string ().length (), 00578 word->outword->blob_list ()->length (), blob_choices.length ()); 00579 } 00580 ASSERT_HOST (word->best_choice->string ().length () == 00581 
word->outword->blob_list ()->length ()); 00582 ASSERT_HOST (word->best_choice->string ().length () == 00583 blob_choices.length ()); 00584 00585 /* 00586 The adaption step used to be here. It has been moved to after 00587 make_reject_map so that we know whether the word will be accepted in the 00588 first pass or not. This move will PREVENT adaption to words containing 00589 double quotes because the word will not be identical to what tess thinks 00590 its best choice is. (See CurrentBestChoiceIs which is used by 00591 AdaptableWord) 00592 */ 00593 00594 if (word->word->flag (W_REP_CHAR)) { 00595 fix_rep_char(word); 00596 } 00597 else { 00598 fix_quotes ((char *) word->best_choice->string ().string (), 00599 //turn to double 00600 word->outword, &blob_choices); 00601 if (tessedit_fix_hyphens) 00602 //turn 2 to 1 00603 fix_hyphens ((char *) word->best_choice->string ().string (), 00604 word->outword, &blob_choices); 00605 record_certainty (word->best_choice->certainty (), 1); //accounting 00606 00607 word->tess_accepted = tess_acceptable_word (word->best_choice, 00608 word->raw_choice); 00609 00610 word->tess_would_adapt = tess_adaptable_word (word->outword, 00611 word->best_choice, 00612 word->raw_choice); 00613 // Also sets word->done flag 00614 make_reject_map (word, &blob_choices, row, 1); 00615 00616 adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode); 00617 00618 if (cluster_adapt) 00619 adapt_to_good_samples(word, char_clusters, chars_waiting); 00620 00621 if (adapt_ok || tessedit_tess_adapt_to_rejmap) { 00622 if (!tessedit_tess_adapt_to_rejmap) 00623 rejmap = NULL; 00624 else { 00625 ASSERT_HOST (word->reject_map.length () == 00626 word->best_choice->string ().length ()); 00627 00628 for (index = 0; index < word->reject_map.length (); index++) { 00629 if (adapt_ok || word->reject_map[index].accepted ()) 00630 mapstr += '1'; 00631 else 00632 mapstr += '0'; 00633 } 00634 rejmap = mapstr.string (); 00635 } 00636 00637 //adapt to it 00638 tess_adapter 
(word->outword, &word->denorm, 00639 word->best_choice->string ().string (), 00640 word->raw_choice->string ().string (), rejmap); 00641 } 00642 00643 if (tessedit_enable_doc_dict) 00644 tess_add_doc_word (word->best_choice); 00645 set_word_fonts(word, &blob_choices); 00646 } 00647 } 00648 if (tessedit_print_text) { 00649 write_cooked_text (bln_word, word->best_choice->string (), 00650 word->done, FALSE, stdout); 00651 } 00652 delete bln_word; 00653 blob_choices.deep_clear (); 00654 }
Control what to do with the word in pass 2.
Definition at line 661 of file control.cpp.
References WERD_RES::best_choice, WERD_RES::caps_height, check_block_occ(), check_debug_pt(), clear_fx_win(), count_alphanums(), create_fx_win(), debug_fp, WERD_RES::denorm, WERD_RES::done, dummy, FALSE, WERD::flag(), fx_win, WERD_RES::guessed_x_ht, REJMAP::length(), make_picture_current, match_word_pass2(), NO_WINDOW, NULL, WERD_RES::outword, WERD::plot(), REJMAP::print(), WERD_RES::raw_choice, re_estimate_x_ht(), record_certainty(), REJMAP::rej_word_xht_fixup(), REJMAP::reject_count(), WERD_RES::reject_map, reject_mostly_rejects(), SaveBadWord(), set_global_subloc_code(), SUBLOC_FIX_XHT, SUBLOC_NORM, WERD_RES::tess_failed, tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, word_char_quality(), write_cooked_text(), WERD_RES::x_height, and ROW::x_height().
Referenced by match_current_words(), recog_all_words(), and recog_interactive().
{
  /*
   * Second-pass handling of one word.
   *
   * 1. If the word is not yet done (or one of the training/dump modes is
   *    on), discard the previous results and rematch it at the row x-height.
   * 2. Unless Tess failed or the word is a repeated-char word, optionally
   *    re-estimate the x-height. If a new estimate is produced, match a
   *    scratch copy of the word at that height and keep the scratch result
   *    only if it accepts at least as many characters and passes the
   *    optional quality and stringency checks.
   * 3. Optionally plot and print the final result.
   */
  BOOL8 done_this_pass = FALSE;
  // Scratch word used to try out the re-estimated x-height
  WERD_RES new_x_ht_word (word->word);
  float new_x_ht = 0.0;
  INT16 old_xht_reject_count;
  INT16 new_xht_reject_count;
  INT16 old_xht_accept_count;
  INT16 new_xht_accept_count;
  BOOL8 accept_new_x_ht = FALSE;
  INT16 old_chs_in_wd;
  INT16 new_chs_in_wd;
  INT16 old_word_quality;
  INT16 new_word_quality;
  INT16 dummy;

  set_global_subloc_code(SUBLOC_NORM);
  check_debug_pt (word, 30);
  if (!word->done ||
    tessedit_training_tess ||
  tessedit_training_wiseowl || tessedit_dump_choices) {
    word->x_height = row->x_height ();
    word->caps_height = 0.0;
    if (word->outword != NULL) {
      delete word->outword;      //get rid of junk
      delete word->best_choice;
      delete word->raw_choice;
    }
    match_word_pass2 (word, row, row->x_height ());
    done_this_pass = TRUE;
    check_debug_pt (word, 40);
  }

  if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
    set_global_subloc_code(SUBLOC_FIX_XHT);
    if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
      (tessedit_xht_fiddles_on_no_rej_wds ||
    (word->reject_map.reject_count () > 0))) {
      // word_occ_first selects whether the block occupancy check runs
      // before or after the x-height re-estimation.
      if ((x_ht_check_word_occ >= 2) && word_occ_first)
        check_block_occ(word);

      if (tessedit_redo_xheight)
        re_estimate_x_ht(word, &new_x_ht);

      if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
        ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
        check_block_occ(word);
    }
    if (new_x_ht > 0) {
      old_chs_in_wd = word->reject_map.length ();

      /* Re-estimated x_ht error suggests a rematch is worthwhile. */
      new_x_ht_word.x_height = new_x_ht;
      new_x_ht_word.caps_height = 0.0;
      match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
      if (!new_x_ht_word.tess_failed) {
        if ((x_ht_check_word_occ >= 1) && word_occ_first)
          check_block_occ(&new_x_ht_word);

        re_estimate_x_ht(&new_x_ht_word, &new_x_ht);

        if ((x_ht_check_word_occ >= 1) && !word_occ_first)
          check_block_occ(&new_x_ht_word);

        old_xht_reject_count = word->reject_map.reject_count ();
        old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
        new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
        new_chs_in_wd = new_x_ht_word.reject_map.length ();
        new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
        // Accept if more chars are accepted, or the same non-zero number,
        // provided the x-height and caps height were not both guessed.
        accept_new_x_ht =
          ((new_xht_accept_count > old_xht_accept_count) ||
          ((new_xht_accept_count == old_xht_accept_count) &&
          (new_xht_accept_count > 0))) &&
          (!new_x_ht_word.guessed_x_ht ||
          !new_x_ht_word.guessed_caps_ht);

        // Veto the new result if the old word had the higher quality score
        if (accept_new_x_ht && x_ht_quality_check) {
          word_char_quality(word, row, &old_word_quality, &dummy);
          word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
          if (old_word_quality > new_word_quality)
            accept_new_x_ht = FALSE;
        }

        // Require more than x_ht_stringency alphanumerics in the new result
        if (accept_new_x_ht && (x_ht_stringency > 0)) {
          accept_new_x_ht =
            (count_alphanums (&new_x_ht_word) > x_ht_stringency);
          if (!accept_new_x_ht && rej_use_xht) {
            if (debug_x_ht_level >= 1)
              tprintf
                ("Failed stringency test so reject original word\n");
            word->reject_map.rej_word_xht_fixup ();
          }
        }

        #ifndef SECURE_NAMES
        if (debug_x_ht_level >= 1) {
          tprintf ("New XHT Match:: %s ",
            word->best_choice->string ().string ());
          word->reject_map.print (debug_fp);
          tprintf (" -> %s ",
            new_x_ht_word.best_choice->string ().string ());
          new_x_ht_word.reject_map.print (debug_fp);
          tprintf (" %s->%s %s %s\n",
            word->guessed_x_ht ? "GUESS" : "CERT",
            new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
            new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
            accept_new_x_ht ? "ACCEPTED" : "");
        }
        #endif
      }
      if (accept_new_x_ht) {
        /*
           The new x_ht is deemed superior so put the final results in the real word
           and destroy the old results
         */
        delete word->outword;    //get rid of junk
        word->outword = new_x_ht_word.outword;
        word->denorm = new_x_ht_word.denorm;
        delete word->best_choice;
        word->best_choice = new_x_ht_word.best_choice;
        delete word->raw_choice;
        word->raw_choice = new_x_ht_word.raw_choice;
        word->reject_map = new_x_ht_word.reject_map;
        word->done = new_x_ht_word.done;
        done_this_pass = TRUE;
      }
      else {
        /*
           The new x_ht is no better, so destroy the copy word and put any uncertain
           x or cap ht estimate back to default. (I.e. don't blame me if it's bad!)
           Conditionally, use any amended block occ chars.
         */
                                 //get rid of junk
        delete new_x_ht_word.outword;
        delete new_x_ht_word.best_choice;
        delete new_x_ht_word.raw_choice;
      }
      // The pointers were either handed over to word or deleted above;
      // clear them so the scratch word's destructor leaves them alone.
      new_x_ht_word.outword = NULL;      //to keep new destructor happy
      new_x_ht_word.best_choice = NULL;  //to keep new destructor happy
      new_x_ht_word.raw_choice = NULL;   //to keep new destructor happy

      if (rej_mostly_reject_mode == 2) {
        reject_mostly_rejects(word);
        tprintf ("Rejecting mostly rejects on %s ",
          word->best_choice->string ().string ());
      }
    }

    set_global_subloc_code(SUBLOC_NORM);

    if (done_this_pass && !word->done && tessedit_save_stats)
      SaveBadWord (word->best_choice->string ().string (),
        word->best_choice->certainty ());
    record_certainty (word->best_choice->certainty (), 2);  //accounting
  }
  #ifndef GRAPHICS_DISABLED
  if (tessedit_draw_outwords) {
    if (fx_win == NO_WINDOW)
      create_fx_win();
    clear_fx_win();
    word->outword->plot (fx_win);
    make_picture_current(fx_win);
  }
  #endif

  set_global_subloc_code(SUBLOC_NORM);
  if (tessedit_print_text) {
    write_cooked_text (word->outword, word->best_choice->string (),
      word->done, done_this_pass, stdout);
  }
  check_debug_pt (word, 50);
}
CLISTIZEH | ( | PBLOB | ) |
Find the modal font and remove from the stats.
Definition at line 1654 of file control.cpp.
References STATS::add(), count(), STATS::get_total(), MAX_INT8, STATS::mode(), and STATS::pile_count().
Referenced by font_recognition_pass(), and set_word_fonts().
01658 { 01659 INT8 font; //font index 01660 INT32 count; //pile couat 01661 01662 if (fonts->get_total () > 0) { 01663 font = (INT8) fonts->mode (); 01664 *font_out = font; 01665 count = fonts->pile_count (font); 01666 *font_count = count < MAX_INT8 ? count : MAX_INT8; 01667 fonts->add (font, -*font_count); 01668 } 01669 else { 01670 *font_out = -1; 01671 *font_count = 0; 01672 } 01673 }
void fix_hyphens | ( | char * | string, | |
WERD * | word, | |||
BLOB_CHOICE_LIST_CLIST * | blob_choices | |||
) |
Change pairs of hyphens to a single hyphen if the bounding boxes touch.
Typically a long dash which has been segmented.
Definition at line 1067 of file control.cpp.
References WERD::blob_list(), and merge_blobs().
Referenced by classify_word_pass1(), and match_word_pass2().
01070 { 01071 char *ptr; //string ptr 01072 //blobs 01073 PBLOB_IT blob_it = word->blob_list (); 01074 //choices 01075 BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; 01076 BLOB_CHOICE_IT it1; //first choices 01077 BLOB_CHOICE_IT it2; //second choices 01078 01079 for (ptr = string; 01080 *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { 01081 if ((*ptr == '-' || *ptr == '~') && 01082 (*(ptr + 1) == '-' || *(ptr + 1) == '~') && 01083 (blob_it.data ()->bounding_box ().right () >= 01084 blob_it.data_relative (1)->bounding_box ().left ())) { 01085 *ptr = '-'; //turn to single hyphen 01086 strcpy (ptr + 1, ptr + 2); //shuffle up 01087 merge_blobs (blob_it.data (), blob_it.data_relative (1)); 01088 blob_it.forward (); 01089 delete blob_it.extract (); //get rid of spare 01090 01091 it1.set_to_list (choice_it.data ()); 01092 it2.set_to_list (choice_it.data_relative (1)); 01093 if (it1.data ()->certainty () < it2.data ()->certainty ()) { 01094 choice_it.forward (); 01095 //get rid of spare 01096 delete choice_it.extract (); 01097 } 01098 else { 01099 //get rid of spare 01100 delete choice_it.extract (); 01101 choice_it.forward (); 01102 } 01103 } 01104 } 01105 }
void fix_quotes | ( | char * | string, | |
WERD * | word, | |||
BLOB_CHOICE_LIST_CLIST * | blob_choices | |||
) |
Change pairs of quotes to double quotes.
Definition at line 1022 of file control.cpp.
Referenced by apply_box_testing(), classify_word_pass1(), and match_word_pass2().
01025 { 01026 char *ptr; //string ptr 01027 //blobs 01028 PBLOB_IT blob_it = word->blob_list (); 01029 //choices 01030 BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; 01031 BLOB_CHOICE_IT it1; //first choices 01032 BLOB_CHOICE_IT it2; //second choices 01033 01034 for (ptr = string; 01035 *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { 01036 if ((*ptr == '\'' || *ptr == '`') 01037 && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) { 01038 *ptr = '"'; //turn to double 01039 strcpy (ptr + 1, ptr + 2); //shuffle up 01040 merge_blobs (blob_it.data (), blob_it.data_relative (1)); 01041 blob_it.forward (); 01042 delete blob_it.extract (); //get rid of spare 01043 01044 it1.set_to_list (choice_it.data ()); 01045 it2.set_to_list (choice_it.data_relative (1)); 01046 if (it1.data ()->certainty () < it2.data ()->certainty ()) { 01047 choice_it.forward (); 01048 //get rid of spare 01049 delete choice_it.extract (); 01050 } 01051 else { 01052 //get rid of spare 01053 delete choice_it.extract (); 01054 choice_it.forward (); 01055 } 01056 } 01057 } 01058 }
void fix_rep_char | ( | WERD_RES * | word | ) |
Fix word with a repeated char.
word | Word to do |
Definition at line 965 of file control.cpp.
References alloc_mem(), WERD_RES::best_choice, count(), WERD_RES::done, free_mem(), REJMAP::initialise(), max, WERD_RES::reject_map, and TRUE.
Referenced by classify_word_pass1(), and match_word_pass2().
00967 { 00968 struct REP_CH 00969 { 00970 char ch; 00971 int count; 00972 }; 00973 00974 REP_CH *rep_ch; //array of char counts 00975 int word_len; 00976 int rep_ch_count = 0; //how many unique chs 00977 const char *word_str; //the repeated chs 00978 int i, j; 00979 int total = 0; 00980 int max = 0; 00981 char maxch = ' '; //Most common char 00982 00983 word_str = word->best_choice->string ().string (); 00984 word_len = strlen (word_str); 00985 rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH)); 00986 for (i = 0; i < word_len; i++) { 00987 for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++); 00988 if (j < rep_ch_count) 00989 rep_ch[j].count++; 00990 else { 00991 rep_ch[rep_ch_count].ch = word_str[i]; 00992 rep_ch[rep_ch_count].count = 1; 00993 rep_ch_count++; 00994 } 00995 } 00996 00997 for (j = 0; j < rep_ch_count; j++) { 00998 total += rep_ch[j].count; 00999 if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) { 01000 max = rep_ch[j].count; 01001 maxch = rep_ch[j].ch; 01002 } 01003 } 01004 // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n", 01005 // word_str, word_len, total, maxch ); 01006 free_mem(rep_ch); 01007 01008 word->reject_map.initialise (word_len); 01009 for (i = 0; i < word_len; i++) { 01010 if (word_str[i] != maxch) 01011 //rej unrecognised blobs 01012 word->reject_map[i].setrej_bad_repetition (); 01013 } 01014 word->done = TRUE; 01015 }
void font_recognition_pass | ( | PAGE_RES_IT & | page_res_it | ) |
Smooth the fonts for the document.
Definition at line 1505 of file control.cpp.
References STATS::add(), WERD_RES::best_choice, ROW_RES::bold, WERD_RES::bold, STATS::clear(), count(), find_modal_font(), ROW_RES::font1, WERD_RES::font1, ROW_RES::font1_count, WERD_RES::font1_count, ROW_RES::font2, WERD_RES::font2, ROW_RES::font2_count, WERD_RES::font2_count, PAGE_RES_IT::forward(), ROW_RES::italic, WERD_RES::italic, NULL, PAGE_RES_IT::restart_page(), PAGE_RES_IT::row(), and PAGE_RES_IT::word().
Referenced by recog_all_words().
01506 { 01507 INT32 length; //of word 01508 INT32 count; //of a feature 01509 INT8 doc_font; //modal font 01510 INT8 doc_font_count; //modal font 01511 INT32 doc_italic; //total italics 01512 INT32 doc_bold; //total bolds 01513 ROW_RES *row = NULL; //current row 01514 WERD_RES *word; //current word 01515 STATS fonts (0, 32); //font counters 01516 STATS doc_fonts (0, 32); //font counters 01517 01518 doc_italic = 0; 01519 doc_bold = 0; 01520 page_res_it.restart_page (); 01521 while (page_res_it.word () != NULL) { 01522 if (row != page_res_it.row ()) { 01523 if (row != NULL) { 01524 find_modal_font (&fonts, &row->font1, &row->font1_count); 01525 find_modal_font (&fonts, &row->font2, &row->font2_count); 01526 } 01527 row = page_res_it.row (); //current row 01528 fonts.clear (); //clear counters 01529 row->italic = 0; 01530 row->bold = 0; 01531 } 01532 word = page_res_it.word (); 01533 row->italic += word->italic; 01534 row->bold += word->bold; 01535 fonts.add (word->font1, word->font1_count); 01536 fonts.add (word->font2, word->font2_count); 01537 doc_italic += word->italic; 01538 doc_bold += word->bold; 01539 doc_fonts.add (word->font1, word->font1_count); 01540 doc_fonts.add (word->font2, word->font2_count); 01541 page_res_it.forward (); 01542 } 01543 if (row != NULL) { 01544 find_modal_font (&fonts, &row->font1, &row->font1_count); 01545 find_modal_font (&fonts, &row->font2, &row->font2_count); 01546 } 01547 find_modal_font(&doc_fonts, &doc_font, &doc_font_count); 01548 /* 01549 row=NULL; 01550 page_res_it.restart_page(); 01551 while (page_res_it.word() != NULL) 01552 { 01553 if (row!=page_res_it.row()) 01554 { 01555 row2=row; 01556 row=page_res_it.row(); 01557 if (row->font1_count<MIN_FONT_ROW_COUNT) 01558 { 01559 fonts.clear(); 01560 italic=0; 01561 bold=0; 01562 add_in_one_row(row,&fonts,&italic,&bold); 01563 if (row2!=NULL) 01564 { 01565 hdiff=row->row->x_height()-row2->row->x_height(); 01566 if (hdiff<0) 01567 hdiff=-hdiff; 01568 if (hdiff<MAX_XHEIGHT_DIFF) 
01569 add_in_one_row(row2,&fonts,&italic,&bold); 01570 } 01571 do 01572 page_res_it.forward(); 01573 while (page_res_it.row()==row); 01574 row2=page_res_it.row(); 01575 if (row2!=NULL) 01576 { 01577 hdiff=row->row->x_height()-row2->row->x_height(); 01578 if (hdiff<0) 01579 hdiff=-hdiff; 01580 if (hdiff<MAX_XHEIGHT_DIFF) 01581 add_in_one_row(row2,&fonts,&italic,&bold); 01582 } 01583 row->italic=italic; 01584 row->bold=bold; 01585 find_modal_font(&fonts,&row->font1,&row->font1_count); 01586 find_modal_font(&fonts,&row->font2,&row->font2_count); 01587 } 01588 else 01589 page_res_it.forward(); 01590 } 01591 else 01592 page_res_it.forward(); 01593 }*/ 01594 01595 page_res_it.restart_page (); 01596 while (page_res_it.word () != NULL) { 01597 row = page_res_it.row (); //current row 01598 word = page_res_it.word (); 01599 length = word->best_choice->string ().length (); 01600 01601 count = word->italic; 01602 if (count < 0) 01603 count = -count; 01604 if (!(count == length || length > 3 && count >= length * 3 / 4)) 01605 word->italic = doc_italic > 0 ? 1 : -1; 01606 01607 count = word->bold; 01608 if (count < 0) 01609 count = -count; 01610 if (!(count == length || length > 3 && count >= length * 3 / 4)) 01611 word->bold = doc_bold > 0 ? 1 : -1; 01612 01613 count = word->font1_count; 01614 if (!(count == length || length > 3 && count >= length * 3 / 4)) { 01615 word->font1 = doc_font; 01616 word->font1_count = doc_font_count; 01617 } 01618 01619 page_res_it.forward (); 01620 } 01621 }
Generate a baseline normalised copy of the source word.
The copy is done so that whatever format the original word is in, a polygonal bln version is generated as output.
Definition at line 1193 of file control.cpp.
References WERD::baseline_normalise_x(), WERD::poly_copy(), and ROW::x_height().
Referenced by apply_box_testing(), apply_box_training(), classify_word_pass1(), match_word_pass2(), unrej_good_chs(), word_blob_quality(), and word_char_quality().
01193 { 01194 WERD *result; 01195 01196 // if (wordit_linearc && !src_word->flag(W_POLYGON)) 01197 // { 01198 // larc_word = src_word->larc_copy( row->x_height() ); 01199 // result = larc_word->poly_copy( row->x_height() ); 01200 // delete larc_word; 01201 // } 01202 // else 01203 result = src_word->poly_copy (row->x_height ()); 01204 01205 // if (tessedit_draw_words) 01206 // { 01207 // if ( la_win == NO_WINDOW ) 01208 // create_la_win(); 01209 // result->plot( la_win ); 01210 // } 01211 result->baseline_normalise_x (row, x_height, denorm); 01212 return result; 01213 }
Baseline normalize the word and pass it to Tess.
Definition at line 840 of file control.cpp.
References assert(), ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), choice_dump_tester(), correct_segment_pass2(), WERD_RES::denorm, FALSE, fix_hyphens(), fix_quotes(), fix_rep_char(), WERD::flag(), REJMAP::initialise(), make_bln_copy(), make_reject_map(), matcher_fp, matcher_pass, NULL, WERD_RES::outword, WERD_RES::raw_choice, REJMAP::rej_word_tess_failure(), WERD_RES::reject_map, set_global_subsubloc_code(), SUBSUBLOC_OTHER, SUBSUBLOC_TESS, tess_acceptable_word(), WERD_RES::tess_accepted, tess_default_matcher(), WERD_RES::tess_failed, tess_segment_pass2(), tess_training_tester(), test_segment_pass2(), WERD::text(), tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, and word_answer.
Referenced by classify_word_pass2().
00843 { 00844 WERD *bln_word; //baseline norm copy 00845 //detailed results 00846 BLOB_CHOICE_LIST_CLIST blob_choices; 00847 00848 set_global_subsubloc_code(SUBSUBLOC_OTHER); 00849 if (matcher_fp != NULL) { 00850 word_answer = (char *) word->word->text (); 00851 if (word_answer != NULL && word_answer[0] == '\0') 00852 word_answer = NULL; 00853 } 00854 matcher_pass = 0; 00855 bln_word = make_bln_copy (word->word, row, x_height, &word->denorm); 00856 set_global_subsubloc_code(SUBSUBLOC_TESS); 00857 if (tessedit_training_tess) 00858 word->best_choice = correct_segment_pass2 (bln_word, 00859 &word->denorm, 00860 tess_default_matcher, 00861 tess_training_tester, 00862 word->raw_choice, 00863 &blob_choices, word->outword); 00864 else if (tessedit_dump_choices) 00865 word->best_choice = test_segment_pass2 (bln_word, 00866 &word->denorm, 00867 tess_default_matcher, 00868 choice_dump_tester, 00869 word->raw_choice, 00870 &blob_choices, word->outword); 00871 // else if (tessedit_training_wiseowl) 00872 // best_choice=correct_segment_pass2( word, &denorm, 00873 // tess_default_matcher,wo_learn, 00874 // raw_choice,&blob_choices,outword); 00875 // else if (tessedit_matcher_is_wiseowl) 00876 // best_choice=tess_segment_pass2( word, &denorm, wo_classify, 00877 // raw_choice, &blob_choices, outword); 00878 else { 00879 word->best_choice = tess_segment_pass2 (bln_word, &word->denorm, 00880 tess_default_matcher, 00881 word->raw_choice, &blob_choices, 00882 word->outword); 00883 } 00884 set_global_subsubloc_code(SUBSUBLOC_OTHER); 00885 /* 00886 Test for TESS screw up on word. Recog_word has already ensured that the 00887 choice list, outword blob lists and best_choice string are the same 00888 length. A TESS screw up is indicated by a blank filled or 0 length string. 
00889 */ 00890 if ((word->best_choice->string ().length () == 0) || 00891 (strspn (word->best_choice->string ().string (), " ") == 00892 word->best_choice->string ().length ())) { 00893 word->tess_failed = TRUE; 00894 word->reject_map.initialise (word->best_choice->string ().length ()); 00895 word->reject_map.rej_word_tess_failure (); 00896 // tprintf("Empty word produced\n"); 00897 } 00898 else { 00899 if ((word->best_choice->string ().length () != 00900 word->outword->blob_list ()->length ()) || 00901 (word->best_choice->string ().length () != blob_choices.length ())) { 00902 tprintf 00903 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", 00904 word->best_choice->string ().string (), 00905 word->best_choice->string ().length (), 00906 word->outword->blob_list ()->length (), blob_choices.length ()); 00907 } 00908 ASSERT_HOST (word->best_choice->string ().length () == 00909 word->outword->blob_list ()->length ()); 00910 ASSERT_HOST (word->best_choice->string ().length () == 00911 blob_choices.length ()); 00912 00913 word->tess_failed = FALSE; 00914 if (word->word->flag (W_REP_CHAR)) { 00915 fix_rep_char(word); 00916 } 00917 else { 00918 fix_quotes ((char *) word->best_choice->string ().string (), 00919 word->outword, &blob_choices); 00920 if (tessedit_fix_hyphens) 00921 fix_hyphens ((char *) word->best_choice->string ().string (), 00922 word->outword, &blob_choices); 00923 /* Dont trust fix_quotes! 
- though I think I've fixed the bug */ 00924 if ((word->best_choice->string ().length () != 00925 word->outword->blob_list ()->length ()) || 00926 (word->best_choice->string ().length () != 00927 blob_choices.length ())) { 00928 #ifndef SECURE_NAMES 00929 tprintf 00930 ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", 00931 word->best_choice->string ().string (), 00932 word->best_choice->string ().length (), 00933 word->outword->blob_list ()->length (), 00934 blob_choices.length ()); 00935 #endif 00936 00937 } 00938 ASSERT_HOST (word->best_choice->string ().length () == 00939 word->outword->blob_list ()->length ()); 00940 ASSERT_HOST (word->best_choice->string ().length () == 00941 blob_choices.length ()); 00942 00943 word->tess_accepted = tess_acceptable_word (word->best_choice, 00944 word->raw_choice); 00945 00946 make_reject_map (word, &blob_choices, row, 2); 00947 } 00948 } 00949 blob_choices.deep_clear (); 00950 delete bln_word; 00951 assert (word->raw_choice != NULL); 00952 }
Add the outlines from blob2 to blob1.
Blob2 is emptied but not deleted.
Definition at line 1114 of file control.cpp.
References PBLOB::out_list(), and outline_it.
Referenced by fix_hyphens(), fix_quotes(), and merge_tess_fails().
01117 { 01118 OUTLINE_IT outline_it = blob1->out_list (); 01119 //iterator 01120 01121 outline_it.move_to_last (); //go to end 01122 //do it 01123 outline_it.add_list_after (blob2->out_list ()); 01124 }
void recog_all_words | ( | PAGE_RES * | page_res, | |
volatile ETEXT_DESC * | monitor | |||
) |
Walk the current block list applying the specified word processor function to all words. Does several passes!
page_res | page structure | |
monitor | progress monitor |
Definition at line 245 of file control.cpp.
References adapt_to_good_ems(), adapt_to_good_samples(), WERD_RES::best_choice, check_debug_pt(), classify_word_pass1(), classify_word_pass2(), collect_characters_for_adaption(), collect_ems_for_adaption(), FALSE, fix_fuzzy_spaces(), font_recognition_pass(), PAGE_RES_IT::forward(), FREQ_DAWG_PERM, REJMAP::length(), LOC_DOC_BLK_REJ, LOC_FUZZY_SPACE, LOC_MM_ADAPT, LOC_PASS1, LOC_PASS2, LOC_WRITE_RESULTS, NULL, print_em_stats(), quality_based_rejection(), PAGE_RES_IT::rej_stat_word(), REJMAP::rej_word_bad_quality(), REJMAP::rej_word_tess_failure(), reject_all_fullstops(), REJMAP::reject_count(), WERD_RES::reject_map, reject_suspect_fullstops(), PAGE_RES_IT::restart_page(), PAGE_RES_IT::row(), ROW_RES::row, set_global_loc_code(), STRING::string(), SYSTEM_DAWG_PERM, tprintf(), TRUE, USER_DAWG_PERM, PAGE_RES_IT::word(), word_adaptable(), word_blob_quality(), word_char_quality(), word_count, and word_outline_errs().
Referenced by TessBaseAPI::Recognize().
00248 { 00249 PAGE_RES_IT page_res_it(page_res); //reset page iterator 00250 INT16 chars_in_word; 00251 INT16 rejects_in_word; 00252 CHAR_SAMPLES_LIST em_clusters; 00253 CHAR_SAMPLE_LIST ems_waiting; 00254 CHAR_SAMPLES_LIST char_clusters; 00255 CHAR_SAMPLE_LIST chars_waiting; 00256 INT16 blob_quality = 0; 00257 INT16 outline_errs = 0; 00258 INT16 doc_blob_quality = 0; 00259 INT16 doc_outline_errs = 0; 00260 INT16 doc_char_quality = 0; 00261 INT16 all_char_quality; 00262 INT16 accepted_all_char_quality; 00263 INT16 good_char_count = 0; 00264 INT16 doc_good_char_quality = 0; 00265 const STRING *wordstr; 00266 const char *text; 00267 int i; 00268 00269 BOOL8 good_quality_doc; 00270 UINT8 permuter_type; 00271 00272 INT32 tess_adapt_mode = 0; 00273 INT32 word_count; //count of words in doc 00274 INT32 word_index; //current word 00275 00276 if (tessedit_minimal_rej_pass1) { 00277 tessedit_test_adaption.set_value (TRUE); 00278 tessedit_minimal_rejection.set_value (TRUE); 00279 } 00280 00281 if (tessedit_cluster_adapt_before_pass1) { 00282 tess_adapt_mode = tessedit_tess_adaption_mode; 00283 tessedit_tess_adaption_mode.set_value (0); 00284 tessedit_tess_adapt_to_rejmap.set_value (TRUE); 00285 } 00286 00287 /* Pass 1 */ 00288 word_count = 0; 00289 if (monitor != NULL) { 00290 monitor->ocr_alive = TRUE; 00291 while (page_res_it.word () != NULL) { 00292 word_count++; 00293 page_res_it.forward (); 00294 } 00295 page_res_it.restart_page (); 00296 } 00297 else 00298 word_count = 1; 00299 00300 word_index = 0; 00301 int dict_words = 0; 00302 while (page_res_it.word () != NULL) { 00303 set_global_loc_code(LOC_PASS1); 00304 word_index++; 00305 if (monitor != NULL) { 00306 monitor->ocr_alive = TRUE; 00307 monitor->progress = 30 + 50 * word_index / word_count; 00308 if ((monitor->end_time != 0 && clock() > monitor->end_time) || 00309 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00310 dict_words))) 00311 return; 00312 } 00313 classify_word_pass1 
(page_res_it.word (), 00314 page_res_it.row ()->row, FALSE, NULL, NULL); 00315 00316 if (tessedit_test_adaption && !tessedit_minimal_rejection) { 00317 if (!word_adaptable (page_res_it.word (), 00318 tessedit_test_adaption_mode)) 00319 page_res_it.word ()->reject_map.rej_word_tess_failure (); 00320 //FAKE PERM REJ 00321 else { 00322 wordstr = &(page_res_it.word ()->best_choice->string ()); 00323 /* Override rejection mechanisms for this word */ 00324 text = wordstr->string (); 00325 for (i = 0; text[i] != '\0'; i++) { 00326 if ((text[i] != ' ') 00327 && page_res_it.word ()->reject_map[i].rejected ()) 00328 page_res_it.word ()->reject_map[i]. 00329 setrej_minimal_rej_accept(); 00330 } 00331 } 00332 } 00333 00334 if ((tessedit_cluster_adapt_after_pass1 00335 || tessedit_cluster_adapt_after_pass3 00336 || tessedit_cluster_adapt_before_pass1) 00337 && tessedit_cluster_adaption_mode != 0) { 00338 collect_characters_for_adaption (page_res_it.word (), 00339 &char_clusters, &chars_waiting); 00340 } 00341 // Count dict words. 
00342 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) 00343 ++dict_words; 00344 page_res_it.forward (); 00345 } 00346 00347 if (tessedit_cluster_adapt_before_pass1) 00348 tessedit_tess_adaption_mode.set_value (tess_adapt_mode); 00349 00350 page_res_it.restart_page (); 00351 while ((tessedit_cluster_adapt_after_pass1 00352 || tessedit_cluster_adapt_before_pass1) 00353 && page_res_it.word () != NULL) { 00354 if (monitor != NULL) 00355 monitor->ocr_alive = TRUE; 00356 if (tessedit_cluster_adapt_after_pass1) 00357 adapt_to_good_samples (page_res_it.word (), 00358 &char_clusters, &chars_waiting); 00359 else 00360 classify_word_pass1 (page_res_it.word (), 00361 page_res_it.row ()->row, 00362 TRUE, &char_clusters, &chars_waiting); 00363 00364 page_res_it.forward (); 00365 } 00366 00367 /* Pass 2 */ 00368 page_res_it.restart_page (); 00369 word_index = 0; 00370 while (!tessedit_test_adaption && page_res_it.word () != NULL) { 00371 set_global_loc_code(LOC_PASS2); 00372 word_index++; 00373 if (monitor != NULL) { 00374 monitor->ocr_alive = TRUE; 00375 monitor->progress = 80 + 10 * word_index / word_count; 00376 if ((monitor->end_time != 0 && clock() > monitor->end_time) || 00377 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, 00378 dict_words))) 00379 return; 00380 } 00381 classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row); 00382 00383 if (tessedit_em_adaption_mode > 0) 00384 collect_ems_for_adaption (page_res_it.word (), 00385 &em_clusters, &ems_waiting); 00386 00387 if (tessedit_cluster_adapt_after_pass2 00388 && tessedit_cluster_adaption_mode != 0) 00389 collect_characters_for_adaption (page_res_it.word (), 00390 &char_clusters, &chars_waiting); 00391 page_res_it.forward (); 00392 } 00393 00394 /* Another pass */ 00395 set_global_loc_code(LOC_FUZZY_SPACE); 00396 00397 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces 00398 && !tessedit_word_for_word) 00399 fix_fuzzy_spaces(monitor, word_count, page_res); 00400 
00401 if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0) 00402 // Initially ems only 00403 print_em_stats(&em_clusters, &ems_waiting); 00404 00405 /* Pass 3 - used for checking confusion sets */ 00406 page_res_it.restart_page (); 00407 word_index = 0; 00408 while (!tessedit_test_adaption && page_res_it.word () != NULL) { 00409 set_global_loc_code(LOC_MM_ADAPT); 00410 word_index++; 00411 if (monitor != NULL) { 00412 monitor->ocr_alive = TRUE; 00413 monitor->progress = 95 + 5 * word_index / word_count; 00414 } 00415 check_debug_pt (page_res_it.word (), 70); 00416 /* Use good matches to sort out confusions */ 00417 00418 if (tessedit_em_adaption_mode != 0) 00419 adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting); 00420 00421 if (tessedit_cluster_adapt_after_pass2 00422 && tessedit_cluster_adaption_mode != 0) 00423 adapt_to_good_samples (page_res_it.word (), 00424 &char_clusters, &chars_waiting); 00425 00426 if (tessedit_reject_fullstops 00427 && strchr (page_res_it.word ()->best_choice->string ().string (), 00428 '.') != NULL) 00429 reject_all_fullstops (page_res_it.word ()); 00430 else if (tessedit_reject_suspect_fullstops 00431 && strchr (page_res_it.word ()->best_choice->string (). 
00432 string (), '.') != NULL) 00433 reject_suspect_fullstops (page_res_it.word ()); 00434 00435 page_res_it.rej_stat_word (); 00436 chars_in_word = page_res_it.word ()->reject_map.length (); 00437 rejects_in_word = page_res_it.word ()->reject_map.reject_count (); 00438 00439 blob_quality = word_blob_quality (page_res_it.word (), 00440 page_res_it.row ()->row); 00441 doc_blob_quality += blob_quality; 00442 outline_errs = word_outline_errs (page_res_it.word ()); 00443 doc_outline_errs += outline_errs; 00444 word_char_quality (page_res_it.word (), 00445 page_res_it.row ()->row, 00446 &all_char_quality, &accepted_all_char_quality); 00447 doc_char_quality += all_char_quality; 00448 permuter_type = page_res_it.word ()->best_choice->permuter (); 00449 if ((permuter_type == SYSTEM_DAWG_PERM) || 00450 (permuter_type == FREQ_DAWG_PERM) || 00451 (permuter_type == USER_DAWG_PERM)) { 00452 good_char_count += chars_in_word - rejects_in_word; 00453 doc_good_char_quality += accepted_all_char_quality; 00454 } 00455 check_debug_pt (page_res_it.word (), 80); 00456 if (tessedit_reject_bad_qual_wds && 00457 (blob_quality == 0) && (outline_errs >= chars_in_word)) 00458 page_res_it.word ()->reject_map.rej_word_bad_quality (); 00459 check_debug_pt (page_res_it.word (), 90); 00460 page_res_it.forward (); 00461 } 00462 00463 page_res_it.restart_page (); 00464 while (!tessedit_test_adaption 00465 && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) { 00466 if (monitor != NULL) 00467 monitor->ocr_alive = TRUE; 00468 if (tessedit_cluster_adaption_mode != 0) 00469 adapt_to_good_samples (page_res_it.word (), 00470 &char_clusters, &chars_waiting); 00471 page_res_it.forward (); 00472 } 00473 00474 #ifndef SECURE_NAMES 00475 if (tessedit_debug_quality_metrics) { 00476 tprintf 00477 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", 00478 page_res->char_count, page_res->rej_count, 00479 
page_res->rej_count / (float) page_res->char_count, doc_blob_quality, 00480 doc_blob_quality / (float) page_res->char_count, doc_outline_errs, 00481 doc_outline_errs / (float) page_res->char_count, doc_char_quality, 00482 doc_char_quality / (float) page_res->char_count, 00483 doc_good_char_quality, 00484 good_char_count > 00485 0 ? doc_good_char_quality / (float) good_char_count : 0.0); 00486 } 00487 #endif 00488 good_quality_doc = 00489 (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc) 00490 && 00491 (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) && 00492 (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) && 00493 (doc_char_quality / (float) page_res->char_count >= quality_char_pc); 00494 00495 /* Do whole document or whole block rejection pass*/ 00496 00497 if (!tessedit_test_adaption) { 00498 set_global_loc_code(LOC_DOC_BLK_REJ); 00499 quality_based_rejection(page_res_it, good_quality_doc); 00500 } 00501 font_recognition_pass(page_res_it); 00502 00503 /* Write results pass */ 00504 set_global_loc_code(LOC_WRITE_RESULTS); 00505 // This is now redundant, but retained commented so show how to obtain 00506 // bounding boxes and style information. 00507 // output_pass (page_res_it, false); 00508 }
Recognize a single word in interactive mode.
block | block of word | |
row | row of word | |
word | word to recognize |
Definition at line 207 of file control.cpp.
References classify_word_pass2(), tprintf(), TRUE, word_blob_quality(), word_char_quality(), and word_outline_errs().
Referenced by extend_moded_commands(), and recog_pseudo_word().
00211 { 00212 WERD_RES word_res(word); 00213 INT16 char_qual; 00214 INT16 good_char_qual; 00215 00216 classify_word_pass2(&word_res, row); 00217 #ifndef SECURE_NAMES 00218 if (tessedit_debug_quality_metrics) { 00219 word_char_quality(&word_res, row, &char_qual, &good_char_qual); 00220 tprintf 00221 ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n", 00222 word_res.reject_map.length (), word_blob_quality (&word_res, row), 00223 word_outline_errs (&word_res), char_qual, good_char_qual); 00224 } 00225 #endif 00226 return TRUE; 00227 }
void recog_pseudo_word | ( | BLOCK_LIST * | block_list, | |
BOX & | selection_box | |||
) |
Make a word from the selected blobs and run Tess on it, i.e., recognize the selected blobs.
block_list | blocks to check | |
selection_box |
Definition at line 182 of file control.cpp.
References make_pseudo_word(), NULL, and recog_interactive().
Referenced by extend_moded_commands().
00184 { 00185 WERD *word; 00186 ROW *pseudo_row; //row of word 00187 BLOCK *pseudo_block; //block of word 00188 00189 word = make_pseudo_word (block_list, selection_box, 00190 pseudo_block, pseudo_row); 00191 if (word != NULL) { 00192 recog_interactive(pseudo_block, pseudo_row, word); 00193 delete word; 00194 } 00195 }
void set_word_fonts | ( | WERD_RES * | word, | |
BLOB_CHOICE_LIST_CLIST * | blob_choices | |||
) |
Get the fonts for the word.
Definition at line 1406 of file control.cpp.
References STATS::add(), WERD_RES::best_choice, WERD_RES::bold, find_modal_font(), WERD_RES::font1, WERD_RES::font1_count, WERD_RES::font2, WERD_RES::font2_count, WERD_RES::italic, and tprintf().
Referenced by classify_word_pass1().
01408 { 01409 INT32 index; //char index 01410 char choice_char; //char from word 01411 INT8 config; //font of char 01412 //character iterator 01413 BLOB_CHOICE_LIST_C_IT char_it = blob_choices; 01414 BLOB_CHOICE_IT choice_it; //choice iterator 01415 STATS fonts (0, 32); //font counters 01416 static INT8 italic_table[32] = { 01417 1, -1, 1, -1, 01418 1, -1, 1, -1, 01419 1, -1, 1, -1, 01420 1, -1, 1, -1, 01421 1, -1, 1, -1, 01422 1, -1, 1, -1, 01423 1, -1, 1, -1, 01424 1, -1, 1, -1 01425 }; 01426 static INT8 bold_table[32] = { 01427 1, 1, -1, -1, 01428 1, 1, -1, -1, 01429 1, 1, -1, -1, 01430 1, 1, -1, -1, 01431 1, 1, -1, -1, 01432 1, 1, -1, -1, 01433 1, 1, -1, -1, 01434 1, 1, -1, -1 01435 }; 01436 static INT8 font_table[32] = { 01437 2, 2, 2, 2, 01438 -1, -1, -1, -1, 01439 0, 0, 0, 0, 01440 1, 1, 1, 1, 01441 3, 3, 3, 3, 01442 4, 4, 4, 4, 01443 5, 5, 5, 5, 01444 2, 2, 2, 2 01445 }; 01446 01447 word->italic = 0; 01448 word->bold = 0; 01449 for (char_it.mark_cycle_pt (), index = 0; 01450 !char_it.cycled_list (); char_it.forward (), index++) { 01451 choice_char = word->best_choice->string ()[index]; 01452 choice_it.set_to_list (char_it.data ()); 01453 for (choice_it.mark_cycle_pt (); !choice_it.cycled_list (); 01454 choice_it.forward ()) { 01455 if (choice_it.data ()->char_class () == choice_char) { 01456 config = choice_it.data ()->config (); 01457 if (tessedit_debug_fonts) 01458 tprintf ("%c(%d=%d%c%c)", 01459 choice_char, config, (config & 31) >> 2, 01460 config & 2 ? 'N' : 'B', config & 1 ? 
'N' : 'I'); 01461 if (config != -1) { 01462 config &= 31; 01463 word->italic += italic_table[config]; 01464 word->bold += bold_table[config]; 01465 if (font_table[config] != -1) 01466 fonts.add (font_table[config], 1); 01467 } 01468 break; 01469 } 01470 } 01471 } 01472 find_modal_font (&fonts, &word->font1, &word->font1_count); 01473 find_modal_font (&fonts, &word->font2, &word->font2_count); 01474 if (tessedit_debug_fonts) 01475 tprintf ("\n"); 01476 /* if (word->font1_count>0) 01477 { 01478 for (char_it.mark_cycle_pt(),index=0; 01479 !char_it.cycled_list();char_it.forward(),index++) 01480 { 01481 choice_char=word->best_choice->string()[index]; 01482 choice_it.set_to_list(char_it.data()); 01483 for (choice_it.mark_cycle_pt();!choice_it.cycled_list();choice_it.forward()) 01484 { 01485 if (choice_it.data()->char_class()==choice_char) 01486 { 01487 config=choice_it.data()->config(); 01488 if (config!=-1 && font_table[config&31]==word->font1) 01489 { 01490 word->italic+=italic_table[config]; 01491 word->bold+=bold_table[config]; 01492 } 01493 break; 01494 } 01495 } 01496 } 01497 }*/ 01498 }
int adjust_debug |
Referenced by adjust_non_word(), adjust_number(), and adjust_word().
FILE* choice_file = NULL |
Definition at line 166 of file control.cpp.
Referenced by choice_dump_tester(), close_choices(), and write_choice_line().
int display_ratings |
Show the ratings
Definition at line 53 of file tordvars.h.
Referenced by BaselineClassifier(), CharNormClassifier(), ClassPruner(), dawg_permute_and_select(), feature_pruner(), IMFindBestMatch(), number_permute_and_select(), permute_characters(), prune_configs(), and set_hyphen_word().
Referenced by AdaptiveClassifier(), AmbigClassifier(), BaselineClassifier(), and CharNormClassifier().
int number_debug |
Referenced by append_number_choices(), and number_permute().