00001
00021 #include "mfcpch.h"
00022 #include "mainblk.h"
00023 #include <string.h>
00024 #include <math.h>
00025 #ifdef __UNIX__
00026 #include <assert.h>
00027 #include <unistd.h>
00028 #include <errno.h>
00029 #endif
00030 #include <ctype.h>
00031 #include "ocrclass.h"
00032 #include "werdit.h"
00033 #include "drawfx.h"
00034 #include "tfacep.h"
00035 #include "tessbox.h"
00036 #include "tessvars.h"
00037
00038 #include "pgedit.h"
00039 #include "reject.h"
00040 #include "adaptions.h"
00041 #include "charcut.h"
00042 #include "fixxht.h"
00043 #include "fixspace.h"
00044 #include "genblob.h"
00045 #include "docqual.h"
00046 #include "control.h"
00047 #include "secname.h"
00048 #include "output.h"
00049 #include "callcpp.h"
00050 #include "notdll.h"
00051 #include "tordvars.h"
00052 #include "adaptmatch.h"
00053
// Numeric limits; the code that consumes them is not visible in this part of
// the file.
00054 #define MIN_FONT_ROW_COUNT 8
00055 #define MAX_XHEIGHT_DIFF 3
// EXTERN is defined empty here, so the EXTERN-prefixed *_VAR lines below
// expand without any extra storage-class keyword.
00056 #define EXTERN
00057
00060
00061
00062
00063
00064
00065
// Tunable flags and parameters. Each *_VAR entry supplies a variable name, a
// value (presumably the initial/default value - confirm against the macro
// definitions) and a human-readable description string.
00066 BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
00067 EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
00068 EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
00069 EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
00070 EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
00071 EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
00072 EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
00073 EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
00074 EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
00075 "Try to improve fuzzy spaces");
00076 EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
00077 "Dont bother with word plausibility");
00078 EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
00079
// Rejection and x-height re-estimation controls.
00080 EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
00081 EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
00082 "Reject suspect fullstops");
00083 EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
00084 EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
00085 "Do our own adaption - ems only");
00086 EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
00087 "Add words to the document dictionary");
00088 EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
00089 EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
00090 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
00091 "Apply xht fix up even if done");
00092 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
00093 "Apply xht fix up even in no rejects");
00094 EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
00095 EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
00096 EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
00097 EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
00098 "Block and Row stats");
00099 EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
00100 EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
00101 EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
00102
// Character sets treated as leading / trailing punctuation.
00103 EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
00104 EXTERN
00105 STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
00106 EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
00107 "2nd Trailing punctuation");
00108
// Document quality thresholds; recog_all_words compares per-character ratios
// against quality_rej_pc, quality_blob_pc, quality_outline_pc and
// quality_char_pc to decide whether the page is a "good quality" document.
00109 EXTERN double_VAR (quality_rej_pc, 0.08,
00110 "good_quality_doc lte rejection limit");
00111 EXTERN double_VAR (quality_blob_pc, 0.0,
00112 "good_quality_doc gte good blobs limit");
00113 EXTERN double_VAR (quality_outline_pc, 1.0,
00114 "good_quality_doc lte outline error limit");
00115 EXTERN double_VAR (quality_char_pc, 0.95,
00116 "good_quality_doc gte good char limit");
00117 EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
00118 "alphas in a good word");
00119
00120 EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
00121 "Use reject map to control Tesseract adaption");
00122 EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
00123 "Adaptation decision algorithm for tess");
00124 EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
00125 "Adaptation decision algorithm for ems matrix matcher");
00126 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
00127 "Adapt using clusterer after pass 1");
00128 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
00129 "Adapt using clusterer after pass 1");
00130 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
00131 "Adapt using clusterer after pass 1");
00132 EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
00133 "Adapt using clusterer before Tess adaping during pass 1");
00134 EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
00135 "Adaptation decision algorithm for matrix matcher");
00136 EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
00137 "Generate and print debug information for adaption");
00138 EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
00139 "Do minimal rejection on pass 1 output");
00140 EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
00141 "Test adaption criteria");
00142 EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
00143 "Adapt to all docs over time");
00144 EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
00145 EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
00146 "Adaptation decision algorithm for tess");
00147
// Debug "test point" switch and its coordinates.
00148 EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
00149 EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
00150 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
// Debug levels defined in other translation units.
00153 extern int MatcherDebugLevel;
00154 extern int display_ratings;
00155 extern int number_debug;
00156 extern int adjust_debug;
00157
00158
00159
00160
00161
00162
00163
00164
00165
// Output stream opened lazily by choice_dump_tester (NULL until first use).
00166 FILE *choice_file = NULL;
00167
// List-class boilerplate for PBLOB (macro expansions defined elsewhere).
00168 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
00169
00170 INT16 blob_count(WERD *w) {
00171 return w->blob_list ()->length ();
00172 }
00173
00174
00182 void recog_pseudo_word(
00183 BLOCK_LIST *block_list,
00184 BOX &selection_box) {
00185 WERD *word;
00186 ROW *pseudo_row;
00187 BLOCK *pseudo_block;
00188
00189 word = make_pseudo_word (block_list, selection_box,
00190 pseudo_block, pseudo_row);
00191 if (word != NULL) {
00192 recog_interactive(pseudo_block, pseudo_row, word);
00193 delete word;
00194 }
00195 }
00196
00197
00207 BOOL8 recog_interactive(
00208 BLOCK *,
00209 ROW *row,
00210 WERD *word
00211 ) {
00212 WERD_RES word_res(word);
00213 INT16 char_qual;
00214 INT16 good_char_qual;
00215
00216 classify_word_pass2(&word_res, row);
00217 #ifndef SECURE_NAMES
00218 if (tessedit_debug_quality_metrics) {
00219 word_char_quality(&word_res, row, &char_qual, &good_char_qual);
00220 tprintf
00221 ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
00222 word_res.reject_map.length (), word_blob_quality (&word_res, row),
00223 word_outline_errs (&word_res), char_qual, good_char_qual);
00224 }
00225 #endif
00226 return TRUE;
00227 }
00228
00229
/**
 * recog_all_words
 *
 * Walk every word of the page several times:
 *   1. pass-1 classification (optionally collecting samples for cluster
 *      adaption),
 *   2. optional cluster adaption over the pass-1 results,
 *   3. pass-2 classification,
 *   4. fuzzy-space fixing,
 *   5. per-word adaption, full-stop rejection and quality statistics,
 *   6. optional cluster adaption after pass 3,
 * and finally run quality-based rejection and font recognition.
 *
 * @param page_res  results structure for the page; iterated and updated.
 * @param monitor   optional progress monitor. When non-NULL, ocr_alive and
 *                  progress are updated, and the function returns early if
 *                  end_time has passed or the cancel callback returns
 *                  non-zero.
 *
 * NOTE(review): the early returns in pass 1 / pass 2 leave
 * tessedit_tess_adaption_mode at 0 when tessedit_cluster_adapt_before_pass1
 * is set - confirm whether that is intended.
 * NOTE(review): the doc_* accumulators are INT16 and could overflow on pages
 * with many characters - confirm expected ranges.
 */
00245 void recog_all_words(
00246 PAGE_RES *page_res,
00247 volatile ETEXT_DESC *monitor
00248 ) {
00249 PAGE_RES_IT page_res_it(page_res);
00250 INT16 chars_in_word;
00251 INT16 rejects_in_word;
00252 CHAR_SAMPLES_LIST em_clusters;
00253 CHAR_SAMPLE_LIST ems_waiting;
00254 CHAR_SAMPLES_LIST char_clusters;
00255 CHAR_SAMPLE_LIST chars_waiting;
00256 INT16 blob_quality = 0;
00257 INT16 outline_errs = 0;
00258 INT16 doc_blob_quality = 0;
00259 INT16 doc_outline_errs = 0;
00260 INT16 doc_char_quality = 0;
00261 INT16 all_char_quality;
00262 INT16 accepted_all_char_quality;
00263 INT16 good_char_count = 0;
00264 INT16 doc_good_char_quality = 0;
00265 const STRING *wordstr;
00266 const char *text;
00267 int i;
00268
00269 BOOL8 good_quality_doc;
00270 UINT8 permuter_type;
00271
00272 INT32 tess_adapt_mode = 0;
00273 INT32 word_count;
00274 INT32 word_index;
00275
// Minimal-rejection pass 1 implies test-adaption mode plus minimal rejection.
00276 if (tessedit_minimal_rej_pass1) {
00277 tessedit_test_adaption.set_value (TRUE);
00278 tessedit_minimal_rejection.set_value (TRUE);
00279 }
00280
// Remember the tess adaption mode and switch it off for pass 1; it is
// restored after the pass-1 loop below.
00281 if (tessedit_cluster_adapt_before_pass1) {
00282 tess_adapt_mode = tessedit_tess_adaption_mode;
00283 tessedit_tess_adaption_mode.set_value (0);
00284 tessedit_tess_adapt_to_rejmap.set_value (TRUE);
00285 }
00286
00287
// Count the words only when a monitor needs a progress denominator;
// otherwise use 1 so the progress divisions stay well-defined.
00288 word_count = 0;
00289 if (monitor != NULL) {
00290 monitor->ocr_alive = TRUE;
00291 while (page_res_it.word () != NULL) {
00292 word_count++;
00293 page_res_it.forward ();
00294 }
00295 page_res_it.restart_page ();
00296 }
00297 else
00298 word_count = 1;
00299
// ---- Pass 1 over every word ----
00300 word_index = 0;
00301 int dict_words = 0;
00302 while (page_res_it.word () != NULL) {
00303 set_global_loc_code(LOC_PASS1);
00304 word_index++;
00305 if (monitor != NULL) {
00306 monitor->ocr_alive = TRUE;
// Pass 1 reports progress in the 30..80 range.
00307 monitor->progress = 30 + 50 * word_index / word_count;
// Abandon recognition on timeout or when the cancel callback says so.
00308 if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00309 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00310 dict_words)))
00311 return;
00312 }
00313 classify_word_pass1 (page_res_it.word (),
00314 page_res_it.row ()->row, FALSE, NULL, NULL);
00315
// Test-adaption mode: reject non-adaptable words outright; for adaptable
// words mark every rejected non-space character as minimally accepted.
00316 if (tessedit_test_adaption && !tessedit_minimal_rejection) {
00317 if (!word_adaptable (page_res_it.word (),
00318 tessedit_test_adaption_mode))
00319 page_res_it.word ()->reject_map.rej_word_tess_failure ();
00320
00321 else {
00322 wordstr = &(page_res_it.word ()->best_choice->string ());
00323
00324 text = wordstr->string ();
00325 for (i = 0; text[i] != '\0'; i++) {
00326 if ((text[i] != ' ')
00327 && page_res_it.word ()->reject_map[i].rejected ())
00328 page_res_it.word ()->reject_map[i].
00329 setrej_minimal_rej_accept();
00330 }
00331 }
00332 }
00333
00334 if ((tessedit_cluster_adapt_after_pass1
00335 || tessedit_cluster_adapt_after_pass3
00336 || tessedit_cluster_adapt_before_pass1)
00337 && tessedit_cluster_adaption_mode != 0) {
00338 collect_characters_for_adaption (page_res_it.word (),
00339 &char_clusters, &chars_waiting);
00340 }
00341
// dict_words counts words whose best choice came from the user dawg; it is
// passed to the cancel callback.
00342 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
00343 ++dict_words;
00344 page_res_it.forward ();
00345 }
00346
// Restore the adaption mode saved before pass 1.
00347 if (tessedit_cluster_adapt_before_pass1)
00348 tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
00349
// ---- Optional cluster adaption over the pass-1 results ----
00350 page_res_it.restart_page ();
00351 while ((tessedit_cluster_adapt_after_pass1
00352 || tessedit_cluster_adapt_before_pass1)
00353 && page_res_it.word () != NULL) {
00354 if (monitor != NULL)
00355 monitor->ocr_alive = TRUE;
00356 if (tessedit_cluster_adapt_after_pass1)
00357 adapt_to_good_samples (page_res_it.word (),
00358 &char_clusters, &chars_waiting);
00359 else
// "before pass 1" mode: re-run pass 1 with cluster adaption enabled.
00360 classify_word_pass1 (page_res_it.word (),
00361 page_res_it.row ()->row,
00362 TRUE, &char_clusters, &chars_waiting);
00363
00364 page_res_it.forward ();
00365 }
00366
00367
// ---- Pass 2 over every word (skipped entirely in test-adaption mode) ----
00368 page_res_it.restart_page ();
00369 word_index = 0;
00370 while (!tessedit_test_adaption && page_res_it.word () != NULL) {
00371 set_global_loc_code(LOC_PASS2);
00372 word_index++;
00373 if (monitor != NULL) {
00374 monitor->ocr_alive = TRUE;
// Pass 2 reports progress in the 80..90 range.
00375 monitor->progress = 80 + 10 * word_index / word_count;
00376 if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00377 (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00378 dict_words)))
00379 return;
00380 }
00381 classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row);
00382
00383 if (tessedit_em_adaption_mode > 0)
00384 collect_ems_for_adaption (page_res_it.word (),
00385 &em_clusters, &ems_waiting);
00386
00387 if (tessedit_cluster_adapt_after_pass2
00388 && tessedit_cluster_adaption_mode != 0)
00389 collect_characters_for_adaption (page_res_it.word (),
00390 &char_clusters, &chars_waiting);
00391 page_res_it.forward ();
00392 }
00393
00394
// ---- Fuzzy space fixing ----
00395 set_global_loc_code(LOC_FUZZY_SPACE);
00396
00397 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00398 && !tessedit_word_for_word)
00399 fix_fuzzy_spaces(monitor, word_count, page_res);
00400
00401 if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
00402
00403 print_em_stats(&em_clusters, &ems_waiting);
00404
00405
// ---- Adaption, full-stop rejection and quality statistics per word ----
00406 page_res_it.restart_page ();
00407 word_index = 0;
00408 while (!tessedit_test_adaption && page_res_it.word () != NULL) {
00409 set_global_loc_code(LOC_MM_ADAPT);
00410 word_index++;
00411 if (monitor != NULL) {
00412 monitor->ocr_alive = TRUE;
// This loop reports progress in the 95..100 range.
00413 monitor->progress = 95 + 5 * word_index / word_count;
00414 }
00415 check_debug_pt (page_res_it.word (), 70);
00416
00417
00418 if (tessedit_em_adaption_mode != 0)
00419 adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
00420
00421 if (tessedit_cluster_adapt_after_pass2
00422 && tessedit_cluster_adaption_mode != 0)
00423 adapt_to_good_samples (page_res_it.word (),
00424 &char_clusters, &chars_waiting);
00425
// Only words that actually contain a '.' are passed to the full-stop
// rejection routines.
00426 if (tessedit_reject_fullstops
00427 && strchr (page_res_it.word ()->best_choice->string ().string (),
00428 '.') != NULL)
00429 reject_all_fullstops (page_res_it.word ());
00430 else if (tessedit_reject_suspect_fullstops
00431 && strchr (page_res_it.word ()->best_choice->string ().
00432 string (), '.') != NULL)
00433 reject_suspect_fullstops (page_res_it.word ());
00434
00435 page_res_it.rej_stat_word ();
00436 chars_in_word = page_res_it.word ()->reject_map.length ();
00437 rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
00438
// Accumulate document-level quality totals from the per-word figures.
00439 blob_quality = word_blob_quality (page_res_it.word (),
00440 page_res_it.row ()->row);
00441 doc_blob_quality += blob_quality;
00442 outline_errs = word_outline_errs (page_res_it.word ());
00443 doc_outline_errs += outline_errs;
00444 word_char_quality (page_res_it.word (),
00445 page_res_it.row ()->row,
00446 &all_char_quality, &accepted_all_char_quality);
00447 doc_char_quality += all_char_quality;
// "Good" character totals only count words found via one of the dawg
// permuters.
00448 permuter_type = page_res_it.word ()->best_choice->permuter ();
00449 if ((permuter_type == SYSTEM_DAWG_PERM) ||
00450 (permuter_type == FREQ_DAWG_PERM) ||
00451 (permuter_type == USER_DAWG_PERM)) {
00452 good_char_count += chars_in_word - rejects_in_word;
00453 doc_good_char_quality += accepted_all_char_quality;
00454 }
00455 check_debug_pt (page_res_it.word (), 80);
// Reject the whole word when it has no good blobs and at least as many
// outline errors as characters.
00456 if (tessedit_reject_bad_qual_wds &&
00457 (blob_quality == 0) && (outline_errs >= chars_in_word))
00458 page_res_it.word ()->reject_map.rej_word_bad_quality ();
00459 check_debug_pt (page_res_it.word (), 90);
00460 page_res_it.forward ();
00461 }
00462
// ---- Optional cluster adaption after pass 3 ----
00463 page_res_it.restart_page ();
00464 while (!tessedit_test_adaption
00465 && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
00466 if (monitor != NULL)
00467 monitor->ocr_alive = TRUE;
00468 if (tessedit_cluster_adaption_mode != 0)
00469 adapt_to_good_samples (page_res_it.word (),
00470 &char_clusters, &chars_waiting);
00471 page_res_it.forward ();
00472 }
00473
00474 #ifndef SECURE_NAMES
00475 if (tessedit_debug_quality_metrics) {
00476 tprintf
00477 ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00478 page_res->char_count, page_res->rej_count,
00479 page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
00480 doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
00481 doc_outline_errs / (float) page_res->char_count, doc_char_quality,
00482 doc_char_quality / (float) page_res->char_count,
00483 doc_good_char_quality,
00484 good_char_count >
00485 0 ? doc_good_char_quality / (float) good_char_count : 0.0);
00486 }
00487 #endif
// The page is "good quality" only if all four per-character ratios meet
// their thresholds.
// NOTE(review): page_res->char_count is used as a divisor without a zero
// check - confirm that an empty page cannot reach this point.
00488 good_quality_doc =
00489 (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
00490 &&
00491 (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
00492 (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
00493 (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
00494
00495
00496
00497 if (!tessedit_test_adaption) {
00498 set_global_loc_code(LOC_DOC_BLK_REJ);
00499 quality_based_rejection(page_res_it, good_quality_doc);
00500 }
00501 font_recognition_pass(page_res_it);
00502
00503
00504 set_global_loc_code(LOC_WRITE_RESULTS);
00505
00506
00507
00508 }
00509
00510
00511
00517 void classify_word_pass1(
00518 WERD_RES *word,
00519 ROW *row,
00520 BOOL8 cluster_adapt,
00521 CHAR_SAMPLES_LIST *char_clusters,
00522 CHAR_SAMPLE_LIST *chars_waiting) {
00523 WERD *bln_word;
00524
00525 BLOB_CHOICE_LIST_CLIST blob_choices;
00526 BOOL8 adapt_ok;
00527 const char *rejmap;
00528 INT16 index;
00529 STRING mapstr = "";
00530 char *match_string;
00531 char word_string[1024];
00532
00533 if (matcher_fp != NULL) {
00534 fgets (word_string, 1023, correct_fp);
00535 if ((match_string = strchr (word_string, '\r')) != NULL)
00536 *match_string = '\0';
00537 if ((match_string = strchr (word_string, '\n')) != NULL)
00538 *match_string = '\0';
00539 if (word_string[0] != '\0') {
00540 word->word->set_text (word_string);
00541 word_answer = (char *) word->word->text ();
00542 }
00543 else
00544 word_answer = NULL;
00545 }
00546
00547 check_debug_pt (word, 0);
00548 matcher_pass = 0;
00549 bln_word = make_bln_copy (word->word, row, row->x_height (), &word->denorm);
00550
00551 word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
00552 tess_default_matcher,
00553 word->raw_choice, &blob_choices,
00554 word->outword);
00555
00561 if ((word->best_choice->string ().length () == 0) ||
00562 (strspn (word->best_choice->string ().string (), " ") ==
00563 word->best_choice->string ().length ())) {
00564 word->done = FALSE;
00565 word->tess_failed = TRUE;
00566 word->reject_map.initialise (word->best_choice->string ().length ());
00567 word->reject_map.rej_word_tess_failure ();
00568 }
00569 else {
00570 word->tess_failed = FALSE;
00571 if ((word->best_choice->string ().length () !=
00572 word->outword->blob_list ()->length ()) ||
00573 (word->best_choice->string ().length () != blob_choices.length ())) {
00574 tprintf
00575 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00576 word->best_choice->string ().string (),
00577 word->best_choice->string ().length (),
00578 word->outword->blob_list ()->length (), blob_choices.length ());
00579 }
00580 ASSERT_HOST (word->best_choice->string ().length () ==
00581 word->outword->blob_list ()->length ());
00582 ASSERT_HOST (word->best_choice->string ().length () ==
00583 blob_choices.length ());
00584
00585
00586
00587
00588
00589
00590
00591
00592
00593
00594 if (word->word->flag (W_REP_CHAR)) {
00595 fix_rep_char(word);
00596 }
00597 else {
00598 fix_quotes ((char *) word->best_choice->string ().string (),
00599
00600 word->outword, &blob_choices);
00601 if (tessedit_fix_hyphens)
00602
00603 fix_hyphens ((char *) word->best_choice->string ().string (),
00604 word->outword, &blob_choices);
00605 record_certainty (word->best_choice->certainty (), 1);
00606
00607 word->tess_accepted = tess_acceptable_word (word->best_choice,
00608 word->raw_choice);
00609
00610 word->tess_would_adapt = tess_adaptable_word (word->outword,
00611 word->best_choice,
00612 word->raw_choice);
00613
00614 make_reject_map (word, &blob_choices, row, 1);
00615
00616 adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
00617
00618 if (cluster_adapt)
00619 adapt_to_good_samples(word, char_clusters, chars_waiting);
00620
00621 if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
00622 if (!tessedit_tess_adapt_to_rejmap)
00623 rejmap = NULL;
00624 else {
00625 ASSERT_HOST (word->reject_map.length () ==
00626 word->best_choice->string ().length ());
00627
00628 for (index = 0; index < word->reject_map.length (); index++) {
00629 if (adapt_ok || word->reject_map[index].accepted ())
00630 mapstr += '1';
00631 else
00632 mapstr += '0';
00633 }
00634 rejmap = mapstr.string ();
00635 }
00636
00637
00638 tess_adapter (word->outword, &word->denorm,
00639 word->best_choice->string ().string (),
00640 word->raw_choice->string ().string (), rejmap);
00641 }
00642
00643 if (tessedit_enable_doc_dict)
00644 tess_add_doc_word (word->best_choice);
00645 set_word_fonts(word, &blob_choices);
00646 }
00647 }
00648 if (tessedit_print_text) {
00649 write_cooked_text (bln_word, word->best_choice->string (),
00650 word->done, FALSE, stdout);
00651 }
00652 delete bln_word;
00653 blob_choices.deep_clear ();
00654 }
00655
00656
00657
/**
 * classify_word_pass2
 *
 * Second-pass treatment of one word. If the word is not yet done (or one of
 * the training / dump modes is on) it is re-matched at the row's x-height.
 * Then, for words that did not fail and are not repeated-character words,
 * the x-height may be re-estimated and the word re-matched at the new
 * height; the new result replaces the old one only if it passes the
 * acceptance tests below.
 *
 * @param word  word result, updated in place.
 * @param row   row the word belongs to.
 */
00661 void classify_word_pass2(
00662 WERD_RES *word,
00663 ROW *row) {
00664 BOOL8 done_this_pass = FALSE;
// Scratch result used for the re-match at the re-estimated x-height.
00665 WERD_RES new_x_ht_word (word->word);
00666 float new_x_ht = 0.0;
00667 INT16 old_xht_reject_count;
00668 INT16 new_xht_reject_count;
00669 INT16 old_xht_accept_count;
00670 INT16 new_xht_accept_count;
00671 BOOL8 accept_new_x_ht = FALSE;
00672 INT16 old_chs_in_wd;
00673 INT16 new_chs_in_wd;
00674 INT16 old_word_quality;
00675 INT16 new_word_quality;
00676 INT16 dummy;
00677
00678 set_global_subloc_code(SUBLOC_NORM);
00679 check_debug_pt (word, 30);
00680 if (!word->done ||
00681 tessedit_training_tess ||
00682 tessedit_training_wiseowl || tessedit_dump_choices) {
00683 word->x_height = row->x_height ();
00684 word->caps_height = 0.0;
// Discard any earlier results before match_word_pass2 writes new ones.
00685 if (word->outword != NULL) {
00686 delete word->outword;
00687 delete word->best_choice;
00688 delete word->raw_choice;
00689 }
00690 match_word_pass2 (word, row, row->x_height ());
00691 done_this_pass = TRUE;
00692 check_debug_pt (word, 40);
00693 }
00694
00695 if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
00696 set_global_subloc_code(SUBLOC_FIX_XHT);
00697 if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
00698 (tessedit_xht_fiddles_on_no_rej_wds ||
00699 (word->reject_map.reject_count () > 0))) {
// word_occ_first selects whether the occupancy check runs before or after
// the x-height re-estimate.
00700 if ((x_ht_check_word_occ >= 2) && word_occ_first)
00701 check_block_occ(word);
00702
00703 if (tessedit_redo_xheight)
00704 re_estimate_x_ht(word, &new_x_ht);
00705
00706 if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
00707 ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
00708 check_block_occ(word);
00709 }
// A positive new_x_ht means re_estimate_x_ht proposed a different height:
// re-match the word at that height into new_x_ht_word.
00710 if (new_x_ht > 0) {
00711 old_chs_in_wd = word->reject_map.length ();
00712
00713
00714 new_x_ht_word.x_height = new_x_ht;
00715 new_x_ht_word.caps_height = 0.0;
00716 match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
00717 if (!new_x_ht_word.tess_failed) {
00718 if ((x_ht_check_word_occ >= 1) && word_occ_first)
00719 check_block_occ(&new_x_ht_word);
00720
00721 re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
00722
00723 if ((x_ht_check_word_occ >= 1) && !word_occ_first)
00724 check_block_occ(&new_x_ht_word);
00725
00726 old_xht_reject_count = word->reject_map.reject_count ();
00727 old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
00728 new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
00729 new_chs_in_wd = new_x_ht_word.reject_map.length ();
00730 new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
// Accept the new match if it has more accepted characters (or the same
// non-zero number), provided its x-height or caps-height is not a guess.
00731 accept_new_x_ht =
00732 ((new_xht_accept_count > old_xht_accept_count) ||
00733 ((new_xht_accept_count == old_xht_accept_count) &&
00734 (new_xht_accept_count > 0))) &&
00735 (!new_x_ht_word.guessed_x_ht ||
00736 !new_x_ht_word.guessed_caps_ht);
00737
// Optional veto: do not accept a result with worse character quality.
00738 if (accept_new_x_ht && x_ht_quality_check) {
00739 word_char_quality(word, row, &old_word_quality, &dummy);
00740 word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
00741 if (old_word_quality > new_word_quality)
00742 accept_new_x_ht = FALSE;
00743 }
00744
// Optional veto: require more than x_ht_stringency alphanumerics; on
// failure the original word may be rejected as well (rej_use_xht).
00745 if (accept_new_x_ht && (x_ht_stringency > 0)) {
00746 accept_new_x_ht =
00747 (count_alphanums (&new_x_ht_word) > x_ht_stringency);
00748 if (!accept_new_x_ht && rej_use_xht) {
00749 if (debug_x_ht_level >= 1)
00750 tprintf
00751 ("Failed stringency test so reject original word\n");
00752 word->reject_map.rej_word_xht_fixup ();
00753 }
00754 }
00755
00756 #ifndef SECURE_NAMES
00757 if (debug_x_ht_level >= 1) {
00758 tprintf ("New XHT Match:: %s ",
00759 word->best_choice->string ().string ());
00760 word->reject_map.print (debug_fp);
00761 tprintf (" -> %s ",
00762 new_x_ht_word.best_choice->string ().string ());
00763 new_x_ht_word.reject_map.print (debug_fp);
00764 tprintf (" %s->%s %s %s\n",
00765 word->guessed_x_ht ? "GUESS" : "CERT",
00766 new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
00767 new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
00768 accept_new_x_ht ? "ACCEPTED" : "");
00769 }
00770 #endif
00771 }
// Accepted: free the old results and take over the new ones.
00772 if (accept_new_x_ht) {
00773
00774
00775
00776
00777 delete word->outword;
00778 word->outword = new_x_ht_word.outword;
00779 word->denorm = new_x_ht_word.denorm;
00780 delete word->best_choice;
00781 word->best_choice = new_x_ht_word.best_choice;
00782 delete word->raw_choice;
00783 word->raw_choice = new_x_ht_word.raw_choice;
00784 word->reject_map = new_x_ht_word.reject_map;
00785 word->done = new_x_ht_word.done;
00786 done_this_pass = TRUE;
00787 }
// Not accepted: free the scratch results instead.
00788 else {
00789
00790
00791
00792
00793
00794
00795 delete new_x_ht_word.outword;
00796 delete new_x_ht_word.best_choice;
00797 delete new_x_ht_word.raw_choice;
00798 }
// Either way the scratch object no longer owns these pointers.
// NOTE(review): presumably this guards against a double delete in the
// WERD_RES destructor - confirm.
00799 new_x_ht_word.outword = NULL;
00800 new_x_ht_word.best_choice = NULL;
00801 new_x_ht_word.raw_choice = NULL;
00802
00803 if (rej_mostly_reject_mode == 2) {
00804 reject_mostly_rejects(word);
00805 tprintf ("Rejecting mostly rejects on %s ",
00806 word->best_choice->string ().string ());
00807 }
00808 }
00809
00810 set_global_subloc_code(SUBLOC_NORM);
00811
// Only the SaveBadWord call is conditional; record_certainty always runs.
00812 if (done_this_pass && !word->done && tessedit_save_stats)
00813 SaveBadWord (word->best_choice->string ().string (),
00814 word->best_choice->certainty ());
00815 record_certainty (word->best_choice->certainty (), 2);
00816 }
00817 #ifndef GRAPHICS_DISABLED
00818 if (tessedit_draw_outwords) {
00819 if (fx_win == NO_WINDOW)
00820 create_fx_win();
00821 clear_fx_win();
00822 word->outword->plot (fx_win);
00823 make_picture_current(fx_win);
00824 }
00825 #endif
00826
00827 set_global_subloc_code(SUBLOC_NORM);
00828 if (tessedit_print_text) {
00829 write_cooked_text (word->outword, word->best_choice->string (),
00830 word->done, done_this_pass, stdout);
00831 }
00832 check_debug_pt (word, 50);
00833 }
00834
00835
00836
00840 void match_word_pass2(
00841 WERD_RES *word,
00842 ROW *row,
00843 float x_height) {
00844 WERD *bln_word;
00845
00846 BLOB_CHOICE_LIST_CLIST blob_choices;
00847
00848 set_global_subsubloc_code(SUBSUBLOC_OTHER);
00849 if (matcher_fp != NULL) {
00850 word_answer = (char *) word->word->text ();
00851 if (word_answer != NULL && word_answer[0] == '\0')
00852 word_answer = NULL;
00853 }
00854 matcher_pass = 0;
00855 bln_word = make_bln_copy (word->word, row, x_height, &word->denorm);
00856 set_global_subsubloc_code(SUBSUBLOC_TESS);
00857 if (tessedit_training_tess)
00858 word->best_choice = correct_segment_pass2 (bln_word,
00859 &word->denorm,
00860 tess_default_matcher,
00861 tess_training_tester,
00862 word->raw_choice,
00863 &blob_choices, word->outword);
00864 else if (tessedit_dump_choices)
00865 word->best_choice = test_segment_pass2 (bln_word,
00866 &word->denorm,
00867 tess_default_matcher,
00868 choice_dump_tester,
00869 word->raw_choice,
00870 &blob_choices, word->outword);
00871
00872
00873
00874
00875
00876
00877
00878 else {
00879 word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
00880 tess_default_matcher,
00881 word->raw_choice, &blob_choices,
00882 word->outword);
00883 }
00884 set_global_subsubloc_code(SUBSUBLOC_OTHER);
00885
00886
00887
00888
00889
00890 if ((word->best_choice->string ().length () == 0) ||
00891 (strspn (word->best_choice->string ().string (), " ") ==
00892 word->best_choice->string ().length ())) {
00893 word->tess_failed = TRUE;
00894 word->reject_map.initialise (word->best_choice->string ().length ());
00895 word->reject_map.rej_word_tess_failure ();
00896
00897 }
00898 else {
00899 if ((word->best_choice->string ().length () !=
00900 word->outword->blob_list ()->length ()) ||
00901 (word->best_choice->string ().length () != blob_choices.length ())) {
00902 tprintf
00903 ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00904 word->best_choice->string ().string (),
00905 word->best_choice->string ().length (),
00906 word->outword->blob_list ()->length (), blob_choices.length ());
00907 }
00908 ASSERT_HOST (word->best_choice->string ().length () ==
00909 word->outword->blob_list ()->length ());
00910 ASSERT_HOST (word->best_choice->string ().length () ==
00911 blob_choices.length ());
00912
00913 word->tess_failed = FALSE;
00914 if (word->word->flag (W_REP_CHAR)) {
00915 fix_rep_char(word);
00916 }
00917 else {
00918 fix_quotes ((char *) word->best_choice->string ().string (),
00919 word->outword, &blob_choices);
00920 if (tessedit_fix_hyphens)
00921 fix_hyphens ((char *) word->best_choice->string ().string (),
00922 word->outword, &blob_choices);
00923
00924 if ((word->best_choice->string ().length () !=
00925 word->outword->blob_list ()->length ()) ||
00926 (word->best_choice->string ().length () !=
00927 blob_choices.length ())) {
00928 #ifndef SECURE_NAMES
00929 tprintf
00930 ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00931 word->best_choice->string ().string (),
00932 word->best_choice->string ().length (),
00933 word->outword->blob_list ()->length (),
00934 blob_choices.length ());
00935 #endif
00936
00937 }
00938 ASSERT_HOST (word->best_choice->string ().length () ==
00939 word->outword->blob_list ()->length ());
00940 ASSERT_HOST (word->best_choice->string ().length () ==
00941 blob_choices.length ());
00942
00943 word->tess_accepted = tess_acceptable_word (word->best_choice,
00944 word->raw_choice);
00945
00946 make_reject_map (word, &blob_choices, row, 2);
00947 }
00948 }
00949 blob_choices.deep_clear ();
00950 delete bln_word;
00951 assert (word->raw_choice != NULL);
00952 }
00953
00954
00955
00965 void fix_rep_char(
00966 WERD_RES *word
00967 ) {
00968 struct REP_CH
00969 {
00970 char ch;
00971 int count;
00972 };
00973
00974 REP_CH *rep_ch;
00975 int word_len;
00976 int rep_ch_count = 0;
00977 const char *word_str;
00978 int i, j;
00979 int total = 0;
00980 int max = 0;
00981 char maxch = ' ';
00982
00983 word_str = word->best_choice->string ().string ();
00984 word_len = strlen (word_str);
00985 rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
00986 for (i = 0; i < word_len; i++) {
00987 for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++);
00988 if (j < rep_ch_count)
00989 rep_ch[j].count++;
00990 else {
00991 rep_ch[rep_ch_count].ch = word_str[i];
00992 rep_ch[rep_ch_count].count = 1;
00993 rep_ch_count++;
00994 }
00995 }
00996
00997 for (j = 0; j < rep_ch_count; j++) {
00998 total += rep_ch[j].count;
00999 if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) {
01000 max = rep_ch[j].count;
01001 maxch = rep_ch[j].ch;
01002 }
01003 }
01004
01005
01006 free_mem(rep_ch);
01007
01008 word->reject_map.initialise (word_len);
01009 for (i = 0; i < word_len; i++) {
01010 if (word_str[i] != maxch)
01011
01012 word->reject_map[i].setrej_bad_repetition ();
01013 }
01014 word->done = TRUE;
01015 }
01016
01017
01018
01022 void fix_quotes(
01023 char *string,
01024 WERD *word,
01025 BLOB_CHOICE_LIST_CLIST *blob_choices) {
01026 char *ptr;
01027
01028 PBLOB_IT blob_it = word->blob_list ();
01029
01030 BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01031 BLOB_CHOICE_IT it1;
01032 BLOB_CHOICE_IT it2;
01033
01034 for (ptr = string;
01035 *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
01036 if ((*ptr == '\'' || *ptr == '`')
01037 && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) {
01038 *ptr = '"';
01039 strcpy (ptr + 1, ptr + 2);
01040 merge_blobs (blob_it.data (), blob_it.data_relative (1));
01041 blob_it.forward ();
01042 delete blob_it.extract ();
01043
01044 it1.set_to_list (choice_it.data ());
01045 it2.set_to_list (choice_it.data_relative (1));
01046 if (it1.data ()->certainty () < it2.data ()->certainty ()) {
01047 choice_it.forward ();
01048
01049 delete choice_it.extract ();
01050 }
01051 else {
01052
01053 delete choice_it.extract ();
01054 choice_it.forward ();
01055 }
01056 }
01057 }
01058 }
01059
01060
01061
01067 void fix_hyphens(
01068 char *string,
01069 WERD *word,
01070 BLOB_CHOICE_LIST_CLIST *blob_choices) {
01071 char *ptr;
01072
01073 PBLOB_IT blob_it = word->blob_list ();
01074
01075 BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01076 BLOB_CHOICE_IT it1;
01077 BLOB_CHOICE_IT it2;
01078
01079 for (ptr = string;
01080 *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
01081 if ((*ptr == '-' || *ptr == '~') &&
01082 (*(ptr + 1) == '-' || *(ptr + 1) == '~') &&
01083 (blob_it.data ()->bounding_box ().right () >=
01084 blob_it.data_relative (1)->bounding_box ().left ())) {
01085 *ptr = '-';
01086 strcpy (ptr + 1, ptr + 2);
01087 merge_blobs (blob_it.data (), blob_it.data_relative (1));
01088 blob_it.forward ();
01089 delete blob_it.extract ();
01090
01091 it1.set_to_list (choice_it.data ());
01092 it2.set_to_list (choice_it.data_relative (1));
01093 if (it1.data ()->certainty () < it2.data ()->certainty ()) {
01094 choice_it.forward ();
01095
01096 delete choice_it.extract ();
01097 }
01098 else {
01099
01100 delete choice_it.extract ();
01101 choice_it.forward ();
01102 }
01103 }
01104 }
01105 }
01106
01107
01108
01114 void merge_blobs(
01115 PBLOB *blob1,
01116 PBLOB *blob2
01117 ) {
01118 OUTLINE_IT outline_it = blob1->out_list ();
01119
01120
01121 outline_it.move_to_last ();
01122
01123 outline_it.add_list_after (blob2->out_list ());
01124 }
01125
01126
01127
01134 void choice_dump_tester(
01135 PBLOB *,
01136 DENORM *,
01137 BOOL8 correct,
01138 char *text,
01139 INT32 count,
01140 BLOB_CHOICE_LIST *ratings
01141 ) {
01142 STRING choice_file_name;
01143 BLOB_CHOICE *blob_choice;
01144 BLOB_CHOICE_IT it;
01145 char source_chars[20];
01146 char correct_char[3];
01147
01148 if (choice_file == NULL) {
01149 choice_file_name = imagebasename + ".chc";
01150 if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
01151 CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
01152 choice_file_name.string (), errno);
01153 }
01154 }
01155
01156 if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
01157 strcpy (source_chars, "$$");
01158 strcpy (correct_char, "$$");
01159 }
01160 else {
01161 strncpy(source_chars, text, count);
01162 source_chars[count] = '\0';
01163 if (correct) {
01164 correct_char[0] = text[0];
01165 correct_char[1] = '\0';
01166 }
01167 else {
01168 strcpy (correct_char, "$$");
01169 }
01170 }
01171 fprintf (choice_file, "%s\t%s", source_chars, correct_char);
01172
01173 it.set_to_list (ratings);
01174 for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
01175 blob_choice = it.data ();
01176 if ((blob_choice->char_class () >= '!') &&
01177 (blob_choice->char_class () <= '~'))
01178 fprintf (choice_file, "\t%c\t%f\t%f",
01179 blob_choice->char_class (),
01180 blob_choice->rating (), blob_choice->certainty ());
01181 }
01182 fprintf (choice_file, "\n");
01183 }
01184
01185
01186
01193 WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
01194 WERD *result;
01195
01196
01197
01198
01199
01200
01201
01202
01203 result = src_word->poly_copy (row->x_height ());
01204
01205
01206
01207
01208
01209
01210
01211 result->baseline_normalise_x (row, x_height, denorm);
01212 return result;
01213 }
01214
01215
01216
01222 ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
01223 int i = 0;
01224 int leading_punct_count;
01225 int upper_count = 0;
01226 int hyphen_pos = -1;
01227 ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01228
01229 if (strlen (s) > 20)
01230 return word_type;
01231
01232
01233
01234 if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i])))
01235 i++;
01236 leading_punct_count = i;
01237
01238
01239 while (isupper (s[i])) {
01240 i++;
01241 upper_count++;
01242 }
01243 if (upper_count > 1)
01244 word_type = AC_UPPER_CASE;
01245 else {
01246
01247 while (islower (s[i])) {
01248 i++;
01249 }
01250 if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01251 goto not_a_word;
01252
01253
01254
01255
01256 if (s[i] == '-') {
01257 hyphen_pos = i++;
01258 if (s[i] != '\0') {
01259 while (islower (s[i])) {
01260 i++;
01261 }
01262 if (i < hyphen_pos + 3)
01263 goto not_a_word;
01264 }
01265 }
01266 else {
01267
01268 if ((s[i] == '\'') && (s[i + 1] == 's'))
01269 i += 2;
01270 }
01271 if (upper_count > 0)
01272 word_type = AC_INITIAL_CAP;
01273 else
01274 word_type = AC_LOWER_CASE;
01275 }
01276
01277
01278 if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i])))
01279 i++;
01280 if ((s[i] != '\0') &&
01281 (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i])))
01282 i++;
01283
01284 if (s[i] != '\0')
01285 word_type = AC_UNACCEPTABLE;
01286
01287 not_a_word:
01288
01289 if (word_type == AC_UNACCEPTABLE) {
01290
01291 i = 0;
01292 if (isupper (s[0])) {
01293 word_type = AC_UC_ABBREV;
01294 while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.'))
01295 i += 2;
01296 }
01297 else if (islower (s[0])) {
01298 word_type = AC_LC_ABBREV;
01299 while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.'))
01300 i += 2;
01301 }
01302 if (s[i] != '\0')
01303 word_type = AC_UNACCEPTABLE;
01304 }
01305
01306 return word_type;
01307 }
01308
01309
01310
01314 BOOL8 check_debug_pt(WERD_RES *word, int location) {
01315 BOOL8 show_map_detail = FALSE;
01316 INT16 i;
01317
01318 #ifndef SECURE_NAMES
01319 if (!test_pt)
01320 return FALSE;
01321
01322 tessedit_rejection_debug.set_value (FALSE);
01323 debug_x_ht_level.set_value (0);
01324 tessedit_cluster_debug.set_value (FALSE);
01325 nn_debug.set_value (FALSE);
01326 nn_reject_debug.set_value (FALSE);
01327
01328 if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01329 if (location < 0)
01330 return TRUE;
01331 tessedit_rejection_debug.set_value (TRUE);
01332 debug_x_ht_level.set_value (20);
01333 tessedit_cluster_debug.set_value (TRUE);
01334 nn_debug.set_value (TRUE);
01335 nn_reject_debug.set_value (TRUE);
01336 tprintf ("\n\nTESTWD::");
01337 switch (location) {
01338 case 0:
01339 tprintf ("classify_word_pass1 start\n");
01340 word->word->print (debug_fp);
01341 break;
01342 case 10:
01343 tprintf ("make_reject_map: initial map");
01344 break;
01345 case 20:
01346 tprintf ("make_reject_map: after NN");
01347 break;
01348 case 30:
01349 tprintf ("classify_word_pass2 - START");
01350 break;
01351 case 40:
01352 tprintf ("classify_word_pass2 - Pre Xht");
01353 break;
01354 case 50:
01355 tprintf ("classify_word_pass2 - END");
01356 show_map_detail = TRUE;
01357 break;
01358 case 60:
01359 tprintf ("fixspace");
01360 break;
01361 case 70:
01362 tprintf ("MM pass START");
01363 break;
01364 case 80:
01365 tprintf ("MM pass END");
01366 break;
01367 case 90:
01368 tprintf ("After Poor quality rejection");
01369 break;
01370 case 100:
01371 tprintf ("unrej_good_quality_words - START");
01372 break;
01373 case 110:
01374 tprintf ("unrej_good_quality_words - END");
01375 break;
01376 case 120:
01377 tprintf ("Write results pass");
01378 show_map_detail = TRUE;
01379 break;
01380 }
01381 tprintf (" \"%s\" ", word->best_choice->string ().string ());
01382 word->reject_map.print (debug_fp);
01383 tprintf ("\n");
01384 if (show_map_detail) {
01385 tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01386 for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01387 tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01388 word->reject_map[i].full_print (debug_fp);
01389 }
01390 }
01391
01392 tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01393 tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01394 return TRUE;
01395 }
01396 else
01397 #endif
01398 return FALSE;
01399 }
01400
01401
01402
01406 void set_word_fonts(
01407 WERD_RES *word,
01408 BLOB_CHOICE_LIST_CLIST *blob_choices) {
01409 INT32 index;
01410 char choice_char;
01411 INT8 config;
01412
01413 BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
01414 BLOB_CHOICE_IT choice_it;
01415 STATS fonts (0, 32);
01416 static INT8 italic_table[32] = {
01417 1, -1, 1, -1,
01418 1, -1, 1, -1,
01419 1, -1, 1, -1,
01420 1, -1, 1, -1,
01421 1, -1, 1, -1,
01422 1, -1, 1, -1,
01423 1, -1, 1, -1,
01424 1, -1, 1, -1
01425 };
01426 static INT8 bold_table[32] = {
01427 1, 1, -1, -1,
01428 1, 1, -1, -1,
01429 1, 1, -1, -1,
01430 1, 1, -1, -1,
01431 1, 1, -1, -1,
01432 1, 1, -1, -1,
01433 1, 1, -1, -1,
01434 1, 1, -1, -1
01435 };
01436 static INT8 font_table[32] = {
01437 2, 2, 2, 2,
01438 -1, -1, -1, -1,
01439 0, 0, 0, 0,
01440 1, 1, 1, 1,
01441 3, 3, 3, 3,
01442 4, 4, 4, 4,
01443 5, 5, 5, 5,
01444 2, 2, 2, 2
01445 };
01446
01447 word->italic = 0;
01448 word->bold = 0;
01449 for (char_it.mark_cycle_pt (), index = 0;
01450 !char_it.cycled_list (); char_it.forward (), index++) {
01451 choice_char = word->best_choice->string ()[index];
01452 choice_it.set_to_list (char_it.data ());
01453 for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
01454 choice_it.forward ()) {
01455 if (choice_it.data ()->char_class () == choice_char) {
01456 config = choice_it.data ()->config ();
01457 if (tessedit_debug_fonts)
01458 tprintf ("%c(%d=%d%c%c)",
01459 choice_char, config, (config & 31) >> 2,
01460 config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
01461 if (config != -1) {
01462 config &= 31;
01463 word->italic += italic_table[config];
01464 word->bold += bold_table[config];
01465 if (font_table[config] != -1)
01466 fonts.add (font_table[config], 1);
01467 }
01468 break;
01469 }
01470 }
01471 }
01472 find_modal_font (&fonts, &word->font1, &word->font1_count);
01473 find_modal_font (&fonts, &word->font2, &word->font2_count);
01474 if (tessedit_debug_fonts)
01475 tprintf ("\n");
01476
01477
01478
01479
01480
01481
01482
01483
01484
01485
01486
01487
01488
01489
01490
01491
01492
01493
01494
01495
01496
01497
01498 }
01499
01500
01501
01505 void font_recognition_pass(
01506 PAGE_RES_IT &page_res_it) {
01507 INT32 length;
01508 INT32 count;
01509 INT8 doc_font;
01510 INT8 doc_font_count;
01511 INT32 doc_italic;
01512 INT32 doc_bold;
01513 ROW_RES *row = NULL;
01514 WERD_RES *word;
01515 STATS fonts (0, 32);
01516 STATS doc_fonts (0, 32);
01517
01518 doc_italic = 0;
01519 doc_bold = 0;
01520 page_res_it.restart_page ();
01521 while (page_res_it.word () != NULL) {
01522 if (row != page_res_it.row ()) {
01523 if (row != NULL) {
01524 find_modal_font (&fonts, &row->font1, &row->font1_count);
01525 find_modal_font (&fonts, &row->font2, &row->font2_count);
01526 }
01527 row = page_res_it.row ();
01528 fonts.clear ();
01529 row->italic = 0;
01530 row->bold = 0;
01531 }
01532 word = page_res_it.word ();
01533 row->italic += word->italic;
01534 row->bold += word->bold;
01535 fonts.add (word->font1, word->font1_count);
01536 fonts.add (word->font2, word->font2_count);
01537 doc_italic += word->italic;
01538 doc_bold += word->bold;
01539 doc_fonts.add (word->font1, word->font1_count);
01540 doc_fonts.add (word->font2, word->font2_count);
01541 page_res_it.forward ();
01542 }
01543 if (row != NULL) {
01544 find_modal_font (&fonts, &row->font1, &row->font1_count);
01545 find_modal_font (&fonts, &row->font2, &row->font2_count);
01546 }
01547 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
01548
01549
01550
01551
01552
01553
01554
01555
01556
01557
01558
01559
01560
01561
01562
01563
01564
01565
01566
01567
01568
01569
01570
01571
01572
01573
01574
01575
01576
01577
01578
01579
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590
01591
01592
01593
01594
01595 page_res_it.restart_page ();
01596 while (page_res_it.word () != NULL) {
01597 row = page_res_it.row ();
01598 word = page_res_it.word ();
01599 length = word->best_choice->string ().length ();
01600
01601 count = word->italic;
01602 if (count < 0)
01603 count = -count;
01604 if (!(count == length || length > 3 && count >= length * 3 / 4))
01605 word->italic = doc_italic > 0 ? 1 : -1;
01606
01607 count = word->bold;
01608 if (count < 0)
01609 count = -count;
01610 if (!(count == length || length > 3 && count >= length * 3 / 4))
01611 word->bold = doc_bold > 0 ? 1 : -1;
01612
01613 count = word->font1_count;
01614 if (!(count == length || length > 3 && count >= length * 3 / 4)) {
01615 word->font1 = doc_font;
01616 word->font1_count = doc_font_count;
01617 }
01618
01619 page_res_it.forward ();
01620 }
01621 }
01622
01623
01624
01628 void add_in_one_row(
01629 ROW_RES *row,
01630 STATS *fonts,
01631 INT8 *italic,
01632 INT8 *bold
01633 ) {
01634 WERD_RES *word;
01635 WERD_RES_IT word_it = &row->word_res_list;
01636
01637 for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01638 word = word_it.data ();
01639 *italic += word->italic;
01640 *bold += word->bold;
01641 if (word->font1_count > 0)
01642 fonts->add (word->font1, word->font1_count);
01643 if (word->font2_count > 0)
01644 fonts->add (word->font2, word->font2_count);
01645
01646 }
01647 }
01648
01649
01650
01654 void find_modal_font(
01655 STATS *fonts,
01656 INT8 *font_out,
01657 INT8 *font_count
01658 ) {
01659 INT8 font;
01660 INT32 count;
01661
01662 if (fonts->get_total () > 0) {
01663 font = (INT8) fonts->mode ();
01664 *font_out = font;
01665 count = fonts->pile_count (font);
01666 *font_count = count < MAX_INT8 ? count : MAX_INT8;
01667 fonts->add (font, -*font_count);
01668 }
01669 else {
01670 *font_out = -1;
01671 *font_count = 0;
01672 }
01673 }