ccmain/control.cpp

Go to the documentation of this file.
00001 
00021 #include          "mfcpch.h"
00022 #include          "mainblk.h"
00023 #include          <string.h>
00024 #include          <math.h>
00025 #ifdef __UNIX__
00026 #include          <assert.h>
00027 #include          <unistd.h>
00028 #include                    <errno.h>
00029 #endif
00030 #include          <ctype.h>
00031 #include          "ocrclass.h"
00032 #include          "werdit.h"
00033 #include          "drawfx.h"
00034 #include          "tfacep.h"
00035 #include          "tessbox.h"
00036 #include          "tessvars.h"
00037 //#include                                      "fxtop.h"
00038 #include          "pgedit.h"
00039 #include          "reject.h"
00040 #include          "adaptions.h"
00041 #include          "charcut.h"
00042 #include          "fixxht.h"
00043 #include          "fixspace.h"
00044 #include          "genblob.h"
00045 #include          "docqual.h"
00046 #include          "control.h"
00047 #include          "secname.h"
00048 #include          "output.h"
00049 #include          "callcpp.h"
00050 #include          "notdll.h"
00051 #include "tordvars.h"
00052 #include "adaptmatch.h"
00053 
00054 #define MIN_FONT_ROW_COUNT  8
00055 #define MAX_XHEIGHT_DIFF  3
00056 #define EXTERN
00057 
00060 //extern "C" {
00061 //EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
00062 
00063 //extern FILE*                          matcher_fp;
00064 //extern FILE*                          correct_fp;
00065 //};
00066 BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
00067 EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
00068 EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
00069 EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
00070 EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
00071 EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
00072 EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
00073 EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
00074 EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,
00075 "Try to improve fuzzy spaces");
00076 EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,
00077 "Dont bother with word plausibility");
00078 EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
00079 
00080 EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
00081 EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,
00082 "Reject suspect fullstops");
00083 EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
00084 EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,
00085 "Do our own adaption - ems only");
00086 EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,
00087 "Add words to the document dictionary");
00088 EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
00089 EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
00090 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,
00091 "Apply xht fix up even if done");
00092 EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,
00093 "Apply xht fix up even in no rejects");
00094 EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
00095 EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
00096 EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
00097 EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,
00098 "Block and Row stats");
00099 EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
00100 EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
00101 EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");
00102 
00103 EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
00104 EXTERN
00105 STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
00106 EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"",
00107 "2nd Trailing punctuation");
00108 
00109 EXTERN double_VAR (quality_rej_pc, 0.08,
00110 "good_quality_doc lte rejection limit");
00111 EXTERN double_VAR (quality_blob_pc, 0.0,
00112 "good_quality_doc gte good blobs limit");
00113 EXTERN double_VAR (quality_outline_pc, 1.0,
00114 "good_quality_doc lte outline error limit");
00115 EXTERN double_VAR (quality_char_pc, 0.95,
00116 "good_quality_doc gte good char limit");
00117 EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,
00118 "alphas in a good word");
00119 
00120 EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,
00121 "Use reject map to control Tesseract adaption");
00122 EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,
00123 "Adaptation decision algorithm for tess");
00124 EXTERN INT_VAR (tessedit_em_adaption_mode, 0,
00125 "Adaptation decision algorithm for ems matrix matcher");
00126 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,
00127 "Adapt using clusterer after pass 1");
00128 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,
00129 "Adapt using clusterer after pass 1");
00130 EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,
00131 "Adapt using clusterer after pass 1");
00132 EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,
00133 "Adapt using clusterer before Tess adaping during pass 1");
00134 EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,
00135 "Adaptation decision algorithm for matrix matcher");
00136 EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,
00137 "Generate and print debug information for adaption");
00138 EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,
00139 "Do minimal rejection on pass 1 output");
00140 EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,
00141 "Test adaption criteria");
00142 EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,
00143 "Adapt to all docs over time");
00144 EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
00145 EXTERN INT_VAR (tessedit_test_adaption_mode, 3,
00146 "Adaptation decision algorithm for tess");
00147 
00148 EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
00149 EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
00150 EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");
00153 extern int MatcherDebugLevel;
00154 extern int display_ratings;
00155 extern int number_debug;
00156 extern int adjust_debug;
00157 /*
00158 extern "C" {
00159   extern int   MatcherDebugLevel;
00160   extern int   display_ratings;
00161   extern int   number_debug;
00162   extern int   adjust_debug;
00163 // extern int  LearningDebugLevel;
00164  };
00165 */
00166 FILE *choice_file = NULL;        //Choice file ptr
00167 
00168 CLISTIZEH (PBLOB) CLISTIZE (PBLOB)
00169 /* DEBUGGING */
00170 INT16 blob_count(WERD *w) {
00171   return w->blob_list ()->length ();
00172 }
00173 
00174 
00182 void recog_pseudo_word(
00183                        BLOCK_LIST *block_list,
00184                        BOX &selection_box) {
00185   WERD *word;
00186   ROW *pseudo_row;               //row of word
00187   BLOCK *pseudo_block;           //block of word
00188 
00189   word = make_pseudo_word (block_list, selection_box,
00190     pseudo_block, pseudo_row);
00191   if (word != NULL) {
00192     recog_interactive(pseudo_block, pseudo_row, word);
00193     delete word;
00194   }
00195 }
00196 
00197 
00207 BOOL8 recog_interactive(
00208                         BLOCK *,
00209                         ROW *row,
00210                         WERD *word
00211                        ) {
00212   WERD_RES word_res(word);
00213   INT16 char_qual;
00214   INT16 good_char_qual;
00215 
00216   classify_word_pass2(&word_res, row);
00217   #ifndef SECURE_NAMES
00218   if (tessedit_debug_quality_metrics) {
00219     word_char_quality(&word_res, row, &char_qual, &good_char_qual);
00220     tprintf
00221       ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
00222       word_res.reject_map.length (), word_blob_quality (&word_res, row),
00223       word_outline_errs (&word_res), char_qual, good_char_qual);
00224   }
00225   #endif
00226   return TRUE;
00227 }
00228 
00229 
00245 void recog_all_words(
00246                      PAGE_RES *page_res,
00247                      volatile ETEXT_DESC *monitor
00248                     ) {
00249   PAGE_RES_IT page_res_it(page_res); //reset page iterator
00250   INT16 chars_in_word;
00251   INT16 rejects_in_word;
00252   CHAR_SAMPLES_LIST em_clusters;
00253   CHAR_SAMPLE_LIST ems_waiting;
00254   CHAR_SAMPLES_LIST char_clusters;
00255   CHAR_SAMPLE_LIST chars_waiting;
00256   INT16 blob_quality = 0;
00257   INT16 outline_errs = 0;
00258   INT16 doc_blob_quality = 0;
00259   INT16 doc_outline_errs = 0;
00260   INT16 doc_char_quality = 0;
00261   INT16 all_char_quality;
00262   INT16 accepted_all_char_quality;
00263   INT16 good_char_count = 0;
00264   INT16 doc_good_char_quality = 0;
00265   const STRING *wordstr;
00266   const char *text;
00267   int i;
00268 
00269   BOOL8 good_quality_doc;
00270   UINT8 permuter_type;
00271 
00272   INT32 tess_adapt_mode = 0;
00273   INT32 word_count;              //count of words in doc
00274   INT32 word_index;              //current word
00275 
00276   if (tessedit_minimal_rej_pass1) {
00277     tessedit_test_adaption.set_value (TRUE);
00278     tessedit_minimal_rejection.set_value (TRUE);
00279   }
00280 
00281   if (tessedit_cluster_adapt_before_pass1) {
00282     tess_adapt_mode = tessedit_tess_adaption_mode;
00283     tessedit_tess_adaption_mode.set_value (0);
00284     tessedit_tess_adapt_to_rejmap.set_value (TRUE);
00285   }
00286 
00287   /* Pass 1 */
00288   word_count = 0;
00289   if (monitor != NULL) {
00290     monitor->ocr_alive = TRUE;
00291     while (page_res_it.word () != NULL) {
00292       word_count++;
00293       page_res_it.forward ();
00294     }
00295     page_res_it.restart_page ();
00296   }
00297   else
00298     word_count = 1;
00299 
00300   word_index = 0;
00301   int dict_words = 0;
00302   while (page_res_it.word () != NULL) {
00303     set_global_loc_code(LOC_PASS1);
00304     word_index++;
00305     if (monitor != NULL) {
00306       monitor->ocr_alive = TRUE;
00307       monitor->progress = 30 + 50 * word_index / word_count;
00308       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00309           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00310                                                          dict_words)))
00311         return;
00312     }
00313     classify_word_pass1 (page_res_it.word (),
00314       page_res_it.row ()->row, FALSE, NULL, NULL);
00315 
00316     if (tessedit_test_adaption && !tessedit_minimal_rejection) {
00317       if (!word_adaptable (page_res_it.word (),
00318         tessedit_test_adaption_mode))
00319         page_res_it.word ()->reject_map.rej_word_tess_failure ();
00320       //FAKE PERM REJ
00321       else {
00322         wordstr = &(page_res_it.word ()->best_choice->string ());
00323         /* Override rejection mechanisms for this word */
00324         text = wordstr->string ();
00325         for (i = 0; text[i] != '\0'; i++) {
00326           if ((text[i] != ' ')
00327             && page_res_it.word ()->reject_map[i].rejected ())
00328             page_res_it.word ()->reject_map[i].
00329               setrej_minimal_rej_accept();
00330         }
00331       }
00332     }
00333 
00334     if ((tessedit_cluster_adapt_after_pass1
00335       || tessedit_cluster_adapt_after_pass3
00336       || tessedit_cluster_adapt_before_pass1)
00337     && tessedit_cluster_adaption_mode != 0) {
00338       collect_characters_for_adaption (page_res_it.word (),
00339         &char_clusters, &chars_waiting);
00340     }
00341     // Count dict words.
00342     if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
00343       ++dict_words;
00344     page_res_it.forward ();
00345   }
00346 
00347   if (tessedit_cluster_adapt_before_pass1)
00348     tessedit_tess_adaption_mode.set_value (tess_adapt_mode);
00349 
00350   page_res_it.restart_page ();
00351   while ((tessedit_cluster_adapt_after_pass1
00352     || tessedit_cluster_adapt_before_pass1)
00353   && page_res_it.word () != NULL) {
00354     if (monitor != NULL)
00355       monitor->ocr_alive = TRUE;
00356     if (tessedit_cluster_adapt_after_pass1)
00357       adapt_to_good_samples (page_res_it.word (),
00358         &char_clusters, &chars_waiting);
00359     else
00360       classify_word_pass1 (page_res_it.word (),
00361         page_res_it.row ()->row,
00362         TRUE, &char_clusters, &chars_waiting);
00363 
00364     page_res_it.forward ();
00365   }
00366 
00367   /* Pass 2 */
00368   page_res_it.restart_page ();
00369   word_index = 0;
00370   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
00371     set_global_loc_code(LOC_PASS2);
00372     word_index++;
00373     if (monitor != NULL) {
00374       monitor->ocr_alive = TRUE;
00375       monitor->progress = 80 + 10 * word_index / word_count;
00376       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00377           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00378                                                          dict_words)))
00379         return;
00380     }
00381     classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row);
00382 
00383     if (tessedit_em_adaption_mode > 0)
00384       collect_ems_for_adaption (page_res_it.word (),
00385         &em_clusters, &ems_waiting);
00386 
00387     if (tessedit_cluster_adapt_after_pass2
00388       && tessedit_cluster_adaption_mode != 0)
00389       collect_characters_for_adaption (page_res_it.word (),
00390         &char_clusters, &chars_waiting);
00391     page_res_it.forward ();
00392   }
00393 
00394   /* Another pass */
00395   set_global_loc_code(LOC_FUZZY_SPACE);
00396 
00397   if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00398     && !tessedit_word_for_word)
00399     fix_fuzzy_spaces(monitor, word_count, page_res);
00400 
00401   if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
00402                                  // Initially ems only
00403     print_em_stats(&em_clusters, &ems_waiting);
00404 
00405   /* Pass 3 - used for checking confusion sets */
00406   page_res_it.restart_page ();
00407   word_index = 0;
00408   while (!tessedit_test_adaption && page_res_it.word () != NULL) {
00409     set_global_loc_code(LOC_MM_ADAPT);
00410     word_index++;
00411     if (monitor != NULL) {
00412       monitor->ocr_alive = TRUE;
00413       monitor->progress = 95 + 5 * word_index / word_count;
00414     }
00415     check_debug_pt (page_res_it.word (), 70);
00416     /* Use good matches to sort out confusions */
00417 
00418     if (tessedit_em_adaption_mode != 0)
00419       adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
00420 
00421     if (tessedit_cluster_adapt_after_pass2
00422       && tessedit_cluster_adaption_mode != 0)
00423       adapt_to_good_samples (page_res_it.word (),
00424         &char_clusters, &chars_waiting);
00425 
00426     if (tessedit_reject_fullstops
00427       && strchr (page_res_it.word ()->best_choice->string ().string (),
00428       '.') != NULL)
00429       reject_all_fullstops (page_res_it.word ());
00430     else if (tessedit_reject_suspect_fullstops
00431       && strchr (page_res_it.word ()->best_choice->string ().
00432       string (), '.') != NULL)
00433       reject_suspect_fullstops (page_res_it.word ());
00434 
00435     page_res_it.rej_stat_word ();
00436     chars_in_word = page_res_it.word ()->reject_map.length ();
00437     rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
00438 
00439     blob_quality = word_blob_quality (page_res_it.word (),
00440       page_res_it.row ()->row);
00441     doc_blob_quality += blob_quality;
00442     outline_errs = word_outline_errs (page_res_it.word ());
00443     doc_outline_errs += outline_errs;
00444     word_char_quality (page_res_it.word (),
00445       page_res_it.row ()->row,
00446       &all_char_quality, &accepted_all_char_quality);
00447     doc_char_quality += all_char_quality;
00448     permuter_type = page_res_it.word ()->best_choice->permuter ();
00449     if ((permuter_type == SYSTEM_DAWG_PERM) ||
00450       (permuter_type == FREQ_DAWG_PERM) ||
00451     (permuter_type == USER_DAWG_PERM)) {
00452       good_char_count += chars_in_word - rejects_in_word;
00453       doc_good_char_quality += accepted_all_char_quality;
00454     }
00455     check_debug_pt (page_res_it.word (), 80);
00456     if (tessedit_reject_bad_qual_wds &&
00457       (blob_quality == 0) && (outline_errs >= chars_in_word))
00458       page_res_it.word ()->reject_map.rej_word_bad_quality ();
00459     check_debug_pt (page_res_it.word (), 90);
00460     page_res_it.forward ();
00461   }
00462 
00463   page_res_it.restart_page ();
00464   while (!tessedit_test_adaption
00465   && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
00466     if (monitor != NULL)
00467       monitor->ocr_alive = TRUE;
00468     if (tessedit_cluster_adaption_mode != 0)
00469       adapt_to_good_samples (page_res_it.word (),
00470         &char_clusters, &chars_waiting);
00471     page_res_it.forward ();
00472   }
00473 
00474   #ifndef SECURE_NAMES
00475   if (tessedit_debug_quality_metrics) {
00476     tprintf
00477       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00478       page_res->char_count, page_res->rej_count,
00479       page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
00480       doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
00481       doc_outline_errs / (float) page_res->char_count, doc_char_quality,
00482       doc_char_quality / (float) page_res->char_count,
00483       doc_good_char_quality,
00484       good_char_count >
00485       0 ? doc_good_char_quality / (float) good_char_count : 0.0);
00486   }
00487   #endif
00488   good_quality_doc =
00489     (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
00490     &&
00491     (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
00492     (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
00493     (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
00494 
00495   /* Do whole document or whole block rejection pass*/
00496 
00497   if (!tessedit_test_adaption) {
00498     set_global_loc_code(LOC_DOC_BLK_REJ);
00499     quality_based_rejection(page_res_it, good_quality_doc);
00500   }
00501   font_recognition_pass(page_res_it);
00502 
00503   /* Write results pass */
00504   set_global_loc_code(LOC_WRITE_RESULTS);
00505   // This is now redundant, but retained commented so show how to obtain
00506   // bounding boxes and style information.
00507   // output_pass (page_res_it, false);
00508 }
00509 
00510 
00511 /* ================== */
00517 void classify_word_pass1(                 //recog one word
00518                          WERD_RES *word,  //word to do
00519                          ROW *row,
00520                          BOOL8 cluster_adapt,
00521                          CHAR_SAMPLES_LIST *char_clusters,
00522                          CHAR_SAMPLE_LIST *chars_waiting) {
00523   WERD *bln_word;                //baseline norm copy
00524                                  //detailed results
00525   BLOB_CHOICE_LIST_CLIST blob_choices;
00526   BOOL8 adapt_ok;
00527   const char *rejmap;
00528   INT16 index;
00529   STRING mapstr = "";
00530   char *match_string;
00531   char word_string[1024];
00532 
00533   if (matcher_fp != NULL) {
00534     fgets (word_string, 1023, correct_fp);
00535     if ((match_string = strchr (word_string, '\r')) != NULL)
00536       *match_string = '\0';
00537     if ((match_string = strchr (word_string, '\n')) != NULL)
00538       *match_string = '\0';
00539     if (word_string[0] != '\0') {
00540       word->word->set_text (word_string);
00541       word_answer = (char *) word->word->text ();
00542     }
00543     else
00544       word_answer = NULL;
00545   }
00546 
00547   check_debug_pt (word, 0);
00548   matcher_pass = 0;
00549   bln_word = make_bln_copy (word->word, row, row->x_height (), &word->denorm);
00550 
00551   word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
00552     tess_default_matcher,
00553     word->raw_choice, &blob_choices,
00554     word->outword);
00555 
00561   if ((word->best_choice->string ().length () == 0) ||
00562     (strspn (word->best_choice->string ().string (), " ") ==
00563   word->best_choice->string ().length ())) {
00564     word->done = FALSE;          //Try again on pass2 - adaption may help
00565     word->tess_failed = TRUE;
00566     word->reject_map.initialise (word->best_choice->string ().length ());
00567     word->reject_map.rej_word_tess_failure ();
00568   }
00569   else {
00570     word->tess_failed = FALSE;
00571     if ((word->best_choice->string ().length () !=
00572       word->outword->blob_list ()->length ()) ||
00573     (word->best_choice->string ().length () != blob_choices.length ())) {
00574       tprintf
00575         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00576         word->best_choice->string ().string (),
00577         word->best_choice->string ().length (),
00578         word->outword->blob_list ()->length (), blob_choices.length ());
00579     }
00580     ASSERT_HOST (word->best_choice->string ().length () ==
00581       word->outword->blob_list ()->length ());
00582     ASSERT_HOST (word->best_choice->string ().length () ==
00583       blob_choices.length ());
00584 
00585     /*
00586        The adaption step used to be here. It has been moved to after
00587        make_reject_map so that we know whether the word will be accepted in the
00588        first pass or not.   This move will PREVENT adaption to words containing
00589        double quotes because the word will not be identical to what tess thinks
00590        its best choice is. (See CurrentBestChoiceIs which is used by
00591       AdaptableWord)
00592      */
00593 
00594     if (word->word->flag (W_REP_CHAR)) {
00595       fix_rep_char(word);
00596     }
00597     else {
00598       fix_quotes ((char *) word->best_choice->string ().string (),
00599       //turn to double
00600         word->outword, &blob_choices);
00601       if (tessedit_fix_hyphens)
00602         //turn 2 to 1
00603         fix_hyphens ((char *) word->best_choice->string ().string (),
00604          word->outword, &blob_choices);
00605       record_certainty (word->best_choice->certainty (), 1); //accounting
00606 
00607       word->tess_accepted = tess_acceptable_word (word->best_choice,
00608         word->raw_choice);
00609 
00610       word->tess_would_adapt = tess_adaptable_word (word->outword,
00611         word->best_choice,
00612         word->raw_choice);
00613       // Also sets word->done flag
00614       make_reject_map (word, &blob_choices, row, 1);
00615 
00616       adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
00617 
00618       if (cluster_adapt)
00619         adapt_to_good_samples(word, char_clusters, chars_waiting);
00620 
00621       if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
00622         if (!tessedit_tess_adapt_to_rejmap)
00623           rejmap = NULL;
00624         else {
00625           ASSERT_HOST (word->reject_map.length () ==
00626             word->best_choice->string ().length ());
00627 
00628           for (index = 0; index < word->reject_map.length (); index++) {
00629             if (adapt_ok || word->reject_map[index].accepted ())
00630               mapstr += '1';
00631             else
00632               mapstr += '0';
00633           }
00634           rejmap = mapstr.string ();
00635         }
00636 
00637         //adapt to it
00638         tess_adapter (word->outword, &word->denorm,
00639          word->best_choice->string ().string (),
00640          word->raw_choice->string ().string (), rejmap);
00641       }
00642 
00643       if (tessedit_enable_doc_dict)
00644         tess_add_doc_word (word->best_choice);
00645       set_word_fonts(word, &blob_choices);
00646     }
00647   }
00648   if (tessedit_print_text) {
00649     write_cooked_text (bln_word, word->best_choice->string (),
00650       word->done, FALSE, stdout);
00651   }
00652   delete bln_word;
00653   blob_choices.deep_clear ();
00654 }
00655 
00656 
00657 /* ================== */
00661 void classify_word_pass2(  //word to do
00662                          WERD_RES *word,
00663                          ROW *row) {
00664   BOOL8 done_this_pass = FALSE;
00665   WERD_RES new_x_ht_word (word->word);
00666   float new_x_ht = 0.0;
00667   INT16 old_xht_reject_count;
00668   INT16 new_xht_reject_count;
00669   INT16 old_xht_accept_count;
00670   INT16 new_xht_accept_count;
00671   BOOL8 accept_new_x_ht = FALSE;
00672   INT16 old_chs_in_wd;
00673   INT16 new_chs_in_wd;
00674   INT16 old_word_quality;
00675   INT16 new_word_quality;
00676   INT16 dummy;
00677 
00678   set_global_subloc_code(SUBLOC_NORM);
00679   check_debug_pt (word, 30);
00680   if (!word->done ||
00681     tessedit_training_tess ||
00682   tessedit_training_wiseowl || tessedit_dump_choices) {
00683     word->x_height = row->x_height ();
00684     word->caps_height = 0.0;
00685     if (word->outword != NULL) {
00686       delete word->outword;      //get rid of junk
00687       delete word->best_choice;
00688       delete word->raw_choice;
00689     }
00690     match_word_pass2 (word, row, row->x_height ());
00691     done_this_pass = TRUE;
00692     check_debug_pt (word, 40);
00693   }
00694 
00695   if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
00696     set_global_subloc_code(SUBLOC_FIX_XHT);
00697     if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
00698       (tessedit_xht_fiddles_on_no_rej_wds ||
00699     (word->reject_map.reject_count () > 0))) {
00700       if ((x_ht_check_word_occ >= 2) && word_occ_first)
00701         check_block_occ(word);
00702 
00703       if (tessedit_redo_xheight)
00704         re_estimate_x_ht(word, &new_x_ht);
00705 
00706       if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
00707         ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
00708         check_block_occ(word);
00709     }
00710     if (new_x_ht > 0) {
00711       old_chs_in_wd = word->reject_map.length ();
00712 
00713       /* Re-estimated x_ht error suggests a rematch is worthwhile. */
00714       new_x_ht_word.x_height = new_x_ht;
00715       new_x_ht_word.caps_height = 0.0;
00716       match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
00717       if (!new_x_ht_word.tess_failed) {
00718         if ((x_ht_check_word_occ >= 1) && word_occ_first)
00719           check_block_occ(&new_x_ht_word);
00720 
00721         re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
00722 
00723         if ((x_ht_check_word_occ >= 1) && !word_occ_first)
00724           check_block_occ(&new_x_ht_word);
00725 
00726         old_xht_reject_count = word->reject_map.reject_count ();
00727         old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
00728         new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
00729         new_chs_in_wd = new_x_ht_word.reject_map.length ();
00730         new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
00731         accept_new_x_ht =
00732           ((new_xht_accept_count > old_xht_accept_count) ||
00733           ((new_xht_accept_count == old_xht_accept_count) &&
00734           (new_xht_accept_count > 0))) &&
00735           (!new_x_ht_word.guessed_x_ht ||
00736           !new_x_ht_word.guessed_caps_ht);
00737 
00738         if (accept_new_x_ht && x_ht_quality_check) {
00739           word_char_quality(word, row, &old_word_quality, &dummy);
00740           word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
00741           if (old_word_quality > new_word_quality)
00742             accept_new_x_ht = FALSE;
00743         }
00744 
00745         if (accept_new_x_ht && (x_ht_stringency > 0)) {
00746           accept_new_x_ht =
00747             (count_alphanums (&new_x_ht_word) > x_ht_stringency);
00748           if (!accept_new_x_ht && rej_use_xht) {
00749             if (debug_x_ht_level >= 1)
00750               tprintf
00751                 ("Failed stringency test so reject original word\n");
00752             word->reject_map.rej_word_xht_fixup ();
00753           }
00754         }
00755 
00756         #ifndef SECURE_NAMES
00757         if (debug_x_ht_level >= 1) {
00758           tprintf ("New XHT Match:: %s ",
00759             word->best_choice->string ().string ());
00760           word->reject_map.print (debug_fp);
00761           tprintf (" -> %s ",
00762             new_x_ht_word.best_choice->string ().string ());
00763           new_x_ht_word.reject_map.print (debug_fp);
00764           tprintf (" %s->%s %s %s\n",
00765             word->guessed_x_ht ? "GUESS" : "CERT",
00766             new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
00767             new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
00768             accept_new_x_ht ? "ACCEPTED" : "");
00769         }
00770         #endif
00771       }
00772       if (accept_new_x_ht) {
00773         /*
00774            The new x_ht is deemed superior so put the final results in the real word
00775            and destroy the old results
00776          */
00777         delete word->outword;    //get rid of junk
00778         word->outword = new_x_ht_word.outword;
00779         word->denorm = new_x_ht_word.denorm;
00780         delete word->best_choice;
00781         word->best_choice = new_x_ht_word.best_choice;
00782         delete word->raw_choice;
00783         word->raw_choice = new_x_ht_word.raw_choice;
00784         word->reject_map = new_x_ht_word.reject_map;
00785         word->done = new_x_ht_word.done;
00786         done_this_pass = TRUE;
00787       }
00788       else {
00789       /*
00790          The new x_ht is no better, so destroy the copy word and put any uncertain
00791          x or cap ht estimate back to default. (I.e. dont blame me if its bad!)
00792          Conditionally, use any ammended block occ chars.
00793        */
00794         //get rid of junk
00795         delete new_x_ht_word.outword;
00796         delete new_x_ht_word.best_choice;
00797         delete new_x_ht_word.raw_choice;
00798       }
00799       new_x_ht_word.outword = NULL; //to keep new destructor happy
00800       new_x_ht_word.best_choice = NULL; //to keep new destructor happy
00801       new_x_ht_word.raw_choice = NULL; //to keep new destructor happy
00802 
00803       if (rej_mostly_reject_mode == 2) {
00804         reject_mostly_rejects(word);
00805         tprintf ("Rejecting mostly rejects on %s ",
00806           word->best_choice->string ().string ());
00807       }
00808     }
00809 
00810     set_global_subloc_code(SUBLOC_NORM);
00811 
00812     if (done_this_pass && !word->done && tessedit_save_stats)
00813       SaveBadWord (word->best_choice->string ().string (),
00814         word->best_choice->certainty ());
00815     record_certainty (word->best_choice->certainty (), 2); //accounting
00816   }
00817 #ifndef GRAPHICS_DISABLED
00818   if (tessedit_draw_outwords) {
00819     if (fx_win == NO_WINDOW)
00820       create_fx_win();
00821     clear_fx_win();
00822     word->outword->plot (fx_win);
00823     make_picture_current(fx_win);
00824   }
00825 #endif
00826 
00827   set_global_subloc_code(SUBLOC_NORM);
00828   if (tessedit_print_text) {
00829     write_cooked_text (word->outword, word->best_choice->string (),
00830       word->done, done_this_pass, stdout);
00831   }
00832   check_debug_pt (word, 50);
00833 }
00834 
00835 
00836 /* ================== */
00840 void match_word_pass2(                 //recog one word
00841                       WERD_RES *word,  //word to do
00842                       ROW *row,
00843                       float x_height) {
00844   WERD *bln_word;                //baseline norm copy
00845                                  //detailed results
00846   BLOB_CHOICE_LIST_CLIST blob_choices;
00847 
00848   set_global_subsubloc_code(SUBSUBLOC_OTHER);
00849   if (matcher_fp != NULL) {
00850     word_answer = (char *) word->word->text ();
00851     if (word_answer != NULL && word_answer[0] == '\0')
00852       word_answer = NULL;
00853   }
00854   matcher_pass = 0;
00855   bln_word = make_bln_copy (word->word, row, x_height, &word->denorm);
00856   set_global_subsubloc_code(SUBSUBLOC_TESS);
00857   if (tessedit_training_tess)
00858     word->best_choice = correct_segment_pass2 (bln_word,
00859       &word->denorm,
00860       tess_default_matcher,
00861       tess_training_tester,
00862       word->raw_choice,
00863       &blob_choices, word->outword);
00864   else if (tessedit_dump_choices)
00865     word->best_choice = test_segment_pass2 (bln_word,
00866         &word->denorm,
00867         tess_default_matcher,
00868         choice_dump_tester,
00869         word->raw_choice,
00870         &blob_choices, word->outword);
00871   //      else if (tessedit_training_wiseowl)
00872   //              best_choice=correct_segment_pass2( word, &denorm,
00873   //                                                                                                        tess_default_matcher,wo_learn,
00874   //                                                                                                        raw_choice,&blob_choices,outword);
00875   //      else if (tessedit_matcher_is_wiseowl)
00876   //              best_choice=tess_segment_pass2( word, &denorm, wo_classify,
00877   //                                                                                                raw_choice, &blob_choices, outword);
00878   else {
00879     word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
00880       tess_default_matcher,
00881       word->raw_choice, &blob_choices,
00882       word->outword);
00883   }
00884   set_global_subsubloc_code(SUBSUBLOC_OTHER);
00885   /*
00886      Test for TESS screw up on word. Recog_word has already ensured that the
00887      choice list, outword blob lists and best_choice string are the same
00888      length. A TESS screw up is indicated by a blank filled or 0 length string.
00889    */
00890   if ((word->best_choice->string ().length () == 0) ||
00891     (strspn (word->best_choice->string ().string (), " ") ==
00892   word->best_choice->string ().length ())) {
00893     word->tess_failed = TRUE;
00894     word->reject_map.initialise (word->best_choice->string ().length ());
00895     word->reject_map.rej_word_tess_failure ();
00896     //              tprintf("Empty word produced\n");
00897   }
00898   else {
00899     if ((word->best_choice->string ().length () !=
00900       word->outword->blob_list ()->length ()) ||
00901     (word->best_choice->string ().length () != blob_choices.length ())) {
00902       tprintf
00903         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00904         word->best_choice->string ().string (),
00905         word->best_choice->string ().length (),
00906         word->outword->blob_list ()->length (), blob_choices.length ());
00907     }
00908     ASSERT_HOST (word->best_choice->string ().length () ==
00909       word->outword->blob_list ()->length ());
00910     ASSERT_HOST (word->best_choice->string ().length () ==
00911       blob_choices.length ());
00912 
00913     word->tess_failed = FALSE;
00914     if (word->word->flag (W_REP_CHAR)) {
00915       fix_rep_char(word);
00916     }
00917     else {
00918       fix_quotes ((char *) word->best_choice->string ().string (),
00919         word->outword, &blob_choices);
00920       if (tessedit_fix_hyphens)
00921         fix_hyphens ((char *) word->best_choice->string ().string (),
00922           word->outword, &blob_choices);
00923       /* Dont trust fix_quotes! - though I think I've fixed the bug */
00924       if ((word->best_choice->string ().length () !=
00925         word->outword->blob_list ()->length ()) ||
00926         (word->best_choice->string ().length () !=
00927       blob_choices.length ())) {
00928         #ifndef SECURE_NAMES
00929         tprintf
00930           ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00931           word->best_choice->string ().string (),
00932           word->best_choice->string ().length (),
00933           word->outword->blob_list ()->length (),
00934           blob_choices.length ());
00935         #endif
00936 
00937       }
00938       ASSERT_HOST (word->best_choice->string ().length () ==
00939         word->outword->blob_list ()->length ());
00940       ASSERT_HOST (word->best_choice->string ().length () ==
00941         blob_choices.length ());
00942 
00943       word->tess_accepted = tess_acceptable_word (word->best_choice,
00944         word->raw_choice);
00945 
00946       make_reject_map (word, &blob_choices, row, 2);
00947     }
00948   }
00949   blob_choices.deep_clear ();
00950   delete bln_word;
00951   assert (word->raw_choice != NULL);
00952 }
00953 
00954 
00955 /* ================== */
00965 void fix_rep_char( 
00966                   WERD_RES *word 
00967                  ) {
00968   struct REP_CH
00969   {
00970     char ch;
00971     int count;
00972   };
00973 
00974   REP_CH *rep_ch;                //array of char counts
00975   int word_len;
00976   int rep_ch_count = 0;          //how many unique chs
00977   const char *word_str;          //the repeated chs
00978   int i, j;
00979   int total = 0;
00980   int max = 0;
00981   char maxch = ' ';              //Most common char
00982 
00983   word_str = word->best_choice->string ().string ();
00984   word_len = strlen (word_str);
00985   rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH));
00986   for (i = 0; i < word_len; i++) {
00987     for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++);
00988     if (j < rep_ch_count)
00989       rep_ch[j].count++;
00990     else {
00991       rep_ch[rep_ch_count].ch = word_str[i];
00992       rep_ch[rep_ch_count].count = 1;
00993       rep_ch_count++;
00994     }
00995   }
00996 
00997   for (j = 0; j < rep_ch_count; j++) {
00998     total += rep_ch[j].count;
00999     if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) {
01000       max = rep_ch[j].count;
01001       maxch = rep_ch[j].ch;
01002     }
01003   }
01004   //      tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
01005   //                        word_str, word_len, total, maxch );
01006   free_mem(rep_ch);
01007 
01008   word->reject_map.initialise (word_len);
01009   for (i = 0; i < word_len; i++) {
01010     if (word_str[i] != maxch)
01011                                  //rej unrecognised blobs
01012       word->reject_map[i].setrej_bad_repetition ();
01013   }
01014   word->done = TRUE;
01015 }
01016 
01017 
01018 /* ================== */
01022 void fix_quotes(               //make double quotes
01023                 char *string,  //string to fix
01024                 WERD *word,    //word to do //char choices
01025                 BLOB_CHOICE_LIST_CLIST *blob_choices) {
01026   char *ptr;                     //string ptr
01027                                  //blobs
01028   PBLOB_IT blob_it = word->blob_list ();
01029                                  //choices
01030   BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01031   BLOB_CHOICE_IT it1;            //first choices
01032   BLOB_CHOICE_IT it2;            //second choices
01033 
01034   for (ptr = string;
01035   *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
01036     if ((*ptr == '\'' || *ptr == '`')
01037     && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) {
01038       *ptr = '"';                //turn to double
01039       strcpy (ptr + 1, ptr + 2); //shuffle up
01040       merge_blobs (blob_it.data (), blob_it.data_relative (1));
01041       blob_it.forward ();
01042       delete blob_it.extract (); //get rid of spare
01043 
01044       it1.set_to_list (choice_it.data ());
01045       it2.set_to_list (choice_it.data_relative (1));
01046       if (it1.data ()->certainty () < it2.data ()->certainty ()) {
01047         choice_it.forward ();
01048                                  //get rid of spare
01049         delete choice_it.extract ();
01050       }
01051       else {
01052                                  //get rid of spare
01053         delete choice_it.extract ();
01054         choice_it.forward ();
01055       }
01056     }
01057   }
01058 }
01059 
01060 
01061 /* ================== */
01067 void fix_hyphens(               //crunch double hyphens
01068                  char *string,  //string to fix
01069                  WERD *word,    //word to do //char choices
01070                  BLOB_CHOICE_LIST_CLIST *blob_choices) {
01071   char *ptr;                     //string ptr
01072                                  //blobs
01073   PBLOB_IT blob_it = word->blob_list ();
01074                                  //choices
01075   BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01076   BLOB_CHOICE_IT it1;            //first choices
01077   BLOB_CHOICE_IT it2;            //second choices
01078 
01079   for (ptr = string;
01080   *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
01081     if ((*ptr == '-' || *ptr == '~') &&
01082       (*(ptr + 1) == '-' || *(ptr + 1) == '~') &&
01083       (blob_it.data ()->bounding_box ().right () >=
01084     blob_it.data_relative (1)->bounding_box ().left ())) {
01085       *ptr = '-';                //turn to single hyphen
01086       strcpy (ptr + 1, ptr + 2); //shuffle up
01087       merge_blobs (blob_it.data (), blob_it.data_relative (1));
01088       blob_it.forward ();
01089       delete blob_it.extract (); //get rid of spare
01090 
01091       it1.set_to_list (choice_it.data ());
01092       it2.set_to_list (choice_it.data_relative (1));
01093       if (it1.data ()->certainty () < it2.data ()->certainty ()) {
01094         choice_it.forward ();
01095                                  //get rid of spare
01096         delete choice_it.extract ();
01097       }
01098       else {
01099                                  //get rid of spare
01100         delete choice_it.extract ();
01101         choice_it.forward ();
01102       }
01103     }
01104   }
01105 }
01106 
01107 
01108 /* ================== */
01114 void merge_blobs(               //combine 2 blobs
01115                  PBLOB *blob1,  //dest blob
01116                  PBLOB *blob2   //source blob
01117                 ) {
01118   OUTLINE_IT outline_it = blob1->out_list ();
01119   //iterator
01120 
01121   outline_it.move_to_last ();    //go to end
01122                                  //do it
01123   outline_it.add_list_after (blob2->out_list ());
01124 }
01125 
01126 
01127 /* ================== */
01134 void choice_dump_tester(                           //dump chars in word
01135                         PBLOB *,                   //blob
01136                         DENORM *,                  //de-normaliser
01137                         BOOL8 correct,             //ly segmented
01138                         char *text,                //correct text
01139                         INT32 count,               //chars in text
01140                         BLOB_CHOICE_LIST *ratings  //list of results
01141                        ) {
01142   STRING choice_file_name;
01143   BLOB_CHOICE *blob_choice;
01144   BLOB_CHOICE_IT it;
01145   char source_chars[20];
01146   char correct_char[3];
01147 
01148   if (choice_file == NULL) {
01149     choice_file_name = imagebasename + ".chc";
01150     if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
01151       CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
01152         choice_file_name.string (), errno);
01153     }
01154   }
01155 
01156   if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
01157     strcpy (source_chars, "$$");
01158     strcpy (correct_char, "$$");
01159   }
01160   else {
01161     strncpy(source_chars, text, count);
01162     source_chars[count] = '\0';
01163     if (correct) {
01164       correct_char[0] = text[0];
01165       correct_char[1] = '\0';
01166     }
01167     else {
01168       strcpy (correct_char, "$$");
01169     }
01170   }
01171   fprintf (choice_file, "%s\t%s", source_chars, correct_char);
01172 
01173   it.set_to_list (ratings);
01174   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
01175     blob_choice = it.data ();
01176     if ((blob_choice->char_class () >= '!') &&
01177       (blob_choice->char_class () <= '~'))
01178       fprintf (choice_file, "\t%c\t%f\t%f",
01179         blob_choice->char_class (),
01180         blob_choice->rating (), blob_choice->certainty ());
01181   }
01182   fprintf (choice_file, "\n");
01183 }
01184 
01185 
01186 /* ================== */
01193 WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {
01194   WERD *result;
01195 
01196   //      if (wordit_linearc && !src_word->flag(W_POLYGON))
01197   //      {
01198   //              larc_word = src_word->larc_copy( row->x_height() );
01199   //              result = larc_word->poly_copy( row->x_height() );
01200   //              delete larc_word;
01201   //      }
01202   // else
01203   result = src_word->poly_copy (row->x_height ());
01204 
01205   //      if (tessedit_draw_words)
01206   //      {
01207   //              if ( la_win == NO_WINDOW )
01208   //                      create_la_win();
01209   //              result->plot( la_win );
01210   //      }
01211   result->baseline_normalise_x (row, x_height, denorm);
01212   return result;
01213 }
01214 
01215 
01216 /* ================== */
01222 ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) {
01223   int i = 0;
01224   int leading_punct_count;
01225   int upper_count = 0;
01226   int hyphen_pos = -1;
01227   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01228 
01229   if (strlen (s) > 20)
01230     return word_type;
01231 
01232   /* Single Leading punctuation char*/
01233 
01234   if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i])))
01235     i++;
01236   leading_punct_count = i;
01237 
01238   /* Initial cap */
01239   while (isupper (s[i])) {
01240     i++;
01241     upper_count++;
01242   }
01243   if (upper_count > 1)
01244     word_type = AC_UPPER_CASE;
01245   else {
01246     /* Lower case word, possibly with an initial cap */
01247     while (islower (s[i])) {
01248       i++;
01249     }
01250     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01251       goto not_a_word;
01252     /*
01253     Allow a single hyphen in a lower case word
01254     - dont trust upper case - I've seen several cases of "H" -> "I-I"
01255     */
01256     if (s[i] == '-') {
01257       hyphen_pos = i++;
01258       if (s[i] != '\0') {
01259         while (islower (s[i])) {
01260           i++;
01261         }
01262         if (i < hyphen_pos + 3)
01263           goto not_a_word;
01264       }
01265     }
01266     else {
01267       /* Allow "'s" in NON hyphenated lower case words */
01268       if ((s[i] == '\'') && (s[i + 1] == 's'))
01269         i += 2;
01270     }
01271     if (upper_count > 0)
01272       word_type = AC_INITIAL_CAP;
01273     else
01274       word_type = AC_LOWER_CASE;
01275   }
01276 
01277   /* Up to two different, constrained trailing punctuation chars */
01278   if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i])))
01279     i++;
01280   if ((s[i] != '\0') &&
01281     (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i])))
01282     i++;
01283 
01284   if (s[i] != '\0')
01285     word_type = AC_UNACCEPTABLE;
01286 
01287   not_a_word:
01288 
01289   if (word_type == AC_UNACCEPTABLE) {
01290     /* Look for abbreviation string */
01291     i = 0;
01292     if (isupper (s[0])) {
01293       word_type = AC_UC_ABBREV;
01294       while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.'))
01295         i += 2;
01296     }
01297     else if (islower (s[0])) {
01298       word_type = AC_LC_ABBREV;
01299       while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.'))
01300         i += 2;
01301     }
01302     if (s[i] != '\0')
01303       word_type = AC_UNACCEPTABLE;
01304   }
01305 
01306   return word_type;
01307 }
01308 
01309 
01310 /* ================== */
01314 BOOL8 check_debug_pt(WERD_RES *word, int location) {
01315   BOOL8 show_map_detail = FALSE;
01316   INT16 i;
01317 
01318   #ifndef SECURE_NAMES
01319   if (!test_pt)
01320     return FALSE;
01321 
01322   tessedit_rejection_debug.set_value (FALSE);
01323   debug_x_ht_level.set_value (0);
01324   tessedit_cluster_debug.set_value (FALSE);
01325   nn_debug.set_value (FALSE);
01326   nn_reject_debug.set_value (FALSE);
01327 
01328   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01329     if (location < 0)
01330       return TRUE;               //For breakpoint use
01331     tessedit_rejection_debug.set_value (TRUE);
01332     debug_x_ht_level.set_value (20);
01333     tessedit_cluster_debug.set_value (TRUE);
01334     nn_debug.set_value (TRUE);
01335     nn_reject_debug.set_value (TRUE);
01336     tprintf ("\n\nTESTWD::");
01337     switch (location) {
01338       case 0:
01339         tprintf ("classify_word_pass1 start\n");
01340         word->word->print (debug_fp);
01341         break;
01342       case 10:
01343         tprintf ("make_reject_map: initial map");
01344         break;
01345       case 20:
01346         tprintf ("make_reject_map: after NN");
01347         break;
01348       case 30:
01349         tprintf ("classify_word_pass2 - START");
01350         break;
01351       case 40:
01352         tprintf ("classify_word_pass2 - Pre Xht");
01353         break;
01354       case 50:
01355         tprintf ("classify_word_pass2 - END");
01356         show_map_detail = TRUE;
01357         break;
01358       case 60:
01359         tprintf ("fixspace");
01360         break;
01361       case 70:
01362         tprintf ("MM pass START");
01363         break;
01364       case 80:
01365         tprintf ("MM pass END");
01366         break;
01367       case 90:
01368         tprintf ("After Poor quality rejection");
01369         break;
01370       case 100:
01371         tprintf ("unrej_good_quality_words - START");
01372         break;
01373       case 110:
01374         tprintf ("unrej_good_quality_words - END");
01375         break;
01376       case 120:
01377         tprintf ("Write results pass");
01378         show_map_detail = TRUE;
01379         break;
01380     }
01381     tprintf (" \"%s\" ", word->best_choice->string ().string ());
01382     word->reject_map.print (debug_fp);
01383     tprintf ("\n");
01384     if (show_map_detail) {
01385       tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01386       for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01387         tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01388         word->reject_map[i].full_print (debug_fp);
01389       }
01390     }
01391 
01392     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01393     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01394     return TRUE;
01395   }
01396   else
01397   #endif
01398     return FALSE;
01399 }
01400 
01401 
01402 /* ================== */
01406 void set_word_fonts(                 //good chars in word
01407                     WERD_RES *word,  //word to adapt to //detailed results
01408                     BLOB_CHOICE_LIST_CLIST *blob_choices) {
01409   INT32 index;                   //char index
01410   char choice_char;              //char from word
01411   INT8 config;                   //font of char
01412                                  //character iterator
01413   BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
01414   BLOB_CHOICE_IT choice_it;      //choice iterator
01415   STATS fonts (0, 32);           //font counters
01416   static INT8 italic_table[32] = {
01417     1, -1, 1, -1,
01418     1, -1, 1, -1,
01419     1, -1, 1, -1,
01420     1, -1, 1, -1,
01421     1, -1, 1, -1,
01422     1, -1, 1, -1,
01423     1, -1, 1, -1,
01424     1, -1, 1, -1
01425   };
01426   static INT8 bold_table[32] = {
01427     1, 1, -1, -1,
01428     1, 1, -1, -1,
01429     1, 1, -1, -1,
01430     1, 1, -1, -1,
01431     1, 1, -1, -1,
01432     1, 1, -1, -1,
01433     1, 1, -1, -1,
01434     1, 1, -1, -1
01435   };
01436   static INT8 font_table[32] = {
01437     2, 2, 2, 2,
01438     -1, -1, -1, -1,
01439     0, 0, 0, 0,
01440     1, 1, 1, 1,
01441     3, 3, 3, 3,
01442     4, 4, 4, 4,
01443     5, 5, 5, 5,
01444     2, 2, 2, 2
01445   };
01446 
01447   word->italic = 0;
01448   word->bold = 0;
01449   for (char_it.mark_cycle_pt (), index = 0;
01450   !char_it.cycled_list (); char_it.forward (), index++) {
01451     choice_char = word->best_choice->string ()[index];
01452     choice_it.set_to_list (char_it.data ());
01453     for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
01454     choice_it.forward ()) {
01455       if (choice_it.data ()->char_class () == choice_char) {
01456         config = choice_it.data ()->config ();
01457         if (tessedit_debug_fonts)
01458           tprintf ("%c(%d=%d%c%c)",
01459             choice_char, config, (config & 31) >> 2,
01460             config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
01461         if (config != -1) {
01462           config &= 31;
01463           word->italic += italic_table[config];
01464           word->bold += bold_table[config];
01465           if (font_table[config] != -1)
01466             fonts.add (font_table[config], 1);
01467         }
01468         break;
01469       }
01470     }
01471   }
01472   find_modal_font (&fonts, &word->font1, &word->font1_count);
01473   find_modal_font (&fonts, &word->font2, &word->font2_count);
01474   if (tessedit_debug_fonts)
01475     tprintf ("\n");
01476   /*  if (word->font1_count>0)
01477     {
01478       for (char_it.mark_cycle_pt(),index=0;
01479       !char_it.cycled_list();char_it.forward(),index++)
01480       {
01481         choice_char=word->best_choice->string()[index];
01482         choice_it.set_to_list(char_it.data());
01483         for (choice_it.mark_cycle_pt();!choice_it.cycled_list();choice_it.forward())
01484         {
01485           if (choice_it.data()->char_class()==choice_char)
01486           {
01487             config=choice_it.data()->config();
01488             if (config!=-1 && font_table[config&31]==word->font1)
01489             {
01490               word->italic+=italic_table[config];
01491               word->bold+=bold_table[config];
01492             }
01493             break;
01494           }
01495         }
01496       }
01497     }*/
01498 }
01499 
01500 
01501 /* ================== */
01505 void font_recognition_pass(  //good chars in word
01506                            PAGE_RES_IT &page_res_it) {
01507   INT32 length;                  //of word
01508   INT32 count;                   //of a feature
01509   INT8 doc_font;                 //modal font
01510   INT8 doc_font_count;           //modal font
01511   INT32 doc_italic;              //total italics
01512   INT32 doc_bold;                //total bolds
01513   ROW_RES *row = NULL;           //current row
01514   WERD_RES *word;                //current word
01515   STATS fonts (0, 32);           //font counters
01516   STATS doc_fonts (0, 32);       //font counters
01517 
01518   doc_italic = 0;
01519   doc_bold = 0;
01520   page_res_it.restart_page ();
01521   while (page_res_it.word () != NULL) {
01522     if (row != page_res_it.row ()) {
01523       if (row != NULL) {
01524         find_modal_font (&fonts, &row->font1, &row->font1_count);
01525         find_modal_font (&fonts, &row->font2, &row->font2_count);
01526       }
01527       row = page_res_it.row ();  //current row
01528       fonts.clear ();            //clear counters
01529       row->italic = 0;
01530       row->bold = 0;
01531     }
01532     word = page_res_it.word ();
01533     row->italic += word->italic;
01534     row->bold += word->bold;
01535     fonts.add (word->font1, word->font1_count);
01536     fonts.add (word->font2, word->font2_count);
01537     doc_italic += word->italic;
01538     doc_bold += word->bold;
01539     doc_fonts.add (word->font1, word->font1_count);
01540     doc_fonts.add (word->font2, word->font2_count);
01541     page_res_it.forward ();
01542   }
01543   if (row != NULL) {
01544     find_modal_font (&fonts, &row->font1, &row->font1_count);
01545     find_modal_font (&fonts, &row->font2, &row->font2_count);
01546   }
01547   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
01548   /*
01549     row=NULL;
01550     page_res_it.restart_page();
01551     while (page_res_it.word() != NULL)
01552     {
01553       if (row!=page_res_it.row())
01554       {
01555         row2=row;
01556         row=page_res_it.row();
01557         if (row->font1_count<MIN_FONT_ROW_COUNT)
01558         {
01559           fonts.clear();
01560           italic=0;
01561           bold=0;
01562           add_in_one_row(row,&fonts,&italic,&bold);
01563           if (row2!=NULL)
01564           {
01565             hdiff=row->row->x_height()-row2->row->x_height();
01566             if (hdiff<0)
01567               hdiff=-hdiff;
01568             if (hdiff<MAX_XHEIGHT_DIFF)
01569               add_in_one_row(row2,&fonts,&italic,&bold);
01570           }
01571           do
01572             page_res_it.forward();
01573           while (page_res_it.row()==row);
01574           row2=page_res_it.row();
01575           if (row2!=NULL)
01576           {
01577             hdiff=row->row->x_height()-row2->row->x_height();
01578             if (hdiff<0)
01579               hdiff=-hdiff;
01580             if (hdiff<MAX_XHEIGHT_DIFF)
01581               add_in_one_row(row2,&fonts,&italic,&bold);
01582           }
01583           row->italic=italic;
01584           row->bold=bold;
01585           find_modal_font(&fonts,&row->font1,&row->font1_count);
01586           find_modal_font(&fonts,&row->font2,&row->font2_count);
01587         }
01588         else
01589           page_res_it.forward();
01590       }
01591       else
01592         page_res_it.forward();
01593     }*/
01594 
01595   page_res_it.restart_page ();
01596   while (page_res_it.word () != NULL) {
01597     row = page_res_it.row ();    //current row
01598     word = page_res_it.word ();
01599     length = word->best_choice->string ().length ();
01600 
01601     count = word->italic;
01602     if (count < 0)
01603       count = -count;
01604     if (!(count == length || length > 3 && count >= length * 3 / 4))
01605       word->italic = doc_italic > 0 ? 1 : -1;
01606 
01607     count = word->bold;
01608     if (count < 0)
01609       count = -count;
01610     if (!(count == length || length > 3 && count >= length * 3 / 4))
01611       word->bold = doc_bold > 0 ? 1 : -1;
01612 
01613     count = word->font1_count;
01614     if (!(count == length || length > 3 && count >= length * 3 / 4)) {
01615       word->font1 = doc_font;
01616       word->font1_count = doc_font_count;
01617     }
01618 
01619     page_res_it.forward ();
01620   }
01621 }
01622 
01623 
01624 /* ================== */
01628 void add_in_one_row(               //good chars in word
01629                     ROW_RES *row,  //current row
01630                     STATS *fonts,  //font stats
01631                     INT8 *italic,  //output count
01632                     INT8 *bold     //output count
01633                    ) {
01634   WERD_RES *word;                //current word
01635   WERD_RES_IT word_it = &row->word_res_list;
01636 
01637   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01638     word = word_it.data ();
01639     *italic += word->italic;
01640     *bold += word->bold;
01641     if (word->font1_count > 0)
01642       fonts->add (word->font1, word->font1_count);
01643     if (word->font2_count > 0)
01644       fonts->add (word->font2, word->font2_count);
01645 
01646   }
01647 }
01648 
01649 
01650 /* ================== */
01654 void find_modal_font(                  //good chars in word
01655                      STATS *fonts,     //font stats
01656                      INT8 *font_out,   //output font
01657                      INT8 *font_count  //output count
01658                     ) {
01659   INT8 font;                     //font index
01660   INT32 count;                   //pile couat
01661 
01662   if (fonts->get_total () > 0) {
01663     font = (INT8) fonts->mode ();
01664     *font_out = font;
01665     count = fonts->pile_count (font);
01666     *font_count = count < MAX_INT8 ? count : MAX_INT8;
01667     fonts->add (font, -*font_count);
01668   }
01669   else {
01670     *font_out = -1;
01671     *font_count = 0;
01672   }
01673 }

Generated on Wed Feb 28 19:49:07 2007 for Tesseract by  doxygen 1.5.1