ccmain/control.h File Reference

#include "varable.h"
#include "ocrblock.h"
#include "ratngs.h"
#include "statistc.h"
#include "ocrshell.h"
#include "pageres.h"
#include "charsample.h"
#include "notdll.h"

Go to the source code of this file.

Typedefs

Enumerations

Functions


Typedef Documentation

typedef BOOL8(* BLOB_REJECTOR)(PBLOB *, BLOB_CHOICE_IT *, void *)

Definition at line 49 of file control.h.


Enumeration Type Documentation

enum ACCEPTABLE_WERD_TYPE

A werd can only be one of these types

Enumerator:
AC_UNACCEPTABLE  Unacceptable word.
AC_LOWER_CASE  ALL lower case.
AC_UPPER_CASE  ALL upper case.
AC_INITIAL_CAP  ALL but initial lc.
AC_LC_ABBREV  a.b.c.
AC_UC_ABBREV  A.B.C.

Definition at line 39 of file control.h.

00040 {
00041   AC_UNACCEPTABLE,               
00042   AC_LOWER_CASE,                 
00043   AC_UPPER_CASE,                 
00044   AC_INITIAL_CAP,                
00045   AC_LC_ABBREV,                  
00046   AC_UC_ABBREV                   
00047 };


Function Documentation

ACCEPTABLE_WERD_TYPE acceptable_word_string ( const char *  s  ) 

Determine if word is acceptable.

Iterate through word and apply heuristics

Definition at line 1222 of file control.cpp.

References AC_INITIAL_CAP, AC_LC_ABBREV, AC_LOWER_CASE, AC_UC_ABBREV, AC_UNACCEPTABLE, and AC_UPPER_CASE.

Referenced by doc_and_block_rejection(), garbage_word(), potential_word_crunch(), set_unlv_suspects(), and unrej_good_quality_words().

01222                                                            {
01223   int i = 0;
01224   int leading_punct_count;
01225   int upper_count = 0;
01226   int hyphen_pos = -1;
01227   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
01228 
01229   if (strlen (s) > 20)
01230     return word_type;
01231 
01232   /* Single Leading punctuation char*/
01233 
01234   if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i])))
01235     i++;
01236   leading_punct_count = i;
01237 
01238   /* Initial cap */
01239   while (isupper (s[i])) {
01240     i++;
01241     upper_count++;
01242   }
01243   if (upper_count > 1)
01244     word_type = AC_UPPER_CASE;
01245   else {
01246     /* Lower case word, possibly with an initial cap */
01247     while (islower (s[i])) {
01248       i++;
01249     }
01250     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
01251       goto not_a_word;
01252     /*
01253     Allow a single hyphen in a lower case word
01254     - dont trust upper case - I've seen several cases of "H" -> "I-I"
01255     */
01256     if (s[i] == '-') {
01257       hyphen_pos = i++;
01258       if (s[i] != '\0') {
01259         while (islower (s[i])) {
01260           i++;
01261         }
01262         if (i < hyphen_pos + 3)
01263           goto not_a_word;
01264       }
01265     }
01266     else {
01267       /* Allow "'s" in NON hyphenated lower case words */
01268       if ((s[i] == '\'') && (s[i + 1] == 's'))
01269         i += 2;
01270     }
01271     if (upper_count > 0)
01272       word_type = AC_INITIAL_CAP;
01273     else
01274       word_type = AC_LOWER_CASE;
01275   }
01276 
01277   /* Up to two different, constrained trailing punctuation chars */
01278   if ((s[i] != '\0') && (STRING (chs_trailing_punct1).contains (s[i])))
01279     i++;
01280   if ((s[i] != '\0') &&
01281     (s[i - 1] != s[i]) && (STRING (chs_trailing_punct2).contains (s[i])))
01282     i++;
01283 
01284   if (s[i] != '\0')
01285     word_type = AC_UNACCEPTABLE;
01286 
01287   not_a_word:
01288 
01289   if (word_type == AC_UNACCEPTABLE) {
01290     /* Look for abbreviation string */
01291     i = 0;
01292     if (isupper (s[0])) {
01293       word_type = AC_UC_ABBREV;
01294       while ((s[i] != '\0') && isupper (s[i]) && (s[i + 1] == '.'))
01295         i += 2;
01296     }
01297     else if (islower (s[0])) {
01298       word_type = AC_LC_ABBREV;
01299       while ((s[i] != '\0') && islower (s[i]) && (s[i + 1] == '.'))
01300         i += 2;
01301     }
01302     if (s[i] != '\0')
01303       word_type = AC_UNACCEPTABLE;
01304   }
01305 
01306   return word_type;
01307 }

void add_in_one_row ( ROW_RES *  row,
STATS *  fonts,
INT8 *  italic,
INT8 *  bold 
)

Add into the stats for one row.

Definition at line 1628 of file control.cpp.

References STATS::add(), WERD_RES::bold, WERD_RES::font1, WERD_RES::font1_count, WERD_RES::font2, WERD_RES::font2_count, WERD_RES::italic, and ROW_RES::word_res_list.

01633                      {
01634   WERD_RES *word;                //current word
01635   WERD_RES_IT word_it = &row->word_res_list;
01636 
01637   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
01638     word = word_it.data ();
01639     *italic += word->italic;
01640     *bold += word->bold;
01641     if (word->font1_count > 0)
01642       fonts->add (word->font1, word->font1_count);
01643     if (word->font2_count > 0)
01644       fonts->add (word->font2, word->font2_count);
01645 
01646   }
01647 }

BOOL_VAR_H ( test_pt  ,
FALSE  ,
"Test for point"   
)

BOOL_VAR_H ( tessedit_matcher_log  ,
FALSE  ,
"Log matcher activity"   
)

BOOL_VAR_H ( tessedit_global_adaption  ,
FALSE  ,
"Adapt to all docs over time"   
)

BOOL_VAR_H ( tessedit_test_adaption  ,
FALSE  ,
"Test adaption criteria"   
)

BOOL_VAR_H ( tessedit_minimal_rej_pass1  ,
FALSE  ,
"Do minimal rejection on pass 1 output"   
)

BOOL_VAR_H ( tessedit_adaption_debug  ,
FALSE  ,
"Generate and print debug information for adaption"   
)

BOOL_VAR_H ( tessedit_cluster_adapt_before_pass1  ,
FALSE  ,
"Adapt using clusterer before Tess adaping during pass 1"   
)

BOOL_VAR_H ( tessedit_cluster_adapt_after_pass3  ,
FALSE  ,
"Adapt using clusterer after pass 1"   
)

BOOL_VAR_H ( tessedit_cluster_adapt_after_pass2  ,
FALSE  ,
"Adapt using clusterer after pass 1"   
)

BOOL_VAR_H ( tessedit_cluster_adapt_after_pass1  ,
FALSE  ,
"Adapt using clusterer after pass 1"   
)

BOOL_VAR_H ( tessedit_tess_adapt_to_rejmap  ,
FALSE  ,
"Use reject map to control Tesseract adaption"   
)

BOOL_VAR_H ( debug_acceptable_wds  ,
FALSE  ,
"Dump word pass/fail chk"   
)

BOOL_VAR_H ( rej_use_xht  ,
TRUE  ,
"Individual rejection control"   
)

BOOL_VAR_H ( tessedit_debug_block_rejection  ,
FALSE  ,
"Block and Row stats"   
)

BOOL_VAR_H ( x_ht_quality_check  ,
TRUE  ,
"Dont allow worse quality"   
)

BOOL_VAR_H ( tessedit_xht_fiddles_on_no_rej_wds  ,
TRUE  ,
"Apply xht fix up even in no rejects"   
)

BOOL_VAR_H ( tessedit_xht_fiddles_on_done_wds  ,
TRUE  ,
"Apply xht fix up even if done"   
)

BOOL_VAR_H ( word_occ_first  ,
FALSE  ,
"Do word occ before re-est xht"   
)

BOOL_VAR_H ( tessedit_enable_doc_dict  ,
TRUE  ,
"Add words to the document dictionary"   
)

BOOL_VAR_H ( tessedit_cluster_adaption_on  ,
TRUE  ,
"Do our own adaption - ems only"   
)

BOOL_VAR_H ( tessedit_redo_xheight  ,
TRUE  ,
"Check/Correct x-height"   
)

BOOL_VAR_H ( tessedit_reject_suspect_fullstops  ,
FALSE  ,
"Reject suspect fullstops"   
)

BOOL_VAR_H ( tessedit_reject_fullstops  ,
FALSE  ,
"Reject all fullstops"   
)

BOOL_VAR_H ( tessedit_fix_hyphens  ,
TRUE  ,
"Crunch double hyphens?"   
)

BOOL_VAR_H ( tessedit_unrej_any_wd  ,
FALSE  ,
"Dont bother with word plausibility"   
)

BOOL_VAR_H ( tessedit_fix_fuzzy_spaces  ,
TRUE  ,
"Try to improve fuzzy spaces"   
)

BOOL_VAR_H ( tessedit_dump_choices  ,
FALSE  ,
"Dump char choices"   
)

BOOL_VAR_H ( tessedit_matcher_is_wiseowl  ,
FALSE  ,
"Call WO to classify"   
)

BOOL_VAR_H ( tessedit_training_tess  ,
FALSE  ,
"Call Tess to learn blobs"   
)

BOOL_VAR_H ( tessedit_training_wiseowl  ,
FALSE  ,
"Call WO to learn blobs"   
)

BOOL_VAR_H ( tessedit_draw_outwords  ,
FALSE  ,
"Draw output words"   
)

BOOL_VAR_H ( tessedit_draw_words  ,
FALSE  ,
"Draw source words"   
)

BOOL_VAR_H ( tessedit_print_text  ,
FALSE  ,
"Write text to stdout"   
)

BOOL8 check_debug_pt ( WERD_RES *  word,
int  location 
)

DEBUGGING ROUTINE.

Definition at line 1314 of file control.cpp.

References WERD_RES::best_choice, WERD::bounding_box(), BOX::contains(), debug_fp, WERD_RES::done, FALSE, REJMAP::full_print(), REJMAP::print(), WERD::print(), WERD_RES::reject_map, WERD_RES::tess_accepted, tprintf(), TRUE, and WERD_RES::word.

Referenced by classify_word_pass1(), classify_word_pass2(), fix_fuzzy_spaces(), output_pass(), recog_all_words(), unrej_good_quality_words(), and write_results().

01314                                                    {
01315   BOOL8 show_map_detail = FALSE;
01316   INT16 i;
01317 
01318   #ifndef SECURE_NAMES
01319   if (!test_pt)
01320     return FALSE;
01321 
01322   tessedit_rejection_debug.set_value (FALSE);
01323   debug_x_ht_level.set_value (0);
01324   tessedit_cluster_debug.set_value (FALSE);
01325   nn_debug.set_value (FALSE);
01326   nn_reject_debug.set_value (FALSE);
01327 
01328   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
01329     if (location < 0)
01330       return TRUE;               //For breakpoint use
01331     tessedit_rejection_debug.set_value (TRUE);
01332     debug_x_ht_level.set_value (20);
01333     tessedit_cluster_debug.set_value (TRUE);
01334     nn_debug.set_value (TRUE);
01335     nn_reject_debug.set_value (TRUE);
01336     tprintf ("\n\nTESTWD::");
01337     switch (location) {
01338       case 0:
01339         tprintf ("classify_word_pass1 start\n");
01340         word->word->print (debug_fp);
01341         break;
01342       case 10:
01343         tprintf ("make_reject_map: initial map");
01344         break;
01345       case 20:
01346         tprintf ("make_reject_map: after NN");
01347         break;
01348       case 30:
01349         tprintf ("classify_word_pass2 - START");
01350         break;
01351       case 40:
01352         tprintf ("classify_word_pass2 - Pre Xht");
01353         break;
01354       case 50:
01355         tprintf ("classify_word_pass2 - END");
01356         show_map_detail = TRUE;
01357         break;
01358       case 60:
01359         tprintf ("fixspace");
01360         break;
01361       case 70:
01362         tprintf ("MM pass START");
01363         break;
01364       case 80:
01365         tprintf ("MM pass END");
01366         break;
01367       case 90:
01368         tprintf ("After Poor quality rejection");
01369         break;
01370       case 100:
01371         tprintf ("unrej_good_quality_words - START");
01372         break;
01373       case 110:
01374         tprintf ("unrej_good_quality_words - END");
01375         break;
01376       case 120:
01377         tprintf ("Write results pass");
01378         show_map_detail = TRUE;
01379         break;
01380     }
01381     tprintf (" \"%s\" ", word->best_choice->string ().string ());
01382     word->reject_map.print (debug_fp);
01383     tprintf ("\n");
01384     if (show_map_detail) {
01385       tprintf ("\"%s\"\n", word->best_choice->string ().string ());
01386       for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {
01387         tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]);
01388         word->reject_map[i].full_print (debug_fp);
01389       }
01390     }
01391 
01392     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
01393     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
01394     return TRUE;
01395   }
01396   else
01397   #endif
01398     return FALSE;
01399 }

void choice_dump_tester ( PBLOB * ,
DENORM * ,
BOOL8  correct,
char *  text,
INT32  count,
BLOB_CHOICE_LIST *  ratings 
)

Matcher tester function which generates .chc file entries.

Called via test_segment_pass2 for every blob tested by tess in a word. (But only for words for which a correct segmentation could be found.)

Definition at line 1134 of file control.cpp.

References CANTOPENFILE, BLOB_CHOICE::certainty(), BLOB_CHOICE::char_class(), choice_file, ERRCODE::error(), EXIT, imagebasename, NULL, BLOB_CHOICE::rating(), and STRING::string().

Referenced by match_word_pass2().

01141                          {
01142   STRING choice_file_name;
01143   BLOB_CHOICE *blob_choice;
01144   BLOB_CHOICE_IT it;
01145   char source_chars[20];
01146   char correct_char[3];
01147 
01148   if (choice_file == NULL) {
01149     choice_file_name = imagebasename + ".chc";
01150     if (!(choice_file = fopen (choice_file_name.string (), "w"))) {
01151       CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",
01152         choice_file_name.string (), errno);
01153     }
01154   }
01155 
01156   if ((count == 0) || (text == NULL) || (text[0] == '\0')) {
01157     strcpy (source_chars, "$$");
01158     strcpy (correct_char, "$$");
01159   }
01160   else {
01161     strncpy(source_chars, text, count);
01162     source_chars[count] = '\0';
01163     if (correct) {
01164       correct_char[0] = text[0];
01165       correct_char[1] = '\0';
01166     }
01167     else {
01168       strcpy (correct_char, "$$");
01169     }
01170   }
01171   fprintf (choice_file, "%s\t%s", source_chars, correct_char);
01172 
01173   it.set_to_list (ratings);
01174   for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {
01175     blob_choice = it.data ();
01176     if ((blob_choice->char_class () >= '!') &&
01177       (blob_choice->char_class () <= '~'))
01178       fprintf (choice_file, "\t%c\t%f\t%f",
01179         blob_choice->char_class (),
01180         blob_choice->rating (), blob_choice->certainty ());
01181   }
01182   fprintf (choice_file, "\n");
01183 }

void classify_word_pass1 ( WERD_RES *  word,
ROW *  row,
BOOL8  cluster_adapt,
CHAR_SAMPLES_LIST *  char_clusters,
CHAR_SAMPLE_LIST *  chars_waiting 
)

Recognize one word.

Baseline normalize the word and pass it to Tess.

Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string.

Definition at line 517 of file control.cpp.

References adapt_to_good_samples(), ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), check_debug_pt(), correct_fp, WERD_RES::denorm, WERD_RES::done, FALSE, fix_hyphens(), fix_quotes(), fix_rep_char(), WERD::flag(), REJMAP::initialise(), make_bln_copy(), make_reject_map(), matcher_fp, matcher_pass, NULL, WERD_RES::outword, WERD_RES::raw_choice, record_certainty(), REJMAP::rej_word_tess_failure(), WERD_RES::reject_map, WERD::set_text(), set_word_fonts(), STRING::string(), tess_acceptable_word(), WERD_RES::tess_accepted, tess_adaptable_word(), tess_adapter(), tess_add_doc_word(), tess_default_matcher(), WERD_RES::tess_failed, tess_segment_pass1(), WERD_RES::tess_would_adapt, WERD::text(), tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, word_adaptable(), word_answer, write_cooked_text(), and ROW::x_height().

Referenced by recog_all_words().

00522                                                           {
00523   WERD *bln_word;                //baseline norm copy
00524                                  //detailed results
00525   BLOB_CHOICE_LIST_CLIST blob_choices;
00526   BOOL8 adapt_ok;
00527   const char *rejmap;
00528   INT16 index;
00529   STRING mapstr = "";
00530   char *match_string;
00531   char word_string[1024];
00532 
00533   if (matcher_fp != NULL) {
00534     fgets (word_string, 1023, correct_fp);
00535     if ((match_string = strchr (word_string, '\r')) != NULL)
00536       *match_string = '\0';
00537     if ((match_string = strchr (word_string, '\n')) != NULL)
00538       *match_string = '\0';
00539     if (word_string[0] != '\0') {
00540       word->word->set_text (word_string);
00541       word_answer = (char *) word->word->text ();
00542     }
00543     else
00544       word_answer = NULL;
00545   }
00546 
00547   check_debug_pt (word, 0);
00548   matcher_pass = 0;
00549   bln_word = make_bln_copy (word->word, row, row->x_height (), &word->denorm);
00550 
00551   word->best_choice = tess_segment_pass1 (bln_word, &word->denorm,
00552     tess_default_matcher,
00553     word->raw_choice, &blob_choices,
00554     word->outword);
00555 
00561   if ((word->best_choice->string ().length () == 0) ||
00562     (strspn (word->best_choice->string ().string (), " ") ==
00563   word->best_choice->string ().length ())) {
00564     word->done = FALSE;          //Try again on pass2 - adaption may help
00565     word->tess_failed = TRUE;
00566     word->reject_map.initialise (word->best_choice->string ().length ());
00567     word->reject_map.rej_word_tess_failure ();
00568   }
00569   else {
00570     word->tess_failed = FALSE;
00571     if ((word->best_choice->string ().length () !=
00572       word->outword->blob_list ()->length ()) ||
00573     (word->best_choice->string ().length () != blob_choices.length ())) {
00574       tprintf
00575         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00576         word->best_choice->string ().string (),
00577         word->best_choice->string ().length (),
00578         word->outword->blob_list ()->length (), blob_choices.length ());
00579     }
00580     ASSERT_HOST (word->best_choice->string ().length () ==
00581       word->outword->blob_list ()->length ());
00582     ASSERT_HOST (word->best_choice->string ().length () ==
00583       blob_choices.length ());
00584 
00585     /*
00586        The adaption step used to be here. It has been moved to after
00587        make_reject_map so that we know whether the word will be accepted in the
00588        first pass or not.   This move will PREVENT adaption to words containing
00589        double quotes because the word will not be identical to what tess thinks
00590        its best choice is. (See CurrentBestChoiceIs which is used by
00591       AdaptableWord)
00592      */
00593 
00594     if (word->word->flag (W_REP_CHAR)) {
00595       fix_rep_char(word);
00596     }
00597     else {
00598       fix_quotes ((char *) word->best_choice->string ().string (),
00599       //turn to double
00600         word->outword, &blob_choices);
00601       if (tessedit_fix_hyphens)
00602         //turn 2 to 1
00603         fix_hyphens ((char *) word->best_choice->string ().string (),
00604          word->outword, &blob_choices);
00605       record_certainty (word->best_choice->certainty (), 1); //accounting
00606 
00607       word->tess_accepted = tess_acceptable_word (word->best_choice,
00608         word->raw_choice);
00609 
00610       word->tess_would_adapt = tess_adaptable_word (word->outword,
00611         word->best_choice,
00612         word->raw_choice);
00613       // Also sets word->done flag
00614       make_reject_map (word, &blob_choices, row, 1);
00615 
00616       adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode);
00617 
00618       if (cluster_adapt)
00619         adapt_to_good_samples(word, char_clusters, chars_waiting);
00620 
00621       if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
00622         if (!tessedit_tess_adapt_to_rejmap)
00623           rejmap = NULL;
00624         else {
00625           ASSERT_HOST (word->reject_map.length () ==
00626             word->best_choice->string ().length ());
00627 
00628           for (index = 0; index < word->reject_map.length (); index++) {
00629             if (adapt_ok || word->reject_map[index].accepted ())
00630               mapstr += '1';
00631             else
00632               mapstr += '0';
00633           }
00634           rejmap = mapstr.string ();
00635         }
00636 
00637         //adapt to it
00638         tess_adapter (word->outword, &word->denorm,
00639          word->best_choice->string ().string (),
00640          word->raw_choice->string ().string (), rejmap);
00641       }
00642 
00643       if (tessedit_enable_doc_dict)
00644         tess_add_doc_word (word->best_choice);
00645       set_word_fonts(word, &blob_choices);
00646     }
00647   }
00648   if (tessedit_print_text) {
00649     write_cooked_text (bln_word, word->best_choice->string (),
00650       word->done, FALSE, stdout);
00651   }
00652   delete bln_word;
00653   blob_choices.deep_clear ();
00654 }

void classify_word_pass2 ( WERD_RES *  word,
ROW *  row 
)

Control what to do with the word in pass 2.

Definition at line 661 of file control.cpp.

References WERD_RES::best_choice, WERD_RES::caps_height, check_block_occ(), check_debug_pt(), clear_fx_win(), count_alphanums(), create_fx_win(), debug_fp, WERD_RES::denorm, WERD_RES::done, dummy, FALSE, WERD::flag(), fx_win, WERD_RES::guessed_x_ht, REJMAP::length(), make_picture_current, match_word_pass2(), NO_WINDOW, NULL, WERD_RES::outword, WERD::plot(), REJMAP::print(), WERD_RES::raw_choice, re_estimate_x_ht(), record_certainty(), REJMAP::rej_word_xht_fixup(), REJMAP::reject_count(), WERD_RES::reject_map, reject_mostly_rejects(), SaveBadWord(), set_global_subloc_code(), SUBLOC_FIX_XHT, SUBLOC_NORM, WERD_RES::tess_failed, tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, word_char_quality(), write_cooked_text(), ROW::x_height(), and WERD_RES::x_height.

Referenced by match_current_words(), recog_all_words(), and recog_interactive().

00663                                    {
00664   BOOL8 done_this_pass = FALSE;
00665   WERD_RES new_x_ht_word (word->word);
00666   float new_x_ht = 0.0;
00667   INT16 old_xht_reject_count;
00668   INT16 new_xht_reject_count;
00669   INT16 old_xht_accept_count;
00670   INT16 new_xht_accept_count;
00671   BOOL8 accept_new_x_ht = FALSE;
00672   INT16 old_chs_in_wd;
00673   INT16 new_chs_in_wd;
00674   INT16 old_word_quality;
00675   INT16 new_word_quality;
00676   INT16 dummy;
00677 
00678   set_global_subloc_code(SUBLOC_NORM);
00679   check_debug_pt (word, 30);
00680   if (!word->done ||
00681     tessedit_training_tess ||
00682   tessedit_training_wiseowl || tessedit_dump_choices) {
00683     word->x_height = row->x_height ();
00684     word->caps_height = 0.0;
00685     if (word->outword != NULL) {
00686       delete word->outword;      //get rid of junk
00687       delete word->best_choice;
00688       delete word->raw_choice;
00689     }
00690     match_word_pass2 (word, row, row->x_height ());
00691     done_this_pass = TRUE;
00692     check_debug_pt (word, 40);
00693   }
00694 
00695   if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) {
00696     set_global_subloc_code(SUBLOC_FIX_XHT);
00697     if ((tessedit_xht_fiddles_on_done_wds || !word->done) &&
00698       (tessedit_xht_fiddles_on_no_rej_wds ||
00699     (word->reject_map.reject_count () > 0))) {
00700       if ((x_ht_check_word_occ >= 2) && word_occ_first)
00701         check_block_occ(word);
00702 
00703       if (tessedit_redo_xheight)
00704         re_estimate_x_ht(word, &new_x_ht);
00705 
00706       if (((x_ht_check_word_occ >= 2) && !word_occ_first) ||
00707         ((x_ht_check_word_occ >= 1) && (new_x_ht > 0)))
00708         check_block_occ(word);
00709     }
00710     if (new_x_ht > 0) {
00711       old_chs_in_wd = word->reject_map.length ();
00712 
00713       /* Re-estimated x_ht error suggests a rematch is worthwhile. */
00714       new_x_ht_word.x_height = new_x_ht;
00715       new_x_ht_word.caps_height = 0.0;
00716       match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height);
00717       if (!new_x_ht_word.tess_failed) {
00718         if ((x_ht_check_word_occ >= 1) && word_occ_first)
00719           check_block_occ(&new_x_ht_word);
00720 
00721         re_estimate_x_ht(&new_x_ht_word, &new_x_ht);
00722 
00723         if ((x_ht_check_word_occ >= 1) && !word_occ_first)
00724           check_block_occ(&new_x_ht_word);
00725 
00726         old_xht_reject_count = word->reject_map.reject_count ();
00727         old_xht_accept_count = old_chs_in_wd - old_xht_reject_count;
00728         new_xht_reject_count = new_x_ht_word.reject_map.reject_count ();
00729         new_chs_in_wd = new_x_ht_word.reject_map.length ();
00730         new_xht_accept_count = new_chs_in_wd - new_xht_reject_count;
00731         accept_new_x_ht =
00732           ((new_xht_accept_count > old_xht_accept_count) ||
00733           ((new_xht_accept_count == old_xht_accept_count) &&
00734           (new_xht_accept_count > 0))) &&
00735           (!new_x_ht_word.guessed_x_ht ||
00736           !new_x_ht_word.guessed_caps_ht);
00737 
00738         if (accept_new_x_ht && x_ht_quality_check) {
00739           word_char_quality(word, row, &old_word_quality, &dummy);
00740           word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy);
00741           if (old_word_quality > new_word_quality)
00742             accept_new_x_ht = FALSE;
00743         }
00744 
00745         if (accept_new_x_ht && (x_ht_stringency > 0)) {
00746           accept_new_x_ht =
00747             (count_alphanums (&new_x_ht_word) > x_ht_stringency);
00748           if (!accept_new_x_ht && rej_use_xht) {
00749             if (debug_x_ht_level >= 1)
00750               tprintf
00751                 ("Failed stringency test so reject original word\n");
00752             word->reject_map.rej_word_xht_fixup ();
00753           }
00754         }
00755 
00756         #ifndef SECURE_NAMES
00757         if (debug_x_ht_level >= 1) {
00758           tprintf ("New XHT Match:: %s ",
00759             word->best_choice->string ().string ());
00760           word->reject_map.print (debug_fp);
00761           tprintf (" -> %s ",
00762             new_x_ht_word.best_choice->string ().string ());
00763           new_x_ht_word.reject_map.print (debug_fp);
00764           tprintf (" %s->%s %s %s\n",
00765             word->guessed_x_ht ? "GUESS" : "CERT",
00766             new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT",
00767             new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
00768             accept_new_x_ht ? "ACCEPTED" : "");
00769         }
00770         #endif
00771       }
00772       if (accept_new_x_ht) {
00773         /*
00774            The new x_ht is deemed superior so put the final results in the real word
00775            and destroy the old results
00776          */
00777         delete word->outword;    //get rid of junk
00778         word->outword = new_x_ht_word.outword;
00779         word->denorm = new_x_ht_word.denorm;
00780         delete word->best_choice;
00781         word->best_choice = new_x_ht_word.best_choice;
00782         delete word->raw_choice;
00783         word->raw_choice = new_x_ht_word.raw_choice;
00784         word->reject_map = new_x_ht_word.reject_map;
00785         word->done = new_x_ht_word.done;
00786         done_this_pass = TRUE;
00787       }
00788       else {
00789       /*
00790          The new x_ht is no better, so destroy the copy word and put any uncertain
00791          x or cap ht estimate back to default. (I.e. dont blame me if its bad!)
00792          Conditionally, use any ammended block occ chars.
00793        */
00794         //get rid of junk
00795         delete new_x_ht_word.outword;
00796         delete new_x_ht_word.best_choice;
00797         delete new_x_ht_word.raw_choice;
00798       }
00799       new_x_ht_word.outword = NULL; //to keep new destructor happy
00800       new_x_ht_word.best_choice = NULL; //to keep new destructor happy
00801       new_x_ht_word.raw_choice = NULL; //to keep new destructor happy
00802 
00803       if (rej_mostly_reject_mode == 2) {
00804         reject_mostly_rejects(word);
00805         tprintf ("Rejecting mostly rejects on %s ",
00806           word->best_choice->string ().string ());
00807       }
00808     }
00809 
00810     set_global_subloc_code(SUBLOC_NORM);
00811 
00812     if (done_this_pass && !word->done && tessedit_save_stats)
00813       SaveBadWord (word->best_choice->string ().string (),
00814         word->best_choice->certainty ());
00815     record_certainty (word->best_choice->certainty (), 2); //accounting
00816   }
00817 #ifndef GRAPHICS_DISABLED
00818   if (tessedit_draw_outwords) {
00819     if (fx_win == NO_WINDOW)
00820       create_fx_win();
00821     clear_fx_win();
00822     word->outword->plot (fx_win);
00823     make_picture_current(fx_win);
00824   }
00825 #endif
00826 
00827   set_global_subloc_code(SUBLOC_NORM);
00828   if (tessedit_print_text) {
00829     write_cooked_text (word->outword, word->best_choice->string (),
00830       word->done, done_this_pass, stdout);
00831   }
00832   check_debug_pt (word, 50);
00833 }

double_VAR_H ( test_pt_y  ,
99999.99  ,
"ycoord"   
)

double_VAR_H ( test_pt_x  ,
99999.99  ,
"xcoord"   
)

double_VAR_H ( quality_char_pc  ,
0.95  ,
"good_quality_doc gte good char limit"   
)

double_VAR_H ( quality_outline_pc  ,
1.0  ,
"good_quality_doc lte outline error limit"   
)

double_VAR_H ( quality_blob_pc  ,
0.0  ,
"good_quality_doc gte good blobs limit"   
)

double_VAR_H ( quality_rej_pc  ,
0.08  ,
"good_quality_doc lte rejection limit"   
)

void find_modal_font ( STATS *  fonts,
INT8 *  font_out,
INT8 *  font_count 
)

Find the modal font and remove from the stats.

Definition at line 1654 of file control.cpp.

References STATS::add(), count(), STATS::get_total(), MAX_INT8, STATS::mode(), and STATS::pile_count().

Referenced by font_recognition_pass(), and set_word_fonts().

01658                       {
01659   INT8 font;                     //font index
01660   INT32 count;                   //pile couat
01661 
01662   if (fonts->get_total () > 0) {
01663     font = (INT8) fonts->mode ();
01664     *font_out = font;
01665     count = fonts->pile_count (font);
01666     *font_count = count < MAX_INT8 ? count : MAX_INT8;
01667     fonts->add (font, -*font_count);
01668   }
01669   else {
01670     *font_out = -1;
01671     *font_count = 0;
01672   }
01673 }

void fix_hyphens ( char *  string,
WERD *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Change pairs of hyphens to a single hyphen if the bounding boxes touch.

Typically a long dash which has been segmented.

Definition at line 1067 of file control.cpp.

References WERD::blob_list(), and merge_blobs().

Referenced by classify_word_pass1(), and match_word_pass2().

01070                                                        {
01071   char *ptr;                     //string ptr
01072                                  //blobs
01073   PBLOB_IT blob_it = word->blob_list ();
01074                                  //choices
01075   BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01076   BLOB_CHOICE_IT it1;            //first choices
01077   BLOB_CHOICE_IT it2;            //second choices
01078 
01079   for (ptr = string;
01080   *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) {
01081     if ((*ptr == '-' || *ptr == '~') &&
01082       (*(ptr + 1) == '-' || *(ptr + 1) == '~') &&
01083       (blob_it.data ()->bounding_box ().right () >=
01084     blob_it.data_relative (1)->bounding_box ().left ())) {
01085       *ptr = '-';                //turn to single hyphen
01086       strcpy (ptr + 1, ptr + 2); //shuffle up
01087       merge_blobs (blob_it.data (), blob_it.data_relative (1));
01088       blob_it.forward ();
01089       delete blob_it.extract (); //get rid of spare
01090 
01091       it1.set_to_list (choice_it.data ());
01092       it2.set_to_list (choice_it.data_relative (1));
01093       if (it1.data ()->certainty () < it2.data ()->certainty ()) {
01094         choice_it.forward ();
01095                                  //get rid of spare
01096         delete choice_it.extract ();
01097       }
01098       else {
01099                                  //get rid of spare
01100         delete choice_it.extract ();
01101         choice_it.forward ();
01102       }
01103     }
01104   }
01105 }

void fix_quotes ( char *  string,
WERD *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

Change pairs of quotes to double quotes.

Definition at line 1022 of file control.cpp.

References WERD::blob_list(), and merge_blobs().

01025                                                       { //Merge each adjacent pair of single-quote chars into one double quote; string, blob list and choice list are edited in step
01026   char *ptr;                     //cursor into string
01027                                  //iterator over the blobs of word
01028   PBLOB_IT blob_it = word->blob_list ();
01029                                  //iterator over the per-blob choice lists
01030   BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;
01031   BLOB_CHOICE_IT it1;            //choices of the first blob of a pair
01032   BLOB_CHOICE_IT it2;            //choices of the second blob of a pair
01033 
01034   for (ptr = string;
01035   *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { //one char, one blob and one choice list per step
01036     if ((*ptr == '\'' || *ptr == '`') //quote or backquote here...
01037     && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) { //...followed by another one; no bounding box test is made here
01038       *ptr = '"';                //turn to double
01039       strcpy (ptr + 1, ptr + 2); //shuffle up; NOTE(review): source and destination overlap, which strcpy does not allow - memmove would be well defined
01040       merge_blobs (blob_it.data (), blob_it.data_relative (1)); //add the outlines of the next blob to this one
01041       blob_it.forward (); //step onto the second blob of the pair
01042       delete blob_it.extract (); //get rid of spare
01043 
01044       it1.set_to_list (choice_it.data ());
01045       it2.set_to_list (choice_it.data_relative (1));
01046       if (it1.data ()->certainty () < it2.data ()->certainty ()) { //NOTE(review): either branch keeps the list with the LOWER certainty value - confirm intended
01047         choice_it.forward ();
01048                                  //delete the second list, keep the first
01049         delete choice_it.extract ();
01050       }
01051       else {
01052                                  //delete the first list, keep the second
01053         delete choice_it.extract ();
01054         choice_it.forward ();
01055       }
01056     }
01057   }
01058 }

void fix_rep_char ( WERD_RES *  word  ) 

Fix word with a repeated char.

Parameters:
word Word to do
Returns:
none
Find the repeated character. Make a reject string which rejects any char other than the voted char. Set the word to done to stop rematching it.

Definition at line 965 of file control.cpp.

References alloc_mem(), WERD_RES::best_choice, count(), WERD_RES::done, free_mem(), REJMAP::initialise(), max, WERD_RES::reject_map, and TRUE.

Referenced by classify_word_pass1(), and match_word_pass2().

00967                    { //Vote for the most frequent non-space char, reject every position that differs from it, then mark the word done
00968   struct REP_CH //one distinct char and its number of occurrences
00969   {
00970     char ch;
00971     int count;
00972   };
00973 
00974   REP_CH *rep_ch;                //array of char counts
00975   int word_len;
00976   int rep_ch_count = 0;          //how many unique chs
00977   const char *word_str;          //the repeated chs
00978   int i, j;
00979   int total = 0; //sum of all counts; only read by the commented-out tprintf below
00980   int max = 0; //highest count seen among non-space chars
00981   char maxch = ' ';              //Most common char; stays a space if no non-space char is found
00982 
00983   word_str = word->best_choice->string ().string ();
00984   word_len = strlen (word_str);
00985   rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH)); //one slot per char covers the all-distinct case
00986   for (i = 0; i < word_len; i++) {
00987     for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++); //empty body: linear search for this char among those already seen
00988     if (j < rep_ch_count)
00989       rep_ch[j].count++; //seen before
00990     else {
00991       rep_ch[rep_ch_count].ch = word_str[i]; //first occurrence: open a new slot
00992       rep_ch[rep_ch_count].count = 1;
00993       rep_ch_count++;
00994     }
00995   }
00996 
00997   for (j = 0; j < rep_ch_count; j++) { //pick the winner; on a tie the earliest slot wins because of the strict >
00998     total += rep_ch[j].count;
00999     if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) { //spaces never win the vote
01000       max = rep_ch[j].count;
01001       maxch = rep_ch[j].ch;
01002     }
01003   }
01004   //      tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",
01005   //                        word_str, word_len, total, maxch );
01006   free_mem(rep_ch);
01007 
01008   word->reject_map.initialise (word_len);
01009   for (i = 0; i < word_len; i++) {
01010     if (word_str[i] != maxch)
01011                                  //reject every char that is not the voted char
01012       word->reject_map[i].setrej_bad_repetition ();
01013   }
01014   word->done = TRUE; //stops the word being rematched
01015 }

void font_recognition_pass ( PAGE_RES_IT page_res_it  ) 

Smooth the fonts for the document.

Definition at line 1505 of file control.cpp.

References STATS::add(), WERD_RES::best_choice, WERD_RES::bold, ROW_RES::bold, STATS::clear(), count(), find_modal_font(), WERD_RES::font1, ROW_RES::font1, WERD_RES::font1_count, ROW_RES::font1_count, WERD_RES::font2, ROW_RES::font2, WERD_RES::font2_count, ROW_RES::font2_count, PAGE_RES_IT::forward(), WERD_RES::italic, ROW_RES::italic, NULL, PAGE_RES_IT::restart_page(), PAGE_RES_IT::row(), and PAGE_RES_IT::word().

Referenced by recog_all_words().

01506                                                      { //Two sweeps over the page: gather row and document font statistics, then overwrite indecisive per-word results with document values
01507   INT32 length;                  //of word
01508   INT32 count;                   //of a feature
01509   INT8 doc_font;                 //modal font
01510   INT8 doc_font_count;           //count for the modal font
01511   INT32 doc_italic;              //total italics
01512   INT32 doc_bold;                //total bolds
01513   ROW_RES *row = NULL;           //current row
01514   WERD_RES *word;                //current word
01515   STATS fonts (0, 32);           //font counters for the current row
01516   STATS doc_fonts (0, 32);       //font counters for the whole document
01517 
01518   doc_italic = 0;
01519   doc_bold = 0;
01520   page_res_it.restart_page ();
01521   while (page_res_it.word () != NULL) { //sweep 1: accumulate statistics
01522     if (row != page_res_it.row ()) { //crossed into a new row
01523       if (row != NULL) { //finish the row just left
01524         find_modal_font (&fonts, &row->font1, &row->font1_count);
01525         find_modal_font (&fonts, &row->font2, &row->font2_count); //second call on the same STATS; presumably yields the next most common font - confirm in find_modal_font
01526       }
01527       row = page_res_it.row ();  //current row
01528       fonts.clear ();            //clear counters
01529       row->italic = 0;
01530       row->bold = 0;
01531     }
01532     word = page_res_it.word ();
01533     row->italic += word->italic;
01534     row->bold += word->bold;
01535     fonts.add (word->font1, word->font1_count);
01536     fonts.add (word->font2, word->font2_count);
01537     doc_italic += word->italic;
01538     doc_bold += word->bold;
01539     doc_fonts.add (word->font1, word->font1_count);
01540     doc_fonts.add (word->font2, word->font2_count);
01541     page_res_it.forward ();
01542   }
01543   if (row != NULL) { //finish the last row, which the loop above never closes
01544     find_modal_font (&fonts, &row->font1, &row->font1_count);
01545     find_modal_font (&fonts, &row->font2, &row->font2_count);
01546   }
01547   find_modal_font(&doc_fonts, &doc_font, &doc_font_count); //modal font of the whole document
01548   /*
01549     row=NULL;
01550     page_res_it.restart_page();
01551     while (page_res_it.word() != NULL)
01552     {
01553       if (row!=page_res_it.row())
01554       {
01555         row2=row;
01556         row=page_res_it.row();
01557         if (row->font1_count<MIN_FONT_ROW_COUNT)
01558         {
01559           fonts.clear();
01560           italic=0;
01561           bold=0;
01562           add_in_one_row(row,&fonts,&italic,&bold);
01563           if (row2!=NULL)
01564           {
01565             hdiff=row->row->x_height()-row2->row->x_height();
01566             if (hdiff<0)
01567               hdiff=-hdiff;
01568             if (hdiff<MAX_XHEIGHT_DIFF)
01569               add_in_one_row(row2,&fonts,&italic,&bold);
01570           }
01571           do
01572             page_res_it.forward();
01573           while (page_res_it.row()==row);
01574           row2=page_res_it.row();
01575           if (row2!=NULL)
01576           {
01577             hdiff=row->row->x_height()-row2->row->x_height();
01578             if (hdiff<0)
01579               hdiff=-hdiff;
01580             if (hdiff<MAX_XHEIGHT_DIFF)
01581               add_in_one_row(row2,&fonts,&italic,&bold);
01582           }
01583           row->italic=italic;
01584           row->bold=bold;
01585           find_modal_font(&fonts,&row->font1,&row->font1_count);
01586           find_modal_font(&fonts,&row->font2,&row->font2_count);
01587         }
01588         else
01589           page_res_it.forward();
01590       }
01591       else
01592         page_res_it.forward();
01593     }*/
01594 
01595   page_res_it.restart_page ();
01596   while (page_res_it.word () != NULL) { //sweep 2: smooth each word against the document totals
01597     row = page_res_it.row ();    //current row (not read again in this loop)
01598     word = page_res_it.word ();
01599     length = word->best_choice->string ().length ();
01600 
01601     count = word->italic;
01602     if (count < 0)
01603       count = -count; //magnitude of the italic vote
01604     if (!(count == length || length > 3 && count >= length * 3 / 4)) //decisive means unanimous, or at least 3/4 of a word longer than 3 chars
01605       word->italic = doc_italic > 0 ? 1 : -1; //not decisive: take the sign of the document total
01606 
01607     count = word->bold;
01608     if (count < 0)
01609       count = -count; //magnitude of the bold vote
01610     if (!(count == length || length > 3 && count >= length * 3 / 4)) //same decisiveness test as for italic
01611       word->bold = doc_bold > 0 ? 1 : -1;
01612 
01613     count = word->font1_count;
01614     if (!(count == length || length > 3 && count >= length * 3 / 4)) { //font vote not decisive
01615       word->font1 = doc_font; //fall back to the document modal font
01616       word->font1_count = doc_font_count;
01617     }
01618 
01619     page_res_it.forward ();
01620   }
01621 }

INT_VAR_H ( tessedit_test_adaption_mode  ,
,
"Adaptation decision algorithm for tess"   
)

INT_VAR_H ( tessedit_cluster_adaption_mode  ,
,
"Adaptation decision algorithm for matrix matcher"   
)

INT_VAR_H ( tessedit_em_adaption_mode  ,
62  ,
"Adaptation decision algorithm for ems matrix matcher"   
)

INT_VAR_H ( tessedit_tess_adaption_mode  ,
,
"Adaptation decision algorithm for tess"   
)

INT_VAR_H ( quality_min_initial_alphas_reqd  ,
,
"alphas in a good word"   
)

INT_VAR_H ( debug_x_ht_level  ,
,
"Reestimate debug"   
)

INT_VAR_H ( x_ht_stringency  ,
,
"How many confirmed a/n to accept?"   
)

INT_VAR_H ( x_ht_check_word_occ  ,
,
"Check Char Block occupancy"   
)

INT_VAR_H ( tessedit_single_match  ,
FALSE  ,
"Top choice only from CP"   
)

WERD* make_bln_copy ( WERD *  src_word,
ROW *  row,
float  x_height,
DENORM *  denorm 
)

Generate a baseline normalised copy of the source word.

The copy is done so that whatever format the original word is in, a polygonal bln version is generated as output.

Definition at line 1193 of file control.cpp.

References WERD::baseline_normalise_x(), WERD::poly_copy(), and ROW::x_height().

Referenced by apply_box_testing(), apply_box_training(), classify_word_pass1(), match_word_pass2(), unrej_good_chs(), word_blob_quality(), and word_char_quality().

01193                                                                               { //Returns a new polygonal copy of src_word that has been baseline normalised; src_word itself is only read through poly_copy
01194   WERD *result; //the copy handed back to the caller
01195 
01196   //      if (wordit_linearc && !src_word->flag(W_POLYGON))
01197   //      {
01198   //              larc_word = src_word->larc_copy( row->x_height() );
01199   //              result = larc_word->poly_copy( row->x_height() );
01200   //              delete larc_word;
01201   //      }
01202   // else
01203   result = src_word->poly_copy (row->x_height ()); //note: the copy uses the x height of the row, not the x_height argument
01204 
01205   //      if (tessedit_draw_words)
01206   //      {
01207   //              if ( la_win == NO_WINDOW )
01208   //                      create_la_win();
01209   //              result->plot( la_win );
01210   //      }
01211   result->baseline_normalise_x (row, x_height, denorm); //the x_height argument is used here; denorm is passed through, presumably to be filled in - confirm
01212   return result;
01213 }

void match_word_pass2 ( WERD_RES *  word,
ROW *  row,
float  x_height 
)

Baseline normalize the word and pass it to Tess.

Definition at line 840 of file control.cpp.

References assert(), ASSERT_HOST, WERD_RES::best_choice, WERD::blob_list(), choice_dump_tester(), correct_segment_pass2(), WERD_RES::denorm, FALSE, fix_hyphens(), fix_quotes(), fix_rep_char(), WERD::flag(), REJMAP::initialise(), make_bln_copy(), make_reject_map(), matcher_fp, matcher_pass, NULL, WERD_RES::outword, WERD_RES::raw_choice, REJMAP::rej_word_tess_failure(), WERD_RES::reject_map, set_global_subsubloc_code(), SUBSUBLOC_OTHER, SUBSUBLOC_TESS, tess_acceptable_word(), WERD_RES::tess_accepted, tess_default_matcher(), WERD_RES::tess_failed, tess_segment_pass2(), tess_training_tester(), test_segment_pass2(), WERD::text(), tprintf(), TRUE, W_REP_CHAR, WERD_RES::word, and word_answer.

Referenced by classify_word_pass2().

00843                                       { //Pass 2 for one word: normalise, run the segmenter, check the result, post-fix quotes and hyphens, build the reject map
00844   WERD *bln_word;                //baseline norm copy
00845                                  //detailed results
00846   BLOB_CHOICE_LIST_CLIST blob_choices;
00847 
00848   set_global_subsubloc_code(SUBSUBLOC_OTHER);
00849   if (matcher_fp != NULL) {
00850     word_answer = (char *) word->word->text ();
00851     if (word_answer != NULL && word_answer[0] == '\0')
00852       word_answer = NULL; //an empty text is treated as no answer
00853   }
00854   matcher_pass = 0;
00855   bln_word = make_bln_copy (word->word, row, x_height, &word->denorm); //deleted at the end of this function
00856   set_global_subsubloc_code(SUBSUBLOC_TESS);
00857   if (tessedit_training_tess) //exactly one of the three segmenter calls below runs
00858     word->best_choice = correct_segment_pass2 (bln_word,
00859       &word->denorm,
00860       tess_default_matcher,
00861       tess_training_tester,
00862       word->raw_choice,
00863       &blob_choices, word->outword);
00864   else if (tessedit_dump_choices)
00865     word->best_choice = test_segment_pass2 (bln_word,
00866         &word->denorm,
00867         tess_default_matcher,
00868         choice_dump_tester,
00869         word->raw_choice,
00870         &blob_choices, word->outword);
00871   //      else if (tessedit_training_wiseowl)
00872   //              best_choice=correct_segment_pass2( word, &denorm,
00873   //                                                                                                        tess_default_matcher,wo_learn,
00874   //                                                                                                        raw_choice,&blob_choices,outword);
00875   //      else if (tessedit_matcher_is_wiseowl)
00876   //              best_choice=tess_segment_pass2( word, &denorm, wo_classify,
00877   //                                                                                                raw_choice, &blob_choices, outword);
00878   else {
00879     word->best_choice = tess_segment_pass2 (bln_word, &word->denorm,
00880       tess_default_matcher,
00881       word->raw_choice, &blob_choices,
00882       word->outword);
00883   }
00884   set_global_subsubloc_code(SUBSUBLOC_OTHER);
00885   /*
00886      Test for TESS screw up on word. Recog_word has already ensured that the
00887      choice list, outword blob lists and best_choice string are the same
00888      length. A TESS screw up is indicated by a blank filled or 0 length string.
00889    */
00890   if ((word->best_choice->string ().length () == 0) ||
00891     (strspn (word->best_choice->string ().string (), " ") == //leading run of spaces covers the whole string
00892   word->best_choice->string ().length ())) {
00893     word->tess_failed = TRUE;
00894     word->reject_map.initialise (word->best_choice->string ().length ());
00895     word->reject_map.rej_word_tess_failure ();
00896     //              tprintf("Empty word produced\n");
00897   }
00898   else {
00899     if ((word->best_choice->string ().length () != //report the mismatch before the asserts below fire
00900       word->outword->blob_list ()->length ()) ||
00901     (word->best_choice->string ().length () != blob_choices.length ())) {
00902       tprintf
00903         ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00904         word->best_choice->string ().string (),
00905         word->best_choice->string ().length (),
00906         word->outword->blob_list ()->length (), blob_choices.length ());
00907     }
00908     ASSERT_HOST (word->best_choice->string ().length () ==
00909       word->outword->blob_list ()->length ());
00910     ASSERT_HOST (word->best_choice->string ().length () ==
00911       blob_choices.length ());
00912 
00913     word->tess_failed = FALSE;
00914     if (word->word->flag (W_REP_CHAR)) { //repeated-char word: vote on the char; tess_accepted and make_reject_map are not touched on this path
00915       fix_rep_char(word);
00916     }
00917     else {
00918       fix_quotes ((char *) word->best_choice->string ().string (), //cast to char * so the string is edited in place
00919         word->outword, &blob_choices);
00920       if (tessedit_fix_hyphens)
00921         fix_hyphens ((char *) word->best_choice->string ().string (), //same in-place edit as for quotes
00922           word->outword, &blob_choices);
00923       /* Dont trust fix_quotes! - though I think I've fixed the bug */
00924       if ((word->best_choice->string ().length () != //recheck the three lengths after the in-place fixes
00925         word->outword->blob_list ()->length ()) ||
00926         (word->best_choice->string ().length () !=
00927       blob_choices.length ())) {
00928         #ifndef SECURE_NAMES
00929         tprintf
00930           ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
00931           word->best_choice->string ().string (),
00932           word->best_choice->string ().length (),
00933           word->outword->blob_list ()->length (),
00934           blob_choices.length ());
00935         #endif
00936 
00937       }
00938       ASSERT_HOST (word->best_choice->string ().length () ==
00939         word->outword->blob_list ()->length ());
00940       ASSERT_HOST (word->best_choice->string ().length () ==
00941         blob_choices.length ());
00942 
00943       word->tess_accepted = tess_acceptable_word (word->best_choice,
00944         word->raw_choice);
00945 
00946       make_reject_map (word, &blob_choices, row, 2); //the 2 presumably identifies the pass - confirm in make_reject_map
00947     }
00948   }
00949   blob_choices.deep_clear (); //presumably deletes the contained choice lists - confirm
00950   delete bln_word; //release the copy made by make_bln_copy above
00951   assert (word->raw_choice != NULL);
00952 }

void merge_blobs ( PBLOB *  blob1,
PBLOB *  blob2 
)

Add the outlines from blob2 to blob1.

Blob2 is emptied but not deleted.

Definition at line 1114 of file control.cpp.

References PBLOB::out_list(), and outline_it.

Referenced by fix_hyphens(), fix_quotes(), and merge_tess_fails().

01117                   { //Appends the outline list of blob2 to the end of the outline list of blob1
01118   OUTLINE_IT outline_it = blob1->out_list ();
01119   //iterator over the outlines of blob1
01120 
01121   outline_it.move_to_last ();    //go to end
01122                                  //splice the outlines of blob2 in after the last outline of blob1
01123   outline_it.add_list_after (blob2->out_list ());
01124 }

void recog_all_words ( PAGE_RES *  page_res,
volatile ETEXT_DESC *  monitor 
)

Walk the current block list applying the specified word processor function to all words. Does several passes!

Parameters:
page_res page structure
monitor progress monitor
Note:
Global:
  • tessedit_minimal_rej_pass1,
  • tessedit_test_adaption,
  • tessedit_minimal_rejection,
  • tessedit_test_adaption_mode,
  • tessedit_em_adaption_mode,
  • tessedit_cluster_adapt_after_pass1,
  • tessedit_cluster_adapt_after_pass3,
  • tessedit_cluster_adapt_before_pass1,
  • tessedit_cluster_adaption_mode
Returns:
none

Definition at line 245 of file control.cpp.

References adapt_to_good_ems(), adapt_to_good_samples(), WERD_RES::best_choice, check_debug_pt(), classify_word_pass1(), classify_word_pass2(), collect_characters_for_adaption(), collect_ems_for_adaption(), FALSE, fix_fuzzy_spaces(), font_recognition_pass(), PAGE_RES_IT::forward(), FREQ_DAWG_PERM, REJMAP::length(), LOC_DOC_BLK_REJ, LOC_FUZZY_SPACE, LOC_MM_ADAPT, LOC_PASS1, LOC_PASS2, LOC_WRITE_RESULTS, NULL, print_em_stats(), quality_based_rejection(), PAGE_RES_IT::rej_stat_word(), REJMAP::rej_word_bad_quality(), REJMAP::rej_word_tess_failure(), reject_all_fullstops(), REJMAP::reject_count(), WERD_RES::reject_map, reject_suspect_fullstops(), PAGE_RES_IT::restart_page(), ROW_RES::row, PAGE_RES_IT::row(), set_global_loc_code(), STRING::string(), SYSTEM_DAWG_PERM, tprintf(), TRUE, USER_DAWG_PERM, PAGE_RES_IT::word(), word_adaptable(), word_blob_quality(), word_char_quality(), word_count, and word_outline_errs().

Referenced by TessBaseAPI::Recognize().

00248                       { //Runs every recognition pass over page_res in order; the pass markers below delimit the stages
00249   PAGE_RES_IT page_res_it(page_res); //reset page iterator
00250   INT16 chars_in_word;
00251   INT16 rejects_in_word;
00252   CHAR_SAMPLES_LIST em_clusters;
00253   CHAR_SAMPLE_LIST ems_waiting;
00254   CHAR_SAMPLES_LIST char_clusters;
00255   CHAR_SAMPLE_LIST chars_waiting;
00256   INT16 blob_quality = 0;
00257   INT16 outline_errs = 0;
00258   INT16 doc_blob_quality = 0; //NOTE(review): the doc_ totals are INT16 sums over every word - confirm they cannot overflow on large pages
00259   INT16 doc_outline_errs = 0;
00260   INT16 doc_char_quality = 0;
00261   INT16 all_char_quality;
00262   INT16 accepted_all_char_quality;
00263   INT16 good_char_count = 0;
00264   INT16 doc_good_char_quality = 0;
00265   const STRING *wordstr;
00266   const char *text;
00267   int i;
00268 
00269   BOOL8 good_quality_doc;
00270   UINT8 permuter_type;
00271 
00272   INT32 tess_adapt_mode = 0; //holds tessedit_tess_adaption_mode while it is forced to 0 during pass 1
00273   INT32 word_count;              //count of words in doc
00274   INT32 word_index;              //current word
00275 
00276   if (tessedit_minimal_rej_pass1) { //this flag switches on both of the flags below
00277     tessedit_test_adaption.set_value (TRUE);
00278     tessedit_minimal_rejection.set_value (TRUE);
00279   }
00280 
00281   if (tessedit_cluster_adapt_before_pass1) {
00282     tess_adapt_mode = tessedit_tess_adaption_mode; //save, restored after pass 1
00283     tessedit_tess_adaption_mode.set_value (0);
00284     tessedit_tess_adapt_to_rejmap.set_value (TRUE);
00285   }
00286 
00287   /* Pass 1 */
00288   word_count = 0;
00289   if (monitor != NULL) { //count the words first so progress can be reported as a fraction
00290     monitor->ocr_alive = TRUE;
00291     while (page_res_it.word () != NULL) {
00292       word_count++;
00293       page_res_it.forward ();
00294     }
00295     page_res_it.restart_page ();
00296   }
00297   else
00298     word_count = 1; //no monitor: the words are not counted
00299 
00300   word_index = 0;
00301   int dict_words = 0; //words whose permuter is USER_DAWG_PERM; handed to the cancel callback
00302   while (page_res_it.word () != NULL) {
00303     set_global_loc_code(LOC_PASS1);
00304     word_index++;
00305     if (monitor != NULL) {
00306       monitor->ocr_alive = TRUE;
00307       monitor->progress = 30 + 50 * word_index / word_count; //pass 1 maps onto progress 30..80
00308       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00309           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00310                                                          dict_words)))
00311         return; //timed out or cancelled; NOTE(review): tessedit_tess_adaption_mode is not restored on this path if it was forced to 0 above
00312     }
00313     classify_word_pass1 (page_res_it.word (),
00314       page_res_it.row ()->row, FALSE, NULL, NULL);
00315 
00316     if (tessedit_test_adaption && !tessedit_minimal_rejection) {
00317       if (!word_adaptable (page_res_it.word (),
00318         tessedit_test_adaption_mode))
00319         page_res_it.word ()->reject_map.rej_word_tess_failure ();
00320       //FAKE PERM REJ
00321       else {
00322         wordstr = &(page_res_it.word ()->best_choice->string ());
00323         /* Override rejection mechanisms for this word */
00324         text = wordstr->string ();
00325         for (i = 0; text[i] != '\0'; i++) {
00326           if ((text[i] != ' ')
00327             && page_res_it.word ()->reject_map[i].rejected ())
00328             page_res_it.word ()->reject_map[i].
00329               setrej_minimal_rej_accept();
00330         }
00331       }
00332     }
00333 
00334     if ((tessedit_cluster_adapt_after_pass1
00335       || tessedit_cluster_adapt_after_pass3
00336       || tessedit_cluster_adapt_before_pass1)
00337     && tessedit_cluster_adaption_mode != 0) {
00338       collect_characters_for_adaption (page_res_it.word (),
00339         &char_clusters, &chars_waiting);
00340     }
00341     // Count dict words.
00342     if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) //only the user dawg counts here; pass 3 below also accepts the system and freq dawgs
00343       ++dict_words;
00344     page_res_it.forward ();
00345   }
00346 
00347   if (tessedit_cluster_adapt_before_pass1)
00348     tessedit_tess_adaption_mode.set_value (tess_adapt_mode); //restore the value saved before pass 1
00349 
00350   page_res_it.restart_page ();
00351   while ((tessedit_cluster_adapt_after_pass1 //optional sweep between pass 1 and pass 2, run only if one of these two flags is set
00352     || tessedit_cluster_adapt_before_pass1)
00353   && page_res_it.word () != NULL) {
00354     if (monitor != NULL)
00355       monitor->ocr_alive = TRUE;
00356     if (tessedit_cluster_adapt_after_pass1)
00357       adapt_to_good_samples (page_res_it.word (),
00358         &char_clusters, &chars_waiting);
00359     else
00360       classify_word_pass1 (page_res_it.word (), //rerun pass 1, this time with the collected clusters
00361         page_res_it.row ()->row,
00362         TRUE, &char_clusters, &chars_waiting);
00363 
00364     page_res_it.forward ();
00365   }
00366 
00367   /* Pass 2 */
00368   page_res_it.restart_page ();
00369   word_index = 0;
00370   while (!tessedit_test_adaption && page_res_it.word () != NULL) { //pass 2 is skipped entirely when tessedit_test_adaption is set
00371     set_global_loc_code(LOC_PASS2);
00372     word_index++;
00373     if (monitor != NULL) {
00374       monitor->ocr_alive = TRUE;
00375       monitor->progress = 80 + 10 * word_index / word_count; //pass 2 maps onto progress 80..90
00376       if ((monitor->end_time != 0 && clock() > monitor->end_time) ||
00377           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
00378                                                          dict_words)))
00379         return; //timed out or cancelled: none of the later passes run
00380     }
00381     classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row);
00382 
00383     if (tessedit_em_adaption_mode > 0)
00384       collect_ems_for_adaption (page_res_it.word (),
00385         &em_clusters, &ems_waiting);
00386 
00387     if (tessedit_cluster_adapt_after_pass2
00388       && tessedit_cluster_adaption_mode != 0)
00389       collect_characters_for_adaption (page_res_it.word (),
00390         &char_clusters, &chars_waiting);
00391     page_res_it.forward ();
00392   }
00393 
00394   /* Another pass */
00395   set_global_loc_code(LOC_FUZZY_SPACE);
00396 
00397   if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
00398     && !tessedit_word_for_word)
00399     fix_fuzzy_spaces(monitor, word_count, page_res);
00400 
00401   if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0)
00402                                  // Initially ems only
00403     print_em_stats(&em_clusters, &ems_waiting);
00404 
00405   /* Pass 3 - used for checking confusion sets */
00406   page_res_it.restart_page ();
00407   word_index = 0;
00408   while (!tessedit_test_adaption && page_res_it.word () != NULL) { //also skipped when tessedit_test_adaption is set
00409     set_global_loc_code(LOC_MM_ADAPT);
00410     word_index++;
00411     if (monitor != NULL) {
00412       monitor->ocr_alive = TRUE;
00413       monitor->progress = 95 + 5 * word_index / word_count; //pass 3 maps onto progress 95..100; no timeout or cancel check here
00414     }
00415     check_debug_pt (page_res_it.word (), 70);
00416     /* Use good matches to sort out confusions */
00417 
00418     if (tessedit_em_adaption_mode != 0)
00419       adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting);
00420 
00421     if (tessedit_cluster_adapt_after_pass2
00422       && tessedit_cluster_adaption_mode != 0)
00423       adapt_to_good_samples (page_res_it.word (),
00424         &char_clusters, &chars_waiting);
00425 
00426     if (tessedit_reject_fullstops //reject-all takes precedence over reject-suspect when both flags are set
00427       && strchr (page_res_it.word ()->best_choice->string ().string (),
00428       '.') != NULL)
00429       reject_all_fullstops (page_res_it.word ());
00430     else if (tessedit_reject_suspect_fullstops
00431       && strchr (page_res_it.word ()->best_choice->string ().
00432       string (), '.') != NULL)
00433       reject_suspect_fullstops (page_res_it.word ());
00434 
00435     page_res_it.rej_stat_word ();
00436     chars_in_word = page_res_it.word ()->reject_map.length ();
00437     rejects_in_word = page_res_it.word ()->reject_map.reject_count ();
00438 
00439     blob_quality = word_blob_quality (page_res_it.word (),
00440       page_res_it.row ()->row);
00441     doc_blob_quality += blob_quality;
00442     outline_errs = word_outline_errs (page_res_it.word ());
00443     doc_outline_errs += outline_errs;
00444     word_char_quality (page_res_it.word (),
00445       page_res_it.row ()->row,
00446       &all_char_quality, &accepted_all_char_quality);
00447     doc_char_quality += all_char_quality;
00448     permuter_type = page_res_it.word ()->best_choice->permuter ();
00449     if ((permuter_type == SYSTEM_DAWG_PERM) || //word came from any of the three dawg permuters
00450       (permuter_type == FREQ_DAWG_PERM) ||
00451     (permuter_type == USER_DAWG_PERM)) {
00452       good_char_count += chars_in_word - rejects_in_word; //accepted chars of such words
00453       doc_good_char_quality += accepted_all_char_quality;
00454     }
00455     check_debug_pt (page_res_it.word (), 80);
00456     if (tessedit_reject_bad_qual_wds &&
00457       (blob_quality == 0) && (outline_errs >= chars_in_word)) //zero blob quality and at least one outline error per char
00458       page_res_it.word ()->reject_map.rej_word_bad_quality ();
00459     check_debug_pt (page_res_it.word (), 90);
00460     page_res_it.forward ();
00461   }
00462 
00463   page_res_it.restart_page ();
00464   while (!tessedit_test_adaption //optional adaption sweep after pass 3
00465   && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) {
00466     if (monitor != NULL)
00467       monitor->ocr_alive = TRUE;
00468     if (tessedit_cluster_adaption_mode != 0)
00469       adapt_to_good_samples (page_res_it.word (),
00470         &char_clusters, &chars_waiting);
00471     page_res_it.forward ();
00472   }
00473 
00474   #ifndef SECURE_NAMES
00475   if (tessedit_debug_quality_metrics) {
00476     tprintf
00477       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
00478       page_res->char_count, page_res->rej_count,
00479       page_res->rej_count / (float) page_res->char_count, doc_blob_quality,
00480       doc_blob_quality / (float) page_res->char_count, doc_outline_errs,
00481       doc_outline_errs / (float) page_res->char_count, doc_char_quality,
00482       doc_char_quality / (float) page_res->char_count,
00483       doc_good_char_quality,
00484       good_char_count >
00485       0 ? doc_good_char_quality / (float) good_char_count : 0.0);
00486   }
00487   #endif
00488   good_quality_doc = //all four ratio tests must hold; NOTE(review): char_count of 0 is not guarded before these float divisions
00489     (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc)
00490     &&
00491     (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) &&
00492     (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) &&
00493     (doc_char_quality / (float) page_res->char_count >= quality_char_pc);
00494 
00495   /* Do whole document or whole block rejection pass*/
00496 
00497   if (!tessedit_test_adaption) {
00498     set_global_loc_code(LOC_DOC_BLK_REJ);
00499     quality_based_rejection(page_res_it, good_quality_doc);
00500   }
00501   font_recognition_pass(page_res_it); //runs even when tessedit_test_adaption is set
00502 
00503   /* Write results pass */
00504   set_global_loc_code(LOC_WRITE_RESULTS);
00505   // This is now redundant, but retained commented so show how to obtain
00506   // bounding boxes and style information.
00507   // output_pass (page_res_it, false);
00508 }

BOOL8 recog_interactive ( BLOCK * ,
ROW *  row,
WERD *  word 
)

Recognize a single word in interactive mode.

Parameters:
block block of word (unnamed in the signature and not used by the function body)
row row of word
word word to recognize
Note:
Global: tessedit_debug_quality_metrics,
Returns:
TRUE if found one word

Definition at line 207 of file control.cpp.

References classify_word_pass2(), tprintf(), TRUE, word_blob_quality(), word_char_quality(), and word_outline_errs().

Referenced by extend_moded_commands(), and recog_pseudo_word().

00211                          { //Classifies one word with the pass 2 classifier and optionally prints its quality metrics; always returns TRUE
00212   WERD_RES word_res(word); //local result object built from word
00213   INT16 char_qual; //filled in by word_char_quality below
00214   INT16 good_char_qual; //filled in by word_char_quality below
00215 
00216   classify_word_pass2(&word_res, row);
00217   #ifndef SECURE_NAMES
00218   if (tessedit_debug_quality_metrics) { //debug output only; it does not affect the return value
00219     word_char_quality(&word_res, row, &char_qual, &good_char_qual);
00220     tprintf
00221       ("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; char_quality: %d; good_char_quality: %d\n",
00222       word_res.reject_map.length (), word_blob_quality (&word_res, row),
00223       word_outline_errs (&word_res), char_qual, good_char_qual);
00224   }
00225   #endif
00226   return TRUE; //unconditional: the result of classification is not inspected
00227 }

void recog_pseudo_word ( BLOCK_LIST *  block_list,
BOX selection_box 
)

Make a word from the selected blobs and run Tess on them i.e., recognize blobs.

Parameters:
block_list blocks to check
selection_box 
Returns:
none

Definition at line 182 of file control.cpp.

References make_pseudo_word(), NULL, and recog_interactive().

Referenced by extend_moded_commands().

00184                                            { //Builds a word from the selection, recognises it interactively, then deletes it
00185   WERD *word; //the pseudo word, or NULL if none was made
00186   ROW *pseudo_row;               //row of word; NOTE(review): not initialised here, presumably set by make_pseudo_word - confirm
00187   BLOCK *pseudo_block;           //block of word; NOTE(review): same as pseudo_row
00188 
00189   word = make_pseudo_word (block_list, selection_box,
00190     pseudo_block, pseudo_row);
00191   if (word != NULL) { //nothing to do when no word could be made
00192     recog_interactive(pseudo_block, pseudo_row, word); //the BOOL8 result is ignored
00193     delete word; //the pseudo word is released here
00194   }
00195 }

void set_word_fonts ( WERD_RES *  word,
BLOB_CHOICE_LIST_CLIST *  blob_choices 
)

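Get the fonts for the word. For each character of the word's best choice, the matching blob choice's config is looked up; configs other than -1 add to the word's italic and bold tallies and to a font histogram. The results of two successive find_modal_font() calls on that histogram are stored in font1/font1_count and font2/font2_count.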

Definition at line 1406 of file control.cpp.

References STATS::add(), WERD_RES::best_choice, WERD_RES::bold, find_modal_font(), WERD_RES::font1, WERD_RES::font1_count, WERD_RES::font2, WERD_RES::font2_count, WERD_RES::italic, and tprintf().

Referenced by classify_word_pass1().

01408                                                           {
01409   INT32 index;                   //char index
01410   char choice_char;              //char from word
01411   INT8 config;                   //font of char
01412                                  //character iterator
01413   BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
01414   BLOB_CHOICE_IT choice_it;      //choice iterator
01415   STATS fonts (0, 32);           //font counters
01416   static INT8 italic_table[32] = {
01417     1, -1, 1, -1,
01418     1, -1, 1, -1,
01419     1, -1, 1, -1,
01420     1, -1, 1, -1,
01421     1, -1, 1, -1,
01422     1, -1, 1, -1,
01423     1, -1, 1, -1,
01424     1, -1, 1, -1
01425   };
01426   static INT8 bold_table[32] = {
01427     1, 1, -1, -1,
01428     1, 1, -1, -1,
01429     1, 1, -1, -1,
01430     1, 1, -1, -1,
01431     1, 1, -1, -1,
01432     1, 1, -1, -1,
01433     1, 1, -1, -1,
01434     1, 1, -1, -1
01435   };
01436   static INT8 font_table[32] = {
01437     2, 2, 2, 2,
01438     -1, -1, -1, -1,
01439     0, 0, 0, 0,
01440     1, 1, 1, 1,
01441     3, 3, 3, 3,
01442     4, 4, 4, 4,
01443     5, 5, 5, 5,
01444     2, 2, 2, 2
01445   };
01446 
01447   word->italic = 0;
01448   word->bold = 0;
01449   for (char_it.mark_cycle_pt (), index = 0;
01450   !char_it.cycled_list (); char_it.forward (), index++) {
01451     choice_char = word->best_choice->string ()[index];
01452     choice_it.set_to_list (char_it.data ());
01453     for (choice_it.mark_cycle_pt (); !choice_it.cycled_list ();
01454     choice_it.forward ()) {
01455       if (choice_it.data ()->char_class () == choice_char) {
01456         config = choice_it.data ()->config ();
01457         if (tessedit_debug_fonts)
01458           tprintf ("%c(%d=%d%c%c)",
01459             choice_char, config, (config & 31) >> 2,
01460             config & 2 ? 'N' : 'B', config & 1 ? 'N' : 'I');
01461         if (config != -1) {
01462           config &= 31;
01463           word->italic += italic_table[config];
01464           word->bold += bold_table[config];
01465           if (font_table[config] != -1)
01466             fonts.add (font_table[config], 1);
01467         }
01468         break;
01469       }
01470     }
01471   }
01472   find_modal_font (&fonts, &word->font1, &word->font1_count);
01473   find_modal_font (&fonts, &word->font2, &word->font2_count);
01474   if (tessedit_debug_fonts)
01475     tprintf ("\n");
01476   /*  if (word->font1_count>0)
01477     {
01478       for (char_it.mark_cycle_pt(),index=0;
01479       !char_it.cycled_list();char_it.forward(),index++)
01480       {
01481         choice_char=word->best_choice->string()[index];
01482         choice_it.set_to_list(char_it.data());
01483         for (choice_it.mark_cycle_pt();!choice_it.cycled_list();choice_it.forward())
01484         {
01485           if (choice_it.data()->char_class()==choice_char)
01486           {
01487             config=choice_it.data()->config();
01488             if (config!=-1 && font_table[config&31]==word->font1)
01489             {
01490               word->italic+=italic_table[config];
01491               word->bold+=bold_table[config];
01492             }
01493             break;
01494           }
01495         }
01496       }
01497     }*/
01498 }

STRING_VAR_H ( chs_trailing_punct2  ,
 
)

STRING_VAR_H ( chs_trailing_punct1  ,
 
)

STRING_VAR_H ( chs_leading_punct   ) 


Generated on Wed Feb 28 19:49:14 2007 for Tesseract by  doxygen 1.5.1