dict/stopper.cpp

Go to the documentation of this file.
00001 
00019 /* =================
00020  Include Files and Type Defines
00021  ==================== */
00022 #include "stopper.h"
00023 #include "emalloc.h"
00024 #include "matchdefs.h"
00025 #include "debug.h"
00026 #include "callcpp.h"
00027 #include "permute.h"
00028 #include "context.h"
00029 #include "permnum.h"
00030 #include "danerror.h"
00031 #include "const.h"
00032 #include "freelist.h"
00033 #include "efio.h"
00034 #include "globals.h"
00035 #include "scanutils.h"
00036 
00037 #include <stdio.h>
00038 #include <string.h>
00039 #include <ctype.h>
00040 #include <math.h>
00041 #ifdef __UNIX__
00042 #include <assert.h>
00043 #endif
00044 
00045 /* these are kludges - add appropriate .h file later */
/* CertaintyScale is defined in another translation unit; in this file it is
   only read, as a divisor in FindClassifierErrors. */
00046 extern float CertaintyScale;     /* from subfeat.h */
00047 
/* Size in bytes of the scratch word buffer declared in NoDangerousAmbig. */
00048 #define MAX_WERD_SIZE   100
/* Longest test or replacement string that FillAmbigTable accepts without
   reporting an "Illegal ambiguity specification!" error. */
00049 #define MAX_AMBIG_SIZE    3
00050 
/* Default value of DangerousAmbigs; FillAmbigTable appends that name to
   demodir to build the path of the ambiguity file. */
00051 #define DANGEROUS_AMBIGS  "tessdata/DangAmbigs"
00052 
/* One slot of the ambiguity table built by FillAmbigTable.  The table is an
   array of MAX_CLASS_ID + 1 of these lists, indexed by the first character
   of a test string; each list element is a string of the form
   "<rest of test string> <replacement string>". */
00057 typedef LIST AMBIG_TABLE;
00058 
/* Per-character record stored inside a VIABLE_CHOICE_STRUCT. */
00063 typedef struct
00064 {
  /* character copied from the choice's string (see NewViableChoice) */
00065   UINT8 Class;
  /* copied from CurrentSegmentation; incremented by AddNewChunk */
00066   UINT16 NumChunks;
  /* copied from the Certainties[] array passed to NewViableChoice /
     ReplaceDuplicateChoice */
00067   float Certainty;
00068 } CHAR_CHOICE;
00069 
/* A logged word choice.  Instances are created by NewViableChoice, which
   allocates extra room after the struct so that Blob[] really holds Length
   entries even though it is declared with one. */
00074 typedef struct
00075 {
  /* class_probability() of the source A_CHOICE */
00076   float Rating;
  /* class_certainty() of the source A_CHOICE */
00077   float Certainty;
  /* adjust factor handed to NewViableChoice / ReplaceDuplicateChoice */
00078   FLOAT32 AdjustFactor;
  /* number of characters in the word, i.e. number of valid Blob[] entries */
00079   int Length;
  /* first of Length per-character records (variable-length tail) */
00080   CHAR_CHOICE Blob[1];
00081 } VIABLE_CHOICE_STRUCT;
00082 
/* Pointer alias used throughout this file for logged choices. */
00087 typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
00088 
/* A viable choice flattened to one entry per chunk.  ExpandChoice fills the
   two arrays by repeating each character's certainty and class NumChunks
   times; entries beyond the word's total chunk count are left untouched. */
00093 typedef struct
00094 {
  /* the choice this expansion was built from */
00095   VIABLE_CHOICE Choice;
  /* certainty of the character covering each chunk */
00096   float ChunkCertainty[MAX_NUM_CHUNKS];
  /* class of the character covering each chunk */
00097   UINT8 ChunkClass[MAX_NUM_CHUNKS];
00098 } EXPANDED_CHOICE;
00099 
00100 /* =================
00101           Macros
00102  ==================== */
/* Accessors for the first element of a list of VIABLE_CHOICEs.  They
   dereference first(Choices) directly; the callers in this file test the
   list against NIL before using them. */
00103 #define BestCertainty(Choices)  (((VIABLE_CHOICE) first (Choices))->Certainty)
00104 #define BestRating(Choices) (((VIABLE_CHOICE) first (Choices))->Rating)
00105 #define BestFactor(Choices) (((VIABLE_CHOICE) first (Choices))->AdjustFactor)
00106 
/* Certainty-difference threshold derived from two adjust factors:
   (F2 - F1) * AmbigThresholdGain - AmbigThresholdOffset. */
00107 #define AmbigThreshold(F1,F2) (((F2) - (F1)) * AmbigThresholdGain - \
00108             AmbigThresholdOffset)
00109 
00110 /*---------------------------------------------------------------------------
00111           Private Function Prototypes
00112 ----------------------------------------------------------------------------*/
/* Forward declarations of the helpers defined in the "Private Code" section
   further down, so the public functions above them can call them.  Note
   that none of them is declared static. */
00113 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
00114 
00115 int AmbigsFound(char *Word,
00116                 char *CurrentChar,
00117                 const char *Tail,
00118                 LIST Ambigs,
00119                 DANGERR *fixpt);
00120 
00121 int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice);
00122 
/* Takes void* arguments because it is passed as a callback to s_adjoin. */
00123 int CmpChoiceRatings(void *arg1,   //VIABLE_CHOICE         Choice1,
00124                      void *arg2);  //VIABLE_CHOICE         Choice2);
00125 
00126 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
00127 
00128 AMBIG_TABLE *FillAmbigTable();
00129 
/* Takes void* arguments because it is passed as a callback to delete_d. */
00130 int FreeBadChoice(void *item1,   //VIABLE_CHOICE                 Choice,
00131                   void *item2);  //EXPANDED_CHOICE                       *BestChoice);
00132 
00133 int LengthOfShortestAlphaRun(register char *Word);
00134 
00135 VIABLE_CHOICE NewViableChoice (A_CHOICE * Choice,
00136 FLOAT32 AdjustFactor, float Certainties[]);
00137 
00138 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00139 
00140 void ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
00141 A_CHOICE * NewChoice,
00142 FLOAT32 AdjustFactor, float Certainties[]);
00143 
00144 int StringSameAs(const char *String, VIABLE_CHOICE ViableChoice);
00145 
00146 int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice);
00147 
00148 /* =================
00149         Global Data Definitions and Declarations
00150  ==================== */
/* Name of the ambiguity file, appended to demodir by FillAmbigTable.
   InitStopperVars also passes it to string_variable. */
00152 static const char *DangerousAmbigs = DANGEROUS_AMBIGS;
00153 
/* When non-NULL and equal to the current best choice, DebugWordChoices
   prints the logged choices even if StopperDebugLevel is 0. */
00155 static char *WordToDebug = NULL;
00156 
/* When FALSE, LogNewRawChoice and LogNewWordChoice return immediately
   without logging anything. */
00159 BOOL8 KeepWordChoices = TRUE;
00160 
/* Subtracted from NonDictCertainty in AcceptableResult; set to 0 by
   SettupStopperPass1 and to RejectCertaintyOffset by SettupStopperPass2. */
00162 static FLOAT32 RejectOffset = 0.0;
00163 
/* Lowest-rated raw choice logged so far by LogNewRawChoice (NULL if none). */
00165 static VIABLE_CHOICE BestRawChoice = NULL;
/* Word choices logged by LogNewWordChoice, inserted via s_adjoin using
   CmpChoiceRatings; its first element is treated as the best choice. */
00166 static LIST BestChoices = NIL;
/* Blob widths recorded by LogNewSegmentation (reset to all 1s by
   InitChoiceAccum); copied into each new viable choice as NumChunks. */
00167 static PIECES_STATE CurrentSegmentation;
00168 
/* Tunable parameters of the stopper.  Each make_*_var invocation names a
   variable, its default value, a Make* function (called from
   InitStopperVars), two integers, a Set* function and a description string.
   NOTE(review): the macros are defined elsewhere; presumably they define
   the variable and generate the Make*/Set* functions, and the integer pair
   identifies a menu/slot position -- confirm against the macro definition. */
00171 make_float_var (NonDictCertainty, -2.50, MakeNonDictCertainty,
00172 17, 2, SetNonDictCertainty,
00173 "Certainty threshold for non-dict words");
00174 
00175 make_float_var (RejectCertaintyOffset, 1.0, MakeRejectCertaintyOffset,
00176 17, 3, SetRejectCertaintyOffset, "Reject certainty offset");
00177 
00178 make_int_var (SmallWordSize, 2, MakeSmallWordSize,
00179 17, 4, SetSmallWordSize,
00180 "Size of dict word to be treated as non-dict word");
00181 
00182 make_float_var (CertaintyPerChar, -0.50, MakeCertaintyPerChar,
00183 17, 5, SetCertaintyPerChar,
00184 "Certainty to add for each dict char above SmallWordSize");
00185 
00186 make_float_var (CertaintyVariation, 3.0, MakeCertaintyVariation,
00187 17, 6, SetCertaintyVariation,
00188 "Max certaintly variation allowed in a word (in sigma)");
00189 
00190 make_int_var (StopperDebugLevel, 0, MakeStopperDebugLevel,
00191 17, 7, SetStopperDebugLevel, "Stopper debug level");
00192 
00193 make_float_var (AmbigThresholdGain, 8.0, MakeAmbigThresholdGain,
00194 17, 8, SetAmbigThresholdGain,
00195 "Gain factor for ambiguity threshold");
00196 
00197 make_float_var (AmbigThresholdOffset, 1.5, MakeAmbigThresholdOffset,
00198 17, 9, SetAmbigThresholdOffset,
00199 "Certainty offset for ambiguity threshold");
00200 
00201 //extern char *demodir;
/* first_pass is declared here but not referenced in the visible part of
   this file. */
00202 extern int first_pass;
/* Upper bound on the length of BestChoices enforced at the end of
   LogNewWordChoice. */
00203 INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
00206 /* =================
00207               Public Code
00208  ==================== */
00209 /* =============================== */
00226 int AcceptableChoice(CHOICES_LIST Choices,
00227                      A_CHOICE *BestChoice,
00228                      A_CHOICE *RawChoice,
00229                      DANGERR *fixpt) {
00230   float CertaintyThreshold = NonDictCertainty;
00231   int WordSize;
00232 
00233   if (fixpt != NULL)
00234     fixpt->index = -1;
00235   if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
00236     return (FALSE);
00237 
00238   if (StopperDebugLevel >= 1)
00239     cprintf ("\nStopper:  %s (word=%c, case=%c, punct=%c)\n",
00240       class_string (BestChoice),
00241       (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
00242     (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
00243     ((punctuation_ok (class_string (BestChoice)) !=
00244     -1) ? 'y' : 'n'));
00245 
00246   if (valid_word (class_string (BestChoice)) &&
00247     case_ok (class_string (BestChoice)) &&
00248   punctuation_ok (class_string (BestChoice)) != -1) {
00249     WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
00250     WordSize -= SmallWordSize;
00251     if (WordSize < 0)
00252       WordSize = 0;
00253     CertaintyThreshold += WordSize * CertaintyPerChar;
00254   }
00255   else if (stopper_numbers_on && valid_number (class_string (BestChoice))) {
00256     CertaintyThreshold += stopper_numbers_on * CertaintyPerChar;
00257   }
00258 
00259   if (StopperDebugLevel >= 1)
00260     cprintf ("Stopper:  Certainty = %4.1f, Threshold = %4.1f\n",
00261       class_certainty (BestChoice), CertaintyThreshold);
00262 
00263   if (NoDangerousAmbig (class_string (BestChoice), fixpt)
00264     && class_certainty (BestChoice) > CertaintyThreshold &&
00265     UniformCertainties (Choices, BestChoice))
00266     return (TRUE);
00267   else
00268     return (FALSE);
00269 
00270 }                                /* AcceptableChoice */
00271 
00272 
00273 /* =============================== */
00292 int AcceptableResult(A_CHOICE *BestChoice, A_CHOICE *RawChoice) { 
00293   float CertaintyThreshold = NonDictCertainty - RejectOffset;
00294   int WordSize;
00295 
00296   if (StopperDebugLevel >= 1)
00297     cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
00298       class_string (BestChoice),
00299       (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
00300     (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
00301     ((punctuation_ok (class_string (BestChoice)) != -1) ? 'y' : 'n'),
00302     ((rest (BestChoices) != NIL) ? 'n' : 'y'));
00303 
00304   if ((BestChoice == NULL) ||
00305     (class_string (BestChoice) == NULL) || CurrentWordAmbig ())
00306     return (FALSE);
00307 
00308   if (valid_word (class_string (BestChoice)) &&
00309     case_ok (class_string (BestChoice)) &&
00310   punctuation_ok (class_string (BestChoice)) != -1) {
00311     WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
00312     WordSize -= SmallWordSize;
00313     if (WordSize < 0)
00314       WordSize = 0;
00315     CertaintyThreshold += WordSize * CertaintyPerChar;
00316   }
00317 
00318   if (StopperDebugLevel >= 1)
00319     cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ",
00320       class_certainty (BestChoice), CertaintyThreshold);
00321 
00322   if (class_certainty (BestChoice) > CertaintyThreshold) {
00323     if (StopperDebugLevel >= 1)
00324       cprintf ("ACCEPTED\n");
00325     return (TRUE);
00326   }
00327   else {
00328     if (StopperDebugLevel >= 1)
00329       cprintf ("REJECTED\n");
00330     return (FALSE);
00331   }
00332 }                                /* AcceptableResult */
00333 
00334 
00335 /* =============================== */
00347 int AlternativeChoicesWorseThan(FLOAT32 Threshold) { 
00348   LIST Alternatives;
00349   VIABLE_CHOICE Choice;
00350 
00351   Alternatives = rest (BestChoices);
00352   iterate(Alternatives) {
00353     Choice = (VIABLE_CHOICE) first (Alternatives);
00354     if (Choice->AdjustFactor <= Threshold)
00355       return (FALSE);
00356   }
00357 
00358   return (TRUE);
00359 
00360 }                                /* AlternativeChoicesWorseThan */
00361 
00362 
00363 /* =============================== */
00374 int CurrentBestChoiceIs(const char *Word) { 
00375   return (BestChoices != NIL &&
00376     StringSameAs (Word, (VIABLE_CHOICE) first (BestChoices)));
00377 
00378 }                                /* CurrentBestChoiceIs */
00379 
00380 
00381 /* =============================== */
00392 FLOAT32 CurrentBestChoiceAdjustFactor() { 
00393   VIABLE_CHOICE BestChoice;
00394 
00395   if (BestChoices == NIL)
00396     return (MAX_FLOAT32);
00397 
00398   BestChoice = (VIABLE_CHOICE) first (BestChoices);
00399   return (BestChoice->AdjustFactor);
00400 
00401 }                                /* CurrentBestChoiceAdjustFactor */
00402 
00403 
00404 /* =============================== */
00415 int CurrentWordAmbig() { 
00416   return (rest (BestChoices) != NIL);
00417 
00418 }                                /* CurrentWordAmbig */
00419 
00420 
00421 /* =============================== */
00431 void DebugWordChoices() { 
00432   LIST Choices;
00433   int i;
00434   char LabelString[80];
00435 
00436   if (StopperDebugLevel >= 1 ||
00437     WordToDebug && BestChoices &&
00438   StringSameAs (WordToDebug, (VIABLE_CHOICE) first (BestChoices))) {
00439     if (BestRawChoice)
00440       PrintViableChoice (stdout, "\nBest Raw Choice:   ", BestRawChoice);
00441 
00442     i = 1;
00443     Choices = BestChoices;
00444     if (Choices)
00445       cprintf ("\nBest Cooked Choices:\n");
00446     iterate(Choices) {
00447       sprintf (LabelString, "Cooked Choice #%d:  ", i);
00448       PrintViableChoice (stdout, LabelString,
00449         (VIABLE_CHOICE) first (Choices));
00450       i++;
00451     }
00452   }
00453 }                                /* DebugWordChoices */
00454 
00455 
00456 /* =============================== */
00467 void FilterWordChoices() { 
00468   EXPANDED_CHOICE BestChoice;
00469 
00470   if (BestChoices == NIL || second (BestChoices) == NIL)
00471     return;
00472 
00473   /* compute certainties and class for each chunk in best choice */
00474   ExpandChoice ((VIABLE_CHOICE_STRUCT *) first (BestChoices), &BestChoice);
00475 
00476   set_rest (BestChoices, delete_d (rest (BestChoices),
00477     &BestChoice, FreeBadChoice));
00478 
00479 }                                /* FilterWordChoices */
00480 
00481 
00482 /* =============================== */
00510 void
00511 FindClassifierErrors (FLOAT32 MinRating,
00512 FLOAT32 MaxRating,
00513 FLOAT32 RatingMargin, FLOAT32 Thresholds[]) {
00514   EXPANDED_CHOICE BestRaw;
00515   VIABLE_CHOICE Choice;
00516   int i, j, Chunk;
00517   FLOAT32 AvgRating;
00518   int NumErrorChunks;
00519 
00520   assert (BestChoices != NIL);
00521   assert (BestRawChoice != NULL);
00522 
00523   ExpandChoice(BestRawChoice, &BestRaw);
00524   Choice = (VIABLE_CHOICE) first (BestChoices);
00525 
00526   for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
00527     AvgRating = 0.0;
00528     NumErrorChunks = 0;
00529 
00530     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
00531     if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
00532       AvgRating += BestRaw.ChunkCertainty[Chunk];
00533       NumErrorChunks++;
00534     }
00535 
00536     if (NumErrorChunks > 0) {
00537       AvgRating /= NumErrorChunks;
00538       *Thresholds = (AvgRating / -CertaintyScale) * (1.0 - RatingMargin);
00539     }
00540     else
00541       *Thresholds = MaxRating;
00542 
00543     if (*Thresholds > MaxRating)
00544       *Thresholds = MaxRating;
00545     if (*Thresholds < MinRating)
00546       *Thresholds = MinRating;
00547   }
00548 }                                /* FindClassifierErrors */
00549 
00550 
00551 /* =============================== */
/* Registers the stopper's two string variables and calls the Make*
   function of every tunable declared with make_*_var at the top of this
   file. */
00561 void InitStopperVars() { 
  /* NOTE(review): dummy is never named below; presumably the
     string_variable macro expands to code that uses it -- confirm before
     removing it. */
00562   VALUE dummy;
00563 
00564   string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
00565   string_variable (WordToDebug, "WordToDebug", "");
00566 
00567   MakeNonDictCertainty();
00568   MakeRejectCertaintyOffset();
00569   MakeSmallWordSize();
00570   MakeCertaintyPerChar();
00571   MakeCertaintyVariation();
00572   MakeStopperDebugLevel();
00573   MakeAmbigThresholdGain();
00574   MakeAmbigThresholdOffset();
00575 }                                /* InitStopperVars */
00576 
00577 
00578 /* =============================== */
00589 void InitChoiceAccum() { 
00590   BLOB_WIDTH *BlobWidth, *End;
00591 
00592   if (BestRawChoice)
00593     memfree(BestRawChoice);
00594 
00595   if (BestChoices)
00596     destroy_nodes(BestChoices, memfree);
00597 
00598   BestRawChoice = NULL;
00599   BestChoices = NIL;
00600   EnableChoiceAccum();
00601 
00602   for (BlobWidth = CurrentSegmentation,
00603     End = CurrentSegmentation + MAX_NUM_CHUNKS;
00604     BlobWidth < End; *BlobWidth++ = 1);
00605 
00606 }                                /* InitChoiceAccum */
00607 
00608 
00609 /* =============================== */
00622 void
00623 LogNewRawChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
00624   if (!KeepWordChoices)
00625     return;
00626 
00627   if (!BestRawChoice)
00628     BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00629   else if (class_probability (Choice) < BestRawChoice->Rating) {
00630     if (ChoiceSameAs (Choice, BestRawChoice))
00631       ReplaceDuplicateChoice(BestRawChoice, Choice, AdjustFactor, Certainties);
00632     else {
00633       memfree(BestRawChoice);
00634       BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00635     }
00636   }
00637 }                                /* LogNewRawChoice */
00638 
00639 /* =============================== */
00650 void LogNewSegmentation(PIECES_STATE BlobWidth) { 
00651   BLOB_WIDTH *Segmentation;
00652 
00653   for (Segmentation = CurrentSegmentation; *BlobWidth != 0;
00654     BlobWidth++, Segmentation++)
00655   *Segmentation = *BlobWidth;
00656   *Segmentation = 0;
00657 
00658 }                                /* LogNewSegmentation */
00659 
00660 
00661 /* =============================== */
00674 void LogNewSplit(int Blob) { 
00675   LIST Choices;
00676 
00677   if (BestRawChoice) {
00678     AddNewChunk(BestRawChoice, Blob);
00679   }
00680 
00681   Choices = BestChoices;
00682   iterate(Choices) {
00683     AddNewChunk ((VIABLE_CHOICE) first (Choices), Blob);
00684   }
00685 
00686 }                                /* LogNewSplit */
00687 
00688 
00689 /* =============================== */
00705 void
00706 LogNewWordChoice (A_CHOICE * Choice,
00707 FLOAT32 AdjustFactor, float Certainties[]) {
00708   VIABLE_CHOICE NewChoice;
00709   LIST Choices;
00710   FLOAT32 Threshold;
00711 
00712   if (!KeepWordChoices)
00713     return;
00714 
00715   /* throw out obviously bad choices to save some work */
00716   if (BestChoices != NIL) {
00717     Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
00718     if (Threshold > -AmbigThresholdOffset)
00719       Threshold = -AmbigThresholdOffset;
00720     if (class_certainty (Choice) - BestCertainty (BestChoices) < Threshold)
00721       return;
00722   }
00723 
00724   /* see if a choice with the same text string has already been found */
00725   NewChoice = NULL;
00726   Choices = BestChoices;
00727   iterate(Choices) {
00728     if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first (Choices)))
00729       if (class_probability (Choice) < BestRating (Choices))
00730         NewChoice = (VIABLE_CHOICE) first (Choices);
00731     else
00732       return;
00733   }
00734 
00735   if (NewChoice) {
00736     ReplaceDuplicateChoice(NewChoice, Choice, AdjustFactor, Certainties);
00737     BestChoices = delete_d (BestChoices, NewChoice, is_same_node);
00738   }
00739   else {
00740     NewChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00741   }
00742 
00743   BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
00744   if (StopperDebugLevel >= 2)
00745     PrintViableChoice (stdout, "New Word Choice:  ", NewChoice);
00746   if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
00747     Choices =
00748       (LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
00749     destroy_nodes (rest (Choices), Efree);
00750     set_rest(Choices, NIL);
00751   }
00752 
00753 }                                /* LogNewWordChoice */
00754 
00755 
00756 /* =============================== */
00774 static AMBIG_TABLE *AmbigFor = NULL;
00775 
00776 int NoDangerousAmbig(const char *Word, DANGERR *fixpt) {
00777 
00778   char NewWord[MAX_WERD_SIZE];
00779   char *NextNewChar;
00780   int bad_index = 0;
00781 
00782   if (!AmbigFor)
00783     AmbigFor = FillAmbigTable ();
00784 
00785   NextNewChar = NewWord;
00786   while (*Word)
00787   if (AmbigsFound (NewWord, NextNewChar, Word + 1, AmbigFor[*Word], fixpt)) {
00788     if (fixpt != NULL)
00789       fixpt->index = bad_index;
00790     return (FALSE);
00791   }
00792   else {
00793     *NextNewChar++ = *Word++;
00794     bad_index++;
00795   }
00796 
00797   return (TRUE);
00798 
00799 }                                /* NoDangerousAmbig */
00800 
00801 void EndDangerousAmbigs() {
00802   if (AmbigFor != NULL) {
00803     for (int i = 0; i <= MAX_CLASS_ID; ++i) {
00804       destroy_nodes(AmbigFor[i], Efree);
00805     }
00806     Efree(AmbigFor);
00807     AmbigFor = NULL;
00808   }
00809 }
00810 
00811 /* =============================== */
00822 void SettupStopperPass1() { 
00823   RejectOffset = 0.0;
00824 }                                /* SettupStopperPass1 */
00825 
00826 
00827 /* =============================== */
00838 void SettupStopperPass2() { 
00839   RejectOffset = RejectCertaintyOffset;
00840 }                                /* SettupStopperPass2 */
00841 
00842 
00843 /* =================
00844               Private Code
00845  ==================== */
00846 /* =============================== */
00858 void AddNewChunk(VIABLE_CHOICE Choice, int Blob) { 
00859   int i, LastChunk;
00860 
00861   for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
00862     LastChunk += Choice->Blob[i].NumChunks;
00863     if (Blob < LastChunk) {
00864       (Choice->Blob[i].NumChunks)++;
00865       return;
00866     }
00867   }
00868   mem_tidy (1);
00869   cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
00870     Choice->Length, LastChunk, Blob);
00871   assert(FALSE);  /* this should never get executed */
00872 
00873 }                                /* AddNewChunk */
00874 
00875 
00876 /* =============================== */
00898 int AmbigsFound(char *Word,
00899                 char *CurrentChar,
00900                 const char *Tail,
00901                 LIST Ambigs,
00902                 DANGERR *fixpt) {
00903   char *AmbigSpec;
00904   const char *UnmatchedTail;
00905   int Matches;
00906   int bad_length;
00907 
00908   iterate(Ambigs) {
00909     AmbigSpec = (char *) first (Ambigs);
00910     bad_length = 1;
00911     UnmatchedTail = Tail;
00912     Matches = TRUE;
00913 
00914     while (*AmbigSpec != ' ' && Matches)
00915     if (*AmbigSpec == *UnmatchedTail) {
00916       AmbigSpec++;
00917       UnmatchedTail++;
00918       bad_length++;
00919     }
00920     else
00921       Matches = FALSE;
00922 
00923     if (Matches) {
00924       AmbigSpec++;               /* skip over the space */
00925                                  /* insert replacement string */
00926       strcpy(CurrentChar, AmbigSpec);
00927                                  /* add tail */
00928       strcat(Word, UnmatchedTail);
00929       if (valid_word (Word)) {
00930         if (StopperDebugLevel >= 1)
00931           cprintf ("Stopper:  Possible ambiguous word = %s\n", Word);
00932         if (fixpt != NULL) {
00933           fixpt->good_length = strlen (AmbigSpec);
00934           fixpt->bad_length = bad_length;
00935         }
00936         return (TRUE);
00937       }
00938     }
00939   }
00940   return (FALSE);
00941 
00942 }                                /* AmbigsFound */
00943 
00944 
00945 /* =============================== */
00957 int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice) { 
00958   return (StringSameAs (class_string (Choice), ViableChoice));
00959 
00960 }                                /* ChoiceSameAs */
00961 
00962 
00963 /* =============================== */
00975 int CmpChoiceRatings(void *arg1,
00976                      void *arg2) {
00977   float R1, R2;
00978   VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
00979   VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
00980 
00981   R1 = Choice1->Rating;
00982   R2 = Choice2->Rating;
00983 
00984   if (R1 < R2)
00985     return (-1);
00986   else
00987     return (1);
00988 
00989 }                                /* CmpChoiceRatings */
00990 
00991 
00992 /* =============================== */
01007 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) { 
01008   int i, j, Chunk;
01009 
01010   ExpandedChoice->Choice = Choice;
01011   for (i = 0, Chunk = 0; i < Choice->Length; i++)
01012   for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
01013     ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
01014     ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
01015   }
01016 }                                /* ExpandChoice */
01017 
01018 
01019 /* =============================== */
01040 AMBIG_TABLE *FillAmbigTable() { 
01041   FILE *AmbigFile;
01042   AMBIG_TABLE *NewTable;
01043   int i;
01044   char TestString[256];
01045   char ReplacementString[256];
01046   char name[1024];
01047   char *AmbigSpec;
01048   int AmbigSize;
01049 
01050   strcpy(name, demodir);
01051   strcat(name, DangerousAmbigs);
01052   AmbigFile = Efopen (name, "r");
01053   NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));
01054 
01055   for (i = 0; i <= MAX_CLASS_ID; i++)
01056     NewTable[i] = NIL;
01057 
01058   while (fscanf (AmbigFile, "%s", TestString) == 1 &&
01059   fscanf (AmbigFile, "%s", ReplacementString) == 1) {
01060     if (strlen (TestString) > MAX_AMBIG_SIZE ||
01061       strlen (ReplacementString) > MAX_AMBIG_SIZE)
01062       DoError (0, "Illegal ambiguity specification!");
01063 
01064     AmbigSize = strlen (TestString) + strlen (ReplacementString) + 1;
01065     AmbigSpec = (char *) Emalloc (sizeof (char) * AmbigSize);
01066 
01067     strcpy (AmbigSpec, &(TestString[1]));
01068     strcat (AmbigSpec, " ");
01069     strcat(AmbigSpec, ReplacementString);
01070     NewTable[TestString[0]] =
01071       push_last (NewTable[TestString[0]], AmbigSpec);
01072   }
01073   fclose(AmbigFile);
01074   return (NewTable);
01075 
01076 }                                /* FillAmbigTable */
01077 
01078 
01079 /* =============================== */
01094 int FreeBadChoice(void *item1,    //VIABLE_CHOICE       Choice,
01095                   void *item2) {  //EXPANDED_CHOICE     *BestChoice)
01096   int i, j, Chunk;
01097   FLOAT32 Threshold;
01098   VIABLE_CHOICE Choice;
01099   EXPANDED_CHOICE *BestChoice;
01100 
01101   Choice = (VIABLE_CHOICE) item1;
01102   BestChoice = (EXPANDED_CHOICE *) item2;
01103 
01104   Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
01105     Choice->AdjustFactor);
01106 
01107   for (i = 0, Chunk = 0; i < Choice->Length; i++)
01108     for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
01109       if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
01110     Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
01111       Threshold) {
01112         memfree(Choice);
01113     return (TRUE);
01114   }
01115 
01116   return (FALSE);
01117 
01118 }                                /* FreeBadChoice */
01119 
01120 
01121 /* =============================== */
/**
 * Return the length of the shortest run of consecutive alphabetic
 * characters in Word.
 *
 * Characters are classified with isalpha() after conversion to unsigned
 * char, as <ctype.h> requires; passing a negative plain char is undefined.
 *
 * @param Word  NUL-terminated string to scan (not modified)
 * @return length of the shortest alphabetic run, or 0 if Word contains no
 *         alphabetic character
 */
int LengthOfShortestAlphaRun(char *Word) {
  const unsigned char *Cursor = (const unsigned char *) Word;
  int Shortest = 0;
  int FoundRun = 0;
  int Length;

  while (*Cursor) {
    if (!isalpha (*Cursor)) {
      Cursor++;
      continue;
    }

    /* measure the run starting here; isalpha(0) is false, so the scan
       stops at the terminator */
    Length = 0;
    while (isalpha (*Cursor)) {
      Length++;
      Cursor++;
    }

    if (!FoundRun || Length < Shortest) {
      Shortest = Length;
      FoundRun = 1;
    }
  }

  return (Shortest);

}                                /* LengthOfShortestAlphaRun */
01150 
01151 
01152 /* =============================== */
01166 VIABLE_CHOICE
01167 NewViableChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
01168   VIABLE_CHOICE NewChoice;
01169   int Length;
01170   char *Word;
01171   CHAR_CHOICE *NewChar;
01172   BLOB_WIDTH *BlobWidth;
01173 
01174   Length = strlen (class_string (Choice));
01175   assert (Length <= MAX_NUM_CHUNKS && Length > 0);
01176 
01177   NewChoice = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) +
01178     (Length - 1) * sizeof (CHAR_CHOICE));
01179 
01180   NewChoice->Rating = class_probability (Choice);
01181   NewChoice->Certainty = class_certainty (Choice);
01182   NewChoice->AdjustFactor = AdjustFactor;
01183   NewChoice->Length = Length;
01184 
01185   for (Word = class_string (Choice),
01186     NewChar = &(NewChoice->Blob[0]),
01187     BlobWidth = CurrentSegmentation;
01188   *Word; Word++, NewChar++, Certainties++, BlobWidth++) {
01189     NewChar->Class = *Word;
01190     NewChar->NumChunks = *BlobWidth;
01191     NewChar->Certainty = *Certainties;
01192   }
01193 
01194   return (NewChoice);
01195 
01196 }                                /* NewViableChoice */
01197 
01198 
01199 /* =============================== */
01211 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) { 
01212   int i, j;
01213 
01214   fprintf (File, "%s", Label);
01215 
01216   fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f)  ",
01217     Choice->Rating, Choice->Certainty, Choice->AdjustFactor);
01218 
01219   for (i = 0; i < Choice->Length; i++)
01220     fprintf (File, "%c", Choice->Blob[i].Class);
01221   fprintf (File, "\n");
01222 
01223   for (i = 0; i < Choice->Length; i++) {
01224     fprintf (File, "  %c", Choice->Blob[i].Class);
01225     for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
01226       fprintf (File, "   ");
01227   }
01228   fprintf (File, "\n");
01229 
01230   for (i = 0; i < Choice->Length; i++) {
01231     for (j = 0; j < Choice->Blob[i].NumChunks; j++)
01232       fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0));
01233   }
01234   fprintf (File, "\n");
01235 
01236 }                                /* PrintViableChoice */
01237 
01238 
01239 /* =============================== */
01257 void
01258 ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
01259 A_CHOICE * NewChoice,
01260 FLOAT32 AdjustFactor, float Certainties[]) {
01261   char *Word;
01262   CHAR_CHOICE *NewChar;
01263   BLOB_WIDTH *BlobWidth;
01264 
01265   OldChoice->Rating = class_probability (NewChoice);
01266   OldChoice->Certainty = class_certainty (NewChoice);
01267   OldChoice->AdjustFactor = AdjustFactor;
01268 
01269   for (Word = class_string (NewChoice),
01270     NewChar = &(OldChoice->Blob[0]),
01271     BlobWidth = CurrentSegmentation;
01272   *Word; Word++, NewChar++, Certainties++, BlobWidth++) {
01273     NewChar->NumChunks = *BlobWidth;
01274     NewChar->Certainty = *Certainties;
01275   }
01276 }                                /* ReplaceDuplicateChoice */
01277 
01278 
01279 /* =============================== */
01291 int StringSameAs(const char *String, VIABLE_CHOICE ViableChoice) { 
01292   CHAR_CHOICE *Char;
01293   int i;
01294 
01295   for (Char = &(ViableChoice->Blob[0]), i = 0;
01296     i < ViableChoice->Length; String++, Char++, i++)
01297   if (*String != Char->Class)
01298     return (FALSE);
01299 
01300   if (*String == 0)
01301     return (TRUE);
01302   else
01303     return (FALSE);
01304 
01305 }                                /* StringSameAs */
01306 
01307 
01308 /* =============================== */
01327 int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice) { 
01328   int i;
01329   CHOICES CharChoices;
01330   float Certainty;
01331   float WorstCertainty = MAX_FLOAT32;
01332   float CertaintyThreshold;
01333   FLOAT64 TotalCertainty;
01334   FLOAT64 TotalCertaintySquared;
01335   FLOAT64 Variance;
01336   FLOAT32 Mean, StdDev;
01337   int WordLength;
01338 
01339   WordLength = array_count (Choices);
01340   if (WordLength < 3)
01341     return (TRUE);
01342 
01343   TotalCertainty = TotalCertaintySquared = 0.0;
01344   for_each_choice(Choices, i) {
01345     CharChoices = (CHOICES) array_index (Choices, i);
01346     Certainty = best_certainty (CharChoices);
01347     TotalCertainty += Certainty;
01348     TotalCertaintySquared += Certainty * Certainty;
01349     if (Certainty < WorstCertainty)
01350       WorstCertainty = Certainty;
01351   }
01352 
01353   /* subtract off worst certainty from statistics */
01354   WordLength--;
01355   TotalCertainty -= WorstCertainty;
01356   TotalCertaintySquared -= WorstCertainty * WorstCertainty;
01357 
01358   Mean = TotalCertainty / WordLength;
01359   Variance = ((WordLength * TotalCertaintySquared -
01360     TotalCertainty * TotalCertainty) /
01361     (WordLength * (WordLength - 1)));
01362   if (Variance < 0.0)
01363     Variance = 0.0;
01364   StdDev = sqrt (Variance);
01365 
01366   CertaintyThreshold = Mean - CertaintyVariation * StdDev;
01367   if (CertaintyThreshold > NonDictCertainty)
01368     CertaintyThreshold = NonDictCertainty;
01369 
01370   if (class_certainty (BestChoice) < CertaintyThreshold) {
01371     if (StopperDebugLevel >= 1)
01372       cprintf
01373         ("Stopper:  Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n",
01374         class_certainty (BestChoice), Mean, StdDev, CertaintyThreshold);
01375     return (FALSE);
01376   }
01377   else
01378     return (TRUE);
01379 
01380 }                                /* UniformCertainties */

Generated on Wed Feb 28 19:49:10 2007 for Tesseract by  doxygen 1.5.1