00001
00019
00020
00021
00022 #include "stopper.h"
00023 #include "emalloc.h"
00024 #include "matchdefs.h"
00025 #include "debug.h"
00026 #include "callcpp.h"
00027 #include "permute.h"
00028 #include "context.h"
00029 #include "permnum.h"
00030 #include "danerror.h"
00031 #include "const.h"
00032 #include "freelist.h"
00033 #include "efio.h"
00034 #include "globals.h"
00035 #include "scanutils.h"
00036
00037 #include <stdio.h>
00038 #include <string.h>
00039 #include <ctype.h>
00040 #include <math.h>
00041 #ifdef __UNIX__
00042 #include <assert.h>
00043 #endif
00044
00045
/* Defined elsewhere; FindClassifierErrors() divides an average certainty by
   its negation when deriving rating thresholds. */
extern float CertaintyScale;

/* Size of the stack buffer NoDangerousAmbig() uses to build candidate words. */
#define MAX_WERD_SIZE     100
/* Longest test or replacement string FillAmbigTable() accepts without
   reporting an error. */
#define MAX_AMBIG_SIZE    3

/* Default ambiguity-file path; FillAmbigTable() appends it to demodir. */
#define DANGEROUS_AMBIGS  "tessdata/DangAmbigs"
00052
00057 typedef LIST AMBIG_TABLE;
00058
00063 typedef struct
00064 {
00065 UINT8 Class;
00066 UINT16 NumChunks;
00067 float Certainty;
00068 } CHAR_CHOICE;
00069
00074 typedef struct
00075 {
00076 float Rating;
00077 float Certainty;
00078 FLOAT32 AdjustFactor;
00079 int Length;
00080 CHAR_CHOICE Blob[1];
00081 } VIABLE_CHOICE_STRUCT;
00082
00087 typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
00088
00093 typedef struct
00094 {
00095 VIABLE_CHOICE Choice;
00096 float ChunkCertainty[MAX_NUM_CHUNKS];
00097 UINT8 ChunkClass[MAX_NUM_CHUNKS];
00098 } EXPANDED_CHOICE;
00099
00100
00101
00102
/* Field accessors for the first element of a list of viable choices. */
#define BestCertainty(Choices) \
  (((VIABLE_CHOICE) first (Choices))->Certainty)
#define BestRating(Choices) \
  (((VIABLE_CHOICE) first (Choices))->Rating)
#define BestFactor(Choices) \
  (((VIABLE_CHOICE) first (Choices))->AdjustFactor)

/* Certainty-difference threshold derived from two adjust factors:
   (F2 - F1) scaled by AmbigThresholdGain, minus AmbigThresholdOffset. */
#define AmbigThreshold(F1,F2) \
  (((F2) - (F1)) * AmbigThresholdGain - AmbigThresholdOffset)
00109
00110
00111
00112
00113 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
00114
00115 int AmbigsFound(char *Word,
00116 char *CurrentChar,
00117 const char *Tail,
00118 LIST Ambigs,
00119 DANGERR *fixpt);
00120
00121 int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice);
00122
00123 int CmpChoiceRatings(void *arg1,
00124 void *arg2);
00125
00126 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice);
00127
00128 AMBIG_TABLE *FillAmbigTable();
00129
00130 int FreeBadChoice(void *item1,
00131 void *item2);
00132
00133 int LengthOfShortestAlphaRun(register char *Word);
00134
00135 VIABLE_CHOICE NewViableChoice (A_CHOICE * Choice,
00136 FLOAT32 AdjustFactor, float Certainties[]);
00137
00138 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00139
00140 void ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
00141 A_CHOICE * NewChoice,
00142 FLOAT32 AdjustFactor, float Certainties[]);
00143
00144 int StringSameAs(const char *String, VIABLE_CHOICE ViableChoice);
00145
00146 int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice);
00147
00148
00149
00150
00152 static const char *DangerousAmbigs = DANGEROUS_AMBIGS;
00153
00155 static char *WordToDebug = NULL;
00156
00159 BOOL8 KeepWordChoices = TRUE;
00160
00162 static FLOAT32 RejectOffset = 0.0;
00163
00165 static VIABLE_CHOICE BestRawChoice = NULL;
00166 static LIST BestChoices = NIL;
00167 static PIECES_STATE CurrentSegmentation;
00168
00171 make_float_var (NonDictCertainty, -2.50, MakeNonDictCertainty,
00172 17, 2, SetNonDictCertainty,
00173 "Certainty threshold for non-dict words");
00174
00175 make_float_var (RejectCertaintyOffset, 1.0, MakeRejectCertaintyOffset,
00176 17, 3, SetRejectCertaintyOffset, "Reject certainty offset");
00177
00178 make_int_var (SmallWordSize, 2, MakeSmallWordSize,
00179 17, 4, SetSmallWordSize,
00180 "Size of dict word to be treated as non-dict word");
00181
00182 make_float_var (CertaintyPerChar, -0.50, MakeCertaintyPerChar,
00183 17, 5, SetCertaintyPerChar,
00184 "Certainty to add for each dict char above SmallWordSize");
00185
00186 make_float_var (CertaintyVariation, 3.0, MakeCertaintyVariation,
00187 17, 6, SetCertaintyVariation,
00188 "Max certaintly variation allowed in a word (in sigma)");
00189
00190 make_int_var (StopperDebugLevel, 0, MakeStopperDebugLevel,
00191 17, 7, SetStopperDebugLevel, "Stopper debug level");
00192
00193 make_float_var (AmbigThresholdGain, 8.0, MakeAmbigThresholdGain,
00194 17, 8, SetAmbigThresholdGain,
00195 "Gain factor for ambiguity threshold");
00196
00197 make_float_var (AmbigThresholdOffset, 1.5, MakeAmbigThresholdOffset,
00198 17, 9, SetAmbigThresholdOffset,
00199 "Certainty offset for ambiguity threshold");
00200
00201
00202 extern int first_pass;
00203 INT_VAR (tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
00206
00207
00208
00209
00226 int AcceptableChoice(CHOICES_LIST Choices,
00227 A_CHOICE *BestChoice,
00228 A_CHOICE *RawChoice,
00229 DANGERR *fixpt) {
00230 float CertaintyThreshold = NonDictCertainty;
00231 int WordSize;
00232
00233 if (fixpt != NULL)
00234 fixpt->index = -1;
00235 if ((BestChoice == NULL) || (class_string (BestChoice) == NULL))
00236 return (FALSE);
00237
00238 if (StopperDebugLevel >= 1)
00239 cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n",
00240 class_string (BestChoice),
00241 (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
00242 (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
00243 ((punctuation_ok (class_string (BestChoice)) !=
00244 -1) ? 'y' : 'n'));
00245
00246 if (valid_word (class_string (BestChoice)) &&
00247 case_ok (class_string (BestChoice)) &&
00248 punctuation_ok (class_string (BestChoice)) != -1) {
00249 WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
00250 WordSize -= SmallWordSize;
00251 if (WordSize < 0)
00252 WordSize = 0;
00253 CertaintyThreshold += WordSize * CertaintyPerChar;
00254 }
00255 else if (stopper_numbers_on && valid_number (class_string (BestChoice))) {
00256 CertaintyThreshold += stopper_numbers_on * CertaintyPerChar;
00257 }
00258
00259 if (StopperDebugLevel >= 1)
00260 cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
00261 class_certainty (BestChoice), CertaintyThreshold);
00262
00263 if (NoDangerousAmbig (class_string (BestChoice), fixpt)
00264 && class_certainty (BestChoice) > CertaintyThreshold &&
00265 UniformCertainties (Choices, BestChoice))
00266 return (TRUE);
00267 else
00268 return (FALSE);
00269
00270 }
00271
00272
00273
00292 int AcceptableResult(A_CHOICE *BestChoice, A_CHOICE *RawChoice) {
00293 float CertaintyThreshold = NonDictCertainty - RejectOffset;
00294 int WordSize;
00295
00296 if (StopperDebugLevel >= 1)
00297 cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n",
00298 class_string (BestChoice),
00299 (valid_word (class_string (BestChoice)) ? 'y' : 'n'),
00300 (case_ok (class_string (BestChoice)) ? 'y' : 'n'),
00301 ((punctuation_ok (class_string (BestChoice)) != -1) ? 'y' : 'n'),
00302 ((rest (BestChoices) != NIL) ? 'n' : 'y'));
00303
00304 if ((BestChoice == NULL) ||
00305 (class_string (BestChoice) == NULL) || CurrentWordAmbig ())
00306 return (FALSE);
00307
00308 if (valid_word (class_string (BestChoice)) &&
00309 case_ok (class_string (BestChoice)) &&
00310 punctuation_ok (class_string (BestChoice)) != -1) {
00311 WordSize = LengthOfShortestAlphaRun (class_string (BestChoice));
00312 WordSize -= SmallWordSize;
00313 if (WordSize < 0)
00314 WordSize = 0;
00315 CertaintyThreshold += WordSize * CertaintyPerChar;
00316 }
00317
00318 if (StopperDebugLevel >= 1)
00319 cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
00320 class_certainty (BestChoice), CertaintyThreshold);
00321
00322 if (class_certainty (BestChoice) > CertaintyThreshold) {
00323 if (StopperDebugLevel >= 1)
00324 cprintf ("ACCEPTED\n");
00325 return (TRUE);
00326 }
00327 else {
00328 if (StopperDebugLevel >= 1)
00329 cprintf ("REJECTED\n");
00330 return (FALSE);
00331 }
00332 }
00333
00334
00335
00347 int AlternativeChoicesWorseThan(FLOAT32 Threshold) {
00348 LIST Alternatives;
00349 VIABLE_CHOICE Choice;
00350
00351 Alternatives = rest (BestChoices);
00352 iterate(Alternatives) {
00353 Choice = (VIABLE_CHOICE) first (Alternatives);
00354 if (Choice->AdjustFactor <= Threshold)
00355 return (FALSE);
00356 }
00357
00358 return (TRUE);
00359
00360 }
00361
00362
00363
00374 int CurrentBestChoiceIs(const char *Word) {
00375 return (BestChoices != NIL &&
00376 StringSameAs (Word, (VIABLE_CHOICE) first (BestChoices)));
00377
00378 }
00379
00380
00381
00392 FLOAT32 CurrentBestChoiceAdjustFactor() {
00393 VIABLE_CHOICE BestChoice;
00394
00395 if (BestChoices == NIL)
00396 return (MAX_FLOAT32);
00397
00398 BestChoice = (VIABLE_CHOICE) first (BestChoices);
00399 return (BestChoice->AdjustFactor);
00400
00401 }
00402
00403
00404
00415 int CurrentWordAmbig() {
00416 return (rest (BestChoices) != NIL);
00417
00418 }
00419
00420
00421
00431 void DebugWordChoices() {
00432 LIST Choices;
00433 int i;
00434 char LabelString[80];
00435
00436 if (StopperDebugLevel >= 1 ||
00437 WordToDebug && BestChoices &&
00438 StringSameAs (WordToDebug, (VIABLE_CHOICE) first (BestChoices))) {
00439 if (BestRawChoice)
00440 PrintViableChoice (stdout, "\nBest Raw Choice: ", BestRawChoice);
00441
00442 i = 1;
00443 Choices = BestChoices;
00444 if (Choices)
00445 cprintf ("\nBest Cooked Choices:\n");
00446 iterate(Choices) {
00447 sprintf (LabelString, "Cooked Choice #%d: ", i);
00448 PrintViableChoice (stdout, LabelString,
00449 (VIABLE_CHOICE) first (Choices));
00450 i++;
00451 }
00452 }
00453 }
00454
00455
00456
00467 void FilterWordChoices() {
00468 EXPANDED_CHOICE BestChoice;
00469
00470 if (BestChoices == NIL || second (BestChoices) == NIL)
00471 return;
00472
00473
00474 ExpandChoice ((VIABLE_CHOICE_STRUCT *) first (BestChoices), &BestChoice);
00475
00476 set_rest (BestChoices, delete_d (rest (BestChoices),
00477 &BestChoice, FreeBadChoice));
00478
00479 }
00480
00481
00482
00510 void
00511 FindClassifierErrors (FLOAT32 MinRating,
00512 FLOAT32 MaxRating,
00513 FLOAT32 RatingMargin, FLOAT32 Thresholds[]) {
00514 EXPANDED_CHOICE BestRaw;
00515 VIABLE_CHOICE Choice;
00516 int i, j, Chunk;
00517 FLOAT32 AvgRating;
00518 int NumErrorChunks;
00519
00520 assert (BestChoices != NIL);
00521 assert (BestRawChoice != NULL);
00522
00523 ExpandChoice(BestRawChoice, &BestRaw);
00524 Choice = (VIABLE_CHOICE) first (BestChoices);
00525
00526 for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
00527 AvgRating = 0.0;
00528 NumErrorChunks = 0;
00529
00530 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
00531 if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
00532 AvgRating += BestRaw.ChunkCertainty[Chunk];
00533 NumErrorChunks++;
00534 }
00535
00536 if (NumErrorChunks > 0) {
00537 AvgRating /= NumErrorChunks;
00538 *Thresholds = (AvgRating / -CertaintyScale) * (1.0 - RatingMargin);
00539 }
00540 else
00541 *Thresholds = MaxRating;
00542
00543 if (*Thresholds > MaxRating)
00544 *Thresholds = MaxRating;
00545 if (*Thresholds < MinRating)
00546 *Thresholds = MinRating;
00547 }
00548 }
00549
00550
00551
00561 void InitStopperVars() {
00562 VALUE dummy;
00563
00564 string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS);
00565 string_variable (WordToDebug, "WordToDebug", "");
00566
00567 MakeNonDictCertainty();
00568 MakeRejectCertaintyOffset();
00569 MakeSmallWordSize();
00570 MakeCertaintyPerChar();
00571 MakeCertaintyVariation();
00572 MakeStopperDebugLevel();
00573 MakeAmbigThresholdGain();
00574 MakeAmbigThresholdOffset();
00575 }
00576
00577
00578
00589 void InitChoiceAccum() {
00590 BLOB_WIDTH *BlobWidth, *End;
00591
00592 if (BestRawChoice)
00593 memfree(BestRawChoice);
00594
00595 if (BestChoices)
00596 destroy_nodes(BestChoices, memfree);
00597
00598 BestRawChoice = NULL;
00599 BestChoices = NIL;
00600 EnableChoiceAccum();
00601
00602 for (BlobWidth = CurrentSegmentation,
00603 End = CurrentSegmentation + MAX_NUM_CHUNKS;
00604 BlobWidth < End; *BlobWidth++ = 1);
00605
00606 }
00607
00608
00609
00622 void
00623 LogNewRawChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
00624 if (!KeepWordChoices)
00625 return;
00626
00627 if (!BestRawChoice)
00628 BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00629 else if (class_probability (Choice) < BestRawChoice->Rating) {
00630 if (ChoiceSameAs (Choice, BestRawChoice))
00631 ReplaceDuplicateChoice(BestRawChoice, Choice, AdjustFactor, Certainties);
00632 else {
00633 memfree(BestRawChoice);
00634 BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00635 }
00636 }
00637 }
00638
00639
00650 void LogNewSegmentation(PIECES_STATE BlobWidth) {
00651 BLOB_WIDTH *Segmentation;
00652
00653 for (Segmentation = CurrentSegmentation; *BlobWidth != 0;
00654 BlobWidth++, Segmentation++)
00655 *Segmentation = *BlobWidth;
00656 *Segmentation = 0;
00657
00658 }
00659
00660
00661
00674 void LogNewSplit(int Blob) {
00675 LIST Choices;
00676
00677 if (BestRawChoice) {
00678 AddNewChunk(BestRawChoice, Blob);
00679 }
00680
00681 Choices = BestChoices;
00682 iterate(Choices) {
00683 AddNewChunk ((VIABLE_CHOICE) first (Choices), Blob);
00684 }
00685
00686 }
00687
00688
00689
00705 void
00706 LogNewWordChoice (A_CHOICE * Choice,
00707 FLOAT32 AdjustFactor, float Certainties[]) {
00708 VIABLE_CHOICE NewChoice;
00709 LIST Choices;
00710 FLOAT32 Threshold;
00711
00712 if (!KeepWordChoices)
00713 return;
00714
00715
00716 if (BestChoices != NIL) {
00717 Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor);
00718 if (Threshold > -AmbigThresholdOffset)
00719 Threshold = -AmbigThresholdOffset;
00720 if (class_certainty (Choice) - BestCertainty (BestChoices) < Threshold)
00721 return;
00722 }
00723
00724
00725 NewChoice = NULL;
00726 Choices = BestChoices;
00727 iterate(Choices) {
00728 if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first (Choices)))
00729 if (class_probability (Choice) < BestRating (Choices))
00730 NewChoice = (VIABLE_CHOICE) first (Choices);
00731 else
00732 return;
00733 }
00734
00735 if (NewChoice) {
00736 ReplaceDuplicateChoice(NewChoice, Choice, AdjustFactor, Certainties);
00737 BestChoices = delete_d (BestChoices, NewChoice, is_same_node);
00738 }
00739 else {
00740 NewChoice = NewViableChoice (Choice, AdjustFactor, Certainties);
00741 }
00742
00743 BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings);
00744 if (StopperDebugLevel >= 2)
00745 PrintViableChoice (stdout, "New Word Choice: ", NewChoice);
00746 if (count (BestChoices) > tessedit_truncate_wordchoice_log) {
00747 Choices =
00748 (LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log);
00749 destroy_nodes (rest (Choices), Efree);
00750 set_rest(Choices, NIL);
00751 }
00752
00753 }
00754
00755
00756
00774 static AMBIG_TABLE *AmbigFor = NULL;
00775
00776 int NoDangerousAmbig(const char *Word, DANGERR *fixpt) {
00777
00778 char NewWord[MAX_WERD_SIZE];
00779 char *NextNewChar;
00780 int bad_index = 0;
00781
00782 if (!AmbigFor)
00783 AmbigFor = FillAmbigTable ();
00784
00785 NextNewChar = NewWord;
00786 while (*Word)
00787 if (AmbigsFound (NewWord, NextNewChar, Word + 1, AmbigFor[*Word], fixpt)) {
00788 if (fixpt != NULL)
00789 fixpt->index = bad_index;
00790 return (FALSE);
00791 }
00792 else {
00793 *NextNewChar++ = *Word++;
00794 bad_index++;
00795 }
00796
00797 return (TRUE);
00798
00799 }
00800
00801 void EndDangerousAmbigs() {
00802 if (AmbigFor != NULL) {
00803 for (int i = 0; i <= MAX_CLASS_ID; ++i) {
00804 destroy_nodes(AmbigFor[i], Efree);
00805 }
00806 Efree(AmbigFor);
00807 AmbigFor = NULL;
00808 }
00809 }
00810
00811
00822 void SettupStopperPass1() {
00823 RejectOffset = 0.0;
00824 }
00825
00826
00827
00838 void SettupStopperPass2() {
00839 RejectOffset = RejectCertaintyOffset;
00840 }
00841
00842
00843
00844
00845
00846
00858 void AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
00859 int i, LastChunk;
00860
00861 for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
00862 LastChunk += Choice->Blob[i].NumChunks;
00863 if (Blob < LastChunk) {
00864 (Choice->Blob[i].NumChunks)++;
00865 return;
00866 }
00867 }
00868 mem_tidy (1);
00869 cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
00870 Choice->Length, LastChunk, Blob);
00871 assert(FALSE);
00872
00873 }
00874
00875
00876
00898 int AmbigsFound(char *Word,
00899 char *CurrentChar,
00900 const char *Tail,
00901 LIST Ambigs,
00902 DANGERR *fixpt) {
00903 char *AmbigSpec;
00904 const char *UnmatchedTail;
00905 int Matches;
00906 int bad_length;
00907
00908 iterate(Ambigs) {
00909 AmbigSpec = (char *) first (Ambigs);
00910 bad_length = 1;
00911 UnmatchedTail = Tail;
00912 Matches = TRUE;
00913
00914 while (*AmbigSpec != ' ' && Matches)
00915 if (*AmbigSpec == *UnmatchedTail) {
00916 AmbigSpec++;
00917 UnmatchedTail++;
00918 bad_length++;
00919 }
00920 else
00921 Matches = FALSE;
00922
00923 if (Matches) {
00924 AmbigSpec++;
00925
00926 strcpy(CurrentChar, AmbigSpec);
00927
00928 strcat(Word, UnmatchedTail);
00929 if (valid_word (Word)) {
00930 if (StopperDebugLevel >= 1)
00931 cprintf ("Stopper: Possible ambiguous word = %s\n", Word);
00932 if (fixpt != NULL) {
00933 fixpt->good_length = strlen (AmbigSpec);
00934 fixpt->bad_length = bad_length;
00935 }
00936 return (TRUE);
00937 }
00938 }
00939 }
00940 return (FALSE);
00941
00942 }
00943
00944
00945
00957 int ChoiceSameAs(A_CHOICE *Choice, VIABLE_CHOICE ViableChoice) {
00958 return (StringSameAs (class_string (Choice), ViableChoice));
00959
00960 }
00961
00962
00963
00975 int CmpChoiceRatings(void *arg1,
00976 void *arg2) {
00977 float R1, R2;
00978 VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
00979 VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
00980
00981 R1 = Choice1->Rating;
00982 R2 = Choice2->Rating;
00983
00984 if (R1 < R2)
00985 return (-1);
00986 else
00987 return (1);
00988
00989 }
00990
00991
00992
01007 void ExpandChoice(VIABLE_CHOICE Choice, EXPANDED_CHOICE *ExpandedChoice) {
01008 int i, j, Chunk;
01009
01010 ExpandedChoice->Choice = Choice;
01011 for (i = 0, Chunk = 0; i < Choice->Length; i++)
01012 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
01013 ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
01014 ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
01015 }
01016 }
01017
01018
01019
01040 AMBIG_TABLE *FillAmbigTable() {
01041 FILE *AmbigFile;
01042 AMBIG_TABLE *NewTable;
01043 int i;
01044 char TestString[256];
01045 char ReplacementString[256];
01046 char name[1024];
01047 char *AmbigSpec;
01048 int AmbigSize;
01049
01050 strcpy(name, demodir);
01051 strcat(name, DangerousAmbigs);
01052 AmbigFile = Efopen (name, "r");
01053 NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1));
01054
01055 for (i = 0; i <= MAX_CLASS_ID; i++)
01056 NewTable[i] = NIL;
01057
01058 while (fscanf (AmbigFile, "%s", TestString) == 1 &&
01059 fscanf (AmbigFile, "%s", ReplacementString) == 1) {
01060 if (strlen (TestString) > MAX_AMBIG_SIZE ||
01061 strlen (ReplacementString) > MAX_AMBIG_SIZE)
01062 DoError (0, "Illegal ambiguity specification!");
01063
01064 AmbigSize = strlen (TestString) + strlen (ReplacementString) + 1;
01065 AmbigSpec = (char *) Emalloc (sizeof (char) * AmbigSize);
01066
01067 strcpy (AmbigSpec, &(TestString[1]));
01068 strcat (AmbigSpec, " ");
01069 strcat(AmbigSpec, ReplacementString);
01070 NewTable[TestString[0]] =
01071 push_last (NewTable[TestString[0]], AmbigSpec);
01072 }
01073 fclose(AmbigFile);
01074 return (NewTable);
01075
01076 }
01077
01078
01079
01094 int FreeBadChoice(void *item1,
01095 void *item2) {
01096 int i, j, Chunk;
01097 FLOAT32 Threshold;
01098 VIABLE_CHOICE Choice;
01099 EXPANDED_CHOICE *BestChoice;
01100
01101 Choice = (VIABLE_CHOICE) item1;
01102 BestChoice = (EXPANDED_CHOICE *) item2;
01103
01104 Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor,
01105 Choice->AdjustFactor);
01106
01107 for (i = 0, Chunk = 0; i < Choice->Length; i++)
01108 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++)
01109 if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
01110 Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
01111 Threshold) {
01112 memfree(Choice);
01113 return (TRUE);
01114 }
01115
01116 return (FALSE);
01117
01118 }
01119
01120
01121
/*
 * LengthOfShortestAlphaRun
 *
 * Word  NUL-terminated string to scan
 *
 * Returns the length of the shortest maximal run of alphabetic characters
 * (as judged by isalpha()) in Word, or 0 when Word contains none.
 */
int LengthOfShortestAlphaRun(register char *Word) {
  int Shortest = 0;             /* 0 means no run has been seen yet */
  int Length;

  while (*Word) {
    /* isalpha() is only defined for unsigned char values and EOF, so a
       plain char (negative for bytes >= 0x80 where char is signed) must
       be converted before the call. */
    Length = 0;
    while (isalpha ((unsigned char) *Word)) {
      Word++;
      Length++;
    }

    if (Length > 0) {
      if (Shortest == 0 || Length < Shortest)
        Shortest = Length;
    }
    else {
      Word++;                   /* skip a non-alphabetic character */
    }
  }

  return (Shortest);
}
01150
01151
01152
01166 VIABLE_CHOICE
01167 NewViableChoice (A_CHOICE * Choice, FLOAT32 AdjustFactor, float Certainties[]) {
01168 VIABLE_CHOICE NewChoice;
01169 int Length;
01170 char *Word;
01171 CHAR_CHOICE *NewChar;
01172 BLOB_WIDTH *BlobWidth;
01173
01174 Length = strlen (class_string (Choice));
01175 assert (Length <= MAX_NUM_CHUNKS && Length > 0);
01176
01177 NewChoice = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) +
01178 (Length - 1) * sizeof (CHAR_CHOICE));
01179
01180 NewChoice->Rating = class_probability (Choice);
01181 NewChoice->Certainty = class_certainty (Choice);
01182 NewChoice->AdjustFactor = AdjustFactor;
01183 NewChoice->Length = Length;
01184
01185 for (Word = class_string (Choice),
01186 NewChar = &(NewChoice->Blob[0]),
01187 BlobWidth = CurrentSegmentation;
01188 *Word; Word++, NewChar++, Certainties++, BlobWidth++) {
01189 NewChar->Class = *Word;
01190 NewChar->NumChunks = *BlobWidth;
01191 NewChar->Certainty = *Certainties;
01192 }
01193
01194 return (NewChoice);
01195
01196 }
01197
01198
01199
01211 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
01212 int i, j;
01213
01214 fprintf (File, "%s", Label);
01215
01216 fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f) ",
01217 Choice->Rating, Choice->Certainty, Choice->AdjustFactor);
01218
01219 for (i = 0; i < Choice->Length; i++)
01220 fprintf (File, "%c", Choice->Blob[i].Class);
01221 fprintf (File, "\n");
01222
01223 for (i = 0; i < Choice->Length; i++) {
01224 fprintf (File, " %c", Choice->Blob[i].Class);
01225 for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
01226 fprintf (File, " ");
01227 }
01228 fprintf (File, "\n");
01229
01230 for (i = 0; i < Choice->Length; i++) {
01231 for (j = 0; j < Choice->Blob[i].NumChunks; j++)
01232 fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0));
01233 }
01234 fprintf (File, "\n");
01235
01236 }
01237
01238
01239
01257 void
01258 ReplaceDuplicateChoice (VIABLE_CHOICE OldChoice,
01259 A_CHOICE * NewChoice,
01260 FLOAT32 AdjustFactor, float Certainties[]) {
01261 char *Word;
01262 CHAR_CHOICE *NewChar;
01263 BLOB_WIDTH *BlobWidth;
01264
01265 OldChoice->Rating = class_probability (NewChoice);
01266 OldChoice->Certainty = class_certainty (NewChoice);
01267 OldChoice->AdjustFactor = AdjustFactor;
01268
01269 for (Word = class_string (NewChoice),
01270 NewChar = &(OldChoice->Blob[0]),
01271 BlobWidth = CurrentSegmentation;
01272 *Word; Word++, NewChar++, Certainties++, BlobWidth++) {
01273 NewChar->NumChunks = *BlobWidth;
01274 NewChar->Certainty = *Certainties;
01275 }
01276 }
01277
01278
01279
01291 int StringSameAs(const char *String, VIABLE_CHOICE ViableChoice) {
01292 CHAR_CHOICE *Char;
01293 int i;
01294
01295 for (Char = &(ViableChoice->Blob[0]), i = 0;
01296 i < ViableChoice->Length; String++, Char++, i++)
01297 if (*String != Char->Class)
01298 return (FALSE);
01299
01300 if (*String == 0)
01301 return (TRUE);
01302 else
01303 return (FALSE);
01304
01305 }
01306
01307
01308
01327 int UniformCertainties(CHOICES_LIST Choices, A_CHOICE *BestChoice) {
01328 int i;
01329 CHOICES CharChoices;
01330 float Certainty;
01331 float WorstCertainty = MAX_FLOAT32;
01332 float CertaintyThreshold;
01333 FLOAT64 TotalCertainty;
01334 FLOAT64 TotalCertaintySquared;
01335 FLOAT64 Variance;
01336 FLOAT32 Mean, StdDev;
01337 int WordLength;
01338
01339 WordLength = array_count (Choices);
01340 if (WordLength < 3)
01341 return (TRUE);
01342
01343 TotalCertainty = TotalCertaintySquared = 0.0;
01344 for_each_choice(Choices, i) {
01345 CharChoices = (CHOICES) array_index (Choices, i);
01346 Certainty = best_certainty (CharChoices);
01347 TotalCertainty += Certainty;
01348 TotalCertaintySquared += Certainty * Certainty;
01349 if (Certainty < WorstCertainty)
01350 WorstCertainty = Certainty;
01351 }
01352
01353
01354 WordLength--;
01355 TotalCertainty -= WorstCertainty;
01356 TotalCertaintySquared -= WorstCertainty * WorstCertainty;
01357
01358 Mean = TotalCertainty / WordLength;
01359 Variance = ((WordLength * TotalCertaintySquared -
01360 TotalCertainty * TotalCertainty) /
01361 (WordLength * (WordLength - 1)));
01362 if (Variance < 0.0)
01363 Variance = 0.0;
01364 StdDev = sqrt (Variance);
01365
01366 CertaintyThreshold = Mean - CertaintyVariation * StdDev;
01367 if (CertaintyThreshold > NonDictCertainty)
01368 CertaintyThreshold = NonDictCertainty;
01369
01370 if (class_certainty (BestChoice) < CertaintyThreshold) {
01371 if (StopperDebugLevel >= 1)
01372 cprintf
01373 ("Stopper: Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n",
01374 class_certainty (BestChoice), Mean, StdDev, CertaintyThreshold);
01375 return (FALSE);
01376 }
01377 else
01378 return (TRUE);
01379
01380 }