#include "stopper.h"
#include "emalloc.h"
#include "matchdefs.h"
#include "debug.h"
#include "callcpp.h"
#include "permute.h"
#include "context.h"
#include "permnum.h"
#include "danerror.h"
#include "const.h"
#include "freelist.h"
#include "efio.h"
#include "globals.h"
#include "scanutils.h"
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
Go to the source code of this file.
#define AmbigThreshold | ( | F1, | |||
F2 | ) |
Value:
(((F2) - (F1)) * AmbigThresholdGain - AmbigThresholdOffset)
Definition at line 107 of file stopper.cpp.
Referenced by FreeBadChoice(), and LogNewWordChoice().
#define BestCertainty | ( | Choices | ) | (((VIABLE_CHOICE) first (Choices))->Certainty) |
#define BestFactor | ( | Choices | ) | (((VIABLE_CHOICE) first (Choices))->AdjustFactor) |
#define BestRating | ( | Choices | ) | (((VIABLE_CHOICE) first (Choices))->Rating) |
#define DANGEROUS_AMBIGS "tessdata/DangAmbigs" |
File containing letter patterns that give tess fits
Definition at line 51 of file stopper.cpp.
Referenced by InitStopperVars().
#define MAX_AMBIG_SIZE 3 |
#define MAX_WERD_SIZE 100 |
int AcceptableChoice | ( | CHOICES_LIST | Choices, | |
A_CHOICE * | BestChoice, | |||
A_CHOICE * | RawChoice, | |||
DANGERR * | fixpt | |||
) |
Return TRUE if the results from this segmentation are good enough to stop; else FALSE.
Choices | Choices for current segmentation | |
BestChoice | Best choice for current segmentation | |
RawChoice | Best raw choice for current segmentation | |
fixpt | point to fix |
Definition at line 226 of file stopper.cpp.
References case_ok(), CertaintyPerChar, class_certainty, class_string, cprintf(), FALSE, DANGERR::index, LengthOfShortestAlphaRun(), NoDangerousAmbig(), NonDictCertainty, NULL, punctuation_ok(), StopperDebugLevel, TRUE, UniformCertainties(), valid_number(), and valid_word().
Referenced by chop_word_main(), evaluate_state(), and improve_by_chopping().
00229 { 00230 float CertaintyThreshold = NonDictCertainty; 00231 int WordSize; 00232 00233 if (fixpt != NULL) 00234 fixpt->index = -1; 00235 if ((BestChoice == NULL) || (class_string (BestChoice) == NULL)) 00236 return (FALSE); 00237 00238 if (StopperDebugLevel >= 1) 00239 cprintf ("\nStopper: %s (word=%c, case=%c, punct=%c)\n", 00240 class_string (BestChoice), 00241 (valid_word (class_string (BestChoice)) ? 'y' : 'n'), 00242 (case_ok (class_string (BestChoice)) ? 'y' : 'n'), 00243 ((punctuation_ok (class_string (BestChoice)) != 00244 -1) ? 'y' : 'n')); 00245 00246 if (valid_word (class_string (BestChoice)) && 00247 case_ok (class_string (BestChoice)) && 00248 punctuation_ok (class_string (BestChoice)) != -1) { 00249 WordSize = LengthOfShortestAlphaRun (class_string (BestChoice)); 00250 WordSize -= SmallWordSize; 00251 if (WordSize < 0) 00252 WordSize = 0; 00253 CertaintyThreshold += WordSize * CertaintyPerChar; 00254 } 00255 else if (stopper_numbers_on && valid_number (class_string (BestChoice))) { 00256 CertaintyThreshold += stopper_numbers_on * CertaintyPerChar; 00257 } 00258 00259 if (StopperDebugLevel >= 1) 00260 cprintf ("Stopper: Certainty = %4.1f, Threshold = %4.1f\n", 00261 class_certainty (BestChoice), CertaintyThreshold); 00262 00263 if (NoDangerousAmbig (class_string (BestChoice), fixpt) 00264 && class_certainty (BestChoice) > CertaintyThreshold && 00265 UniformCertainties (Choices, BestChoice)) 00266 return (TRUE); 00267 else 00268 return (FALSE); 00269 00270 } /* AcceptableChoice */
Return FALSE if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.
BestChoice | Best choice for current word | |
RawChoice | Best raw choice for current word |
Definition at line 292 of file stopper.cpp.
References BestChoices, case_ok(), CertaintyPerChar, class_certainty, class_string, cprintf(), CurrentWordAmbig(), FALSE, LengthOfShortestAlphaRun(), NIL, NonDictCertainty, NULL, punctuation_ok(), RejectOffset, rest, StopperDebugLevel, TRUE, and valid_word().
Referenced by tess_acceptable_word().
00292 { 00293 float CertaintyThreshold = NonDictCertainty - RejectOffset; 00294 int WordSize; 00295 00296 if (StopperDebugLevel >= 1) 00297 cprintf ("\nRejecter: %s (word=%c, case=%c, punct=%c, unambig=%c)\n", 00298 class_string (BestChoice), 00299 (valid_word (class_string (BestChoice)) ? 'y' : 'n'), 00300 (case_ok (class_string (BestChoice)) ? 'y' : 'n'), 00301 ((punctuation_ok (class_string (BestChoice)) != -1) ? 'y' : 'n'), 00302 ((rest (BestChoices) != NIL) ? 'n' : 'y')); 00303 00304 if ((BestChoice == NULL) || 00305 (class_string (BestChoice) == NULL) || CurrentWordAmbig ()) 00306 return (FALSE); 00307 00308 if (valid_word (class_string (BestChoice)) && 00309 case_ok (class_string (BestChoice)) && 00310 punctuation_ok (class_string (BestChoice)) != -1) { 00311 WordSize = LengthOfShortestAlphaRun (class_string (BestChoice)); 00312 WordSize -= SmallWordSize; 00313 if (WordSize < 0) 00314 WordSize = 0; 00315 CertaintyThreshold += WordSize * CertaintyPerChar; 00316 } 00317 00318 if (StopperDebugLevel >= 1) 00319 cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", 00320 class_certainty (BestChoice), CertaintyThreshold); 00321 00322 if (class_certainty (BestChoice) > CertaintyThreshold) { 00323 if (StopperDebugLevel >= 1) 00324 cprintf ("ACCEPTED\n"); 00325 return (TRUE); 00326 } 00327 else { 00328 if (StopperDebugLevel >= 1) 00329 cprintf ("REJECTED\n"); 00330 return (FALSE); 00331 } 00332 } /* AcceptableResult */
void AddNewChunk | ( | VIABLE_CHOICE | Choice, | |
int | Blob | |||
) |
Increments the chunk count of the character in Choice which corresponds to Blob.
Choice | choice to add a new chunk to | |
Blob | index of blob being split |
Definition at line 858 of file stopper.cpp.
References assert(), VIABLE_CHOICE_STRUCT::Blob, cprintf(), FALSE, VIABLE_CHOICE_STRUCT::Length, mem_tidy(), and CHAR_CHOICE::NumChunks.
Referenced by LogNewSplit().
00858 { 00859 int i, LastChunk; 00860 00861 for (i = 0, LastChunk = 0; i < Choice->Length; i++) { 00862 LastChunk += Choice->Blob[i].NumChunks; 00863 if (Blob < LastChunk) { 00864 (Choice->Blob[i].NumChunks)++; 00865 return; 00866 } 00867 } 00868 mem_tidy (1); 00869 cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n", 00870 Choice->Length, LastChunk, Blob); 00871 assert(FALSE); /* this should never get executed */ 00872 00873 } /* AddNewChunk */
int AlternativeChoicesWorseThan | ( | FLOAT32 | Threshold | ) |
Returns TRUE if there are no alternative choices for the current word OR if all alternatives have an adjust factor worse than Threshold.
Threshold | Minimum adjust factor for alternative choices |
Definition at line 347 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, BestChoices, FALSE, first, iterate, rest, and TRUE.
Referenced by AdaptableWord().
00347 { 00348 LIST Alternatives; 00349 VIABLE_CHOICE Choice; 00350 00351 Alternatives = rest (BestChoices); 00352 iterate(Alternatives) { 00353 Choice = (VIABLE_CHOICE) first (Alternatives); 00354 if (Choice->AdjustFactor <= Threshold) 00355 return (FALSE); 00356 } 00357 00358 return (TRUE); 00359 00360 } /* AlternativeChoicesWorseThan */
int AmbigsFound | ( | char * | Word, | |
char * | CurrentChar, | |||
const char * | Tail, | |||
LIST | Ambigs, | |||
DANGERR * | fixpt | |||
) |
For each ambiguity in Ambigs, see if the remainder of the test string matches the start of Tail.
Word | Word being tested for ambiguities | |
CurrentChar | Position in Word to put ambig replacement | |
Tail | End of word to place after ambiguity | |
Ambigs | List of ambiguities to test at this position | |
fixpt | point to fix |
Definition at line 898 of file stopper.cpp.
References DANGERR::bad_length, cprintf(), FALSE, first, DANGERR::good_length, iterate, NULL, StopperDebugLevel, TRUE, and valid_word().
Referenced by NoDangerousAmbig().
00902 { 00903 char *AmbigSpec; 00904 const char *UnmatchedTail; 00905 int Matches; 00906 int bad_length; 00907 00908 iterate(Ambigs) { 00909 AmbigSpec = (char *) first (Ambigs); 00910 bad_length = 1; 00911 UnmatchedTail = Tail; 00912 Matches = TRUE; 00913 00914 while (*AmbigSpec != ' ' && Matches) 00915 if (*AmbigSpec == *UnmatchedTail) { 00916 AmbigSpec++; 00917 UnmatchedTail++; 00918 bad_length++; 00919 } 00920 else 00921 Matches = FALSE; 00922 00923 if (Matches) { 00924 AmbigSpec++; /* skip over the space */ 00925 /* insert replacement string */ 00926 strcpy(CurrentChar, AmbigSpec); 00927 /* add tail */ 00928 strcat(Word, UnmatchedTail); 00929 if (valid_word (Word)) { 00930 if (StopperDebugLevel >= 1) 00931 cprintf ("Stopper: Possible ambiguous word = %s\n", Word); 00932 if (fixpt != NULL) { 00933 fixpt->good_length = strlen (AmbigSpec); 00934 fixpt->bad_length = bad_length; 00935 } 00936 return (TRUE); 00937 } 00938 } 00939 } 00940 return (FALSE); 00941 00942 } /* AmbigsFound */
int ChoiceSameAs | ( | A_CHOICE * | Choice, | |
VIABLE_CHOICE | ViableChoice | |||
) |
Compares the corresponding strings of Choice and ViableChoice and returns TRUE if they are the same, else FALSE.
Choice | Choice to compare to ViableChoice | |
ViableChoice | Viable choice to compare to Choice |
Definition at line 957 of file stopper.cpp.
References class_string, and StringSameAs().
Referenced by LogNewRawChoice(), and LogNewWordChoice().
00957 { 00958 return (StringSameAs (class_string (Choice), ViableChoice)); 00959 00960 } /* ChoiceSameAs */
int CmpChoiceRatings | ( | void * | arg1, | |
void * | arg2 | |||
) |
Return -1 if the rating of arg1 is less than the rating of arg2; otherwise return 1 (equal ratings also return 1).
arg1 | Choice to compare ratings for | |
arg2 | Choice to compare ratings for |
Definition at line 975 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::Rating.
Referenced by LogNewWordChoice().
00976 { 00977 float R1, R2; 00978 VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1; 00979 VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2; 00980 00981 R1 = Choice1->Rating; 00982 R2 = Choice2->Rating; 00983 00984 if (R1 < R2) 00985 return (-1); 00986 else 00987 return (1); 00988 00989 } /* CmpChoiceRatings */
FLOAT32 CurrentBestChoiceAdjustFactor | ( | ) |
Return the adjustment factor for the best choice for the current word.
none |
Definition at line 392 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, BestChoices, first, MAX_FLOAT32, and NIL.
Referenced by AdaptableWord().
00392 { 00393 VIABLE_CHOICE BestChoice; 00394 00395 if (BestChoices == NIL) 00396 return (MAX_FLOAT32); 00397 00398 BestChoice = (VIABLE_CHOICE) first (BestChoices); 00399 return (BestChoice->AdjustFactor); 00400 00401 } /* CurrentBestChoiceAdjustFactor */
int CurrentBestChoiceIs | ( | const char * | Word | ) |
Returns TRUE if Word is the same as the current best choice, FALSE otherwise.
Word | String to compare to current best choice |
Definition at line 374 of file stopper.cpp.
References BestChoices, first, NIL, and StringSameAs().
Referenced by AdaptableWord().
00374 { 00375 return (BestChoices != NIL && 00376 StringSameAs (Word, (VIABLE_CHOICE) first (BestChoices))); 00377 00378 } /* CurrentBestChoiceIs */
int CurrentWordAmbig | ( | ) |
Returns TRUE if there are multiple good choices for the current word and FALSE otherwise.
none |
Definition at line 415 of file stopper.cpp.
References BestChoices, NIL, and rest.
Referenced by AcceptableResult(), and add_document_word().
00415 { 00416 return (rest (BestChoices) != NIL); 00417 00418 } /* CurrentWordAmbig */
void DebugWordChoices | ( | ) |
Print the current choices for this word to stdout.
none |
Definition at line 431 of file stopper.cpp.
References BestChoices, BestRawChoice, cprintf(), first, iterate, PrintViableChoice(), StopperDebugLevel, StringSameAs(), and WordToDebug.
Referenced by cc_recog().
00431 { 00432 LIST Choices; 00433 int i; 00434 char LabelString[80]; 00435 00436 if (StopperDebugLevel >= 1 || 00437 WordToDebug && BestChoices && 00438 StringSameAs (WordToDebug, (VIABLE_CHOICE) first (BestChoices))) { 00439 if (BestRawChoice) 00440 PrintViableChoice (stdout, "\nBest Raw Choice: ", BestRawChoice); 00441 00442 i = 1; 00443 Choices = BestChoices; 00444 if (Choices) 00445 cprintf ("\nBest Cooked Choices:\n"); 00446 iterate(Choices) { 00447 sprintf (LabelString, "Cooked Choice #%d: ", i); 00448 PrintViableChoice (stdout, LabelString, 00449 (VIABLE_CHOICE) first (Choices)); 00450 i++; 00451 } 00452 } 00453 } /* DebugWordChoices */
void EndDangerousAmbigs | ( | ) |
Definition at line 801 of file stopper.cpp.
References AmbigFor, destroy_nodes(), Efree(), MAX_CLASS_ID, and NULL.
Referenced by EndAdaptiveClassifier().
00801 { 00802 if (AmbigFor != NULL) { 00803 for (int i = 0; i <= MAX_CLASS_ID; ++i) { 00804 destroy_nodes(AmbigFor[i], Efree); 00805 } 00806 Efree(AmbigFor); 00807 AmbigFor = NULL; 00808 } 00809 }
void ExpandChoice | ( | VIABLE_CHOICE | Choice, | |
EXPANDED_CHOICE * | ExpandedChoice | |||
) |
Expands Choice and places the results in ExpandedChoice.
Choice | Choice to be expanded | |
ExpandedChoice | Place to put resulting expanded choice |
Definition at line 1007 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Certainty, EXPANDED_CHOICE::Choice, EXPANDED_CHOICE::ChunkCertainty, EXPANDED_CHOICE::ChunkClass, CHAR_CHOICE::Class, VIABLE_CHOICE_STRUCT::Length, and CHAR_CHOICE::NumChunks.
Referenced by FilterWordChoices(), and FindClassifierErrors().
01007 { 01008 int i, j, Chunk; 01009 01010 ExpandedChoice->Choice = Choice; 01011 for (i = 0, Chunk = 0; i < Choice->Length; i++) 01012 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) { 01013 ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty; 01014 ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class; 01015 } 01016 } /* ExpandChoice */
AMBIG_TABLE * FillAmbigTable | ( | ) |
Allocates a new ambiguity table and fills it in from the file specified by DangerousAmbigs.
none |
Definition at line 1040 of file stopper.cpp.
References DangerousAmbigs, demodir, DoError(), Efopen(), Emalloc(), fscanf(), MAX_AMBIG_SIZE, MAX_CLASS_ID, NIL, and push_last().
Referenced by NoDangerousAmbig().
01040 { 01041 FILE *AmbigFile; 01042 AMBIG_TABLE *NewTable; 01043 int i; 01044 char TestString[256]; 01045 char ReplacementString[256]; 01046 char name[1024]; 01047 char *AmbigSpec; 01048 int AmbigSize; 01049 01050 strcpy(name, demodir); 01051 strcat(name, DangerousAmbigs); 01052 AmbigFile = Efopen (name, "r"); 01053 NewTable = (AMBIG_TABLE *) Emalloc (sizeof (LIST) * (MAX_CLASS_ID + 1)); 01054 01055 for (i = 0; i <= MAX_CLASS_ID; i++) 01056 NewTable[i] = NIL; 01057 01058 while (fscanf (AmbigFile, "%s", TestString) == 1 && 01059 fscanf (AmbigFile, "%s", ReplacementString) == 1) { 01060 if (strlen (TestString) > MAX_AMBIG_SIZE || 01061 strlen (ReplacementString) > MAX_AMBIG_SIZE) 01062 DoError (0, "Illegal ambiguity specification!"); 01063 01064 AmbigSize = strlen (TestString) + strlen (ReplacementString) + 1; 01065 AmbigSpec = (char *) Emalloc (sizeof (char) * AmbigSize); 01066 01067 strcpy (AmbigSpec, &(TestString[1])); 01068 strcat (AmbigSpec, " "); 01069 strcat(AmbigSpec, ReplacementString); 01070 NewTable[TestString[0]] = 01071 push_last (NewTable[TestString[0]], AmbigSpec); 01072 } 01073 fclose(AmbigFile); 01074 return (NewTable); 01075 01076 } /* FillAmbigTable */
void FilterWordChoices | ( | ) |
Removes from BestChoices all choices which are not within a reasonable range of the best choice.
none |
Definition at line 467 of file stopper.cpp.
References BestChoices, delete_d(), ExpandChoice(), first, FreeBadChoice(), NIL, rest, second, and set_rest.
Referenced by chop_word_main().
00467 { 00468 EXPANDED_CHOICE BestChoice; 00469 00470 if (BestChoices == NIL || second (BestChoices) == NIL) 00471 return; 00472 00473 /* compute certainties and class for each chunk in best choice */ 00474 ExpandChoice ((VIABLE_CHOICE_STRUCT *) first (BestChoices), &BestChoice); 00475 00476 set_rest (BestChoices, delete_d (rest (BestChoices), 00477 &BestChoice, FreeBadChoice)); 00478 00479 } /* FilterWordChoices */
void FindClassifierErrors | ( | FLOAT32 | MinRating, | |
FLOAT32 | MaxRating, | |||
FLOAT32 | RatingMargin, | |||
FLOAT32 | Thresholds[] | |||
) |
Compares the best choice for the current word to the best raw choice to determine which characters were classified incorrectly by the classifier; places a separate threshold into Thresholds for each character in the word.
MinRating | Limits how tight to make a template | |
MaxRating | Limits how loose to make a template | |
RatingMargin | Amount of margin to put in template | |
Thresholds[] | Place to put error thresholds |
This can then be used by the caller to try to create a new template for the desired class that will classify the character with a rating better than the threshold value. The match rating placed into Thresholds is never allowed to be below MinRating in order to prevent trying to make overly tight templates.
Definition at line 511 of file stopper.cpp.
References assert(), BestChoices, BestRawChoice, VIABLE_CHOICE_STRUCT::Blob, CertaintyScale, EXPANDED_CHOICE::ChunkCertainty, EXPANDED_CHOICE::ChunkClass, CHAR_CHOICE::Class, ExpandChoice(), first, VIABLE_CHOICE_STRUCT::Length, NIL, NULL, and CHAR_CHOICE::NumChunks.
Referenced by GetAdaptThresholds().
00513 { 00514 EXPANDED_CHOICE BestRaw; 00515 VIABLE_CHOICE Choice; 00516 int i, j, Chunk; 00517 FLOAT32 AvgRating; 00518 int NumErrorChunks; 00519 00520 assert (BestChoices != NIL); 00521 assert (BestRawChoice != NULL); 00522 00523 ExpandChoice(BestRawChoice, &BestRaw); 00524 Choice = (VIABLE_CHOICE) first (BestChoices); 00525 00526 for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) { 00527 AvgRating = 0.0; 00528 NumErrorChunks = 0; 00529 00530 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) 00531 if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) { 00532 AvgRating += BestRaw.ChunkCertainty[Chunk]; 00533 NumErrorChunks++; 00534 } 00535 00536 if (NumErrorChunks > 0) { 00537 AvgRating /= NumErrorChunks; 00538 *Thresholds = (AvgRating / -CertaintyScale) * (1.0 - RatingMargin); 00539 } 00540 else 00541 *Thresholds = MaxRating; 00542 00543 if (*Thresholds > MaxRating) 00544 *Thresholds = MaxRating; 00545 if (*Thresholds < MinRating) 00546 *Thresholds = MinRating; 00547 } 00548 } /* FindClassifierErrors */
int FreeBadChoice | ( | void * | item1, | |
void * | item2 | |||
) |
If the certainty of any chunk in Choice is not ambiguous with the corresponding chunk in the best choice, free Choice and return TRUE, else FALSE.
item1 | Choice to be tested | |
item2 | Expanded best choice (EXPANDED_CHOICE *) that item1 is compared against |
Definition at line 1094 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, AmbigThreshold, VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Certainty, EXPANDED_CHOICE::Choice, EXPANDED_CHOICE::ChunkCertainty, EXPANDED_CHOICE::ChunkClass, CHAR_CHOICE::Class, VIABLE_CHOICE_STRUCT::Length, memfree(), CHAR_CHOICE::NumChunks, and TRUE.
Referenced by FilterWordChoices().
01095 { //EXPANDED_CHOICE *BestChoice) 01096 int i, j, Chunk; 01097 FLOAT32 Threshold; 01098 VIABLE_CHOICE Choice; 01099 EXPANDED_CHOICE *BestChoice; 01100 01101 Choice = (VIABLE_CHOICE) item1; 01102 BestChoice = (EXPANDED_CHOICE *) item2; 01103 01104 Threshold = AmbigThreshold (BestChoice->Choice->AdjustFactor, 01105 Choice->AdjustFactor); 01106 01107 for (i = 0, Chunk = 0; i < Choice->Length; i++) 01108 for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) 01109 if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] && 01110 Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] < 01111 Threshold) { 01112 memfree(Choice); 01113 return (TRUE); 01114 } 01115 01116 return (FALSE); 01117 01118 } /* FreeBadChoice */
void InitChoiceAccum | ( | ) |
Initializes the data structures used to keep track the good word choices found for a word.
none |
Definition at line 589 of file stopper.cpp.
References BestChoices, BestRawChoice, CurrentSegmentation, destroy_nodes(), EnableChoiceAccum, MAX_NUM_CHUNKS, memfree(), NIL, and NULL.
Referenced by cc_recog(), and program_editdown().
00589 { 00590 BLOB_WIDTH *BlobWidth, *End; 00591 00592 if (BestRawChoice) 00593 memfree(BestRawChoice); 00594 00595 if (BestChoices) 00596 destroy_nodes(BestChoices, memfree); 00597 00598 BestRawChoice = NULL; 00599 BestChoices = NIL; 00600 EnableChoiceAccum(); 00601 00602 for (BlobWidth = CurrentSegmentation, 00603 End = CurrentSegmentation + MAX_NUM_CHUNKS; 00604 BlobWidth < End; *BlobWidth++ = 1); 00605 00606 } /* InitChoiceAccum */
void InitStopperVars | ( | ) |
Initializes the control variables used by the stopper.
none |
Definition at line 561 of file stopper.cpp.
References DANGEROUS_AMBIGS, DangerousAmbigs, dummy, string_variable, and WordToDebug.
Referenced by init_dj_debug().
00561 { 00562 VALUE dummy; 00563 00564 string_variable (DangerousAmbigs, "DangerousAmbigs", DANGEROUS_AMBIGS); 00565 string_variable (WordToDebug, "WordToDebug", ""); 00566 00567 MakeNonDictCertainty(); 00568 MakeRejectCertaintyOffset(); 00569 MakeSmallWordSize(); 00570 MakeCertaintyPerChar(); 00571 MakeCertaintyVariation(); 00572 MakeStopperDebugLevel(); 00573 MakeAmbigThresholdGain(); 00574 MakeAmbigThresholdOffset(); 00575 } /* InitStopperVars */
int LengthOfShortestAlphaRun | ( | register char * | Word | ) |
Return the length of the shortest alpha run in Word.
Word | Word to be tested |
Definition at line 1131 of file stopper.cpp.
References MAXINT.
Referenced by AcceptableChoice(), and AcceptableResult().
/**
 * Return the length of the shortest run of consecutive alphabetic
 * characters in Word.
 *
 * Characters are classified with isalpha(); each one is converted to
 * unsigned char first, because passing a negative plain-char value to
 * isalpha() is undefined behaviour.
 *
 * @param Word  NUL-terminated word to be tested (not modified)
 * @return length of the shortest alpha run, or 0 if Word contains no
 *         alphabetic character
 */
int LengthOfShortestAlphaRun(char *Word) {
  int Shortest = -1;             /* -1: no alpha run seen yet */

  while (*Word != '\0') {
    if (!isalpha ((unsigned char) *Word)) {
      Word++;                    /* skip separators between runs */
      continue;
    }

    /* measure the run that starts here */
    int Length = 0;
    while (isalpha ((unsigned char) *Word)) {
      Word++;
      Length++;
    }
    if (Shortest < 0 || Length < Shortest)
      Shortest = Length;
  }

  return (Shortest < 0) ? 0 : Shortest;
}                                /* LengthOfShortestAlphaRun */
Compares Choice to the best raw (non-dict) choice so far; if new choice is better, best raw choice is updated.
Choice | New raw choice for current word | |
AdjustFactor | Adjustment factor which was applied to choice | |
Certainties | Certainties for each char in new choice |
Definition at line 623 of file stopper.cpp.
References BestRawChoice, ChoiceSameAs(), class_probability, KeepWordChoices, memfree(), NewViableChoice(), VIABLE_CHOICE_STRUCT::Rating, and ReplaceDuplicateChoice().
Referenced by permute_top_choice().
00623 { 00624 if (!KeepWordChoices) 00625 return; 00626 00627 if (!BestRawChoice) 00628 BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties); 00629 else if (class_probability (Choice) < BestRawChoice->Rating) { 00630 if (ChoiceSameAs (Choice, BestRawChoice)) 00631 ReplaceDuplicateChoice(BestRawChoice, Choice, AdjustFactor, Certainties); 00632 else { 00633 memfree(BestRawChoice); 00634 BestRawChoice = NewViableChoice (Choice, AdjustFactor, Certainties); 00635 } 00636 } 00637 } /* LogNewRawChoice */
void LogNewSegmentation | ( | PIECES_STATE | BlobWidth | ) |
Updates the blob widths in CurrentSegmentation to be the same as provided in BlobWidth.
BlobWidth | number of chunks in each blob in segmentation |
Definition at line 650 of file stopper.cpp.
References CurrentSegmentation.
Referenced by evaluate_state().
00650 { 00651 BLOB_WIDTH *Segmentation; 00652 00653 for (Segmentation = CurrentSegmentation; *BlobWidth != 0; 00654 BlobWidth++, Segmentation++) 00655 *Segmentation = *BlobWidth; 00656 *Segmentation = 0; 00657 00658 } /* LogNewSegmentation */
void LogNewSplit | ( | int | Blob | ) |
Adds one chunk to the specified blob for each choice in BestChoices and for the BestRawChoice.
Blob | index of blob that was split |
Definition at line 674 of file stopper.cpp.
References AddNewChunk(), BestChoices, BestRawChoice, first, and iterate.
Referenced by improve_by_chopping().
00674 { 00675 LIST Choices; 00676 00677 if (BestRawChoice) { 00678 AddNewChunk(BestRawChoice, Blob); 00679 } 00680 00681 Choices = BestChoices; 00682 iterate(Choices) { 00683 AddNewChunk ((VIABLE_CHOICE) first (Choices), Blob); 00684 } 00685 00686 } /* LogNewSplit */
Adds Choice to BestChoices if the adjusted certainty for Choice is within a reasonable range of the best choice in BestChoices.
Choice | new choice for current word | |
AdjustFactor | adjustment factor which was applied to choice | |
Certainties | certainties for each char in new choice |
Definition at line 706 of file stopper.cpp.
References AmbigThreshold, BestCertainty, BestChoices, BestFactor, BestRating, ChoiceSameAs(), class_certainty, class_probability, CmpChoiceRatings(), count(), delete_d(), destroy_nodes(), Efree(), first, is_same_node(), iterate, KeepWordChoices, NewViableChoice(), NIL, nth_cell(), NULL, PrintViableChoice(), ReplaceDuplicateChoice(), rest, s_adjoin(), set_rest, and StopperDebugLevel.
Referenced by adjust_non_word(), adjust_number(), and adjust_word().
00707 { 00708 VIABLE_CHOICE NewChoice; 00709 LIST Choices; 00710 FLOAT32 Threshold; 00711 00712 if (!KeepWordChoices) 00713 return; 00714 00715 /* throw out obviously bad choices to save some work */ 00716 if (BestChoices != NIL) { 00717 Threshold = AmbigThreshold (BestFactor (BestChoices), AdjustFactor); 00718 if (Threshold > -AmbigThresholdOffset) 00719 Threshold = -AmbigThresholdOffset; 00720 if (class_certainty (Choice) - BestCertainty (BestChoices) < Threshold) 00721 return; 00722 } 00723 00724 /* see if a choice with the same text string has already been found */ 00725 NewChoice = NULL; 00726 Choices = BestChoices; 00727 iterate(Choices) { 00728 if (ChoiceSameAs (Choice, (VIABLE_CHOICE) first (Choices))) 00729 if (class_probability (Choice) < BestRating (Choices)) 00730 NewChoice = (VIABLE_CHOICE) first (Choices); 00731 else 00732 return; 00733 } 00734 00735 if (NewChoice) { 00736 ReplaceDuplicateChoice(NewChoice, Choice, AdjustFactor, Certainties); 00737 BestChoices = delete_d (BestChoices, NewChoice, is_same_node); 00738 } 00739 else { 00740 NewChoice = NewViableChoice (Choice, AdjustFactor, Certainties); 00741 } 00742 00743 BestChoices = s_adjoin (BestChoices, NewChoice, CmpChoiceRatings); 00744 if (StopperDebugLevel >= 2) 00745 PrintViableChoice (stdout, "New Word Choice: ", NewChoice); 00746 if (count (BestChoices) > tessedit_truncate_wordchoice_log) { 00747 Choices = 00748 (LIST) nth_cell (BestChoices, tessedit_truncate_wordchoice_log); 00749 destroy_nodes (rest (Choices), Efree); 00750 set_rest(Choices, NIL); 00751 } 00752 00753 } /* LogNewWordChoice */
VIABLE_CHOICE NewViableChoice | ( | A_CHOICE * | Choice, | |
FLOAT32 | AdjustFactor, | |||
float | Certainties[] | |||
) |
Allocate a new viable choice data structure, copy Choice, Certainties, and CurrentSegmentation into it, and return a pointer to it.
Choice | Choice to be converted to a viable choice | |
AdjustFactor | Factor used to adjust ratings for Choice | |
Certainties | Certainty for each character in Choice |
Definition at line 1167 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, assert(), VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Certainty, VIABLE_CHOICE_STRUCT::Certainty, CHAR_CHOICE::Class, class_certainty, class_probability, class_string, CurrentSegmentation, Emalloc(), VIABLE_CHOICE_STRUCT::Length, CHAR_CHOICE::NumChunks, and VIABLE_CHOICE_STRUCT::Rating.
Referenced by LogNewRawChoice(), and LogNewWordChoice().
01167 { 01168 VIABLE_CHOICE NewChoice; 01169 int Length; 01170 char *Word; 01171 CHAR_CHOICE *NewChar; 01172 BLOB_WIDTH *BlobWidth; 01173 01174 Length = strlen (class_string (Choice)); 01175 assert (Length <= MAX_NUM_CHUNKS && Length > 0); 01176 01177 NewChoice = (VIABLE_CHOICE) Emalloc (sizeof (VIABLE_CHOICE_STRUCT) + 01178 (Length - 1) * sizeof (CHAR_CHOICE)); 01179 01180 NewChoice->Rating = class_probability (Choice); 01181 NewChoice->Certainty = class_certainty (Choice); 01182 NewChoice->AdjustFactor = AdjustFactor; 01183 NewChoice->Length = Length; 01184 01185 for (Word = class_string (Choice), 01186 NewChar = &(NewChoice->Blob[0]), 01187 BlobWidth = CurrentSegmentation; 01188 *Word; Word++, NewChar++, Certainties++, BlobWidth++) { 01189 NewChar->Class = *Word; 01190 NewChar->NumChunks = *BlobWidth; 01191 NewChar->Certainty = *Certainties; 01192 } 01193 01194 return (NewChoice); 01195 01196 } /* NewViableChoice */
int NoDangerousAmbig | ( | const char * | Word, | |
DANGERR * | fixpt | |||
) |
Definition at line 776 of file stopper.cpp.
References AmbigFor, AmbigsFound(), FALSE, FillAmbigTable(), DANGERR::index, MAX_WERD_SIZE, NULL, and TRUE.
Referenced by AcceptableChoice(), and word_adaptable().
00776 { 00777 00778 char NewWord[MAX_WERD_SIZE]; 00779 char *NextNewChar; 00780 int bad_index = 0; 00781 00782 if (!AmbigFor) 00783 AmbigFor = FillAmbigTable (); 00784 00785 NextNewChar = NewWord; 00786 while (*Word) 00787 if (AmbigsFound (NewWord, NextNewChar, Word + 1, AmbigFor[*Word], fixpt)) { 00788 if (fixpt != NULL) 00789 fixpt->index = bad_index; 00790 return (FALSE); 00791 } 00792 else { 00793 *NextNewChar++ = *Word++; 00794 bad_index++; 00795 } 00796 00797 return (TRUE); 00798 00799 } /* NoDangerousAmbig */
void PrintViableChoice | ( | FILE * | File, | |
const char * | Label, | |||
VIABLE_CHOICE | Choice | |||
) |
Dumps a text representation of the specified Choice to File.
File | Open text file to print Choice to | |
Label | Text label to be printed with Choice | |
Choice | Choice to be printed |
Definition at line 1211 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Certainty, VIABLE_CHOICE_STRUCT::Certainty, CHAR_CHOICE::Class, VIABLE_CHOICE_STRUCT::Length, CHAR_CHOICE::NumChunks, and VIABLE_CHOICE_STRUCT::Rating.
Referenced by DebugWordChoices(), and LogNewWordChoice().
01211 { 01212 int i, j; 01213 01214 fprintf (File, "%s", Label); 01215 01216 fprintf (File, "(R=%5.1f, C=%4.1f, F=%4.2f) ", 01217 Choice->Rating, Choice->Certainty, Choice->AdjustFactor); 01218 01219 for (i = 0; i < Choice->Length; i++) 01220 fprintf (File, "%c", Choice->Blob[i].Class); 01221 fprintf (File, "\n"); 01222 01223 for (i = 0; i < Choice->Length; i++) { 01224 fprintf (File, " %c", Choice->Blob[i].Class); 01225 for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++) 01226 fprintf (File, " "); 01227 } 01228 fprintf (File, "\n"); 01229 01230 for (i = 0; i < Choice->Length; i++) { 01231 for (j = 0; j < Choice->Blob[i].NumChunks; j++) 01232 fprintf (File, "%3d", (int) (Choice->Blob[i].Certainty * -10.0)); 01233 } 01234 fprintf (File, "\n"); 01235 01236 } /* PrintViableChoice */
void ReplaceDuplicateChoice | ( | VIABLE_CHOICE | OldChoice, | |
A_CHOICE * | NewChoice, | |||
FLOAT32 | AdjustFactor, | |||
float | Certainties[] | |||
) |
Updates OldChoice with relevant information from the new choice whenever a better segmentation (or contextual interpretation) is found for a word which already exists.
OldChoice | Existing viable choice to be replaced | |
NewChoice | Choice to replace OldChoice with | |
AdjustFactor | Factor used to adjust ratings for OldChoice | |
Certainties | Certainty for each character in OldChoice |
Definition at line 1258 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::AdjustFactor, VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Certainty, VIABLE_CHOICE_STRUCT::Certainty, class_certainty, class_probability, class_string, CurrentSegmentation, CHAR_CHOICE::NumChunks, and VIABLE_CHOICE_STRUCT::Rating.
Referenced by LogNewRawChoice(), and LogNewWordChoice().
01260 { 01261 char *Word; 01262 CHAR_CHOICE *NewChar; 01263 BLOB_WIDTH *BlobWidth; 01264 01265 OldChoice->Rating = class_probability (NewChoice); 01266 OldChoice->Certainty = class_certainty (NewChoice); 01267 OldChoice->AdjustFactor = AdjustFactor; 01268 01269 for (Word = class_string (NewChoice), 01270 NewChar = &(OldChoice->Blob[0]), 01271 BlobWidth = CurrentSegmentation; 01272 *Word; Word++, NewChar++, Certainties++, BlobWidth++) { 01273 NewChar->NumChunks = *BlobWidth; 01274 NewChar->Certainty = *Certainties; 01275 } 01276 } /* ReplaceDuplicateChoice */
void SettupStopperPass1 | ( | ) |
Performs any setup of stopper variables that is needed in preparation for the first pass.
none |
Definition at line 822 of file stopper.cpp.
References RejectOffset.
Referenced by SettupPass1().
00822 { 00823 RejectOffset = 0.0; 00824 } /* SettupStopperPass1 */
void SettupStopperPass2 | ( | ) |
Performs any setup of stopper variables that is needed in preparation for the second pass.
none |
Definition at line 838 of file stopper.cpp.
References RejectCertaintyOffset, and RejectOffset.
Referenced by SettupPass2().
00838 { 00839 RejectOffset = RejectCertaintyOffset; 00840 } /* SettupStopperPass2 */
int StringSameAs | ( | const char * | String, | |
VIABLE_CHOICE | ViableChoice | |||
) |
Compares String to ViableChoice and returns TRUE if they are the same, FALSE otherwise.
String | String to compare to ViableChoice | |
ViableChoice | Viable choice to compare to String |
Definition at line 1291 of file stopper.cpp.
References VIABLE_CHOICE_STRUCT::Blob, CHAR_CHOICE::Class, FALSE, and TRUE.
Referenced by ChoiceSameAs(), CurrentBestChoiceIs(), and DebugWordChoices().
01291 { 01292 CHAR_CHOICE *Char; 01293 int i; 01294 01295 for (Char = &(ViableChoice->Blob[0]), i = 0; 01296 i < ViableChoice->Length; String++, Char++, i++) 01297 if (*String != Char->Class) 01298 return (FALSE); 01299 01300 if (*String == 0) 01301 return (TRUE); 01302 else 01303 return (FALSE); 01304 01305 } /* StringSameAs */
int UniformCertainties | ( | CHOICES_LIST | Choices, | |
A_CHOICE * | BestChoice | |||
) |
Returns TRUE if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation.
Choices | Choices for current segmentation | |
BestChoice | Best choice for current segmentation |
Definition at line 1327 of file stopper.cpp.
References array_count, array_index, best_certainty, class_certainty, cprintf(), FALSE, for_each_choice, MAX_FLOAT32, Mean(), NonDictCertainty, StopperDebugLevel, and TRUE.
Referenced by AcceptableChoice().
01327 { 01328 int i; 01329 CHOICES CharChoices; 01330 float Certainty; 01331 float WorstCertainty = MAX_FLOAT32; 01332 float CertaintyThreshold; 01333 FLOAT64 TotalCertainty; 01334 FLOAT64 TotalCertaintySquared; 01335 FLOAT64 Variance; 01336 FLOAT32 Mean, StdDev; 01337 int WordLength; 01338 01339 WordLength = array_count (Choices); 01340 if (WordLength < 3) 01341 return (TRUE); 01342 01343 TotalCertainty = TotalCertaintySquared = 0.0; 01344 for_each_choice(Choices, i) { 01345 CharChoices = (CHOICES) array_index (Choices, i); 01346 Certainty = best_certainty (CharChoices); 01347 TotalCertainty += Certainty; 01348 TotalCertaintySquared += Certainty * Certainty; 01349 if (Certainty < WorstCertainty) 01350 WorstCertainty = Certainty; 01351 } 01352 01353 /* subtract off worst certainty from statistics */ 01354 WordLength--; 01355 TotalCertainty -= WorstCertainty; 01356 TotalCertaintySquared -= WorstCertainty * WorstCertainty; 01357 01358 Mean = TotalCertainty / WordLength; 01359 Variance = ((WordLength * TotalCertaintySquared - 01360 TotalCertainty * TotalCertainty) / 01361 (WordLength * (WordLength - 1))); 01362 if (Variance < 0.0) 01363 Variance = 0.0; 01364 StdDev = sqrt (Variance); 01365 01366 CertaintyThreshold = Mean - CertaintyVariation * StdDev; 01367 if (CertaintyThreshold > NonDictCertainty) 01368 CertaintyThreshold = NonDictCertainty; 01369 01370 if (class_certainty (BestChoice) < CertaintyThreshold) { 01371 if (StopperDebugLevel >= 1) 01372 cprintf 01373 ("Stopper: Non-uniform certainty = %4.1f (m=%4.1f, s=%4.1f, t=%4.1f)\n", 01374 class_certainty (BestChoice), Mean, StdDev, CertaintyThreshold); 01375 return (FALSE); 01376 } 01377 else 01378 return (TRUE); 01379 01380 } /* UniformCertainties */
AMBIG_TABLE* AmbigFor = NULL [static] |
Checks each letter in word against a list of potentially ambiguous characters.
Word | Word to check for dangerous ambiguities | |
fixpt | Point to fix |
Definition at line 774 of file stopper.cpp.
Referenced by EndDangerousAmbigs(), and NoDangerousAmbig().
LIST BestChoices = NIL [static] |
Definition at line 166 of file stopper.cpp.
Referenced by AcceptableResult(), AlternativeChoicesWorseThan(), CurrentBestChoiceAdjustFactor(), CurrentBestChoiceIs(), CurrentWordAmbig(), DebugWordChoices(), FilterWordChoices(), FindClassifierErrors(), InitChoiceAccum(), LogNewSplit(), and LogNewWordChoice().
VIABLE_CHOICE BestRawChoice = NULL [static] |
structures to keep track of viable word choices
Definition at line 165 of file stopper.cpp.
Referenced by DebugWordChoices(), FindClassifierErrors(), InitChoiceAccum(), LogNewRawChoice(), and LogNewSplit().
float CertaintyScale |
(c) Copyright Hewlett-Packard Company, 1988. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Referenced by ConvertMatchesToChoices(), and FindClassifierErrors().
PIECES_STATE CurrentSegmentation [static] |
Definition at line 167 of file stopper.cpp.
Referenced by InitChoiceAccum(), LogNewSegmentation(), NewViableChoice(), and ReplaceDuplicateChoice().
const char* DangerousAmbigs = DANGEROUS_AMBIGS [static] |
Name of file containing potentially dangerous ambiguities
Definition at line 152 of file stopper.cpp.
Referenced by FillAmbigTable(), and InitStopperVars().
BOOL8 KeepWordChoices = TRUE |
flag used to disable accumulation of word choices during compound word permutation
Definition at line 159 of file stopper.cpp.
Referenced by LogNewRawChoice(), and LogNewWordChoice().
FLOAT32 RejectOffset = 0.0 [static] |
additional certainty padding allowed before a word is rejected
Definition at line 162 of file stopper.cpp.
Referenced by AcceptableResult(), SettupStopperPass1(), and SettupStopperPass2().
char* WordToDebug = NULL [static] |
Word for which stopper debug information should be printed to stdout
Definition at line 155 of file stopper.cpp.
Referenced by DebugWordChoices(), and InitStopperVars().