00001
00026 #include "oldlist.h"
00027 #include "efio.h"
00028 #include "emalloc.h"
00029 #include "featdefs.h"
00030 #include "getopt.h"
00031 #include "ocrfeatures.h"
00032 #include "general.h"
00033 #include "clusttool.h"
00034 #include "cluster.h"
00035 #include "name2char.h"
00036 #include <string.h>
00037 #include <stdio.h>
00038 #include <math.h>
00039
/* Size, in bytes, of the fixed buffers used for names and filenames. */
#define MAXNAMESIZE             80

/* Initial value of the MaxNumSamples setting (see the -N option). */
#define MAX_NUM_SAMPLES         10000

/* Short name of the feature type this program reads, clusters and writes. */
#define PROGRAM_FEATURE_TYPE    "cn"

/* Half-width of the uniform random jitter added to every feature
   parameter while training samples are read. */
#define MINSD                   (1.0f / 64.0f)

/* Global with external linkage; not referenced anywhere in this file. */
int row_number;
00049
00058 typedef struct
00059 {
00060 char *Label;
00061 LIST List;
00062 }
00063 LABELEDLISTNODE, *LABELEDLIST;
00064
/* Round x to the nearest multiple of frag.
   Both arguments and the whole expansion are parenthesized so that
   expression arguments (e.g. round(a + b, step)) are evaluated as a unit.
   Note: frag is evaluated twice, so it must not have side effects.
   This macro intentionally shadows the one-argument round() of <math.h>. */
#define round(x,frag) (floor((x)/(frag)+.5)*(frag))
00069
00070
00071
00072
00073 int main (
00074 int argc,
00075 char **argv);
00076
00077
00078
00079
00080 void ParseArguments(
00081 int argc,
00082 char **argv);
00083
00084 char *GetNextFilename ();
00085
00086 void ReadTrainingSamples (
00087 FILE *File,
00088 LIST* TrainingSamples);
00089
00090 LABELEDLIST FindList (
00091 LIST List,
00092 char *Label);
00093
00094 LABELEDLIST NewLabeledList (
00095 char *Label);
00096
00097 void WriteTrainingSamples (
00098 char *Directory,
00099 LIST CharList);
00100
00101 void WriteNormProtos (
00102 char *Directory,
00103 LIST LabeledProtoList,
00104 CLUSTERER *Clusterer);
00105
00106 void FreeTrainingSamples (
00107 LIST CharList);
00108
00109 void FreeNormProtoList (
00110 LIST CharList);
00111
00112 void FreeLabeledList (
00113 LABELEDLIST LabeledList);
00114
00115 CLUSTERER *SetUpForClustering(
00116 LABELEDLIST CharSample);
00117
00118
00119
00120
00121
00122 void AddToNormProtosList(
00123 LIST* NormProtoList,
00124 LIST ProtoList,
00125 char* CharName);
00126
00127 void WriteProtos(
00128 FILE *File,
00129 UINT16 N,
00130 LIST ProtoList,
00131 BOOL8 WriteSigProtos,
00132 BOOL8 WriteInsigProtos);
00133
00134 int NumberOfProtos(
00135 LIST ProtoList,
00136 BOOL8 CountSigProtos,
00137 BOOL8 CountInsigProtos);
00138
00139
00140
00141
00142 static char FontName[MAXNAMESIZE];
00143
00144 static char *Directory = NULL;
00145 static int MaxNumSamples = MAX_NUM_SAMPLES;
00146 static int Argc;
00147 static char **Argv;
00148
00149
00150 static BOOL8 ShowAllSamples = FALSE;
00151 static BOOL8 ShowSignificantProtos = TRUE;
00152 static BOOL8 ShowInsignificantProtos = FALSE;
00153
00161 static CLUSTERCONFIG Config =
00162 {
00163 elliptical, 0.025, 0.05, 0.8, 1e-3
00164 };
00165
00166 static FLOAT32 RoundingAccuracy = 0.0;
00167
00168
00169
00170
00171
00225 int main (
00226 int argc,
00227 char **argv)
00228 {
00229 char *PageName;
00230 FILE *TrainingPage;
00231 LIST CharList = NIL;
00232 CLUSTERER *Clusterer = NULL;
00233 LIST ProtoList = NIL;
00234 LIST NormProtoList = NIL;
00235 LIST pCharList;
00236 LABELEDLIST CharSample;
00237
00238 ParseArguments (argc, argv);
00239 while ((PageName = GetNextFilename()) != NULL)
00240 {
00241 printf ("\nReading %s ...", PageName);
00242 TrainingPage = Efopen (PageName, "r");
00243 ReadTrainingSamples (TrainingPage, &CharList);
00244 fclose (TrainingPage);
00245
00246 }
00247 pCharList = CharList;
00248 iterate(pCharList)
00249 {
00250
00251 CharSample = (LABELEDLIST) first (pCharList);
00252 printf ("\nClustering %s ...", CharSample->Label);
00253 Clusterer = SetUpForClustering(CharSample);
00254 ProtoList = ClusterSamples(Clusterer, &Config);
00255 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
00256 }
00257 FreeTrainingSamples (CharList);
00258 WriteNormProtos (Directory, NormProtoList, Clusterer);
00259 FreeClusterer(Clusterer);
00260 FreeProtoList(&ProtoList);
00261 FreeNormProtoList(NormProtoList);
00262 printf ("\n");
00263 return 0;
00264 }
00265
00266
00267
00268
00269
00270
00271
00306 void ParseArguments(
00307 int argc,
00308 char **argv)
00309 {
00310 int Option;
00311 int ParametersRead;
00312 BOOL8 Error;
00313 extern char *optarg;
00314
00315 Error = FALSE;
00316 Argc = argc;
00317 Argv = argv;
00318 while (( Option = getopt( argc, argv, "R:N:D:C:I:M:B:S:d:n:p" )) != EOF )
00319 {
00320 switch ( Option )
00321 {
00322 case 'n':
00323 sscanf(optarg,"%d", &ParametersRead);
00324 ShowInsignificantProtos = ParametersRead;
00325 break;
00326 case 'p':
00327 sscanf(optarg,"%d", &ParametersRead);
00328 ShowSignificantProtos = ParametersRead;
00329 break;
00330 case 'd':
00331 ShowAllSamples = FALSE;
00332 break;
00333 case 'C':
00334 ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
00335 if ( ParametersRead != 1 ) Error = TRUE;
00336 else if ( Config.Confidence > 1 ) Config.Confidence = 1;
00337 else if ( Config.Confidence < 0 ) Config.Confidence = 0;
00338 break;
00339 case 'I':
00340 ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
00341 if ( ParametersRead != 1 ) Error = TRUE;
00342 else if ( Config.Independence > 1 ) Config.Independence = 1;
00343 else if ( Config.Independence < 0 ) Config.Independence = 0;
00344 break;
00345 case 'M':
00346 ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
00347 if ( ParametersRead != 1 ) Error = TRUE;
00348 else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
00349 else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
00350 break;
00351 case 'B':
00352 ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
00353 if ( ParametersRead != 1 ) Error = TRUE;
00354 else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
00355 else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
00356 break;
00357 case 'R':
00358 ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
00359 if ( ParametersRead != 1 ) Error = TRUE;
00360 else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
00361 else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
00362 break;
00363 case 'S':
00364 switch ( optarg[0] )
00365 {
00366 case 's': Config.ProtoStyle = spherical; break;
00367 case 'e': Config.ProtoStyle = elliptical; break;
00368 case 'm': Config.ProtoStyle = mixed; break;
00369 case 'a': Config.ProtoStyle = automatic; break;
00370 default: Error = TRUE;
00371 }
00372 break;
00373 case 'D':
00374 Directory = optarg;
00375 break;
00376 case 'N':
00377 if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
00378 MaxNumSamples <= 0)
00379 Error = TRUE;
00380 break;
00381 case '?':
00382 Error = TRUE;
00383 break;
00384 }
00385 if ( Error )
00386 {
00387 fprintf (stderr, "usage: %s [-D] [-P] [-N]\n", argv[0] );
00388 fprintf (stderr, "\t[-S ProtoStyle]\n");
00389 fprintf (stderr, "\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n" );
00390 fprintf (stderr, "\t[-d directory] [-n MaxNumSamples] [ TrainingPage ... ]\n");
00391 exit (2);
00392 }
00393 }
00394 }
00395
00410 char *GetNextFilename ()
00411 {
00412 if (optind < Argc)
00413 return (Argv [optind++]);
00414 else
00415 return (NULL);
00416
00417 }
00418
00419
00420
00434 void ReadTrainingSamples (
00435 FILE *File,
00436 LIST* TrainingSamples)
00437 {
00438 char CharName[MAXNAMESIZE];
00439 LABELEDLIST CharSample;
00440 FEATURE_SET FeatureSamples;
00441 CHAR_DESC CharDesc;
00442 int Type, i;
00443
00444 while (fscanf (File, "%s %s", FontName, CharName) == 2) {
00445 CharSample = FindList (*TrainingSamples, CharName);
00446 if (CharSample == NULL) {
00447 CharSample = NewLabeledList (CharName);
00448 *TrainingSamples = push (*TrainingSamples, CharSample);
00449 }
00450 CharDesc = ReadCharDescription (File);
00451 Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
00452 FeatureSamples = FeaturesOfType(CharDesc, Type);
00453
00454 for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
00455 FEATURE f = FeatureSamples->Features[feature];
00456 for (int dim =0; dim < f->Type->NumParams; ++dim)
00457 f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
00458 }
00459 CharSample->List = push (CharSample->List, FeatureSamples);
00460 for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
00461 if (Type != i)
00462 FreeFeatureSet (FeaturesOfType (CharDesc, i));
00463 free (CharDesc);
00464 }
00465 }
00466
00467
00468
00479 LABELEDLIST FindList (
00480 LIST List,
00481 char *Label)
00482 {
00483 LABELEDLIST LabeledList;
00484
00485 iterate (List)
00486 {
00487 LabeledList = (LABELEDLIST) first (List);
00488 if (strcmp (LabeledList->Label, Label) == 0)
00489 return (LabeledList);
00490 }
00491 return (NULL);
00492
00493 }
00494
00495
00496
00506 LABELEDLIST NewLabeledList (
00507 char *Label)
00508 {
00509 LABELEDLIST LabeledList;
00510
00511 LabeledList = (LABELEDLIST) (char*)Emalloc (sizeof (LABELEDLISTNODE));
00512 LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
00513 strcpy (LabeledList->Label, Label);
00514 LabeledList->List = NIL;
00515 return (LabeledList);
00516
00517 }
00518
00519
00520
00534 void WriteTrainingSamples (
00535 char *Directory,
00536 LIST CharList)
00537 {
00538 LABELEDLIST CharSample;
00539 FEATURE_SET FeatureSet;
00540 LIST FeatureList;
00541 FILE *File;
00542 char Filename[MAXNAMESIZE];
00543 int NumSamples;
00544
00545 iterate (CharList)
00546 {
00547 CharSample = (LABELEDLIST) first (CharList);
00548
00549
00550 strcpy (Filename, "");
00551 if (Directory != NULL)
00552 {
00553 strcat (Filename, Directory);
00554 strcat (Filename, "/");
00555 }
00556 strcat (Filename, "Merged");
00557 strcat (Filename, "/");
00558 strcat (Filename, CharSample->Label);
00559 strcat (Filename, ".");
00560 strcat (Filename, PROGRAM_FEATURE_TYPE);
00561 printf ("\nWriting %s ...", Filename);
00562
00563
00564
00565 File = fopen (Filename, "r");
00566 if (File == NULL)
00567 {
00568 File = Efopen (Filename, "w");
00569 WriteOldParamDesc
00570 (File, DefinitionOf (ShortNameToFeatureType (PROGRAM_FEATURE_TYPE)));
00571 }
00572 else
00573 {
00574 fclose (File);
00575 File = Efopen (Filename, "a");
00576 }
00577
00578
00579 FeatureList = CharSample->List;
00580 NumSamples = 0;
00581 iterate (FeatureList)
00582 {
00583
00584
00585 FeatureSet = (FEATURE_SET) first (FeatureList);
00586 WriteFeatureSet (File, FeatureSet);
00587 NumSamples++;
00588 }
00589 fclose (File);
00590 }
00591 }
00592
00593
00594
00595
00613 void WriteNormProtos (
00614 char *Directory,
00615 LIST LabeledProtoList,
00616 CLUSTERER *Clusterer)
00617 {
00618 FILE *File;
00619 char Filename[MAXNAMESIZE];
00620 LABELEDLIST LabeledProto;
00621 int N;
00622 char Label;
00623
00624 strcpy (Filename, "");
00625 if (Directory != NULL)
00626 {
00627 strcat (Filename, Directory);
00628 strcat (Filename, "/");
00629 }
00630 strcat (Filename, "normproto");
00631 printf ("\nWriting %s ...", Filename);
00632 File = Efopen (Filename, "w");
00633 fprintf(File,"%0d\n",Clusterer->SampleSize);
00634 WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
00635
00636 iterate(LabeledProtoList)
00637 {
00638 LabeledProto = (LABELEDLIST) first (LabeledProtoList);
00639 N = NumberOfProtos(LabeledProto->List,
00640 ShowSignificantProtos, ShowInsignificantProtos);
00641 Label = NameToChar(LabeledProto->Label);
00642 fprintf(File, "\n%c %d\n", Label, N);
00643 WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
00644 ShowSignificantProtos, ShowInsignificantProtos);
00645 }
00646 fclose (File);
00647
00648 }
00649
00650
00651
00662 void FreeTrainingSamples (
00663 LIST CharList)
00664 {
00665 LABELEDLIST CharSample;
00666 FEATURE_SET FeatureSet;
00667 LIST FeatureList;
00668
00669
00670 printf ("\nFreeTrainingSamples...");
00671 iterate (CharList)
00672 {
00673 CharSample = (LABELEDLIST) first (CharList);
00674 FeatureList = CharSample->List;
00675 iterate (FeatureList)
00676 {
00677 FeatureSet = (FEATURE_SET) first (FeatureList);
00678 FreeFeatureSet (FeatureSet);
00679 }
00680 FreeLabeledList (CharSample);
00681 }
00682 destroy (CharList);
00683
00684 }
00685
00686
00696 void FreeNormProtoList (
00697 LIST CharList)
00698
00699 {
00700 LABELEDLIST CharSample;
00701
00702 iterate (CharList)
00703 {
00704 CharSample = (LABELEDLIST) first (CharList);
00705 FreeLabeledList (CharSample);
00706 }
00707 destroy (CharList);
00708
00709 }
00710
00711
00712
00723 void FreeLabeledList (
00724 LABELEDLIST LabeledList)
00725 {
00726 destroy (LabeledList->List);
00727 free (LabeledList->Label);
00728 free (LabeledList);
00729
00730 }
00731
00732
00733
00744 CLUSTERER *SetUpForClustering(
00745 LABELEDLIST CharSample)
00746 {
00747 UINT16 N;
00748 int i, j;
00749 FLOAT32 *Sample = NULL;
00750 CLUSTERER *Clusterer;
00751 INT32 CharID;
00752 LIST FeatureList = NULL;
00753 FEATURE_SET FeatureSet = NULL;
00754 FEATURE_DESC FeatureDesc = NULL;
00755
00756
00757 FeatureDesc = DefinitionOf(ShortNameToFeatureType(PROGRAM_FEATURE_TYPE));
00758 N = FeatureDesc->NumParams;
00759
00760 Clusterer = MakeClusterer(N,FeatureDesc->ParamDesc);
00761
00762
00763 FeatureList = CharSample->List;
00764 CharID = 0;
00765 iterate(FeatureList)
00766 {
00767 FeatureSet = (FEATURE_SET) first (FeatureList);
00768 for (i=0; i < FeatureSet->MaxNumFeatures; i++)
00769 {
00770 if (Sample == NULL)
00771 Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00772 for (j=0; j < N; j++)
00773 if (RoundingAccuracy != 0.0)
00774 Sample[j] = round(FeatureSet->Features[i]->Params[j], RoundingAccuracy);
00775 else
00776 Sample[j] = FeatureSet->Features[i]->Params[j];
00777 MakeSample (Clusterer, Sample, CharID);
00778 }
00779 CharID++;
00780 }
00781 if ( Sample != NULL ) free( Sample );
00782 return( Clusterer );
00783
00784 }
00785
00786
00794 void AddToNormProtosList(
00795 LIST* NormProtoList,
00796 LIST ProtoList,
00797 char* CharName)
00798 {
00799 PROTOTYPE* Proto;
00800 LABELEDLIST LabeledProtoList;
00801
00802 LabeledProtoList = NewLabeledList(CharName);
00803 iterate(ProtoList)
00804 {
00805 Proto = (PROTOTYPE *) first (ProtoList);
00806 LabeledProtoList->List = push(LabeledProtoList->List, Proto);
00807 }
00808 *NormProtoList = push(*NormProtoList, LabeledProtoList);
00809 }
00810
00811
00821 void WriteProtos(
00822 FILE *File,
00823 UINT16 N,
00824 LIST ProtoList,
00825 BOOL8 WriteSigProtos,
00826 BOOL8 WriteInsigProtos)
00827 {
00828 PROTOTYPE *Proto;
00829
00830
00831 iterate(ProtoList)
00832 {
00833 Proto = (PROTOTYPE *) first ( ProtoList );
00834 if (( Proto->Significant && WriteSigProtos )
00835 || ( ! Proto->Significant && WriteInsigProtos ) )
00836 WritePrototype( File, N, Proto );
00837 }
00838 }
00839
00840
00849 int NumberOfProtos(
00850 LIST ProtoList,
00851 BOOL8 CountSigProtos,
00852 BOOL8 CountInsigProtos)
00853 {
00854 int N = 0;
00855 PROTOTYPE *Proto;
00856
00857 iterate(ProtoList)
00858 {
00859 Proto = (PROTOTYPE *) first ( ProtoList );
00860 if (( Proto->Significant && CountSigProtos )
00861 || ( ! Proto->Significant && CountInsigProtos ) )
00862 N++;
00863 }
00864 return(N);
00865 }
00866