training/cnTraining.cpp

Go to the documentation of this file.
00001 
00026 #include "oldlist.h"
00027 #include "efio.h"
00028 #include "emalloc.h"
00029 #include "featdefs.h"
00030 #include "getopt.h"
00031 #include "ocrfeatures.h"
00032 #include "general.h"
00033 #include "clusttool.h"
00034 #include "cluster.h"
00035 #include "name2char.h"
00036 #include <string.h>
00037 #include <stdio.h>
00038 #include <math.h>
00039 
/* Size, in bytes, of the fixed name and path buffers used in this file
   (FontName, CharName, Filename). */
#define MAXNAMESIZE 80

/* Initial value of MaxNumSamples; the -N option replaces it. */
#define MAX_NUM_SAMPLES 10000

/* Short name of the feature type this program reads and clusters. */
#define PROGRAM_FEATURE_TYPE "cn"

/* Bound of the uniform noise, drawn from [-MINSD, MINSD], that is added to
   every feature parameter when training samples are read. */
#define MINSD (1.0f / 64.0f)

/* cjn: fixes link problem */
int row_number;
00049 
00058 typedef struct
00059 {
00060   char      *Label;
00061   LIST      List;
00062 }
00063 LABELEDLISTNODE, *LABELEDLIST;
00064 
/* Rounds x to the nearest multiple of frag (halves round up).
   Both arguments are fully parenthesized so that expressions such as
   round(a + b, step) expand with the intended precedence.  Note that each
   argument is evaluated more than once (frag twice), so avoid side effects. */
#define round(x,frag) (floor((x) / (frag) + .5) * (frag))
00069 
00070 /*----------------------------------------------------------------------------
00071                Public Function Prototypes
00072 ----------------------------------------------------------------------------**/
00073 int main (
00074      int argc,
00075      char   **argv);
00076 
00077 /*----------------------------------------------------------------------------
00078                Private Function Prototypes
00079 ----------------------------------------------------------------------------**/
00080 void ParseArguments(
00081      int argc,
00082      char   **argv);
00083 
00084 char *GetNextFilename ();
00085 
00086 void ReadTrainingSamples (
00087      FILE   *File,
00088     LIST* TrainingSamples);
00089 
00090 LABELEDLIST FindList (
00091      LIST   List,
00092      char   *Label);
00093 
00094 LABELEDLIST NewLabeledList (
00095      char   *Label);
00096 
00097 void WriteTrainingSamples (
00098      char   *Directory,
00099      LIST   CharList);
00100 
00101 void WriteNormProtos (
00102      char   *Directory,
00103      LIST   LabeledProtoList,
00104     CLUSTERER *Clusterer);
00105 
00106 void FreeTrainingSamples (
00107      LIST   CharList);
00108 
00109 void FreeNormProtoList (
00110      LIST   CharList);
00111 
00112 void FreeLabeledList (
00113      LABELEDLIST  LabeledList);
00114 
00115 CLUSTERER *SetUpForClustering(
00116      LABELEDLIST  CharSample);
00117 /*
00118 PARAMDESC *ConvertToPARAMDESC(
00119    PARAM_DESC* Param_Desc,
00120    int N);
00121 */
00122 void AddToNormProtosList(
00123    LIST* NormProtoList,
00124    LIST ProtoList,
00125    char* CharName);
00126 
00127 void WriteProtos(
00128      FILE   *File,
00129      UINT16 N,
00130      LIST   ProtoList,
00131      BOOL8  WriteSigProtos,
00132      BOOL8  WriteInsigProtos);
00133 
00134 int NumberOfProtos(
00135    LIST ProtoList,
00136     BOOL8   CountSigProtos,
00137     BOOL8   CountInsigProtos);
00138 
00139 /*----------------------------------------------------------------------------
00140             Global Data Definitions and Declarations
00141 ----------------------------------------------------------------------------**/
00142 static char FontName[MAXNAMESIZE];
00143 /* globals used for parsing command line arguments */
00144 static char *Directory = NULL;
00145 static int  MaxNumSamples = MAX_NUM_SAMPLES;
00146 static int  Argc;
00147 static char **Argv;
00148 
00149 /* globals used to control what information is saved in the output file */
00150 static BOOL8      ShowAllSamples = FALSE;
00151 static BOOL8      ShowSignificantProtos = TRUE;
00152 static BOOL8      ShowInsignificantProtos = FALSE;
00153 
00161 static CLUSTERCONFIG Config =
00162 {
00163   elliptical, 0.025, 0.05, 0.8, 1e-3
00164 };
00165 
00166 static FLOAT32 RoundingAccuracy = 0.0;
00167 
00168 /*----------------------------------------------------------------------------
00169                   Public Code
00170 ----------------------------------------------------------------------------**/
00171 /*---------------------------------------------------------------------------*/
00225 int main (
00226      int argc,
00227      char   **argv)
00228 {
00229    char  *PageName;
00230    FILE  *TrainingPage;
00231    LIST  CharList = NIL;
00232    CLUSTERER   *Clusterer = NULL;
00233    LIST     ProtoList = NIL;
00234    LIST     NormProtoList = NIL;
00235    LIST pCharList;
00236    LABELEDLIST CharSample;
00237 
00238    ParseArguments (argc, argv);
00239    while ((PageName = GetNextFilename()) != NULL)
00240    {
00241       printf ("\nReading %s ...", PageName);
00242       TrainingPage = Efopen (PageName, "r");
00243       ReadTrainingSamples (TrainingPage, &CharList);
00244       fclose (TrainingPage);
00245       //WriteTrainingSamples (Directory, CharList);
00246    }
00247    pCharList = CharList;
00248    iterate(pCharList)
00249    {
00250       //Cluster
00251       CharSample = (LABELEDLIST) first (pCharList);
00252       printf ("\nClustering %s ...", CharSample->Label);
00253       Clusterer = SetUpForClustering(CharSample);
00254       ProtoList = ClusterSamples(Clusterer, &Config);
00255       AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
00256    }
00257    FreeTrainingSamples (CharList);
00258    WriteNormProtos (Directory, NormProtoList, Clusterer);
00259    FreeClusterer(Clusterer);
00260    FreeProtoList(&ProtoList);
00261    FreeNormProtoList(NormProtoList);
00262    printf ("\n");
00263   return 0;
00264 }// main
00265 
00266 
00267 /*----------------------------------------------------------------------------
00268                      Private Code
00269 ----------------------------------------------------------------------------**/
00270 /*---------------------------------------------------------------------------*/
00271 
00306 void ParseArguments(
00307    int   argc,
00308    char  **argv)
00309 {
00310    int      Option;
00311    int      ParametersRead;
00312    BOOL8    Error;
00313    extern char *optarg;
00314 
00315    Error = FALSE;
00316    Argc = argc;
00317    Argv = argv;
00318    while (( Option = getopt( argc, argv, "R:N:D:C:I:M:B:S:d:n:p" )) != EOF )
00319    {
00320       switch ( Option )
00321       {
00322       case 'n':
00323          sscanf(optarg,"%d", &ParametersRead);
00324          ShowInsignificantProtos = ParametersRead;
00325          break;
00326       case 'p':
00327          sscanf(optarg,"%d", &ParametersRead);
00328          ShowSignificantProtos = ParametersRead;
00329          break;
00330       case 'd':
00331          ShowAllSamples = FALSE;
00332          break;
00333       case 'C':
00334          ParametersRead = sscanf( optarg, "%lf", &(Config.Confidence) );
00335          if ( ParametersRead != 1 ) Error = TRUE;
00336          else if ( Config.Confidence > 1 ) Config.Confidence = 1;
00337          else if ( Config.Confidence < 0 ) Config.Confidence = 0;
00338          break;
00339       case 'I':
00340          ParametersRead = sscanf( optarg, "%f", &(Config.Independence) );
00341          if ( ParametersRead != 1 ) Error = TRUE;
00342          else if ( Config.Independence > 1 ) Config.Independence = 1;
00343          else if ( Config.Independence < 0 ) Config.Independence = 0;
00344          break;
00345       case 'M':
00346          ParametersRead = sscanf( optarg, "%f", &(Config.MinSamples) );
00347          if ( ParametersRead != 1 ) Error = TRUE;
00348          else if ( Config.MinSamples > 1 ) Config.MinSamples = 1;
00349          else if ( Config.MinSamples < 0 ) Config.MinSamples = 0;
00350          break;
00351       case 'B':
00352          ParametersRead = sscanf( optarg, "%f", &(Config.MaxIllegal) );
00353          if ( ParametersRead != 1 ) Error = TRUE;
00354          else if ( Config.MaxIllegal > 1 ) Config.MaxIllegal = 1;
00355          else if ( Config.MaxIllegal < 0 ) Config.MaxIllegal = 0;
00356          break;
00357       case 'R':
00358          ParametersRead = sscanf( optarg, "%f", &RoundingAccuracy );
00359          if ( ParametersRead != 1 ) Error = TRUE;
00360          else if ( RoundingAccuracy > 0.01 ) RoundingAccuracy = 0.01;
00361          else if ( RoundingAccuracy < 0.0 ) RoundingAccuracy = 0.0;
00362          break;
00363       case 'S':
00364          switch ( optarg[0] )
00365          {
00366          case 's': Config.ProtoStyle = spherical; break;
00367          case 'e': Config.ProtoStyle = elliptical; break;
00368          case 'm': Config.ProtoStyle = mixed; break;
00369          case 'a': Config.ProtoStyle = automatic; break;
00370          default: Error = TRUE;
00371          }
00372          break;
00373          case 'D':
00374             Directory = optarg;
00375             break;
00376          case 'N':
00377             if (sscanf (optarg, "%d", &MaxNumSamples) != 1 ||
00378                MaxNumSamples <= 0)
00379                Error = TRUE;
00380             break;
00381          case '?':
00382             Error = TRUE;
00383             break;
00384       }
00385       if ( Error )
00386       {
00387          fprintf (stderr, "usage: %s [-D] [-P] [-N]\n", argv[0] );
00388          fprintf (stderr, "\t[-S ProtoStyle]\n");
00389          fprintf (stderr, "\t[-M MinSamples] [-B MaxBad] [-I Independence] [-C Confidence]\n" );
00390          fprintf (stderr, "\t[-d directory] [-n MaxNumSamples] [ TrainingPage ... ]\n");
00391          exit (2);
00392       }
00393    }//while
00394 }// ParseArguments
00395 
00410 char *GetNextFilename ()
00411 {
00412    if (optind < Argc)
00413       return (Argv [optind++]);
00414    else
00415       return (NULL);
00416 
00417 }//GetNextFilename
00418 
00419 /*---------------------------------------------------------------------------*/
00420 
00434 void ReadTrainingSamples (
00435      FILE   *File,
00436     LIST* TrainingSamples)
00437 {
00438    char     CharName[MAXNAMESIZE];
00439    LABELEDLIST CharSample;
00440    FEATURE_SET FeatureSamples;
00441    CHAR_DESC   CharDesc;
00442    int         Type, i;
00443 
00444    while (fscanf (File, "%s %s", FontName, CharName) == 2) {
00445       CharSample = FindList (*TrainingSamples, CharName);
00446       if (CharSample == NULL) {
00447          CharSample = NewLabeledList (CharName);
00448          *TrainingSamples = push (*TrainingSamples, CharSample);
00449       }
00450       CharDesc = ReadCharDescription (File);
00451       Type = ShortNameToFeatureType(PROGRAM_FEATURE_TYPE);
00452       FeatureSamples = FeaturesOfType(CharDesc, Type);
00453 
00454       for (int feature = 0; feature < FeatureSamples->NumFeatures; ++feature) {
00455          FEATURE f = FeatureSamples->Features[feature];
00456          for (int dim =0; dim < f->Type->NumParams; ++dim)
00457             f->Params[dim] += UniformRandomNumber(-MINSD, MINSD);
00458       }//for feature
00459       CharSample->List = push (CharSample->List, FeatureSamples);
00460       for (i = 0; i < NumFeatureSetsIn (CharDesc); i++)
00461          if (Type != i)
00462             FreeFeatureSet (FeaturesOfType (CharDesc, i));
00463       free (CharDesc);
00464    }//while
00465 }// ReadTrainingSamples
00466 
00467 /*---------------------------------------------------------------------------*/
00468 
00479 LABELEDLIST FindList (
00480      LIST   List,
00481      char   *Label)
00482 {
00483    LABELEDLIST LabeledList;
00484 
00485    iterate (List)
00486     {
00487       LabeledList = (LABELEDLIST) first (List);
00488       if (strcmp (LabeledList->Label, Label) == 0)
00489          return (LabeledList);
00490     }
00491    return (NULL);
00492 
00493 }//FindList
00494 
00495 /*---------------------------------------------------------------------------*/
00496 
00506 LABELEDLIST NewLabeledList (
00507      char   *Label)
00508 {
00509    LABELEDLIST LabeledList;
00510 
00511    LabeledList = (LABELEDLIST) (char*)Emalloc (sizeof (LABELEDLISTNODE));
00512    LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
00513    strcpy (LabeledList->Label, Label);
00514    LabeledList->List = NIL;
00515    return (LabeledList);
00516 
00517 }//NewLabeledList
00518 
00519 /*---------------------------------------------------------------------------*/
00520 
00534 void WriteTrainingSamples (
00535      char   *Directory,
00536      LIST   CharList)
00537 {
00538    LABELEDLIST CharSample;
00539    FEATURE_SET FeatureSet;
00540    LIST     FeatureList;
00541    FILE     *File;
00542    char     Filename[MAXNAMESIZE];
00543    int      NumSamples;
00544 
00545    iterate (CharList)      // iterate thru all of the fonts
00546    {
00547       CharSample = (LABELEDLIST) first (CharList);
00548 
00549       // construct the full pathname for the current samples file
00550       strcpy (Filename, "");
00551       if (Directory != NULL)
00552       {
00553          strcat (Filename, Directory);
00554          strcat (Filename, "/");
00555       }
00556       strcat (Filename, "Merged");
00557       strcat (Filename, "/");
00558       strcat (Filename, CharSample->Label);
00559       strcat (Filename, ".");
00560       strcat (Filename, PROGRAM_FEATURE_TYPE);
00561       printf ("\nWriting %s ...", Filename);
00562 
00563       /* if file does not exist, create a new one with an appropriate
00564       header; otherwise append samples to the existing file */
00565       File = fopen (Filename, "r");
00566       if (File == NULL)
00567       {
00568          File = Efopen (Filename, "w");
00569          WriteOldParamDesc
00570             (File, DefinitionOf (ShortNameToFeatureType (PROGRAM_FEATURE_TYPE)));
00571       }
00572       else
00573       {
00574          fclose (File);
00575          File = Efopen (Filename, "a");
00576       }
00577 
00578       // append samples onto the file
00579       FeatureList = CharSample->List;
00580       NumSamples = 0;
00581       iterate (FeatureList)
00582       {
00583          //if (NumSamples >= MaxNumSamples) break;
00584 
00585          FeatureSet = (FEATURE_SET) first (FeatureList);
00586          WriteFeatureSet (File, FeatureSet);
00587          NumSamples++;
00588       }
00589       fclose (File);
00590    }
00591 }// WriteTrainingSamples
00592 
00593 
00594 /*----------------------------------------------------------------------------*/
00595 
00613 void WriteNormProtos (
00614      char   *Directory,
00615      LIST   LabeledProtoList,
00616     CLUSTERER *Clusterer)
00617 {
00618    FILE     *File;
00619    char     Filename[MAXNAMESIZE];
00620    LABELEDLIST LabeledProto;
00621    int N;
00622    char Label;
00623 
00624    strcpy (Filename, "");
00625    if (Directory != NULL)
00626    {
00627       strcat (Filename, Directory);
00628       strcat (Filename, "/");
00629    }
00630    strcat (Filename, "normproto");
00631    printf ("\nWriting %s ...", Filename);
00632    File = Efopen (Filename, "w");
00633    fprintf(File,"%0d\n",Clusterer->SampleSize);
00634    WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
00635 
00636    iterate(LabeledProtoList)
00637    {
00638       LabeledProto = (LABELEDLIST) first (LabeledProtoList);
00639       N = NumberOfProtos(LabeledProto->List,
00640          ShowSignificantProtos, ShowInsignificantProtos);
00641       Label = NameToChar(LabeledProto->Label);
00642       fprintf(File, "\n%c %d\n", Label, N);
00643       WriteProtos(File, Clusterer->SampleSize, LabeledProto->List,
00644          ShowSignificantProtos, ShowInsignificantProtos);
00645    }
00646    fclose (File);
00647 
00648 }// WriteNormProtos
00649 
00650 /*---------------------------------------------------------------------------*/
00651 
00662 void FreeTrainingSamples (
00663      LIST   CharList)
00664 {
00665    LABELEDLIST CharSample;
00666    FEATURE_SET FeatureSet;
00667    LIST     FeatureList;
00668 
00669 
00670    printf ("\nFreeTrainingSamples...");
00671    iterate (CharList)      /* iterate thru all of the fonts */
00672    {
00673       CharSample = (LABELEDLIST) first (CharList);
00674       FeatureList = CharSample->List;
00675       iterate (FeatureList)   /* iterate thru all of the classes */
00676       {
00677          FeatureSet = (FEATURE_SET) first (FeatureList);
00678          FreeFeatureSet (FeatureSet);
00679       }
00680       FreeLabeledList (CharSample);
00681    }
00682    destroy (CharList);
00683 
00684 }// FreeTrainingSamples
00685 
00686 /*-------------------------------------------------------------------------*/
00696 void FreeNormProtoList (
00697      LIST   CharList)
00698 
00699 {
00700    LABELEDLIST CharSample;
00701 
00702    iterate (CharList)      /* iterate thru all of the fonts */
00703    {
00704       CharSample = (LABELEDLIST) first (CharList);
00705       FreeLabeledList (CharSample);
00706    }
00707    destroy (CharList);
00708 
00709 }// FreeNormProtoList
00710 
00711 /*---------------------------------------------------------------------------*/
00712 
00723 void FreeLabeledList (
00724      LABELEDLIST  LabeledList)
00725 {
00726    destroy (LabeledList->List);
00727    free (LabeledList->Label);
00728    free (LabeledList);
00729 
00730 }// FreeLabeledList
00731 
00732 /*---------------------------------------------------------------------------*/
00733 
00744 CLUSTERER *SetUpForClustering(
00745      LABELEDLIST  CharSample)
00746 {
00747    UINT16   N;
00748    int      i, j;
00749    FLOAT32  *Sample = NULL;
00750    CLUSTERER   *Clusterer;
00751    INT32    CharID;
00752    LIST FeatureList = NULL;
00753    FEATURE_SET FeatureSet = NULL;
00754    FEATURE_DESC FeatureDesc = NULL;
00755 // PARAM_DESC* ParamDesc;
00756 
00757    FeatureDesc = DefinitionOf(ShortNameToFeatureType(PROGRAM_FEATURE_TYPE));
00758    N = FeatureDesc->NumParams;
00759    //ParamDesc = ConvertToPARAMDESC(FeatureDesc->ParamDesc, N);
00760    Clusterer = MakeClusterer(N,FeatureDesc->ParamDesc);
00761 // free(ParamDesc);
00762 
00763    FeatureList = CharSample->List;
00764    CharID = 0;
00765    iterate(FeatureList)
00766    {
00767       FeatureSet = (FEATURE_SET) first (FeatureList);
00768       for (i=0; i < FeatureSet->MaxNumFeatures; i++)
00769       {
00770          if (Sample == NULL)
00771             Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
00772          for (j=0; j < N; j++)
00773             if (RoundingAccuracy != 0.0)
00774                Sample[j] = round(FeatureSet->Features[i]->Params[j], RoundingAccuracy);
00775             else
00776                Sample[j] = FeatureSet->Features[i]->Params[j];
00777             MakeSample (Clusterer, Sample, CharID);
00778       }
00779       CharID++;
00780    }
00781    if ( Sample != NULL ) free( Sample );
00782    return( Clusterer );
00783 
00784 }//SetUpForClustering
00785 
00786 /*---------------------------------------------------------------------------*/
00794 void AddToNormProtosList(
00795    LIST* NormProtoList,
00796    LIST ProtoList,
00797    char* CharName)
00798 {
00799    PROTOTYPE* Proto;
00800    LABELEDLIST LabeledProtoList;
00801 
00802    LabeledProtoList = NewLabeledList(CharName);
00803    iterate(ProtoList)
00804    {
00805       Proto = (PROTOTYPE *) first (ProtoList);
00806       LabeledProtoList->List = push(LabeledProtoList->List, Proto);
00807    }
00808    *NormProtoList = push(*NormProtoList, LabeledProtoList);
00809 }//AddToNormProtosList
00810 
00811 /*-------------------------------------------------------------------------*/
00821 void WriteProtos(
00822      FILE   *File,
00823      UINT16 N,
00824      LIST   ProtoList,
00825      BOOL8  WriteSigProtos,
00826      BOOL8  WriteInsigProtos)
00827 {
00828    PROTOTYPE   *Proto;
00829 
00830    // write prototypes
00831    iterate(ProtoList)
00832    {
00833       Proto = (PROTOTYPE *) first ( ProtoList );
00834       if (( Proto->Significant && WriteSigProtos )
00835            || ( ! Proto->Significant && WriteInsigProtos ) )
00836          WritePrototype( File, N, Proto );
00837    }
00838 }// WriteProtos
00839 
00840 /*---------------------------------------------------------------------------*/
00849 int NumberOfProtos(
00850    LIST ProtoList,
00851     BOOL8   CountSigProtos,
00852     BOOL8   CountInsigProtos)
00853 {
00854    int N = 0;
00855    PROTOTYPE   *Proto;
00856 
00857    iterate(ProtoList)
00858    {
00859       Proto = (PROTOTYPE *) first ( ProtoList );
00860       if (( Proto->Significant && CountSigProtos )
00861            || ( ! Proto->Significant && CountInsigProtos ) )
00862          N++;
00863    }
00864    return(N);
00865 }//NumberOfProtos
00866 

Generated on Wed Feb 28 19:49:12 2007 for Tesseract by  doxygen 1.5.1