classify/cluster.cpp

Go to the documentation of this file.
00001 
00019 #include "oldheap.h"
00020 #include "const.h"
00021 #include "cluster.h"
00022 #include "emalloc.h"
00023 #include "danerror.h"
00024 #include "freelist.h"
00025 #include <math.h>
00026 
00027 #define HOTELLING 1  // If true use Hotelling's test to decide where to split.
00028 #define FTABLE_X 10  // Size of FTable.
00029 #define FTABLE_Y 100 // Size of FTable.
00030 
00035 double FTable[FTABLE_Y][FTABLE_X] = {
00036  {4052.19,4999.52,5403.34,5624.62,5763.65,5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
00037   {98.502,99.000, 99.166, 99.249, 99.300,  99.333,  99.356,  99.374,  99.388,  99.399,},
00038   {34.116,30.816, 29.457, 28.710, 28.237,  27.911,  27.672,  27.489,  27.345,  27.229,},
00039   {21.198,18.000, 16.694, 15.977, 15.522,  15.207,  14.976,  14.799,  14.659,  14.546,},
00040   {16.258,13.274, 12.060, 11.392, 10.967,  10.672,  10.456,  10.289,  10.158,  10.051,},
00041   {13.745,10.925,  9.780,  9.148,  8.746,   8.466,   8.260,   8.102,   7.976,   7.874,},
00042   {12.246, 9.547,  8.451,  7.847,  7.460,   7.191,   6.993,   6.840,   6.719,   6.620,},
00043   {11.259, 8.649,  7.591,  7.006,  6.632,   6.371,   6.178,   6.029,   5.911,   5.814,},
00044   {10.561, 8.022,  6.992,  6.422,  6.057,   5.802,   5.613,   5.467,   5.351,   5.257,},
00045   {10.044, 7.559,  6.552,  5.994,  5.636,   5.386,   5.200,   5.057,   4.942,   4.849,},
00046   { 9.646, 7.206,  6.217,  5.668,  5.316,   5.069,   4.886,   4.744,   4.632,   4.539,},
00047   { 9.330, 6.927,  5.953,  5.412,  5.064,   4.821,   4.640,   4.499,   4.388,   4.296,},
00048   { 9.074, 6.701,  5.739,  5.205,  4.862,   4.620,   4.441,   4.302,   4.191,   4.100,},
00049   { 8.862, 6.515,  5.564,  5.035,  4.695,   4.456,   4.278,   4.140,   4.030,   3.939,},
00050   { 8.683, 6.359,  5.417,  4.893,  4.556,   4.318,   4.142,   4.004,   3.895,   3.805,},
00051   { 8.531, 6.226,  5.292,  4.773,  4.437,   4.202,   4.026,   3.890,   3.780,   3.691,},
00052   { 8.400, 6.112,  5.185,  4.669,  4.336,   4.102,   3.927,   3.791,   3.682,   3.593,},
00053   { 8.285, 6.013,  5.092,  4.579,  4.248,   4.015,   3.841,   3.705,   3.597,   3.508,},
00054   { 8.185, 5.926,  5.010,  4.500,  4.171,   3.939,   3.765,   3.631,   3.523,   3.434,},
00055   { 8.096, 5.849,  4.938,  4.431,  4.103,   3.871,   3.699,   3.564,   3.457,   3.368,},
00056   { 8.017, 5.780,  4.874,  4.369,  4.042,   3.812,   3.640,   3.506,   3.398,   3.310,},
00057   { 7.945, 5.719,  4.817,  4.313,  3.988,   3.758,   3.587,   3.453,   3.346,   3.258,},
00058   { 7.881, 5.664,  4.765,  4.264,  3.939,   3.710,   3.539,   3.406,   3.299,   3.211,},
00059   { 7.823, 5.614,  4.718,  4.218,  3.895,   3.667,   3.496,   3.363,   3.256,   3.168,},
00060   { 7.770, 5.568,  4.675,  4.177,  3.855,   3.627,   3.457,   3.324,   3.217,   3.129,},
00061   { 7.721, 5.526,  4.637,  4.140,  3.818,   3.591,   3.421,   3.288,   3.182,   3.094,},
00062   { 7.677, 5.488,  4.601,  4.106,  3.785,   3.558,   3.388,   3.256,   3.149,   3.062,},
00063   { 7.636, 5.453,  4.568,  4.074,  3.754,   3.528,   3.358,   3.226,   3.120,   3.032,},
00064   { 7.598, 5.420,  4.538,  4.045,  3.725,   3.499,   3.330,   3.198,   3.092,   3.005,},
00065   { 7.562, 5.390,  4.510,  4.018,  3.699,   3.473,   3.305,   3.173,   3.067,   2.979,},
00066   { 7.530, 5.362,  4.484,  3.993,  3.675,   3.449,   3.281,   3.149,   3.043,   2.955,},
00067   { 7.499, 5.336,  4.459,  3.969,  3.652,   3.427,   3.258,   3.127,   3.021,   2.934,},
00068   { 7.471, 5.312,  4.437,  3.948,  3.630,   3.406,   3.238,   3.106,   3.000,   2.913,},
00069   { 7.444, 5.289,  4.416,  3.927,  3.611,   3.386,   3.218,   3.087,   2.981,   2.894,},
00070   { 7.419, 5.268,  4.396,  3.908,  3.592,   3.368,   3.200,   3.069,   2.963,   2.876,},
00071   { 7.396, 5.248,  4.377,  3.890,  3.574,   3.351,   3.183,   3.052,   2.946,   2.859,},
00072   { 7.373, 5.229,  4.360,  3.873,  3.558,   3.334,   3.167,   3.036,   2.930,   2.843,},
00073   { 7.353, 5.211,  4.343,  3.858,  3.542,   3.319,   3.152,   3.021,   2.915,   2.828,},
00074   { 7.333, 5.194,  4.327,  3.843,  3.528,   3.305,   3.137,   3.006,   2.901,   2.814,},
00075   { 7.314, 5.179,  4.313,  3.828,  3.514,   3.291,   3.124,   2.993,   2.888,   2.801,},
00076   { 7.296, 5.163,  4.299,  3.815,  3.501,   3.278,   3.111,   2.980,   2.875,   2.788,},
00077   { 7.280, 5.149,  4.285,  3.802,  3.488,   3.266,   3.099,   2.968,   2.863,   2.776,},
00078   { 7.264, 5.136,  4.273,  3.790,  3.476,   3.254,   3.087,   2.957,   2.851,   2.764,},
00079   { 7.248, 5.123,  4.261,  3.778,  3.465,   3.243,   3.076,   2.946,   2.840,   2.754,},
00080   { 7.234, 5.110,  4.249,  3.767,  3.454,   3.232,   3.066,   2.935,   2.830,   2.743,},
00081   { 7.220, 5.099,  4.238,  3.757,  3.444,   3.222,   3.056,   2.925,   2.820,   2.733,},
00082   { 7.207, 5.087,  4.228,  3.747,  3.434,   3.213,   3.046,   2.916,   2.811,   2.724,},
00083   { 7.194, 5.077,  4.218,  3.737,  3.425,   3.204,   3.037,   2.907,   2.802,   2.715,},
00084   { 7.182, 5.066,  4.208,  3.728,  3.416,   3.195,   3.028,   2.898,   2.793,   2.706,},
00085   { 7.171, 5.057,  4.199,  3.720,  3.408,   3.186,   3.020,   2.890,   2.785,   2.698,},
00086   { 7.159, 5.047,  4.191,  3.711,  3.400,   3.178,   3.012,   2.882,   2.777,   2.690,},
00087   { 7.149, 5.038,  4.182,  3.703,  3.392,   3.171,   3.005,   2.874,   2.769,   2.683,},
00088   { 7.139, 5.030,  4.174,  3.695,  3.384,   3.163,   2.997,   2.867,   2.762,   2.675,},
00089   { 7.129, 5.021,  4.167,  3.688,  3.377,   3.156,   2.990,   2.860,   2.755,   2.668,},
00090   { 7.119, 5.013,  4.159,  3.681,  3.370,   3.149,   2.983,   2.853,   2.748,   2.662,},
00091   { 7.110, 5.006,  4.152,  3.674,  3.363,   3.143,   2.977,   2.847,   2.742,   2.655,},
00092   { 7.102, 4.998,  4.145,  3.667,  3.357,   3.136,   2.971,   2.841,   2.736,   2.649,},
00093   { 7.093, 4.991,  4.138,  3.661,  3.351,   3.130,   2.965,   2.835,   2.730,   2.643,},
00094   { 7.085, 4.984,  4.132,  3.655,  3.345,   3.124,   2.959,   2.829,   2.724,   2.637,},
00095   { 7.077, 4.977,  4.126,  3.649,  3.339,   3.119,   2.953,   2.823,   2.718,   2.632,},
00096   { 7.070, 4.971,  4.120,  3.643,  3.333,   3.113,   2.948,   2.818,   2.713,   2.626,},
00097   { 7.062, 4.965,  4.114,  3.638,  3.328,   3.108,   2.942,   2.813,   2.708,   2.621,},
00098   { 7.055, 4.959,  4.109,  3.632,  3.323,   3.103,   2.937,   2.808,   2.703,   2.616,},
00099   { 7.048, 4.953,  4.103,  3.627,  3.318,   3.098,   2.932,   2.803,   2.698,   2.611,},
00100   { 7.042, 4.947,  4.098,  3.622,  3.313,   3.093,   2.928,   2.798,   2.693,   2.607,},
00101   { 7.035, 4.942,  4.093,  3.618,  3.308,   3.088,   2.923,   2.793,   2.689,   2.602,},
00102   { 7.029, 4.937,  4.088,  3.613,  3.304,   3.084,   2.919,   2.789,   2.684,   2.598,},
00103   { 7.023, 4.932,  4.083,  3.608,  3.299,   3.080,   2.914,   2.785,   2.680,   2.593,},
00104   { 7.017, 4.927,  4.079,  3.604,  3.295,   3.075,   2.910,   2.781,   2.676,   2.589,},
00105   { 7.011, 4.922,  4.074,  3.600,  3.291,   3.071,   2.906,   2.777,   2.672,   2.585,},
00106   { 7.006, 4.917,  4.070,  3.596,  3.287,   3.067,   2.902,   2.773,   2.668,   2.581,},
00107   { 7.001, 4.913,  4.066,  3.591,  3.283,   3.063,   2.898,   2.769,   2.664,   2.578,},
00108   { 6.995, 4.908,  4.062,  3.588,  3.279,   3.060,   2.895,   2.765,   2.660,   2.574,},
00109   { 6.990, 4.904,  4.058,  3.584,  3.275,   3.056,   2.891,   2.762,   2.657,   2.570,},
00110   { 6.985, 4.900,  4.054,  3.580,  3.272,   3.052,   2.887,   2.758,   2.653,   2.567,},
00111   { 6.981, 4.896,  4.050,  3.577,  3.268,   3.049,   2.884,   2.755,   2.650,   2.563,},
00112   { 6.976, 4.892,  4.047,  3.573,  3.265,   3.046,   2.881,   2.751,   2.647,   2.560,},
00113   { 6.971, 4.888,  4.043,  3.570,  3.261,   3.042,   2.877,   2.748,   2.644,   2.557,},
00114   { 6.967, 4.884,  4.040,  3.566,  3.258,   3.039,   2.874,   2.745,   2.640,   2.554,},
00115   { 6.963, 4.881,  4.036,  3.563,  3.255,   3.036,   2.871,   2.742,   2.637,   2.551,},
00116   { 6.958, 4.877,  4.033,  3.560,  3.252,   3.033,   2.868,   2.739,   2.634,   2.548,},
00117   { 6.954, 4.874,  4.030,  3.557,  3.249,   3.030,   2.865,   2.736,   2.632,   2.545,},
00118   { 6.950, 4.870,  4.027,  3.554,  3.246,   3.027,   2.863,   2.733,   2.629,   2.542,},
00119   { 6.947, 4.867,  4.024,  3.551,  3.243,   3.025,   2.860,   2.731,   2.626,   2.539,},
00120   { 6.943, 4.864,  4.021,  3.548,  3.240,   3.022,   2.857,   2.728,   2.623,   2.537,},
00121   { 6.939, 4.861,  4.018,  3.545,  3.238,   3.019,   2.854,   2.725,   2.621,   2.534,},
00122   { 6.935, 4.858,  4.015,  3.543,  3.235,   3.017,   2.852,   2.723,   2.618,   2.532,},
00123   { 6.932, 4.855,  4.012,  3.540,  3.233,   3.014,   2.849,   2.720,   2.616,   2.529,},
00124   { 6.928, 4.852,  4.010,  3.538,  3.230,   3.012,   2.847,   2.718,   2.613,   2.527,},
00125   { 6.925, 4.849,  4.007,  3.535,  3.228,   3.009,   2.845,   2.715,   2.611,   2.524,},
00126   { 6.922, 4.846,  4.004,  3.533,  3.225,   3.007,   2.842,   2.713,   2.609,   2.522,},
00127   { 6.919, 4.844,  4.002,  3.530,  3.223,   3.004,   2.840,   2.711,   2.606,   2.520,},
00128   { 6.915, 4.841,  3.999,  3.528,  3.221,   3.002,   2.838,   2.709,   2.604,   2.518,},
00129   { 6.912, 4.838,  3.997,  3.525,  3.218,   3.000,   2.835,   2.706,   2.602,   2.515,},
00130   { 6.909, 4.836,  3.995,  3.523,  3.216,   2.998,   2.833,   2.704,   2.600,   2.513,},
00131   { 6.906, 4.833,  3.992,  3.521,  3.214,   2.996,   2.831,   2.702,   2.598,   2.511,},
00132   { 6.904, 4.831,  3.990,  3.519,  3.212,   2.994,   2.829,   2.700,   2.596,   2.509,},
00133   { 6.901, 4.829,  3.988,  3.517,  3.210,   2.992,   2.827,   2.698,   2.594,   2.507,},
00134   { 6.898, 4.826,  3.986,  3.515,  3.208,   2.990,   2.825,   2.696,   2.592,   2.505,},
00135   { 6.895, 4.824,  3.984,  3.513,  3.206,   2.988,   2.823,   2.694,   2.590,   2.503}
00136 };
00137 
00146 #define MINVARIANCE     0.0001
00147 
00158 #define MINBUCKETS      5
00159 #define MINSAMPLESPERBUCKET 5
00160 #define MINSAMPLES    (MINBUCKETS * MINSAMPLESPERBUCKET)
00161 #define MINSAMPLESNEEDED  1
00162 
00176 #define BUCKETTABLESIZE   1024
00177 #define NORMALEXTENT    3.0
00178 
00183 typedef struct
00184 {
00185   CLUSTER *Cluster;
00186   CLUSTER *Neighbor;
00187 } TEMPCLUSTER;
00188 
00193 typedef struct
00194 {
00195   FLOAT32 AvgVariance;
00196   FLOAT32 *CoVariance;
00197   FLOAT32 *Min;                  // largest negative distance from the mean
00198   FLOAT32 *Max;                  // largest positive distance from the mean
00199 } STATISTICS;
00200 
00205 typedef struct
00206 {
00207   DISTRIBUTION Distribution;     // distribution being tested for
00208   UINT32 SampleCount;            // # of samples in histogram
00209   FLOAT64 Confidence;            // confidence level of test
00210   FLOAT64 ChiSquared;            // test threshold
00211   UINT16 NumberOfBuckets;        // number of cells in histogram
00212   UINT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
00213   UINT32 *Count;                 // frequency of occurence histogram
00214   FLOAT32 *ExpectedCount;        // expected histogram
00215 } BUCKETS;
00216 
00221 typedef struct
00222 {
00223   UINT16 DegreesOfFreedom;
00224   FLOAT64 Alpha;
00225   FLOAT64 ChiSquared;
00226 } CHISTRUCT;
00227 
00228 typedef FLOAT64 (*DENSITYFUNC) (INT32);
00229 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
00230 
00231 #define Odd(N) ((N)%2)
00232 #define Mirror(N,R) ((R) - (N) - 1)
00233 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
00234 
00235 //--------------Global Data Definitions and Declarations----------------------
00236 
00241 static HEAP *Heap;
00242 static TEMPCLUSTER *TempCluster;
00243 static KDTREE *Tree;
00244 static INT32 CurrentTemp;
00245 
00256 #define SqrtOf2Pi     2.506628275
00257 static FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
00258 static FLOAT64 NormalVariance =
00259 (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
00260 static FLOAT64 NormalMagnitude =
00261 (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
00262 static FLOAT64 NormalMean = BUCKETTABLESIZE / 2;
00263 
00265 static LIST OldBuckets[] = { NIL, NIL, NIL };
00266 
00274 #define LOOKUPTABLESIZE   8
00275 #define MAXBUCKETS      39
00276 #define MAXDEGREESOFFREEDOM MAXBUCKETS
00277 static UINT32 CountTable[LOOKUPTABLESIZE] = {
00278   MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
00279 };
00280 static UINT16 BucketsTable[LOOKUPTABLESIZE] = {
00281   MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
00282 };
00283 
00284 /*-------------------------------------------------------------------------
00285           Private Function Prototypes
00286 --------------------------------------------------------------------------*/
00287 void CreateClusterTree(CLUSTERER *Clusterer);
00288 
00289 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, INT32 Level);
00290 
00291 CLUSTER *FindNearestNeighbor(KDTREE *Tree,
00292                              CLUSTER *Cluster,
00293                              FLOAT32 *Distance);
00294 
00295 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
00296 
00297 INT32 MergeClusters (INT16 N,
00298 register PARAM_DESC ParamDesc[],
00299 register INT32 n1,
00300 register INT32 n2,
00301 register FLOAT32 m[],
00302 register FLOAT32 m1[], register FLOAT32 m2[]);
00303 
00304 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
00305 
00306 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
00307                          CLUSTERCONFIG *Config,
00308                          CLUSTER *Cluster);
00309 
00310 PROTOTYPE *MakeDegenerateProto(UINT16 N,
00311                                CLUSTER *Cluster,
00312                                STATISTICS *Statistics,
00313                                PROTOSTYLE Style,
00314                                INT32 MinSamples);
00315 
00316 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
00317                                CLUSTER *Cluster,
00318                                STATISTICS *Statistics);
00319 
00320 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
00321                               CLUSTER *Cluster,
00322                               STATISTICS *Statistics,
00323                               BUCKETS *Buckets);
00324 
00325 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
00326                                CLUSTER *Cluster,
00327                                STATISTICS *Statistics,
00328                                BUCKETS *Buckets);
00329 
00330 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
00331                           CLUSTER *Cluster,
00332                           STATISTICS *Statistics,
00333                           BUCKETS *NormalBuckets,
00334                           FLOAT64 Confidence);
00335 
00336 void MakeDimRandom(UINT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
00337 
00338 void MakeDimUniform(UINT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
00339 
00340 STATISTICS *ComputeStatistics (INT16 N,
00341 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
00342 
00343 PROTOTYPE *NewSphericalProto(UINT16 N,
00344                              CLUSTER *Cluster,
00345                              STATISTICS *Statistics);
00346 
00347 PROTOTYPE *NewEllipticalProto(INT16 N,
00348                               CLUSTER *Cluster,
00349                               STATISTICS *Statistics);
00350 
00351 PROTOTYPE *NewMixedProto(INT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
00352 
00353 PROTOTYPE *NewSimpleProto(INT16 N, CLUSTER *Cluster);
00354 
00355 BOOL8 Independent (PARAM_DESC ParamDesc[],
00356 INT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
00357 
00358 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
00359                     UINT32 SampleCount,
00360                     FLOAT64 Confidence);
00361 
00362 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
00363                      UINT32 SampleCount,
00364                      FLOAT64 Confidence);
00365 
00366 UINT16 OptimumNumberOfBuckets(UINT32 SampleCount);
00367 
00368 FLOAT64 ComputeChiSquared(UINT16 DegreesOfFreedom, FLOAT64 Alpha);
00369 
00370 FLOAT64 NormalDensity(INT32 x);
00371 
00372 FLOAT64 UniformDensity(INT32 x);
00373 
00374 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx);
00375 
00376 void FillBuckets(BUCKETS *Buckets,
00377                  CLUSTER *Cluster,
00378                  UINT16 Dim,
00379                  PARAM_DESC *ParamDesc,
00380                  FLOAT32 Mean,
00381                  FLOAT32 StdDev);
00382 
00383 UINT16 NormalBucket(PARAM_DESC *ParamDesc,
00384                     FLOAT32 x,
00385                     FLOAT32 Mean,
00386                     FLOAT32 StdDev);
00387 
00388 UINT16 UniformBucket(PARAM_DESC *ParamDesc,
00389                      FLOAT32 x,
00390                      FLOAT32 Mean,
00391                      FLOAT32 StdDev);
00392 
00393 BOOL8 DistributionOK(BUCKETS *Buckets);
00394 
00395 void FreeStatistics(STATISTICS *Statistics);
00396 
00397 void FreeBuckets(BUCKETS *Buckets);
00398 
00399 void FreeCluster(CLUSTER *Cluster);
00400 
00401 UINT16 DegreesOfFreedom(DISTRIBUTION Distribution, UINT16 HistogramBuckets);
00402 
00403 int NumBucketsMatch(void *arg1,
00404                     void *arg2);
00405 
00406 int ListEntryMatch(void *arg1, void *arg2);
00407 
00408 void AdjustBuckets(BUCKETS *Buckets, UINT32 NewSampleCount);
00409 
00410 void InitBuckets(BUCKETS *Buckets);
00411 
00412 int AlphaMatch(void *arg1,
00413                void *arg2);
00414 
00415 CHISTRUCT *NewChiStruct(UINT16 DegreesOfFreedom, FLOAT64 Alpha);
00416 
00417 FLOAT64 Solve(SOLVEFUNC Function,
00418               void *FunctionParams,
00419               FLOAT64 InitialGuess,
00420               FLOAT64 Accuracy);
00421 
00422 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
00423 
00424 BOOL8 MultipleCharSamples(CLUSTERER *Clusterer,
00425                           CLUSTER *Cluster,
00426                           FLOAT32 MaxIllegal);
00427 
00428 double InvertMatrix(const float* input, int size, float* inv);
00429 
00430 //--------------------------Public Code--------------------------------------
00441 CLUSTERER *
00442 MakeClusterer (INT16 SampleSize, PARAM_DESC ParamDesc[]) {
00443   CLUSTERER *Clusterer;
00444   int i;
00445 
00446   // allocate main clusterer data structure and init simple fields
00447   Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
00448   Clusterer->SampleSize = SampleSize;
00449   Clusterer->NumberOfSamples = 0;
00450   Clusterer->NumChar = 0;
00451 
00452   // init fields which will not be used initially
00453   Clusterer->Root = NULL;
00454   Clusterer->ProtoList = NIL;
00455 
00456   // maintain a copy of param descriptors in the clusterer data structure
00457   Clusterer->ParamDesc =
00458     (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
00459   for (i = 0; i < SampleSize; i++) {
00460     Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
00461     Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
00462     Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
00463     Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
00464     Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
00465     Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
00466     Clusterer->ParamDesc[i].MidRange =
00467       (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
00468   }
00469 
00470   // allocate a kd tree to hold the samples
00471   Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
00472 
00473   // execute hook for monitoring clustering operation
00474   // (*ClustererCreationHook)( Clusterer );
00475 
00476   return (Clusterer);
00477 }                                // MakeClusterer
00478 
00479 
00497 SAMPLE *
00498 MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], INT32 CharID) {
00499   SAMPLE *Sample;
00500   int i;
00501 
00502   // see if the samples have already been clustered - if so trap an error
00503   if (Clusterer->Root != NULL)
00504     DoError (ALREADYCLUSTERED,
00505       "Can't add samples after they have been clustered");
00506 
00507   // allocate the new sample and initialize it
00508   Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
00509     (Clusterer->SampleSize -
00510     1) * sizeof (FLOAT32));
00511   Sample->Clustered = FALSE;
00512   Sample->Prototype = FALSE;
00513   Sample->SampleCount = 1;
00514   Sample->Left = NULL;
00515   Sample->Right = NULL;
00516   Sample->CharID = CharID;
00517 
00518   for (i = 0; i < Clusterer->SampleSize; i++)
00519     Sample->Mean[i] = Feature[i];
00520 
00521   // add the sample to the KD tree - keep track of the total # of samples
00522   Clusterer->NumberOfSamples++;
00523   KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
00524   if (CharID >= Clusterer->NumChar)
00525     Clusterer->NumChar = CharID + 1;
00526 
00527   // execute hook for monitoring clustering operation
00528   // (*SampleCreationHook)( Sample );
00529 
00530   return (Sample);
00531 }                                // MakeSample
00532 
00533 
00557 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
00558   //only create cluster tree if samples have never been clustered before
00559   if (Clusterer->Root == NULL)
00560     CreateClusterTree(Clusterer);
00561 
00562   //deallocate the old prototype list if one exists
00563   FreeProtoList (&Clusterer->ProtoList);
00564   Clusterer->ProtoList = NIL;
00565 
00566   //compute prototypes starting at the root node in the tree
00567   ComputePrototypes(Clusterer, Config);
00568   return (Clusterer->ProtoList);
00569 }                                // ClusterSamples
00570 
00571 
00588 void FreeClusterer(CLUSTERER *Clusterer) {
00589   if (Clusterer != NULL) {
00590     memfree (Clusterer->ParamDesc);
00591     if (Clusterer->KDTree != NULL)
00592       FreeKDTree (Clusterer->KDTree);
00593     if (Clusterer->Root != NULL)
00594       FreeCluster (Clusterer->Root);
00595     iterate (Clusterer->ProtoList) {
00596       ((PROTOTYPE *) (first (Clusterer->ProtoList)))->Cluster = NULL;
00597     }
00598     memfree(Clusterer);
00599   }
00600 }                                // FreeClusterer
00601 
00602 
00614 void FreeProtoList(LIST *ProtoList) {
00615   destroy_nodes(*ProtoList, FreePrototype);
00616 }                                // FreeProtoList
00617 
00618 
00631 void FreePrototype(void *arg) {  //PROTOTYPE     *Prototype)
00632   PROTOTYPE *Prototype = (PROTOTYPE *) arg;
00633 
00634   // unmark the corresponding cluster (if there is one
00635   if (Prototype->Cluster != NULL)
00636     Prototype->Cluster->Prototype = FALSE;
00637 
00638   // deallocate the prototype statistics and then the prototype itself
00639   if (Prototype->Distrib != NULL)
00640     memfree (Prototype->Distrib);
00641   if (Prototype->Mean != NULL)
00642     memfree (Prototype->Mean);
00643   if (Prototype->Style != spherical) {
00644     if (Prototype->Variance.Elliptical != NULL)
00645       memfree (Prototype->Variance.Elliptical);
00646     if (Prototype->Magnitude.Elliptical != NULL)
00647       memfree (Prototype->Magnitude.Elliptical);
00648     if (Prototype->Weight.Elliptical != NULL)
00649       memfree (Prototype->Weight.Elliptical);
00650   }
00651   memfree(Prototype);
00652 }                                // FreePrototype
00653 
00654 
00672 CLUSTER *NextSample(LIST *SearchState) {
00673   CLUSTER *Cluster;
00674 
00675   if (*SearchState == NIL)
00676     return (NULL);
00677   Cluster = (CLUSTER *) first (*SearchState);
00678   *SearchState = pop (*SearchState);
00679   while (TRUE) {
00680     if (Cluster->Left == NULL)
00681       return (Cluster);
00682     *SearchState = push (*SearchState, Cluster->Right);
00683     Cluster = Cluster->Left;
00684   }
00685 }                                // NextSample
00686 
00687 
00698 FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension) {
00699   return (Proto->Mean[Dimension]);
00700 }                                // Mean
00701 
00702 
00712 FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension) {
00713   switch (Proto->Style) {
00714     case spherical:
00715       return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
00716     case elliptical:
00717       return ((FLOAT32)
00718         sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00719     case mixed:
00720       switch (Proto->Distrib[Dimension]) {
00721         case normal:
00722           return ((FLOAT32)
00723             sqrt ((double) Proto->Variance.Elliptical[Dimension]));
00724         case uniform:
00725         case D_random:
00726           return (Proto->Variance.Elliptical[Dimension]);
00727       }
00728   }
00729   return 0.0f;
00730 }                                // StandardDeviation
00731 
00732 
00733 /*---------------------------------------------------------------------------
00734             Private Code
00735 ----------------------------------------------------------------------------*/
00736 
00756 void CreateClusterTree(CLUSTERER *Clusterer) {
00757   HEAPENTRY HeapEntry;
00758   TEMPCLUSTER *PotentialCluster;
00759 
00760   // save the kd-tree in a global variable so kd-tree walker can get at it
00761   Tree = Clusterer->KDTree;
00762 
00763   // allocate memory to to hold all of the "potential" clusters
00764   TempCluster = (TEMPCLUSTER *)
00765     Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER));
00766   CurrentTemp = 0;
00767 
00768   // note each sample and its nearest neighbor form a "potential" cluster
00769   // save these in a heap with the "best" potential clusters on top
00770   Heap = MakeHeap (Clusterer->NumberOfSamples);
00771   KDWalk (Tree, (void_proc) MakePotentialClusters);
00772 
00773   // form potential clusters into actual clusters - always do "best" first
00774   while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) {
00775     PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data);
00776 
00777     // if main cluster of potential cluster is already in another cluster
00778     // then we don't need to worry about it
00779     if (PotentialCluster->Cluster->Clustered) {
00780       continue;
00781     }
00782 
00783     // if main cluster is not yet clustered, but its nearest neighbor is
00784     // then we must find a new nearest neighbor
00785     else if (PotentialCluster->Neighbor->Clustered) {
00786       PotentialCluster->Neighbor =
00787         FindNearestNeighbor (Tree, PotentialCluster->Cluster,
00788         &(HeapEntry.Key));
00789       if (PotentialCluster->Neighbor != NULL) {
00790         HeapStore(Heap, &HeapEntry);
00791       }
00792     }
00793 
00794     // if neither cluster is already clustered, form permanent cluster
00795     else {
00796       PotentialCluster->Cluster =
00797         MakeNewCluster(Clusterer, PotentialCluster);
00798       PotentialCluster->Neighbor =
00799         FindNearestNeighbor (Tree, PotentialCluster->Cluster,
00800         &(HeapEntry.Key));
00801       if (PotentialCluster->Neighbor != NULL) {
00802         HeapStore(Heap, &HeapEntry);
00803       }
00804     }
00805   }
00806 
00807   // the root node in the cluster tree is now the only node in the kd-tree
00808   Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree);
00809 
00810   // free up the memory used by the K-D tree, heap, and temp clusters
00811   FreeKDTree(Tree);
00812   Clusterer->KDTree = NULL;
00813   FreeHeap(Heap);
00814   memfree(TempCluster);
00815 }                                // CreateClusterTree
00816 
00817 
00838 void MakePotentialClusters(CLUSTER *Cluster, VISIT Order, INT32 Level) {
00839   HEAPENTRY HeapEntry;
00840 
00841   if ((Order == preorder) || (Order == leaf)) {
00842     TempCluster[CurrentTemp].Cluster = Cluster;
00843     HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]);
00844     TempCluster[CurrentTemp].Neighbor =
00845       FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster,
00846       &(HeapEntry.Key));
00847     if (TempCluster[CurrentTemp].Neighbor != NULL) {
00848       HeapStore(Heap, &HeapEntry);
00849       CurrentTemp++;
00850     }
00851   }
00852 }                                // MakePotentialClusters
00853 
00854 
00874 CLUSTER *
00875 FindNearestNeighbor (KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
00876 #define MAXNEIGHBORS  2
00877 #define MAXDISTANCE   MAX_FLOAT32
00878 {
00879   CLUSTER *Neighbor[MAXNEIGHBORS];
00880   FLOAT32 Dist[MAXNEIGHBORS];
00881   INT32 NumberOfNeighbors;
00882   INT32 i;
00883   CLUSTER *BestNeighbor;
00884 
00885   // find the 2 nearest neighbors of the cluster
00886   NumberOfNeighbors = KDNearestNeighborSearch
00887     (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist);
00888 
00889   // search for the nearest neighbor that is not the cluster itself
00890   *Distance = MAXDISTANCE;
00891   BestNeighbor = NULL;
00892   for (i = 0; i < NumberOfNeighbors; i++) {
00893     if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
00894       *Distance = Dist[i];
00895       BestNeighbor = Neighbor[i];
00896     }
00897   }
00898   return (BestNeighbor);
00899 }                                // FindNearestNeighbor
00900 
00901 
00916 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
00917   CLUSTER *Cluster;
00918 
00919   // allocate the new cluster and initialize it
00920   Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) +
00921     (Clusterer->SampleSize -
00922     1) * sizeof (FLOAT32));
00923   Cluster->Clustered = FALSE;
00924   Cluster->Prototype = FALSE;
00925   Cluster->Left = TempCluster->Cluster;
00926   Cluster->Right = TempCluster->Neighbor;
00927   Cluster->CharID = -1;
00928 
00929   // mark the old clusters as "clustered" and delete them from the kd-tree
00930   Cluster->Left->Clustered = TRUE;
00931   Cluster->Right->Clustered = TRUE;
00932   KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
00933   KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
00934 
00935   // compute the mean and sample count for the new cluster
00936   Cluster->SampleCount =
00937     MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc,
00938     Cluster->Left->SampleCount, Cluster->Right->SampleCount,
00939     Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
00940 
00941   // add the new cluster to the KD tree
00942   KDStore (Clusterer->KDTree, Cluster->Mean, Cluster);
00943   return (Cluster);
00944 }                                // MakeNewCluster
00945 
00946 
00964 INT32 MergeClusters (INT16 N,
00965 register PARAM_DESC ParamDesc[],
00966 register INT32 n1,
00967 register INT32 n2,
00968 register FLOAT32 m[],
00969 register FLOAT32 m1[], register FLOAT32 m2[]) {
00970   register INT32 i, n;
00971 
00972   n = n1 + n2;
00973   for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
00974     if (ParamDesc->Circular) {
00975       /*
00976      If distance between means is greater than allowed
00977       reduce upper point by one "rotation" to compute mean
00978       then normalize the mean back into the accepted range
00979      */
00980       if ((*m2 - *m1) > ParamDesc->HalfRange) {
00981         *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
00982         if (*m < ParamDesc->Min)
00983           *m += ParamDesc->Range;
00984       }
00985       else if ((*m1 - *m2) > ParamDesc->HalfRange) {
00986         *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
00987         if (*m < ParamDesc->Min)
00988           *m += ParamDesc->Range;
00989       }
00990       else
00991         *m = (n1 * *m1 + n2 * *m2) / n;
00992     }
00993     else
00994       *m = (n1 * *m1 + n2 * *m2) / n;
00995   }
00996   return (n);
00997 }                                // MergeClusters
00998 
00999 
01011 void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
01012   LIST ClusterStack = NIL;
01013   CLUSTER *Cluster;
01014   PROTOTYPE *Prototype;
01015 
01016   // use a stack to keep track of clusters waiting to be processed
01017   // initially the only cluster on the stack is the root cluster
01018   if (Clusterer->Root != NULL)
01019     ClusterStack = push (NIL, Clusterer->Root);
01020 
01021   // loop until we have analyzed all clusters which are potential prototypes
01022   while (ClusterStack != NIL) {
01023     // Remove the next cluster to be analyzed from the stack
01024     // try to make a prototype from the cluster
01025     // if successful, put it on the proto list, else split the cluster
01026     Cluster = (CLUSTER *) first (ClusterStack);
01027     ClusterStack = pop (ClusterStack);
01028     Prototype = MakePrototype (Clusterer, Config, Cluster);
01029     if (Prototype != NULL) {
01030       Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
01031     }
01032     else {
01033       ClusterStack = push (ClusterStack, Cluster->Right);
01034       ClusterStack = push (ClusterStack, Cluster->Left);
01035     }
01036   }
01037 }                                // ComputePrototypes
01038 
01039 
01060 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
01061                          CLUSTERCONFIG *Config,
01062                          CLUSTER *Cluster) {
01063   STATISTICS *Statistics;
01064   PROTOTYPE *Proto;
01065   BUCKETS *Buckets;
01066 
01067   // filter out clusters which contain samples from the same character
01068   if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
01069     return (NULL);
01070 
01071   // compute the covariance matrix and ranges for the cluster
01072   Statistics =
01073     ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
01074 
01075   // Check for degenerate clusters which need not be analyzed further
01076   // note that the MinSamples test assumes that all clusters with multiple
01077   // character samples have been removed (as above)
01078   Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics,
01079     Config->ProtoStyle,
01080     (INT32) (Config->MinSamples *
01081     Clusterer->NumChar));
01082   if (Proto != NULL) {
01083     FreeStatistics(Statistics);
01084     return (Proto);
01085   }
01086   // check to ensure that all dimensions are independent
01087   if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize,
01088   Statistics->CoVariance, Config->Independence)) {
01089     FreeStatistics(Statistics);
01090     return (NULL);
01091   }
01092 
01093   if (HOTELLING && Config->ProtoStyle == elliptical) {
01094     Proto = TestEllipticalProto(Clusterer, Cluster, Statistics);
01095     if (Proto != NULL) {
01096       FreeStatistics(Statistics);
01097       return Proto;
01098     }
01099   }
01100 
01101   // create a histogram data structure used to evaluate distributions
01102   Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence);
01103 
01104   // create a prototype based on the statistics and test it
01105   switch (Config->ProtoStyle) {
01106     case spherical:
01107       Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
01108       break;
01109     case elliptical:
01110       Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
01111       break;
01112     case mixed:
01113       Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
01114         Config->Confidence);
01115       break;
01116     case automatic:
01117       Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets);
01118       if (Proto != NULL)
01119         break;
01120       Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets);
01121       if (Proto != NULL)
01122         break;
01123       Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets,
01124         Config->Confidence);
01125       break;
01126   }
01127   FreeBuckets(Buckets);
01128   FreeStatistics(Statistics);
01129   return (Proto);
01130 }                                // MakePrototype
01131 
01132 
01155 PROTOTYPE *MakeDegenerateProto(  //this was MinSample
01156                                UINT16 N,
01157                                CLUSTER *Cluster,
01158                                STATISTICS *Statistics,
01159                                PROTOSTYLE Style,
01160                                INT32 MinSamples) {
01161   PROTOTYPE *Proto = NULL;
01162 
01163   if (MinSamples < MINSAMPLESNEEDED)
01164     MinSamples = MINSAMPLESNEEDED;
01165 
01166   if (Cluster->SampleCount < MinSamples) {
01167     switch (Style) {
01168       case spherical:
01169         Proto = NewSphericalProto (N, Cluster, Statistics);
01170         break;
01171       case elliptical:
01172       case automatic:
01173         Proto = NewEllipticalProto (N, Cluster, Statistics);
01174         break;
01175       case mixed:
01176         Proto = NewMixedProto (N, Cluster, Statistics);
01177         break;
01178     }
01179     Proto->Significant = FALSE;
01180   }
01181   return (Proto);
01182 }                                // MakeDegenerateProto
01183 
01197 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
01198                                CLUSTER *Cluster,
01199                                STATISTICS *Statistics) {
01200   int N = Clusterer->SampleSize;
01201   CLUSTER* Left = Cluster->Left;
01202   CLUSTER* Right = Cluster->Right;
01203   if (Left == NULL || Right == NULL)
01204     return NULL;
01205   int TotalDims = Left->SampleCount + Right->SampleCount;
01206   if (TotalDims < N + 1)
01207     return NULL;
01208   FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32));
01209   FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32));
01210   double err = InvertMatrix(Statistics->CoVariance, N, Inverse);
01211   if (err > 1) {
01212     cprintf("Clustering error: Matrix inverse failed with error %g\n", err);
01213   }
01214   for (int dim = 0; dim < N; ++dim) {
01215     Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
01216   }
01217   // Compute Hotelling's T-squared.
01218   double Tsq = 0.0;
01219   for (int x = 0; x < N; ++x) {
01220     double temp = 0.0;
01221     for (int y = 0; y < N; ++y) {
01222       temp += Inverse[y + N*x] * Delta[y];
01223     }
01224     Tsq += Delta[x] * temp;
01225   }
01226   memfree(Inverse);
01227   memfree(Delta);
01228   Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
01229   double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2);
01230   int Fx = N;
01231   if (Fx > FTABLE_X)
01232     Fx = FTABLE_X;
01233   --Fx;
01234   int Fy = TotalDims - N - 1;
01235   if (Fy > FTABLE_Y)
01236     Fy = FTABLE_Y;
01237   --Fy;
01238   if (F < FTable[Fy][Fx]) {
01239     return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01240   }
01241   return NULL;
01242 }
01243 
01259 PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer,
01260                               CLUSTER *Cluster,
01261                               STATISTICS *Statistics,
01262                               BUCKETS *Buckets) {
01263   PROTOTYPE *Proto = NULL;
01264   int i;
01265 
01266   // check that each dimension is a normal distribution
01267   for (i = 0; i < Clusterer->SampleSize; i++) {
01268     if (Clusterer->ParamDesc[i].NonEssential)
01269       continue;
01270 
01271     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01272       Cluster->Mean[i],
01273       sqrt ((FLOAT64) (Statistics->AvgVariance)));
01274     if (!DistributionOK (Buckets))
01275       break;
01276   }
01277   // if all dimensions matched a normal distribution, make a proto
01278   if (i >= Clusterer->SampleSize)
01279     Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
01280   return (Proto);
01281 }                                // MakeSphericalProto
01282 
01283 
01299 PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer,
01300                                CLUSTER *Cluster,
01301                                STATISTICS *Statistics,
01302                                BUCKETS *Buckets) {
01303   PROTOTYPE *Proto = NULL;
01304   int i;
01305 
01306   // check that each dimension is a normal distribution
01307   for (i = 0; i < Clusterer->SampleSize; i++) {
01308     if (Clusterer->ParamDesc[i].NonEssential)
01309       continue;
01310 
01311     FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01312       Cluster->Mean[i],
01313       sqrt ((FLOAT64) Statistics->
01314       CoVariance[i * (Clusterer->SampleSize + 1)]));
01315     if (!DistributionOK (Buckets))
01316       break;
01317   }
01318   // if all dimensions matched a normal distribution, make a proto
01319   if (i >= Clusterer->SampleSize)
01320     Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
01321   return (Proto);
01322 }                                // MakeEllipticalProto
01323 
01324 
01344 PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer,
01345                           CLUSTER *Cluster,
01346                           STATISTICS *Statistics,
01347                           BUCKETS *NormalBuckets,
01348                           FLOAT64 Confidence) {
01349   PROTOTYPE *Proto;
01350   int i;
01351   BUCKETS *UniformBuckets = NULL;
01352   BUCKETS *RandomBuckets = NULL;
01353 
01354   // create a mixed proto to work on - initially assume all dimensions normal*/
01355   Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
01356 
01357   // Find the proper distribution for each dimension
01358   for (i = 0; i < Clusterer->SampleSize; i++) {
01359     if (Clusterer->ParamDesc[i].NonEssential)
01360       continue;
01361 
01362     FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01363       Proto->Mean[i],
01364       sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
01365     if (DistributionOK (NormalBuckets))
01366       continue;
01367 
01368     if (RandomBuckets == NULL)
01369       RandomBuckets =
01370         GetBuckets (D_random, Cluster->SampleCount, Confidence);
01371     MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
01372     FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01373       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01374     if (DistributionOK (RandomBuckets))
01375       continue;
01376 
01377     if (UniformBuckets == NULL)
01378       UniformBuckets =
01379         GetBuckets (uniform, Cluster->SampleCount, Confidence);
01380     MakeDimUniform(i, Proto, Statistics);
01381     FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
01382       Proto->Mean[i], Proto->Variance.Elliptical[i]);
01383     if (DistributionOK (UniformBuckets))
01384       continue;
01385     break;
01386   }
01387   // If any dimension failed to match a distribution, discard the proto
01388   if (i < Clusterer->SampleSize) {
01389     FreePrototype(Proto);
01390     Proto = NULL;
01391   }
01392   if (UniformBuckets != NULL)
01393     FreeBuckets(UniformBuckets);
01394   if (RandomBuckets != NULL)
01395     FreeBuckets(RandomBuckets);
01396   return (Proto);
01397 }                                // MakeMixedProto
01398 
01399 
01410 void MakeDimRandom(UINT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
01411   Proto->Distrib[i] = D_random;
01412   Proto->Mean[i] = ParamDesc->MidRange;
01413   Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
01414 
01415   // subtract out the previous magnitude of this dimension from the total
01416   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01417   Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
01418   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01419   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01420 
01421   // note that the proto Weight is irrelevant for D_random protos
01422 }                                // MakeDimRandom
01423 
01424 
01436 void MakeDimUniform(UINT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
01437   Proto->Distrib[i] = uniform;
01438   Proto->Mean[i] = Proto->Cluster->Mean[i] +
01439     (Statistics->Min[i] + Statistics->Max[i]) / 2;
01440   Proto->Variance.Elliptical[i] =
01441     (Statistics->Max[i] - Statistics->Min[i]) / 2;
01442   if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01443     Proto->Variance.Elliptical[i] = MINVARIANCE;
01444 
01445   // subtract out the previous magnitude of this dimension from the total
01446   Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
01447   Proto->Magnitude.Elliptical[i] =
01448     1.0 / (2.0 * Proto->Variance.Elliptical[i]);
01449   Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01450   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01451 
01452   // note that the proto Weight is irrelevant for uniform protos
01453 }                                // MakeDimUniform
01454 
01455 
01473 STATISTICS *
01474 ComputeStatistics (INT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
01475   STATISTICS *Statistics;
01476   int i, j;
01477   FLOAT32 *CoVariance;
01478   FLOAT32 *Distance;
01479   LIST SearchState;
01480   SAMPLE *Sample;
01481   UINT32 SampleCountAdjustedForBias;
01482 
01483   // allocate memory to hold the statistics results
01484   Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
01485   Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
01486   Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01487   Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01488 
01489   // allocate temporary memory to hold the sample to mean distances
01490   Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01491 
01492   // initialize the statistics
01493   Statistics->AvgVariance = 1.0;
01494   CoVariance = Statistics->CoVariance;
01495   for (i = 0; i < N; i++) {
01496     Statistics->Min[i] = 0.0;
01497     Statistics->Max[i] = 0.0;
01498     for (j = 0; j < N; j++, CoVariance++)
01499       *CoVariance = 0;
01500   }
01501   // Find each sample in the cluster and merge it into the statistics
01502   InitSampleSearch(SearchState, Cluster);
01503   while ((Sample = NextSample (&SearchState)) != NULL) {
01504     for (i = 0; i < N; i++) {
01505       Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
01506       if (ParamDesc[i].Circular) {
01507         if (Distance[i] > ParamDesc[i].HalfRange)
01508           Distance[i] -= ParamDesc[i].Range;
01509         if (Distance[i] < -ParamDesc[i].HalfRange)
01510           Distance[i] += ParamDesc[i].Range;
01511       }
01512       if (Distance[i] < Statistics->Min[i])
01513         Statistics->Min[i] = Distance[i];
01514       if (Distance[i] > Statistics->Max[i])
01515         Statistics->Max[i] = Distance[i];
01516     }
01517     CoVariance = Statistics->CoVariance;
01518     for (i = 0; i < N; i++)
01519       for (j = 0; j < N; j++, CoVariance++)
01520         *CoVariance += Distance[i] * Distance[j];
01521   }
01522   // Normalize the variances by the total number of samples
01523   // use SampleCount-1 instead of SampleCount to get an unbiased estimate
01524   // also compute the geometic mean of the diagonal variances
01525   // ensure that clusters with only 1 sample are handled correctly
01526   if (Cluster->SampleCount > 1)
01527     SampleCountAdjustedForBias = Cluster->SampleCount - 1;
01528   else
01529     SampleCountAdjustedForBias = 1;
01530   CoVariance = Statistics->CoVariance;
01531   for (i = 0; i < N; i++)
01532   for (j = 0; j < N; j++, CoVariance++) {
01533     *CoVariance /= SampleCountAdjustedForBias;
01534     if (j == i) {
01535       if (*CoVariance < MINVARIANCE)
01536         *CoVariance = MINVARIANCE;
01537       Statistics->AvgVariance *= *CoVariance;
01538     }
01539   }
01540   Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
01541                                        1.0 / N);
01542 
01543   // release temporary memory and return
01544   memfree(Distance);
01545   return (Statistics);
01546 }                                // ComputeStatistics
01547 
01548 
01564 PROTOTYPE *NewSphericalProto(UINT16 N,
01565                              CLUSTER *Cluster,
01566                              STATISTICS *Statistics) {
01567   PROTOTYPE *Proto;
01568 
01569   Proto = NewSimpleProto (N, Cluster);
01570 
01571   Proto->Variance.Spherical = Statistics->AvgVariance;
01572   if (Proto->Variance.Spherical < MINVARIANCE)
01573     Proto->Variance.Spherical = MINVARIANCE;
01574 
01575   Proto->Magnitude.Spherical =
01576     1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
01577   Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
01578                                      (double) N);
01579   Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
01580   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01581 
01582   return (Proto);
01583 }                                // NewSphericalProto
01584 
01585 
01600 PROTOTYPE *NewEllipticalProto(INT16 N,
01601                               CLUSTER *Cluster,
01602                               STATISTICS *Statistics) {
01603   PROTOTYPE *Proto;
01604   FLOAT32 *CoVariance;
01605   int i;
01606 
01607   Proto = NewSimpleProto (N, Cluster);
01608   Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01609   Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01610   Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01611 
01612   CoVariance = Statistics->CoVariance;
01613   Proto->TotalMagnitude = 1.0;
01614   for (i = 0; i < N; i++, CoVariance += N + 1) {
01615     Proto->Variance.Elliptical[i] = *CoVariance;
01616     if (Proto->Variance.Elliptical[i] < MINVARIANCE)
01617       Proto->Variance.Elliptical[i] = MINVARIANCE;
01618 
01619     Proto->Magnitude.Elliptical[i] =
01620       1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
01621     Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
01622     Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
01623   }
01624   Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
01625   Proto->Style = elliptical;
01626   return (Proto);
01627 }                                // NewEllipticalProto
01628 
01629 
01647 PROTOTYPE *NewMixedProto(INT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
01648   PROTOTYPE *Proto;
01649   int i;
01650 
01651   Proto = NewEllipticalProto (N, Cluster, Statistics);
01652   Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
01653 
01654   for (i = 0; i < N; i++) {
01655     Proto->Distrib[i] = normal;
01656   }
01657   Proto->Style = mixed;
01658   return (Proto);
01659 }                                // NewMixedProto
01660 
01661 
01673 PROTOTYPE *NewSimpleProto(INT16 N, CLUSTER *Cluster) {
01674   PROTOTYPE *Proto;
01675   int i;
01676 
01677   Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
01678   Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
01679 
01680   for (i = 0; i < N; i++)
01681     Proto->Mean[i] = Cluster->Mean[i];
01682   Proto->Distrib = NULL;
01683 
01684   Proto->Significant = TRUE;
01685   Proto->Style = spherical;
01686   Proto->NumSamples = Cluster->SampleCount;
01687   Proto->Cluster = Cluster;
01688   Proto->Cluster->Prototype = TRUE;
01689   return (Proto);
01690 }                                // NewSimpleProto
01691 
01692 
01714 BOOL8
01715 Independent (PARAM_DESC ParamDesc[],
01716 INT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
01717   int i, j;
01718   FLOAT32 *VARii;                // points to ith on-diagonal element
01719   FLOAT32 *VARjj;                // points to jth on-diagonal element
01720   FLOAT32 CorrelationCoeff;
01721 
01722   VARii = CoVariance;
01723   for (i = 0; i < N; i++, VARii += N + 1) {
01724     if (ParamDesc[i].NonEssential)
01725       continue;
01726 
01727     VARjj = VARii + N + 1;
01728     CoVariance = VARii + 1;
01729     for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
01730       if (ParamDesc[j].NonEssential)
01731         continue;
01732 
01733       if ((*VARii == 0.0) || (*VARjj == 0.0))
01734         CorrelationCoeff = 0.0;
01735       else
01736         CorrelationCoeff =
01737           sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
01738       if (CorrelationCoeff > Independence)
01739         return (FALSE);
01740     }
01741   }
01742   return (TRUE);
01743 }                                // Independent
01744 
01745 
01765 BUCKETS *GetBuckets(DISTRIBUTION Distribution,
01766                     UINT32 SampleCount,
01767                     FLOAT64 Confidence) {
01768   UINT16 NumberOfBuckets;
01769   BUCKETS *Buckets;
01770 
01771   // Search for an old bucket structure with the same number of buckets
01772   NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
01773   Buckets = (BUCKETS *) first (search (OldBuckets[(int) Distribution],
01774     &NumberOfBuckets, NumBucketsMatch));
01775 
01776   // if a matching bucket structure is found, delete it from the list
01777   if (Buckets != NULL) {
01778     OldBuckets[(int) Distribution] =
01779       delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch);
01780     if (SampleCount != Buckets->SampleCount)
01781       AdjustBuckets(Buckets, SampleCount);
01782     if (Confidence != Buckets->Confidence) {
01783       Buckets->Confidence = Confidence;
01784       Buckets->ChiSquared = ComputeChiSquared
01785         (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets),
01786         Confidence);
01787     }
01788     InitBuckets(Buckets);
01789   }
01790   else                           // otherwise create a new structure
01791     Buckets = MakeBuckets (Distribution, SampleCount, Confidence);
01792   return (Buckets);
01793 }                                // GetBuckets
01794 
01795 
01817 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
01818                      UINT32 SampleCount,
01819                      FLOAT64 Confidence) {
01820   static DENSITYFUNC DensityFunction[] =
01821     { NormalDensity, UniformDensity, UniformDensity };
01822   int i, j;
01823   BUCKETS *Buckets;
01824   FLOAT64 BucketProbability;
01825   FLOAT64 NextBucketBoundary;
01826   FLOAT64 Probability;
01827   FLOAT64 ProbabilityDelta;
01828   FLOAT64 LastProbDensity;
01829   FLOAT64 ProbDensity;
01830   UINT16 CurrentBucket;
01831   BOOL8 Symmetrical;
01832 
01833   // allocate memory needed for data structure
01834   Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS));
01835   Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount);
01836   Buckets->SampleCount = SampleCount;
01837   Buckets->Confidence = Confidence;
01838   Buckets->Count =
01839     (UINT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (UINT32));
01840   Buckets->ExpectedCount =
01841     (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32));
01842 
01843   // initialize simple fields
01844   Buckets->Distribution = Distribution;
01845   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
01846     Buckets->Count[i] = 0;
01847     Buckets->ExpectedCount[i] = 0.0;
01848   }
01849 
01850   // all currently defined distributions are symmetrical
01851   Symmetrical = TRUE;
01852   Buckets->ChiSquared = ComputeChiSquared
01853     (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence);
01854 
01855   if (Symmetrical) {
01856     // allocate buckets so that all have approx. equal probability
01857     BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
01858 
01859     // distribution is symmetric so fill in upper half then copy
01860     CurrentBucket = Buckets->NumberOfBuckets / 2;
01861     if (Odd (Buckets->NumberOfBuckets))
01862       NextBucketBoundary = BucketProbability / 2;
01863     else
01864       NextBucketBoundary = BucketProbability;
01865 
01866     Probability = 0.0;
01867     LastProbDensity =
01868       (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
01869     for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
01870       ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
01871       ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
01872       Probability += ProbabilityDelta;
01873       if (Probability > NextBucketBoundary) {
01874         if (CurrentBucket < Buckets->NumberOfBuckets - 1)
01875           CurrentBucket++;
01876         NextBucketBoundary += BucketProbability;
01877       }
01878       Buckets->Bucket[i] = CurrentBucket;
01879       Buckets->ExpectedCount[CurrentBucket] +=
01880         (FLOAT32) (ProbabilityDelta * SampleCount);
01881       LastProbDensity = ProbDensity;
01882     }
01883     // place any leftover probability into the last bucket
01884     Buckets->ExpectedCount[CurrentBucket] +=
01885       (FLOAT32) ((0.5 - Probability) * SampleCount);
01886 
01887     // copy upper half of distribution to lower half
01888     for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
01889       Buckets->Bucket[i] =
01890         Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets);
01891 
01892     // copy upper half of expected counts to lower half
01893     for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
01894       Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
01895   }
01896   return (Buckets);
01897 }                                // MakeBuckets
01898 
01899 
01920 UINT16 OptimumNumberOfBuckets(UINT32 SampleCount) {
01921   UINT8 Last, Next;
01922   FLOAT32 Slope;
01923 
01924   if (SampleCount < CountTable[0])
01925     return (BucketsTable[0]);
01926 
01927   for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
01928     if (SampleCount <= CountTable[Next]) {
01929       Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) /
01930         (FLOAT32) (CountTable[Next] - CountTable[Last]);
01931       return ((UINT16) (BucketsTable[Last] +
01932         Slope * (SampleCount - CountTable[Last])));
01933     }
01934   }
01935   return (BucketsTable[Last]);
01936 }                                // OptimumNumberOfBuckets
01937 
01938 
01960 FLOAT64
01961 ComputeChiSquared (UINT16 DegreesOfFreedom, FLOAT64 Alpha)
01962 #define CHIACCURACY     0.01
01963 #define MINALPHA  (1e-200)
01964 {
01965   static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
01966 
01967   CHISTRUCT *OldChiSquared;
01968   CHISTRUCT SearchKey;
01969 
01970   /*
01971   limit the minimum alpha that can be used - if alpha is too small
01972   it may not be possible to compute chi-squared.
01973   */
01974   if (Alpha < MINALPHA)
01975     Alpha = MINALPHA;
01976   if (Alpha > 1.0)
01977     Alpha = 1.0;
01978   if (Odd (DegreesOfFreedom))
01979     DegreesOfFreedom++;
01980 
01981   /*
01982   find the list of chi-squared values which have already been computed
01983   for the specified number of degrees of freedom.  Search the list for
01984   the desired chi-squared.
01985   */
01986   SearchKey.Alpha = Alpha;
01987   OldChiSquared = (CHISTRUCT *) first (search (ChiWith[DegreesOfFreedom],
01988     &SearchKey, AlphaMatch));
01989 
01990   if (OldChiSquared == NULL) {
01991     OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
01992     OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
01993       (FLOAT64) DegreesOfFreedom,
01994       (FLOAT64) CHIACCURACY);
01995     ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
01996       OldChiSquared);
01997   }
01998   else {
01999     // further optimization might move OldChiSquared to front of list
02000   }
02001 
02002   return (OldChiSquared->ChiSquared);
02003 
02004 }                                // ComputeChiSquared
02005 
02006 
02025 FLOAT64 NormalDensity(INT32 x) {
02026   FLOAT64 Distance;
02027 
02028   Distance = x - NormalMean;
02029   return (NormalMagnitude *
02030     exp (-0.5 * Distance * Distance / NormalVariance));
02031 }                                // NormalDensity
02032 
02033 
02046 FLOAT64 UniformDensity(INT32 x) {
02047   static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
02048 
02049   if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
02050     return (UniformDistributionDensity);
02051   else
02052     return ((FLOAT64) 0.0);
02053 }                                // UniformDensity
02054 
02055 
02067 FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) {
02068   return ((f1 + f2) * Dx / 2.0);
02069 }                                // Integral
02070 
02071 
02097 void FillBuckets(BUCKETS *Buckets,
02098                  CLUSTER *Cluster,
02099                  UINT16 Dim,
02100                  PARAM_DESC *ParamDesc,
02101                  FLOAT32 Mean,
02102                  FLOAT32 StdDev) {
02103   UINT16 BucketID;
02104   int i;
02105   LIST SearchState;
02106   SAMPLE *Sample;
02107 
02108   // initialize the histogram bucket counts to 0
02109   for (i = 0; i < Buckets->NumberOfBuckets; i++)
02110     Buckets->Count[i] = 0;
02111 
02112   if (StdDev == 0.0) {
02123     InitSampleSearch(SearchState, Cluster);
02124     i = 0;
02125     while ((Sample = NextSample (&SearchState)) != NULL) {
02126       if (Sample->Mean[Dim] > Mean)
02127         BucketID = Buckets->NumberOfBuckets - 1;
02128       else if (Sample->Mean[Dim] < Mean)
02129         BucketID = 0;
02130       else
02131         BucketID = i;
02132       Buckets->Count[BucketID] += 1;
02133       i++;
02134       if (i >= Buckets->NumberOfBuckets)
02135         i = 0;
02136     }
02137   }
02138   else {
02139     // search for all samples in the cluster and add to histogram buckets
02140     InitSampleSearch(SearchState, Cluster);
02141     while ((Sample = NextSample (&SearchState)) != NULL) {
02142       switch (Buckets->Distribution) {
02143         case normal:
02144           BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
02145             Mean, StdDev);
02146           break;
02147         case D_random:
02148         case uniform:
02149           BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
02150             Mean, StdDev);
02151           break;
02152         default:
02153           BucketID = 0;
02154       }
02155       Buckets->Count[Buckets->Bucket[BucketID]] += 1;
02156     }
02157   }
02158 }                                // FillBuckets
02159 
02160 
02180 UINT16 NormalBucket(PARAM_DESC *ParamDesc,
02181                     FLOAT32 x,
02182                     FLOAT32 Mean,
02183                     FLOAT32 StdDev) {
02184   FLOAT32 X;
02185 
02186   // wraparound circular parameters if necessary
02187   if (ParamDesc->Circular) {
02188     if (x - Mean > ParamDesc->HalfRange)
02189       x -= ParamDesc->Range;
02190     else if (x - Mean < -ParamDesc->HalfRange)
02191       x += ParamDesc->Range;
02192   }
02193 
02194   X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean;
02195   if (X < 0)
02196     return ((UINT16) 0);
02197   if (X > BUCKETTABLESIZE - 1)
02198     return ((UINT16) (BUCKETTABLESIZE - 1));
02199   return ((UINT16) floor ((FLOAT64) X));
02200 }                                // NormalBucket
02201 
02202 
02220 UINT16 UniformBucket(PARAM_DESC *ParamDesc,
02221                      FLOAT32 x,
02222                      FLOAT32 Mean,
02223                      FLOAT32 StdDev) {
02224   FLOAT32 X;
02225 
02226   // wraparound circular parameters if necessary
02227   if (ParamDesc->Circular) {
02228     if (x - Mean > ParamDesc->HalfRange)
02229       x -= ParamDesc->Range;
02230     else if (x - Mean < -ParamDesc->HalfRange)
02231       x += ParamDesc->Range;
02232   }
02233 
02234   X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
02235   if (X < 0)
02236     return ((UINT16) 0);
02237   if (X > BUCKETTABLESIZE - 1)
02238     return ((UINT16) (BUCKETTABLESIZE - 1));
02239   return ((UINT16) floor ((FLOAT64) X));
02240 }                                // UniformBucket
02241 
02242 
02258 BOOL8 DistributionOK(BUCKETS *Buckets) {
02259   FLOAT32 FrequencyDifference;
02260   FLOAT32 TotalDifference;
02261   int i;
02262 
02263   // Compute how well the histogram matches the expected histogram
02264   TotalDifference = 0.0;
02265   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02266     FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
02267     TotalDifference += (FrequencyDifference * FrequencyDifference) /
02268       Buckets->ExpectedCount[i];
02269   }
02270 
02271   // Test to see if the difference is more than expected
02272   if (TotalDifference > Buckets->ChiSquared)
02273     return (FALSE);
02274   else
02275     return (TRUE);
02276 }                                // DistributionOK
02277 
02278 
02288 void FreeStatistics(STATISTICS *Statistics) {
02289   memfree (Statistics->CoVariance);
02290   memfree (Statistics->Min);
02291   memfree (Statistics->Max);
02292   memfree(Statistics);
02293 }                                // FreeStatistics
02294 
02295 
02308 void FreeBuckets(BUCKETS *Buckets) {
02309   int Dist;
02310 
02311   if (Buckets != NULL) {
02312     Dist = (int) Buckets->Distribution;
02313     OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets);
02314   }
02315 
02316 }                                // FreeBuckets
02317 
02318 
02330 void FreeCluster(CLUSTER *Cluster) {
02331   if (Cluster != NULL) {
02332     FreeCluster (Cluster->Left);
02333     FreeCluster (Cluster->Right);
02334     memfree(Cluster);
02335   }
02336 }                                // FreeCluster
02337 
02338 
02356 UINT16 DegreesOfFreedom(DISTRIBUTION Distribution, UINT16 HistogramBuckets) {
02357   static UINT8 DegreeOffsets[] = { 3, 3, 1 };
02358 
02359   UINT16 AdjustedNumBuckets;
02360 
02361   AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
02362   if (Odd (AdjustedNumBuckets))
02363     AdjustedNumBuckets++;
02364   return (AdjustedNumBuckets);
02365 
02366 }            // DegreesOfFreedom
02367 
02368 
02381 int NumBucketsMatch(void *arg1,    // BUCKETS   *Histogram,
02382                     void *arg2) {  // UINT16    *DesiredNumberOfBuckets)
02383   BUCKETS *Histogram = (BUCKETS *) arg1;
02384   UINT16 *DesiredNumberOfBuckets = (UINT16 *) arg2;
02385 
02386   return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
02387 
02388 }                                // NumBucketsMatch
02389 
02390 
02401 int ListEntryMatch(void *arg1,    //ListNode
02402                    void *arg2) {  //Key
02403   return (arg1 == arg2);
02404 
02405 }                                // ListEntryMatch
02406 
02407 
02419 void AdjustBuckets(BUCKETS *Buckets, UINT32 NewSampleCount) {
02420   int i;
02421   FLOAT64 AdjustFactor;
02422 
02423   AdjustFactor = (((FLOAT64) NewSampleCount) /
02424     ((FLOAT64) Buckets->SampleCount));
02425 
02426   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02427     Buckets->ExpectedCount[i] *= AdjustFactor;
02428   }
02429 
02430   Buckets->SampleCount = NewSampleCount;
02431 
02432 }                                // AdjustBuckets
02433 
02434 
02443 void InitBuckets(BUCKETS *Buckets) {
02444   int i;
02445 
02446   for (i = 0; i < Buckets->NumberOfBuckets; i++) {
02447     Buckets->Count[i] = 0;
02448   }
02449 
02450 }                                // InitBuckets
02451 
02452 
02466 int AlphaMatch(void *arg1,
02467                void *arg2) {
02468   CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
02469   CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
02470 
02471   return (ChiStruct->Alpha == SearchKey->Alpha);
02472 
02473 }                                // AlphaMatch
02474 
02475 
02487 CHISTRUCT *NewChiStruct(UINT16 DegreesOfFreedom, FLOAT64 Alpha) {
02488   CHISTRUCT *NewChiStruct;
02489 
02490   NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
02491   NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
02492   NewChiStruct->Alpha = Alpha;
02493   return (NewChiStruct);
02494 
02495 }                                // NewChiStruct
02496 
02497 
02516 FLOAT64 Solve (SOLVEFUNC Function,
02517 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
02518 #define INITIALDELTA    0.1
02519 #define  DELTARATIO     0.1
02520 {
02521   FLOAT64 x;
02522   FLOAT64 f;
02523   FLOAT64 Slope;
02524   FLOAT64 Delta;
02525   FLOAT64 NewDelta;
02526   FLOAT64 xDelta;
02527   FLOAT64 LastPosX, LastNegX;
02528 
02529   x = InitialGuess;
02530   Delta = INITIALDELTA;
02531   LastPosX = MAX_FLOAT32;
02532   LastNegX = -MAX_FLOAT32;
02533   f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02534   while (Abs (LastPosX - LastNegX) > Accuracy) {
02535     // keep track of outer bounds of current estimate
02536     if (f < 0)
02537       LastNegX = x;
02538     else
02539       LastPosX = x;
02540 
02541     // compute the approx. slope of f(x) at the current point
02542     Slope =
02543       ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
02544 
02545     // compute the next solution guess */
02546     xDelta = f / Slope;
02547     x -= xDelta;
02548 
02549     // reduce the delta used for computing slope to be a fraction of
02550     //the amount moved to get to the new guess
02551     NewDelta = Abs (xDelta) * DELTARATIO;
02552     if (NewDelta < Delta)
02553       Delta = NewDelta;
02554 
02555     // compute the value of the function at the new guess
02556     f = (*Function) ((CHISTRUCT *) FunctionParams, x);
02557   }
02558   return (x);
02559 
02560 }                                // Solve
02561 
02562 
02590 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) {
02591   int i, N;
02592   FLOAT64 SeriesTotal;
02593   FLOAT64 Denominator;
02594   FLOAT64 PowerOfx;
02595 
02596   N = ChiParams->DegreesOfFreedom / 2 - 1;
02597   SeriesTotal = 1;
02598   Denominator = 1;
02599   PowerOfx = 1;
02600   for (i = 1; i <= N; i++) {
02601     Denominator *= 2 * i;
02602     PowerOfx *= x;
02603     SeriesTotal += PowerOfx / Denominator;
02604   }
02605   return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
02606 
02607 }                                // ChiArea
02608 
02609 
02641 BOOL8
02642 MultipleCharSamples (CLUSTERER * Clusterer,
02643 CLUSTER * Cluster, FLOAT32 MaxIllegal)
02644 #define ILLEGAL_CHAR    2
02645 {
02646   static BOOL8 *CharFlags = NULL;
02647   static INT32 NumFlags = 0;
02648   int i;
02649   LIST SearchState;
02650   SAMPLE *Sample;
02651   INT32 CharID;
02652   INT32 NumCharInCluster;
02653   INT32 NumIllegalInCluster;
02654   FLOAT32 PercentIllegal;
02655 
02656   // initial estimate assumes that no illegal chars exist in the cluster
02657   NumCharInCluster = Cluster->SampleCount;
02658   NumIllegalInCluster = 0;
02659 
02660   if (Clusterer->NumChar > NumFlags) {
02661     if (CharFlags != NULL)
02662       memfree(CharFlags);
02663     NumFlags = Clusterer->NumChar;
02664     CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
02665   }
02666 
02667   for (i = 0; i < NumFlags; i++)
02668     CharFlags[i] = FALSE;
02669 
02670   // find each sample in the cluster and check if we have seen it before
02671   InitSampleSearch(SearchState, Cluster);
02672   while ((Sample = NextSample (&SearchState)) != NULL) {
02673     CharID = Sample->CharID;
02674     if (CharFlags[CharID] == FALSE) {
02675       CharFlags[CharID] = TRUE;
02676     }
02677     else {
02678       if (CharFlags[CharID] == TRUE) {
02679         NumIllegalInCluster++;
02680         CharFlags[CharID] = ILLEGAL_CHAR;
02681       }
02682       NumCharInCluster--;
02683       PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
02684       if (PercentIllegal > MaxIllegal)
02685         return (TRUE);
02686     }
02687   }
02688   return (FALSE);
02689 
02690 }                                // MultipleCharSamples
02691 
02704 double InvertMatrix(const float* input, int size, float* inv) {
02705   double** U;  // The upper triangular array.
02706   double* Umem;
02707   double** U_inv;  // The inverse of U.
02708   double* U_invmem;
02709   double** L;  // The lower triangular array.
02710   double* Lmem;
02711 
02712   // Allocate memory for the 2D arrays.
02713   ALLOC_2D_ARRAY(size, size, Umem, U, double);
02714   ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double);
02715   ALLOC_2D_ARRAY(size, size, Lmem, L, double);
02716 
02717   // Initialize the working matrices. U starts as input, L as I and U_inv as O.
02718   int row;
02719   int col;
02720   for (row = 0; row < size; row++) {
02721     for (col = 0; col < size; col++) {
02722       U[row][col] = input[row*size + col];
02723       L[row][col] = row == col ? 1.0 : 0.0;
02724       U_inv[row][col] = 0.0;
02725     }
02726   }
02727 
02728   // Compute forward matrix by inversion by LU decomposition of input.
02729   for (col = 0; col < size; ++col) {
02730     // Find best pivot
02731     int best_row = 0;
02732     double best_pivot = -1.0;
02733     for (row = col; row < size; ++row) {
02734       if (Abs(U[row][col]) > best_pivot) {
02735         best_pivot = Abs(U[row][col]);
02736         best_row = row;
02737       }
02738     }
02739     // Exchange pivot rows.
02740     if (best_row != col) {
02741       for (int k = 0; k < size; ++k) {
02742         double tmp = U[best_row][k];
02743         U[best_row][k] = U[col][k];
02744         U[col][k] = tmp;
02745         tmp = L[best_row][k];
02746         L[best_row][k] = L[col][k];
02747         L[col][k] = tmp;
02748       }
02749     }
02750     // Now do the pivot itself.
02751     for (row = col + 1; row < size; ++row) {
02752       double ratio = -U[row][col] / U[col][col];
02753       for (int j = col; j < size; ++j) {
02754         U[row][j] += U[col][j] * ratio;
02755       }
02756       for (int k = 0; k < size; ++k) {
02757         L[row][k] += L[col][k] * ratio;
02758       }
02759     }
02760   }
02761   // Next invert U.
02762   for (col = 0; col < size; ++col) {
02763     U_inv[col][col] = 1.0 / U[col][col];
02764     for (row = col - 1; row >= 0; --row) {
02765       double total = 0.0;
02766       for (int k = col; k > row; --k) {
02767         total += U[row][k] * U_inv[k][col];
02768       }
02769       U_inv[row][col] = -total / U[row][row];
02770     }
02771   }
02772   // Now the answer is U_inv.L.
02773   for (row = 0; row < size; row++) {
02774     for (col = 0; col < size; col++) {
02775       double sum = 0.0;
02776       for (int k = row; k < size; ++k) {
02777         sum += U_inv[row][k] * L[k][col];
02778       }
02779       inv[row*size + col] = sum;
02780     }
02781   }
02782   // Check matrix product.
02783   double error_sum = 0.0;
02784   for (row = 0; row < size; row++) {
02785     for (col = 0; col < size; col++) {
02786       double sum = 0.0;
02787       for (int k = 0; k < size; ++k) {
02788         sum += input[row*size + k] * inv[k *size + col];
02789       }
02790       if (row != col) {
02791         error_sum += Abs(sum);
02792       }
02793     }
02794   }
02795   return error_sum;
02796 }
02797 
02798 

Generated on Wed Feb 28 19:49:09 2007 for Tesseract by  doxygen 1.5.1