#include "oldheap.h"
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>
Go to the source code of this file.
#define BUCKETTABLESIZE 1024 |
The size of the table which maps normalized samples to histogram buckets.
Also define the number of standard deviations in a normal distribution which are considered to be significant.
The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean.
BUCKETTABLESIZE should always be even.
Definition at line 176 of file cluster.cpp.
Referenced by MakeBuckets(), NormalBucket(), UniformBucket(), and UniformDensity().
#define CHIACCURACY 0.01 |
Referenced by ComputeChiSquared().
#define DELTARATIO 0.1 |
Referenced by Solve().
#define FTABLE_X 10 |
#define FTABLE_Y 100 |
#define HOTELLING 1 |
** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License.
Definition at line 27 of file cluster.cpp.
Referenced by MakePrototype().
#define ILLEGAL_CHAR 2 |
Referenced by MultipleCharSamples().
#define INITIALDELTA 0.1 |
Referenced by Solve().
#define LOOKUPTABLESIZE 8 |
#define MAXBUCKETS 39 |
Definition at line 275 of file cluster.cpp.
#define MAXDEGREESOFFREEDOM MAXBUCKETS |
#define MAXDISTANCE MAX_FLOAT32 |
Referenced by FindNearestNeighbor().
#define MAXNEIGHBORS 2 |
Referenced by FindNearestNeighbor().
#define MINALPHA (1e-200) |
Referenced by ComputeChiSquared().
#define MINBUCKETS 5 |
The absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions.
Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.
Definition at line 158 of file cluster.cpp.
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET) |
Definition at line 160 of file cluster.cpp.
#define MINSAMPLESNEEDED 1 |
#define MINSAMPLESPERBUCKET 5 |
Definition at line 159 of file cluster.cpp.
#define MINVARIANCE 0.0001 |
The variance which will be used as a minimum variance for any dimension of any feature.
Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.
Definition at line 146 of file cluster.cpp.
Referenced by ComputeStatistics(), MakeDimUniform(), NewEllipticalProto(), and NewSphericalProto().
#define NORMALEXTENT 3.0 |
Definition at line 177 of file cluster.cpp.
Definition at line 231 of file cluster.cpp.
Referenced by ComputeChiSquared(), DegreesOfFreedom(), and MakeBuckets().
#define SqrtOf2Pi 2.506628275 |
The following variables describe a discrete normal distribution which is used by
Definition at line 256 of file cluster.cpp.
typedef FLOAT64(*) DENSITYFUNC(INT32) |
Definition at line 228 of file cluster.cpp.
Definition at line 229 of file cluster.cpp.
Multiplies each ExpectedCount histogram entry by NewSampleCount/OldSampleCount so that the histogram is now adjusted to the new sample count.
Buckets | histogram data structure to adjust | |
NewSampleCount | new sample count to adjust to |
Definition at line 2419 of file cluster.cpp.
References BUCKETS::ExpectedCount, BUCKETS::NumberOfBuckets, and BUCKETS::SampleCount.
Referenced by GetBuckets().
02419 { 02420 int i; 02421 FLOAT64 AdjustFactor; 02422 02423 AdjustFactor = (((FLOAT64) NewSampleCount) / 02424 ((FLOAT64) Buckets->SampleCount)); 02425 02426 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02427 Buckets->ExpectedCount[i] *= AdjustFactor; 02428 } 02429 02430 Buckets->SampleCount = NewSampleCount; 02431 02432 } // AdjustBuckets
int AlphaMatch | ( | void * | arg1, | |
void * | arg2 | |||
) |
Search a list of structures which hold pre-computed chi-squared values for a chi-squared value whose corresponding alpha field matches the alpha field of SearchKey.
arg1 | Chi-squared struct being tested for a match | |
arg2 | Chi-squared struct that is the search key |
Definition at line 2466 of file cluster.cpp.
References CHISTRUCT::Alpha.
Referenced by ComputeChiSquared().
02467 { 02468 CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1; 02469 CHISTRUCT *SearchKey = (CHISTRUCT *) arg2; 02470 02471 return (ChiStruct->Alpha == SearchKey->Alpha); 02472 02473 } // AlphaMatch
Compute SOME areas under chi density curve.
ChiParams | Contains degrees of freedom and alpha | |
x | Value of chi-squared to evaluate |
This routine is intended to be passed to the Solve() function to find the value of chi-squared which will yield a desired area under the right tail of the chi density curve. The function will only work for even degrees of freedom.
The equations are based on integrating the chi density curve in parts to obtain a series that can be used to compute the area under the curve.
Definition at line 2590 of file cluster.cpp.
References CHISTRUCT::Alpha, CHISTRUCT::DegreesOfFreedom, and N.
Referenced by ComputeChiSquared().
02590 { 02591 int i, N; 02592 FLOAT64 SeriesTotal; 02593 FLOAT64 Denominator; 02594 FLOAT64 PowerOfx; 02595 02596 N = ChiParams->DegreesOfFreedom / 2 - 1; 02597 SeriesTotal = 1; 02598 Denominator = 1; 02599 PowerOfx = 1; 02600 for (i = 1; i <= N; i++) { 02601 Denominator *= 2 * i; 02602 PowerOfx *= x; 02603 SeriesTotal += PowerOfx / Denominator; 02604 } 02605 return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha); 02606 02607 } // ChiArea
LIST ClusterSamples | ( | CLUSTERER * | Clusterer, | |
CLUSTERCONFIG * | Config | |||
) |
If unclustered, put samples of KD tree into cluster tree and compute its prototypes.
Clusterer | data struct containing samples to be clustered | |
Config | parameters which control clustering process |
If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.
In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.
Definition at line 557 of file cluster.cpp.
References ComputePrototypes(), Config, CreateClusterTree(), FreeProtoList(), NULL, and CLUSTERER::Root.
00557 { 00558 //only create cluster tree if samples have never been clustered before 00559 if (Clusterer->Root == NULL) 00560 CreateClusterTree(Clusterer); 00561 00562 //deallocate the old prototype list if one exists 00563 FreeProtoList (&Clusterer->ProtoList); 00564 Clusterer->ProtoList = NIL; 00565 00566 //compute prototypes starting at the root node in the tree 00567 ComputePrototypes(Clusterer, Config); 00568 return (Clusterer->ProtoList); 00569 } // ClusterSamples
Computes the chi-squared value.
DegreesOfFreedom | determines shape of distribution | |
Alpha | probability of right tail |
Definition at line 1961 of file cluster.cpp.
References CHISTRUCT::Alpha, AlphaMatch(), CHIACCURACY, ChiArea(), CHISTRUCT::ChiSquared, first, MAXDEGREESOFFREEDOM, MINALPHA, NewChiStruct(), NULL, Odd, push(), search(), and Solve().
Referenced by GetBuckets(), and MakeBuckets().
01964 { 01965 static LIST ChiWith[MAXDEGREESOFFREEDOM + 1]; 01966 01967 CHISTRUCT *OldChiSquared; 01968 CHISTRUCT SearchKey; 01969 01970 /* 01971 limit the minimum alpha that can be used - if alpha is too small 01972 it may not be possible to compute chi-squared. 01973 */ 01974 if (Alpha < MINALPHA) 01975 Alpha = MINALPHA; 01976 if (Alpha > 1.0) 01977 Alpha = 1.0; 01978 if (Odd (DegreesOfFreedom)) 01979 DegreesOfFreedom++; 01980 01981 /* 01982 find the list of chi-squared values which have already been computed 01983 for the specified number of degrees of freedom. Search the list for 01984 the desired chi-squared. 01985 */ 01986 SearchKey.Alpha = Alpha; 01987 OldChiSquared = (CHISTRUCT *) first (search (ChiWith[DegreesOfFreedom], 01988 &SearchKey, AlphaMatch)); 01989 01990 if (OldChiSquared == NULL) { 01991 OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha); 01992 OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared, 01993 (FLOAT64) DegreesOfFreedom, 01994 (FLOAT64) CHIACCURACY); 01995 ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom], 01996 OldChiSquared); 01997 } 01998 else { 01999 // further optimization might move OldChiSquared to front of list 02000 } 02001 02002 return (OldChiSquared->ChiSquared); 02003 02004 } // ComputeChiSquared
void ComputePrototypes | ( | CLUSTERER * | Clusterer, | |
CLUSTERCONFIG * | Config | |||
) |
Decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.
Clusterer | data structure holding cluster tree | |
Config | parameters used to control prototype generation |
Definition at line 1011 of file cluster.cpp.
References Config, first, sample::Left, MakePrototype(), NIL, NULL, pop(), CLUSTERER::ProtoList, push(), sample::Right, and CLUSTERER::Root.
Referenced by ClusterSamples().
01011 { 01012 LIST ClusterStack = NIL; 01013 CLUSTER *Cluster; 01014 PROTOTYPE *Prototype; 01015 01016 // use a stack to keep track of clusters waiting to be processed 01017 // initially the only cluster on the stack is the root cluster 01018 if (Clusterer->Root != NULL) 01019 ClusterStack = push (NIL, Clusterer->Root); 01020 01021 // loop until we have analyzed all clusters which are potential prototypes 01022 while (ClusterStack != NIL) { 01023 // Remove the next cluster to be analyzed from the stack 01024 // try to make a prototype from the cluster 01025 // if successful, put it on the proto list, else split the cluster 01026 Cluster = (CLUSTER *) first (ClusterStack); 01027 ClusterStack = pop (ClusterStack); 01028 Prototype = MakePrototype (Clusterer, Config, Cluster); 01029 if (Prototype != NULL) { 01030 Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype); 01031 } 01032 else { 01033 ClusterStack = push (ClusterStack, Cluster->Right); 01034 ClusterStack = push (ClusterStack, Cluster->Left); 01035 } 01036 } 01037 } // ComputePrototypes
STATISTICS * ComputeStatistics | ( | INT16 | N, | |
PARAM_DESC | ParamDesc[], | |||
CLUSTER * | Cluster | |||
) |
Searches the cluster tree for all leaf nodes which are samples in the specified cluster.
N | number of dimensions | |
ParamDesc | array of dimension descriptions | |
Cluster | cluster whose stats are to be computed |
Definition at line 1474 of file cluster.cpp.
References STATISTICS::AvgVariance, STATISTICS::CoVariance, Distance, Emalloc(), InitSampleSearch, STATISTICS::Max, sample::Mean, memfree(), STATISTICS::Min, MINVARIANCE, NextSample(), NULL, and sample::SampleCount.
Referenced by MakePrototype().
01474 { 01475 STATISTICS *Statistics; 01476 int i, j; 01477 FLOAT32 *CoVariance; 01478 FLOAT32 *Distance; 01479 LIST SearchState; 01480 SAMPLE *Sample; 01481 UINT32 SampleCountAdjustedForBias; 01482 01483 // allocate memory to hold the statistics results 01484 Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS)); 01485 Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32)); 01486 Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01487 Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01488 01489 // allocate temporary memory to hold the sample to mean distances 01490 Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01491 01492 // initialize the statistics 01493 Statistics->AvgVariance = 1.0; 01494 CoVariance = Statistics->CoVariance; 01495 for (i = 0; i < N; i++) { 01496 Statistics->Min[i] = 0.0; 01497 Statistics->Max[i] = 0.0; 01498 for (j = 0; j < N; j++, CoVariance++) 01499 *CoVariance = 0; 01500 } 01501 // Find each sample in the cluster and merge it into the statistics 01502 InitSampleSearch(SearchState, Cluster); 01503 while ((Sample = NextSample (&SearchState)) != NULL) { 01504 for (i = 0; i < N; i++) { 01505 Distance[i] = Sample->Mean[i] - Cluster->Mean[i]; 01506 if (ParamDesc[i].Circular) { 01507 if (Distance[i] > ParamDesc[i].HalfRange) 01508 Distance[i] -= ParamDesc[i].Range; 01509 if (Distance[i] < -ParamDesc[i].HalfRange) 01510 Distance[i] += ParamDesc[i].Range; 01511 } 01512 if (Distance[i] < Statistics->Min[i]) 01513 Statistics->Min[i] = Distance[i]; 01514 if (Distance[i] > Statistics->Max[i]) 01515 Statistics->Max[i] = Distance[i]; 01516 } 01517 CoVariance = Statistics->CoVariance; 01518 for (i = 0; i < N; i++) 01519 for (j = 0; j < N; j++, CoVariance++) 01520 *CoVariance += Distance[i] * Distance[j]; 01521 } 01522 // Normalize the variances by the total number of samples 01523 // use SampleCount-1 instead of SampleCount to get an unbiased estimate 01524 // also compute the geometic mean of the diagonal variances 01525 // ensure that clusters with only 1 sample are handled correctly 01526 if (Cluster->SampleCount > 1) 01527 SampleCountAdjustedForBias = Cluster->SampleCount - 1; 01528 else 01529 SampleCountAdjustedForBias = 1; 01530 CoVariance = Statistics->CoVariance; 01531 for (i = 0; i < N; i++) 01532 for (j = 0; j < N; j++, CoVariance++) { 01533 *CoVariance /= SampleCountAdjustedForBias; 01534 if (j == i) { 01535 if (*CoVariance < MINVARIANCE) 01536 *CoVariance = MINVARIANCE; 01537 Statistics->AvgVariance *= *CoVariance; 01538 } 01539 } 01540 Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance, 01541 1.0 / N); 01542 01543 // release temporary memory and return 01544 memfree(Distance); 01545 return (Statistics); 01546 } // ComputeStatistics
void CreateClusterTree | ( | CLUSTERER * | Clusterer | ) |
Do bottoms-up clustering of samples in KD-tree of Clusterer structure.
Clusterer | data structure holdings samples to be clustered |
Definition at line 756 of file cluster.cpp.
References TEMPCLUSTER::Cluster, sample::Clustered, CurrentTemp, HEAPENTRY::Data, Emalloc(), EMPTY, FindNearestNeighbor(), FreeHeap, FreeKDTree(), GetTopOfHeap(), Heap, HeapStore(), CLUSTERER::KDTree, KDWalk(), HEAPENTRY::Key, MakeHeap(), MakeNewCluster(), MakePotentialClusters(), memfree(), TEMPCLUSTER::Neighbor, NULL, CLUSTERER::NumberOfSamples, CLUSTERER::Root, RootOf, TempCluster, and Tree.
Referenced by ClusterSamples().
00756 { 00757 HEAPENTRY HeapEntry; 00758 TEMPCLUSTER *PotentialCluster; 00759 00760 // save the kd-tree in a global variable so kd-tree walker can get at it 00761 Tree = Clusterer->KDTree; 00762 00763 // allocate memory to to hold all of the "potential" clusters 00764 TempCluster = (TEMPCLUSTER *) 00765 Emalloc (Clusterer->NumberOfSamples * sizeof (TEMPCLUSTER)); 00766 CurrentTemp = 0; 00767 00768 // note each sample and its nearest neighbor form a "potential" cluster 00769 // save these in a heap with the "best" potential clusters on top 00770 Heap = MakeHeap (Clusterer->NumberOfSamples); 00771 KDWalk (Tree, (void_proc) MakePotentialClusters); 00772 00773 // form potential clusters into actual clusters - always do "best" first 00774 while (GetTopOfHeap (Heap, &HeapEntry) != EMPTY) { 00775 PotentialCluster = (TEMPCLUSTER *) (HeapEntry.Data); 00776 00777 // if main cluster of potential cluster is already in another cluster 00778 // then we don't need to worry about it 00779 if (PotentialCluster->Cluster->Clustered) { 00780 continue; 00781 } 00782 00783 // if main cluster is not yet clustered, but its nearest neighbor is 00784 // then we must find a new nearest neighbor 00785 else if (PotentialCluster->Neighbor->Clustered) { 00786 PotentialCluster->Neighbor = 00787 FindNearestNeighbor (Tree, PotentialCluster->Cluster, 00788 &(HeapEntry.Key)); 00789 if (PotentialCluster->Neighbor != NULL) { 00790 HeapStore(Heap, &HeapEntry); 00791 } 00792 } 00793 00794 // if neither cluster is already clustered, form permanent cluster 00795 else { 00796 PotentialCluster->Cluster = 00797 MakeNewCluster(Clusterer, PotentialCluster); 00798 PotentialCluster->Neighbor = 00799 FindNearestNeighbor (Tree, PotentialCluster->Cluster, 00800 &(HeapEntry.Key)); 00801 if (PotentialCluster->Neighbor != NULL) { 00802 HeapStore(Heap, &HeapEntry); 00803 } 00804 } 00805 } 00806 00807 // the root node in the cluster tree is now the only node in the kd-tree 00808 Clusterer->Root = (CLUSTER *) RootOf (Clusterer->KDTree); 00809 00810 // free up the memory used by the K-D tree, heap, and temp clusters 00811 FreeKDTree(Tree); 00812 Clusterer->KDTree = NULL; 00813 FreeHeap(Heap); 00814 memfree(TempCluster); 00815 } // CreateClusterTree
UINT16 DegreesOfFreedom | ( | DISTRIBUTION | Distribution, | |
UINT16 | HistogramBuckets | |||
) |
Compute DOF for chi-squared test.
Distribution | distribution being tested for | |
HistogramBuckets | number of buckets in chi-square test |
Definition at line 2356 of file cluster.cpp.
References Odd.
Referenced by GetBuckets(), and MakeBuckets().
02356 { 02357 static UINT8 DegreeOffsets[] = { 3, 3, 1 }; 02358 02359 UINT16 AdjustedNumBuckets; 02360 02361 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution]; 02362 if (Odd (AdjustedNumBuckets)) 02363 AdjustedNumBuckets++; 02364 return (AdjustedNumBuckets); 02365 02366 } // DegreesOfFreedom
Chi-square test of histogram data.
Buckets | Histogram data to perform chi-square test on |
Definition at line 2258 of file cluster.cpp.
References BUCKETS::ChiSquared, BUCKETS::Count, BUCKETS::ExpectedCount, FALSE, BUCKETS::NumberOfBuckets, and TRUE.
Referenced by MakeEllipticalProto(), MakeMixedProto(), and MakeSphericalProto().
02258 { 02259 FLOAT32 FrequencyDifference; 02260 FLOAT32 TotalDifference; 02261 int i; 02262 02263 // Compute how well the histogram matches the expected histogram 02264 TotalDifference = 0.0; 02265 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02266 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i]; 02267 TotalDifference += (FrequencyDifference * FrequencyDifference) / 02268 Buckets->ExpectedCount[i]; 02269 } 02270 02271 // Test to see if the difference is more than expected 02272 if (TotalDifference > Buckets->ChiSquared) 02273 return (FALSE); 02274 else 02275 return (TRUE); 02276 } // DistributionOK
void FillBuckets | ( | BUCKETS * | Buckets, | |
CLUSTER * | Cluster, | |||
UINT16 | Dim, | |||
PARAM_DESC * | ParamDesc, | |||
FLOAT32 | Mean, | |||
FLOAT32 | StdDev | |||
) |
Counts cluster samples which within various histogram buckets.
Buckets | histogram buckets to count samples | |
Cluster | cluster whose samples are being analyzed | |
Dim | dimension of samples which is being analyzed | |
ParamDesc | description of the dimension | |
Mean | "mean" of the distribution | |
StdDev | "standard deviation" of the distribution |
If the standard deviation is zero, then we can't statistically analyze the cluster.
Use a pseudo-analysis: samples exactly on the mean are distributed evenly across all buckets. Samples greater than the mean are placed in the last bucket; samples less than the mean are placed in the first bucket.
Definition at line 2097 of file cluster.cpp.
References BUCKETS::Bucket, BUCKETS::Count, D_random, BUCKETS::Distribution, InitSampleSearch, sample::Mean, NextSample(), normal, NormalBucket(), NULL, BUCKETS::NumberOfBuckets, uniform, and UniformBucket().
Referenced by MakeEllipticalProto(), MakeMixedProto(), and MakeSphericalProto().
02102 { 02103 UINT16 BucketID; 02104 int i; 02105 LIST SearchState; 02106 SAMPLE *Sample; 02107 02108 // initialize the histogram bucket counts to 0 02109 for (i = 0; i < Buckets->NumberOfBuckets; i++) 02110 Buckets->Count[i] = 0; 02111 02112 if (StdDev == 0.0) { 02123 InitSampleSearch(SearchState, Cluster); 02124 i = 0; 02125 while ((Sample = NextSample (&SearchState)) != NULL) { 02126 if (Sample->Mean[Dim] > Mean) 02127 BucketID = Buckets->NumberOfBuckets - 1; 02128 else if (Sample->Mean[Dim] < Mean) 02129 BucketID = 0; 02130 else 02131 BucketID = i; 02132 Buckets->Count[BucketID] += 1; 02133 i++; 02134 if (i >= Buckets->NumberOfBuckets) 02135 i = 0; 02136 } 02137 } 02138 else { 02139 // search for all samples in the cluster and add to histogram buckets 02140 InitSampleSearch(SearchState, Cluster); 02141 while ((Sample = NextSample (&SearchState)) != NULL) { 02142 switch (Buckets->Distribution) { 02143 case normal: 02144 BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim], 02145 Mean, StdDev); 02146 break; 02147 case D_random: 02148 case uniform: 02149 BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim], 02150 Mean, StdDev); 02151 break; 02152 default: 02153 BucketID = 0; 02154 } 02155 Buckets->Count[Buckets->Bucket[BucketID]] += 1; 02156 } 02157 } 02158 } // FillBuckets
Find nearest neighbor of specified cluster in KD-tree.
Tree | kd-tree to search in for nearest neighbor | |
Cluster | cluster whose nearest neighbor is to be found | |
Distance | ptr to variable to report distance found |
Definition at line 875 of file cluster.cpp.
References KDNearestNeighborSearch(), MAXDISTANCE, MAXNEIGHBORS, sample::Mean, Neighbor, NULL, NumberOfNeighbors, and Tree.
Referenced by CreateClusterTree(), and MakePotentialClusters().
00878 { 00879 CLUSTER *Neighbor[MAXNEIGHBORS]; 00880 FLOAT32 Dist[MAXNEIGHBORS]; 00881 INT32 NumberOfNeighbors; 00882 INT32 i; 00883 CLUSTER *BestNeighbor; 00884 00885 // find the 2 nearest neighbors of the cluster 00886 NumberOfNeighbors = KDNearestNeighborSearch 00887 (Tree, Cluster->Mean, MAXNEIGHBORS, MAXDISTANCE, Neighbor, Dist); 00888 00889 // search for the nearest neighbor that is not the cluster itself 00890 *Distance = MAXDISTANCE; 00891 BestNeighbor = NULL; 00892 for (i = 0; i < NumberOfNeighbors; i++) { 00893 if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) { 00894 *Distance = Dist[i]; 00895 BestNeighbor = Neighbor[i]; 00896 } 00897 } 00898 return (BestNeighbor); 00899 } // FindNearestNeighbor
void FreeBuckets | ( | BUCKETS * | Buckets | ) |
Places the specified histogram data structure at the front of a list of histograms so that it can be reused later if necessary.
Buckets | pointer to data structure to be freed |
Definition at line 2308 of file cluster.cpp.
References BUCKETS::Distribution, NULL, OldBuckets, and push().
Referenced by MakeMixedProto(), and MakePrototype().
02308 { 02309 int Dist; 02310 02311 if (Buckets != NULL) { 02312 Dist = (int) Buckets->Distribution; 02313 OldBuckets[Dist] = (LIST) push (OldBuckets[Dist], Buckets); 02314 } 02315 02316 } // FreeBuckets
void FreeCluster | ( | CLUSTER * | Cluster | ) |
Frees the memory consumed by the specified cluster and all of its subclusters.
Cluster | Pointer to cluster to be freed |
Definition at line 2330 of file cluster.cpp.
References sample::Left, memfree(), NULL, and sample::Right.
Referenced by FreeClusterer().
02330 { 02331 if (Cluster != NULL) { 02332 FreeCluster (Cluster->Left); 02333 FreeCluster (Cluster->Right); 02334 memfree(Cluster); 02335 } 02336 } // FreeCluster
void FreeClusterer | ( | CLUSTERER * | Clusterer | ) |
Free memory for specified structure without affecting prototype list.
Clusterer | pointer to data structure to be freed |
Definition at line 588 of file cluster.cpp.
References first, FreeCluster(), FreeKDTree(), iterate, CLUSTERER::KDTree, memfree(), NULL, CLUSTERER::ParamDesc, CLUSTERER::ProtoList, and CLUSTERER::Root.
00588 { 00589 if (Clusterer != NULL) { 00590 memfree (Clusterer->ParamDesc); 00591 if (Clusterer->KDTree != NULL) 00592 FreeKDTree (Clusterer->KDTree); 00593 if (Clusterer->Root != NULL) 00594 FreeCluster (Clusterer->Root); 00595 iterate (Clusterer->ProtoList) { 00596 ((PROTOTYPE *) (first (Clusterer->ProtoList)))->Cluster = NULL; 00597 } 00598 memfree(Clusterer); 00599 } 00600 } // FreeClusterer
void FreeProtoList | ( | LIST * | ProtoList | ) |
Frees all of the memory allocated to the specified list of prototypes.
ProtoList | pointer to list of prototypes to be freed |
Definition at line 614 of file cluster.cpp.
References destroy_nodes(), and FreePrototype().
Referenced by ClusterSamples(), and FreeNormProtos().
00614 { 00615 destroy_nodes(*ProtoList, FreePrototype); 00616 } // FreeProtoList
void FreePrototype | ( | void * | arg | ) |
Deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype.
arg | Prototype data structure to be deallocated |
Definition at line 631 of file cluster.cpp.
References proto::Cluster, proto::Distrib, FLOATUNION::Elliptical, FALSE, proto::Magnitude, proto::Mean, memfree(), NULL, sample::Prototype, spherical, proto::Style, proto::Variance, and proto::Weight.
Referenced by FreeProtoList(), and MakeMixedProto().
00631 { //PROTOTYPE *Prototype) 00632 PROTOTYPE *Prototype = (PROTOTYPE *) arg; 00633 00634 // unmark the corresponding cluster (if there is one 00635 if (Prototype->Cluster != NULL) 00636 Prototype->Cluster->Prototype = FALSE; 00637 00638 // deallocate the prototype statistics and then the prototype itself 00639 if (Prototype->Distrib != NULL) 00640 memfree (Prototype->Distrib); 00641 if (Prototype->Mean != NULL) 00642 memfree (Prototype->Mean); 00643 if (Prototype->Style != spherical) { 00644 if (Prototype->Variance.Elliptical != NULL) 00645 memfree (Prototype->Variance.Elliptical); 00646 if (Prototype->Magnitude.Elliptical != NULL) 00647 memfree (Prototype->Magnitude.Elliptical); 00648 if (Prototype->Weight.Elliptical != NULL) 00649 memfree (Prototype->Weight.Elliptical); 00650 } 00651 memfree(Prototype); 00652 } // FreePrototype
void FreeStatistics | ( | STATISTICS * | Statistics | ) |
Frees the memory used by the statistics data structure.
Statistics | pointer to data structure to be freed |
Definition at line 2288 of file cluster.cpp.
References STATISTICS::CoVariance, STATISTICS::Max, memfree(), and STATISTICS::Min.
Referenced by MakePrototype().
02288 { 02289 memfree (Statistics->CoVariance); 02290 memfree (Statistics->Min); 02291 memfree (Statistics->Max); 02292 memfree(Statistics); 02293 } // FreeStatistics
BUCKETS * GetBuckets | ( | DISTRIBUTION | Distribution, | |
UINT32 | SampleCount, | |||
FLOAT64 | Confidence | |||
) |
Returns histogram data structure & applies goodness of fit test.
Distribution | type of probability distribution to test for | |
SampleCount | number of samples that are available | |
Confidence | probability of a Type I error |
Definition at line 1765 of file cluster.cpp.
References AdjustBuckets(), BUCKETS::ChiSquared, ComputeChiSquared(), BUCKETS::Confidence, DegreesOfFreedom(), delete_d(), first, InitBuckets(), ListEntryMatch(), MakeBuckets(), NULL, BUCKETS::NumberOfBuckets, NumBucketsMatch(), OldBuckets, OptimumNumberOfBuckets(), BUCKETS::SampleCount, and search().
Referenced by MakeMixedProto(), and MakePrototype().
01767 { 01768 UINT16 NumberOfBuckets; 01769 BUCKETS *Buckets; 01770 01771 // Search for an old bucket structure with the same number of buckets 01772 NumberOfBuckets = OptimumNumberOfBuckets (SampleCount); 01773 Buckets = (BUCKETS *) first (search (OldBuckets[(int) Distribution], 01774 &NumberOfBuckets, NumBucketsMatch)); 01775 01776 // if a matching bucket structure is found, delete it from the list 01777 if (Buckets != NULL) { 01778 OldBuckets[(int) Distribution] = 01779 delete_d (OldBuckets[(int) Distribution], Buckets, ListEntryMatch); 01780 if (SampleCount != Buckets->SampleCount) 01781 AdjustBuckets(Buckets, SampleCount); 01782 if (Confidence != Buckets->Confidence) { 01783 Buckets->Confidence = Confidence; 01784 Buckets->ChiSquared = ComputeChiSquared 01785 (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), 01786 Confidence); 01787 } 01788 InitBuckets(Buckets); 01789 } 01790 else // otherwise create a new structure 01791 Buckets = MakeBuckets (Distribution, SampleCount, Confidence); 01792 return (Buckets); 01793 } // GetBuckets
BOOL8 Independent | ( | PARAM_DESC | ParamDesc[], | |
INT16 | N, | |||
FLOAT32 * | CoVariance, | |||
FLOAT32 | Independence | |||
) |
Returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another.
ParamDesc | descriptions of each feature space dimension | |
N | number of dimensions | |
CoVariance | ptr to a covariance matrix | |
Independence | max off-diagonal correlation coefficient |
Definition at line 1715 of file cluster.cpp.
Referenced by MakePrototype().
01716 { 01717 int i, j; 01718 FLOAT32 *VARii; // points to ith on-diagonal element 01719 FLOAT32 *VARjj; // points to jth on-diagonal element 01720 FLOAT32 CorrelationCoeff; 01721 01722 VARii = CoVariance; 01723 for (i = 0; i < N; i++, VARii += N + 1) { 01724 if (ParamDesc[i].NonEssential) 01725 continue; 01726 01727 VARjj = VARii + N + 1; 01728 CoVariance = VARii + 1; 01729 for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) { 01730 if (ParamDesc[j].NonEssential) 01731 continue; 01732 01733 if ((*VARii == 0.0) || (*VARjj == 0.0)) 01734 CorrelationCoeff = 0.0; 01735 else 01736 CorrelationCoeff = 01737 sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj))); 01738 if (CorrelationCoeff > Independence) 01739 return (FALSE); 01740 } 01741 } 01742 return (TRUE); 01743 } // Independent
void InitBuckets | ( | BUCKETS * | Buckets | ) |
Sets the bucket counts in the specified histogram to zero.
Buckets | histogram data structure to init |
Definition at line 2443 of file cluster.cpp.
References BUCKETS::Count, and BUCKETS::NumberOfBuckets.
Referenced by GetBuckets().
02443 { 02444 int i; 02445 02446 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 02447 Buckets->Count[i] = 0; 02448 } 02449 02450 } // InitBuckets
Computes a trapezoidal approximation to the integral of a function over a small delta in x.
f1 | value of function at x1 | |
f2 | value of function at x2 | |
Dx | x2 - x1 (should always be positive) |
Definition at line 2067 of file cluster.cpp.
Referenced by MakeBuckets().
double InvertMatrix | ( | const float * | input, | |
int | size, | |||
float * | inv | |||
) |
Find inverse of matrix using LU decomposition with partial pivoting.
input | Input matrix | |
size | Matrix order | |
inv | Inverse of input |
Definition at line 2704 of file cluster.cpp.
References Abs, and ALLOC_2D_ARRAY.
Referenced by TestEllipticalProto().
02704 { 02705 double** U; // The upper triangular array. 02706 double* Umem; 02707 double** U_inv; // The inverse of U. 02708 double* U_invmem; 02709 double** L; // The lower triangular array. 02710 double* Lmem; 02711 02712 // Allocate memory for the 2D arrays. 02713 ALLOC_2D_ARRAY(size, size, Umem, U, double); 02714 ALLOC_2D_ARRAY(size, size, U_invmem, U_inv, double); 02715 ALLOC_2D_ARRAY(size, size, Lmem, L, double); 02716 02717 // Initialize the working matrices. U starts as input, L as I and U_inv as O. 02718 int row; 02719 int col; 02720 for (row = 0; row < size; row++) { 02721 for (col = 0; col < size; col++) { 02722 U[row][col] = input[row*size + col]; 02723 L[row][col] = row == col ? 1.0 : 0.0; 02724 U_inv[row][col] = 0.0; 02725 } 02726 } 02727 02728 // Compute forward matrix by inversion by LU decomposition of input. 02729 for (col = 0; col < size; ++col) { 02730 // Find best pivot 02731 int best_row = 0; 02732 double best_pivot = -1.0; 02733 for (row = col; row < size; ++row) { 02734 if (Abs(U[row][col]) > best_pivot) { 02735 best_pivot = Abs(U[row][col]); 02736 best_row = row; 02737 } 02738 } 02739 // Exchange pivot rows. 02740 if (best_row != col) { 02741 for (int k = 0; k < size; ++k) { 02742 double tmp = U[best_row][k]; 02743 U[best_row][k] = U[col][k]; 02744 U[col][k] = tmp; 02745 tmp = L[best_row][k]; 02746 L[best_row][k] = L[col][k]; 02747 L[col][k] = tmp; 02748 } 02749 } 02750 // Now do the pivot itself. 02751 for (row = col + 1; row < size; ++row) { 02752 double ratio = -U[row][col] / U[col][col]; 02753 for (int j = col; j < size; ++j) { 02754 U[row][j] += U[col][j] * ratio; 02755 } 02756 for (int k = 0; k < size; ++k) { 02757 L[row][k] += L[col][k] * ratio; 02758 } 02759 } 02760 } 02761 // Next invert U. 02762 for (col = 0; col < size; ++col) { 02763 U_inv[col][col] = 1.0 / U[col][col]; 02764 for (row = col - 1; row >= 0; --row) { 02765 double total = 0.0; 02766 for (int k = col; k > row; --k) { 02767 total += U[row][k] * U_inv[k][col]; 02768 } 02769 U_inv[row][col] = -total / U[row][row]; 02770 } 02771 } 02772 // Now the answer is U_inv.L. 02773 for (row = 0; row < size; row++) { 02774 for (col = 0; col < size; col++) { 02775 double sum = 0.0; 02776 for (int k = row; k < size; ++k) { 02777 sum += U_inv[row][k] * L[k][col]; 02778 } 02779 inv[row*size + col] = sum; 02780 } 02781 } 02782 // Check matrix product. 02783 double error_sum = 0.0; 02784 for (row = 0; row < size; row++) { 02785 for (col = 0; col < size; col++) { 02786 double sum = 0.0; 02787 for (int k = 0; k < size; ++k) { 02788 sum += input[row*size + k] * inv[k *size + col]; 02789 } 02790 if (row != col) { 02791 error_sum += Abs(sum); 02792 } 02793 } 02794 } 02795 return error_sum; 02796 }
int ListEntryMatch | ( | void * | arg1, | |
void * | arg2 | |||
) |
Searches list for a list node whose contents match Key; called by the list delete_d routine.
arg1 | key to compare | |
arg2 | key to compare |
Definition at line 2401 of file cluster.cpp.
Referenced by GetBuckets().
BUCKETS * MakeBuckets | ( | DISTRIBUTION | Distribution, | |
UINT32 | SampleCount, | |||
FLOAT64 | Confidence | |||
) |
Make histogram data structure.
Distribution | type of probability distribution to test for | |
SampleCount | number of samples that are available | |
Confidence | probability of a Type I error |
Definition at line 1817 of file cluster.cpp.
References BUCKETS::Bucket, BUCKETTABLESIZE, BUCKETS::ChiSquared, ComputeChiSquared(), BUCKETS::Confidence, BUCKETS::Count, DegreesOfFreedom(), BUCKETS::Distribution, Emalloc(), BUCKETS::ExpectedCount, Integral(), Mirror, NormalDensity(), BUCKETS::NumberOfBuckets, Odd, OptimumNumberOfBuckets(), BUCKETS::SampleCount, TRUE, and UniformDensity().
Referenced by GetBuckets().
01819 { 01820 static DENSITYFUNC DensityFunction[] = 01821 { NormalDensity, UniformDensity, UniformDensity }; 01822 int i, j; 01823 BUCKETS *Buckets; 01824 FLOAT64 BucketProbability; 01825 FLOAT64 NextBucketBoundary; 01826 FLOAT64 Probability; 01827 FLOAT64 ProbabilityDelta; 01828 FLOAT64 LastProbDensity; 01829 FLOAT64 ProbDensity; 01830 UINT16 CurrentBucket; 01831 BOOL8 Symmetrical; 01832 01833 // allocate memory needed for data structure 01834 Buckets = (BUCKETS *) Emalloc (sizeof (BUCKETS)); 01835 Buckets->NumberOfBuckets = OptimumNumberOfBuckets (SampleCount); 01836 Buckets->SampleCount = SampleCount; 01837 Buckets->Confidence = Confidence; 01838 Buckets->Count = 01839 (UINT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (UINT32)); 01840 Buckets->ExpectedCount = 01841 (FLOAT32 *) Emalloc (Buckets->NumberOfBuckets * sizeof (FLOAT32)); 01842 01843 // initialize simple fields 01844 Buckets->Distribution = Distribution; 01845 for (i = 0; i < Buckets->NumberOfBuckets; i++) { 01846 Buckets->Count[i] = 0; 01847 Buckets->ExpectedCount[i] = 0.0; 01848 } 01849 01850 // all currently defined distributions are symmetrical 01851 Symmetrical = TRUE; 01852 Buckets->ChiSquared = ComputeChiSquared 01853 (DegreesOfFreedom (Distribution, Buckets->NumberOfBuckets), Confidence); 01854 01855 if (Symmetrical) { 01856 // allocate buckets so that all have approx. equal probability 01857 BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets); 01858 01859 // distribution is symmetric so fill in upper half then copy 01860 CurrentBucket = Buckets->NumberOfBuckets / 2; 01861 if (Odd (Buckets->NumberOfBuckets)) 01862 NextBucketBoundary = BucketProbability / 2; 01863 else 01864 NextBucketBoundary = BucketProbability; 01865 01866 Probability = 0.0; 01867 LastProbDensity = 01868 (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2); 01869 for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) { 01870 ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1); 01871 ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0); 01872 Probability += ProbabilityDelta; 01873 if (Probability > NextBucketBoundary) { 01874 if (CurrentBucket < Buckets->NumberOfBuckets - 1) 01875 CurrentBucket++; 01876 NextBucketBoundary += BucketProbability; 01877 } 01878 Buckets->Bucket[i] = CurrentBucket; 01879 Buckets->ExpectedCount[CurrentBucket] += 01880 (FLOAT32) (ProbabilityDelta * SampleCount); 01881 LastProbDensity = ProbDensity; 01882 } 01883 // place any leftover probability into the last bucket 01884 Buckets->ExpectedCount[CurrentBucket] += 01885 (FLOAT32) ((0.5 - Probability) * SampleCount); 01886 01887 // copy upper half of distribution to lower half 01888 for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) 01889 Buckets->Bucket[i] = 01890 Mirror (Buckets->Bucket[j], Buckets->NumberOfBuckets); 01891 01892 // copy upper half of expected counts to lower half 01893 for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) 01894 Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j]; 01895 } 01896 return (Buckets); 01897 } // MakeBuckets
CLUSTERER* MakeClusterer | ( | INT16 | SampleSize, | |
PARAM_DESC | ParamDesc[] | |||
) |
Creates a new clusterer data structure, initializes it, and returns a pointer to it.
SampleSize | number of dimensions in feature space | |
ParamDesc | description of each dimension |
Definition at line 442 of file cluster.cpp.
References PARAM_DESC::Circular, Emalloc(), PARAM_DESC::HalfRange, CLUSTERER::KDTree, MakeKDTree(), PARAM_DESC::Max, PARAM_DESC::MidRange, PARAM_DESC::Min, PARAM_DESC::NonEssential, NULL, CLUSTERER::NumberOfSamples, CLUSTERER::NumChar, CLUSTERER::ParamDesc, CLUSTERER::ProtoList, PARAM_DESC::Range, CLUSTERER::Root, and CLUSTERER::SampleSize.
Referenced by SetUpForClustering().
00442 { 00443 CLUSTERER *Clusterer; 00444 int i; 00445 00446 // allocate main clusterer data structure and init simple fields 00447 Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER)); 00448 Clusterer->SampleSize = SampleSize; 00449 Clusterer->NumberOfSamples = 0; 00450 Clusterer->NumChar = 0; 00451 00452 // init fields which will not be used initially 00453 Clusterer->Root = NULL; 00454 Clusterer->ProtoList = NIL; 00455 00456 // maintain a copy of param descriptors in the clusterer data structure 00457 Clusterer->ParamDesc = 00458 (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC)); 00459 for (i = 0; i < SampleSize; i++) { 00460 Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular; 00461 Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential; 00462 Clusterer->ParamDesc[i].Min = ParamDesc[i].Min; 00463 Clusterer->ParamDesc[i].Max = ParamDesc[i].Max; 00464 Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; 00465 Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2; 00466 Clusterer->ParamDesc[i].MidRange = 00467 (ParamDesc[i].Max + ParamDesc[i].Min) / 2; 00468 } 00469 00470 // allocate a kd tree to hold the samples 00471 Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc); 00472 00473 // execute hook for monitoring clustering operation 00474 // (*ClustererCreationHook)( Clusterer ); 00475 00476 return (Clusterer); 00477 } // MakeClusterer
PROTOTYPE * MakeDegenerateProto | ( | UINT16 | N, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics, | |||
PROTOSTYLE | Style, | |||
INT32 | MinSamples | |||
) |
Checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way.
N | number of dimensions | |
Cluster | cluster being analyzed | |
Statistics | statistical info about cluster | |
Style | type of prototype to be generated | |
MinSamples | minimum number of samples in a cluster |
Definition at line 1155 of file cluster.cpp.
References automatic, elliptical, FALSE, MINSAMPLESNEEDED, mixed, NewEllipticalProto(), NewMixedProto(), NewSphericalProto(), NULL, sample::SampleCount, proto::Significant, and spherical.
Referenced by MakePrototype().
01160 { 01161 PROTOTYPE *Proto = NULL; 01162 01163 if (MinSamples < MINSAMPLESNEEDED) 01164 MinSamples = MINSAMPLESNEEDED; 01165 01166 if (Cluster->SampleCount < MinSamples) { 01167 switch (Style) { 01168 case spherical: 01169 Proto = NewSphericalProto (N, Cluster, Statistics); 01170 break; 01171 case elliptical: 01172 case automatic: 01173 Proto = NewEllipticalProto (N, Cluster, Statistics); 01174 break; 01175 case mixed: 01176 Proto = NewMixedProto (N, Cluster, Statistics); 01177 break; 01178 } 01179 Proto->Significant = FALSE; 01180 } 01181 return (Proto); 01182 } // MakeDegenerateProto
void MakeDimRandom | ( | UINT16 | i, | |
PROTOTYPE * | Proto, | |||
PARAM_DESC * | ParamDesc | |||
) |
Alters the ith dimension of the specified mixed prototype to be D_random.
i | index of dimension to be changed | |
Proto | prototype whose dimension is to be altered | |
ParamDesc | description of specified dimension |
Definition at line 1410 of file cluster.cpp.
References D_random, proto::Distrib, FLOATUNION::Elliptical, PARAM_DESC::HalfRange, proto::LogMagnitude, proto::Magnitude, proto::Mean, PARAM_DESC::MidRange, PARAM_DESC::Range, proto::TotalMagnitude, and proto::Variance.
Referenced by MakeMixedProto().
01410 { 01411 Proto->Distrib[i] = D_random; 01412 Proto->Mean[i] = ParamDesc->MidRange; 01413 Proto->Variance.Elliptical[i] = ParamDesc->HalfRange; 01414 01415 // subtract out the previous magnitude of this dimension from the total 01416 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01417 Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range; 01418 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01419 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01420 01421 // note that the proto Weight is irrelevant for D_random protos 01422 } // MakeDimRandom
void MakeDimUniform | ( | UINT16 | i, | |
PROTOTYPE * | Proto, | |||
STATISTICS * | Statistics | |||
) |
Alters the ith dimension of the specified mixed prototype to be uniform.
i | index of dimension to be changed | |
Proto | prototype whose dimension is to be altered | |
Statistics | statistical info about prototype |
Definition at line 1436 of file cluster.cpp.
References proto::Cluster, proto::Distrib, FLOATUNION::Elliptical, proto::LogMagnitude, proto::Magnitude, STATISTICS::Max, sample::Mean, proto::Mean, STATISTICS::Min, MINVARIANCE, proto::TotalMagnitude, uniform, and proto::Variance.
Referenced by MakeMixedProto().
01436 { 01437 Proto->Distrib[i] = uniform; 01438 Proto->Mean[i] = Proto->Cluster->Mean[i] + 01439 (Statistics->Min[i] + Statistics->Max[i]) / 2; 01440 Proto->Variance.Elliptical[i] = 01441 (Statistics->Max[i] - Statistics->Min[i]) / 2; 01442 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01443 Proto->Variance.Elliptical[i] = MINVARIANCE; 01444 01445 // subtract out the previous magnitude of this dimension from the total 01446 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i]; 01447 Proto->Magnitude.Elliptical[i] = 01448 1.0 / (2.0 * Proto->Variance.Elliptical[i]); 01449 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01450 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01451 01452 // note that the proto Weight is irrelevant for uniform protos 01453 } // MakeDimUniform
PROTOTYPE * MakeEllipticalProto | ( | CLUSTERER * | Clusterer, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics, | |||
BUCKETS * | Buckets | |||
) |
Tests the specified cluster to see if it can be approximated by an elliptical normal distribution.
Clusterer | data struct containing samples being clustered | |
Cluster | cluster to be made into an elliptical prototype | |
Statistics | statistical info about cluster | |
Buckets | histogram struct used to analyze distribution |
Definition at line 1299 of file cluster.cpp.
References DistributionOK(), FillBuckets(), sample::Mean, NewEllipticalProto(), PARAM_DESC::NonEssential, NULL, CLUSTERER::ParamDesc, and CLUSTERER::SampleSize.
Referenced by MakePrototype().
01302 { 01303 PROTOTYPE *Proto = NULL; 01304 int i; 01305 01306 // check that each dimension is a normal distribution 01307 for (i = 0; i < Clusterer->SampleSize; i++) { 01308 if (Clusterer->ParamDesc[i].NonEssential) 01309 continue; 01310 01311 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01312 Cluster->Mean[i], 01313 sqrt ((FLOAT64) Statistics-> 01314 CoVariance[i * (Clusterer->SampleSize + 1)])); 01315 if (!DistributionOK (Buckets)) 01316 break; 01317 } 01318 // if all dimensions matched a normal distribution, make a proto 01319 if (i >= Clusterer->SampleSize) 01320 Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01321 return (Proto); 01322 } // MakeEllipticalProto
PROTOTYPE * MakeMixedProto | ( | CLUSTERER * | Clusterer, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics, | |||
BUCKETS * | NormalBuckets, | |||
FLOAT64 | Confidence | |||
) |
Tests each dimension of the specified cluster to see what distribution would best approximate that dimension.
Clusterer | data struct containing samples being clustered | |
Cluster | cluster to be made into a prototype | |
Statistics | statistical info about cluster | |
NormalBuckets | histogram struct used to analyze distribution | |
Confidence | confidence level for alternate distributions |
Definition at line 1344 of file cluster.cpp.
References D_random, DistributionOK(), FLOATUNION::Elliptical, FillBuckets(), FreeBuckets(), FreePrototype(), GetBuckets(), MakeDimRandom(), MakeDimUniform(), proto::Mean, NewMixedProto(), PARAM_DESC::NonEssential, NULL, CLUSTERER::ParamDesc, sample::SampleCount, CLUSTERER::SampleSize, uniform, and proto::Variance.
Referenced by MakePrototype().
01348 { 01349 PROTOTYPE *Proto; 01350 int i; 01351 BUCKETS *UniformBuckets = NULL; 01352 BUCKETS *RandomBuckets = NULL; 01353 01354 // create a mixed proto to work on - initially assume all dimensions normal*/ 01355 Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics); 01356 01357 // Find the proper distribution for each dimension 01358 for (i = 0; i < Clusterer->SampleSize; i++) { 01359 if (Clusterer->ParamDesc[i].NonEssential) 01360 continue; 01361 01362 FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01363 Proto->Mean[i], 01364 sqrt ((FLOAT64) Proto->Variance.Elliptical[i])); 01365 if (DistributionOK (NormalBuckets)) 01366 continue; 01367 01368 if (RandomBuckets == NULL) 01369 RandomBuckets = 01370 GetBuckets (D_random, Cluster->SampleCount, Confidence); 01371 MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i])); 01372 FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01373 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01374 if (DistributionOK (RandomBuckets)) 01375 continue; 01376 01377 if (UniformBuckets == NULL) 01378 UniformBuckets = 01379 GetBuckets (uniform, Cluster->SampleCount, Confidence); 01380 MakeDimUniform(i, Proto, Statistics); 01381 FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01382 Proto->Mean[i], Proto->Variance.Elliptical[i]); 01383 if (DistributionOK (UniformBuckets)) 01384 continue; 01385 break; 01386 } 01387 // If any dimension failed to match a distribution, discard the proto 01388 if (i < Clusterer->SampleSize) { 01389 FreePrototype(Proto); 01390 Proto = NULL; 01391 } 01392 if (UniformBuckets != NULL) 01393 FreeBuckets(UniformBuckets); 01394 if (RandomBuckets != NULL) 01395 FreeBuckets(RandomBuckets); 01396 return (Proto); 01397 } // MakeMixedProto
CLUSTER * MakeNewCluster | ( | CLUSTERER * | Clusterer, | |
TEMPCLUSTER * | TempCluster | |||
) |
Creates a new permanent cluster from the clusters specified in TempCluster.
Clusterer | current clustering environment | |
TempCluster | potential cluster to make permanent |
Definition at line 916 of file cluster.cpp.
References sample::CharID, TEMPCLUSTER::Cluster, sample::Clustered, Emalloc(), FALSE, KDDelete(), KDStore(), CLUSTERER::KDTree, sample::Left, sample::Mean, MergeClusters(), TEMPCLUSTER::Neighbor, CLUSTERER::ParamDesc, sample::Prototype, sample::Right, sample::SampleCount, CLUSTERER::SampleSize, TempCluster, and TRUE.
Referenced by CreateClusterTree().
00916 { 00917 CLUSTER *Cluster; 00918 00919 // allocate the new cluster and initialize it 00920 Cluster = (CLUSTER *) Emalloc (sizeof (CLUSTER) + 00921 (Clusterer->SampleSize - 00922 1) * sizeof (FLOAT32)); 00923 Cluster->Clustered = FALSE; 00924 Cluster->Prototype = FALSE; 00925 Cluster->Left = TempCluster->Cluster; 00926 Cluster->Right = TempCluster->Neighbor; 00927 Cluster->CharID = -1; 00928 00929 // mark the old clusters as "clustered" and delete them from the kd-tree 00930 Cluster->Left->Clustered = TRUE; 00931 Cluster->Right->Clustered = TRUE; 00932 KDDelete (Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left); 00933 KDDelete (Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right); 00934 00935 // compute the mean and sample count for the new cluster 00936 Cluster->SampleCount = 00937 MergeClusters (Clusterer->SampleSize, Clusterer->ParamDesc, 00938 Cluster->Left->SampleCount, Cluster->Right->SampleCount, 00939 Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean); 00940 00941 // add the new cluster to the KD tree 00942 KDStore (Clusterer->KDTree, Cluster->Mean, Cluster); 00943 return (Cluster); 00944 } // MakeNewCluster
Create potential cluster for each sample in KD-tree being walked & push on heap.
Cluster | current cluster being visited in kd-tree walk | |
Order | order in which cluster is being visited | |
Level | level of this cluster in the kd-tree |
Globals: TempCluster array of temporary clusters
Globals: CurrentTemp index of next temp cluster to be used
Globals: Heap heap used to hold temp clusters - "best" on top
Definition at line 838 of file cluster.cpp.
References TEMPCLUSTER::Cluster, CurrentTemp, HEAPENTRY::Data, FindNearestNeighbor(), Heap, HeapStore(), HEAPENTRY::Key, leaf, Neighbor, TEMPCLUSTER::Neighbor, NULL, preorder, TempCluster, and Tree.
Referenced by CreateClusterTree().
00838 { 00839 HEAPENTRY HeapEntry; 00840 00841 if ((Order == preorder) || (Order == leaf)) { 00842 TempCluster[CurrentTemp].Cluster = Cluster; 00843 HeapEntry.Data = (char *) &(TempCluster[CurrentTemp]); 00844 TempCluster[CurrentTemp].Neighbor = 00845 FindNearestNeighbor (Tree, TempCluster[CurrentTemp].Cluster, 00846 &(HeapEntry.Key)); 00847 if (TempCluster[CurrentTemp].Neighbor != NULL) { 00848 HeapStore(Heap, &HeapEntry); 00849 CurrentTemp++; 00850 } 00851 } 00852 } // MakePotentialClusters
PROTOTYPE * MakePrototype | ( | CLUSTERER * | Clusterer, | |
CLUSTERCONFIG * | Config, | |||
CLUSTER * | Cluster | |||
) |
Makes prototype from specified cluster conforming to distribution in Config.
Clusterer | data structure holding cluster tree | |
Config | parameters used to control prototype generation | |
Cluster | cluster to be made into a prototype |
If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned.
Definition at line 1060 of file cluster.cpp.
References automatic, ComputeStatistics(), CLUSTERCONFIG::Confidence, Config, STATISTICS::CoVariance, elliptical, FreeBuckets(), FreeStatistics(), GetBuckets(), HOTELLING, CLUSTERCONFIG::Independence, Independent(), MakeDegenerateProto(), MakeEllipticalProto(), MakeMixedProto(), MakeSphericalProto(), CLUSTERCONFIG::MaxIllegal, CLUSTERCONFIG::MinSamples, mixed, MultipleCharSamples(), normal, NULL, CLUSTERCONFIG::ProtoStyle, sample::SampleCount, spherical, and TestEllipticalProto().
Referenced by ComputePrototypes().
01062 { 01063 STATISTICS *Statistics; 01064 PROTOTYPE *Proto; 01065 BUCKETS *Buckets; 01066 01067 // filter out clusters which contain samples from the same character 01068 if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal)) 01069 return (NULL); 01070 01071 // compute the covariance matrix and ranges for the cluster 01072 Statistics = 01073 ComputeStatistics (Clusterer->SampleSize, Clusterer->ParamDesc, Cluster); 01074 01075 // Check for degenerate clusters which need not be analyzed further 01076 // note that the MinSamples test assumes that all clusters with multiple 01077 // character samples have been removed (as above) 01078 Proto = MakeDegenerateProto (Clusterer->SampleSize, Cluster, Statistics, 01079 Config->ProtoStyle, 01080 (INT32) (Config->MinSamples * 01081 Clusterer->NumChar)); 01082 if (Proto != NULL) { 01083 FreeStatistics(Statistics); 01084 return (Proto); 01085 } 01086 // check to ensure that all dimensions are independent 01087 if (!Independent (Clusterer->ParamDesc, Clusterer->SampleSize, 01088 Statistics->CoVariance, Config->Independence)) { 01089 FreeStatistics(Statistics); 01090 return (NULL); 01091 } 01092 01093 if (HOTELLING && Config->ProtoStyle == elliptical) { 01094 Proto = TestEllipticalProto(Clusterer, Cluster, Statistics); 01095 if (Proto != NULL) { 01096 FreeStatistics(Statistics); 01097 return Proto; 01098 } 01099 } 01100 01101 // create a histogram data structure used to evaluate distributions 01102 Buckets = GetBuckets (normal, Cluster->SampleCount, Config->Confidence); 01103 01104 // create a prototype based on the statistics and test it 01105 switch (Config->ProtoStyle) { 01106 case spherical: 01107 Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets); 01108 break; 01109 case elliptical: 01110 Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets); 01111 break; 01112 case mixed: 01113 Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets, 01114 Config->Confidence); 01115 break; 01116 case automatic: 01117 Proto = MakeSphericalProto (Clusterer, Cluster, Statistics, Buckets); 01118 if (Proto != NULL) 01119 break; 01120 Proto = MakeEllipticalProto (Clusterer, Cluster, Statistics, Buckets); 01121 if (Proto != NULL) 01122 break; 01123 Proto = MakeMixedProto (Clusterer, Cluster, Statistics, Buckets, 01124 Config->Confidence); 01125 break; 01126 } 01127 FreeBuckets(Buckets); 01128 FreeStatistics(Statistics); 01129 return (Proto); 01130 } // MakePrototype
Creates new sample data structure for specified feature.
Clusterer | clusterer data structure to add sample to | |
Feature | feature to be added to clusterer | |
CharID | unique ident. of char that sample came from |
Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called
Definition at line 498 of file cluster.cpp.
References ALREADYCLUSTERED, sample::CharID, sample::Clustered, DoError(), Emalloc(), FALSE, KDStore(), CLUSTERER::KDTree, sample::Left, sample::Mean, NULL, CLUSTERER::NumberOfSamples, CLUSTERER::NumChar, sample::Prototype, sample::Right, CLUSTERER::Root, sample::SampleCount, and CLUSTERER::SampleSize.
Referenced by SetUpForClustering().
00498 { 00499 SAMPLE *Sample; 00500 int i; 00501 00502 // see if the samples have already been clustered - if so trap an error 00503 if (Clusterer->Root != NULL) 00504 DoError (ALREADYCLUSTERED, 00505 "Can't add samples after they have been clustered"); 00506 00507 // allocate the new sample and initialize it 00508 Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) + 00509 (Clusterer->SampleSize - 00510 1) * sizeof (FLOAT32)); 00511 Sample->Clustered = FALSE; 00512 Sample->Prototype = FALSE; 00513 Sample->SampleCount = 1; 00514 Sample->Left = NULL; 00515 Sample->Right = NULL; 00516 Sample->CharID = CharID; 00517 00518 for (i = 0; i < Clusterer->SampleSize; i++) 00519 Sample->Mean[i] = Feature[i]; 00520 00521 // add the sample to the KD tree - keep track of the total # of samples 00522 Clusterer->NumberOfSamples++; 00523 KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample); 00524 if (CharID >= Clusterer->NumChar) 00525 Clusterer->NumChar = CharID + 1; 00526 00527 // execute hook for monitoring clustering operation 00528 // (*SampleCreationHook)( Sample ); 00529 00530 return (Sample); 00531 } // MakeSample
PROTOTYPE * MakeSphericalProto | ( | CLUSTERER * | Clusterer, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics, | |||
BUCKETS * | Buckets | |||
) |
Tests the specified cluster to see if it can be approximated by a spherical normal distribution.
Clusterer | data struct containing samples being clustered | |
Cluster | cluster to be made into a spherical prototype | |
Statistics | statistical info about cluster | |
Buckets | histogram struct used to analyze distribution |
Definition at line 1259 of file cluster.cpp.
References STATISTICS::AvgVariance, DistributionOK(), FillBuckets(), sample::Mean, NewSphericalProto(), PARAM_DESC::NonEssential, NULL, CLUSTERER::ParamDesc, and CLUSTERER::SampleSize.
Referenced by MakePrototype().
01262 { 01263 PROTOTYPE *Proto = NULL; 01264 int i; 01265 01266 // check that each dimension is a normal distribution 01267 for (i = 0; i < Clusterer->SampleSize; i++) { 01268 if (Clusterer->ParamDesc[i].NonEssential) 01269 continue; 01270 01271 FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), 01272 Cluster->Mean[i], 01273 sqrt ((FLOAT64) (Statistics->AvgVariance))); 01274 if (!DistributionOK (Buckets)) 01275 break; 01276 } 01277 // if all dimensions matched a normal distribution, make a proto 01278 if (i >= Clusterer->SampleSize) 01279 Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics); 01280 return (Proto); 01281 } // MakeSphericalProto
Computes the mean of the specified prototype in the indicated dimension.
Proto | prototype to return mean of | |
Dimension | dimension whose mean is to be returned |
Definition at line 698 of file cluster.cpp.
References proto::Mean.
Referenced by PrintNormMatch(), and UniformCertainties().
00698 { 00699 return (Proto->Mean[Dimension]); 00700 } // Mean
INT32 MergeClusters | ( | INT16 | N, | |
register PARAM_DESC | ParamDesc[], | |||
register INT32 | n1, | |||
register INT32 | n2, | |||
register FLOAT32 | m[], | |||
register FLOAT32 | m1[], | |||
register FLOAT32 | m2[] | |||
) |
Merges two clusters into one larger cluster.
N | Number of dimensions (size of arrays) | |
ParamDesc | array of dimension descriptions | |
n1,n2 | number of samples in each old cluster | |
m | array to hold mean of new cluster | |
m1,m2 | arrays containing means of old clusters |
Definition at line 964 of file cluster.cpp.
Referenced by MakeNewCluster().
00969 { 00970 register INT32 i, n; 00971 00972 n = n1 + n2; 00973 for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) { 00974 if (ParamDesc->Circular) { 00975 /* 00976 If distance between means is greater than allowed 00977 reduce upper point by one "rotation" to compute mean 00978 then normalize the mean back into the accepted range 00979 */ 00980 if ((*m2 - *m1) > ParamDesc->HalfRange) { 00981 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n; 00982 if (*m < ParamDesc->Min) 00983 *m += ParamDesc->Range; 00984 } 00985 else if ((*m1 - *m2) > ParamDesc->HalfRange) { 00986 *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n; 00987 if (*m < ParamDesc->Min) 00988 *m += ParamDesc->Range; 00989 } 00990 else 00991 *m = (n1 * *m1 + n2 * *m2) / n; 00992 } 00993 else 00994 *m = (n1 * *m1 + n2 * *m2) / n; 00995 } 00996 return (n); 00997 } // MergeClusters
Do running estimate of % of chars with more than 1 sample in cluster.
Clusterer | data structure holding cluster tree | |
Cluster | cluster containing samples to be tested | |
MaxIllegal | max percentage of samples allowed to have more than 1 feature in the cluster |
The main function of this routine is to help identify clusters which need to be split further, i.e. if numerous training characters have 2 or more features which are contained in the same cluster, then the cluster should be split.
Definition at line 2642 of file cluster.cpp.
References sample::CharID, Emalloc(), FALSE, ILLEGAL_CHAR, InitSampleSearch, memfree(), NextSample(), NULL, CLUSTERER::NumChar, sample::SampleCount, and TRUE.
Referenced by MakePrototype().
02645 { 02646 static BOOL8 *CharFlags = NULL; 02647 static INT32 NumFlags = 0; 02648 int i; 02649 LIST SearchState; 02650 SAMPLE *Sample; 02651 INT32 CharID; 02652 INT32 NumCharInCluster; 02653 INT32 NumIllegalInCluster; 02654 FLOAT32 PercentIllegal; 02655 02656 // initial estimate assumes that no illegal chars exist in the cluster 02657 NumCharInCluster = Cluster->SampleCount; 02658 NumIllegalInCluster = 0; 02659 02660 if (Clusterer->NumChar > NumFlags) { 02661 if (CharFlags != NULL) 02662 memfree(CharFlags); 02663 NumFlags = Clusterer->NumChar; 02664 CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8)); 02665 } 02666 02667 for (i = 0; i < NumFlags; i++) 02668 CharFlags[i] = FALSE; 02669 02670 // find each sample in the cluster and check if we have seen it before 02671 InitSampleSearch(SearchState, Cluster); 02672 while ((Sample = NextSample (&SearchState)) != NULL) { 02673 CharID = Sample->CharID; 02674 if (CharFlags[CharID] == FALSE) { 02675 CharFlags[CharID] = TRUE; 02676 } 02677 else { 02678 if (CharFlags[CharID] == TRUE) { 02679 NumIllegalInCluster++; 02680 CharFlags[CharID] = ILLEGAL_CHAR; 02681 } 02682 NumCharInCluster--; 02683 PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster; 02684 if (PercentIllegal > MaxIllegal) 02685 return (TRUE); 02686 } 02687 } 02688 return (FALSE); 02689 02690 } // MultipleCharSamples
Allocates a new data structure which is used to hold a chi-squared value along with its associated number of degrees of freedom and alpha value.
DegreesOfFreedom | degrees of freedom for new chi value | |
Alpha | Confidence level for new chi value |
Definition at line 2487 of file cluster.cpp.
References CHISTRUCT::Alpha, CHISTRUCT::DegreesOfFreedom, and Emalloc().
Referenced by ComputeChiSquared().
02487 { 02488 CHISTRUCT *NewChiStruct; 02489 02490 NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT)); 02491 NewChiStruct->DegreesOfFreedom = DegreesOfFreedom; 02492 NewChiStruct->Alpha = Alpha; 02493 return (NewChiStruct); 02494 02495 } // NewChiStruct
PROTOTYPE * NewEllipticalProto | ( | INT16 | N, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics | |||
) |
Creates an elliptical prototype data structure to approximate the samples in the specified cluster.
N | number of dimensions | |
Cluster | cluster to be made into an elliptical prototype | |
Statistics | statistical info about samples in cluster |
Definition at line 1600 of file cluster.cpp.
References STATISTICS::CoVariance, elliptical, FLOATUNION::Elliptical, Emalloc(), proto::LogMagnitude, proto::Magnitude, MINVARIANCE, NewSimpleProto(), PI, proto::Style, proto::TotalMagnitude, proto::Variance, and proto::Weight.
Referenced by MakeDegenerateProto(), MakeEllipticalProto(), NewMixedProto(), and TestEllipticalProto().
01602 { 01603 PROTOTYPE *Proto; 01604 FLOAT32 *CoVariance; 01605 int i; 01606 01607 Proto = NewSimpleProto (N, Cluster); 01608 Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01609 Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01610 Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01611 01612 CoVariance = Statistics->CoVariance; 01613 Proto->TotalMagnitude = 1.0; 01614 for (i = 0; i < N; i++, CoVariance += N + 1) { 01615 Proto->Variance.Elliptical[i] = *CoVariance; 01616 if (Proto->Variance.Elliptical[i] < MINVARIANCE) 01617 Proto->Variance.Elliptical[i] = MINVARIANCE; 01618 01619 Proto->Magnitude.Elliptical[i] = 01620 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i])); 01621 Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i]; 01622 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; 01623 } 01624 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01625 Proto->Style = elliptical; 01626 return (Proto); 01627 } // NewEllipticalProto
PROTOTYPE * NewMixedProto | ( | INT16 | N, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics | |||
) |
Creates a mixed prototype data structure to approximate the samples in the specified cluster.
N | number of dimensions | |
Cluster | cluster to be made into a mixed prototype | |
Statistics | statistical info about samples in cluster |
Definition at line 1647 of file cluster.cpp.
References proto::Distrib, Emalloc(), mixed, NewEllipticalProto(), normal, and proto::Style.
Referenced by MakeDegenerateProto(), and MakeMixedProto().
01647 { 01648 PROTOTYPE *Proto; 01649 int i; 01650 01651 Proto = NewEllipticalProto (N, Cluster, Statistics); 01652 Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION)); 01653 01654 for (i = 0; i < N; i++) { 01655 Proto->Distrib[i] = normal; 01656 } 01657 Proto->Style = mixed; 01658 return (Proto); 01659 } // NewMixedProto
Allocates memory to hold a simple prototype data structure: one without independent distributions and variances for each dimension.
N | number of dimensions | |
Cluster | cluster to be made into a prototype |
Definition at line 1673 of file cluster.cpp.
References proto::Cluster, proto::Distrib, Emalloc(), sample::Mean, proto::Mean, NULL, proto::NumSamples, sample::Prototype, sample::SampleCount, proto::Significant, spherical, proto::Style, and TRUE.
Referenced by NewEllipticalProto(), and NewSphericalProto().
01673 { 01674 PROTOTYPE *Proto; 01675 int i; 01676 01677 Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE)); 01678 Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); 01679 01680 for (i = 0; i < N; i++) 01681 Proto->Mean[i] = Cluster->Mean[i]; 01682 Proto->Distrib = NULL; 01683 01684 Proto->Significant = TRUE; 01685 Proto->Style = spherical; 01686 Proto->NumSamples = Cluster->SampleCount; 01687 Proto->Cluster = Cluster; 01688 Proto->Cluster->Prototype = TRUE; 01689 return (Proto); 01690 } // NewSimpleProto
PROTOTYPE * NewSphericalProto | ( | UINT16 | N, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics | |||
) |
Creates a spherical prototype data structure to approximate the samples in the specified cluster.
N | number of dimensions | |
Cluster | cluster to be made into a spherical prototype | |
Statistics | statistical info about samples in cluster |
Definition at line 1564 of file cluster.cpp.
References STATISTICS::AvgVariance, proto::LogMagnitude, proto::Magnitude, MINVARIANCE, NewSimpleProto(), PI, FLOATUNION::Spherical, proto::TotalMagnitude, proto::Variance, and proto::Weight.
Referenced by MakeDegenerateProto(), and MakeSphericalProto().
01566 { 01567 PROTOTYPE *Proto; 01568 01569 Proto = NewSimpleProto (N, Cluster); 01570 01571 Proto->Variance.Spherical = Statistics->AvgVariance; 01572 if (Proto->Variance.Spherical < MINVARIANCE) 01573 Proto->Variance.Spherical = MINVARIANCE; 01574 01575 Proto->Magnitude.Spherical = 01576 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical)); 01577 Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical, 01578 (double) N); 01579 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; 01580 Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); 01581 01582 return (Proto); 01583 } // NewSphericalProto
Finds all of the samples which belong to a cluster.
SearchState | ptr to list containing clusters to be searched |
NextSample() to initialize the search.
Definition at line 672 of file cluster.cpp.
References first, sample::Left, NULL, pop(), push(), and TRUE.
Referenced by ComputeStatistics(), FillBuckets(), and MultipleCharSamples().
00672 { 00673 CLUSTER *Cluster; 00674 00675 if (*SearchState == NIL) 00676 return (NULL); 00677 Cluster = (CLUSTER *) first (*SearchState); 00678 *SearchState = pop (*SearchState); 00679 while (TRUE) { 00680 if (Cluster->Left == NULL) 00681 return (Cluster); 00682 *SearchState = push (*SearchState, Cluster->Right); 00683 Cluster = Cluster->Left; 00684 } 00685 } // NextSample
UINT16 NormalBucket | ( | PARAM_DESC * | ParamDesc, | |
FLOAT32 | x, | |||
FLOAT32 | Mean, | |||
FLOAT32 | StdDev | |||
) |
Determines which bucket x falls into in the discrete normal distribution defined by.
ParamDesc | used to identify circular dimensions | |
x | value to be normalized | |
Mean | mean of normal distribution | |
StdDev | standard deviation of normal distribution |
NormalMean and
Definition at line 2180 of file cluster.cpp.
References BUCKETTABLESIZE, PARAM_DESC::Circular, PARAM_DESC::HalfRange, NormalMean, NormalStdDev, and PARAM_DESC::Range.
Referenced by FillBuckets().
02183 { 02184 FLOAT32 X; 02185 02186 // wraparound circular parameters if necessary 02187 if (ParamDesc->Circular) { 02188 if (x - Mean > ParamDesc->HalfRange) 02189 x -= ParamDesc->Range; 02190 else if (x - Mean < -ParamDesc->HalfRange) 02191 x += ParamDesc->Range; 02192 } 02193 02194 X = ((x - Mean) / StdDev) * NormalStdDev + NormalMean; 02195 if (X < 0) 02196 return ((UINT16) 0); 02197 if (X > BUCKETTABLESIZE - 1) 02198 return ((UINT16) (BUCKETTABLESIZE - 1)); 02199 return ((UINT16) floor ((FLOAT64) X)); 02200 } // NormalBucket
Gets probability density for normal distribution defined by 3 global variables.
x | number to compute the normal probability density for |
Definition at line 2025 of file cluster.cpp.
References Distance, NormalMean, and NormalVariance.
Referenced by MakeBuckets().
02025 { 02026 FLOAT64 Distance; 02027 02028 Distance = x - NormalMean; 02029 return (NormalMagnitude * 02030 exp (-0.5 * Distance * Distance / NormalVariance)); 02031 } // NormalDensity
int NumBucketsMatch | ( | void * | arg1, | |
void * | arg2 | |||
) |
Searches list of histogram data structures to find one with the specified number of buckets.
arg1 | Current histogram being tested for a match | |
arg2 | Match key |
Definition at line 2381 of file cluster.cpp.
References BUCKETS::NumberOfBuckets.
Referenced by GetBuckets().
02382 { // UINT16 *DesiredNumberOfBuckets) 02383 BUCKETS *Histogram = (BUCKETS *) arg1; 02384 UINT16 *DesiredNumberOfBuckets = (UINT16 *) arg2; 02385 02386 return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets); 02387 02388 } // NumBucketsMatch
Computes the optimum number of histogram buckets that should be used in a chi-squared goodness of fit test for the specified number of samples.
SampleCount | number of samples to be tested |
Definition at line 1920 of file cluster.cpp.
References BucketsTable, CountTable, and LOOKUPTABLESIZE.
Referenced by GetBuckets(), and MakeBuckets().
01920 { 01921 UINT8 Last, Next; 01922 FLOAT32 Slope; 01923 01924 if (SampleCount < CountTable[0]) 01925 return (BucketsTable[0]); 01926 01927 for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) { 01928 if (SampleCount <= CountTable[Next]) { 01929 Slope = (FLOAT32) (BucketsTable[Next] - BucketsTable[Last]) / 01930 (FLOAT32) (CountTable[Next] - CountTable[Last]); 01931 return ((UINT16) (BucketsTable[Last] + 01932 Slope * (SampleCount - CountTable[Last]))); 01933 } 01934 } 01935 return (BucketsTable[Last]); 01936 } // OptimumNumberOfBuckets
Find zero (root) of function.
Function | function whose zero is to be found | |
FunctionParams | arbitrary data to pass to function | |
InitialGuess | point to start solution search at | |
Accuracy | maximum allowed error |
It will only work correctly if a solution actually exists and there are no extrema between the solution and the InitialGuess. The algorithms used are extremely primitive.
Definition at line 2516 of file cluster.cpp.
References Abs, DELTARATIO, f, INITIALDELTA, and MAX_FLOAT32.
Referenced by ComputeChiSquared().
02520 { 02521 FLOAT64 x; 02522 FLOAT64 f; 02523 FLOAT64 Slope; 02524 FLOAT64 Delta; 02525 FLOAT64 NewDelta; 02526 FLOAT64 xDelta; 02527 FLOAT64 LastPosX, LastNegX; 02528 02529 x = InitialGuess; 02530 Delta = INITIALDELTA; 02531 LastPosX = MAX_FLOAT32; 02532 LastNegX = -MAX_FLOAT32; 02533 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02534 while (Abs (LastPosX - LastNegX) > Accuracy) { 02535 // keep track of outer bounds of current estimate 02536 if (f < 0) 02537 LastNegX = x; 02538 else 02539 LastPosX = x; 02540 02541 // compute the approx. slope of f(x) at the current point 02542 Slope = 02543 ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta; 02544 02545 // compute the next solution guess */ 02546 xDelta = f / Slope; 02547 x -= xDelta; 02548 02549 // reduce the delta used for computing slope to be a fraction of 02550 //the amount moved to get to the new guess 02551 NewDelta = Abs (xDelta) * DELTARATIO; 02552 if (NewDelta < Delta) 02553 Delta = NewDelta; 02554 02555 // compute the value of the function at the new guess 02556 f = (*Function) ((CHISTRUCT *) FunctionParams, x); 02557 } 02558 return (x); 02559 02560 } // Solve
Computes standard deviation of prototype in the indicated dimension.
Proto | prototype to return standard deviation of | |
Dimension | dimension whose stddev is to be returned |
Definition at line 712 of file cluster.cpp.
References D_random, proto::Distrib, elliptical, FLOATUNION::Elliptical, mixed, normal, spherical, FLOATUNION::Spherical, proto::Style, uniform, and proto::Variance.
Referenced by PrintNormMatch().
00712 { 00713 switch (Proto->Style) { 00714 case spherical: 00715 return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical)); 00716 case elliptical: 00717 return ((FLOAT32) 00718 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00719 case mixed: 00720 switch (Proto->Distrib[Dimension]) { 00721 case normal: 00722 return ((FLOAT32) 00723 sqrt ((double) Proto->Variance.Elliptical[Dimension])); 00724 case uniform: 00725 case D_random: 00726 return (Proto->Variance.Elliptical[Dimension]); 00727 } 00728 } 00729 return 0.0f; 00730 } // StandardDeviation
PROTOTYPE * TestEllipticalProto | ( | CLUSTERER * | Clusterer, | |
CLUSTER * | Cluster, | |||
STATISTICS * | Statistics | |||
) |
Tests the specified cluster to see if there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split.
Clusterer | data struct containing samples being clustered | |
Cluster | cluster to be made into an elliptical prototype | |
Statistics | statistical info about cluster |
Definition at line 1197 of file cluster.cpp.
References STATISTICS::CoVariance, cprintf(), Emalloc(), FTable, FTABLE_X, FTABLE_Y, InvertMatrix(), sample::Left, sample::Mean, memfree(), N, NewEllipticalProto(), NULL, sample::Right, sample::SampleCount, and CLUSTERER::SampleSize.
Referenced by MakePrototype().
01199 { 01200 int N = Clusterer->SampleSize; 01201 CLUSTER* Left = Cluster->Left; 01202 CLUSTER* Right = Cluster->Right; 01203 if (Left == NULL || Right == NULL) 01204 return NULL; 01205 int TotalDims = Left->SampleCount + Right->SampleCount; 01206 if (TotalDims < N + 1) 01207 return NULL; 01208 FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32)); 01209 FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32)); 01210 double err = InvertMatrix(Statistics->CoVariance, N, Inverse); 01211 if (err > 1) { 01212 cprintf("Clustering error: Matrix inverse failed with error %g\n", err); 01213 } 01214 for (int dim = 0; dim < N; ++dim) { 01215 Delta[dim] = Left->Mean[dim] - Right->Mean[dim]; 01216 } 01217 // Compute Hotelling's T-squared. 01218 double Tsq = 0.0; 01219 for (int x = 0; x < N; ++x) { 01220 double temp = 0.0; 01221 for (int y = 0; y < N; ++y) { 01222 temp += Inverse[y + N*x] * Delta[y]; 01223 } 01224 Tsq += Delta[x] * temp; 01225 } 01226 memfree(Inverse); 01227 memfree(Delta); 01228 Tsq *= Left->SampleCount * Right->SampleCount / TotalDims; 01229 double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2); 01230 int Fx = N; 01231 if (Fx > FTABLE_X) 01232 Fx = FTABLE_X; 01233 --Fx; 01234 int Fy = TotalDims - N - 1; 01235 if (Fy > FTABLE_Y) 01236 Fy = FTABLE_Y; 01237 --Fy; 01238 if (F < FTable[Fy][Fx]) { 01239 return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics); 01240 } 01241 return NULL; 01242 }
UINT16 UniformBucket | ( | PARAM_DESC * | ParamDesc, | |
FLOAT32 | x, | |||
FLOAT32 | Mean, | |||
FLOAT32 | StdDev | |||
) |
Determines which bucket x falls into in the discrete uniform distribution defined by BUCKETTABLESIZE.
ParamDesc | used to identify circular dimensions | |
x | value to be normalized | |
Mean | center of range of uniform distribution | |
StdDev | 1/2 the range of the uniform distribution |
Definition at line 2220 of file cluster.cpp.
References BUCKETTABLESIZE, PARAM_DESC::Circular, PARAM_DESC::HalfRange, and PARAM_DESC::Range.
Referenced by FillBuckets().
02223 { 02224 FLOAT32 X; 02225 02226 // wraparound circular parameters if necessary 02227 if (ParamDesc->Circular) { 02228 if (x - Mean > ParamDesc->HalfRange) 02229 x -= ParamDesc->Range; 02230 else if (x - Mean < -ParamDesc->HalfRange) 02231 x += ParamDesc->Range; 02232 } 02233 02234 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0); 02235 if (X < 0) 02236 return ((UINT16) 0); 02237 if (X > BUCKETTABLESIZE - 1) 02238 return ((UINT16) (BUCKETTABLESIZE - 1)); 02239 return ((UINT16) floor ((FLOAT64) X)); 02240 } // UniformBucket
Gets density of a uniform distribution at a point.
x | number to compute the uniform probability density for |
Definition at line 2046 of file cluster.cpp.
References BUCKETTABLESIZE.
Referenced by MakeBuckets().
02046 { 02047 static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE; 02048 02049 if ((x >= 0.0) && (x <= BUCKETTABLESIZE)) 02050 return (UniformDistributionDensity); 02051 else 02052 return ((FLOAT64) 0.0); 02053 } // UniformDensity
UINT16 BucketsTable[LOOKUPTABLESIZE] [static] |
Initial value:
{ MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS }
Definition at line 280 of file cluster.cpp.
Referenced by OptimumNumberOfBuckets().
UINT32 CountTable[LOOKUPTABLESIZE] [static] |
Initial value:
{ MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000 }
Definition at line 277 of file cluster.cpp.
Referenced by OptimumNumberOfBuckets().
INT32 CurrentTemp [static] |
Definition at line 244 of file cluster.cpp.
Referenced by CreateClusterTree(), and MakePotentialClusters().
double FTable[FTABLE_Y][FTABLE_X] |
Table of values approximating the cumulative F-distribution for a confidence of 1%.
Definition at line 35 of file cluster.cpp.
Referenced by TestEllipticalProto().
The following variables are declared as global so that routines which are called from the KD-tree walker can get to them.
Definition at line 241 of file cluster.cpp.
Referenced by CreateClusterTree(), FreeHeapData(), GetTopOfHeap(), HeapPop(), HeapPopWorst(), HeapPush(), HeapStore(), and MakePotentialClusters().
FLOAT64 NormalMagnitude [static] |
Initial value:
(2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE)
Definition at line 260 of file cluster.cpp.
FLOAT64 NormalMean = BUCKETTABLESIZE / 2 [static] |
FLOAT64 NormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT) [static] |
FLOAT64 NormalVariance [static] |
Initial value:
(BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT)
Definition at line 258 of file cluster.cpp.
Referenced by NormalDensity().
LIST OldBuckets[] = { NIL, NIL, NIL } [static] |
keep a list of histogram buckets to minimize recomputing them
Definition at line 265 of file cluster.cpp.
Referenced by FreeBuckets(), and GetBuckets().
TEMPCLUSTER* TempCluster [static] |
Definition at line 242 of file cluster.cpp.
Referenced by CreateClusterTree(), MakeNewCluster(), and MakePotentialClusters().
Definition at line 243 of file cluster.cpp.
Referenced by CreateClusterTree(), FindNearestNeighbor(), FreeKDTree(), KDDelete(), KDNearestNeighborSearch(), KDStore(), KDWalk(), and MakePotentialClusters().