SHOGUN  v3.1.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
14 #include <shogun/labels/Labels.h>
17 #include <shogun/base/Parallel.h>
18 
19 #ifdef HAVE_PTHREAD
20 #include <pthread.h>
21 #endif
22 
23 #define MUSRECALC
24 
25 using namespace shogun;
26 
29 {
30  init();
31 }
32 
33 CKMeans::CKMeans(int32_t k_, CDistance* d)
35 {
36  init();
37  k=k_;
38  set_distance(d);
39 }
40 
41 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGMatrix<float64_t> centers_i)
43 {
44  init();
45  k = k_i;
46  set_distance(d_i);
47  set_initial_centers(centers_i);
48 }
49 
51 {
52 }
53 
55 {
56  dimensions = ((CDenseFeatures<float64_t>*) distance->get_lhs())->get_num_features();
57  REQUIRE(centers.num_cols == k,
58  "Expected %d initial cluster centers, got %d", k, centers.num_cols);
59  REQUIRE(centers.num_rows == dimensions,
60  "Expected %d dimensionional cluster centers, got %d", dimensions, centers.num_rows);
61  mus_initial = centers;
62 }
63 
64 void CKMeans::set_random_centers(float64_t* weights_set, int32_t* ClList, int32_t XSize)
65 {
68 
69  for (int32_t i=0; i<XSize; i++)
70  {
71  const int32_t Cl=CMath::random(0, k-1);
72  weights_set[Cl]+=1.0;
73  ClList[i]=Cl;
74 
75  int32_t vlen=0;
76  bool vfree=false;
77  float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
78 
79  for (int32_t j=0; j<dimensions; j++)
80  mus.matrix[Cl*dimensions+j] += vec[j];
81 
82  lhs->free_feature_vector(vec, i, vfree);
83  }
84 
85  SG_UNREF(lhs);
86 
87  for (int32_t i=0; i<k; i++)
88  {
89  if (weights_set[i]!=0.0)
90  {
91  for (int32_t j=0; j<dimensions; j++)
92  mus.matrix[i*dimensions+j] /= weights_set[i];
93  }
94  }
95 }
96 
98  float64_t* dists, int32_t* ClList, int32_t XSize)
99 {
101 
104 
105  for(int32_t idx=0;idx<XSize;idx++)
106  {
107  for(int32_t m=0;m<k;m++)
108  dists[k*idx+m] = distance->distance(idx,m);
109  }
110 
111  for (int32_t i=0; i<XSize; i++)
112  {
113  float64_t mini=dists[i*k];
114  int32_t Cl = 0, j;
115 
116  for (j=1; j<k; j++)
117  {
118  if (dists[i*k+j]<mini)
119  {
120  Cl=j;
121  mini=dists[i*k+j];
122  }
123  }
124  ClList[i]=Cl;
125  }
126 
127  /* Compute the sum of all points belonging to a cluster
128  * and count the points */
131  for (int32_t i=0; i<XSize; i++)
132  {
133  const int32_t Cl = ClList[i];
134  weights_set[Cl]+=1.0;
135  if (fixed_centers)
136  {
137  int32_t vlen=0;
138  bool vfree=false;
139  float64_t* vec=lhs->get_feature_vector(i, vlen, vfree);
140 
141  for (int32_t j=0; j<dimensions; j++)
142  mus.matrix[Cl*dimensions+j] += vec[j];
143 
144  lhs->free_feature_vector(vec, i, vfree);
145  }
146  }
147  SG_UNREF(lhs);
148 
149  if (fixed_centers)
150  {
151  /* normalization to get the mean */
152  for (int32_t i=0; i<k; i++)
153  {
154  if (weights_set[i]!=0.0)
155  {
156  for (int32_t j=0; j<dimensions; j++)
157  mus.matrix[i*dimensions+j] /= weights_set[i];
158  }
159  }
160  }
161 }
162 
163 void CKMeans::compute_cluster_variances()
164 {
165  /* compute the ,,variances'' of the clusters */
166  for (int32_t i=0; i<k; i++)
167  {
168  float64_t rmin1=0;
169  float64_t rmin2=0;
170 
171  bool first_round=true;
172 
173  for (int32_t j=0; j<k; j++)
174  {
175  if (j!=i)
176  {
177  int32_t l;
178  float64_t dist = 0;
179 
180  for (l=0; l<dimensions; l++)
181  {
182  dist+=CMath::sq(
183  mus.matrix[i*dimensions+l]
184  -mus.matrix[j*dimensions+l]);
185  }
186 
187  if (first_round)
188  {
189  rmin1=dist;
190  rmin2=dist;
191  first_round=false;
192  }
193  else
194  {
195  if ((dist<rmin2) && (dist>=rmin1))
196  rmin2=dist;
197 
198  if (dist<rmin1)
199  {
200  rmin2=rmin1;
201  rmin1=dist;
202  }
203  }
204  }
205  }
206 
207  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
208  }
209 }
210 
212 {
214 
215  if (data)
216  distance->init(data, data);
217 
220 
221  ASSERT(lhs);
222  int32_t XSize=lhs->get_num_vectors();
223  dimensions=lhs->get_num_features();
224 
225  ASSERT(XSize>0 && dimensions>0);
226 
227  int32_t changed=1;
228  const int32_t XDimk=dimensions*k;
229  int32_t iter=0;
230 
232 
234 
235  int32_t *ClList=SG_CALLOC(int32_t, XSize);
236  float64_t *weights_set=SG_CALLOC(float64_t, k);
237  float64_t *dists=SG_CALLOC(float64_t, k*XSize);
238 
241  CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
242 
243  int32_t vlen=0;
244  bool vfree=false;
245  float64_t* vec=NULL;
246 
247  /* ClList=zeros(XSize,1) ; */
248  memset(ClList, 0, sizeof(int32_t)*XSize);
249  /* weights_set=zeros(k,1) ; */
250  memset(weights_set, 0, sizeof(float64_t)*k);
251 
252  /* cluster_centers=zeros(dimensions, k) ; */
253  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
254 
255  if (mus_initial.matrix)
256  set_initial_centers(rhs_mus, weights_set, dists, ClList, XSize);
257  else
258  set_random_centers(weights_set, ClList, XSize);
259 
260  while (changed && (iter<max_iter))
261  {
262  iter++;
263  if (iter==max_iter-1)
264  SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1)
265 
266  if (iter%1000 == 0)
267  SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed)
268  changed=0;
269 
270  if (!fixed_centers)
271  {
272  /* mus=zeros(dimensions, k) ; */
273  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
274 
275  for (int32_t i=0; i<XSize; i++)
276  {
277  int32_t Cl=ClList[i];
278 
279  vec=lhs->get_feature_vector(i, vlen, vfree);
280 
281  for (int32_t j=0; j<dimensions; j++)
282  mus.matrix[Cl*dimensions+j] += vec[j];
283 
284  lhs->free_feature_vector(vec, i, vfree);
285  }
286 
287  for (int32_t i=0; i<k; i++)
288  {
289  if (weights_set[i]!=0.0)
290  {
291  for (int32_t j=0; j<dimensions; j++)
292  mus.matrix[i*dimensions+j] /= weights_set[i];
293  }
294  }
295  }
297  rhs_mus->copy_feature_matrix(mus);
298 
299  for (int32_t i=0; i<XSize; i++)
300  {
301  /* ks=ceil(rand(1,XSize)*XSize) ; */
302  const int32_t Pat= CMath::random(0, XSize-1);
303  const int32_t ClList_Pat=ClList[Pat];
304  int32_t imini, j;
305  float64_t mini;
306 
307  /* compute the distance of this point to all centers */
308  for(int32_t idx_k=0;idx_k<k;idx_k++)
309  dists[idx_k]=distance->distance(Pat,idx_k);
310 
311  /* [mini,imini]=min(dists(:,i)) ; */
312  imini=0 ; mini=dists[0];
313  for (j=1; j<k; j++)
314  if (dists[j]<mini)
315  {
316  mini=dists[j];
317  imini=j;
318  }
319 
320  if (imini!=ClList_Pat)
321  {
322  changed++;
323 
324  /* weights_set(imini) = weights_set(imini) + 1.0 ; */
325  weights_set[imini]+= 1.0;
326  /* weights_set(j) = weights_set(j) - 1.0 ; */
327  weights_set[ClList_Pat]-= 1.0;
328 
329  vec=lhs->get_feature_vector(Pat, vlen, vfree);
330 
331  for (j=0; j<dimensions; j++)
332  {
333  mus.matrix[imini*dimensions+j]-=
334  (vec[j]-mus.matrix[imini*dimensions+j]) / weights_set[imini];
335  }
336 
337  lhs->free_feature_vector(vec, Pat, vfree);
338 
339  /* mu_new = mu_old - (x - mu_old)/(n-1) */
340  /* if weights_set(j)~=0 */
341  if (weights_set[ClList_Pat]!=0.0)
342  {
343  vec=lhs->get_feature_vector(Pat, vlen, vfree);
344 
345  for (j=0; j<dimensions; j++)
346  {
347  mus.matrix[ClList_Pat*dimensions+j]-=
348  (vec[j]-mus.matrix[ClList_Pat*dimensions+j]) / weights_set[ClList_Pat];
349  }
350  lhs->free_feature_vector(vec, Pat, vfree);
351  }
352  else
353  {
354  /* mus(:,j)=zeros(dimensions,1) ; */
355  for (j=0; j<dimensions; j++)
356  mus.matrix[ClList_Pat*dimensions+j]=0;
357  }
358 
359  /* ClList(i)= imini ; */
360  ClList[Pat] = imini;
361  }
362  }
363  }
364 
365  compute_cluster_variances();
366  distance->replace_rhs(rhs_cache);
367  delete rhs_mus;
368  SG_FREE(ClList);
369  SG_FREE(weights_set);
370  SG_FREE(dists);
371  SG_UNREF(lhs);
372 
373  return true;
374 }
375 
376 bool CKMeans::load(FILE* srcfile)
377 {
380  return false;
381 }
382 
383 bool CKMeans::save(FILE* dstfile)
384 {
387  return false;
388 }
389 
390 
391 void CKMeans::set_k(int32_t p_k)
392 {
393  ASSERT(p_k>0)
394  this->k=p_k;
395 }
396 
397 int32_t CKMeans::get_k()
398 {
399  return k;
400 }
401 
402 void CKMeans::set_max_iter(int32_t iter)
403 {
404  ASSERT(iter>0)
405  max_iter=iter;
406 }
407 
409 {
410  return max_iter;
411 }
412 
414 {
415  return R;
416 }
417 
419 {
420  if (!R.vector)
421  return SGMatrix<float64_t>();
422 
425  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
426  SG_UNREF(lhs);
427  return centers;
428 }
429 
431 {
432  return dimensions;
433 }
434 
436 {
437  fixed_centers=fixed;
438 }
439 
441 {
442  return fixed_centers;
443 }
444 
446 {
447  /* set lhs of underlying distance to cluster centers */
449  mus);
450 
451  /* store cluster centers in lhs of distance variable */
452  CFeatures* rhs=distance->get_rhs();
453  distance->init(cluster_centers, rhs);
454  SG_UNREF(rhs);
455 }
456 
457 void CKMeans::init()
458 {
459  max_iter=10000;
460  k=3;
461  dimensions=0;
462  fixed_centers=false;
463 
464  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
465  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
466  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
467  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
468 }
469 
#define SG_INFO(...)
Definition: SGIO.h:120
#define SG_RESET_LOCALE
Definition: SGIO.h:88
virtual bool save(FILE *dstfile)
Definition: KMeans.cpp:383
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:80
int32_t max_iter
maximum number of iterations
Definition: KMeans.h:173
CFeatures * get_lhs()
Definition: Distance.h:194
int32_t dimensions
number of dimensions
Definition: KMeans.h:182
SGVector< float64_t > R
radi of the clusters (size k)
Definition: KMeans.h:185
#define SG_UNREF(x)
Definition: SGRefObject.h:35
static T sq(T x)
x^2
Definition: Math.h:244
#define REQUIRE(x,...)
Definition: SGIO.h:208
CFeatures * get_rhs()
Definition: Distance.h:200
void set_k(int32_t p_k)
Definition: KMeans.cpp:391
int32_t get_dimensions()
Definition: KMeans.cpp:430
int32_t get_num_features() const
#define SG_SET_LOCALE_C
Definition: SGIO.h:87
A generic DistanceMachine interface.
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
static uint64_t random()
Definition: Math.h:528
virtual ~CKMeans()
Definition: KMeans.cpp:50
#define ASSERT(x)
Definition: SGIO.h:203
SGVector< float64_t > get_radiuses()
Definition: KMeans.cpp:413
float64_t get_max_iter()
Definition: KMeans.cpp:408
double float64_t
Definition: common.h:48
virtual void copy_feature_matrix(SGMatrix< ST > src)
virtual bool load(FILE *srcfile)
Definition: KMeans.cpp:376
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
index_t num_rows
Definition: SGMatrix.h:301
bool get_fixed_centers()
Definition: KMeans.cpp:440
bool fixed_centers
whether to keep cluster centers fixed or not
Definition: KMeans.h:176
void set_max_iter(int32_t iter)
Definition: KMeans.cpp:402
index_t num_cols
Definition: SGMatrix.h:303
void set_fixed_centers(bool fixed)
Definition: KMeans.cpp:435
CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:145
SGMatrix< float64_t > mus_initial
initial centers supplied
Definition: KMeans.h:188
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
virtual void set_initial_centers(SGMatrix< float64_t > centers)
Definition: KMeans.cpp:54
The class Features is the base class of all feature objects.
Definition: Features.h:62
void set_distance(CDistance *d)
virtual void store_model_features()
Definition: KMeans.cpp:445
virtual EFeatureType get_feature_type()=0
#define SG_WARNING(...)
Definition: SGIO.h:130
#define SG_ADD(...)
Definition: SGObject.h:71
static float32_t sqrt(float32_t x)
x^0.5
Definition: Math.h:250
virtual bool train_machine(CFeatures *data=NULL)
Definition: KMeans.cpp:211
int32_t get_k()
Definition: KMeans.cpp:397
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
virtual int32_t get_num_vectors() const
int32_t k
the k parameter in KMeans
Definition: KMeans.h:179
SGMatrix< float64_t > get_cluster_centers()
Definition: KMeans.cpp:418

SHOGUN Machine Learning Toolbox - Documentation