K-means算法是很典型的基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象的距离越近,其相似度就越大。该算法认为簇是由距离靠近的对象组成的,因此把得到紧凑且独立的簇作为最终目标。
算法过程如下:
1)从N个样本随机选取K个样本作为质心
2)对剩余的每个样本测量其到每个质心的距离,并把它归到最近的质心的类
3)重新计算已经得到的各个类的质心
4)迭代2~3步直至新的质心与原质心相等或小于指定阈值,算法结束
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
|
#include<stdio.h> #include<stdlib.h> #include<string.h> #include<time.h> #include<math.h> #define DIMENSIOM 2 //目前只是处理2维的数据 #define MAX_ROUND_TIME 100 //最大的聚类次数 typedef struct Item{ int dimension_1; //用于存放第一维的数据 int dimension_2; //用于存放第二维的数据 int clusterID; //用于存放该item的cluster center是谁 }Item; Item* data; typedef struct ClusterCenter{ double dimension_1; double dimension_2; int clusterID; }ClusterCenter; ClusterCenter* cluster_center_new; int isContinue; int * cluster_center; //记录center double * distanceFromCenter; //记录一个“点”到所有center的距离 int data_size; char filename[200]; int cluster_count; void initial(); void readDataFromFile(); void initial_cluster(); void calculateDistance_ToOneCenter( int itemID, int centerID, int count); void calculateDistance_ToAllCenter( int itemID); void partition_forOneItem( int itemID); void partition_forAllItem_OneCluster( int round); void calculate_clusterCenter( int round); void K_means(); void writeClusterDataToFile( int round); void writeClusterCenterToFile( int round); void compareNew_OldClusterCenter( double * new_X_Y); void test_1(); int main( int argc, char * argv[]){ if ( argc != 4 ) { printf ( "This application need other parameter to run:" "\n\t\tthe first is the size of data set," "\n\t\tthe second is the file name that contain data" "\n\t\tthe third indicate the cluster_count" "\n" ); exit (0); } srand ((unsigned) time (NULL)); data_size = atoi (argv[1]); strcat (filename, argv[2]); cluster_count = atoi (argv[3]); initial(); readDataFromFile(); initial_cluster(); //test_1(); //partition_forAllItem_OneCluster(); //calculate_clusterCenter(); K_means(); return 0; } /* * 对涉及到的二维动态数组根据main函数中传入的参数分配空间 * */ void initial(){ data = (Item*) malloc ( sizeof ( struct Item) * (data_size + 1)); if ( !data ) { printf ( "malloc error:data!" ); exit (0); } cluster_center = ( int *) malloc ( sizeof ( int ) * (cluster_count + 1)); if ( !cluster_center ) { printf ( "malloc error:cluster_center!\n" ); exit (0); } distanceFromCenter = ( double *) malloc ( sizeof ( double ) * (cluster_count + 1)); if ( !distanceFromCenter ) { printf ( "malloc error: distanceFromCenter!\n" ); exit (0); } cluster_center_new = (ClusterCenter*) malloc ( sizeof ( struct ClusterCenter) * (cluster_count + 1)); if ( !cluster_center_new ) { printf ( "malloc cluster center new error!\n" ); exit (0); } } /* * 从文件中读入x和y数据 * */ void readDataFromFile(){ FILE * fread ; if ( NULL == ( fread = fopen (filename, "r" ))) { printf ( "open file(%s) error!\n" , filename); exit (0); } int row; for ( row = 1; row <= data_size; row++ ) { if ( 2 != fscanf ( fread , "%d %d " , &data[row].dimension_1, &data[row].dimension_2)) { printf ( "fscanf error: %d\n" , row); } data[row].clusterID = 0; } } /* * 根据从主函数中传入的@cluster_count(聚类的个数)来随机的选择@cluster_count个 * 初始的聚类的起点 * */ void initial_cluster(){ //辅助产生不重复的数 int * auxiliary; int i; auxiliary = ( int *) malloc ( sizeof ( int ) * (data_size + 1)); if ( !auxiliary ) { printf ( "malloc error: auxiliary" ); exit (0); } for ( i = 1; i <= data_size; i++ ) { auxiliary[i] = i; } //产生初始化的cluster_count个聚类 int length = data_size; int random; for ( i = 1; i <= cluster_count; i++ ) { random = rand ()%length + 1; //printf("%d \n", auxiliary[random]); //data[auxiliary[random]].clusterID = auxiliary[random]; cluster_center[i] = auxiliary[random]; auxiliary[random] = auxiliary[length--]; } for ( i = 1; i <= cluster_count; i++ ) { cluster_center_new[i].dimension_1 = data[cluster_center[i]].dimension_1; cluster_center_new[i].dimension_2 = data[cluster_center[i]].dimension_2; cluster_center_new[i].clusterID = i; data[cluster_center[i]].clusterID = i; } } /* * 计算一个点(还没有划分到cluster center的点)到一个cluster center的distance * @itemID: 不属于任何cluster中的点 * @centerID: center的ID * @count: 表明在计算的是itemID到第几个@center的distance,并且指明了结果放在distanceFromCenter的第几号元素 * */ void calculateDistance_ToOneCenter( int itemID, int centerID){ distanceFromCenter[centerID] = sqrt ( (data[itemID].dimension_1-cluster_center_new[centerID].dimension_1)*( double )(data[itemID].dimension_1-cluster_center_new[centerID].dimension_1) + ( double )(data[itemID].dimension_2-cluster_center_new[centerID].dimension_2) * (data[itemID].dimension_2-cluster_center_new[centerID].dimension_2) ); } /* * 计算一个点(还没有划分到cluster center的点)到每个cluster center的distance * */ void calculateDistance_ToAllCenter( int itemID){ int i; for ( i = 1; i <= cluster_count; i++ ) { calculateDistance_ToOneCenter(itemID, i); } } void test_1() { calculateDistance_ToAllCenter(3); int i; for ( i = 1; i <= cluster_count; i++ ) { printf ( "%f " , distanceFromCenter[i]); } } /* * 在得到任一的点(不属于任一cluster的)到每一个cluster center的distance之后,决定它属于哪一个cluster center,即取距离最小的 * 函数功能:得到一个item所属的cluster center * */ void partition_forOneItem( int itemID){ //操作对象是 distanceFromCenter和cluster_center int i; int min_index = 1; double min_value = distanceFromCenter[1]; for ( i = 2; i <= cluster_count; i++ ) { if ( distanceFromCenter[i] < min_value ) { min_value = distanceFromCenter[i]; min_index = i; } } data[itemID].clusterID = cluster_center_new[min_index].clusterID; } /* * 得到所有的item所属于的cluster center , 在一轮的聚类中 * */ void partition_forAllItem_OneCluster( int round){ //changed!!!!!!!!!!!!!!!!!!!!!!!! int i; for ( i = 1; i <= data_size; i++ ) { if ( data[i].clusterID != 0 ) continue ; else { calculateDistance_ToAllCenter(i); //计算i到所有center的distance partition_forOneItem(i); //根据distance对i进行partition } } //把聚类得到的数据写入到文件中 writeClusterDataToFile(round); } /* * 将聚类得到的数据写入到文件中,每一个类写入一个文件中 * @round: 表明在进行第几轮的cluster,该参数的另一个作用是指定了文件名字中的第一个项. * */ void writeClusterDataToFile( int round){ int i; char filename[200]; FILE ** file; file = ( FILE **) malloc ( sizeof ( FILE *) * (cluster_count + 1)); if ( !file ) { printf ( "malloc file error!\n" ); exit (0); } for ( i = 1; i <= cluster_count; i++ ) { sprintf (filename, ".//ClusterProcess//round%d_cluster%d.data" , round, i); if ( NULL == (file[i] = fopen (filename, "w" ))) { printf ( "file open(%s) error!" , filename); exit (0); } } for ( i = 1; i <= data_size; i++ ) { //sprintf(filename, ".//ClusterProcess//round%d_cluster%d.data", round, data[i].clusterID); fprintf (file[data[i].clusterID], "%d\t%d\n" , data[i].dimension_1, data[i].dimension_2); } for ( i = 1; i <= cluster_count; i++ ) { //sprintf(filename, ".//ClusterProcess//round%d_cluster%d.data", round, i); fclose (file[i]); } } /* * 重新计算新的cluster center * */ void calculate_clusterCenter( int round){ //changed!!!!!!!!!!!!!!!!!!!!!! int i; double * new_X_Y; /* 用来计算和保存新的cluster center的值,同样的,0号元素不用。1,2号元素分别用来 存放第一个聚类的所有的项的x和y的累加和。3,4号元素分别用来存放第二个聚类的所有 的项的x和y的累加和...... */ new_X_Y = ( double *) malloc ( sizeof ( double ) * (2 * cluster_count + 1)); if ( !new_X_Y ) { printf ( "malloc error: new_X_Y!\n" ); exit (0); } //初始化为0 for ( i = 1; i <= 2*cluster_count; i++ ) new_X_Y[i] = 0.0; //用来统计属于各个cluster的item的个数 int * counter; counter = ( int *) malloc ( sizeof ( int ) * (cluster_count + 1)); if ( !counter ) { printf ( "malloc error: counter\n" ); exit (0); } //初始化为0 for ( i = 1; i <= cluster_count; i++ ) counter[i] = 0; for ( i = 1; i <= data_size; i++ ) { new_X_Y[data[i].clusterID * 2 - 1] += data[i].dimension_1; new_X_Y[data[i].clusterID * 2] += data[i].dimension_2; counter[data[i].clusterID]++; } for ( i = 1; i <= cluster_count; i++ ) { new_X_Y[2 * i - 1] = new_X_Y[2 * i - 1] / ( double )(counter[i]); new_X_Y[2 * i] = new_X_Y[2 * i] / ( double )(counter[i]); } //要将cluster center的值保存在文件中,后续作图 writeClusterCenterToFile(round); /* * 在这里比较一下新的和旧的cluster center值的差别。如果是相等的,则停止K-means算法。 * */ compareNew_OldClusterCenter(new_X_Y); //将新的cluster center的值放入cluster_center_new for ( i = 1; i <= cluster_count; i++ ) { cluster_center_new[i].dimension_1 = new_X_Y[2 * i - 1]; cluster_center_new[i].dimension_2 = new_X_Y[2 * i]; cluster_center_new[i].clusterID = i; } free (new_X_Y); free (counter); //在重新计算了新的cluster center之后,意味着我们要重新来为每一个Item进行聚类,所以data中用于表示聚类ID的clusterID //要都重新置为0。 for ( i = 1; i <= data_size; i++ ) { data[i].clusterID = 0; } } /* * 将得到的新的cluster_count个cluster center的值保存在文件中。以便于观察聚类的过程。 * */ void writeClusterCenterToFile( int round){ FILE * file; int i; char filename[200]; sprintf (filename, ".//ClusterProcess//round%d_clusterCenter.data" , round); if ( NULL == (file = fopen (filename, "w" ))) { printf ( "open file(%s) error!\n" , filename); exit (0); } for ( i = 1; i <= cluster_count; i++ ) { fprintf (file, "%f\t%f\n" , cluster_center_new[i].dimension_1, cluster_center_new[i].dimension_2); } for ( i = 1; i <= cluster_count; i++ ) { fclose (file); } } /* * 比较新旧的cluster center的差异 * */ void compareNew_OldClusterCenter( double * new_X_Y){ int i; isContinue = 0; //等于0表示的是不要继续 for ( i = 1; i <= cluster_count; i++ ) { if ( new_X_Y[2 * i - 1] != cluster_center_new[i].dimension_1 || new_X_Y[2 * i] != cluster_center_new[i].dimension_2) { isContinue = 1; //要继续 break ; } } } /************************************************************************************************ * K-means算法 * ***********************************************************************************************/ void K_means(){ int times_cluster; for ( times_cluster = 1; times_cluster <= MAX_ROUND_TIME; times_cluster++ ) { printf ( "\n times : %d \n" , times_cluster); partition_forAllItem_OneCluster(times_cluster); calculate_clusterCenter(times_cluster); if ( 0 == isContinue ) { break ; //printf("\n\nthe application can stop!\n\n"); } } } |
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。
原文链接:http://blog.csdn.net/robin_Xu_shuai/article/details/51534064