K-均值算法的基本思想是:首先從含有N個數據對象的數據集中隨機選擇K個數據對象作爲初始中心,然後計算每一個數據對象到各中心的距離,根據最近鄰原則,所有數據對象將會被劃分到離它最近的那個中心所代表的簇中;接着分別計算新生成的各個簇中數據對象的均值作爲各簇新的中心,比較新的中心和上一次得到的中心,如果新的中心沒有發生變化,則算法收斂,輸出結果;如果新的中心和上一次的中心相比發生變化,則要根據新的中心對所有數據對象重新進行劃分,直到滿足算法的收斂條件爲止。
K-means算法的過程可以描述爲:
算法:劃分並計算基於簇中對象的平均值。
輸入:簇的數目K和包含N個對象的數據庫。
輸出:平方誤差總和最小條件下的K個簇。
方法:
1) 任意選擇K個對象作爲初始的簇中心;
2) 分別計算數據集中每個元素與所選各簇中心的距離(一般採用歐氏距離),根據最近鄰原則,將元素劃分到相應的簇中;
3) 計算每個簇中對象的平均值,更新簇的中心;
4) 重複上面的步驟,直至更新的簇的中心與原簇的中心的差值在預定範圍內或者達到預設的迭代次數;
5) 輸出K個簇中心。
K-means 方法的時間複雜度爲O(NKT),N代表總元素個數,K代表簇中心個數,T代表迭代次數。K-means算法是一種硬性劃分的聚類,即每個數據點唯一地分配給一個聚類,由於事先不知道實際的聚類情況,因此這可能是一種嚴重的侷限。該算法對初始中心的選取非常敏感,初始中心隨機選取,導致結果波動較大,穩定性較差。同時該算法對噪聲數據和孤立點數據較爲敏感。該算法通常採用歐氏距離作爲數據樣本之間的度量方式,導致該算法對球狀的簇有比較好的聚類效果,但是很難發現其他形狀的簇。
#ifndef KMEANS_H
#define KMEANS_H

#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <time.h>
#include <stdlib.h>

// Kept for backward compatibility: translation units that include this header
// may rely on the std names being visible unqualified.
using namespace std;

/**
 * Reads whitespace-separated values from a text file into a freshly allocated
 * row-major matrix with `dimension` columns.
 *
 * The file is scanned twice: once to count the values that parse as DataType,
 * once to fill the matrix. Trailing values that do not fill a complete row are
 * ignored.
 *
 * @param file_path  Path of the text file to read.
 * @param data       Output: receives `new DataType*[rows]`, each row being
 *                   `new DataType[dimension]`. The caller owns the memory and
 *                   must release every row and then the row array with
 *                   delete[]. Left untouched when -1 is returned.
 * @param dimension  Number of values per row; must be positive.
 * @return Number of complete rows read, or -1 if `dimension` is not positive,
 *         `file_path` is NULL, or the file cannot be opened.
 */
template <typename DataType>
int readData(const char* file_path, DataType**& data, int dimension)
{
    if (file_path == NULL || dimension <= 0)
        return -1;

    ifstream infile(file_path);
    if (!infile.is_open())
        return -1;  // looping on !eof() here would never terminate

    // First pass: count values. Testing the extraction itself (rather than
    // eof()) avoids counting a phantom value after trailing whitespace.
    int value_count = 0;
    DataType datum;
    while (infile >> datum)
        ++value_count;

    const int data_number = value_count / dimension;

    // Second pass: rewind (clearing the eof/fail state first) and load rows.
    infile.clear();
    infile.seekg(0, ios::beg);

    data = new DataType*[data_number];
    for (int i = 0; i < data_number; i++)
    {
        data[i] = new DataType[dimension];
        for (int j = 0; j < dimension; j++)
        {
            infile >> data[i][j];
        }
    }
    return data_number;
}

/**
 * Squared Euclidean distance between a sample and a center.
 *
 * @param data_i     Sample vector with `dimension` components.
 * @param centers_i  Center vector with `dimension` components.
 * @param dimension  Number of components to compare.
 * @return Sum of squared component differences, or (DataType)-1 if either
 *         pointer is NULL.
 */
template <typename DataType>
DataType calculate_dist(DataType*& data_i, DataType*& centers_i, int dimension)
{
    if (data_i == NULL || centers_i == NULL)
    {
        return static_cast<DataType>(-1);
    }

    DataType dist = 0;
    for (int j = 0; j < dimension; j++)
    {
        const DataType delta = data_i[j] - centers_i[j];
        dist += delta * delta;
    }
    return dist;
}

/**
 * Finds the center nearest to a sample (by squared Euclidean distance).
 * Ties are resolved in favour of the lowest index.
 *
 * @param data_i     Sample vector with `dimension` components.
 * @param centers    Array of K center vectors.
 * @param K          Number of centers.
 * @param dimension  Number of components per vector.
 * @return Index of the nearest center, or -1 if K is not positive or
 *         `centers` is NULL.
 */
template <typename DataType>
int select_center(DataType*& data_i, DataType**& centers, int K, int dimension)
{
    // K == 0 must be rejected too: centers[0] is read unconditionally below.
    if (K <= 0 || centers == NULL)
        return -1;

    int label = 0;
    DataType min_dist = calculate_dist(data_i, centers[0], dimension);
    for (int i = 1; i < K; i++)
    {
        const DataType dist = calculate_dist(data_i, centers[i], dimension);
        if (min_dist > dist)
        {
            min_dist = dist;
            label = i;
        }
    }
    return label;
}

/**
 * Fills K pre-allocated centers with samples drawn from the data set.
 *
 * The data set is split into K consecutive strata of data_number / K samples
 * and one sample is drawn at random from each stratum. When there are fewer
 * samples than centers (stratum size 0) each center is drawn from the whole
 * data set instead, so duplicates are possible.
 *
 * The C random generator is seeded from the current time once per call.
 *
 * @param data         Array of `data_number` sample vectors.
 * @param data_number  Number of samples.
 * @param dimension    Number of components per vector.
 * @param centers      Array of K vectors, already allocated by the caller.
 * @param K            Number of centers.
 */
template <typename DataType>
void rand_init_centers(DataType**& data, int data_number, int dimension, DataType**& centers, int K)
{
    if (data == NULL || centers == NULL || data_number <= 0 || K <= 0)
        return;

    // Seed once: reseeding inside the loop with the same second-resolution
    // timestamp would make every center use the same offset.
    srand(static_cast<unsigned int>(time(NULL)));

    const int step = data_number / K;
    for (int i = 0; i < K; i++)
    {
        // step == 0 would make rand() % step a division by zero.
        const int index = (step > 0) ? (i * step + rand() % step)
                                     : (rand() % data_number);
        for (int j = 0; j < dimension; j++)
        {
            centers[i][j] = data[index][j];
        }
    }
}

/**
 * Clusters the samples with Lloyd's K-means algorithm.
 *
 * Iterates assignment and center update until either `iterations` passes have
 * run or the summed squared movement of the centers in one pass is no larger
 * than `threshold`. A cluster that receives no samples keeps its previous
 * center. One progress line "<iteration>\t<difference>" is written to stdout
 * per pass.
 *
 * @param data         Array of `data_number` sample vectors.
 * @param data_number  Number of samples; must be positive.
 * @param dimension    Number of components per vector; must be positive.
 * @param centers      Output: receives `new DataType*[K]` whose rows are
 *                     `new DataType[dimension]`. Owned by the caller.
 * @param K            Number of clusters; must be positive.
 * @param labels       Output: receives `new int[data_number]` holding the
 *                     cluster index assigned to each sample in the last pass
 *                     (all zero if no pass ran). Owned by the caller.
 * @param iterations   Maximum number of passes.
 * @param threshold    Convergence bound on the summed squared center movement.
 *
 * If `data` is NULL or any size argument is not positive the function returns
 * immediately and leaves `centers` and `labels` untouched.
 */
template <typename DataType>
void kmeans(DataType**& data, int data_number, int dimension, DataType**& centers, int K, int*& labels, int iterations, DataType threshold)
{
    if (data == NULL || data_number <= 0 || dimension <= 0 || K <= 0)
        return;

    centers = new DataType*[K];
    labels = new int[data_number]();  // zero-initialised: defined even if no pass runs

    // Per-cluster running sums and member counts, reset at the start of each pass.
    DataType** sum = new DataType*[K];
    int* counts = new int[K];
    for (int i = 0; i < K; i++)
    {
        centers[i] = new DataType[dimension];
        sum[i] = new DataType[dimension];
    }

    rand_init_centers(data, data_number, dimension, centers, K);

    int iteration_time = 0;
    DataType difference = numeric_limits<DataType>::max();
    // Stop when EITHER the iteration budget is spent OR the centers have
    // converged, i.e. continue only while both conditions still hold.
    while (iteration_time < iterations && difference > threshold)
    {
        for (int i = 0; i < K; i++)
        {
            counts[i] = 0;
            for (int j = 0; j < dimension; j++)
            {
                sum[i][j] = 0;
            }
        }

        // Assignment step: attach every sample to its nearest center.
        for (int i = 0; i < data_number; i++)
        {
            labels[i] = select_center(data[i], centers, K, dimension);
            counts[labels[i]]++;
            for (int j = 0; j < dimension; j++)
            {
                sum[labels[i]][j] += data[i][j];
            }
        }

        // Update step: move each non-empty cluster's center to its mean and
        // accumulate how far the centers moved.
        difference = 0;
        for (int i = 0; i < K; i++)
        {
            if (counts[i] <= 0)
                continue;
            for (int j = 0; j < dimension; j++)
            {
                sum[i][j] /= counts[i];
                const DataType delta = sum[i][j] - centers[i][j];
                difference += delta * delta;
                centers[i][j] = sum[i][j];
            }
        }

        cout << iteration_time << '\t' << difference << endl;
        iteration_time++;
    }

    for (int i = 0; i < K; i++)
    {
        delete[] sum[i];
    }
    delete[] sum;
    delete[] counts;
}

/**
 * Writes the K centers to a text file, one center per line, each component
 * left-aligned in a 10-character column.
 *
 * @param file_path  Destination file (truncated if it exists).
 * @param centers    Array of K vectors.
 * @param K          Number of centers.
 * @param dimension  Number of components per vector.
 */
template <typename DataType>
void save_centers(const char* file_path, DataType** centers, int K, int dimension)
{
    ofstream outfile;
    outfile.open(file_path, ios::out);
    for (int i = 0; i < K; i++)
    {
        outfile.setf(ios::left);
        for (int j = 0; j < dimension; j++)
        {
            outfile.width(10);  // width resets after every insertion
            outfile << centers[i][j];
        }
        outfile.unsetf(ios::left);
        outfile << '\n';
    }
    outfile.close();
}

/**
 * Writes every sample to a text file, one per line: the cluster label followed
 * by the sample's components, each left-aligned in a 10-character column.
 *
 * @param file_path    Destination file (truncated if it exists).
 * @param data         Array of `data_number` sample vectors.
 * @param labels       Cluster index of each sample.
 * @param data_number  Number of samples.
 * @param dimension    Number of components per vector.
 */
template <typename DataType>
void save_labels(const char* file_path, DataType** data, int* labels, int data_number, int dimension)
{
    ofstream outfile;
    outfile.open(file_path, ios::out);
    for (int i = 0; i < data_number; i++)
    {
        outfile.setf(ios::left);
        outfile.width(10);
        outfile << labels[i];
        for (int j = 0; j < dimension; j++)
        {
            outfile.width(10);  // width resets after every insertion
            outfile << data[i][j];
        }
        outfile.unsetf(ios::left);
        outfile << '\n';
    }
    outfile.close();
}

#endif // KMEANS_H
#include <iostream> using namespace std; #include "Kmeans.h" int main(int argc, char* argv[]) { float** data_source; float** clusters; int* labels; data_source = NULL; clusters = NULL; labels = NULL; int K = 5; int iterations = 50; float threshold = 0.001; int dimension = 1764; int data_number = readData("D:/Users/Surge/Desktop/test.txt",data_source,dimension); kmeans(data_source,data_number,dimension,clusters,K,labels,iterations,threshold); save_centers("D:/Users/Surge/Desktop/test_centers.txt",clusters,K,dimension); save_labels("D:/Users/Surge/Desktop/test_labels.txt",data_source,labels,data_number,dimension); for(int i = 0; i < data_number; i++) { if(data_source[i] != NULL) { delete[] data_source[i]; } } for(int i = 0; i < K; i++) { if(clusters[i] != NULL) { delete[] clusters[i]; } } if(data_source != NULL) { delete[] data_source; } if(clusters != NULL) { delete[] clusters; } if(labels != NULL) { delete[] labels; } system("pause"); return 0; }