Machine learning divides into supervised and unsupervised learning. Supervised learning covers classification and regression; on the unsupervised side, clustering is one of the most important parts. As I see it, clustering rests on two criteria and one idea. The two criteria are: minimize the within-class distance and maximize the between-class distance. The one idea is the EM (Expectation-Maximization) idea. The minimum within-class distance criterion shows up in algorithms such as K-means and fuzzy C-means (FCM); the maximum between-class distance criterion shows up in hierarchical clustering. If you study the theory behind algorithms built on these two criteria, you can see the shadow of the EM idea, so my intuition is that the two criteria and the EM idea are equivalent, or that one contains the other. Proving this conjecture is beyond my ability; I only put the viewpoint forward here.
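As a concrete statement of the minimum within-class distance criterion, the K-means objective can be written down directly (a standard formulation, given here for reference):

\[
J = \sum_{k=1}^{K} \sum_{x_i \in C_k} \lVert x_i - \mu_k \rVert^2 ,
\qquad
\mu_k = \frac{1}{\lvert C_k \rvert} \sum_{x_i \in C_k} x_i ,
\]

where C_k is the set of samples assigned to cluster k. K-means alternates between reassigning samples to the nearest \mu_k and recomputing each \mu_k, which is already the alternating structure that the EM idea formalizes.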
The EM idea originates from maximum likelihood estimation. Maximum likelihood estimation is essentially this: given an existing sample set whose population distribution form is known, estimate the parameters of that distribution so that, under the estimated parameters, the probability of drawing exactly the observed sample set is maximized. Note that the samples in such a set are not independent and identically distributed: samples from different classes follow different distributions. This is one difference between supervised and unsupervised learning, since the samples in supervised learning are generally i.i.d. This raises a question: must labeled training samples always be handled with a supervised method? Usually yes, but sometimes, for samples in the region where classes overlap, unsupervised learning yields a lower error rate than supervised learning. The root cause is that supervised and unsupervised learning make different assumptions about the population distribution of the training samples. So for a labeled training set, one can first run hypothesis tests on the samples of the different classes, for example testing whether the class variances differ significantly, and then decide between supervised and unsupervised learning.
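As a standard worked example of the principle: for n i.i.d. samples x_1, ..., x_n drawn from a single Gaussian N(\mu, \sigma^2), setting the derivatives of the log-likelihood to zero gives the familiar closed-form estimates

\[
\hat{\mu} = \frac{1}{n} \sum_{i=1}^{n} x_i ,
\qquad
\hat{\sigma}^2 = \frac{1}{n} \sum_{i=1}^{n} (x_i - \hat{\mu})^2 .
\]

The complication in clustering is exactly that the class labels are unknown, so this closed-form route is no longer available and an iterative scheme such as EM is needed.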
For a good introduction to the EM idea, see the blog: http://blog.csdn.net/zouxy09.
Now for the Gaussian mixture model (GMM). Fitting a GMM is essentially a special case of the EM algorithm: it assumes that the samples of different classes in the training set follow different Gaussian distributions. Parameters are estimated by constructing the maximum likelihood function L(P, Z), which has two unknowns: P, the population distribution parameters, and Z, the class assignments of the samples. Differentiating L directly cannot solve for P and Z at the same time, but if either one is known, the other can be solved for. So: first randomly assign classes to the training samples, then estimate the parameters P, then use the population distribution under the estimated P to update the samples' class assignments. Iterating this loop drives the likelihood L(P, Z) toward a maximum (see the derivation of the EM algorithm for the details), though it may only be a local maximum. EM is a great algorithm, and its derivation shows the endless charm of mathematics; I won't go into the details here, savor it when you have time!
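To make the difficulty concrete: with parameters P = {p_k, \mu_k, \Sigma_k}, the marginal log-likelihood of the data is

\[
\log L(P) = \sum_{i=1}^{N} \log \sum_{k=1}^{K} p_k \, \mathcal{N}(x_i \mid \mu_k, \Sigma_k) ,
\]

and the sum sitting inside the logarithm is why setting the derivative to zero yields no closed-form solution. Fixing the class assignments Z (or, more precisely, their posterior probabilities) removes the inner sum and makes the maximization over P tractable, which is exactly the alternation described above.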
Finally, here is a C++ implementation of the GMM algorithm. Source code like this is easy to find, but the versions I looked at were hard to understand, so I spent two days writing my own; I hope it helps! For the GMM theory behind the code, see:
http://blog.csdn.net/junnan321/article/details/8483351
Header file:

#ifndef MYGMM_H
#define MYGMM_H

class MyGmm{
public:
    MyGmm():m_dtNum(0), m_dtDim(0), m_numCluster(0), m_data(0), m_weight(0),
            m_p(0), m_mean(0), m_covariance(0), m_maxiter(0){}
    ~MyGmm(){ Dispose(); }

    /**************************************
    *The most important interface for users.
    ***************************************/
    void Train(double** data, int dtNum, int dtDim, int numCluster, int maxiter);

    /*************************************
    *Interfaces that show the internal data.
    **************************************/
    void ShowWeight();
    void ShowP();
    void ShowMean();
    void ShowCovariance();
    void ShowData();

private:
    /************************
    *Prevent copy and assign.
    *************************/
    MyGmm(const MyGmm&);
    const MyGmm& operator=(const MyGmm&);

    /*************************************************
    *Repeat the E step and M step until a stopping
    *condition of your choice is reached.
    **************************************************/
    void Iterate();

    /********
    *E step.
    *********/
    void ComputeEstimationWeight();

    /********
    *M step.
    *********/
    void ComputeMaximizationCondition();

    /***********************************************
    *Set the dynamically allocated buffers to zero,
    *such as m_weight, m_p, etc.
    ************************************************/
    void SetM_CovarianceZero();
    void SetM_PZero();
    void SetM_WeightZero();
    void SetM_MeanZero();

    /**********************************************************
    *Initialize a random clustering, then compute the prior
    *probability of each class, the mean of each class's
    *samples and the covariance of each class.
    ***********************************************************/
    void InitRandomCluster();

    /**************************************
    *Allocate temp space such as m_weight,
    *m_p, etc.
    ***************************************/
    bool AllocateTempSpace(double** data, int dtNum, int dtDim, int numCluster, int maxiter);

    /*********************************************************************
    *Release the dynamically allocated buffers, such as m_weight, m_p, etc.
    **********************************************************************/
    bool Dispose();

private:
    int m_dtNum;            //number of samples
    int m_dtDim;            //dimensions of a sample
    int m_numCluster;       //number of clusters you want to classify
    double** m_data;        //dataset of samples

    /*E step use*/
    double** m_weight;      //estimated probability that each sample belongs to each class

    /*M step use*/
    double* m_p;            //prior probability of each class over the whole sample set
    double** m_mean;        //mean of each class's samples
    double*** m_covariance; //covariance of each class
    int m_maxiter;          //maximum number of iterations
};

#endif
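Train expects the samples as a raw double** with one row per sample and one column per dimension. A minimal sketch of preparing that layout from a flat vector (buildData is a hypothetical helper of mine, not part of the class):

#include <vector>

/*Hypothetical helper, not part of MyGmm: pack a flat std::vector<double>
  holding n samples of dimension d (row-major) into the double** layout
  that MyGmm::Train expects. The caller must delete the rows afterwards.*/
double** buildData(const std::vector<double>& flat, int n, int d){
    double** data = new double*[n];
    for(int i=0; i<n; i++){
        data[i] = new double[d];
        for(int j=0; j<d; j++){
            data[i][j] = flat[i*d + j];   //sample i, dimension j
        }
    }
    return data;
}

The test program at the end of this post builds the same layout by hand.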
Implementation file:

#include <iostream>
#include <cassert>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include "MyGmm.h"

/*Train interface*/
void MyGmm::Train(double** data, int dtNum, int dtDim, int numCluster, int maxiter){
    assert(data!=0 && dtNum>0 && dtDim>0 && numCluster>0 && maxiter>0);
    AllocateTempSpace(data, dtNum, dtDim, numCluster, maxiter);
    InitRandomCluster();
    Iterate();
}

/*Allocate temp space for compute use*/
bool MyGmm::AllocateTempSpace(double** data, int dtNum, int dtDim, int numCluster, int maxiter){
    m_data = data;
    m_dtNum = dtNum;
    m_dtDim = dtDim;
    m_numCluster = numCluster;
    m_maxiter = maxiter;

    m_weight = new double*[m_dtNum];
    int i = 0;
    while( i<m_dtNum ){
        m_weight[i] = new double[m_numCluster];
        memset(m_weight[i], 0, sizeof(double)*m_numCluster);
        i++;
    }

    m_p = new double[m_numCluster];
    memset(m_p, 0, sizeof(double)*m_numCluster);

    m_mean = new double*[m_numCluster];
    i = 0;
    while( i<m_numCluster ){
        m_mean[i] = new double[m_dtDim];
        memset(m_mean[i], 0, sizeof(double)*m_dtDim);
        i++;
    }

    m_covariance = new double**[m_numCluster];
    for(i=0; i<m_numCluster; i++){
        m_covariance[i] = new double*[m_dtDim];
        int j = 0;
        while( j<m_dtDim ){
            m_covariance[i][j] = new double[m_dtDim];
            memset(m_covariance[i][j], 0, sizeof(double)*m_dtDim);
            j++;
        }
    }
    return true;
}

/*Initialize clusters with random assignments*/
void MyGmm::InitRandomCluster(){
    int* randomClass = new int[m_dtNum];    //random class labels
    int i;

    /*initial E step: assign each sample to a random cluster.
      Note: if a cluster happens to receive no samples, the divisions
      below divide by zero; rerun or seed differently in that case.*/
    for(i=0; i<m_dtNum; i++){
        randomClass[i] = rand() % m_numCluster;
        std::cout<<randomClass[i]<<" ";
    }
    std::cout<<std::endl;

    /*initial M step: compute m_p, m_mean, m_covariance*/
    for(i=0; i<m_dtNum; i++){
        m_p[randomClass[i]]++;              //count samples per class
        for(int j=0; j<m_dtDim; j++){
            m_mean[randomClass[i]][j] += m_data[i][j];  //sum of each class's samples
        }
    }

    /*mean of each class's samples, m_mean*/
    for(i=0; i<m_numCluster; i++){
        for(int j=0; j<m_dtDim; j++){
            m_mean[i][j] /= m_p[i];
        }
    }

    /*covariance of each class, m_covariance: accumulate each sample's
      outer product into its own cluster once, then divide by the count*/
    double* temp = new double[m_dtDim];
    memset(temp, 0, sizeof(double)*m_dtDim);
    for(i=0; i<m_dtNum; i++){
        for(int k=0; k<m_dtDim; k++){
            temp[k] = m_data[i][k] - m_mean[randomClass[i]][k];
        }
        for(int q=0; q<m_dtDim; q++){
            for(int p=0; p<m_dtDim; p++){
                m_covariance[randomClass[i]][q][p] += temp[q]*temp[p];
            }
        }
    }
    for(i=0; i<m_numCluster; i++){
        for(int q=0; q<m_dtDim; q++){
            for(int p=0; p<m_dtDim; p++){
                m_covariance[i][q][p] /= m_p[i];
            }
        }
    }

    /*prior probability of each class, m_p*/
    for(i=0; i<m_numCluster; i++){
        m_p[i] /= m_dtNum;
    }

    /*release memory*/
    delete[] temp;
    delete[] randomClass;
}

/*Iterate clustering*/
void MyGmm::Iterate(){
    bool loop = true;
    int iter = 1;
    while(loop){
        ComputeEstimationWeight();
        ComputeMaximizationCondition();
        iter++;
        /*Add your own convergence condition here!*/
        if(iter>m_maxiter)
            loop = false;
    }
}

/*Compute estimation weights (E step)*/
void MyGmm::ComputeEstimationWeight(){
    int i;
    double* each_dis = new double[m_numCluster];    //each sample's density under each Gaussian
    for(i=0; i<m_dtNum; i++){
        /*initialize the running products to 1.0 (memset cannot do this for doubles)*/
        for(int j=0; j<m_numCluster; j++){
            each_dis[j] = 1.0;
        }
        double full_dis = 0;    //the sample's density under the full mixture

        /*compute the sample's density under each Gaussian*/
        for(int j=0; j<m_numCluster; j++){
            double all_cov = 1.0;
            for(int k=0; k<m_dtDim; k++){
                /*each dimension of a sample is treated as independent
                  (a diagonal covariance); if the dimensions are correlated,
                  change the following expressions to use the full matrix*/
                all_cov *= m_covariance[j][k][k];
                each_dis[j] *= exp(-0.5*(m_data[i][k]-m_mean[j][k])*(m_data[i][k]-m_mean[j][k])
                                   /m_covariance[j][k][k]);
            }
            /*normalizer of a D-dimensional diagonal Gaussian: 1/sqrt((2*pi)^D * |Sigma|)*/
            each_dis[j] *= 1.0 / sqrt(pow(2*3.14159, m_dtDim) * all_cov);
            full_dis += each_dis[j]*m_p[j];
        }
        for(int k=0; k<m_numCluster; k++){
            m_weight[i][k] = (each_dis[k]*m_p[k]) / full_dis;
        }
    }
    delete[] each_dis;
}
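/*For reference: with the diagonal-covariance assumption used above,
  ComputeEstimationWeight evaluates the standard E-step responsibilities
      gamma(i,k) = p_k * N(x_i | mu_k, Sigma_k) / sum_j p_j * N(x_i | mu_j, Sigma_j)
  where
      N(x | mu_k, Sigma_k) = (2*pi)^(-D/2) * (prod_d sigma_kd^2)^(-1/2)
                             * exp(-0.5 * sum_d (x_d - mu_kd)^2 / sigma_kd^2).
  In the code, m_weight[i][k] stores gamma(i,k) and m_p[k] stores p_k.*/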
/*Compute the maximization step according to the estimation weights (M step)*/
void MyGmm::ComputeMaximizationCondition(){
    int i;

    /*compute the new m_p: first the effective sample count of each class*/
    SetM_PZero();
    double total = 0.0;
    for(i=0; i<m_numCluster; i++){
        for(int j=0; j<m_dtNum; j++){
            m_p[i] += m_weight[j][i];
        }
        total += m_p[i];
    }

    /*compute the new m_mean*/
    SetM_MeanZero();
    for(int k=0; k<m_numCluster; k++){
        for(int l=0; l<m_dtNum; l++){
            for(int m=0; m<m_dtDim; m++){
                m_mean[k][m] += m_weight[l][k]*m_data[l][m];
            }
        }
        for(int m=0; m<m_dtDim; m++){
            m_mean[k][m] /= m_p[k];
        }
    }

    /*compute the new m_covariance*/
    double* temp = new double[m_dtDim];
    memset(temp, 0, sizeof(double)*m_dtDim);
    SetM_CovarianceZero();
    for(i=0; i<m_numCluster; i++){
        for(int j=0; j<m_dtNum; j++){
            for(int k=0; k<m_dtDim; k++){
                temp[k] = m_data[j][k] - m_mean[i][k];
            }
            for(int q=0; q<m_dtDim; q++){
                for(int p=0; p<m_dtDim; p++){
                    m_covariance[i][q][p] += m_weight[j][i]*temp[q]*temp[p];
                }
            }
        }
        for(int q=0; q<m_dtDim; q++){
            for(int p=0; p<m_dtDim; p++){
                m_covariance[i][q][p] /= m_p[i];
            }
        }
    }

    /*normalize m_p into probabilities*/
    for(int j=0; j<m_numCluster; j++){
        m_p[j] /= total;
    }

    /*release memory*/
    delete[] temp;
}
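/*For reference: ComputeMaximizationCondition implements the standard M-step
  updates driven by the responsibilities gamma(i,k) held in m_weight:
      N_k     = sum_i gamma(i,k)                            (accumulated in m_p)
      mu_k    = (1/N_k) * sum_i gamma(i,k) * x_i
      Sigma_k = (1/N_k) * sum_i gamma(i,k) * (x_i - mu_k)(x_i - mu_k)^T
      p_k     = N_k / N                                     (the final normalization)*/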
/*Delete the temp space*/
bool MyGmm::Dispose(){
    if(m_weight!=0){
        int i = 0;
        while( i<m_dtNum ){
            delete[] m_weight[i];
            i++;
        }
        delete[] m_weight;
    }
    if(m_p!=0){
        delete[] m_p;
    }
    if(m_mean!=0){
        int i = 0;
        while( i<m_numCluster ){
            delete[] m_mean[i];
            i++;
        }
        delete[] m_mean;
    }
    if(m_covariance!=0){
        for(int i=0; i<m_numCluster; i++){
            for(int j=0; j<m_dtDim; j++){
                delete[] m_covariance[i][j];
            }
            delete[] m_covariance[i];
        }
        delete[] m_covariance;
    }
    return true;
}

/*Set the temp buffers to zero*/
void MyGmm::SetM_CovarianceZero(){
    for(int i=0; i<m_numCluster; i++){
        int j = 0;
        while( j<m_dtDim ){
            memset(m_covariance[i][j], 0, sizeof(double)*m_dtDim);
            j++;
        }
    }
}

void MyGmm::SetM_PZero(){
    memset(m_p, 0, sizeof(double)*m_numCluster);
}

void MyGmm::SetM_WeightZero(){
    int i = 0;
    while( i<m_dtNum ){
        memset(m_weight[i], 0, sizeof(double)*m_numCluster);
        i++;
    }
}

void MyGmm::SetM_MeanZero(){
    int i = 0;
    while( i<m_numCluster ){
        memset(m_mean[i], 0, sizeof(double)*m_dtDim);
        i++;
    }
}

/*Show data in the temp buffers, also for debug use*/
void MyGmm::ShowWeight(){
    for(int i=0; i<m_dtNum; i++){
        for(int j=0; j<m_numCluster; j++){
            std::cout<<m_weight[i][j]<<" ";
        }
        std::cout<<std::endl;
    }
}

void MyGmm::ShowP(){
    for(int i=0; i<m_numCluster; i++)
        std::cout<<m_p[i]<<" ";
    std::cout<<"\n";
}

void MyGmm::ShowMean(){
    for(int i=0; i<m_numCluster; i++){
        for(int j=0; j<m_dtDim; j++){
            std::cout<<m_mean[i][j]<<" ";
        }
        std::cout<<"\n";
    }
}

void MyGmm::ShowCovariance(){
    for(int i=0; i<m_numCluster; i++){
        for(int j=0; j<m_dtDim; j++){
            for(int k=0; k<m_dtDim; k++){
                std::cout<<m_covariance[i][j][k]<<" ";
            }
            std::cout<<"\n";
        }
        std::cout<<"\n";
    }
}

void MyGmm::ShowData(){
    for(int i=0; i<m_dtNum; i++){
        for(int j=0; j<m_dtDim; j++){
            std::cout<<m_data[i][j]<<" ";
        }
        std::cout<<std::endl;
    }
}

Test file:

#include <iostream>
#include "MyGmm.h"
using namespace std;

int main()
{
    double** test_data1 = new double*[10];
    int i;
    for(i=0; i<10; i++)
        test_data1[i] = new double[3];

    test_data1[0][0]=8;  test_data1[0][1]=4;  test_data1[0][2]=9;
    test_data1[1][0]=2;  test_data1[1][1]=3;  test_data1[1][2]=5;
    test_data1[2][0]=7;  test_data1[2][1]=6;  test_data1[2][2]=1;
    test_data1[3][0]=45; test_data1[3][1]=10; test_data1[3][2]=12;
    test_data1[4][0]=14; test_data1[4][1]=18; test_data1[4][2]=7;
    test_data1[5][0]=1;  test_data1[5][1]=18; test_data1[5][2]=22;
    test_data1[6][0]=4;  test_data1[6][1]=8;  test_data1[6][2]=17;
    test_data1[7][0]=11; test_data1[7][1]=18; test_data1[7][2]=27;
    test_data1[8][0]=24; test_data1[8][1]=18; test_data1[8][2]=27;
    test_data1[9][0]=19; test_data1[9][1]=18; test_data1[9][2]=32;

    MyGmm mg;
    mg.Train(test_data1, 10, 3, 3, 7);
    mg.ShowWeight();

    for(i=0; i<10; i++)     //free all 10 rows
        delete[] test_data1[i];
    delete[] test_data1;
    return 0;
}
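One caveat: InitRandomCluster draws the initial assignments with rand() but never seeds the generator, so every run starts from the same initial clustering, and, as noted above, EM may stop at a local maximum. If you want different initializations across runs, seed the generator once before calling Train; a minimal sketch:

#include <cstdlib>
#include <ctime>
#include "MyGmm.h"

int main(){
    srand(static_cast<unsigned>(time(0)));  //seed once, before MyGmm::Train
    /* ...build test_data1 and call mg.Train(...) as in the test file above... */
    return 0;
}

A common practice is to train several times from different random initializations and keep the result with the highest likelihood.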