In our data mining course the teacher introduced the ID3 decision tree algorithm, and in my spare time I implemented it in C++.
The decision tree algorithm is a very commonly used classification method for approximating discrete-valued target functions; the learned function is represented as a decision tree. The basic idea is to repeatedly select the attribute that yields the largest information gain to partition the example set, building the tree as you go. Information gain is defined as the difference between a node's information entropy and the weighted entropy of its children. Information entropy, introduced by Shannon, describes the impurity (instability) of a collection; its formula is

Entropy(S) = -\sum_i p_i \log_2 p_i

where p_i is the proportion of examples in the collection belonging to class i (for binary classification, the positive and negative examples). Information gain can then be defined as the expected reduction in entropy caused by partitioning the examples according to an attribute; it measures how well that attribute separates the positive and negative training examples. Its formula is

Gain(S, A) = Entropy(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} Entropy(S_v)

where S_v is the subset of S whose value of attribute A is v.
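To make the formulas concrete: the training set given at the end of this post has 14 examples, 9 positive and 5 negative, so

Entropy(S) = -(9/14)\log_2(9/14) - (5/14)\log_2(5/14) \approx 0.940

and splitting on Outlook (5 Sunny with entropy 0.971, 4 Overcast with entropy 0, 5 Rainy with entropy 0.971) gives

Gain(S, Outlook) = 0.940 - (5/14)(0.971) - (4/14)(0) - (5/14)(0.971) \approx 0.247,

the largest gain among the four attributes, which is why Outlook ends up at the root of the tree.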
The example set my implementation targets is the classic weather table (listed in full as the training data at the end of this post). The table records whether a game was played under various weather conditions, and the task is for the program to output the decision tree for that table.

The C++ code follows; the program is commented in detail.
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cmath>
using namespace std;
#define MAXLEN 6 // number of fields in each input row

// Implementing a multi-way tree; the usual options are:
// 1 generalized list
// 2 parent-pointer representation, suited to frequently finding a node's parent
// 3 child-list representation, suited to frequently finding a node's children
// 4 leftmost-child/right-sibling representation, comparatively fiddly
// 5 keep every node's children in a vector
// Lesson learned: data-structure design matters. Option 5 fits this
// algorithm best. Also take care to maintain the remaining examples and
// remaining attributes: tree building walks attribute values horizontally
// with a loop and descends vertically by recursion.

vector <vector <string> > state;   // the example set
vector <string> item(MAXLEN);      // one row of the example set
vector <string> attribute_row;     // the first row, i.e. the attribute names
string end_marker("end");          // input sentinel (named to avoid ambiguity with std::end)
string yes("yes");
string no("no");
string blank("");
map<string, vector<string> > map_attribute_values; // every value each attribute takes
int tree_size = 0;
struct Node{ // decision tree node
    string attribute;      // attribute tested at this node, or the class label (yes/no) at a leaf
    string arrived_value;  // attribute value on the edge coming from the parent
    vector<Node *> childs; // all children
    Node(){
        attribute = blank;
        arrived_value = blank;
    }
};
Node * root = NULL;

// Build the attribute -> values map from the 2-D example table
void ComputeMapFrom2DVector(){
    unsigned int i,j,k;
    bool exists = false;
    vector<string> values;
    for(i = 1; i < MAXLEN-1; i++){ // walk the attribute columns (skip Day and the label)
        for (j = 1; j < state.size(); j++){
            for (k = 0; k < values.size(); k++){
                if(!values[k].compare(state[j][i])) exists = true;
            }
            if(!exists){
                values.push_back(state[j][i]); // record each distinct value once
            }
            exists = false;
        }
        map_attribute_values[state[0][i]] = values;
        values.erase(values.begin(), values.end());
    }
}

// Compute the entropy of the examples restricted to attribute == value
// (or of the whole set when ifparent is true)
double ComputeEntropy(vector <vector <string> > remain_state, string attribute, string value, bool ifparent){
    vector<int> count (2,0);
    unsigned int i,j;
    bool done_flag = false; // sentinel: stop once the attribute column is found
    for(j = 1; j < MAXLEN; j++){
        if(done_flag) break;
        if(!attribute_row[j].compare(attribute)){
            for(i = 1; i < remain_state.size(); i++){
                if((!ifparent && !remain_state[i][j].compare(value)) || ifparent){ // ifparent: entropy of the parent node
                    if(!remain_state[i][MAXLEN - 1].compare(yes)){
                        count[0]++;
                    }
                    else count[1]++;
                }
            }
            done_flag = true;
        }
    }
    if(count[0] == 0 || count[1] == 0) return 0; // all positive or all negative
    // entropy of [+count[0], -count[1]]; log base 2 via the change-of-base formula
    double sum = count[0] + count[1];
    double entropy = -count[0]/sum*log(count[0]/sum)/log(2.0) - count[1]/sum*log(count[1]/sum)/log(2.0);
    return entropy;
}

// Information gain of splitting the current remaining examples on attribute
double ComputeGain(vector <vector <string> > remain_state, string attribute){
    unsigned int j,k,m;
    // first the entropy before the split
    double parent_entropy = ComputeEntropy(remain_state, attribute, blank, true);
    double children_entropy = 0;
    // then the entropy of each value's subset after the split
    vector<string> values = map_attribute_values[attribute];
    vector<double> ratio;
    vector<int> count_values;
    int tempint;
    for(m = 0; m < values.size(); m++){
        tempint = 0;
        for(k = 1; k < MAXLEN - 1; k++){
            if(!attribute_row[k].compare(attribute)){
                for(j = 1; j < remain_state.size(); j++){
                    if(!remain_state[j][k].compare(values[m])){
                        tempint++;
                    }
                }
            }
        }
        count_values.push_back(tempint);
    }

    for(j = 0; j < values.size(); j++){
        ratio.push_back((double)count_values[j] / (double)(remain_state.size()-1));
    }
    double temp_entropy;
    for(j = 0; j < values.size(); j++){
        temp_entropy = ComputeEntropy(remain_state, attribute, values[j], false);
        children_entropy += ratio[j] * temp_entropy;
    }
    return (parent_entropy - children_entropy);
}

int FindAttriNumByName(string attri){
    for(int i = 0; i < MAXLEN; i++){
        if(!state[0][i].compare(attri)) return i;
    }
    cerr<<"can't find the column index of attribute "<<attri<<endl;
    return 0;
}

// Return whichever label (positive/negative) is in the majority
string MostCommonLabel(vector <vector <string> > remain_state){
    int p = 0, n = 0;
    for(unsigned i = 1; i < remain_state.size(); i++){ // skip the attribute header row
        if(!remain_state[i][MAXLEN-1].compare(yes)) p++;
        else n++;
    }
    if(p >= n) return yes;
    else return no;
}

// Check whether every example carries the same label
bool AllTheSameLabel(vector <vector <string> > remain_state, string label){
    int count = 0;
    for(unsigned int i = 1; i < remain_state.size(); i++){
        if(!remain_state[i][MAXLEN-1].compare(label)) count++;
    }
    if(count == (int)remain_state.size()-1) return true;
    else return false;
}

// Compute information gains and build the decision tree depth-first
// p:                current node
// remain_state:     remaining examples still to classify
// remain_attribute: attributes not yet considered
// returns the (sub)tree root pointer
Node * BuildDecisionTreeDFS(Node * p, vector <vector <string> > remain_state, vector <string> remain_attribute){
    if (p == NULL)
        p = new Node();
    // first handle the leaf cases
    if (AllTheSameLabel(remain_state, yes)){
        p->attribute = yes;
        return p;
    }
    if (AllTheSameLabel(remain_state, no)){
        p->attribute = no;
        return p;
    }
    if(remain_attribute.size() == 0){ // all attributes used up but examples still mixed
        string label = MostCommonLabel(remain_state);
        p->attribute = label;
        return p;
    }

    double max_gain = 0, temp_gain;
    vector <string>::iterator max_it = remain_attribute.begin();
    vector <string>::iterator it1;
    for(it1 = remain_attribute.begin(); it1 < remain_attribute.end(); it1++){
        temp_gain = ComputeGain(remain_state, (*it1));
        if(temp_gain > max_gain) {
            max_gain = temp_gain;
            max_it = it1;
        }
    }
    // split the current examples on the attribute max_it points to,
    // updating the example set and the attribute set
    vector <string> new_attribute;
    vector <vector <string> > new_state;
    for(vector <string>::iterator it2 = remain_attribute.begin(); it2 < remain_attribute.end(); it2++){
        if((*it2).compare(*max_it)) new_attribute.push_back(*it2);
    }
    // record the chosen split attribute
    p->attribute = *max_it;
    vector <string> values = map_attribute_values[*max_it];
    int attribute_num = FindAttriNumByName(*max_it);
    new_state.push_back(attribute_row);
    for(vector <string>::iterator it3 = values.begin(); it3 < values.end(); it3++){
        for(unsigned int i = 1; i < remain_state.size(); i++){
            if(!remain_state[i][attribute_num].compare(*it3)){
                new_state.push_back(remain_state[i]);
            }
        }
        Node * new_node = new Node();
        new_node->arrived_value = *it3;
        if(new_state.size() == 1){ // only the header row left: no example reaches this branch, so new_node is a leaf
            new_node->attribute = MostCommonLabel(remain_state);
        }
        else
            BuildDecisionTreeDFS(new_node, new_state, new_attribute);
        // on return (backtracking): 1 attach the new node to the parent's children,
        // 2 clear this value's examples from new_state before trying the next value
        p->childs.push_back(new_node);
        new_state.erase(new_state.begin()+1, new_state.end());
    }
    return p;
}

void Input(){
    string s;
    while(cin>>s, s.compare(end_marker) != 0){ // the sentinel "end" terminates input
        item[0] = s;
        for(int i = 1; i < MAXLEN; i++){
            cin>>item[i];
        }
        state.push_back(item); // the header row with the attribute names is stored too
    }
    for(int j = 0; j < MAXLEN; j++){
        attribute_row.push_back(state[0][j]);
    }
}

void PrintTree(Node *p, int depth){
    for (int i = 0; i < depth; i++) cout << '\t'; // indent by tree depth
    if(!p->arrived_value.empty()){
        cout<<p->arrived_value<<endl;
        for (int i = 0; i < depth+1; i++) cout << '\t';
    }
    cout<<p->attribute<<endl;
    for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++){
        PrintTree(*it, depth + 1);
    }
}

void FreeTree(Node *p){
    if (p == NULL)
        return;
    for (vector<Node*>::iterator it = p->childs.begin(); it != p->childs.end(); it++){
        FreeTree(*it);
    }
    delete p;
    tree_size++; // count the nodes as they are freed
}

int main(){
    Input();
    vector <string> remain_attribute;

    string outlook("Outlook");
    string Temperature("Temperature");
    string Humidity("Humidity");
    string Wind("Wind");
    remain_attribute.push_back(outlook);
    remain_attribute.push_back(Temperature);
    remain_attribute.push_back(Humidity);
    remain_attribute.push_back(Wind);
    vector <vector <string> > remain_state;
    for(unsigned int i = 0; i < state.size(); i++){
        remain_state.push_back(state[i]);
    }
    ComputeMapFrom2DVector();
    root = BuildDecisionTreeDFS(root, remain_state, remain_attribute);
    cout<<"the decision tree is :"<<endl;
    PrintTree(root, 0);
    FreeTree(root);
    cout<<endl;
    cout<<"tree_size:"<<tree_size<<endl;
    return 0;
}
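To try the program, any standard C++ compiler should work, e.g. g++ id3.cpp -o id3 (the file name here is just an illustration), then paste the training data on standard input.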
The training data fed to the program is as follows:
Day  Outlook   Temperature  Humidity  Wind    PlayTennis
1    Sunny     Hot          High      Weak    no
2    Sunny     Hot          High      Strong  no
3    Overcast  Hot          High      Weak    yes
4    Rainy     Mild         High      Weak    yes
5    Rainy     Cool         Normal    Weak    yes
6    Rainy     Cool         Normal    Strong  no
7    Overcast  Cool         Normal    Strong  yes
8    Sunny     Mild         High      Weak    no
9    Sunny     Cool         Normal    Weak    yes
10   Rainy     Mild         Normal    Weak    yes
11   Sunny     Mild         Normal    Strong  yes
12   Overcast  Mild         High      Strong  yes
13   Overcast  Hot          Normal    Weak    yes
14   Rainy     Mild         High      Strong  no
end
For this data the program prints the decision tree below (children are indented by tabs, with the value on the incoming edge printed above each node's attribute or label):

the decision tree is :
Outlook
	Sunny
		Humidity
		High
			no
		Normal
			yes
	Overcast
		yes
	Rainy
		Wind
		Weak
			yes
		Strong
			no

tree_size:8
With the decision tree in hand, we can make predictions from the weather conditions.
For example, if the conditions are {Sunny, Cool, Normal, Strong}, following the tree (Outlook = Sunny, then Humidity = Normal) leads to a yes leaf, so we conclude the game will be played.
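That walk can also be automated. Below is a minimal sketch of such a lookup (not part of the original program; Classify is a hypothetical helper added for illustration). It reuses the Node structure and MostCommonLabel from above, and must of course be called before FreeTree releases the nodes:

// Sketch: classify one sample by walking the decision tree.
// 'sample' maps attribute name -> value, e.g. sample["Outlook"] = "Sunny".
string Classify(Node *p, map<string, string> &sample){
    if(p->childs.empty()) return p->attribute; // leaf: attribute holds the label
    string v = sample[p->attribute];           // the sample's value for the attribute tested here
    for(unsigned int i = 0; i < p->childs.size(); i++){
        if(!p->childs[i]->arrived_value.compare(v))
            return Classify(p->childs[i], sample);
    }
    return MostCommonLabel(state);             // value unseen on this branch: fall back to the majority label
}

For the example above, filling a map with sample["Outlook"] = "Sunny", sample["Temperature"] = "Cool", sample["Humidity"] = "Normal", sample["Wind"] = "Strong" and calling Classify(root, sample) would return yes.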
A final note: I took a wrong turn on the data-structure design while writing this program. There are many ways to implement a multi-way tree, and keeping each node's children in a vector suits this algorithm best. You also have to maintain the remaining-example and remaining-attribute sets carefully: tree construction walks attribute values horizontally with a loop and descends vertically by recursion, so the whole build is a DFS. Tree and graph traversals come up constantly in programming and are worth mastering. Some parts of the program could still be made more efficient; comments and criticism are welcome.
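For instance, ComputeEntropy, ComputeGain, MostCommonLabel and AllTheSameLabel all receive the remaining example table by value, copying the whole 2-D vector at every call and every level of recursion. A sketch of one easy fix (signatures only, untested against the rest of the program) is to pass it by const reference:

// Same functions, taking the example table by const reference so that
// recursive calls stop copying the whole 2-D vector of strings.
double ComputeEntropy(const vector<vector<string> > &remain_state,
                      const string &attribute, const string &value, bool ifparent);
double ComputeGain(const vector<vector<string> > &remain_state, const string &attribute);
string MostCommonLabel(const vector<vector<string> > &remain_state);
bool AllTheSameLabel(const vector<vector<string> > &remain_state, const string &label);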