# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 19:01:40 2018

@author: markli

Build a decision tree using information gain as the feature-selection criterion (ID3).
"""
import numpy as np
import pandas as pd

class DecisionTree(object):
    def __init__(self, features):
        """
        features: the names of the features the samples carry,
                  e.g. with two features A1 and A2, features = [A1, A2]
        """
        self.features = features

    def fit(self, TrainData):
        """
        TrainData: training samples, a 2-D array of shape m*n;
        m samples, n-1 features, the last column is the class label.
        """
        # pass a copy so that building the tree does not mutate self.features
        tree = self.GetDicTree(TrainData, list(self.features))
        return tree

    def SetEntropy(self, Data):
        """
        Empirical entropy of a data set.
        Data: data set whose last column holds the class labels.
        """
        N = len(Data[:, -1])                        # size of the data set
        Numoflabel = pd.value_counts(Data[:, -1])   # number of samples in each class
        classlabel = list(set(Data[:, -1]))
        entropy = 0                                 # entropy of the data set
        for c in classlabel:
            Ck = Numoflabel[c]                      # number of samples belonging to class c
            entropy = entropy - Ck / N * np.log2(Ck / N)
        return entropy

    def ConditionEntropy(self, Data, index):
        """
        Empirical conditional entropy of the data set given one feature.
        Data: data set in the same format as TrainData.
        index: position of the feature in the data set, starting from 0.
        """
        ConEntropy = 0
        feature_value = list(set(Data[:, index]))
        N = len(Data[:, 0])
        for a in feature_value:
            d = Data[np.where(Data[:, index] == a)]
            d_n = len(d)
            # entropy of the subset where the feature takes value a
            d_entropy = self.SetEntropy(d)
            # conditional entropy is the weighted sum of the subset entropies
            ConEntropy = ConEntropy + (d_n / N) * d_entropy
        return ConEntropy

    def SelectBestFeature(self, Data):
        """
        Select the feature with the largest information gain, i.e. the best feature.
        Data: data set in the same format as TrainData.
        """
        AddEntropy = []
        entropy = self.SetEntropy(Data)     # empirical entropy of the data set
        feature_num = len(Data[0]) - 1      # number of features in the data set
        for i in range(feature_num):
            ConEntropy = self.ConditionEntropy(Data, i)  # conditional entropy of each feature
            adden = entropy - ConEntropy                 # information gain
            AddEntropy.append(adden)

        index = np.argmax(AddEntropy)
        return index

    def VoteClass(self, classlist):
        """
        When all features have been used but the class is still ambiguous,
        decide the class by majority vote.
        """
        dic = {}
        for c in classlist:
            dic[c] = dic.get(c, 0) + 1
        return max(dic, key=dic.get)

    def GetDicTree(self, TrainData, features):
        """
        Build the dict tree.
        TrainData: training data set.
        features: names of the remaining features.
        """
        classlabel = list(set(TrainData[:, -1]))   # class labels present in the data set

        if len(classlabel) == 1:
            return classlabel[0]

        if len(TrainData[0]) == 1:
            return self.VoteClass(TrainData[:, -1])

        bestfeature_index = self.SelectBestFeature(TrainData)
        bestfeature = features[bestfeature_index]   # the best feature
        dictree = {bestfeature: {}}                 # build a subtree rooted at the best feature
        del features[bestfeature_index]             # remove the feature that has been used

        # split the data set on the values of the best feature and recurse
        feature_attr = list(set(TrainData[:, bestfeature_index]))
        for value in feature_attr:
            sub_features = features[:]
            subdata = self.SplitData(TrainData, bestfeature_index, value)
            dictree[bestfeature][value] = self.GetDicTree(subdata, sub_features)

        return dictree

    def SplitData(self, Data, feature_index, feature_value):
        subdata = Data[np.where(Data[:, feature_index] == feature_value)]
        n = len(Data[0])
        subdata = [[row[i] for i in range(n) if i != feature_index] for row in subdata]
        return np.array(subdata)
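A quick numerical sanity check may help make SetEntropy, ConditionEntropy and SelectBestFeature concrete. This is a minimal sketch, assuming the class above is in scope and using the same five-sample data set as the test code further below; the expected values in the comments follow from H(D) = -Σ (Ck/N)·log2(Ck/N) and H(D|A) = Σ (|Di|/|D|)·H(Di).

data = np.array([[1, 1, 'yes'],
                 [1, 1, 'yes'],
                 [1, 0, 'no'],
                 [0, 1, 'no'],
                 [0, 1, 'no']])
dt = DecisionTree(['no surfaceing', 'flippers'])
print(dt.SetEntropy(data))           # H(D) = -(2/5)log2(2/5) - (3/5)log2(3/5) ≈ 0.971
print(dt.ConditionEntropy(data, 0))  # (3/5)*0.918 + (2/5)*0 ≈ 0.551, gain ≈ 0.420
print(dt.ConditionEntropy(data, 1))  # (4/5)*1.000 + (1/5)*0 ≈ 0.800, gain ≈ 0.171
print(dt.SelectBestFeature(data))    # 0 -> the first feature has the larger gain and becomes the root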
The code above implements the basics of a decision tree built on information gain (ID3). Replacing the best-feature selection function with one based on the information gain ratio turns it into the C4.5 algorithm; a sketch of such a selection is given right below. Pruning is still not implemented; if you have implemented it, feel free to get in touch and discuss. The test code follows after the sketch.
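A minimal sketch of what that gain-ratio selection could look like as an extra method of DecisionTree, reusing SetEntropy and ConditionEntropy from the class above. The name SelectBestFeatureByGainRatio and the zero-guard for single-valued features are my own additions, not part of the original code.

def SelectBestFeatureByGainRatio(self, Data):
    """Select the feature with the largest information gain ratio (C4.5-style)."""
    entropy = self.SetEntropy(Data)          # empirical entropy H(D)
    feature_num = len(Data[0]) - 1
    N = len(Data)
    ratios = []
    for i in range(feature_num):
        gain = entropy - self.ConditionEntropy(Data, i)   # information gain g(D, A)
        # split information H_A(D): entropy of D with respect to the values of feature A
        split_info = 0
        for a in set(Data[:, i]):
            p = len(Data[np.where(Data[:, i] == a)]) / N
            split_info -= p * np.log2(p)
        # a feature with a single value gives split_info == 0; score it 0 to avoid division by zero
        ratios.append(gain / split_info if split_info > 0 else 0)
    return np.argmax(ratios)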
# -*- coding: utf-8 -*-
"""
Created on Thu Jan 25 16:55:25 2018

@author: markli
"""
import numpy as np
from DecisionTree_InformationAdd import DecisionTree

tree = DecisionTree(['no surfaceing', 'flippers'])
TrainData = np.array([[1, 1, 'yes'],
                      [1, 1, 'yes'],
                      [1, 0, 'no'],
                      [0, 1, 'no'],
                      [0, 1, 'no']])
print(tree.fit(TrainData))
{'no surfaceing': {'1': {'flippers': {'1': 'yes', '0': 'no'}}, '0': 'no'}}
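The nested dict that fit returns only records the splits; the class has no predict method. Below is a minimal sketch of how such a tree could be walked for a new sample; classify and its arguments are assumed helpers, not part of the original code. Note that the keys are the strings '1' and '0' because np.array casts the whole training array to strings.

def classify(dictree, feature_names, sample):
    """Walk the nested dict tree until a leaf (class label) is reached."""
    if not isinstance(dictree, dict):      # a leaf is just the class label
        return dictree
    feature = list(dictree.keys())[0]      # the feature this node splits on
    index = feature_names.index(feature)
    value = str(sample[index])             # keys are strings, so cast the sample value
    # a feature value never seen during training would raise KeyError here
    return classify(dictree[feature][value], feature_names, sample)

# for example, classify(tree, ['no surfaceing', 'flippers'], [1, 0]) returns 'no'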