決策樹模型練習:https://www.kaggle.com/c/GiveMeSomeCredit/overview
機器學習中分類和預測算法的評估:
a. 準確率
b. 速度
c. 強壯性
d. 可規模性
e. 可解釋性
https://scikit-learn.org/stable/modules/tree.html
變量的不確定性越大,熵也就越大。
生成後的決策樹
邏輯代碼:
整理好的代碼 --》
python3.6.3
Successfully installed joblib-0.13.2 numpy-1.16.4 scikit-learn-0.21.2 scipy-1.3.0
# -*- coding:utf-8 -*-
"""Train an entropy-based decision tree on a categorical table read from Excel.

A decision tree needs numeric input; string values such as "no" / "yes"
are not accepted, so every categorical column is expanded into 0/1
indicator columns first.

For example, a table such as

    ====================================
    age          income  student
    youth        high    no
    youth        high    no
    middle_aged  high    no
    senior       medium  no
    senior       low     yes
    ====================================

becomes a matrix such as

    youth  middle_aged  senior  high  medium  low  ......
    1      0            0       1     0       0
    1      0            0       1     0       0    .....
"""
import csv
from io import StringIO  # sklearn.externals.six was removed in scikit-learn 0.23

import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer


def _feature_names(vectorizer):
    """Return the vectorizer's output column names as a plain list.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; use whichever this installation provides.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


fileName = r"C:\Users\Administrator\Desktop\data.xlsx"
data = pd.read_excel(fileName)

# Drop the record-id column so it is not used as a feature.
del data["RID"]

# Remaining column names: the features followed by the class label.
headers = data.columns.values

# The last column is the class label and must NOT be part of the features,
# otherwise the tree is trained on the very value it is supposed to predict.
# ``.iloc`` replaces ``DataFrame.ix``, which was removed in pandas 1.0.
labelList = data.iloc[:, -1].tolist()
featureList = data.iloc[:, :-1].to_dict(orient="records")
# featureList looks like:
# [
#     {"credit_rating": "fair", "age": "youth", ...},
#     ....
# ]
# One dict per sample, which is the input format DictVectorizer expects.
# labelList looks like: ['no', 'no', 'yes', 'yes', ...]

# ========= Convert the data into the numeric form the tree needs =========
vec = DictVectorizer()
dummyX = vec.fit_transform(featureList).toarray()
print("dummyX:" + str(dummyX))  # 2-D matrix of 0/1 indicator columns
print(_feature_names(vec))

print("labelList: " + str(labelList))
lb = preprocessing.LabelBinarizer()
dummyY = lb.fit_transform(labelList)
print("dummyY: " + str(dummyY))

# ========= Fit the decision tree =========
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(dummyX, dummyY)
print("clf: ", str(clf))

# Optional: store the fitted tree so its shape can be inspected.
# Graphviz command to turn the dot file into a pdf:
#     dot -T pdf iris.dot -o output.pdf
# with open(r"C:\Users\Administrator\Desktop\code\mechine_learning\allElectronicInformationGainOri.dot", "w") as f:
#     tree.export_graphviz(clf, feature_names=_feature_names(vec), out_file=f)

# ========= Prediction =========
# Take an already-encoded sample; copy it so that editing the new sample
# never modifies the training matrix in place.
oneRowX = dummyX[2, :]
print("oneRowX: " + str(oneRowX))

newRowX = oneRowX.copy()
# newRowX[0] = 1
# newRowX[2] = 1
print("newRowX: ", str(newRowX))

# predict() expects a 2-D array (one row per sample), hence the wrapping list.
predictedY = clf.predict([newRowX])
print("predictedY: " + str(predictedY))
但這段代碼不是特別通用,並且有bug, 須要修改,但基本邏輯是正確的
# -*- coding:utf-8 -*-
"""Train an entropy-based decision tree and export it as a Graphviz dot file.

A decision tree needs numeric input; string values such as "no" / "yes"
are not accepted, so every categorical column is expanded into 0/1
indicator columns first.

For example, a table such as

    ====================================
    age          income  student
    youth        high    no
    youth        high    no
    middle_aged  high    no
    senior       medium  no
    senior       low     yes
    ====================================

becomes a matrix such as

    youth  middle_aged  senior  high  medium  low  ......
    1      0            0       1     0       0
    1      0            0       1     0       0    .....
"""
import csv
from io import StringIO  # sklearn.externals.six was removed in scikit-learn 0.23

from sklearn import preprocessing
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer

DATA_PATH = r"C:\Users\Administrator\Desktop\data.xlsx"
DOT_PATH = r"C:\Users\Administrator\Desktop\code\mechine_learning\allElectronicInformationGainOri.dot"


def _read_table(path):
    """Return ``(headers, rows)`` read from a CSV file or an Excel workbook.

    Raises:
        ValueError: if a CSV file has no header row.
    """
    if path.lower().endswith((".xlsx", ".xls")):
        # An Excel workbook is a binary container, so csv.reader cannot
        # parse it; let pandas decode it instead.
        import pandas as pd

        frame = pd.read_excel(path)
        return list(frame.columns), frame.values.tolist()

    with open(path, "r", newline="") as handle:
        reader = csv.reader(handle)
        headers = next(reader, None)
        if headers is None:
            raise ValueError("no header row found in %r" % path)
        # Skip blank lines, which csv.reader yields as empty lists.
        return headers, [row for row in reader if row]


def _feature_names(vectorizer):
    """Return the vectorizer's output column names as a plain list.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; use whichever this installation provides.
    """
    getter = getattr(vectorizer, "get_feature_names_out", None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


def main(data_path=DATA_PATH, dot_path=DOT_PATH):
    """Fit the tree on ``data_path``, write it to ``dot_path`` and predict once.

    Args:
        data_path: CSV or Excel file whose first column is a record id
            and whose last column is the class label.
        dot_path: where the Graphviz description of the tree is written.

    Returns:
        The prediction made for the modified copy of the first sample.
    """
    headers, rows = _read_table(data_path)
    print(headers)  # ["RID", 'age'.....]

    # Column 0 is skipped and the last column is the class label;
    # everything in between is a feature.
    labelList = [row[-1] for row in rows]
    featureList = [dict(zip(headers[1:-1], row[1:-1])) for row in rows]
    print(featureList)
    # [
    #     {"credit_rating": "fair", "age": "youth"},
    #     ....
    # ]
    # One dict per sample, which is the input format DictVectorizer expects.

    vec = DictVectorizer()
    dummyX = vec.fit_transform(featureList).toarray()
    print("dummyX:" + str(dummyX))  # 2-D matrix of 0/1 indicator columns
    featureNames = _feature_names(vec)
    print(featureNames)

    print("labelList: " + str(labelList))
    lb = preprocessing.LabelBinarizer()
    dummyY = lb.fit_transform(labelList)
    print("dummyY: " + str(dummyY))

    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf = clf.fit(dummyX, dummyY)
    print("clf: ", str(clf))

    # Store the fitted tree so its shape can be inspected.
    # Graphviz command to turn the dot file into a pdf:
    #     dot -T pdf iris.dot -o output.pdf
    with open(dot_path, "w") as f:
        # export_graphviz returns None when out_file is given, so its
        # result must not be bound to the open file handle.
        tree.export_graphviz(clf, feature_names=featureNames, out_file=f)

    # Prediction: start from an already-encoded sample and tweak a copy,
    # so the training matrix itself is left untouched.
    oneRowX = dummyX[0, :]
    print("oneRowX: " + str(oneRowX))

    newRowX = oneRowX.copy()
    newRowX[0] = 1
    newRowX[2] = 0
    print("newRowX: ", str(newRowX))

    # predict() expects a 2-D array (one row per sample), hence the list.
    predictedY = clf.predict([newRowX])
    print("predictedY: " + str(predictedY))
    return predictedY


if __name__ == '__main__':
    main()