feature【特徵變量】 ——》 label【結果標籤】
收集問題相關的數據(feature、label)——》python
選擇一種數學模型創建feature和label的關係 ——》算法
根據選擇的模型進行預測
knn機器學習
k nearest neighbor(k 近鄰)算法
大體步驟:
一、收集相關的數據
二、選擇合適的feature 和label
三、若是不知道如何選擇feature,能夠先單獨讓每一個feature計算與label的相關度
四、選擇合適的k
五、使用數據集進行合適的預測
從300處丟一個球,會落到哪一個窗口?
解決流程:
代碼:
import numpy as np import collections as c
#數據集的準備【在這裏是手動輸入】 data = np.array([ [154,1], [126,2], [70,2], [196,2], [161,2], [371,4] ]) #輸入值 feature = (data[:,0]) #結果label label = data[:,-1] #預測點 predictPoint = 200 #計算每個投擲點距離predictPoint的距離 distance = list(map(lambda x: abs(predictPoint - x),feature)) #對distance的集合 元素從小到大排序(返回的是排序的下標位置) sortindex = (np.argsort(distance)) #用排序的sortindex來操做 label集合 sortlabel = (label[sortindex]) # knn算法的k :取最近的三個數據 k = 3 print(c.Counter(sortlabel[0:k]).most_common(1)[0][0])
補充:
一、map函數
是python 內置的高階函數,它接收一個函數f 和一個list ,並經過把函數f 依次做用在list的每個元素上,並獲得一個新的list,並返回
二、排序
np.sort:把集合直接按值排序
np.argsort :把集合元素按照下標,或者位置去排序,獲得 sortindex
import numpy as np
import collections as c


def knn(k, feature, label, predictPoint):
    """Predict the label of predictPoint by majority vote of its k nearest neighbours.

    feature is a 1-D array of known positions, label the matching window
    numbers; both must have the same length.
    """
    # Absolute 1-D distance from every known point to the query point.
    distance = [abs(predictPoint - x) for x in feature]
    # Indices ordering the points from nearest to farthest.
    sortindex = np.argsort(distance)
    # Labels rearranged into nearest-first order.
    sortlabel = label[sortindex]
    # Majority label among the k nearest samples.
    return c.Counter(sortlabel[0:k]).most_common(1)[0][0]


if __name__ == '__main__':
    # Hand-entered dataset: each row is [throw position, window label].
    data = np.array([
        [154, 1],
        [126, 2],
        [70, 2],
        [196, 2],
        [161, 2],
        [371, 4],
    ])
    feature = data[:, 0]
    label = data[:, -1]
    # Point whose outcome we want to predict.
    predictPoint = 200
    # k of the knn algorithm: vote among the 3 nearest samples.
    k = 3
    print(knn(k, feature, label, predictPoint))
導入訓練集和測試集:
參考:數據集的生成
import numpy as np
import collections as c


def knn(k, predictPoint, feature, label):
    """Predict the label of predictPoint by majority vote of its k nearest neighbours.

    feature is a 1-D array of known positions, label the matching window
    numbers; both must have the same length.
    """
    # Absolute 1-D distance from every training point to the query point.
    distance = [abs(predictPoint - x) for x in feature]
    # Indices ordering the training points from nearest to farthest.
    sortindex = np.argsort(distance)
    # Labels rearranged into nearest-first order.
    sortlabel = label[sortindex]
    # Majority label among the k nearest samples.
    return c.Counter(sortlabel[0:k]).most_common(1)[0][0]


def _make_splits():
    """Shuffle cnn0.csv and persist train/test splits (train is ~10x test)."""
    data = np.loadtxt("cnn0.csv", delimiter=",")
    # Shuffle in place so the split is random.
    np.random.shuffle(data)
    testdata = data[0:100]
    # NOTE: was data[100:-1], which silently dropped the last sample.
    traindata = data[100:]
    # Persist the generated splits as csv files.
    np.savetxt("data0-test.csv", testdata, delimiter=",", fmt="%d")
    np.savetxt("data0-train.csv", traindata, delimiter=",", fmt="%d")


if __name__ == '__main__':
    # Splitting used to run at import time; only do it when run as a script.
    _make_splits()
    # Training set: first column is the feature, last column the label.
    traindata = np.loadtxt("data0-train.csv", delimiter=",")
    feature = traindata[:, 0]
    label = traindata[:, -1]
    # Test set used to score each candidate k.
    testdata = np.loadtxt("data0-test.csv", delimiter=",")
    # Try every k from 1 to 49 and report the accuracy of each.
    for k in range(1, 50):
        count = 0
        for item in testdata:
            predict = knn(k, item[0], feature, label)
            real = item[-1]
            if predict == real:
                count = count + 1
        print(f'k = {k},準確率 : {count * 100.0 / len(testdata)}%')
# Sample run: accuracy fluctuated roughly between 6% and 18% over k = 1..49,
# peaking at k = 44 (18.0%).
注意:
經驗所得,參數k值通常選爲 【訓練集數據量的開平方】爲最優
import numpy as np
import collections as c


def knn(k, predictPoint, feature, label):
    """Predict the label of predictPoint by majority vote of its k nearest neighbours."""
    # Absolute 1-D distance from every training point to the query point.
    distance = [abs(predictPoint - x) for x in feature]
    # Indices ordering the training points from nearest to farthest.
    sortindex = np.argsort(distance)
    # Labels rearranged into nearest-first order.
    sortlabel = label[sortindex]
    # Majority label among the k nearest samples.
    return c.Counter(sortlabel[0:k]).most_common(1)[0][0]


if __name__ == '__main__':
    # Training set: first column is the feature, last column the label.
    traindata = np.loadtxt("data0-train.csv", delimiter=",")
    feature = traindata[:, 0]
    label = traindata[:, -1]
    # Test set used to score the heuristic k.
    testdata = np.loadtxt("data0-test.csv", delimiter=",")
    count = 0
    # Rule of thumb: k ~= square root of the number of training samples.
    k = int(len(traindata) ** 0.5)
    for item in testdata:
        predict = knn(k, item[0], feature, label)
        # Was item[1]; use the last column like the sibling scripts
        # (identical for this 2-column data, robust if columns are added).
        real = item[-1]
        if predict == real:
            count = count + 1
    print(f'k = {k},準確率 : {count * 100.0 / len(testdata)}%')
# Sample run: k = 69,準確率 : 16.0%
例如:特徵變量增長一個顏色項,特徵變量爲兩個
數據集獲取:參考
二維空間distance的計算:
代碼:
import numpy as np
import collections as c


def color2num(color_name):
    """Map a ball colour name to its elasticity coefficient."""
    # Renamed locals: the originals shadowed the builtins `str` and `dict`.
    color_map = {"紅": 0.50, "黃": 0.51, "藍": 0.52, "綠": 0.53, "紫": 0.54, "粉": 0.55}
    return color_map[color_name]


def knn(k, predictPoint, ballcolor, feature, label):
    """Predict a label from a 2-D feature (position, colour coefficient).

    feature rows look like [130, 0.55]; distance is plain Euclidean
    distance in that 2-D space.
    """
    distance = [
        ((item[0] - predictPoint) ** 2 + (item[1] - ballcolor) ** 2) ** 0.5
        for item in feature
    ]
    # Indices ordering the training points from nearest to farthest.
    sortindex = np.argsort(distance)
    sortlabel = label[sortindex]
    # Majority label among the k nearest samples.
    return c.Counter(sortlabel[0:k]).most_common(1)[0][0]


def _make_splits():
    """Shuffle cnn1.csv (colour column converted to numbers) and save splits."""
    data = np.loadtxt("cnn1.csv", delimiter=",", converters={1: color2num}, encoding="gbk")
    # Shuffle in place so the split is random; train is ~10x test.
    np.random.shuffle(data)
    testdata = data[0:100]
    # NOTE: was data[100:-1], which silently dropped the last sample.
    traindata = data[100:]
    np.savetxt("data1-test.csv", testdata, delimiter=",", fmt="%.2f")
    np.savetxt("data1-train.csv", traindata, delimiter=",", fmt="%.2f")


if __name__ == '__main__':
    # Splitting used to run at import time; only do it when run as a script.
    _make_splits()
    traindata = np.loadtxt("data1-train.csv", delimiter=",")
    # First two columns are the features, last column the label.
    feature = traindata[:, 0:2]
    label = traindata[:, -1]
    testdata = np.loadtxt("data1-test.csv", delimiter=",")
    count = 0
    # Rule of thumb: k ~= square root of the number of training samples.
    k = int(len(traindata) ** 0.5)
    for item in testdata:
        predict = knn(k, item[0], item[1], feature, label)
        real = item[-1]
        if predict == real:
            count = count + 1
    print(f'k = {k},準確率 : {count * 100.0 / len(testdata)}%')
# Sample run: k = 69,準確率 : 19.0%
代碼:
import numpy as np
import collections as c


def color2num(color_name):
    """Map a ball colour name to its elasticity coefficient."""
    # Renamed locals: the originals shadowed the builtins `str` and `dict`.
    color_map = {"紅": 0.50, "黃": 0.51, "藍": 0.52, "綠": 0.53, "紫": 0.54, "粉": 0.55}
    return color_map[color_name]


def knn(k, predictPoint, ballcolor, feature, label):
    """Predict a label from a 2-D feature, with both axes min-max normalised.

    Positions are scaled by (x - 1) / 799 and colour coefficients by
    (c - 0.50) / 0.05 so that neither dimension dominates the Euclidean
    distance. (Assumes positions span 1..800 and colours 0.50..0.55 —
    matches the dataset generated by _make_splits; confirm for new data.)
    """
    distance = [
        (((item[0] - 1) / 799 - (predictPoint - 1) / 799) ** 2
         + ((item[1] - 0.50) / 0.05 - (ballcolor - 0.50) / 0.05) ** 2) ** 0.5
        for item in feature
    ]
    # Indices ordering the training points from nearest to farthest.
    sortindex = np.argsort(distance)
    sortlabel = label[sortindex]
    # Majority label among the k nearest samples.
    return c.Counter(sortlabel[0:k]).most_common(1)[0][0]


def _make_splits():
    """Shuffle cnn1.csv (colour column converted to numbers) and save splits."""
    data = np.loadtxt("cnn1.csv", delimiter=",", converters={1: color2num}, encoding="gbk")
    # Shuffle in place so the split is random; train is ~10x test.
    np.random.shuffle(data)
    testdata = data[0:100]
    # NOTE: was data[100:-1], which silently dropped the last sample.
    traindata = data[100:]
    np.savetxt("data1-test.csv", testdata, delimiter=",", fmt="%.2f")
    np.savetxt("data1-train.csv", traindata, delimiter=",", fmt="%.2f")


if __name__ == '__main__':
    # Splitting used to run at import time; only do it when run as a script.
    _make_splits()
    traindata = np.loadtxt("data1-train.csv", delimiter=",")
    # First two columns are the features, last column the label.
    feature = traindata[:, 0:2]
    label = traindata[:, -1]
    testdata = np.loadtxt("data1-test.csv", delimiter=",")
    count = 0
    # Rule of thumb: k ~= square root of the number of training samples.
    k = int(len(traindata) ** 0.5)
    for item in testdata:
        predict = knn(k, item[0], item[1], feature, label)
        real = item[-1]
        if predict == real:
            count = count + 1
    print(f'k = {k},準確率 : {count * 100.0 / len(testdata)}%')
# Sample run: k = 69,準確率 : 21.0% (normalisation improved on the 19.0%
# of the unnormalised version).