一、計算測試實例到全部訓練集實例的距離;
二、對全部的距離進行排序,找到k個最近的鄰居;
三、對k個近鄰對應的結果進行合併,再排序,返回出現次數最多的那個結果。
對每個k,使用驗證集計算,記錄k對應的錯誤次數,取錯誤數最小的k。
# -*- coding: utf-8 -*-
"""k-NN classification of the iris data set with 5-fold cross-validation.

For each test instance:
1. compute its distance to every training instance;
2. sort all distances and keep the k nearest neighbours;
3. return the majority class among those k neighbours.

The best k (1..120) is chosen by 5-fold cross-validation on the error
rate, and the error-rate curve over k is plotted.
"""
import math
import operator
import os
from collections import Counter

import pandas as pd


def splitData(trainSet, testSet, testSetIndex):
    """Split the 150-row iris data 80:20 into trainSet/testSet (in place).

    Each of the 3 classes occupies 50 consecutive rows; rows whose
    within-class decile equals testSetIndex (0-4) go to the test set,
    so every fold holds out 10 rows per class (30 test / 120 train).

    testSetIndex : which of the 5 folds (0-4) becomes the test set.
    """
    # assumes iris.txt sits in the working directory — TODO confirm
    data = pd.read_csv('iris.txt', encoding="utf-8", engine='python', header=None)
    for i in range(150):
        # Floor division keeps the original Python-2 integer semantics.
        if testSetIndex == (i % 50) // 10:
            testSet.append(data.iloc[i])
        else:
            trainSet.append(data.iloc[i])


def computeDistance(instance1, instance2, dimension):
    """Return the Euclidean distance over the first `dimension` features."""
    return math.sqrt(sum((instance1[i] - instance2[i]) ** 2
                         for i in range(dimension)))


def kNN(trainSet, instance, k):
    """Classify `instance` by majority vote of its k nearest neighbours.

    The last element of each row is the class label; the preceding
    elements are the numeric features.
    """
    dimension = len(instance) - 1  # last column is the class label
    # 1. distance from the instance to every training sample
    distances = [(sample, computeDistance(instance, sample, dimension))
                 for sample in trainSet]
    # 2. sort by distance (stable; ties keep training order)
    distances.sort(key=operator.itemgetter(1))
    # 3. majority class among the k nearest neighbours
    votes = Counter(pair[0][dimension] for pair in distances[:k])
    return votes.most_common(1)[0][0]


def main():
    # Imported locally so the module stays importable without matplotlib.
    import matplotlib.pyplot as plt

    # Total misclassifications per k (index k-1), summed over all 5 folds.
    errCountSet = [0] * 120
    for fold in range(5):
        trainSet = []   # training samples for this fold
        testSet = []    # held-out validation samples for this fold
        splitData(trainSet, testSet, fold)
        for k in range(120):
            # Count this fold's misclassifications for k+1 neighbours.
            for instance in testSet:
                if kNN(trainSet, instance, k + 1) != instance[4]:
                    errCountSet[k] += 1

    # Convert counts to error rates (30 test rows x 5 folds) and find the
    # minimum; minErr starts at 1 because a rate can never exceed 1.
    minErr = 1
    for i in range(120):
        errCountSet[i] = errCountSet[i] / (30 * 5.0)
        if errCountSet[i] < minErr:
            minErr = errCountSet[i]

    # Several k may share the minimal error rate; print all of them.
    for i in range(120):
        if errCountSet[i] == minErr:
            print(i + 1, minErr)

    fig = plt.figure(figsize=(20, 15))
    ax1 = fig.add_subplot(111)
    ax1.plot(range(1, 121), errCountSet)
    ax1.set_xlabel('k')
    ax1.set_ylabel('Error Rate')
    plt.show()


if __name__ == "__main__":
    main()
分別使用參數k=1~120進行實驗,並進行交叉驗證,錯誤分類率曲線如下:
測試