k近鄰法實際上利用訓練數據集對特徵向量空間進行劃分,並以此作為其分類的「模型」。
曼哈頓距離:$d(x, y) = \sum_{i=1}^{n} |x_i - y_i|$
# -*- coding: utf-8 -*-
"""Minimal k-nearest-neighbours (KNN) classifier demo on a toy movie dataset."""
import operator

import numpy as np


def createDataset():
    """Return the toy training set used by the demo.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 NumPy array holding
        four two-dimensional feature vectors; ``labels`` is a tuple with the
        class label of each row, in the same order.
    """
    # Four samples, two features each.
    group = np.array([[5, 115], [7, 106], [56, 11], [66, 9]])
    # Class label of each sample above.
    labels = ('動做片', '動做片', '愛情片', '愛情片')
    return group, labels


def classify(intX, dataSet, labels, k, *, verbose=True):
    """Classify ``intX`` by majority vote among its ``k`` nearest samples.

    Distances are Euclidean: the square root of the summed squared
    per-feature differences.

    Args:
        intX: Feature vector to classify (sequence or 1-D array).
        dataSet: Training samples, one row per sample. Anything accepted by
            ``np.asarray`` works.
        labels: Class label of each row of ``dataSet``, indexed by row.
        k: Number of nearest neighbours that vote. Must satisfy
            ``1 <= k <= number of rows``.
        verbose: When true (the default), print the distances, the neighbour
            ordering, every vote and the final tally.

    Returns:
        The label with the most votes. When several labels tie, the one whose
        first vote came from the nearest neighbour wins, because the tally
        keeps insertion order and the sort is stable.

    Raises:
        ValueError: If ``k`` is outside ``1..number of rows`` or ``labels``
            has fewer entries than ``dataSet`` has rows.
    """
    dataSet = np.asarray(dataSet)
    # shape[0] is the number of rows, i.e. the number of training samples.
    dataSetSize = dataSet.shape[0]

    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and {} (number of samples), got {!r}".format(
                dataSetSize, k
            )
        )
    if len(labels) < dataSetSize:
        raise ValueError(
            "labels has {} entries but dataSet has {} rows".format(
                len(labels), dataSetSize
            )
        )

    # Broadcasting subtracts every row from intX, so tiling intX is unnecessary.
    diffMat = np.asarray(intX) - dataSet
    # Euclidean distance: square the differences, sum per row, take the root.
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    if verbose:
        print("distances:", distances)

    # Row indices ordered from nearest to farthest.
    sortDistance = distances.argsort()
    if verbose:
        print("sortDistance:", sortDistance)

    # Tally the labels of the k nearest rows.
    classCount = {}
    for i in range(k):
        voteLabel = labels[sortDistance[i]]
        if verbose:
            print("第{}個voteLabel={}".format(i, voteLabel))
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # Sort (label, votes) pairs by vote count, highest first,
    # e.g. [('動做片', 2), ('愛情片', 1)].
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    if verbose:
        print("sortedClassCount:")
        print(sortedClassCount)
    return sortedClassCount[0][0]


if __name__ == '__main__':
    group, labels = createDataset()
    test = [20, 101]
    test_class = classify(test, group, labels, 3)
    print(test_class)