(3)兩個n維向量 $a(x_{11},x_{12},\dots,x_{1n})$ 與 $b(x_{21},x_{22},\dots,x_{2n})$ 間的歐氏距離:$d_{12} = \sqrt{\sum_{k=1}^{n}\left(x_{1k}-x_{2k}\right)^{2}}$
(1) 計算已知類別數據集中的點與當前點之間的距離;
(2) 按照距離增序排序;
(3) 選取與當前點距離最近的k個點;
(4) 決定這k個點所屬類別的出現頻率;
(5) 返回前k個點出現頻率最高的類別作爲當前點的預測分類。
def createDataSet():
    """Return a tiny labelled sample set for trying out the classifier.

    Returns:
        (group, labels): ``group`` is a 4x2 array of points and ``labels``
        is the list of class names, one per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
>>> tile([1,2],(4))
array([1, 2, 1, 2, 1, 2, 1, 2])
>>> tile([1,2],(4,1))
array([[1, 2],
       [1, 2],
       [1, 2],
       [1, 2]])
>>> tile([1,2],(4,2))
array([[1, 2, 1, 2],
       [1, 2, 1, 2],
       [1, 2, 1, 2],
       [1, 2, 1, 2]])
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: the new point to classify (one value per feature).
        dataSet: array of known samples, one row per sample.
        labels: class label of each row of ``dataSet``.
        k: number of nearest samples that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` closest samples.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Subtract every sample row from the new point:
    # [[x-x1, y-y1], [x-x2, y-y2], ...]
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    # Square each entry, add up each row, take the square root:
    # the Euclidean distance from inX to every sample.
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # argsort returns the sample indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    # Count how often each class appears among the k nearest samples.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() rather than the Python-2-only iteritems(), so this runs on
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
其中 inX:需要分類的新數據,dataSet:樣本數據特徵,labels:樣本數據分類,k:選取前k個最近的距離
>>> group,labels = kNN.createDataSet()
>>> group,labels
(array([[ 1. ,  1.1],
       [ 1. ,  1. ],
       [ 0. ,  0. ],
       [ 0. ,  0.1]]), ['A', 'A', 'B', 'B'])
>>> kNN.classify0([0,0],group,labels,3)
'B'
>>>
不同特徵值有不同的均值和取值範圍,如果直接使用特徵值計算距離,取值範圍較大的特徵將對距離計算的結果產生絕對的影響,而使取值較小的特徵幾乎沒有作用,近乎沒有用到該屬性。如兩組特徵:{0, 20000, 1.1}和{67, 32000, 0.1},計算距離的算式爲:$\sqrt{(0-67)^{2}+(20000-32000)^{2}+(1.1-0.1)^{2}}$
newValue = (oldValue – min) / (max – min)
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range 0..1.

    Each value becomes ``(oldValue - min) / (max - min)``, with min and max
    taken per column.

    Args:
        dataSet: 2-D array with one row per sample and one column per
            feature.

    Returns:
        (normDataSet, ranges, minVals): the rescaled array, the per-column
        ``max - min`` and the per-column minimum.  A column whose values are
        all equal has range 0; it is rescaled to all zeros instead of NaN,
        while ``ranges`` still reports 0 for it.
    """
    minVals = dataSet.min(0)    # smallest value of each feature
    maxVals = dataSet.max(0)    # largest value of each feature
    ranges = maxVals - minVals  # spread of each feature
    # Divide constant columns by 1 instead of 0 so they become 0, not NaN.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
def datingClassTest():
    """Measure the classifier's error rate on the dating data set.

    Loads 'datingTestSet2.txt', normalises it, uses the first ``hoRatio``
    share of the rows as test vectors and the remaining rows as the known
    samples, classifies every test vector with k=3 and prints each
    prediction, the overall error rate and the number of errors.
    """
    hoRatio = 0.50  # share of the rows held out as test vectors (50%)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
>>> kNN.datingClassTest()
the classifier came back with: 2, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
.................................................
the total error rate is: 0.064000
32.0
綜合上述代碼,我們可以構建完整的約會網站預測函數:對輸入的數據需要歸一化處理
def classifyPerson():
    """Ask for one person's three features and print the predicted liking.

    Reads the three values from standard input, normalises them with the
    minimum and range of the loaded data set, classifies the result with
    ``classify0`` (k=3) and prints the matching entry of ``resultList``.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    try:
        readLine = raw_input  # Python 2
    except NameError:
        readLine = input      # Python 3
    percentTats = float(readLine("Percentage of time spent playing video game?"))
    ffMiles = float(readLine("Frequent flier miles earned per year?"))
    iceCream = float(readLine("Liters of ice cream consumed per year?"))
    # NOTE(review): the other routines load 'datingTestSet2.txt'; confirm
    # that this file also ends every row with an integer label, which
    # file2matrix and the resultList lookup below both require.
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # The new point must be normalised the same way as the samples.
    # Fixed: the original called an undefined name `classify`.
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels are 1-based, resultList is 0-based.
    print("You will probably like this person:  %s"
          % resultList[classifierResult - 1])
'''
Created on Sep 16, 2010
kNN: k Nearest Neighbors

Input:      inX: vector to compare to existing dataset (1xN)
            dataSet: size m data set of known vectors (NxM)
            labels: data set labels (1xM vector)
            k: number of neighbors to use for comparison (should be an odd number)

Output:     the most popular class label

@author: pbharrin
'''
from numpy import *
import operator
from os import listdir
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    # Only the plotting helpers need matplotlib; keep the classifier
    # importable on machines where it is not installed.
    matplotlib = None
    plt = None


def _requirePyplot():
    """Return ``matplotlib.pyplot`` or raise ImportError if it is missing."""
    if plt is None:
        raise ImportError("matplotlib is required for the plotting helpers")
    return plt


def show(d, l):
    """Scatter-plot column 0 of ``d`` against column 1.

    ``15 * array(l)`` is passed to ``scatter`` twice, so the label values
    drive both the marker size and the marker colour.
    """
    pyplot = _requirePyplot()
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    ax.scatter(d[:, 0], d[:, 1], 15 * array(l), 15 * array(l))
    pyplot.show()


def show2():
    """Plot 'datingTestSet2.txt' with one colour and legend entry per class.

    Rows labelled 1 are drawn red, rows labelled 2 green and all other rows
    blue; columns 0 and 1 of the data are used as x and y.
    """
    pyplot = _requirePyplot()
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    fig = pyplot.figure()
    ax = fig.add_subplot(111)
    # Slot 0 collects label 1, slot 1 label 2, slot 2 every other label.
    xs = ([], [], [])
    ys = ([], [], [])
    for i in range(datingDataMat.shape[0]):
        if datingLabels[i] == 1:
            slot = 0
        elif datingLabels[i] == 2:
            slot = 1
        else:
            slot = 2
        xs[slot].append(datingDataMat[i, 0])
        ys[slot].append(datingDataMat[i, 1])
    type1 = ax.scatter(xs[0], ys[0], c='red')
    type2 = ax.scatter(xs[1], ys[1], c='green')
    type3 = ax.scatter(xs[2], ys[2], c='blue')
    ax.legend([type1, type2, type3],
              ["Did Not Like", "Liked in Small Doses", "Liked in Large Doses"],
              loc=2)
    # NOTE(review): columns 0 and 1 are plotted, but these axis titles (and
    # the axis limits [-2, 25, -0.2, 2.0] the original left commented out)
    # read like they describe columns 1 and 2 - confirm which was intended.
    pyplot.xlabel('Percentage of Time Spent Playing Video Games')
    pyplot.ylabel('Liters of Ice Cream Consumed Per Week')
    pyplot.show()


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: the new point to classify (one value per feature).
        dataSet: array of known samples, one row per sample.
        labels: class label of each row of ``dataSet``.
        k: number of nearest samples that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` closest samples.

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            rows in ``dataSet``.
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every sample row.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # Sample indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() rather than the Python-2-only iteritems(), so this runs on
    # both Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def createDataSet():
    """Return a tiny labelled sample set: a 4x2 array and its four labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    The first three fields of every line become one row of the matrix and
    the last field, parsed as an int, becomes that row's label.  Lines that
    are empty after stripping whitespace are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        (returnMat, classLabelVector): an Nx3 float array and a list of N
        integer labels.
    """
    # Read once and close the handle; the original opened the file twice
    # and never closed it.
    with open(filename) as fr:
        rows = [line.strip() for line in fr.readlines()]
    rows = [row for row in rows if row]
    returnMat = zeros((len(rows), 3))  # prepare matrix to return
    classLabelVector = []              # prepare labels return
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range 0..1.

    Each value becomes ``(oldValue - min) / (max - min)``, with min and max
    taken per column.

    Returns:
        (normDataSet, ranges, minVals): the rescaled array, the per-column
        ``max - min`` and the per-column minimum.  A column whose values are
        all equal has range 0; it is rescaled to all zeros instead of NaN,
        while ``ranges`` still reports 0 for it.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0 so they become 0, not NaN.
    divisors = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals


def datingClassTest():
    """Measure the classifier's error rate on the dating data set.

    Loads 'datingTestSet2.txt', normalises it, uses the first ``hoRatio``
    share of the rows as test vectors and the remaining rows as the known
    samples, classifies every test vector with k=3 and prints each
    prediction, the overall error rate and the number of errors.
    """
    hoRatio = 0.50  # share of the rows held out as test vectors (50%)
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)


def img2vector(filename):
    """Read a 32x32 grid of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row.
    """
    returnVect = zeros((1, 1024))
    with open(filename) as fr:  # closed on exit; the original leaked it
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def _classNumFromFileName(fileNameStr):
    """Return the int made of the file-name text before the first '_'."""
    fileStr = fileNameStr.split('.')[0]  # take off .txt
    return int(fileStr.split('_')[0])


def handwritingClassTest():
    """Classify every file in 'testDigits' against those in 'trainingDigits'.

    The class of each file is taken from its name.  Prints each prediction
    next to the real answer, then the number of errors and the error rate.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]
        hwLabels.append(_classNumFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _classNumFromFileName(fileNameStr)
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))