機器學習實戰——k-近鄰算法:約會網站

一、kNN 算法

算法說明:

設 set<X1, X2, …, Xn> 爲已知類別的數據集,要預測點 Xt 的類別:

(1)計算 set 中每個點與 Xt 之間的距離

(2)按距離增序排列

(3)選擇距離最小的前 k 個點

(4)確定前 k 個點所在類別的出現頻率

(5)返回頻率最高的類別作爲預測結果

 1 from numpy import *
 2 import operator
 3 def createDataSet():
 4     group = array([[1.0,1.1],[1.0,1.0],[0,0],[0,0.1]])
 5     labels = ['A','A','B','B']
 6     return group, labels
 7 
 8 #kNN
 9 def classify0(inX , dataSet ,labels,k):
10     dataSetSize = dataSet.shape[0] #行數
11     diffMat = tile(inX,(dataSetSize,1)) - dataSet # tile(inX,(dataSetSize,1)) 生成 dataSetSize 行 1 列的 元素爲 inX的 數組
12     sqDiffMat = diffMat ** 2 #  ** 爲 ^
13     sqDistances = sqDiffMat.sum(axis=1) # axis=0是按列求和 axis=1 是按行求和
14     distance = sqDistances ** 0.5
15     sortedDisInd = distance.argsort()# argsort,屬於numpy中的函數 返回排序後元素在原對象中的下標
16     classCount = {}
17     for i in range(k):
18         votelabel = labels[sortedDisInd[i]]
19         classCount[votelabel] = classCount.get(votelabel,0) + 1 #dict.get(key, default=None) key:key在字典中查找。 default:在key不存在的狀況下返回值None。
20     sortedClassCount = sorted(classCount.iteritems(),key = operator.itemgetter(1),reverse =True)
21     '''
22     要經過student的第三個域排序,能夠這麼寫:
23     sorted(students, key=operator.itemgetter(2)) 
24     sorted函數也能夠進行多級排序,例如要根據第二個域和第三個域進行排序,能夠這麼寫:
25     sorted(students, key=operator.itemgetter(1,2))
26     即先跟句第二個域排序,再根據第三個域排序。
27     '''
28     return sortedClassCount[0][0]

二、加載數據

下載地址:http://pan.baidu.com/s/1c0NeKCg

數據格式:[frequent flier miles earned per year]'\t'[percentage of time spent playing video games]'\t'[liters of ice cream consumed per year]'\t'[類別標籤:1 = not at all,2 = in small doses,3 = in large doses]

 1 #加載數據
 2 def file2matrix(filename):
 3     fr = open(filename)
 4     arrayOLines = fr.readlines()  #注意須要加s
 5     numberOfLines = len(arrayOLines)
 6     returnMat = zeros((numberOfLines,3))
 7     classLabelVector = []
 8     index = 0
 9     for line in arrayOLines:
10         line = line.strip()
11         listFormLine = line.split('\t')
12         for x in range(0,3):
13             returnMat[index,x] = float(listFormLine[x])
14         classLabelVector.append(int(listFormLine[-1])) # -1 爲最後一個元素
15         index += 1
16     return returnMat,classLabelVector

三、散點圖

 1 import matplotlib
 2 import matplotlib.pyplot as plt
 3 datingDataMat,datingLabels = kNN.file2matrix('datingTestSet.txt')
 4 fig = plt.figure() #figure建立一個繪圖對象
 5 ax = fig.add_subplot(111)# 若參數爲349,意思是:將畫布分割成3行4列,圖像畫在從左到右從上到下的第9塊,
 6 
 7 '''
 8 matplotlib.pyplot.scatter(x, y, s=20, c='b', marker='o', cmap=None, norm=None, vmin=None, vmax=None, alpha=None, linewidths=None, verts=None, hold=None,**kwargs)
 9 其中,xy是點的座標,s點的大小
10 maker是形狀能夠maker=(5,1)5表示形狀是5邊型,1表示是星型(0表示多邊形,2放射型,3圓形)
11 alpha表示透明度;facecolor=‘none’表示不填充。
12 '''
13 
14 ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),marker=(5,1),alpha=0.5)
15 plt.show()

 

 

四、歸一化特徵值

因爲各特徵值的取值範圍不一樣,它們對結果的影響程度也不一樣。這就須要咱們歸一化特徵值,把每一個特徵值的大小固定在 [0,1]:

range = MaxVal - MinVal

normVal = (rawVal - MinVal) / (MaxVal - MinVal)

 1 #歸一化特徵值
 2 def autoNorm(dataSet):
 3     minVals = dataSet.min(0)
 4     maxVals = dataSet.max(0)
 5     ranges = maxVals - minVals
 6     normDataSet = zeros(shape(dataSet))
 7     m = dataSet.shape[0] 
 8     normDataSet = dataSet - tile(minVals,(m,1)) 
 9     normDataSet = normDataSet / tile(ranges,(m,1))
10     return normDataSet,ranges,minVals

五、分類器測試

用10%的數據做爲輸入來測試,另外90%做爲已知集合

 1 def datingClassTest():
 2     hoRatio = 0.10
 3     datingDataMat,datingLabels = file2matrix('datingTestSet.txt')
 4     normMat,ranges,minVals = autoNorm(datingDataMat)
 5     m = normMat.shape[0]
 6     numTestVecs = int(m * hoRatio)
 7     errorCount = 0.0
 8     for i in range(numTestVecs):
 9         classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],datingLabels[numTestVecs:m],3)
10         print "back %d ,real %d" % (classifierResult,datingLabels[i])
11         if(classifierResult != datingLabels[i]):
12              errorCount += 1.0
13     print "range is %f" % (errorCount / float(numTestVecs))

六、約會網站測試

 1 #約會網站測試函數
 2 def classifyPerson():
 3     resultList = ['not at all','in small doses','in large dose']
 4     percentTats = float(raw_input("per of time spent playing video games?"))
 5     ffMiles = float(raw_input("fre flier miles earned per year?"))
 6     iceCream = float(raw_input("liters of ice cream consumed per year?"))
 7     datingDataMat,datingLabels = file2matrix('datingTestSet.txt')
 8     normMat,ranges,minVals = autoNorm(datingDataMat)
 9     inArr = array([ffMiles,percentTats,iceCream])
10     classifierResult = classify0((inArr - minVals)/ranges,normMat,datingLabels,3)
11     print "You will probably like this person :", 
12     print resultList[classifierResult-1]

相關文章
相關標籤/搜索