機器學習 | K近鄰算法

時間 2019-12-07

標籤 機器學習 近鄰算法 简体版

原文 原文鏈接

因爲近期學業繁重QAQ，因此我就不說廢話了，直接上代碼~

使用K近鄰算法改進約會網站

from numpy import *
import operator
import matplotlib
import matplotlib.pyplot as plt

# Load a tab-separated data file into a NumPy feature matrix and a label list.
def file2matrix(filename):
    """Parse a tab-delimited text file into features and class labels.

    Every non-blank line is split on tab characters. Its first three
    columns become one row of the feature matrix and its last column is
    read as the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array of shape
        ``(rows, 3)`` and a list holding the integer label of every row,
        both in file order.

    Raises:
        ValueError: If a row has fewer than three columns, or a feature or
            label column cannot be parsed as a number.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (typically a trailing newline) carry no record.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        if len(fields) < 3:
            # Without this check a one-column row would be silently
            # broadcast across all three feature slots.
            raise ValueError(
                "%s: record %d has fewer than three columns"
                % (filename, index + 1))
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
        
# Scatter-plot helper for the loaded data.
def myDraw(datingDataMat, datingLabels):
    """Display a scatter plot of the first two feature columns.

    Column 0 of ``datingDataMat`` is plotted against column 1. The class
    labels, multiplied by 15.0, are handed to ``scatter`` as its third and
    fourth positional arguments (marker size and colour in matplotlib's
    signature), so points of different classes look different.

    Args:
        datingDataMat: Two-dimensional array with at least two columns.
        datingLabels: Sequence of numeric class labels, one per row.
    """
    figure = plt.figure()
    # 111 means a 1x1 grid of sub-plots, using the first cell.
    axes = figure.add_subplot(111)
    scaled_labels = 15.0 * array(datingLabels)
    x_values = datingDataMat[:, 0]
    y_values = datingDataMat[:, 1]
    axes.scatter(x_values, y_values, scaled_labels, scaled_labels)
    plt.show()
        
# Min-max normalisation of the feature columns.
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly onto ``[0, 1]``.

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; it is mapped
    to 0.0 instead of being divided by zero.

    Args:
        dataSet: Two-dimensional NumPy array, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised array,
        the per-column ``max - min`` (zero for constant columns), and the
        per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0.0 rather than NaN;
    # the true ranges are still what is returned to the caller.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
        
# k-nearest-neighbours classifier.
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its nearest training rows.

    Squared Euclidean distance is used; it orders neighbours exactly as the
    true distance would, so no square root is taken. When several labels
    share the highest vote count, the one met first while walking the
    neighbours from nearest to farthest is returned.

    Args:
        inX: Feature vector to classify (anything that broadcasts against
            one row of ``dataSet``).
        dataSet: Two-dimensional NumPy array of training samples, one per row.
        labels: Sequence with the label of each row of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number
            of training samples are reduced to that number.

    Returns:
        The label with the most votes.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # There cannot be more voters than training samples.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row without building a tiled copy.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1)
    sortedDistIndices = distances.argsort()

    classCount = {}
    for neighbour in sortedDistIndices[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so ties keep nearest-first insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
        
# Hold-out evaluation of the classifier on the dating data.
def datingClassTest():
    """Measure the classifier's error rate on a hold-out slice of the data.

    Loads 'datingTestSet2.txt', normalises it, and treats the first 10% of
    the rows as test samples; the remaining rows are the training set.
    Every test row is classified with k=3, each prediction is printed next
    to the true label, and the overall error rate is printed at the end.
    Finally the raw data is shown as a scatter plot.
    """
    holdout_fraction = 0.10
    features, labels = file2matrix('datingTestSet2.txt')
    scaled, _, _ = autoNorm(features)
    total_rows = scaled.shape[0]
    test_count = int(total_rows * holdout_fraction)

    # The training portion is the same for every test row, so slice it once.
    train_rows = scaled[test_count:total_rows, :]
    train_labels = labels[test_count:total_rows]

    mistakes = 0.0
    for row in range(test_count):
        predicted = classify0(scaled[row, :], train_rows, train_labels, 3)
        actual = labels[row]
        print("the classifierResult came back with: %d,the real answer is: %d"
              % (predicted, actual))
        if predicted != actual:
            mistakes += 1.0
    print("the total error rate is: %f" % (mistakes / float(test_count)))
    myDraw(features, labels)
        
# Interactive prediction. The three features asked for are:
#   - percentage of time spent playing video games
#   - frequent-flyer miles earned per year
#   - litres of ice cream consumed per week
def classifyPerson():
    """Ask for one person's three features and print the predicted class.

    The answers are read from standard input, arranged in the order
    (flyer miles, game-time percentage, ice cream), normalised with the
    minimum and range computed from 'datingTestSet2.txt', and classified
    against that data with k=3. The label returned by the classifier is
    used, minus one, as an index into the three result descriptions.
    """
    # Fixed the misspelt user-facing text 'in large deses'.
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("玩視頻遊戲所消耗的時間百分比?"))
    ffMiles = float(input("每一年得到的飛行常客里程數?"))
    iceCream = float(input("每週消費的冰淇淋公升數?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the query exactly like the training data before measuring distances.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("You will probably like this person: ", resultList[classifierResult - 1])
        
        
def main():
    """Script entry point: run the interactive prediction."""
    classifyPerson()


if __name__ == "__main__":
    main()

datingTestSet2.txt數據預覽

40920    8.326976    0.953952    3
14488    7.153469    1.673904    2
26052    1.441871    0.805124    1
75136    13.147394    0.428964    1
38344    1.669788    0.134296    1
72993    10.141740    1.032955    1
35948    6.830792    1.213192    3
42666    13.276369    0.543880    3
67497    8.631577    0.749278    1
35483    12.273169    1.508053    3
50242    3.723498    0.831917    1
63275    8.385879    1.669485    1
5569    4.875435    0.728658    2
51052    4.680098    0.625224    1
77372    15.299570    0.331351    1
43673    1.889461    0.191283    1
61364    7.516754    1.269164    1
69673    14.239195    0.261333    1
15669    0.000000    1.250185    2
28488    10.528555    1.304844    3
6487    3.540265    0.822483    2
37708    2.991551    0.833920    1
22620    5.297865    0.638306    2
28782    6.593803    0.187108    3
19739    2.816760    1.686209    2
36788    12.458258    0.649617    3
5741    0.000000    1.656418    2
28567    9.968648    0.731232    3
6808    1.364838    0.640103    2
41611    0.230453    1.151996    1
36661    11.865402    0.882810    3
43605    0.120460    1.352013    1
15360    8.545204    1.340429    3
63796    5.856649    0.160006    1
10743    9.665618    0.778626    2
70808    9.778763    1.084103    1
72011    4.932976    0.632026    1
5914    2.216246    0.587095    2
14851    14.305636    0.632317    3
33553    12.591889    0.686581    3
44952    3.424649    1.004504    1
17934    0.000000    0.147573    2
27738    8.533823    0.205324    3
29290    9.829528    0.238620    3
42330    11.492186    0.263499    3
36429    3.570968    0.832254    1
39623    1.771228    0.207612    1
32404    3.513921    0.991854    1
27268    4.398172    0.975024    1
5477    4.276823    1.174874    2
14254    5.946014    1.614244    2
68613    13.798970    0.724375    1
41539    10.393591    1.663724    3
7917    3.007577    0.297302    2
21331    1.031938    0.486174    2
8338    4.751212    0.064693    2
5176    3.692269    1.655113    2
18983    10.448091    0.267652    3
68837    10.585786    0.329557    1
13438    1.604501    0.069064    2
48849    3.679497    0.961466    1
12285    3.795146    0.696694    2
7826    2.531885    1.659173    2
5565    9.733340    0.977746    2
10346    6.093067    1.413798    2
1823    7.712960    1.054927    2
9744    11.470364    0.760461    3
16857    2.886529    0.934416    2
39336    10.054373    1.138351    3
65230    9.972470    0.881876    1
2463    2.335785    1.366145    2
27353    11.375155    1.528626    3
16191    0.000000    0.605619    2
12258    4.126787    0.357501    2
42377    6.319522    1.058602    1
25607    8.680527    0.086955    3
77450    14.856391    1.129823    1
58732    2.454285    0.222380    1
46426    7.292202    0.548607    3
32688    8.745137    0.857348    3
64890    8.579001    0.683048    1
8554    2.507302    0.869177    2
28861    11.415476    1.505466    3
42050    4.838540    1.680892    1
32193    10.339507    0.583646    3
64895    6.573742    1.151433    1
2355    6.539397    0.462065    2
0    2.209159    0.723567    2
70406    11.196378    0.836326    1
57399    4.229595    0.128253    1
41732    9.505944    0.005273    3
11429    8.652725    1.348934    3
75270    17.101108    0.490712    1
5459    7.871839    0.717662    2
73520    8.262131    1.361646    1
40279    9.015635    1.658555    3
21540    9.215351    0.806762    3
17694    6.375007    0.033678    2
22329    2.262014    1.022169    1
46570    5.677110    0.709469    1
...

使用K近鄰算法實現手寫識別

from numpy import *
import operator
from os import listdir

# Flatten a 32x32 text image into a 1x1024 vector so that the
# row-vector classifier can be reused on it.
def img2vector(filename):
    """Read a 32x32 grid of digit characters into a ``(1, 1024)`` array.

    The first 32 characters of each of the first 32 lines are converted
    with ``int`` and stored row after row, so the character at line ``i``,
    column ``j`` ends up at index ``32 * i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        ValueError: If a line is missing or shorter than 32 characters, or
            a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even when a row is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            if len(lineStr) < 32:
                raise ValueError(
                    "%s: row %d has fewer than 32 pixels" % (filename, i + 1))
            returnVect[0, 32 * i:32 * (i + 1)] = [int(ch) for ch in lineStr[:32]]
    return returnVect

# k-nearest-neighbours classifier.
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its nearest training rows.

    Squared Euclidean distance is used; it orders neighbours exactly as the
    true distance would, so no square root is taken. When several labels
    share the highest vote count, the one met first while walking the
    neighbours from nearest to farthest is returned.

    Args:
        inX: Feature vector to classify (anything that broadcasts against
            one row of ``dataSet``, e.g. a ``(1, 1024)`` image vector).
        dataSet: Two-dimensional NumPy array of training samples, one per row.
        labels: Sequence with the label of each row of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number
            of training samples are reduced to that number.

    Returns:
        The label with the most votes.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # There cannot be more voters than training samples.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row without building a tiled copy.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1)
    sortedDistIndices = distances.argsort()

    classCount = {}
    for neighbour in sortedDistIndices[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so ties keep nearest-first insertion order.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Handwritten-digit recognition test.
def handwritingClassTest():
    """Classify every file in 'testDigits' against those in 'trainingDigits'.

    File names encode the true digit before the first underscore, e.g.
    '7_200.txt' is a sample of the digit 7. Each training file becomes one
    1024-element row of the training matrix. Each test file is classified
    with k=3; the prediction and the true digit are printed, followed by
    the total number of errors and the error rate.
    """
    training_names = listdir('trainingDigits')
    hwLabels = []
    # One row per training file.
    trainingMat = zeros((len(training_names), 1024))
    for row, file_name in enumerate(training_names):
        stem = file_name.split('.')[0]
        hwLabels.append(int(stem.split('_')[0]))
        trainingMat[row, :] = img2vector('trainingDigits/%s' % file_name)

    test_names = listdir('testDigits')
    mistakes = 0
    for file_name in test_names:
        stem = file_name.split('.')[0]
        expected = int(stem.split('_')[0])
        sample = img2vector('testDigits/%s' % file_name)
        predicted = classify0(sample, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d,the real answer is: %d"
              % (predicted, expected))
        if predicted != expected:
            mistakes += 1
    print("\nthe total number of errors is: %d" % mistakes)
    print("\nthe total error rate is: %f" % (mistakes / float(len(test_names))))
    
    
def main():
    """Script entry point: run the handwritten-digit classification test."""
    handwritingClassTest()


if __name__ == "__main__":
    main()

0_0.txt數據預覽

00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。