因爲近期學業繁重QAQ,因此我就不說廢話了,直接上代碼~git
from numpy import * import operator import matplotlib import matplotlib.pyplot as plt #將文件轉成numpy數組的函數 def file2matrix(filename): #打開文件 fr=open(filename) #將文件內容使用數組表示 arrayOLines=fr.readlines() #print('arrayOLines:') #print(arrayOLines) #數組的長度表示文件的行數 numberOfLine=len(arrayOLines) #print('numberOfLine:') #print(numberOfLine) #建立返回的NumPy矩陣,內容全爲0 returnMat=zeros((numberOfLine,3)) #print('returnMat:') #print(returnMat) classLabelVector=[] index=0 for line in arrayOLines: line=line.strip() listFromLine=line.split('\t') #print('listFromLine:') #print(listFromLine) returnMat[index,:]=listFromLine[0:3] #print('returnMat:') #print(+returnMat) classLabelVector.append(int(listFromLine[-1])) #print('classLabelVector:') #print(classLabelVector) index+=1 return returnMat,classLabelVector #根據數組繪圖的函數 def myDraw(datingDataMat,datingLabels): #創建一個畫布 fig=plt.figure() #在畫布中創建圖表 #fig.add_subplot()函數 #畫布分割成1行1列 ax=fig.add_subplot(111) ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 15.0*array(datingLabels),15.0*array(datingLabels)) plt.show() #歸一化特徵值的函數 #返回的是歸一化後的數組,取值範圍,每一列的最小值歸一化數據 def autoNorm(dataSet): minVals=dataSet.min(0) maxVals=dataSet.max(0) ranges=maxVals-minVals normDataSet=zeros(shape(dataSet)) m=dataSet.shape[0] normDataSet=dataSet-tile(minVals,(m,1)) normDataSet=normDataSet/tile(ranges,(m,1)) return normDataSet,ranges,minVals #使用k-近鄰算法進行分類 def classify0(inX,dataSet,labels,k): dataSetSize=dataSet.shape[0] #計算距離 diffMat=tile(inX,(dataSetSize,1))-dataSet sqDiffMat=diffMat**2 distances=sqDiffMat.sum(axis=1) sortedDisIndices=distances.argsort() classCount={} #選擇距離最小的k個點 for i in range(k): voteIlabel=labels[sortedDisIndices[i]] classCount[voteIlabel]=classCount.get(voteIlabel,0)+1 #排序 sortedClassCount=sorted(classCount.items(), key=operator.itemgetter(1),reverse=True) #返回發生頻率最高的元素標籤 return sortedClassCount[0][0] #將數據分爲訓練集與測試集 #對分類器分類效果進行測試 def datingClassTest(): #測試數據佔比 hoRatio=0.10 datingDataMat,datingLabels=file2matrix('datingTestSet2.txt') normMat,ranges,minVals=autoNorm(datingDataMat) m=normMat.shape[0] #m爲行數1000 #print('m:') #print(m) numTestVecs=int(m*hoRatio) #選取其中的100個進行測試 #print('numTestVecs:') #print(numTestVecs) errorCount=0.0 #print('normMat[numTestVecs:m,:]:') #print(normMat[numTestVecs:m,:]) #print('datingLabels[numTestVecs:m]:') #print(datingLabels[numTestVecs:m]) for i in range(numTestVecs): #print('i:') #print(i) classifierResult=classify0(normMat[i,:],normMat[numTestVecs:m,:], datingLabels[numTestVecs:m],3) print("the classifierResult came back with: %d,the real answer is: %d" %(classifierResult,datingLabels[i])) if(classifierResult!=datingLabels[i]): errorCount+=1.0 print("the total error rate is: %f"%(errorCount/float(numTestVecs))) myDraw(datingDataMat,datingLabels) #玩視頻遊戲所消耗的時間百分比 #每一年得到的飛行常客里程數 #每週消費的冰淇淋公升數 #預測函數 def classifyPerson(): resultList=['not at all','in small doses','in large deses'] percentTats=float(input("玩視頻遊戲所消耗的時間百分比?")) ffMiles=float(input("每一年得到的飛行常客里程數?")) iceCream=float(input("每週消費的冰淇淋公升數?")) datingDataMat,datingLabels=file2matrix('datingTestSet2.txt') normMat,ranges,minVals=autoNorm(datingDataMat) inArr=array([ffMiles,percentTats,iceCream]) classifierResult=classify0((inArr-minVals)/ranges,normMat,datingLabels,3) print("You will probably like this person: ",resultList[classifierResult-1]) def main(): classifyPerson() if __name__=='__main__': main()
40920 8.326976 0.953952 3 14488 7.153469 1.673904 2 26052 1.441871 0.805124 1 75136 13.147394 0.428964 1 38344 1.669788 0.134296 1 72993 10.141740 1.032955 1 35948 6.830792 1.213192 3 42666 13.276369 0.543880 3 67497 8.631577 0.749278 1 35483 12.273169 1.508053 3 50242 3.723498 0.831917 1 63275 8.385879 1.669485 1 5569 4.875435 0.728658 2 51052 4.680098 0.625224 1 77372 15.299570 0.331351 1 43673 1.889461 0.191283 1 61364 7.516754 1.269164 1 69673 14.239195 0.261333 1 15669 0.000000 1.250185 2 28488 10.528555 1.304844 3 6487 3.540265 0.822483 2 37708 2.991551 0.833920 1 22620 5.297865 0.638306 2 28782 6.593803 0.187108 3 19739 2.816760 1.686209 2 36788 12.458258 0.649617 3 5741 0.000000 1.656418 2 28567 9.968648 0.731232 3 6808 1.364838 0.640103 2 41611 0.230453 1.151996 1 36661 11.865402 0.882810 3 43605 0.120460 1.352013 1 15360 8.545204 1.340429 3 63796 5.856649 0.160006 1 10743 9.665618 0.778626 2 70808 9.778763 1.084103 1 72011 4.932976 0.632026 1 5914 2.216246 0.587095 2 14851 14.305636 0.632317 3 33553 12.591889 0.686581 3 44952 3.424649 1.004504 1 17934 0.000000 0.147573 2 27738 8.533823 0.205324 3 29290 9.829528 0.238620 3 42330 11.492186 0.263499 3 36429 3.570968 0.832254 1 39623 1.771228 0.207612 1 32404 3.513921 0.991854 1 27268 4.398172 0.975024 1 5477 4.276823 1.174874 2 14254 5.946014 1.614244 2 68613 13.798970 0.724375 1 41539 10.393591 1.663724 3 7917 3.007577 0.297302 2 21331 1.031938 0.486174 2 8338 4.751212 0.064693 2 5176 3.692269 1.655113 2 18983 10.448091 0.267652 3 68837 10.585786 0.329557 1 13438 1.604501 0.069064 2 48849 3.679497 0.961466 1 12285 3.795146 0.696694 2 7826 2.531885 1.659173 2 5565 9.733340 0.977746 2 10346 6.093067 1.413798 2 1823 7.712960 1.054927 2 9744 11.470364 0.760461 3 16857 2.886529 0.934416 2 39336 10.054373 1.138351 3 65230 9.972470 0.881876 1 2463 2.335785 1.366145 2 27353 11.375155 1.528626 3 16191 0.000000 0.605619 2 12258 4.126787 0.357501 2 42377 6.319522 1.058602 1 25607 8.680527 0.086955 3 77450 14.856391 1.129823 1 58732 2.454285 0.222380 1 46426 7.292202 0.548607 3 32688 8.745137 0.857348 3 64890 8.579001 0.683048 1 8554 2.507302 0.869177 2 28861 11.415476 1.505466 3 42050 4.838540 1.680892 1 32193 10.339507 0.583646 3 64895 6.573742 1.151433 1 2355 6.539397 0.462065 2 0 2.209159 0.723567 2 70406 11.196378 0.836326 1 57399 4.229595 0.128253 1 41732 9.505944 0.005273 3 11429 8.652725 1.348934 3 75270 17.101108 0.490712 1 5459 7.871839 0.717662 2 73520 8.262131 1.361646 1 40279 9.015635 1.658555 3 21540 9.215351 0.806762 3 17694 6.375007 0.033678 2 22329 2.262014 1.022169 1 46570 5.677110 0.709469 1 ...
from numpy import * import operator from os import listdir #將二維32X32的圖像, #轉換成一個1X1024的向量 #方便使用以前的分類器 def img2vector(filename): returnVect=zeros((1,1024)) fr=open(filename) for i in range(32): lineStr=fr.readline() for j in range(32): returnVect[0,32*i+j]=int(lineStr[j]) return returnVect #使用k-近鄰算法進行分類 def classify0(inX,dataSet,labels,k): dataSetSize=dataSet.shape[0] #計算距離 diffMat=tile(inX,(dataSetSize,1))-dataSet sqDiffMat=diffMat**2 distances=sqDiffMat.sum(axis=1) sortedDisIndices=distances.argsort() classCount={} #選擇距離最小的k個點 for i in range(k): voteIlabel=labels[sortedDisIndices[i]] classCount[voteIlabel]=classCount.get(voteIlabel,0)+1 #排序 sortedClassCount=sorted(classCount.items(), key=operator.itemgetter(1),reverse=True) #返回發生頻率最高的元素標籤 return sortedClassCount[0][0] #手寫數字識別系統 def handwritingClassTest(): #標籤列表 hwLabels=[] #獲取目錄內容 trainingFileList=listdir('trainingDigits') m=len(trainingFileList) #以文件夾中的文件個數爲行數 #將每一個文件中的內容轉換成一個1X1024的向量 #矩陣的每一行表明一個文件中的全部內容 trainingMat=zeros((m,1024)) #從文件名解析分類數字 #7_200.txt表示數字7的第200個實例 for i in range(m): #獲取文件名 fileNameStr=trainingFileList[i] fileStr=fileNameStr.split('.')[0] classNumStr=int(fileStr.split('_')[0]) hwLabels.append(classNumStr) trainingMat[i,:]=img2vector('trainingDigits/%s'%fileNameStr) testFileList=listdir('testDigits') errorCount=0.0 mTest=len(testFileList) for i in range(mTest): fileNameStr=testFileList[i] fileStr=fileNameStr.split('.')[0] classNumStr=int(fileStr.split('_')[0]) vectorUnderTest=img2vector('testDigits/%s'%fileNameStr) classifierResult=classify0(vectorUnderTest,trainingMat,hwLabels,3) print("the classifier came back with: %d,the real answer is: %d" %(classifierResult,classNumStr)) if(classifierResult!=classNumStr): errorCount+=1.0 print("\nthe total number of errors is: %d"%errorCount) print("\nthe total error rate is: %f"%(errorCount/float(mTest))) def main(): #testVector=img2vector('./MLiA_SourceCode/machinelearninginaction/Ch02/digits/testDigits/0_13.txt') #print('testVector:') #print(testVector[0,0:31]) handwritingClassTest() if __name__=='__main__': main()
00000000000001111000000000000000 00000000000011111110000000000000 00000000001111111111000000000000 00000001111111111111100000000000 00000001111111011111100000000000 00000011111110000011110000000000 00000011111110000000111000000000 00000011111110000000111100000000 00000011111110000000011100000000 00000011111110000000011100000000 00000011111100000000011110000000 00000011111100000000001110000000 00000011111100000000001110000000 00000001111110000000000111000000 00000001111110000000000111000000 00000001111110000000000111000000 00000001111110000000000111000000 00000011111110000000001111000000 00000011110110000000001111000000 00000011110000000000011110000000 00000001111000000000001111000000 00000001111000000000011111000000 00000001111000000000111110000000 00000001111000000001111100000000 00000000111000000111111000000000 00000000111100011111110000000000 00000000111111111111110000000000 00000000011111111111110000000000 00000000011111111111100000000000 00000000001111111110000000000000 00000000000111110000000000000000 00000000000011000000000000000000