因爲近期學業繁重QAQ,因此我就不說廢話了,直接上代碼~
"""k-nearest-neighbours classifier applied to the dating data set."""
import operator

import numpy as np

try:
    # Plotting is only needed by myDraw(); keep the classifier importable on
    # machines that do not have matplotlib installed.
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line must contain at least three numeric feature
    columns followed by an integer class label in the last column.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array of shape
        ``(n, 3)`` and a list of ``n`` integer labels.

    Raises:
        ValueError: if a non-blank line has fewer than four columns or a
            column cannot be converted to a number.
    """
    features = []
    classLabelVector = []
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            listFromLine = line.split('\t')
            if len(listFromLine) < 4:
                raise ValueError(
                    "%s:%d: expected 3 features and a label, got %d column(s)"
                    % (filename, lineNumber, len(listFromLine)))
            features.append([float(value) for value in listFromLine[0:3]])
            classLabelVector.append(int(listFromLine[-1]))
    # reshape keeps the (0, 3) shape for a file without samples.
    returnMat = np.array(features, dtype=float).reshape(-1, 3)
    return returnMat, classLabelVector


def myDraw(datingDataMat, datingLabels):
    """Scatter-plot the first two feature columns, styled by class label.

    The label (times 15) is used both as the marker size and as the colour
    value, so each class gets its own size and colour.

    Raises:
        RuntimeError: if matplotlib is not installed.
    """
    if plt is None:
        raise RuntimeError("matplotlib is required for myDraw()")
    fig = plt.figure()
    # A single chart occupying the whole 1x1 grid of the figure.
    ax = fig.add_subplot(111)
    scaledLabels = 15.0 * np.array(datingLabels)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               scaledLabels, scaledLabels)
    plt.show()


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Args:
        dataSet: 2-D array-like, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        ``(dataSet - minVals) / ranges``. A column whose values are all equal
        gets a range of 1 (instead of 0) so it normalises to 0 rather than
        NaN; the returned ``ranges`` carries that substitution so callers can
        normalise new samples the same way.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)
    ranges = dataSet.max(0) - minVals
    # Avoid 0/0 for constant columns.
    ranges[ranges == 0] = 1.0
    # Broadcasting applies the per-column values to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify.
        dataSet: 2-D array-like of training samples, one per row.
        labels: sequence with the class label of each training row.
        k: number of neighbours that vote; values larger than the number of
            training rows are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the tied label whose first
        vote came from the nearest neighbour wins.

    Raises:
        ValueError: if ``dataSet`` is empty or ``k`` is smaller than 1.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    k = min(k, dataSetSize)
    if k < 1:
        raise ValueError("classify0 needs a non-empty dataSet and k >= 1")
    # Squared Euclidean distance; the square root is skipped because it does
    # not change the ordering of the neighbours.
    diffMat = dataSet - np.asarray(inX, dtype=float)
    distances = (diffMat ** 2).sum(axis=1)
    classCount = {}
    for neighbourIndex in distances.argsort()[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first maximal entry, i.e. insertion order on ties.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``hoRatio`` fraction of the rows is classified against the
    remaining rows; every prediction and the total error rate are printed,
    then the raw data is plotted with myDraw().

    Args:
        hoRatio: fraction of the rows used as test vectors.
        filename: data file to load.
        k: number of neighbours used by the classifier.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        print("no test vectors selected; nothing to evaluate")
        return
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print("the classifierResult came back with: %d,the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    myDraw(datingDataMat, datingLabels)


def classifyPerson():
    """Ask for one person's three features and print the predicted class.

    The prompts ask for: percentage of time spent playing video games,
    frequent-flyer miles earned per year, and litres of ice cream consumed
    per week.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("玩視頻遊戲所消耗的時間百分比?"))
    ffMiles = float(input("每一年得到的飛行常客里程數?"))
    iceCream = float(input("每週消費的冰淇淋公升數?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Column order must match the data file: miles, game time, ice cream.
    inArr = np.array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels in the data file are 1-based.
    print("You will probably like this person: ",
          resultList[classifierResult - 1])


def main():
    """Run the interactive prediction."""
    classifyPerson()


if __name__ == '__main__':
    main()
40920 8.326976 0.953952 3 14488 7.153469 1.673904 2 26052 1.441871 0.805124 1 75136 13.147394 0.428964 1 38344 1.669788 0.134296 1 72993 10.141740 1.032955 1 35948 6.830792 1.213192 3 42666 13.276369 0.543880 3 67497 8.631577 0.749278 1 35483 12.273169 1.508053 3 50242 3.723498 0.831917 1 63275 8.385879 1.669485 1 5569 4.875435 0.728658 2 51052 4.680098 0.625224 1 77372 15.299570 0.331351 1 43673 1.889461 0.191283 1 61364 7.516754 1.269164 1 69673 14.239195 0.261333 1 15669 0.000000 1.250185 2 28488 10.528555 1.304844 3 6487 3.540265 0.822483 2 37708 2.991551 0.833920 1 22620 5.297865 0.638306 2 28782 6.593803 0.187108 3 19739 2.816760 1.686209 2 36788 12.458258 0.649617 3 5741 0.000000 1.656418 2 28567 9.968648 0.731232 3 6808 1.364838 0.640103 2 41611 0.230453 1.151996 1 36661 11.865402 0.882810 3 43605 0.120460 1.352013 1 15360 8.545204 1.340429 3 63796 5.856649 0.160006 1 10743 9.665618 0.778626 2 70808 9.778763 1.084103 1 72011 4.932976 0.632026 1 5914 2.216246 0.587095 2 14851 14.305636 0.632317 3 33553 12.591889 0.686581 3 44952 3.424649 1.004504 1 17934 0.000000 0.147573 2 27738 8.533823 0.205324 3 29290 9.829528 0.238620 3 42330 11.492186 0.263499 3 36429 3.570968 0.832254 1 39623 1.771228 0.207612 1 32404 3.513921 0.991854 1 27268 4.398172 0.975024 1 5477 4.276823 1.174874 2 14254 5.946014 1.614244 2 68613 13.798970 0.724375 1 41539 10.393591 1.663724 3 7917 3.007577 0.297302 2 21331 1.031938 0.486174 2 8338 4.751212 0.064693 2 5176 3.692269 1.655113 2 18983 10.448091 0.267652 3 68837 10.585786 0.329557 1 13438 1.604501 0.069064 2 48849 3.679497 0.961466 1 12285 3.795146 0.696694 2 7826 2.531885 1.659173 2 5565 9.733340 0.977746 2 10346 6.093067 1.413798 2 1823 7.712960 1.054927 2 9744 11.470364 0.760461 3 16857 2.886529 0.934416 2 39336 10.054373 1.138351 3 65230 9.972470 0.881876 1 2463 2.335785 1.366145 2 27353 11.375155 1.528626 3 16191 0.000000 0.605619 2 12258 4.126787 0.357501 2 42377 6.319522 1.058602 1 25607 8.680527 0.086955 3 77450 14.856391 
1.129823 1 58732 2.454285 0.222380 1 46426 7.292202 0.548607 3 32688 8.745137 0.857348 3 64890 8.579001 0.683048 1 8554 2.507302 0.869177 2 28861 11.415476 1.505466 3 42050 4.838540 1.680892 1 32193 10.339507 0.583646 3 64895 6.573742 1.151433 1 2355 6.539397 0.462065 2 0 2.209159 0.723567 2 70406 11.196378 0.836326 1 57399 4.229595 0.128253 1 41732 9.505944 0.005273 3 11429 8.652725 1.348934 3 75270 17.101108 0.490712 1 5459 7.871839 0.717662 2 73520 8.262131 1.361646 1 40279 9.015635 1.658555 3 21540 9.215351 0.806762 3 17694 6.375007 0.033678 2 22329 2.262014 1.022169 1 46570 5.677110 0.709469 1 ...
"""k-nearest-neighbours recognition of handwritten digits stored as text."""
import operator
from os import listdir
from os.path import join

import numpy as np


def img2vector(filename):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The file must contain at least 32 lines whose first 32 characters are
    digits; row ``i`` of the image fills positions ``32*i .. 32*i+31``, so
    the existing classifier can work on it.

    Args:
        filename: path of the text image.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        ValueError: if a row is shorter than 32 characters or contains a
            non-digit character.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline().rstrip('\r\n')
            if len(lineStr) < 32:
                raise ValueError(
                    "%s: row %d has %d character(s), expected 32"
                    % (filename, i + 1, len(lineStr)))
            returnVect[0, 32 * i:32 * (i + 1)] = [int(ch)
                                                  for ch in lineStr[:32]]
    return returnVect


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify (shape ``(d,)`` or ``(1, d)``).
        dataSet: 2-D array-like of training samples, one per row.
        labels: sequence with the class label of each training row.
        k: number of neighbours that vote; values larger than the number of
            training rows are clamped to that number.

    Returns:
        The label with the most votes. On a tie, the tied label whose first
        vote came from the nearest neighbour wins.

    Raises:
        ValueError: if ``dataSet`` is empty or ``k`` is smaller than 1.
    """
    dataSet = np.asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    k = min(k, dataSetSize)
    if k < 1:
        raise ValueError("classify0 needs a non-empty dataSet and k >= 1")
    # Squared Euclidean distance; the square root is skipped because it does
    # not change the ordering of the neighbours.
    diffMat = dataSet - np.asarray(inX, dtype=float)
    distances = (diffMat ** 2).sum(axis=1)
    classCount = {}
    for neighbourIndex in distances.argsort()[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # max() returns the first maximal entry, i.e. insertion order on ties.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def _digitFiles(dirname):
    """Return the sorted names of the '.txt' image files in ``dirname``."""
    # Skips stray entries such as '.DS_Store' that have no digit label.
    return sorted(name for name in listdir(dirname) if name.endswith('.txt'))


def _labelFromName(fileNameStr):
    """Parse the digit from a name like '7_200.txt' (200th sample of 7)."""
    return int(fileNameStr.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits',
                         k=3):
    """Evaluate the digit classifier and print its error count and rate.

    Every image in ``trainingDir`` becomes one row of the training matrix;
    every image in ``testDir`` is classified against it and compared with
    the digit encoded in its file name.

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the test images.
        k: number of neighbours used by the classifier.
    """
    trainingFileList = _digitFiles(trainingDir)
    hwLabels = [_labelFromName(name) for name in trainingFileList]
    # One row per training file, one column per pixel.
    trainingMat = np.zeros((len(trainingFileList), 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        trainingMat[i, :] = img2vector(join(trainingDir, fileNameStr))

    testFileList = _digitFiles(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        print("no test files found in %s" % testDir)
        return
    errorCount = 0
    for fileNameStr in testFileList:
        classNumStr = _labelFromName(fileNameStr)
        vectorUnderTest = img2vector(join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat,
                                     hwLabels, k)
        print("the classifier came back with: %d,the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))


def main():
    """Run the handwriting recognition test."""
    handwritingClassTest()


if __name__ == '__main__':
    main()
