雖然可以把文本(text)中的所有數據量化,可是仍然須要把文本轉成 NumPy 數組的形式(這個是必須掌握的)。
在將數據輸入到分類器以前,必須將待處理數據的格式改變爲分類器能夠接受的格式。
數據規範化、數據歸一化、數據算法化、輸出偏差分析。
代碼:
# -*- coding:utf-8 -*-
# NOTE: the wildcard import is kept on purpose -- code further down this file
# uses the bare NumPy names (zeros, tile, shape, array) that it provides.
from numpy import *


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line is stripped and split on tabs.  The first three
    fields are converted to floats and stored as one row of the matrix; the
    last field is converted to an int and appended to the label list.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        array of shape (number of non-blank lines, 3) and
        ``classLabelVector`` is a list of int labels in file order.

    Raises:
        ValueError: if a feature field is not a valid float or the last
            field is not a valid int.
    """
    # Read the file once and let the context manager close it; blank lines
    # (for example a trailing newline at end of file) are skipped instead of
    # crashing the conversion below.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))  # prepare matrix to return
    classLabelVector = []              # prepare labels to return
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector


# The demo below only runs when the file is executed as a script, so merely
# importing the module neither needs the data file nor opens a plot window.
# The names assigned here are still module-level globals in script mode, so
# later top-level code can keep using datingDataMat / datingLabels.
if __name__ == "__main__":
    # All results are quantified: the like/dislike categories are ranked 1, 2, 3.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

    # matplotlib is Python's best-known plotting library.  It offers a set of
    # MATLAB-like commands, is well suited to interactive plotting, and can
    # also be embedded as a plotting widget in GUI applications.
    import matplotlib
    import matplotlib.pyplot as plt

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Columns 1 and 2 are plotted against each other; the scaled labels are
    # passed as the third and fourth positional arguments of scatter (marker
    # size and colour), so each class is drawn differently.
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()
import operator

import numpy as np


def _nonzero_ranges(ranges):
    """Return ``ranges`` as floats with every zero entry replaced by 1.0."""
    ranges = np.asarray(ranges, dtype=float)
    return np.where(ranges == 0, 1.0, ranges)


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with (value - min) / (max - min).

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the rescaled data, the
        per-column ``max - min`` and the per-column minimum.  ``ranges`` is
        returned unmodified, so it contains 0 for a constant column.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column vectors to every row, so no tiled
    # copies are needed.  A constant column has a range of 0; dividing by 1
    # instead maps it to 0 rather than NaN.
    normDataSet = (dataSet - minVals) / _nonzero_ranges(ranges)
    return normDataSet, ranges, minVals


# Script-only step: datingDataMat is created by the demo code earlier in the
# file, so this is skipped when the module is merely imported.
if __name__ == "__main__":
    normMat, ranges, minVals = autoNorm(datingDataMat)


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: feature vector to classify.
        dataSet: 2-D NumPy array of training samples, one per row.
        labels: sequence of labels, ``labels[i]`` belonging to ``dataSet[i]``.
        k: number of neighbours that vote.  Values larger than the number of
            training samples are capped at that number.

    Returns:
        The label with the most votes among the nearest neighbours.

    Raises:
        IndexError: if no neighbour votes (empty ``dataSet`` or ``k`` <= 0).
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every training row.
    distances = ((dataSet - inX) ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()
    # Never index past the end of the training set.
    neighbourCount = k if k < dataSetSize else dataSetSize
    classCount = {}
    for i in range(neighbourCount):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # items() exists on both Python 2 and Python 3 (iteritems() does not).
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def datingClassTest():
    """Hold out the first 10% of the dating data and print the error rate.

    Loads 'datingTestSet2.txt', normalises it, classifies each held-out row
    against the remaining rows with k = 4 and prints every prediction
    followed by the overall error rate.
    """
    hoRatio = 0.10
    ErrorCount = 0.0
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    count = int(m * hoRatio)  # must be an int: it is used as a slice bound
    # Rows [0, count) are the test samples; rows [count, m) together with
    # their labels form the training set handed to the classifier.
    trainingMat = normMat[count:m, :]
    trainingLabels = datingLabels[count:m]
    for i in range(count):
        # normMat[i, :] is the i-th normalised row, used as the test sample.
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, 4)
        print("the classifier came back with:%d,the real answer is :%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            ErrorCount += 1.0
    # With fewer than 10 rows there are no test samples; report 0 instead of
    # dividing by zero.
    errorRate = ErrorCount / float(count) if count else 0.0
    print("the total error rate is :%f" % errorRate)


def classifyPerson():
    """Prompt for three features and print the predicted preference.

    Reads three numbers from standard input, normalises them with the
    statistics of 'datingTestSet2.txt' and classifies them with k = 3.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    try:
        readLine = raw_input  # Python 2
    except NameError:
        readLine = input      # Python 3
    # float() fixes the type of each answer.
    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Pack the answers into an array so they can be normalised like the
    # training data, with the same zero-range protection.
    inArr = np.array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / _nonzero_ranges(ranges),
                                 normMat, datingLabels, 3)
    # Labels are 1-based; resultList is 0-based.
    print("You will probably like this person: %s"
          % resultList[classifierResult - 1])