飛行里程數 | 遊戲耗時百分比 | 冰淇淋公升數 | 分類結果 |
40920 | 8.326976 | 0.953952 | 3 |
14488 | 7.153469 | 1.673904 | 2 |
26052 | 1.441871 | 0.805124 | 1 |
...... | ...... | ...... | ...... |
數據在datingTestSet2.txt文件中的格式如上表所示。解析該文件的file2matrix函數如下:
from numpy import *


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and class labels.

    Each non-blank line is split on tabs: the first three fields are stored
    as floats in one row of the matrix and the last field is stored as an
    integer class label.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of ``n``
        ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a feature or label field is not numeric.
    """
    # Read the file once and let the context manager close it; blank lines
    # (e.g. a trailing empty line) are skipped instead of aborting the parse.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))  # one row per sample, three features
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
使用file2matrix讀取文件數據時,必須確保待解析文件存儲在當前的工作目錄中。導入數據之後,簡單檢查一下數據格式:
# Interactive session: load the data set via kNN.file2matrix, then show the
# first six feature rows and the first six class labels.
>>>import kNN
>>>datingDataMat,datingLabels = kNN.file2matrix('datingTestSet2.txt')
>>>datingDataMat[0:6]
array([[ 4.09200000e+04, 8.32697600e+00, 9.53952000e-01],
       [ 1.44880000e+04, 7.15346900e+00, 1.67390400e+00],
       [ 2.60520000e+04, 1.44187100e+00, 8.05124000e-01],
       [ 7.51360000e+04, 1.31473940e+01, 4.28964000e-01],
       [ 3.83440000e+04, 1.66978800e+00, 1.34296000e-01],
       [ 7.29930000e+04, 1.01417400e+01, 1.03295500e+00]])
>>> datingLabels[0:6]
[3, 2, 1, 1, 1, 1]
分析數據:使用Matplotlib創建散點圖
# Interactive session: scatter-plot matrix column 1 against column 2, with no
# per-class styling applied.
>>> import matplotlib
>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111)
>>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2])
<matplotlib.collections.PathCollection object at 0x0000019E14C9A470>
>>> plt.show()
>>>
生成的散點圖如下:
import matplotlib
import numpy as np
from numpy import *
from matplotlib import pyplot as plt


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and class labels.

    Each non-blank line is split on tabs: the first three fields become one
    float row of the matrix and the last field becomes an integer label.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        and a list of ``n`` int labels.

    Raises:
        ValueError: If a feature or label field is not numeric.
    """
    # Single pass over the file; the context manager closes the handle and
    # blank lines are skipped rather than aborting the parse.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

# Plot matrix column 1 against column 2 without any per-class styling.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2])
plt.show()
上圖由於沒有使用樣本分類的特徵值,很難看到任何有用的數據模式信息。爲了更好地理解數據信息,可以利用Matplotlib庫提供的scatter函數個性化標記散點圖上的點。調用scatter函數時使用下列參數:
# Drive both the marker size (s) and the marker colour (c) from the class
# label, scaled by 15, so that the classes can be told apart.
label_scale = 15.0 * array(datingLabels)
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2], s=label_scale, c=label_scale)
生成的散點圖如下:
import matplotlib
import numpy as np
from numpy import *
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and class labels.

    Each non-blank line is split on tabs: the first three fields become one
    float row of the matrix and the last field becomes an integer label.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        and a list of ``n`` int labels.

    Raises:
        ValueError: If a feature or label field is not numeric.
    """
    # Single pass over the file; the context manager closes the handle and
    # blank lines are skipped rather than aborting the parse.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


# A font containing CJK glyphs is required for the Chinese axis labels and
# legend; this path points at SimSun in the Windows font directory.
zhfont = FontProperties(fname='C:/Windows/Fonts/simsun.ttc', size=12)
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')

# Create a single figure; calling plt.figure() a second time would open an
# additional, empty window when plt.show() runs.
fig = plt.figure(figsize=(8, 5), dpi=80)
ax = fig.add_subplot(111)

datingLabels = np.array(datingLabels)

# (class id, legend label, marker, colour, marker size) for each class.
class_styles = (
    (1, '1', '*', 'r', 10),
    (2, '2', 'o', 'g', 20),
    (3, '3', '+', 'b', 30),
)
handles = []
for class_id, label, marker, color, size in class_styles:
    # Boolean mask selecting the rows that belong to this class.
    mask = datingLabels == class_id
    handles.append(
        ax.scatter(
            datingDataMat[mask, 0],
            datingDataMat[mask, 1],
            marker=marker,
            color=color,
            label=label,
            s=size,
        )
    )
p1, p2, p3 = handles

plt.xlabel(u'每一年獲取的飛行里程數', fontproperties=zhfont)
# Label corrected to say "time" (時間); the column holds a time percentage.
plt.ylabel(u'玩視頻遊戲所消耗的時間百分比', fontproperties=zhfont)
ax.legend((p1, p2, p3), (u'不喜歡', u'魅力通常', u'極具魅力'), loc=2, prop=zhfont)
plt.show()
生成的散點圖如下:
第二種方法:
import matplotlib
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import font_manager


def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and class labels.

    Each non-blank line is split on tabs: the first three fields become one
    float row of the matrix and the last field becomes an integer label.

    Args:
        filename: Path of the text file to parse.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        and a list of ``n`` int labels.

    Raises:
        ValueError: If a feature or label field is not numeric.
    """
    # Single pass over the file; the context manager closes the handle and
    # blank lines are skipped rather than aborting the parse.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    # numpy is imported explicitly above; this script has no star import,
    # so a bare ``zeros`` would be undefined here.
    returnMat = np.zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


matrix, labels = file2matrix('datingTestSet2.txt')

# A font containing CJK glyphs is required for the Chinese axis labels and
# legend; this path points at SimSun in the Windows font directory.
zhfont = font_manager.FontProperties(fname='C:/Windows/Fonts/simsun.ttc', size=12)

plt.figure(figsize=(8, 5), dpi=80)
axes = plt.subplot(111)

# Split the samples into the three classes.
# x axis: matrix column 0 (flight miles); y axis: matrix column 1
# (video-game percentage).
# Class ids: 1 = 不喜歡, 2 = 魅力通常, 3 = 極具魅力 (matching the legend below).
points_by_class = {1: ([], []), 2: ([], []), 3: ([], [])}
for row, label in zip(matrix, labels):
    if label in points_by_class:
        xs, ys = points_by_class[label]
        xs.append(row[0])
        ys.append(row[1])

type1_x, type1_y = points_by_class[1]
type2_x, type2_y = points_by_class[2]
type3_x, type3_y = points_by_class[3]

type1 = axes.scatter(type1_x, type1_y, s=20, c='red')
type2 = axes.scatter(type2_x, type2_y, s=40, c='green')
type3 = axes.scatter(type3_x, type3_y, s=50, c='blue')

plt.xlabel(u'每一年獲取的飛行里程數', fontproperties=zhfont)
# Label corrected to say "time" (時間); the column holds a time percentage.
plt.ylabel(u'玩視頻遊戲所消耗的時間百分比', fontproperties=zhfont)
axes.legend((type1, type2, type3), (u'不喜歡', u'魅力通常', u'極具魅力'), loc=2, prop=zhfont)
plt.show()
生成的散點圖如下:
# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import matplotlib

# Build a FontProperties object from an explicit font file and pass it to
# the label call so that the Chinese text is drawn with that font.
font_path = 'C:/Windows/Fonts/simsun.ttc'
zhfont1 = matplotlib.font_manager.FontProperties(fname=font_path)
plt.xlabel(u"橫座標xlabel", fontproperties=zhfont1)
到C:\Windows\Fonts\中找到新宋體對應的字體文件simsun.ttf(Windows 8和Windows 10系統是simsun.ttc,也可以使用其他字體)。