主成分分析(PCA)實現代碼

摘自《機器學習實戰》(Machine Learning in Action)第13章:利用PCA來簡化數據

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from numpy import *
 4 
 5 def loadDataSet(fileName, delim='\t'):
 6     fr = open(fileName)
 7     stringArr = [line.strip().split(delim) for line in fr.readlines()]
 8     datArr = [map(float, line) for line in stringArr]
 9     return mat(datArr)
10 
def pca(dataMat, topNfeat=999999):
    """Project data onto its leading principal components.

    The data is mean-centred, the covariance matrix of the columns is
    eigen-decomposed, and the centred data is projected onto the eigenvectors
    belonging to the ``topNfeat`` largest eigenvalues.

    Args:
        dataMat: 2-D data with one sample per row and one feature per column
            (a ``numpy.matrix`` or anything ``np.asmatrix`` accepts).
        topNfeat: Number of principal components to keep.  Values larger than
            the number of features keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)`` of ``numpy.matrix`` objects:
        the centred data expressed in the reduced basis, and the data
        reconstructed in the original feature space from that projection.
    """
    # ``np.asmatrix`` replaces the ``mat`` alias, which NumPy 2.0 removed;
    # matrix semantics are kept so that ``*`` below is a matrix product and
    # callers still receive ``numpy.matrix`` results.
    dataMat = np.asmatrix(dataMat)
    meanVals = np.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals

    # rowvar=0: columns are the variables, rows are the observations.
    covMat = np.asmatrix(np.cov(meanRemoved, rowvar=0))

    # A covariance matrix is symmetric, so use the symmetric solver: unlike
    # ``eig`` it always returns real eigenvalues and orthonormal eigenvectors
    # (``eig`` may return complex values caused by round-off).
    eigVals, eigVects = np.linalg.eigh(covMat)

    # Indices of the eigenvalues from largest to smallest, truncated to the
    # requested number of components.
    eigValInd = np.argsort(eigVals)[::-1][:topNfeat]
    redEigVects = eigVects[:, eigValInd]

    lowDDataMat = meanRemoved * redEigVects
    # Map back to the original feature space and restore the mean.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat
22 
if __name__ == "__main__":
    # Demo: reduce the sample data to one principal component and plot the
    # original points against their reconstruction.
    dataMat = loadDataSet('testSet.txt')
    lowDMat, reconMat = pca(dataMat, 1)

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Original samples: large triangles.
    origX = dataMat[:, 0].flatten().A[0]
    origY = dataMat[:, 1].flatten().A[0]
    ax.scatter(origX, origY, marker='^', s=90)

    # Reconstructed samples: smaller red circles.
    reconX = reconMat[:, 0].flatten().A[0]
    reconY = reconMat[:, 1].flatten().A[0]
    ax.scatter(reconX, reconY, marker='o', s=50, c='red')

    plt.show()

 

輸入數據格式:文本文件,每行是兩個以TAB鍵分隔的浮點數。

附運行結果:(散點圖,三角形為原始數據,紅色圓點為僅保留第一主成分後重構的數據。)

相關文章
相關標籤/搜索