"""PCA helpers: load a delimited numeric file, fill NaNs, project onto top eigenvectors."""
from numpy import *

try:
    import matplotlib.pyplot as plt
except ImportError:
    # None of the functions defined below call plt, so the numerical helpers
    # remain importable on machines without matplotlib.
    plt = None


def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a float matrix.

    Args:
        fileName: Path of the text file to read.
        delim: Separator between the values on a line (default: tab).

    Returns:
        A numpy matrix with one row per non-blank line of the file.

    Raises:
        ValueError: If a field cannot be parsed by ``float``.
    """
    datArr = []
    # The context manager guarantees the handle is closed even when a field
    # fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise reach float('')
                # and raise ValueError.
                continue
            datArr.append([float(field) for field in stripped.split(delim)])
    # asmatrix is what the old `mat` alias pointed to; `mat` no longer exists
    # in NumPy 2.x.
    return asmatrix(datArr)


def replaceNanWithMean(fileName='secom.data', delim=' '):
    """Load a data file and replace each NaN with the mean of its column.

    Args:
        fileName: Path of the file to load (default: ``'secom.data'``).
        delim: Field separator used in the file (default: a single space).

    Returns:
        The loaded matrix, with every NaN replaced by the mean of the non-NaN
        entries of the same column. A column made up entirely of NaN has no
        defined mean and is left as NaN.
    """
    datMat = loadDataSet(fileName, delim)
    # ndarray view over the matrix's own memory: writes through `values`
    # update `datMat`, and 1-D column slicing avoids matrix indexing quirks.
    values = asarray(datMat)
    for i in range(values.shape[1]):
        column = values[:, i]
        missing = isnan(column)
        if not missing.any() or missing.all():
            # Nothing to fill, or no valid entries to average.
            continue
        column[missing] = column[~missing].mean()
    return datMat


def pca(dataMat, topNfeat=9999999):
    """Project data onto its ``topNfeat`` principal components.

    Args:
        dataMat: 2-D data with one sample per row and one feature per column
            (a numpy matrix, or anything ``asmatrix`` accepts).
        topNfeat: Number of leading components to keep; the default is large
            enough to keep all of them for any realistic feature count.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the retained eigenvector basis, and that projection mapped back to
        the original feature space with the column means added again.
    """
    dataMat = asmatrix(dataMat)
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # rowvar=0: columns are the variables, rows are the observations.
    covMat = cov(meanRemoved, rowvar=0)
    # NOTE(review): eigh would suit a symmetric covariance matrix, but eig is
    # kept so eigenvector signs/ordering match the existing behaviour.
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    # argsort is ascending; walking backwards from the end for topNfeat steps
    # yields the indices of the largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def main():
    """Print the covariance eigenvalues of the NaN-filled default data set."""
    datMat = replaceNanWithMean()
    meanVals = mean(datMat, axis=0)
    meanRemoved = datMat - meanVals
    covMat = cov(meanRemoved, rowvar=0)
    eigVals, eigVects = linalg.eig(asmatrix(covMat))
    print('eigVals', eigVals)


if __name__ == '__main__':
    main()