# 機器學習：PCA 數據降維

from numpy import *
import matplotlib.pyplot as plt

def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Each non-blank line becomes one matrix row; surrounding whitespace is
    stripped from the line, it is split on ``delim``, and every field is
    converted with ``float`` (so tokens such as ``NaN`` are accepted).

    Args:
        fileName: Path of the text file to read.
        delim: Field separator used within each line (tab by default).

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    datArr = []
    # The context manager closes the handle even when a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no data and
                # would otherwise fail in float('').
                continue
            datArr.append([float(field) for field in stripped.split(delim)])
    # asmatrix is what the `mat` alias pointed to; `mat` itself was removed
    # in NumPy 2.0.
    return asmatrix(datArr)
def replaceNanWithMean(fileName='secom.data', delim=' '):
    """Load a data file and replace NaN entries with their column mean.

    The file is read with ``loadDataSet``. For every column, the mean of
    the non-NaN entries is computed and written into the NaN positions of
    that column.

    Args:
        fileName: Path of the data file; defaults to ``'secom.data'``, the
            file this function originally hard-coded.
        delim: Field separator passed to ``loadDataSet``; defaults to a
            single space.

    Returns:
        The loaded matrix with the NaN entries overwritten in place.
    """
    datMat = loadDataSet(fileName, delim)
    for col in range(shape(datMat)[1]):
        # Compute the NaN mask once; it is needed both to pick the valid
        # rows for the mean and to pick the rows to overwrite.
        missing = isnan(datMat[:, col])
        meanVal = mean(datMat[nonzero(~missing)[0], col])
        datMat[nonzero(missing)[0], col] = meanVal
    return datMat
def pca(dataMat,topNfeat=9999999):
    """Project data onto its top principal components and reconstruct it.

    The column means are removed, the covariance of the columns is
    eigen-decomposed, and the eigenvectors belonging to the ``topNfeat``
    largest eigenvalues are used as the new basis.

    Args:
        dataMat: Data with one sample per row and one feature per column
            (the covariance is computed with ``rowvar=0``).
            NOTE(review): the ``*`` products below rely on NumPy matrix
            semantics; callers in this file pass a ``numpy.matrix``.
        topNfeat: Number of leading components to keep. The default is
            large enough that the slice keeps every available component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-removed data
        expressed in the reduced basis, and that projection mapped back
        to the original feature space with the means added back.
        The sign of each component is arbitrary, as is usual for PCA.
    """
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    covMat = cov(meanRemoved, rowvar=0)
    # A covariance matrix is symmetric, so use the symmetric solver: eigh
    # always returns real eigenpairs, whereas the general eig routine can
    # return complex values with round-off imaginary parts.
    # asmatrix replaces the `mat` alias that NumPy 2.0 removed.
    eigVals, eigVects = linalg.eigh(asmatrix(covMat))
    # argsort is ascending; the reversed slice takes the indices of the
    # topNfeat largest eigenvalues, largest first.
    eigValInd = argsort(eigVals)[:-(topNfeat + 1):-1]
    redEigVects = eigVects[:, eigValInd]
    lowDDataMat = meanRemoved * redEigVects
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat

if __name__ =='__main__':
    # Script entry point: load the data with NaNs replaced by column means
    # and print the eigenvalues of its covariance matrix, so the spread of
    # variance across components can be inspected.
    datMat=replaceNanWithMean()
    meanVals=mean(datMat,axis=0)
    meanRemoved=datMat-meanVals
    covMat=cov(meanRemoved,rowvar=0)
    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it returns real eigenvalues (in ascending order) instead of possibly
    # complex ones. asmatrix replaces `mat`, which NumPy 2.0 removed.
    eigVals,eigVects=linalg.eigh(asmatrix(covMat))
    print('eigVals',eigVals)
# 相關文章
# 相關標籤/搜索