'''
Principal component analysis (PCA) helpers plus a small scatter-plot utility.

@author: Garvin
'''
from numpy import *

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: loadDataSet() and pca() only need numpy, so do not
    # make the whole module unusable when matplotlib is not installed.
    plt = None


def loadDataSet(fileName, delim='\t'):
    """Read a delimited text file of numbers into a numpy matrix.

    Args:
        fileName: Path of the text file; one sample per line.
        delim: Field separator used on each line (tab by default).

    Returns:
        A numpy ``matrix`` of floats with one row per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        # Build real lists of floats: on Python 3 ``map`` is lazy, and a list
        # of map objects would turn into an object-dtype matrix.
        # Blank lines (e.g. a trailing newline) are skipped instead of
        # crashing on float('').
        datArr = [
            [float(field) for field in line.strip().split(delim)]
            for line in fr
            if line.strip()
        ]
    return mat(datArr)


def pca(dataMat, topNfeat=9999999):
    """Project the data onto its top principal components.

    Args:
        dataMat: 2-D data, one sample per row and one feature per column.
            Anything ``numpy.mat`` accepts (matrix, ndarray, nested lists).
        topNfeat: Number of principal components to keep. The default is
            large enough to keep every component.

    Returns:
        A tuple ``(lowDDataMat, reconMat)``: the mean-centred data expressed
        in the kept components, and the data reconstructed from those
        components in the original feature space (mean added back).
    """
    # Coerce once so that ``*`` below is always matrix multiplication.
    dataMat = mat(dataMat)
    # Column-wise mean, then centre the data.
    meanVals = mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals
    # Covariance between features (rowvar=0: each column is a variable).
    covMat = cov(meanRemoved, rowvar=0)
    # Eigenvalues and eigenvectors of the covariance matrix.
    eigVals, eigVects = linalg.eig(mat(covMat))
    # argsort orders indices from smallest to largest eigenvalue ...
    eigValInd = argsort(eigVals)
    # ... so walk backwards from the end to take the topNfeat largest.
    eigValInd = eigValInd[:-(topNfeat + 1):-1]
    # Eigenvectors reordered from largest to smallest eigenvalue.
    redEigVects = eigVects[:, eigValInd]
    # Transform the centred data into the reduced space.
    lowDDataMat = meanRemoved * redEigVects
    # Map back to the original feature space and restore the mean.
    reconMat = (lowDDataMat * redEigVects.T) + meanVals
    return lowDDataMat, reconMat


def plotBestFit(dataSet1, dataSet2):
    """Scatter-plot the first two columns of two data sets on one figure.

    ``dataSet1`` is drawn as red squares and ``dataSet2`` as green circles.
    The two data sets may have different numbers of rows.

    Args:
        dataSet1: 2-D data with at least two columns.
        dataSet2: 2-D data with at least two columns.

    Raises:
        ImportError: If matplotlib is not available.
    """
    if plt is None:
        raise ImportError("matplotlib is required for plotBestFit")
    dataArr1 = array(dataSet1)
    dataArr2 = array(dataSet2)
    # Each data set is plotted over its own rows; indexing the second set
    # with the first set's row count would fail when the sizes differ.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataArr1[:, 0], dataArr1[:, 1], s=30, c='red', marker='s')
    ax.scatter(dataArr2[:, 0], dataArr2[:, 1], s=30, c='green')
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()


if __name__ == '__main__':
    mata = loadDataSet('/Users/hakuri/Desktop/testSet.txt')
    a, b = pca(mata, 2)