PCA的主要算法如下:
组织数据形式,以便于模型使用;
计算样本每个特征的平均值;
每个样本数据减去该特征的平均值(归一化处理);
求协方差矩阵;
找到协方差矩阵的特征值和特征向量;
对特征值和特征向量重新排列(特征值从大到小排列);
对特征值求取累计贡献率;
对累计贡献率按照某个特定比例选取特征向量集的子集合;
对原始数据(第三步后)进行转换。
直白来讲,PCA就是把需要通过成百上千个变量描述的数据归结为只需要少数几个变量(经典的是2个)来描述的数据,也就是**“两句话”概括“千言万语”**。
需要明确的是,PCA分析后得出的主成分(PC1、PC2等等)本身并没有现实意义,你很难说清它们具体代表什么。
import pandas as pd
import numpy as np
import random as rd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
# Build 100 synthetic gene names, gene1 through gene100, used as
# row labels for the fake expression matrix below.
genes = [f'gene{n}' for n in range(1, 101)]
genes[1:10]
['gene2',
'gene3',
'gene4',
'gene5',
'gene6',
'gene7',
'gene8',
'gene9',
'gene10']
wt=['wt'+str(i) for i in range(1,6)];wt
['wt1', 'wt2', 'wt3', 'wt4', 'wt5']
ko=['ko'+str(i) for i in range(1,6)];ko
['ko1', 'ko2', 'ko3', 'ko4', 'ko5']
# Empty count matrix: one row per gene, one column per sample (5 wt + 5 ko).
data = pd.DataFrame(index=genes, columns=[*wt, *ko])
data
# Fill each gene's row with Poisson-distributed counts. A separate random
# mean is drawn for the wt and ko groups, so most genes differ between them.
for gene in data.index:
    wt_mean = rd.randrange(10, 1000)
    ko_mean = rd.randrange(10, 1000)
    data.loc[gene, 'wt1':'wt5'] = np.random.poisson(lam=wt_mean, size=5)
    data.loc[gene, 'ko1':'ko5'] = np.random.poisson(lam=ko_mean, size=5)
data.head()
# Standardise each gene to zero mean / unit variance. scale() works
# column-wise, so data.T transposes first: rows = samples, columns = genes.
scaled_data=preprocessing.scale(data.T) # scale standardises columns; data.T makes samples the rows
scaled_data[1]
array([-1.11151204, 0.69306137, 1.33551565, 0.8382505 , -1.04318281,
-0.60405627, -0.92063218, -0.95192761, 1.01280344, 0.8138406 ,
0.95375584, -0.97698326, -1.01080482, 1.19052147, 0.88000624,
1.04411408, -1.02137775, 1.16559709, -0.58782296, 0.9729321 ,
-0.9201476 , 1.04795897, 0.94835528, -0.95032432, 1.01253165,
0.98796019, -1.03156718, -0.86388467, -0.91772975, 1.69465649,
0.47643526, 0.90480737, -1.06546114, -0.98786962, 0.81600049,
0.50271509, -0.4244264 , -0.99752417, 1.0014188 , 1.02462556,
-0.0367508 , -1.14916327, 0.96272824, 1.01814004, -0.5810545 ,
-0.99947228, -0.62205708, -1.00777942, 0.64332696, 1.06180935,
1.10478883, 0.03584203, -0.91763094, -1.07259316, 0.91700974,
0.96225154, 1.090414 , 1.42555954, 1.04743973, 1.09804145,
1.30058119, -0.9986909 , -0.33887362, 0.88132433, -1.00526682,
1.03824522, -1.0178739 , 0.97577143, 1.69152939, 0.97778407,
1.38057501, -0.98224552, -1.10195586, -1.02489382, -1.33045566,
0.85948887, 1.13794653, -1.00022063, -1.27473053, 1.07895361,
1.23898493, 1.21214071, 0.92505393, -0.92270432, -1.16594652,
0.84570775, -1.08873995, 1.0335842 , -0.60963112, 0.86807649,
0.8334977 , -0.98885663, 0.95804155, 1.02711728, 1.05853635,
-0.71603877, -1.02059608, 1.15042003, -1.12393024, -1.13269917])
# Fit PCA keeping all components — with 10 samples and 100 genes,
# n_components defaults to min(n_samples, n_features) = 10.
pca=PCA()
pca.fit(scaled_data)
PCA()
# Project the scaled samples onto the principal components:
# rows = samples, columns = PC coordinates.
pca_data=pca.transform(scaled_data)
pca_data
array([[-9.57027905e+00, 3.55228194e+00, -1.52420903e+00,
2.57491561e-01, -1.09035428e+00, 8.44196473e-01,
-6.29452014e-02, 1.57271087e-01, -1.29859653e-01,
-2.77555756e-15],
[-9.62851241e+00, -1.63319643e+00, -1.35610282e+00,
-6.35136049e-01, 1.39366206e-01, -9.77877195e-02,
1.27824840e-01, -1.18810683e+00, 1.04379113e+00,
1.29063427e-15],
[-9.38985566e+00, -5.79958372e-02, -2.90465917e-01,
-1.21950007e+00, 2.08037727e+00, -2.86223519e-01,
8.29853435e-01, 5.95257988e-01, -7.02230258e-01,
-5.55111512e-17],
[-9.46057599e+00, -1.35763803e+00, 8.38533411e-01,
2.86851275e+00, 3.52552524e-01, 1.82468221e-01,
-4.00136728e-01, -1.54348720e-01, -4.38992676e-01,
-1.11022302e-15],
[-9.53467186e+00, -4.79525050e-01, 2.30877195e+00,
-1.26139805e+00, -1.45424496e+00, -7.08582358e-01,
-5.25915360e-01, 5.94003147e-01, 2.07663457e-01,
-1.92901251e-15],
[ 9.58278851e+00, 1.92440852e+00, 1.54260419e+00,
-5.50930210e-01, 1.30622748e+00, 4.24921272e-01,
-1.09116118e+00, -8.78108204e-01, 4.02080620e-02,
9.57567359e-16],
[ 9.69622467e+00, 9.43611550e-01, 5.49249516e-02,
5.52672347e-01, -7.37393745e-01, -1.83962859e+00,
1.14543499e+00, -6.24291614e-01, -3.56908896e-01,
-1.73472348e-16],
[ 9.68114642e+00, -1.95288085e+00, -1.86333887e+00,
-6.75455689e-01, -6.50862729e-01, -5.96789361e-03,
-1.17859843e+00, 1.77090925e-01, -8.33851570e-01,
2.70616862e-16],
[ 9.57024639e+00, 2.46984383e-01, -6.19575558e-01,
8.88795612e-01, 6.82666459e-01, -4.16934960e-01,
-1.58375084e-01, 1.25784962e+00, 1.14984026e+00,
2.02615702e-15],
[ 9.05348896e+00, -1.18605019e+00, 9.08857692e-01,
-2.25052203e-01, -6.28334216e-01, 1.90353908e+00,
1.31401872e+00, 6.33826028e-02, 2.03401485e-02,
3.37230244e-15]])
# Percentage of total variance explained by each PC, rounded to 1 decimal.
per_var = (pca.explained_variance_ratio_ * 100).round(decimals=1)
per_var
array([90.6, 2.7, 1.7, 1.4, 1.1, 0.9, 0.7, 0.5, 0.4, 0. ])
# Scree plot: bar chart of the variance captured by each component.
labels = [f'PC{k}' for k in range(1, len(per_var) + 1)]
positions = range(1, len(per_var) + 1)
plt.bar(x=positions, height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title("Scree Plot")
plt.show()
# Scatter the samples in PC1/PC2 space, annotating each point with its name;
# wt and ko samples should form two clearly separated clusters.
pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title("My PCA Graph")
plt.xlabel('PC1-{0}%'.format(per_var[0]))
plt.ylabel('PC2-{0}%'.format(per_var[1]))
for name in pca_df.index:
    coords = (pca_df.PC1.loc[name], pca_df.PC2.loc[name])
    plt.annotate(name, coords)
# Loadings for PC1: how strongly each gene contributes to the first component.
loading_scores = pd.Series(pca.components_[0], index=genes)
print(loading_scores[1:10])
# Rank genes by loading magnitude (sign ignored) and report the 10 genes
# that drive PC1 the most, printed with their signed loadings.
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
top_10_genes = sorted_loading_scores.head(10).index.values
print(loading_scores[top_10_genes])
gene2 -0.101846
gene3 -0.100883
gene4 -0.104677
gene5 0.104827
gene6 0.100761
gene7 0.058237
gene8 0.104691
gene9 -0.104923
gene10 -0.104436
dtype: float64
gene48 0.104993
gene72 0.104973
gene78 0.104961
gene74 0.104931
gene56 -0.104927
gene32 -0.104924
gene9 -0.104923
gene22 -0.104906
gene62 0.104896
gene25 -0.104895
dtype: float64
来源:StatQuest — PCA in Python