介绍
通过同一组转录组数据,做PCA和PCoA,看看分组情况。在此之前,我通常使用PCA对样本进行检测以剔除离群样本。但并不确定PCA和PCoA哪种方法在转录样本的分析中最优。关于PCA和PCoA算法的区别的区别可以参考知乎链接。
PCA
运行包
# PCA
getwd()
setwd("C:/Users/1/Documents/R/LNN/TSE_PS/")
# FactoMineR和factoextra必须运行,没有的话需要安装。
#install.packages("FactoMineR")
#install.packages("factoextra")
library(tidyverse)
library(FactoMineR)
library(factoextra)
library(RColorBrewer)
#
读入数据
tpm <- read.csv("~/R/LNN/TSE_PS/PS_TSE_tpm.csv")%>%mutate(X = NULL,
bed_num = NULL,
PS = NULL)%>%
column_to_rownames(var = "TSE")
sample_list <- read.csv("~/R/LNN/TSE_PS/sample_list.csv")
#
df_group <- sample_list%>%
mutate(Group = paste(species,gender,sep="_"))%>%
dplyr::select(sample,Group)%>%
column_to_rownames(var = "sample")
展示下使用的数据
tpm中colname为样本名称,ID作为rowname。而分组信息需要将样本名称作为rowname,分组信息可以为一列或多列。
>head(tpm)
PS_FB1 PS_FB2 PS_FB3 PS_FE1 PS_FE2 PS_FE3
KIF6 2.563374 1.371589 4.066951 0.3319875 0.3329647 0.4807414
GOT2 251.468015 251.760476 332.263694 309.8111250 317.3034103 321.5148220
LOC117870645 141.013038 162.829874 136.838119 182.3690559 225.8066411 192.6667206
LOC117870646 72.403094 42.969596 53.023493 44.9923024 75.5684609 38.5259704
DNMT3B 2.388589 3.085964 3.846336 8.2506601 6.8178938 3.7840485
MAPRE1 153.829811 154.892396 154.235473 92.0070980 97.3640793 100.4782548
> head(df_group)
Group
PS_FB1 PS_Female
PS_FB2 PS_Female
PS_FB3 PS_Female
PS_FE1 PS_Female
PS_FE2 PS_Female
PS_FE3 PS_Female
tpm转置后即可直接进行PCA分析
#
data_T <- tpm%>%t()%>%
data.frame()
data_T.pca <- PCA(data_T, graph = FALSE)
PCA <- fviz_pca_ind(data_T.pca,
repel = TRUE,
# geom.ind = "point",#控制点上是否有样品名标识
col.ind = df_group$Group, # 颜色的分组信息
pointsize = 2,
# pointshape = 21, # 控制点形状,注释掉后按分组分配
addEllipses = TRUE,#点的外边框,注释掉后没有边框。
palette = brewer.pal(4,"RdYlBu"),
ellipse.type = "convex",# 用凸包多边形代替椭圆,注释掉后未椭圆
legend.title = "Organ")+
theme_bw()
PCA
同样的代码,换一个分组
# 换一个分组
df_group2 <- sample_list%>%
mutate(Group = paste(species,tissue,sep = " "))%>%
dplyr::select(sample,Group)%>%
column_to_rownames(var = "sample")
PCA2 <- fviz_pca_ind(data_T.pca,
repel = TRUE,
# geom.ind = "point",#控制点上是否有样品名标识
col.ind = df_group2$Group, # 颜色的分组信息
pointsize = 2,
pointshape = 21, # 控制点形状,注释掉后按分组分配
addEllipses = TRUE,#点的外边框,注释掉后没有边框。
# palette = brewer.pal(12,"RdYlBu"),
ellipse.type = "convex",# 用凸包多边形代替椭圆,注释掉后未椭圆
legend.title = "Organ")+
theme_bw()
PCA2
library(patchwork)
PCA | PCA2
PCoA
读包,读入数据
# PCoA
getwd()
setwd("C:/Users/1/Documents/R/LNN/TSE_PS/")
#
library(tidyverse)
# ade包和vegan包是必须的,其它包可省略
#install.packages("ade4")
#install.packages("vegan")
library(ade4)
library(vegan)
library(plyr)
library(RColorBrewer)
#
tpm <- read.csv("~/R/LNN/TSE_PS/PS_TSE_tpm.csv")%>%mutate(X = NULL,
bed_num = NULL,
PS = NULL)
sample_list <- read.csv("~/R/LNN/TSE_PS/sample_list.csv")
df_group <- sample_list%>%
mutate(Group = paste(species,gender,sep="_"))%>%
dplyr::select(sample,Group)
#
展示下tpm和s用以分组的df_group格式,内容有所省略
>head(tpm)
TSE PS_FB1 PS_FB2 PS_FB3 PS_FE1
1 KIF6 2.563374 1.371589 4.066951 0.3319875
2 GOT2 251.468015 251.760476 332.263694 309.8111250
3 LOC117870645 141.013038 162.829874 136.838119 182.3690559
4 LOC117870646 72.403094 42.969596 53.023493 44.9923024
5 DNMT3B 2.388589 3.085964 3.846336 8.2506601
6 MAPRE1 153.829811 154.892396 154.235473 92.0070980
>head(df_group)
sample Group
1 PS_FB1 PS_Female
2 PS_FB2 PS_Female
3 PS_FB3 PS_Female
4 PS_FE1 PS_Female
5 PS_FE2 PS_Female
6 PS_FE3 PS_Female
# 转置
df_t <- tpm%>%
column_to_rownames(var = "TSE")%>%t()
# 计算距离
df.dist <- vegdist(df_t,method = 'bray',na.rm = T)
df.pcoa <- cmdscale(df.dist,k=(nrow(df_t)-1),eig = T,add = T)
data_eig = df.pcoa$eig
data_exp <- data_eig/sum(data_eig)
# 计算PCoA1和PCoA2并生成数值
pcoa1 <- paste(round(100*data_exp[1],2),'%')
pcoa2 <- paste(round(100*data_exp[2],2),'%')
# PCoA图的输入列表
sample_site <- data.frame(df.pcoa$points)[1:2]
# 加入分组信息,将group改成了level,便于在强调顺序时可以指定level
sample_site <- sample_site%>%rownames_to_column(var = "sample")%>%
left_join(df_group)%>%
dplyr::rename(level = Group,
PCoA1 = X1,
PCoA2 = X2)%>%
column_to_rownames(var = "sample")
# 计算分组点的面积
find_hull <- function(sample_site) sample_site[chull(sample_site$PCoA1, sample_site$PCoA2),]
hulls <- ddply(sample_site, "level", find_hull)
# 绘图
ggplot(sample_site,aes(x = PCoA1,y = PCoA2,color=level))+
geom_point()+
theme_classic()
ggplot(sample_site, aes(PCoA1, PCoA2, color=level,shape=level)) +
theme_classic()+
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
geom_polygon(data = hulls,alpha = 0.2,aes(fill=factor(level),color=level),size=0.2,show.legend = FALSE) +
geom_point(aes(color = level),size = 4,alpha = 0.5) + #设置点的透明度、大小
scale_color_manual(values = hcl.colors(4)) + #分组多了设置颜色麻烦,用包比较简单
scale_fill_manual(values = hcl.colors(4)) +
scale_shape_manual(values = c(20,20,20,20)) +
theme(panel.grid = element_line(color = 'black', linetype = 1, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title=element_blank()
)+labs(x = paste('PCoA axis1:',pcoa1), y = paste('PCoA axis2:',pcoa2))#前面计算的值,在这里写在坐标里
我的数据和分组设置的不太合理,每组离群的三个点都是相同的组织。
完整代码
· PCA
# PCA
getwd()
setwd("C:/Users/1/Documents/R/LNN/TSE_PS/")
#
#install.packages("FactoMineR")
#install.packages("factoextra")
library(tidyverse)
library(FactoMineR)
library(factoextra)
library(RColorBrewer)
#
tpm <- read.csv("~/R/LNN/TSE_PS/PS_TSE_tpm.csv")%>%mutate(X = NULL,
bed_num = NULL,
PS = NULL)%>%
column_to_rownames(var = "TSE")
sample_list <- read.csv("~/R/LNN/TSE_PS/sample_list.csv")
#
df_group <- sample_list%>%
mutate(Group = paste(species,gender,sep="_"))%>%
dplyr::select(sample,Group)%>%
column_to_rownames(var = "sample")
#
data_T <- tpm%>%t()%>%
data.frame()
data_T.pca <- PCA(data_T, graph = FALSE)
PCA <- fviz_pca_ind(data_T.pca,
repel = TRUE,
# geom.ind = "point",#控制点上是否有样品名标识
col.ind = df_group$Group, # 颜色的分组信息
pointsize = 2,
# pointshape = 21, # 控制点形状,注释掉后按分组分配
addEllipses = TRUE,#点的外边框,注释掉后没有边框。
palette = brewer.pal(4,"RdYlBu"),
ellipse.type = "convex",# 用凸包多边形代替椭圆,注释掉后未椭圆
legend.title = "Organ")+
theme_bw()
PCA
- PCoA
# PCoA
getwd()
setwd("C:/Users/1/Documents/R/LNN/TSE_PS/")
#
library(tidyverse)
#install.packages("ade4")
#install.packages("vegan")
library(ade4)
library(vegan)
library(plyr)
library(RColorBrewer)
#
tpm <- read.csv("~/R/LNN/TSE_PS/PS_TSE_tpm.csv")%>%mutate(X = NULL,
bed_num = NULL,
PS = NULL)
sample_list <- read.csv("~/R/LNN/TSE_PS/sample_list.csv")
#
df_group <- sample_list%>%
mutate(Group = paste(species,gender,sep="_"))%>%
dplyr::select(sample,Group)
df_t <- tpm%>%
column_to_rownames(var = "TSE")%>%t()
#
df.dist <- vegdist(df_t,method = 'bray',na.rm = T)
df.pcoa <- cmdscale(df.dist,k=(nrow(df_t)-1),eig = T,add = T)
data_eig = df.pcoa$eig
data_exp <- data_eig/sum(data_eig)
pcoa1 <- paste(round(100*data_exp[1],2),'%')
pcoa2 <- paste(round(100*data_exp[2],2),'%')
#
sample_site <- data.frame(df.pcoa$points)[1:2]
#
sample_site <- sample_site%>%rownames_to_column(var = "sample")%>%
left_join(df_group)%>%
dplyr::rename(level = Group,
PCoA1 = X1,
PCoA2 = X2)%>%
column_to_rownames(var = "sample")
#
find_hull <- function(sample_site) sample_site[chull(sample_site$PCoA1, sample_site$PCoA2),]
hulls <- ddply(sample_site, "level", find_hull)
#
head(sample_site)
ggplot(sample_site,aes(x = PCoA1,y = PCoA2,color=level))+
geom_point()+
theme_classic()
ggplot(sample_site, aes(PCoA1, PCoA2, color=level,shape=level)) +
theme_classic()+ #去掉背景框
geom_vline(xintercept = 0, color = 'gray', size = 0.4) +
geom_hline(yintercept = 0, color = 'gray', size = 0.4) +
geom_polygon(data = hulls,alpha = 0.2,aes(fill=factor(level),color=level),size=0.2,show.legend = FALSE) +
geom_point(aes(color = level),size = 4,alpha = 0.5) + #设置点的透明度、大小
scale_color_manual(values = hcl.colors(4)) +
scale_fill_manual(values = hcl.colors(4)) +
scale_shape_manual(values = c(20,20,20,20)) +
theme(panel.grid = element_line(color = 'black', linetype = 1, size = 0.1),
panel.background = element_rect(color = 'black', fill = 'transparent'),
legend.title=element_blank()
)+labs(x = paste('PCoA axis1:',pcoa1), y = paste('PCoA axis2:',pcoa2))