论文
Plasma proteome analyses in individuals of European and African ancestry identify cis-pQTLs and models for proteome-wide association studies
https://www.nature.com/articles/s41588-022-01051-w
本地pdf s41588-022-01051-w.pdf
代码链接
https://zenodo.org/record/6332981#.YroV0nZBzic
https://github.com/Jingning-Zhang/PlasmaProtein/tree/v1.2
今天的推文重复一下论文中的Figure1,涉及到5个图,分别是折线图、韦恩图、两个散点图(其中一个添加了拟合直线)和频率分布直方图,最后一个知识点是如何将这5个图组合到一起
首先是定义作图主题的内容
library(ggplot2)
# Shared theme that is added to every panel of the figure: it removes the
# panel background and sets small font sizes for titles and other text.
# It does not set any axis lines; panels that need them add `axis.line`
# through their own theme() call.
My_Theme <- theme(
  text = element_text(size = 6),
  title = element_text(size = 7),
  panel.background = element_blank()
)
论文中提供的主题代码没有设置坐标轴的线,如果只按照这个主题来作图,图上不会显示横纵坐标轴线,所以后面需要坐标轴的图都单独用 theme() 加上了 axis.line
第一个折线图的代码
library(readxl)
# Panel a: number of significant SOMAmers (NpGene) plotted against the number
# of PEER factors (NPEER), with one coloured line per Race.
df.peer <- read_excel("data/20220627/Fig1.xlsx", sheet = "1a")

# Rows 9 and 21 of the sheet are drawn a second time as large diamonds.
# NOTE(review): the row indices are hard-coded; presumably they mark the
# PEER setting selected for each group -- confirm against the sheet.
df.peer_highlight <- df.peer[c(9, 21), ]

p1 <- ggplot(data = df.peer, aes(x = NPEER, y = NpGene, colour = Race)) +
  geom_line() +
  geom_point() +
  # x, y and colour are inherited from the plot-level mapping.
  geom_point(
    data = df.peer_highlight,
    aes(fill = Race),
    size = 4,
    shape = 18
  ) +
  My_Theme +
  labs(y = "# significant SOMAmers ", x = "# PEER factors", title = NULL) +
  scale_colour_manual(values = c("#238b45", "#2171b5")) +
  scale_fill_manual(values = c("#238b45", "#2171b5")) +
  # Comma as thousands separator on the y axis. The scale is stored in p1
  # itself (instead of being added only when printing) so that the labels are
  # also present when p1 is reused in a combined figure.
  scale_y_continuous(labels = scales::label_comma()) +
  theme(axis.line = element_line())
p1
这里有一个小知识点,坐标轴文本千分位用逗号分隔,论文中没有提供这个代码,可以参考链接 https://scales.r-lib.org/reference/label_number.html
第二个韦恩图的代码
他这里的韦恩图是借助ggforce这个R包直接画了两个圆
# Panel b: a two-set Venn diagram drawn as two translucent circles.
# The circle centres (x, y), radii (r) and group labels come from sheet "1b".
library(ggforce) # provides geom_circle()
df.venn <- read_excel("data/20220627/Fig1.xlsx", sheet = "1b")

p2 <- ggplot(df.venn, aes(x0 = x, y0 = y, r = r, fill = labels)) +
  geom_circle(colour = NA, alpha = 0.4, size = 1) +
  scale_fill_manual(values = c("#238b45", "#2171b5")) +
  # Equal scaling on both axes keeps the circles round.
  coord_fixed() +
  # The counts are hard-coded text labels, not computed from the data.
  annotate("text", x = -0.34, y = 0.4, label = "1,618 in AA", size = 2) +
  annotate("text", x = 0, y = 0, label = "1,447 overlapping", size = 2) +
  annotate("text", x = 0.4, y = 0.43, label = "2,004 in EA", size = 2) +
  theme_void() +
  theme(legend.position = "none") +
  My_Theme
p2
第三个图是散点图添加拟合曲线
# Panel c: absolute effect size (|BETA|) against `maf`, one facet per Race,
# with two linear fits overlaid on the points.
df.effmaf <- read_excel("data/20220627/Fig1.xlsx", sheet = "1c")

p3 <- ggplot(data = df.effmaf, aes(x = maf, y = abs(BETA), colour = Race)) +
  geom_point(size = 0.5, alpha = 0.5) +
  # Disabled variant kept from the original script:
  # geom_point(alpha=0.5,size=power) +
  # Orange line: linear fit weighted by 1 / power.
  geom_smooth(aes(weight = 1 / power), method = "lm", colour = "#fd8d3c") +
  # Grey line: unweighted linear fit.
  geom_smooth(method = "lm", colour = "#525252") +
  facet_wrap(~Race, ncol = 2) +
  scale_x_continuous(breaks = c(0, 0.1, 0.2)) +
  scale_colour_manual(values = c("#238b45", "#2171b5")) +
  labs(x = "MAF(1-MAF)", y = "Effect size", title = NULL) +
  theme(legend.position = "none") +
  My_Theme +
  # My_Theme has no axis lines, so add them explicitly.
  theme(
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
p3
分面以后,两个分面的x轴线之间有空隙,不能完全连接到一起,可能需要出图后再手动编辑
第四个散点图
# Panel d: absolute effect size (|BETA|) against `dist` (axis title
# "Distance to TSS"), one facet per Race.
df.efftss <- read_excel("data/20220627/Fig1.xlsx", sheet = "1d")

p4 <- ggplot(data = df.efftss, aes(x = dist, y = abs(BETA), colour = Race)) +
  geom_point(size = 0.5, alpha = 0.5) +
  facet_wrap(~Race, ncol = 2) +
  # Only two x breaks, printed with a comma as thousands separator.
  scale_x_continuous(
    breaks = c(-250000, 250000),
    labels = scales::label_comma()
  ) +
  scale_colour_manual(values = c("#238b45", "#2171b5")) +
  # Start the visible y range at 0.2 without dropping any data points;
  # the upper limit is left to the data.
  coord_cartesian(ylim = c(0.2, NA)) +
  labs(x = "Distance to TSS", y = "Effect size", title = NULL) +
  theme(legend.position = "none") +
  My_Theme +
  # My_Theme has no axis lines, so add them explicitly.
  theme(
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
p4
第五个频率分布直方图
# Panel e: histogram of `count` (bin width 1), one facet column per Race.
df.cond <- read_excel("data/20220627/Fig1.xlsx", sheet = "1e")

p5 <- ggplot(df.cond, aes(x = count)) +
  # White outlines separate neighbouring bars.
  geom_histogram(
    aes(fill = Race),
    binwidth = 1,
    colour = "white",
    alpha = 0.8
  ) +
  facet_grid(cols = vars(Race)) +
  scale_fill_manual(values = c("#238b45", "#2171b5")) +
  labs(
    x = "# conditionally significant cis-SNPs",
    y = "# significant SOMAmers",
    title = NULL
  ) +
  theme(legend.position = "none") +
  My_Theme +
  # My_Theme has no axis lines, so add them explicitly.
  theme(
    axis.line.x = element_line(),
    axis.line.y = element_line()
  )
p5
最后是拼图代码
library(ggpubr)
# Assemble the five panels into one figure with two rows:
#   top row    -- panels a and b (width ratio 0.6 : 0.4)
#   bottom row -- panels c, d and e (width ratio 0.35 : 0.35 : 0.3)
# The rows take 40% and 60% of the total height respectively.
p <- ggarrange(
  plotlist = list(
    ggarrange(
      plotlist = list(p1, p2),
      labels = c("a", "b"),
      ncol = 2,
      widths = c(0.6, 0.4)
    ),
    ggarrange(
      plotlist = list(p3, p4, p5),
      labels = c("c", "d", "e"),
      ncol = 3,
      widths = c(0.35, 0.35, 0.3)
    )
  ),
  nrow = 2,
  heights = c(0.4, 0.6)
)
p
示例数据和代码可以直接去论文中获取
欢迎大家关注我的公众号
小明的数据分析笔记本
小明的数据分析笔记本 公众号 主要分享:1、R语言和python做数据分析和数据可视化的简单小例子;2、园艺植物相关转录组学、基因组学、群体遗传学文献阅读笔记;3、生物信息学入门学习资料及自己的学习笔记!