04. EdegeR Limma DESeq

Introduction

除了利用ballgown流程,目前比较流行的方法还是通过序列比对和计数获得的计数矩阵,然后用广义线性模型去评估和处理这些复杂实验条件下的数据;常见的R语言工具包有edgeR, DESeq2, and limma+voom(Law et al. 2014Stark, Grzelak, and Hadfield 2019).

值得注意的是:计数矩阵必须使用原始计数矩阵,便于评估和标准化。

EdgeR

1. load matirx and build DEGlist

library(edgeR)

# Load gene(/transcript) count matrix and labels
gene_count <- as.matrix(read.csv("RNAseq/gene_count_matrix.csv", row.names = "gene_id"))

pheno_data <- read.csv("RNAseq/geuvadis_phenodata.csv", header = TRUE)

# colnames(gene_count) <- rownames(pheno_data)
# head(pheno_data, 3)

# Check sample IDs are match to each other in orders
all(rownames(pheno_data) %in% colnames(gene_count))
#> [1] FALSE

all(pheno_data[,1] %in% colnames(gene_count))
#> [1] TRUE

# Create a DGEList
dge_list <- DGEList(counts = gene_count, group = pheno_data$group)

2. Removing genes that are lowly expressed

Plotting the distribution log-CPM values shows that a sizeable proportion of genes within each sample are either unexpressed or lowly-expressed with log-CPM values that are small or negative.

# Transformations from the raw-scale
log_cpm <- cpm(dge_list, log = TRUE, prior.count = 2)
# summary(lcpm)

# set group index
mygroup <- "sex"

# filtering uses CPM values  
keep_exprs <- filterByExpr(dge_list, group = pheno_data[ ,mygroup])
dge_clean <- dge_list[keep_exprs, , keep.lib.sizes=FALSE]

# Transformation
clean_lcpm <- cpm(dge_clean, log = TRUE, prior.count = 2)

# 
library(RColorBrewer)
nsamples <- ncol(dge_list)
col <- brewer.pal(nsamples, "Paired")
samplenames <- colnames(dge_list)

# set new plot plate
dev.new()
par(mfrow = c(1,2))
# density of raw  data
plot(density(log_cpm[,1]), col=col[1], main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
for (i in 2:nsamples){
  den <- density(log_cpm[,i])
  lines(den$x, den$y, col=col[i])
}
legend("topright", samplenames, text.col=col, bty="n")

# density of filtered  data
plot(density(clean_lcpm[,1]), col = col[1], main = '', xlab = '')
title(main="B. Filtered data", xlab="Log-cpm")
for (i in 2:nsamples){
  den <- density(clean_lcpm[,i])
  lines(den$x, den$y, col = col[i])
}
legend("topright", samplenames, text.col=col, bty="n")

3. Normalising gene expression distributions

dge_clean <- calcNormFactors(dge_clean, method = "TMM")

# dge_clean$samples$norm.factors
clean_lcpm <- cpm(dge_clean, log=TRUE)

par(mfrow=c(1,2))
boxplot(log_cpm, las=2,main="")
title(main="A. Example: Unnormalised data", ylab="Log-cpm")

boxplot(clean_lcpm, las=2, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
image.png

4. Distances

  1. heat map
library(pheatmap)
require(gridExtra)

p1 <- pheatmap(mat = log_cpm,
         show_rownames = FALSE,
         color = palette,
         main = "raw lcpm")

p2 <- pheatmap(mat = clean_lcpm,
         show_rownames = FALSE,
         color = palette,
         main = "clean lcpm")

# extract plot list
plot_list <- list(p1[[4]], p2[[4]])

grid.arrange(arrangeGrob(grobs= plot_list,ncol=2))
image.png
  1. Pearson correlation
library(corrplot)
mat <- cor(clean_lcpm)

corrplot(corr = mat, is.corr = FALSE, type = "upper", 
         col = palette, 
         diag = FALSE)
image.png
  1. MDS Plot
library(ggplot2)
dist_mat <- dist(t(clean_lcpm))
mds_data <- cmdscale(dist_mat)

# scale the plot data
mds_data <- as.data.frame(scale(mds_data))

colnames(mds_data) <- c("MDS 1", "MDS 2")
mds_data$name <- rownames(mds_data)

ggplot(data = mds_data) + 
  geom_point(aes(x = `MDS 1`, y = `MDS 2`), size = 3) +
  geom_text(aes(x = `MDS 1` + 0.12, y =`MDS 2`, 
                label = name), size = 2) + 
  theme_bw()
image.png
  1. PCA Analysis
# library(ggplot2)
pca_fit <- prcomp(t(clean_lcpm))

components <- summary(pca_fit )$importance[2,c(1,2)]
plot_data <- as.data.frame(pca_fit$x)
plot_data$name <- row.names(plot_data)

ggplot(data = plot_data) + geom_point(aes(x = PC1, y = PC2), 
                                      size = 2) +
  geom_text(aes(x = PC1*1.2, y = PC2, label = name)) +
  xlab(label = paste0("PC 1 (", 
                      round(components[1], 4)*100, "%)")) + 
  ylab(label = paste0("PC 2 (", 
                      round(components[2], 4)*100, "%)")) +
  theme_bw()
image.png

5. Differential expression analysis

  1. Removing heteroscedascity from count data
design <- model.matrix(~0 + factor(pheno_data[ ,mygroup]))
colnames(design) <- c("group1", "group2")
contrast_matrix <- makeContrasts(group1-group2, levels=design)

v <- voom(dge_clean, design, plot=TRUE)
image.png
  1. Fitting linear models

vfit <- lmFit(v, design)
vfit <- contrasts.fit(vfit, contrasts=contrast_matrix)
efit <- eBayes(vfit)

plotSA(efit, main="Final model: Mean-variance trend")

DE_gene <- topTable(efit, coef = 1, adjust="BH", n=Inf)

# head map
library(pheatmap)
first <- colorRampPalette(c('#999999','#ffffff'))(10)
second <- colorRampPalette(c('#99d8c9','#66c2a4'))(100)
third <- colorRampPalette(c('#fc8d59','#d53e4f'))(100)

palette <- c(first, second, third)

pheatmap(mat = DE_gene,
         show_rownames = FALSE,
         #color = palette,
         main = "raw lcpm")

# volcano plot
## adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()

ggplot(DE_gene, aes(logFC, 
                          -log10(adj.P.Val))) +
  geom_point(aes(color = threshold)) +
  scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
  theme_bw()+
  xlab(label = "log(fold-change)") + 
  ylab(label = expression(-log[10] (adj_Pvalue)))

# head map
gene_names <- rownames(DE_gene)[1:100]

i <- which(rownames(v) %in% gene_names)

pheatmap(mat = clean_lcpm[i,],
         show_rownames = FALSE,

         main = "clean lcpm")
image.png
  1. Stricter definition on significance

在某些时候,不仅仅需要用到校正p值阈值,还需要差异倍数的对数(lfc=1)也高于某个最小值来更为严格地定义显著性。

tfit <- treat(vfit, lfc=1)
dt <- decideTests(tfit)
DE_gene <- topTreat(efit, coef = 1, adjust="BH", n=Inf)

# adding threshold
attach(DE_gene)
DE_gene$threshold <- ifelse(`adj.P.Val` < 0.05 & abs(logFC) > 5,ifelse(logFC > 5, 'up', 'down'), 'no')
detach()

ggplot(DE_gene, aes(logFC, 
                          -log10(adj.P.Val))) +
  geom_point(aes(color = threshold)) +
  scale_color_manual(values=c("#303F9F", "#757575", "#FF5252"))+
  theme_bw()+
  xlab(label = "log(fold-change)") + 
  ylab(label = expression(-log[10] (adj_Pvalue)))
image.png
  1. 差异基因可视化2
DE_gene <- DE_gene[order(DE_gene$logFC, decreasing = FALSE), ]
DE_gene$xaixs <- seq(1, nrow(DE_gene), 1)

ggplot(DE_gene, aes(xaixs, logFC)) +
  geom_point(aes(color = threshold)) + 
  geom_hline(yintercept=c(2,-2),linetype=2,size=0.25)+
  geom_hline(yintercept = c(0),linetype=1,size=0.5) + 
  xlab('rank of differentially expressed genes')+ 
  theme_bw()
image.png

6. GO enrichment

1. ID trasformation

GO和KEGG 分析比较烦的一点就是ID转换,由于数据是人类测序结果,我们就以 org.Hs.eg.db 包为例子,看看数据转换的情况。

key types 是里面最需要小心的信息:
1. Ensemble id: ENSG开头
2. Entrez id: 纯数字
3. Symbol id: 基因名称
4. Refseq id: NG, NM, NP等

数据间映射关系(总的来说,ENTREZID是进行GO分析最好的ID类型):

  1. org.Hs.egACCNUM:将Entrez ID标识符映射到GenBank的登录号
  2. org.Hs.egCHRLOC:获取映射到染色体位置的Entrez ID标识符
  3. org.Hs.egENSEMBL:用于Entrez ID与EnsemblID之间的映射
  4. org.Hs.egENSEMBLPROT:用于Entrez ID与Ensembl蛋白ID的映射
  5. org.Hs.egENSEMBLTRANS:用于Entrez ID与Ensembl transcript编号
  6. org.Hs.egENZYME:Entrez基因id和酶活性(EC)之间的图谱
  7. org.Hs.egGENENAME:Entrez ID与基因名称之间的图谱
  8. org.Hs.egGO:Entrez ID与基因本体论(GO) id之间的映射
  9. org.Hs.egMAP:Entrez ID和细胞遗传学图谱/条带之间的映射
  10. org.Hs.egOMIM:Map between Entrez Gene Identifiers and Mendelian Inheritance in Man (MIM) identifiers
  11. org.Hs.egPATH:Entrez ID和KEGG通路标识符之间的映射
  12. org.Hs.egREFSEQ:Entrez ID与RefSeq标识符之间的映射
  13. org.Hs.egSYMBOL:Entrez ID和基因符号之间的映射
  14. org.Hs.egUNIPROT:Map Uniprot accession numbers with Entrez Gene identifiers

2. Exercise

# RSQLite(v2.2.5) has some problems
options(connectionObserver = NULL)
library(org.Hs.eg.db)
library(clusterProfiler)

#
keytypes(org.Hs.eg.db)
temp_names <- rownames(DE_gene)[1:100]

# temp_names <- gsub('NR[0-9]*[|]', '', temp_names)
gene_names <- gsub('[|].*', '', temp_names)

# keytype =  org.Hs.eg(XXX)
## stand form
head(keys(org.Hs.eg.db, keytype = "SYMBOL"))
head(keys(org.Hs.eg.db, keytype = "REFSEQ"))

# ID transformation
gene_list <- mapIds(org.Hs.eg.db, keys = gene_names,
                 column= "ENTREZID",
                 keytype = "REFSEQ")

gene_list <- clusterProfiler::bitr(gene_names, fromType = "ENSEMBL" , toType = "ENTREZID", OrgDb = "org.Hs.eg.db", drop = TRUE)

go_all <- clusterProfiler::enrichGO(gene_list, OrgDb = org.Hs.eg.db,
               ont = 'ALL', pAdjustMethod = 'BH',
               pvalueCutoff = 0.2, qvalueCutoff = 0.5,
               keyType ='ENTREZID')

go_result <- go_all@result

write.table(x = go_result,
            file = "RNAseq/GO_result.txt",
            sep = "\t",
            col.names =  TRUE,
            row.names = FALSE)

3. Visualization

3.1 dot plot

# dot plot
go_result <- read.table(file = "RNAseq/GO_result.txt", 
                        sep = "\t",
                        header = TRUE, 
                        stringsAsFactors = F)

plot_data <- go_result[order(go_result$Count, decreasing = T),]
plot_data$Description <- forcats::fct_reorder(plot_data$Description, 
                                              plot_data$Count)

ggplot(data = plot_data[1:10, ]) + 
  geom_point(aes(x = Count, y = Description, size = Count)) + 
  theme_bw() + 
  scale_y_discrete(labels = function(x) stringr::str_wrap(x, width = 40)) + 
  xlab(label = NULL) + 
  ylab(label = NULL)
image.png

3.2 bar plot

result1 <- read.table(file = "RNAseq/GO_result.txt", 
                        sep = "\t",
                        header = TRUE, 
                        stringsAsFactors = F)

result2 <- read.table(file = "RNAseq/GO_result2.txt", 
                         sep = "\t",
                         header = TRUE, 
                         stringsAsFactors = F)

set_bar_plot <- function(result1, result2, myrank){
  myrank = as.numeric(myrank)
  ## setting df1
  temp1 <- subset(result1, select = c("Description", "qvalue", 
                                         "Count"))
  temp1 <- na.omit(temp1)
  temp1 <- unique(temp1)
  temp1$qvalue <- as.numeric(temp1$qvalue)
  df1 <- temp1[order(temp1$qvalue, 
                     decreasing = F), ][c(1:myrank ), ]

  ## setting df2

  temp2 <- subset(result2, select = c("Description", "qvalue", 
                                         "Count"))
  temp2 <- na.omit(temp2)
  temp2 <- unique(temp2)
  df2 <- temp2[order(temp2$qvalue, 
                     decreasing = F), ][c(1:myrank ), ]

  ## setting bar order for df1
  df1_1 <- df1[!df1$Description %in% df2$Description, ]
  df1_2 <- df1[df1$Description %in% df2$Description, ]

  df1_1 <- df1_1[order(df1_1$Count, decreasing = FALSE), ]
  df1_1$xlab <- seq(1,nrow(df1_1),1)
  df1_1$Count <- df1_1$Count* -1

  df1_2 <- df1[df1$Description %in% df2$Description, ]
  df1_2 <- df1_2[order(df1_2$Count, decreasing = FALSE), ]

  df1_2$xlab <- seq(nrow(df1_1)+1, (nrow(df1_1) + nrow(df1_2)), 1)
  df1_2$Count <- df1_2$Count* -1

  ## setting bar order for df2
  df2_1 <- df2[!df2$Description %in% df1$Description, ]
  df2_2 <- df2[df2$Description %in% df1$Description, ]

  df2_1 <- df2_1[order(df2_1$Count, decreasing = FALSE), ]

  df2_1$xlab <- seq((myrank  + nrow(df2_2) + 1),
                    (myrank  + nrow(df2_2) + nrow(df2_1)),1)
  df2_2 <- df2_2[order(df2_2$Count, decreasing = FALSE), ]
  df2_2$xlab <- seq((myrank   + 1), (myrank  +  nrow(df2_2)), 1)
  df2_2$Count <- df2_2$Count
  mylist <- list(df1_1, df1_2, df2_1, df2_2)
  mydf <- dplyr::bind_rows(mylist)
  return(mydf)
}

library(ggplot2)
ggplot()+ 
  geom_col(data = df1, aes(x = xlab, 
                            y =  Count,
                            fill = qvalue)) +
  geom_text(data = subset(df1, Count < 0), 
            aes(x = xlab, y = 0.1, label = Description),
            hjust = "outward", color = "#c02c38",
            size = 3) + 
  geom_text(data = subset(df1, Count > 0), 
            aes(x = xlab, y = -0.1, label = Description),
            hjust = "inward",  color = "#15559a",
            size = 3) +
  scale_fill_gradient(low = "#046582", 
                      high = "#fa1e0e") + 
  scale_y_continuous(breaks = seq(-5,20,5)) + 
  coord_flip() +
  theme_void() +
  theme(legend.position = c(0.96,0.3),
        axis.line.x = element_line(size = 1),
        axis.ticks.x = element_line(size = 1),
        axis.ticks.length=unit(0.06, "cm"),
        axis.text.x = element_text())
image.png

References

Law, C. W., Y. Chen, W. Shi, and G. K. Smyth. 2014. “Voom: Precision Weights Unlock Linear Model Analysis Tools for RNA-Seq Read Counts.” Journal Article. Genome Biol 15 (2): R29. https://doi.org/10.1186/gb-2014-15-2-r29.

Stark, R., M. Grzelak, and J. Hadfield. 2019. “RNA Sequencing: The Teenage Years.” Journal Article. Nat Rev Genet 20 (11): 631–56. https://doi.org/10.1038/s41576-019-0150-2.

Bioconductor: RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR

©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 199,711评论 5 468
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 83,932评论 2 376
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 146,770评论 0 330
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 53,799评论 1 271
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 62,697评论 5 359
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,069评论 1 276
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 37,535评论 3 390
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,200评论 0 254
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 40,353评论 1 294
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,290评论 2 317
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,331评论 1 329
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,020评论 3 315
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 38,610评论 3 303
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,694评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 30,927评论 1 255
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 42,330评论 2 346
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 41,904评论 2 341

推荐阅读更多精彩内容