library(RTCGA)
library(RTCGA.clinical)
library(RTCGA.rnaseq)
library(RTCGA.mRNA)
library(RTCGA.mutations)
# Store the result of infoTCGA() and browse it as an interactive table.
all_TCGA_cancers <- infoTCGA()
DT::datatable(all_TCGA_cancers)
# Fetch microarray (mRNA) expression for a chosen set of genes from several
# cohorts at once.
expr <- expressionsTCGA(
  BRCA.mRNA, OV.mRNA, LUSC.mRNA,
  extract.cols = c("GATA3", "PTEN", "XBP1", "ESR1", "MUC1")
)
# Simplify the cohort labels: strip the ".mRNA" suffix from the `dataset`
# column. fixed = TRUE makes "." a literal dot rather than a regex wildcard.
expr$dataset <- gsub(".mRNA", "", expr$dataset, fixed = TRUE)
# Replace the long barcodes with "<cohort><running index>" labels. The index
# is derived from the data (1..n within each cohort, in row order) instead of
# hard-coded cohort sizes, so it cannot drift out of sync with the actual
# number of rows per cohort (paste0() would silently recycle a short vector).
expr$bcr_patient_barcode <- paste0(
  expr$dataset,
  ave(seq_along(expr$dataset), expr$dataset, FUN = seq_along)
)
# Boxplot of one gene's expression across the cohorts.
library(ggpubr)
# `dataset` goes on the x axis, GATA3 expression on the y axis; boxes are
# coloured by `dataset` using the "jco" palette.
ggboxplot(
  expr,
  x = "dataset",
  y = "GATA3",
  title = "GATA3",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
)
# The same plot, annotated with p-values for selected cohort pairs.
my_comparisons <- list(
  c("BRCA", "OV"),
  c("OV", "LUSC")
)
ggboxplot(
  expr,
  x = "dataset",
  y = "GATA3",
  title = "GATA3",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
) +
  stat_compare_means(comparisons = my_comparisons)
# The same boxplot can be drawn with plain ggplot2.
library(ggplot2)
# Refer to columns by bare name inside aes(): ggplot2 looks them up in the
# `data` argument. Using `expr$...` inside aes() triggers a ggplot2 warning
# and breaks as soon as facets or per-layer data are involved.
p <- ggplot(expr, aes(x = dataset, y = GATA3))
p <- p + geom_boxplot(aes(fill = dataset))
# "1".."4" are placeholder texts for the x label, y label, title and legend
# title respectively.
p +
  xlab("1") +
  ylab("2") +
  ggtitle("3") +
  guides(fill = guide_legend(title = "4"))
# Extra tip: label only the points that satisfy a selection criterion.
label.select.criteria <- list(
  criteria = "`y` > 3.9 & `x` %in% c('BRCA', 'OV')"
)
ggboxplot(
  expr,
  x = "dataset",
  y = c("GATA3", "PTEN", "XBP1"),
  combine = TRUE,
  color = "dataset",
  palette = "jco",
  ylab = "Expression",
  # Column holding the point labels.
  label = "bcr_patient_barcode",
  # Only the points matching the criteria above get a label.
  label.select = label.select.criteria,
  # Font used for the labels.
  font.label = list(size = 9, face = "italic"),
  # Keep label texts from overplotting each other.
  repel = TRUE
)
# The usual case: show several genes side by side in one combined figure.
ggboxplot(
  expr,
  x = "dataset",
  y = c("GATA3", "PTEN", "XBP1"),
  ylab = "Expression",
  color = "dataset",
  palette = "jco",
  combine = TRUE
)
# Fetch sequencing-based (rnaseq) expression for chosen genes from several
# cohorts. Here the columns are addressed by gene symbol plus Entrez ID,
# written as "symbol|entrezID".
expr <- expressionsTCGA(
  BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq,
  extract.cols = c(
    "GATA3|2625",
    "PTEN|5728",
    "XBP1|7494",
    "ESR1|2099",
    "MUC1|4582"
  )
)
# The column name contains "|", so it is wrapped in backticks for plotting.
ggboxplot(
  expr,
  x = "dataset",
  y = "`GATA3|2625`",
  title = "GATA3|2625",
  ylab = "Expression",
  color = "dataset",
  palette = "jco"
)
# Principal component analysis on the complete rnaseq expression data.
library(RTCGA.rnaseq)
# dplyr works on structured data inside or outside R. Compared with plyr it
# focuses on data frames, is considerably faster and offers a more robust
# database interface; it can also operate on Spark data frames. These are
# basic dplyr notes only: advanced usage and performance comparisons with
# data.table are not covered.
library(dplyr)
# Keep only samples whose barcode (e.g. "TCGA-GM-A2DA-01A-11R-A18M-07") has
# "01" at characters 14-15, and rename `dataset` to `cohort`.
# `%>%` is the pipe: the left-hand result is passed to the right-hand call.
BRCA.OV.LUSC.rnaseq.cancer <-
  expressionsTCGA(BRCA.rnaseq, OV.rnaseq, LUSC.rnaseq) %>%
  dplyr::rename(cohort = dataset) %>%
  filter(substr(bcr_patient_barcode, 14, 15) == "01")
pca_plot <- pcaTCGA(BRCA.OV.LUSC.rnaseq.cancer, "cohort")
plot(pca_plot)
# Survival analysis driven by mutation data.
library(RTCGA.mutations)
library(survminer)
library(dplyr)
# Keep the TP53 mutation records of samples whose barcode has "01" at
# characters 14-15, then truncate the barcode to its first 12 characters so
# that it can be matched against the clinical table.
BRCA_OV.mutations <-
  mutationsTCGA(BRCA.mutations, OV.mutations, LUSC.mutations) %>%
  filter(Hugo_Symbol == "TP53") %>%
  filter(substr(bcr_patient_barcode, 14, 15) == "01") %>%
  mutate(bcr_patient_barcode = substr(bcr_patient_barcode, 1, 12))
# Survival information for the BRCA and OV cohorts; the disease code column
# is renamed to `disease`.
BRCA_OV.clinical <-
  survivalTCGA(
    BRCA.clinical, OV.clinical,
    extract.cols = "admin.disease_code"
  ) %>%
  dplyr::rename(disease = admin.disease_code)
# Merge clinical and mutation data by barcode and add a `TP53` column:
# "Mut" when a mutation record was matched (Variant_Classification is not
# NA), "WILDorNOINFO" otherwise.
# The mutation table is reduced to one row per patient before joining: a
# patient with several TP53 mutation rows would otherwise be duplicated by
# the left join and counted more than once in the survival analysis.
BRCA_OV.clinical_mutations <-
  BRCA_OV.clinical %>%
  left_join(
    distinct(BRCA_OV.mutations, bcr_patient_barcode, .keep_all = TRUE),
    by = "bcr_patient_barcode"
  ) %>%
  mutate(
    TP53 = ifelse(!is.na(Variant_Classification), "Mut", "WILDorNOINFO")
  )
# Keep only the columns needed for the survival analysis.
BRCA_OV.2plot <-
  BRCA_OV.clinical_mutations %>%
  select(times, patient.vital_status, disease, TP53)
# Kaplan-Meier curves with TP53 and disease as explanatory variables,
# including the p-value; x axis limited to 0-2000 with breaks every 400.
km_plot <- kmTCGA(
  BRCA_OV.2plot,
  explanatory.names = c("TP53", "disease"),
  break.time.by = 400,
  xlim = c(0, 2000),
  pval = TRUE
)
print(km_plot)