本文介绍TCGA数据的下载方法,并将数据整理为以基因名为行名的数据结构。
方法一
#数据下载的网站,下载下来并命名为HNSC_RSEM_genes_normalized.txt
#http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/HNSC/20160128/gdac.broadinstitute.org_HNSC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0.tar.gz.md5
library(stringr)
# Load the tab-separated RSEM table; check.names = FALSE leaves the header
# names exactly as they appear in the file.
hnsc <- read.table(
  file = "your_dir/HNSC_RSEM_genes_normalized.txt",
  header = TRUE,
  sep = "\t",
  check.names = FALSE
)

# Discard the first data row.
# NOTE(review): presumably the second header line of the Firehose file - confirm.
hnsc <- hnsc[-1, ]

# Keep only the part of each identifier that precedes the first "|".
row_name <- as.character(hnsc[, 1])
row_name <- unlist(lapply(
  strsplit(row_name, split = "|", fixed = TRUE),
  function(parts) parts[1]
))
hnsc[, 1] <- row_name

# Keep the first row for every identifier, then move the identifiers from
# column 1 into the row names.
hnsc <- hnsc[!duplicated(hnsc[, 1]), ]
row.names(hnsc) <- as.character(hnsc[, 1])
hnsc <- hnsc[, -1]

# Shorten every column name to its first 16 characters.
col_names <- colnames(hnsc)
new_names <- substr(col_names, 1, 16)
colnames(hnsc) <- new_names

write.csv(hnsc, "your_dir/hnsc_clean_data.csv")
方法二
# Attach TCGAbiolinks without its start-up messages, then request the
# subtype table for the "brca" tumor type.
suppressMessages(
  library("TCGAbiolinks")
)
BRCA_path_subtypes <- TCGAquery_subtype(
  tumor = "brca"
)
我觉得这确实是目前最好的方法:这是官方提供并实时更新的数据下载方式。但它很容易出现报错(Error: $ operator is invalid for atomic vectors),解决方式是改用GitHub源安装TCGAbiolinks,这一点很重要。
具体代码如下
#https://bioconductor.org/packages/release//workflows/vignettes/SingscoreAMLMutations/inst/doc/workflow_transcriptional_mut_sig_chinese.html
# Install biomaRt and TCGAbiolinks from their GitHub repositories.
library("devtools")
devtools::install_github(repo = "Bioconductor-mirror/biomaRt")
# Run this if you hit: Error: $ operator is invalid for atomic vectors
devtools::install_github(repo = "BioinformaticsFMRP/TCGAbiolinks")
# Start from an empty workspace (kept from the original script).
rm(list = ls())
library(stringr)

# GDC project to download.
cancer_type <- "TCGA-BRCA"

# Get GDC version information.
# The call is namespace-qualified because, in this listing, TCGAbiolinks is
# only attached inside Download_TCGA(); a bare getGDCInfo() fails in a fresh
# session with 'could not find function "getGDCInfo"'.
gdc_info <- TCGAbiolinks::getGDCInfo()
Release <- as.character(gdc_info$data_release)

# Only the first 17 characters of the release string are kept as the label.
release <- substr(Release, 1, 17)
###download BRCA counts data
#' Download and export the GDC data of a single project.
#'
#' Queries, downloads and prepares (through TCGAbiolinks) the HTSeq counts
#' and FPKM expression tables, the clinical data, the copy number segments,
#' the legacy DNA methylation data and the miRNA quantification of one
#' project, and writes every table as CSV into the current working directory.
#'
#' @param cancer_type GDC project id, e.g. "TCGA-BRCA".
#' @param release Label embedded in every output file name (the caller
#'   passes the first 17 characters of the GDC data release string).
#' @return Called for its side effects (downloads and CSV files); the value
#'   of the last `write.csv()` call is returned.
Download_TCGA <- function(cancer_type, release) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  suppressMessages(library(DT))

  # Output file name: project, label, release and extension joined by "-".
  # The separators are kept as they always were so file names do not change.
  csv_name <- function(label, ext = ".csv") {
    paste(cancer_type, label, release, ext, sep = "-")
  }

  # GDCprepare() does not hand back a SummarizedExperiment for every data
  # type (the miRNA result below is used as a data frame), so assay() is only
  # applied when there is one; any other object is used as is.
  as_table <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Download one gene expression workflow and write three CSV files:
  # all samples, "NT" samples only and "TP" samples only.
  export_expression <- function(workflow, label) {
    query <- GDCquery(
      project = cancer_type,
      data.category = "Transcriptome Profiling",
      data.type = "Gene Expression Quantification",
      workflow.type = workflow
    )
    GDCdownload(query)
    prepared <- GDCprepare(query = query)
    expr <- as.data.frame(assay(prepared))
    write.csv(expr, file = csv_name(paste0(label, "_")))
    sample_NT <- TCGAquery_SampleTypes(
      barcode = colnames(expr),
      typesample = c("NT")
    )
    sample_TP <- TCGAquery_SampleTypes(
      barcode = colnames(expr),
      typesample = c("TP")
    )
    # drop = FALSE keeps a data frame (and therefore the gene row names)
    # even when exactly one sample is selected.
    write.csv(
      expr[, sample_NT, drop = FALSE],
      file = csv_name(paste0(label, "_normal_"))
    )
    write.csv(
      expr[, sample_TP, drop = FALSE],
      file = csv_name(paste0(label, "_tumor_"))
    )
  }

  ### download counts data
  export_expression("HTSeq - Counts", "Counts")
  ### download FPKM data
  export_expression("HTSeq - FPKM", "FPKM")

  ######### clinical data (indexed table)
  clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
  # The widget is built but its value is discarded here, so nothing is
  # displayed; the call is kept from the original script.
  datatable(
    clinical,
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
  write.csv(clinical, "clinical_full.csv")

  ## organize the clinical data from the XML files
  CLC_query <- GDCquery(
    project = cancer_type,
    data.category = "Clinical",
    file.type = "xml"
  )
  GDCdownload(CLC_query)
  clinical <- GDCprepare_clinic(CLC_query, clinical.info = "patient")
  # One row per patient barcode, restricted to the survival-related columns.
  clinical_trait <- clinical %>%
    dplyr::select(
      bcr_patient_barcode, gender, vital_status,
      days_to_death, days_to_last_followup, race_list,
      person_neoplasm_cancer_status,
      stage_event_pathologic_stage,
      stage_event_tnm_categories
    ) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Deceased patients: survival time is days_to_death, converted to years.
  dead_patient <- clinical_trait %>%
    dplyr::filter(vital_status == "Dead") %>%
    dplyr::select(-days_to_last_followup) %>%
    reshape::rename(c(
      bcr_patient_barcode = "Barcode",
      gender = "Gender",
      vital_status = "OS",
      days_to_death = "OS.Time",
      race_list = "Race",
      person_neoplasm_cancer_status = "cancer_status",
      age_at_initial_pathologic_diagnosis = "Age",
      neoplasm_histologic_grade = "Grade",
      stage_event_pathologic_stage = "Stage",
      stage_event_tnm_categories = "TNM"
    )) %>%
    mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
    mutate(OS.Time = OS.Time / 365)

  # Living patients: survival time is days_to_last_followup, in years.
  alive_patient <- clinical_trait %>%
    dplyr::filter(vital_status == "Alive") %>%
    dplyr::select(-days_to_death) %>%
    reshape::rename(c(
      bcr_patient_barcode = "Barcode",
      gender = "Gender",
      vital_status = "OS",
      days_to_last_followup = "OS.Time",
      race_list = "Race",
      person_neoplasm_cancer_status = "cancer_status",
      age_at_initial_pathologic_diagnosis = "Age",
      neoplasm_histologic_grade = "Grade",
      stage_event_pathologic_stage = "Stage",
      stage_event_tnm_categories = "TNM"
    )) %>%
    mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
    mutate(OS.Time = OS.Time / 365)

  # combine clinical data
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = csv_name("clinical_"))

  # download Copy Number Variation data
  CNV_query <- GDCquery(
    project = cancer_type,
    data.category = "Copy Number Variation",
    data.type = "Copy Number Segment"
  )
  GDCdownload(CNV_query)
  CNV_expdat <- GDCprepare(query = CNV_query)
  CNV_count_matrix <- as.data.frame(as_table(CNV_expdat))
  write.csv(CNV_count_matrix, file = csv_name("Copy-Number-Variation_"))

  # download methylation
  meth_query <- GDCquery(
    project = cancer_type,
    legacy = TRUE,
    data.category = "DNA methylation"
  )
  GDCdownload(meth_query)
  meth_expdat <- GDCprepare(query = meth_query)
  meth_count_matrix <- assay(meth_expdat)
  write.csv(meth_count_matrix, file = csv_name("methylation_", ext = "2.csv"))

  #### download miR data
  miR_query <- GDCquery(
    project = cancer_type,
    data.category = "Transcriptome Profiling",
    data.type = "miRNA Expression Quantification",
    workflow.type = "BCGSC miRNA Profiling"
  )
  GDCdownload(miR_query)
  # as.data.frame() leaves a plain data frame unchanged and makes `[, 1]`
  # below return a vector should a tibble come back.
  miR_expdat <- as.data.frame(GDCprepare(query = miR_query))
  # The complete table gets its own file name; it used to be written to the
  # same file as the read-count table at the end and was overwritten by it.
  write.csv(miR_expdat, file = csv_name("miRNAs_full_"))

  # First column becomes the row names.
  row.names(miR_expdat) <- as.character(miR_expdat[, 1])
  miR_expdat <- miR_expdat[, -1, drop = FALSE]
  # Text following "TCGA" in every column name, one entry per distinct value.
  col_name <- unlist(lapply(
    strsplit(colnames(miR_expdat), split = "TCGA", fixed = TRUE),
    function(parts) parts[2]
  ))
  col_name <- col_name[!duplicated(col_name)]
  rpkm_names <- paste0("reads_per_million_miRNA_mapped_TCGA", col_name)
  count_names <- paste0("read_count_TCGA", col_name)
  write.csv(
    miR_expdat[, rpkm_names, drop = FALSE],
    file = csv_name("miRNAs_RPKM")
  )
  write.csv(
    miR_expdat[, count_names, drop = FALSE],
    file = csv_name("miRNAs_")
  )
}
# Run the download for the project and release label defined above.
Download_TCGA(
  cancer_type = cancer_type,
  release = release
)
方法三:多线程批量下载所有TCGA项目的数据
调用19个线程并行下载
#!/usr/bin/env Rscript
# Clear the workspace, then attach the packages used by this script.
rm(list = ls())
library("stringr")
library("parallel")

# projects.csv (read from the working directory) supplies the project ids
# in its `project_id` column.
cancerType <- read.csv(file = "projects.csv", header = TRUE)
cancer_type <- as.character(cancerType$project_id)
###download counts data
#' Download and export the GDC data of one project into its own directory.
#'
#' Creates `<base dir>/<cancer_type>`, works inside it, and writes the HTSeq
#' counts and FPKM expression tables, the clinical data, the copy number
#' segments, the legacy DNA methylation data and the miRNA quantification of
#' the project as CSV files. The function attaches every package it needs
#' itself.
#'
#' @param cancer_type GDC project id (a single string).
#' @return Called for its side effects; returns the value of the final
#'   `message()` call.
Download_TCGA <- function(cancer_type) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  suppressMessages(library(DT))

  base_dir <- "~/Desktop/tcga_test" # should change this before you run
  out_dir <- paste0(base_dir, "/", cancer_type)
  # showWarnings = FALSE: re-running for an existing directory is not a
  # problem; a directory that truly cannot be created makes setwd() fail.
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  # Work inside the project directory and put the previous working directory
  # back when the function exits, whether it succeeds or fails.
  previous_wd <- setwd(out_dir)
  on.exit(setwd(previous_wd), add = TRUE)

  # get GDC version information
  gdc_info <- getGDCInfo()
  Release <- as.character(gdc_info$data_release)
  release <- substr(Release, 1, 17)

  # Output file name: project, label, release and extension joined by "-".
  # The separators are kept as they always were so file names do not change.
  csv_name <- function(label, ext = ".csv") {
    paste(cancer_type, label, release, ext, sep = "-")
  }

  # assay() only works on a SummarizedExperiment; any other object returned
  # by GDCprepare() is used as is instead of raising an error.
  as_table <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Download one gene expression workflow and write three CSV files:
  # all samples, "NT" samples only and "TP" samples only.
  export_expression <- function(workflow, label) {
    query <- GDCquery(
      project = cancer_type,
      data.category = "Transcriptome Profiling",
      data.type = "Gene Expression Quantification",
      workflow.type = workflow
    )
    GDCdownload(query)
    prepared <- GDCprepare(query = query)
    expr <- as.data.frame(assay(prepared))
    write.csv(expr, file = csv_name(paste0(label, "_")))
    sample_NT <- TCGAquery_SampleTypes(
      barcode = colnames(expr),
      typesample = c("NT")
    )
    sample_TP <- TCGAquery_SampleTypes(
      barcode = colnames(expr),
      typesample = c("TP")
    )
    # drop = FALSE keeps a data frame (and therefore the gene row names)
    # even when exactly one sample is selected.
    write.csv(
      expr[, sample_NT, drop = FALSE],
      file = csv_name(paste0(label, "_normal_"))
    )
    write.csv(
      expr[, sample_TP, drop = FALSE],
      file = csv_name(paste0(label, "_tumor_"))
    )
  }

  ### download counts data
  export_expression("HTSeq - Counts", "Counts")
  ### download FPKM data
  export_expression("HTSeq - FPKM", "FPKM")

  ######### clinical data (indexed table)
  clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
  # The widget is built but its value is discarded here, so nothing is
  # displayed; the call is kept from the original script.
  datatable(
    clinical,
    filter = "top",
    options = list(scrollX = TRUE, keys = TRUE, pageLength = 5),
    rownames = FALSE
  )
  write.csv(clinical, "clinical_full.csv")

  ## organize the clinical data from the XML files
  CLC_query <- GDCquery(
    project = cancer_type,
    data.category = "Clinical",
    file.type = "xml"
  )
  GDCdownload(CLC_query)
  clinical <- GDCprepare_clinic(CLC_query, clinical.info = "patient")
  # One row per patient barcode, restricted to the survival-related columns.
  clinical_trait <- clinical %>%
    dplyr::select(
      bcr_patient_barcode, gender, vital_status,
      days_to_death, days_to_last_followup, race_list,
      person_neoplasm_cancer_status,
      stage_event_pathologic_stage,
      stage_event_tnm_categories
    ) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Deceased patients: survival time is days_to_death, converted to years.
  dead_patient <- clinical_trait %>%
    dplyr::filter(vital_status == "Dead") %>%
    dplyr::select(-days_to_last_followup) %>%
    reshape::rename(c(
      bcr_patient_barcode = "Barcode",
      gender = "Gender",
      vital_status = "OS",
      days_to_death = "OS.Time",
      race_list = "Race",
      person_neoplasm_cancer_status = "cancer_status",
      age_at_initial_pathologic_diagnosis = "Age",
      neoplasm_histologic_grade = "Grade",
      stage_event_pathologic_stage = "Stage",
      stage_event_tnm_categories = "TNM"
    )) %>%
    mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
    mutate(OS.Time = OS.Time / 365)

  # Living patients: survival time is days_to_last_followup, in years.
  alive_patient <- clinical_trait %>%
    dplyr::filter(vital_status == "Alive") %>%
    dplyr::select(-days_to_death) %>%
    reshape::rename(c(
      bcr_patient_barcode = "Barcode",
      gender = "Gender",
      vital_status = "OS",
      days_to_last_followup = "OS.Time",
      race_list = "Race",
      person_neoplasm_cancer_status = "cancer_status",
      age_at_initial_pathologic_diagnosis = "Age",
      neoplasm_histologic_grade = "Grade",
      stage_event_pathologic_stage = "Stage",
      stage_event_tnm_categories = "TNM"
    )) %>%
    mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
    mutate(OS.Time = OS.Time / 365)

  # combine clinical data
  survival_data <- rbind(dead_patient, alive_patient)
  write.csv(survival_data, file = csv_name("clinical_"))

  # download Copy Number Variation data
  CNV_query <- GDCquery(
    project = cancer_type,
    data.category = "Copy Number Variation",
    data.type = "Copy Number Segment"
  )
  GDCdownload(CNV_query)
  CNV_expdat <- GDCprepare(query = CNV_query)
  CNV_count_matrix <- as.data.frame(as_table(CNV_expdat))
  write.csv(CNV_count_matrix, file = csv_name("Copy-Number-Variation_"))

  # download methylation
  meth_query <- GDCquery(
    project = cancer_type,
    legacy = TRUE,
    data.category = "DNA methylation"
  )
  GDCdownload(meth_query)
  meth_expdat <- GDCprepare(query = meth_query)
  meth_count_matrix <- assay(meth_expdat)
  write.csv(meth_count_matrix, file = csv_name("methylation_", ext = "2.csv"))

  #### download miR data
  miR_query <- GDCquery(
    project = cancer_type,
    data.category = "Transcriptome Profiling",
    data.type = "miRNA Expression Quantification",
    workflow.type = "BCGSC miRNA Profiling"
  )
  GDCdownload(miR_query)
  miR_expdat <- GDCprepare(query = miR_query)
  # NOTE(review): the miRNA result appears to be a plain table rather than a
  # SummarizedExperiment, so assay() is only applied when it is one - confirm.
  miR_expdat_matrix <- as_table(miR_expdat)
  write.csv(miR_expdat_matrix, file = csv_name("miRNAs_", ext = "2.csv"))

  message(paste0(cancer_type, " Download Finished!"))
}
# Start 19 worker processes and run one download per project id.
cl <- makeCluster(19)
# `finally` guarantees the workers are shut down even when one of the
# downloads raises an error; otherwise they would be left running.
tryCatch(
  parLapply(cl, cancer_type, Download_TCGA),
  finally = stopCluster(cl)
)