下载并预处理TCGA数据

本文介绍TCGA数据的下载方法，并将数据整理为以基因名为行名的数据结构。

方法一

#数据下载的网站，下载下来并命名为HNSC_RSEM_genes_normalized.txt
#http://gdac.broadinstitute.org/runs/stddata__2016_01_28/data/HNSC/20160128/gdac.broadinstitute.org_HNSC.Merge_rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.Level_3.2016012800.0.0.tar.gz.md5
library(stringr)

# Load the tab-separated expression table; check.names = FALSE keeps the
# sample barcodes in the header exactly as they appear in the file.
hnsc <- read.table("your_dir/HNSC_RSEM_genes_normalized.txt",
                   header = TRUE, check.names = FALSE, sep = "\t")

# Drop the first data row.
# NOTE(review): presumably this is the second header line of the Firehose
# file ("gene_id / normalized_count") - confirm against the downloaded file.
hnsc <- hnsc[-1, ]

# Identifiers are split on "|" and only the leading part (the gene name) is
# kept; strsplit() is vectorised, so no per-element wrapper is needed.
id_parts <- strsplit(as.character(hnsc[, 1]), split = "|", fixed = TRUE)
hnsc[, 1] <- vapply(id_parts, function(parts) parts[1], character(1))

# Keep the first occurrence of every gene name so it can serve as a row name.
hnsc <- hnsc[!duplicated(hnsc[, 1]), ]
row.names(hnsc) <- as.character(hnsc[, 1])
# drop = FALSE keeps a data frame even when only one sample column remains.
hnsc <- hnsc[, -1, drop = FALSE]

# Because a non-numeric row was read as data, every sample column was parsed
# as text (or factor on older R). Convert back to numeric so that downstream
# code receives real numbers; as.character() guards the factor case.
hnsc[] <- lapply(hnsc, function(column) as.numeric(as.character(column)))

# Shorten every column name to its first 16 characters.
colnames(hnsc) <- substr(colnames(hnsc), 1, 16)

write.csv(hnsc, "your_dir/hnsc_clean_data.csv")

方法二

乳腺癌PAM50

# Attach TCGAbiolinks while silencing the messages it prints on load.
suppressMessages(library(TCGAbiolinks))
# Fetch the subtype annotation table for the "brca" tumor code.
# NOTE(review): the surrounding article uses this for PAM50 subtypes; the
# exact column holding the PAM50 call is not visible here - confirm.
BRCA_path_subtypes <- TCGAquery_subtype(tumor = "brca")

另一种方法，生存分析

我觉得这确实是目前最好的方法，同时这也是官方提供并实时更新的数据下载方式，但是很容易出现报错(Error in x$ed : $ operator is invalid for atomic vectors)，解决方式为用github源进行安装，这个很重要。
具体代码如下

#https://bioconductor.org/packages/release//workflows/vignettes/SingscoreAMLMutations/inst/doc/workflow_transcriptional_mut_sig_chinese.html
# Install the GitHub versions of biomaRt and TCGAbiolinks via devtools.
library(devtools)
# NOTE(review): the "Bioconductor-mirror" GitHub organisation looks
# deprecated - confirm that this repository still resolves before relying on it.
devtools::install_github("Bioconductor-mirror/biomaRt")
devtools::install_github("BioinformaticsFMRP/TCGAbiolinks")# Run this if you hit "Error: $ operator is invalid for atomic vectors"


# NOTE: this wipes every object in the global environment before the run.
rm(list=ls())
library(stringr)
cancer_type <- "TCGA-BRCA"
# Get GDC version information.
# The function is called through its namespace because TCGAbiolinks is only
# attached inside Download_TCGA(); a bare getGDCInfo() fails in a fresh session.
gdc_info <- TCGAbiolinks::getGDCInfo()
Release <- as.character(gdc_info$data_release)
# Only the first 17 characters of the release string are used as the label
# embedded in the output file names.
release <- substr(Release, 1, 17)

###download BRCA counts data

#' Download TCGA data for one GDC project and export it as CSV files.
#'
#' Queries, downloads and prepares (through TCGAbiolinks) the HTSeq count and
#' FPKM expression matrices, the clinical data, copy-number segments, legacy
#' DNA methylation and miRNA quantification of `cancer_type`. Every table is
#' written to the current working directory.
#'
#' NOTE(review): newer GDC releases appear to have replaced the "HTSeq - *"
#' workflows and retired the legacy archive - confirm the query arguments
#' against the TCGAbiolinks version in use.
#'
#' @param cancer_type GDC project id, e.g. "TCGA-BRCA".
#' @param release Release label embedded in every output file name.
#' @return `NULL`, invisibly; the function is called for its side effects.
Download_TCGA <- function(cancer_type, release) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  # DT stays attached as before; the datatable() preview that used to be built
  # here was removed because its value was discarded inside the function.
  suppressMessages(library(DT))

  # Build "<project>-<label>-<release>-<suffix>", the original naming scheme.
  out_file <- function(label, suffix = ".csv") {
    paste(cancer_type, label, release, suffix, sep = "-")
  }

  # Return the assay matrix of a SummarizedExperiment, or the object itself
  # when GDCprepare() already produced a plain table.
  as_table <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Download one expression workflow and write the full, normal-only (NT) and
  # primary-tumor-only (TP) matrices.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    barcodes <- colnames(expr)
    normal <- TCGAquery_SampleTypes(barcode = barcodes, typesample = c("NT"))
    tumor <- TCGAquery_SampleTypes(barcode = barcodes, typesample = c("TP"))
    # drop = FALSE: with exactly one matching sample the subset would
    # otherwise collapse to a bare vector and lose its barcode in the CSV.
    write.csv(expr[, normal, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
    invisible(expr)
  }

  # Build the survival rows for one vital status: `time_col` becomes OS.Time
  # (converted from days to years), `unused_col` is dropped, OS is 1 for
  # "Dead" and 0 otherwise.
  survival_rows <- function(trait, wanted_status, time_col, unused_col) {
    rows <- dplyr::filter(trait, vital_status == wanted_status)
    rows[[unused_col]] <- NULL
    new_names <- c(bcr_patient_barcode = 'Barcode',
                   gender = 'Gender',
                   vital_status = 'OS',
                   race_list = 'Race',
                   person_neoplasm_cancer_status = 'cancer_status',
                   age_at_initial_pathologic_diagnosis = 'Age',
                   neoplasm_histologic_grade = 'Grade',
                   stage_event_pathologic_stage = 'Stage',
                   stage_event_tnm_categories = 'TNM')
    new_names[time_col] <- 'OS.Time'
    rows %>%
      reshape::rename(new_names) %>%
      mutate(OS = ifelse(OS == 'Dead', 1, 0)) %>%
      mutate(OS.Time = OS.Time / 365)
  }

  ### expression: counts and FPKM
  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  ### clinical: indexed table first, then the patient-level XML records
  clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical, "clinical_full.csv")

  clc_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(clc_query)
  clinical <- GDCprepare_clinic(clc_query, clinical.info = "patient")

  # One row per patient, restricted to the fields needed for survival analysis.
  clinical_trait <- clinical %>%
    dplyr::select(bcr_patient_barcode, gender, vital_status,
                  days_to_death, days_to_last_followup, race_list,
                  person_neoplasm_cancer_status,
                  stage_event_pathologic_stage,
                  stage_event_tnm_categories) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Deceased patients are timed by days_to_death, living ones by their last
  # follow-up; both parts are stacked into one table.
  survival_data <- rbind(
    survival_rows(clinical_trait, 'Dead',
                  "days_to_death", "days_to_last_followup"),
    survival_rows(clinical_trait, 'Alive',
                  "days_to_last_followup", "days_to_death")
  )
  write.csv(survival_data, file = out_file("clinical_"))

  ### copy number variation
  cnv_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(cnv_query)
  # NOTE(review): GDCprepare() appears to return a plain data frame for
  # segment data, on which assay() would fail - as_table() accepts both.
  cnv_table <- as.data.frame(as_table(GDCprepare(query = cnv_query)))
  write.csv(cnv_table, file = out_file("Copy-Number-Variation_"))

  ### methylation (legacy archive)
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_matrix <- assay(GDCprepare(query = meth_query))
  write.csv(meth_matrix, file = out_file("methylation_", "2.csv"))

  ### miRNA
  mir_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(mir_query)
  mir_table <- GDCprepare(query = mir_query)
  # The complete table used to be written to the same file as the read
  # counts below and was therefore overwritten; it now gets its own file.
  write.csv(mir_table, file = out_file("miRNAs_full_"))
  # First column holds the row identifiers; move it into the row names.
  row.names(mir_table) <- as.character(mir_table[, 1])
  mir_table <- mir_table[, -1, drop = FALSE]
  # Column names carry a measure prefix followed by "TCGA<rest of barcode>";
  # collect the unique barcode remainders to rebuild the per-measure names.
  barcode_rest <- vapply(
    strsplit(colnames(mir_table), split = "TCGA", fixed = TRUE),
    function(parts) parts[2],
    character(1)
  )
  barcode_rest <- barcode_rest[!duplicated(barcode_rest)]
  rpkm_names <- paste0("reads_per_million_miRNA_mapped_TCGA", barcode_rest)
  count_names <- paste0("read_count_TCGA", barcode_rest)
  write.csv(mir_table[, rpkm_names, drop = FALSE],
            file = out_file("miRNAs_RPKM"))
  write.csv(mir_table[, count_names, drop = FALSE],
            file = out_file("miRNAs_"))
  invisible(NULL)
}
Download_TCGA(cancer_type,release)

三、多线程批量下载所有TCGA

调用19线程

#!/usr/bin/env Rscript
# Start from an empty global environment, then attach the helper packages.
rm(list = ls())
library(stringr)
library(parallel)

# projects.csv lists the GDC projects to fetch; its project_id column
# supplies the identifiers handed to the download function.
cancerType <- read.csv(file = "projects.csv", header = TRUE)
cancer_type <- as.character(cancerType$project_id)


###download counts data

#' Download TCGA data for one GDC project into its own output directory.
#'
#' Creates "<base dir>/<cancer_type>", works inside it, and exports the HTSeq
#' count and FPKM expression matrices, the clinical data, copy-number
#' segments, legacy DNA methylation and miRNA quantification as CSV files.
#' The release label used in the file names is read from the GDC at run time.
#' The function is self-contained so it can be shipped to parallel workers.
#'
#' NOTE(review): newer GDC releases appear to have replaced the "HTSeq - *"
#' workflows and retired the legacy archive - confirm the query arguments
#' against the TCGAbiolinks version in use.
#'
#' @param cancer_type GDC project id, e.g. "TCGA-BRCA".
#' @return `NULL`, invisibly; the function is called for its side effects.
Download_TCGA <- function(cancer_type) {
  suppressMessages(library(TCGAbiolinks))
  suppressMessages(library(SummarizedExperiment))
  suppressMessages(library(dplyr))
  # DT stays attached as before; the datatable() preview that used to be built
  # here was removed because its value was discarded inside the function.
  suppressMessages(library(DT))

  base_dir <- "~/Desktop/tcga_test" #should change this before you run
  out_dir <- paste0(base_dir, "/", cancer_type)
  # showWarnings = FALSE: re-running for an existing project is not an issue.
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  # Work inside the project directory, but hand the previous working
  # directory back when the function exits (normally or through an error).
  previous_wd <- setwd(out_dir)
  on.exit(setwd(previous_wd), add = TRUE)

  # get GDC version information; the first 17 characters form the label
  gdc_info <- getGDCInfo()
  release <- substr(as.character(gdc_info$data_release), 1, 17)

  # Build "<project>-<label>-<release>-<suffix>", the original naming scheme.
  out_file <- function(label, suffix = ".csv") {
    paste(cancer_type, label, release, suffix, sep = "-")
  }

  # Return the assay matrix of a SummarizedExperiment, or the object itself
  # when GDCprepare() already produced a plain table.
  as_table <- function(prepared) {
    if (methods::is(prepared, "SummarizedExperiment")) {
      assay(prepared)
    } else {
      prepared
    }
  }

  # Download one expression workflow and write the full, normal-only (NT) and
  # primary-tumor-only (TP) matrices.
  export_expression <- function(workflow, label) {
    query <- GDCquery(project = cancer_type,
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification",
                      workflow.type = workflow)
    GDCdownload(query)
    expr <- as.data.frame(assay(GDCprepare(query = query)))
    write.csv(expr, file = out_file(paste0(label, "_")))
    barcodes <- colnames(expr)
    normal <- TCGAquery_SampleTypes(barcode = barcodes, typesample = c("NT"))
    tumor <- TCGAquery_SampleTypes(barcode = barcodes, typesample = c("TP"))
    # drop = FALSE: with exactly one matching sample the subset would
    # otherwise collapse to a bare vector and lose its barcode in the CSV.
    write.csv(expr[, normal, drop = FALSE],
              file = out_file(paste0(label, "_normal_")))
    write.csv(expr[, tumor, drop = FALSE],
              file = out_file(paste0(label, "_tumor_")))
    invisible(expr)
  }

  # Build the survival rows for one vital status: `time_col` becomes OS.Time
  # (converted from days to years), `unused_col` is dropped, OS is 1 for
  # "Dead" and 0 otherwise.
  survival_rows <- function(trait, wanted_status, time_col, unused_col) {
    rows <- dplyr::filter(trait, vital_status == wanted_status)
    rows[[unused_col]] <- NULL
    new_names <- c(bcr_patient_barcode = 'Barcode',
                   gender = 'Gender',
                   vital_status = 'OS',
                   race_list = 'Race',
                   person_neoplasm_cancer_status = 'cancer_status',
                   age_at_initial_pathologic_diagnosis = 'Age',
                   neoplasm_histologic_grade = 'Grade',
                   stage_event_pathologic_stage = 'Stage',
                   stage_event_tnm_categories = 'TNM')
    new_names[time_col] <- 'OS.Time'
    rows %>%
      reshape::rename(new_names) %>%
      mutate(OS = ifelse(OS == 'Dead', 1, 0)) %>%
      mutate(OS.Time = OS.Time / 365)
  }

  ### expression: counts and FPKM
  export_expression("HTSeq - Counts", "Counts")
  export_expression("HTSeq - FPKM", "FPKM")

  ### clinical: indexed table first, then the patient-level XML records
  clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
  write.csv(clinical, "clinical_full.csv")

  clc_query <- GDCquery(project = cancer_type,
                        data.category = "Clinical",
                        file.type = "xml")
  GDCdownload(clc_query)
  clinical <- GDCprepare_clinic(clc_query, clinical.info = "patient")

  # One row per patient, restricted to the fields needed for survival analysis.
  clinical_trait <- clinical %>%
    dplyr::select(bcr_patient_barcode, gender, vital_status,
                  days_to_death, days_to_last_followup, race_list,
                  person_neoplasm_cancer_status,
                  stage_event_pathologic_stage,
                  stage_event_tnm_categories) %>%
    distinct(bcr_patient_barcode, .keep_all = TRUE)

  # Deceased patients are timed by days_to_death, living ones by their last
  # follow-up; both parts are stacked into one table.
  survival_data <- rbind(
    survival_rows(clinical_trait, 'Dead',
                  "days_to_death", "days_to_last_followup"),
    survival_rows(clinical_trait, 'Alive',
                  "days_to_last_followup", "days_to_death")
  )
  write.csv(survival_data, file = out_file("clinical_"))

  ### copy number variation
  cnv_query <- GDCquery(project = cancer_type,
                        data.category = "Copy Number Variation",
                        data.type = "Copy Number Segment")
  GDCdownload(cnv_query)
  # NOTE(review): GDCprepare() appears to return a plain data frame for
  # segment data, on which assay() would fail - as_table() accepts both.
  cnv_table <- as.data.frame(as_table(GDCprepare(query = cnv_query)))
  write.csv(cnv_table, file = out_file("Copy-Number-Variation_"))

  ### methylation (legacy archive)
  meth_query <- GDCquery(project = cancer_type,
                         legacy = TRUE,
                         data.category = "DNA methylation")
  GDCdownload(meth_query)
  meth_matrix <- assay(GDCprepare(query = meth_query))
  write.csv(meth_matrix, file = out_file("methylation_", "2.csv"))

  ### miRNA
  mir_query <- GDCquery(project = cancer_type,
                        data.category = "Transcriptome Profiling",
                        data.type = "miRNA Expression Quantification",
                        workflow.type = "BCGSC miRNA Profiling")
  GDCdownload(mir_query)
  # The single-project script in this file treats the prepared miRNA result as
  # a data frame (first column moved into row names); calling assay() on such
  # an object fails, so the table is passed through as_table() instead.
  mir_table <- as_table(GDCprepare(query = mir_query))
  write.csv(mir_table, file = out_file("miRNAs_", "2.csv"))

  message(paste0(cancer_type," Download Finished!"))
}
# Use at most 19 workers, never more than there are projects, and at least
# one so that makeCluster() always receives a valid count.
n_workers <- max(1L, min(19L, length(cancer_type)))
cl <- makeCluster(n_workers)
# `finally` shuts the workers down even when one of the downloads raises an
# error; previously a failure left the worker processes running.
tryCatch(
  parLapply(cl, cancer_type, Download_TCGA),
  finally = stopCluster(cl)
)

最后编辑于：2020.05.16 22:25:57

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 205,132评论 6赞 478
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 87,802评论 2赞 381
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 151,566评论 0赞 338
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 54,858评论 1赞 277
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 63,867评论 5赞 368
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 48,695评论 1赞 282
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 38,064评论 3赞 399
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 36,705评论 0赞 258
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 42,915评论 1赞 300
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 35,677评论 2赞 323
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 37,796评论 1赞 333
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 33,432评论 4赞 322
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 39,041评论 3赞 307
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 29,992评论 0赞 19
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 31,223评论 1赞 260
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 45,185评论 2赞 352
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 42,535评论 2赞 343

下载并预处理TCGA数据

方法一

方法二

三、多线程批量下载所有TCGA

推荐阅读更多精彩内容