TCGA数据挖掘六：怎么随机把数据分成两部分

加载到表达矩阵，分组，生存情况

rm(list=ls())
load(file = 'Rdata/TCGA_KIRC_mut.Rdata')
load(file = 'Rdata/TCGA-KIRC-miRNA-example.Rdata')
group_list=ifelse(as.numeric(substr(colnames(expr),14,15)) < 10,'tumor','normal')
table(group_list)
load(file='Rdata/survival_input.Rdata')
head(phe)
exprSet[1:4,1:4]

切割数据组

方法一：完全随机切割

dim(expr)
set.seed(1234567890)#设定一个数值，使得随机数不会每次都变化
#sample(1:10,3)
k=sample(1:593,300)#在600个数字中随机选300个

t_exp=expr[,k] ##
v_exp=expr[,-k]## 
#把数据分成了两组

table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))

t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=t_exp[,substr(colnames(v_exp),14,15)=='01']

t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)

方法二：recipes包，切割数据，可以保证分组平衡（一般用这个方法比较好）

#install.packages("recipes")
if(F){
  ## 切割数据，可以平衡
  library(caret)
  set.seed(123456789)
  sam<- createDataPartition(phe$event, p = .5,list = FALSE)
  train <- phe[sam,]
  test <- phe[-sam,]
  #查看两组一些临床参数切割比例
  prop.table(table(train$stage))
  prop.table(table(test$stage)) 
}

t_exp= expr[,match(train$ID,substr(colnames(expr),1,12))]
v_exp=expr[,-match(train$ID,substr(colnames(expr),1,12))]
#match返回X第一次在Y种的位置
#v_exp=expr[,-which(colnames(expr)%in%train$ID)]##找到位置之后删除
#把数据分成了两组

table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))

t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=v_exp[,substr(colnames(v_exp),14,15)=='01']

t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)
head(t_phe)
t_tumor[1:4,1:4]

最后：进行了lars回归，把数据分成两部分，一部分算，一部分用来验证

library(lars) 
library(glmnet) 
x=t(log2(t_tumor+1))
y=t_phe$event 
cv_fit <- cv.glmnet(x=x, y=y, alpha = 1, nlambda = 1000)
#进行了一个lars回归，X是表达数据，Y是生存情况
plot.cv.glmnet(cv_fit)
# 两条虚线分别指示了两个特殊的λ值:
c(cv_fit$lambda.min,cv_fit$lambda.1se) 
model_lasso <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
#把预测和这个数据算出的结果相比较，看看预测准确度
lasso.prob <- predict(cv_fit, newx=t(log2(v_tumor+1)) , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(v_phe$event ,lasso.prob)
## https://vip.biotrainee.com/d/812-
library(ROCR)
library(glmnet)
library(caret)
# calculate probabilities for TPR/FPR for predictions
pred <- prediction(re[,2], re[,1])
perf <- performance(pred,"tpr","fpr")
performance(pred,"auc") # shows calculated AUC for model
plot(perf,colorize=FALSE, col="black") # plot ROC curve
lines(c(0,1),c(0,1),col = "gray", lty = 4 )

#把数据分成两部分，一部分算，一部分用来验证

最后

感谢jimmy的生信技能树团队！

感谢导师岑洪老师！

感谢健明、孙小洁，慧美等生信技能树团队的老师一路以来的指导和鼓励！

文中代码来自生信技能树jimmy老师!

最后编辑于：2019.07.14 22:23:08