加载到表达矩阵,分组,生存情况
rm(list=ls())
load(file = 'Rdata/TCGA_KIRC_mut.Rdata')
load(file = 'Rdata/TCGA-KIRC-miRNA-example.Rdata')
group_list=ifelse(as.numeric(substr(colnames(expr),14,15)) < 10,'tumor','normal')
table(group_list)
load(file='Rdata/survival_input.Rdata')
head(phe)
exprSet[1:4,1:4]
切割数据组
方法一:完全随机切割
dim(expr)
set.seed(1234567890)#设定一个数值,使得随机数不会每次都变化
#sample(1:10,3)
k=sample(1:593,300)#在600个数字中随机选300个
t_exp=expr[,k] ##
v_exp=expr[,-k]##
#把数据分成了两组
table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))
t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=t_exp[,substr(colnames(v_exp),14,15)=='01']
t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)
方法二:recipes包,切割数据,可以保证分组平衡(一般用这个方法比较好)
#install.packages("recipes")
if(F){
## 切割数据,可以平衡
library(caret)
set.seed(123456789)
sam<- createDataPartition(phe$event, p = .5,list = FALSE)
train <- phe[sam,]
test <- phe[-sam,]
#查看两组一些临床参数切割比例
prop.table(table(train$stage))
prop.table(table(test$stage))
}
t_exp= expr[,match(train$ID,substr(colnames(expr),1,12))]
v_exp=expr[,-match(train$ID,substr(colnames(expr),1,12))]
#match返回X第一次在Y种的位置
#v_exp=expr[,-which(colnames(expr)%in%train$ID)]##找到位置之后删除
#把数据分成了两组
table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))
t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=v_exp[,substr(colnames(v_exp),14,15)=='01']
t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)
head(t_phe)
t_tumor[1:4,1:4]
最后:进行了lars回归,把数据分成两部分,一部分算,一部分用来验证
library(lars)
library(glmnet)
x=t(log2(t_tumor+1))
y=t_phe$event
cv_fit <- cv.glmnet(x=x, y=y, alpha = 1, nlambda = 1000)
#进行了一个lars回归,X是表达数据,Y是生存情况
plot.cv.glmnet(cv_fit)
# 两条虚线分别指示了两个特殊的λ值:
c(cv_fit$lambda.min,cv_fit$lambda.1se)
model_lasso <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
#把预测和这个数据算出的结果相比较,看看预测准确度
lasso.prob <- predict(cv_fit, newx=t(log2(v_tumor+1)) , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(v_phe$event ,lasso.prob)
## https://vip.biotrainee.com/d/812-
library(ROCR)
library(glmnet)
library(caret)
# calculate probabilities for TPR/FPR for predictions
pred <- prediction(re[,2], re[,1])
perf <- performance(pred,"tpr","fpr")
performance(pred,"auc") # shows calculated AUC for model
plot(perf,colorize=FALSE, col="black") # plot ROC curve
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
#把数据分成两部分,一部分算,一部分用来验证
最后
感谢jimmy的生信技能树团队!
感谢导师岑洪老师!
感谢健明、孙小洁,慧美等生信技能树团队的老师一路以来的指导和鼓励!
文中代码来自生信技能树jimmy老师!