PMID: 30237698 本文的例文
(一)课程介绍
(二)例文思路
(三)R和Rstudio软件的下载和安装(略)
尽量用R
(四)临床资料数据整理
(五)lasso回归筛选变量
目的: 防止模型过度拟合
代码如下
#install.packages("glmnet")
library(glmnet)
setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\1_lasso") #需要改为自己工作目录
mydata<-read.table("input.txt",header=T,sep="\t") #需要修改
#View(mydata)
v1<-as.matrix(mydata[,c(3:7)]) #需要修改,选择因素
v2 <-mydata[,2]#结局
myfit <- glmnet(v1, v2, family = "binomial")
pdf("lambda.pdf")
plot(myfit, xvar = "lambda", label = TRUE)
dev.off()
myfit2 <- cv.glmnet(v1, v2, family="binomial")
pdf("min.pdf")
plot(myfit2)
abline(v=log(c(myfit2$lambda.min,myfit2$lambda.1se)),lty="dashed")
dev.off()
myfit2$lambda.min#最小lambda
##下面求出因素的项目是什么
coe <- coef(myfit, s = myfit2$lambda.min)
act_index <- which(coe != 0)
act_coe <- coe[act_index]
row.names(coe)[act_index]
[1] "(Intercept)" "Use_of_NSAIDs" "Number_of_questions"
[4] "Education_level" "Distance"
##纳入的因素多的话,可以最后只纳入lasso回归的项目,如果比较少,可以全部纳入,需要考虑临床意义和统计学结果。以上有四个
结果如下
(六)基于lasso回归筛选出的变量进行logistic回归分析或者有的论文直接将其认为重要的变量纳入logistics回归
> library(rms)#加载包
> non_tumor<-read.table("input.txt",header=T,sep="\t")
> #将因素转为因子,为列线图准备
> non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
> non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
> non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
> non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
> non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
#加载数据
> ddist <- datadist(non_tumor)
> options(datadist="ddist")
#构建多因素logistic回归
#Status为因变量结局事件,其余为自变量
#多因素“+”
> mylog<- glm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,family=binomial(link = "logit"),data = non_tumor)
> summary(mylog)
Call:
glm(formula = Status ~ Use_of_GC + Use_of_NSAIDs + Number_of_questions +
Education_level + Distance, family = binomial(link = "logit"),
data = non_tumor)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6280 -0.5903 -0.4371 0.7981 2.1889
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.23991 0.19213 -11.659 < 2e-16 ***
Use_of_GCYes -0.06019 0.19894 -0.303 0.76221
Use_of_NSAIDsYes 0.64123 0.23777 2.697 0.00700 **
Number_of_questions1 2.57943 0.20398 12.645 < 2e-16 ***
Number_of_questions>=2 -0.55084 1.14881 -0.479 0.63159
Education_levelSecondary 1.43120 0.29699 4.819 1.44e-06 ***
Education_levelHigher -1.53754 1.10229 -1.395 0.16306
Distance>=3km 1.00899 0.30764 3.280 0.00104 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 908.71 on 734 degrees of freedom
Residual deviance: 653.88 on 727 degrees of freedom
AIC: 669.88
Number of Fisher Scoring iterations: 5
> coefficients(mylog)
(Intercept) Use_of_GCYes Use_of_NSAIDsYes
-2.23990663 -0.06019402 0.64122681
Number_of_questions1 Number_of_questions>=2 Education_levelSecondary
2.57942983 -0.55084386 1.43120253
Education_levelHigher Distance>=3km
-1.53753800 1.00899490
> exp(coefficients(mylog)) #计算OR值
(Intercept) Use_of_GCYes Use_of_NSAIDsYes
0.1064684 0.9415818 1.8988089
Number_of_questions1 Number_of_questions>=2 Education_levelSecondary
13.1896157 0.5764632 4.1837272
Education_levelHigher Distance>=3km
0.2149096 2.7428428
> exp(confint(mylog)) #OR值得置信区间
Waiting for profiling to be done...
2.5 % 97.5 %
(Intercept) 0.07211103 0.1533035
Use_of_GCYes 0.63692511 1.3908062
Use_of_NSAIDsYes 1.19016996 3.0275911
Number_of_questions1 8.90991341 19.8414271
Number_of_questions>=2 0.02846893 4.1170238
Education_levelSecondary 2.34800900 7.5386868
Education_levelHigher 0.01110045 1.3034572
Distance>=3km 1.50146947 5.0269071
(七)列线图的绘制
#install.packages("rms")#加载R包
library(rms)
non_tumor<-read.table("input.txt",header=T,sep="\t")#读取数据
#多因素回归因子化
non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
#加载差异数据
ddist <- datadist(non_tumor)
options(datadist="ddist")
#多因素回归分析
mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
#画列线图
mynom<- nomogram(mylog, fun=plogis,fun.at=c(0.0001,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.9999),lp=F, funlabel="Risk of nonadherence")#"Risk of nonadherence",可以改名,具体论文具体分析
#保存为PDF格式的文件
pdf("Nom.pdf",10,8)
plot(mynom)
dev.off()
列线图结果
(八)C_index
library(rms)
setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\4_C-index")
non_tumor<-read.table("input.txt",header=T,sep="\t")
non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
non_tumor$Distance<-factor(non_tumor$Distance,labels=c("<3km",">=3km"))
ddist <- datadist(non_tumor)
options(datadist="ddist")
mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
mylog
#install.packages("Hmisc")
library(Hmisc)
###这一节,主要看这里的代码,其余和之前差别不大
Cindex <- rcorrcens(non_tumor$Status~predict(mylog))
Cindex
#################c-index#######
####0.5,模型没有任何预测能力
####0.5-0.7,比差的准确性
####0.71-0.9,中等的准确性
####>0.9,高准确性
(九)校准 Calibration
#install.packages("rms")
library(rms)
setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\5_Calibration")
non_tumor<-read.table("input.txt",header=T,sep="\t")
non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
non_tumor$Distance<-factor(non_tumor$Distance,labels=c("No","Yes"))
ddist <- datadist(non_tumor)
options(datadist="ddist")
mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
#
mycal<-calibrate(mylog,method="boot",B=1000) #B=1000,医院的数据一般不会大于1000,seer数据库挖掘可能比较大,因为越大,电脑运行的越久。这里建议1000
pdf("Calibration.pdf")
plot(mycal,xlab="Nomogram-predicted probability of nonadherence",ylab="Actual diagnosed nonadherence (proportion)",sub=F)
dev.off()
结果如图
校准线越贴近理想线,说明结果就越好。
十(ROC曲线)
ROC的解读类似于C指数,二者仍有不同
#install.packages("ROCR")
library(ROCR)
library(rms)
#读取数据
non_tumor<-read.table("input.txt",header=T,sep="\t")
#建模
mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
pre_rate<-predict(mylog)
ROC1<- prediction(pre_rate,non_tumor$Status)
ROC2<- performance(ROC1,"tpr","fpr")
AUC <- performance(ROC1,"auc")
AUC
AUC<-0.8175882 #这里一定要改为自己的AUC
pdf("ROC.pdf")
plot(ROC2,col="blue", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC))
abline(0,1,lty=2,lwd=3)
dev.off()
(十一)决策曲线 DCA
#为考虑病人的获益情况(有文章也没有做)
install.packages("rmda")
library(rms)
library(rmda)
setwd("C:\\Users\\scikuangren\\Desktop\\logistics\\7_DCA")
non_tumor<-read.table("input.txt",header=T,sep="\t")
modul<- decision_curve(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data= non_tumor,
family = binomial(link ='logit'),
thresholds= seq(0,1, by = 0.01),
confidence.intervals = 0.95)
pdf("DCA.pdf")
plot_decision_curve(modul,
curve.names="Nonadherence prediction nomogram",xlab="Threshold probability",
cost.benefit.axis =FALSE,col= "blue",
confidence.intervals=FALSE,
standardize = FALSE)
dev.off()
modul
净收益 只有在范围内才是收益的
(十二)模型验证
12.1 C-index
#本文利用随机抽样的方法进行验证
library(rms)
non_tumor<-read.table("input.txt",header=T,sep="\t")
non_tumor$Use_of_GC<-factor(non_tumor$Use_of_GC,labels=c("No","Yes"))
non_tumor$Use_of_NSAIDs<-factor(non_tumor$Use_of_NSAIDs,labels=c("No","Yes"))
non_tumor$Number_of_questions<-factor(non_tumor$Number_of_questions,labels=c("0","1",">=2"))
non_tumor$Education_level<-factor(non_tumor$Education_level,labels=c("Primary","Secondary","Higher"))
non_tumor$Distance<-factor(non_tumor$Distance,labels=c("No","Yes"))
ddist <- datadist(non_tumor)
options(datadist="ddist")
mylog<-lrm(Status~Use_of_GC + Use_of_NSAIDs + Number_of_questions + Education_level + Distance,data=non_tumor,x=T,y=T)
#验证部分
set.seed(300)
myc<-validate(mylog,method="b",B = 1000,pr=T,dxy=T)
c_index<-(myc[1,5]+1)/2
c_index
12.2
#一部分建模(70%),一部分验证(30%)
#如果有两/多中心数据,可以分开;否则只能内部抽样
#之后,我们可以计算验证部分的c指数
#install.packages("caret")
library(foreign)
library(survival)
library(caret)
#加载数据
non_tumor<-read.table("input.txt",header=T,sep="\t")
set.seed(300)
non_tumord<-createDataPartition(y=non_tumor$id,p=0.70,list=F)
non_tumordev<-non_tumor[non_tumord, ]
non_tumorv<-non_tumor[-non_tumord,]
#生成两个文件,一个为建模人群,第二个为验证人群
write.csv(non_tumordev, "non_tumordev.csv")
write.csv(non_tumorv, "non_tumorv.csv")
#验证
tcga<-read.table("ver.txt",header=T,sep="\t")
#从non_tumorv.csv数据来的,删除了第一项,保留id和其它因素
#将数据转换成因子格式
tcga$age<-factor(tcga$age,labels=c("<50","50-59","60-69",">=70"))
tcga$sex<-factor(tcga$sex,labels=c("FEMALE","MALE"))
tcga$race<-factor(tcga$race,labels=c("WHITE","BLACK OR AFRICAN AMERICAN","ASIAN"))
tcga$smoking<-factor(tcga$smoking,labels=c("1","2","3","4","5"))
tcga$radiation<-factor(tcga$radiation,labels=c("YES","NO"))
tcga$pharmaceutical<-factor(tcga$pharmaceutical,labels=c("YES","NO"))
tcga$stage_T<-factor(tcga$stage_T,labels=c("T1","T1a","T1b","T1c","T2","T2a","T2b","T3","T4","TX"))
tcga$stage_M<-factor(tcga$stage_M,labels=c("M0","M1","M1b","MX"))
tcga$stage_N<-factor(tcga$stage_N,labels=c("N0","N1","N2","NX"))
tcga$surgery<-factor(tcga$surgery,labels=c("0","1"))
#将数据打包好
ddist <- datadist(tcga)
options(datadist='ddist')
#构建多因素的Cox回归模型
fmla1 <- as.formula(Surv(survival_time,status) ~ age + sex + race + smoking + radiation + pharmaceutical + stage_T + stage_N + stage_M)
cox2 <- coxph(fmla1,data=tcga)
summary(cox2)
#c-index 0.714
#画校准图
cox1 <- cph(Surv(survival_time,status) ~ age + sex + race + smoking + radiation + pharmaceutical + stage_T + stage_N + stage_M,surv=T,x=T, y=T,time.inc = 1*365*5,data=tcga)
cal <- calibrate(cox1, cmethod="KM", method="boot", u=1*365*5, m=50, B=1000)
pdf("calibrate_ver.pdf",12,8)
par(mar = c(10,5,3,2),cex = 1.0)
plot(cal,lwd=3,lty=2,errbar.col="black",xlim = c(0,1),ylim = c(0,1),xlab ="Nomogram Predicted Survival",ylab="Actual Survival",col="blue")
lines(cal,c('mean.predicted',‘KM'),type = ‘a',lwd = 3,col ="black" ,pch = 16)
mtext(“ ”)
box(lwd = 1)
abline(0,1,lty = 3,lwd = 3,col = "black")
dev.off()