################ Classification algorithms and data mining -- classification as the counterpart of regression, for a binary (0/1) outcome Y
#### Classification analysis -- motivation: from a person's credit record, predict whether they will repay a loan, i.e. decide whether to lend (0/1);
#### or predict from other features whether a tumor is present (0/1). This resembles logistic regression with y = 0/1,
#### except that the data are split into a training set and a validation set
##### Packages: rpart, rpart.plot, party (randomForest and e1071 are also used below)
##### Methods for a binary (0/1) outcome y: logistic regression, decision trees, random forests, support vector machines, neural networks, Bayesian classifiers, and k-nearest neighbors
# Data preparation -- UCI Machine Learning Repository
loc<-"https://archive.ics.uci.edu/ml/machine-learning-databases/"
ds<-"breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url<-paste(loc,ds,sep = "")
loc
ds
url
breast<-read.table(url,sep = ",",header = F,na.strings = "?")
names(breast)<-c("ID","clumpThickness","sizeUniformity","shapeUniformity","maginalAdhesion","singleEpithelialCellSize",
"bareNuclei","blandChromatin","normalNucleoli","mitosis","class")
df<-breast[-1]
df$class<-factor(df$class,levels = c(2,4),labels = c("benign","malignant"))
## set.seed() makes the analysis reproducible: anyone who also uses 1234 gets the same random numbers
set.seed(1234)
## Select the training set: draw 0.7*nrow(df) row indices from the nrow(df) = 699 rows without replacement (set replace=TRUE to sample with replacement)
train<-sample(nrow(df),0.7*nrow(df))
## Extract the training subset
df.train<-df[train,]
### Extract the validation subset
df.validate<-df[-train,]
## Check the class counts in each subset
table(df.train$class)
table(df.validate$class)
### Method 1: logistic regression
### In glm(class~.), the "." on the right-hand side stands for every variable except the response
## Fit the model:
fit_logit<-glm(class~.,data = df.train,family = binomial())
summary(fit_logit)
## Score the validation set with the fitted model; type = "response" returns predicted probabilities between 0 and 1
prob<-predict(fit_logit,df.validate,type="response")
prob
## Classify cases with predicted probability > 0.5 as malignant, and those below 0.5 as benign
logit.pred<-factor(prob>.5,levels = c(FALSE,TRUE),labels = c("benign","malignant"))
logit.pred
## Cross-tabulate actual versus predicted class
logit.perf<-table(df.validate$class,logit.pred,dnn=c("Actual","Predicted"))
### 118 benign and 76 malignant cases are classified correctly
#### Accuracy = (76+118)/200 = 0.97; note the table covers only 200 of the 210 validation cases,
#### because predict() returns NA for rows with missing predictor values and table() drops them
(76+118)/200
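## The same accuracy can be read off the confusion matrix directly instead of
## hard-coding the counts (diag() picks out the correctly classified cells):
sum(diag(logit.perf))/sum(logit.perf)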
### Looking back at summary(fit_logit), several predictors have p-values > 0.05 (not significant); you can drop them and refit by hand, or use stepwise selection as below
logit.fit.reduced<-step(fit_logit)
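## The stepwise-reduced model can be checked against the validation set in the
## same way as the full model (a sketch; prob.reduced and logit.reduced.pred
## are names introduced here):
prob.reduced <- predict(logit.fit.reduced, df.validate, type = "response")
logit.reduced.pred <- factor(prob.reduced > .5, levels = c(FALSE, TRUE),
                             labels = c("benign", "malignant"))
table(df.validate$class, logit.reduced.pred, dnn = c("Actual", "Predicted"))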
### Decision trees: two kinds -- classical trees and conditional inference trees
#### Classical trees: rpart() in the rpart package grows the tree; prune() cuts it back to the subtree with the smallest prediction error
library(rpart)
set.seed(1234)
## Grow the tree
dtree<-rpart(class~.,data = df.train,method="class",parms = list(split="information"))
dtree$cptable
plotcp(dtree)
### Prune using the complexity parameter cp = 0.0125, read from dtree$cptable (the row with the smallest cross-validated error, xerror); a programmatic alternative follows the prune() call below
dtree.pruned<-prune(dtree,cp=0.0125)
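## Rather than hard-coding cp, the value with the smallest cross-validated
## error can be pulled from the cptable programmatically (an equivalent sketch
## using the standard rpart cptable columns):
best.cp <- dtree$cptable[which.min(dtree$cptable[, "xerror"]), "CP"]
dtree.pruned <- prune(dtree, cp = best.cp)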
## Plot: prp() draws the final tree
library(rpart.plot)
prp(dtree.pruned,type=2,extra = 104,fallen.leaves =TRUE,main="Decision Tree" )
## Classify the validation set
dtree.pred<-predict(dtree.pruned,df.validate,type="class")
dtree.perf<-table(df.validate$class,dtree.pred,dnn = c("Actual","Predicted"))
dtree.perf
## Accuracy = (122+79)/210 = 0.96; rpart classifies all 210 validation cases because surrogate splits handle missing values
(122+79)/210
## Conditional inference trees: splits are chosen by significance tests of independence, in contrast to classical trees, which split to maximize the purity of the outcome classes within each node; party package
library(party)
fit_ctree<-ctree(class~.,data = df.train)
plot(fit_ctree,main="Conditional Inference Tree")
ctree.pred<-predict(fit_ctree,df.validate,type="response")
ctree.perf<-table(df.validate$class,ctree.pred,dnn = c("Actual","Predicted"))
ctree.perf
### Random forests: each case is assigned the class predicted by the majority (mode) of the individual trees
### randomForest() in the randomForest package grows 500 trees by default
library(randomForest)
set.seed(1234)
### na.action=na.roughfix replaces missing values with the column median (or the most frequent level for factors)
fit.forest<-randomForest(class~.,df.train,na.action=na.roughfix,importance=TRUE)
fit.forest
### Variable importance: see which variables matter most; type = 2 reports the total decrease in node impurity (Gini)
importance(fit.forest,type = 2)
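## The same importances can also be inspected visually; varImpPlot() ships
## with the randomForest package:
varImpPlot(fit.forest)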
## Validate against the validation set
forest.pred<-predict(fit.forest,df.validate)
forest.perf<-table(df.validate$class,forest.pred,dnn = c("Actual","Predicted"))
forest.perf
## Support vector machines (SVM): project the data into a higher-dimensional space,
## e.g. (x, y) -> (x^2, sqrt(2)*x*y, y^2), turning 2-D data into 3-D coordinates Z1, Z2, Z3 (see the sketch below)
### ksvm() in the kernlab package is the more powerful option; svm() in the e1071 package is simpler
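## A tiny illustration of that projection (an assumed toy example, not part of
## the original analysis): the explicit map phi(u) = (u1^2, sqrt(2)*u1*u2, u2^2)
## satisfies sum(phi(u)*phi(v)) == (sum(u*v))^2, i.e. the degree-2 polynomial
## kernel computes the 3-D inner product without ever forming Z1, Z2, Z3.
phi <- function(u) c(u[1]^2, sqrt(2)*u[1]*u[2], u[2]^2)
u <- c(1, 2); v <- c(3, 4)
sum(phi(u) * phi(v))   # inner product in the projected 3-D space: 121
sum(u * v)^2           # kernel value in the original 2-D space: also 121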
library(e1071)
set.seed(1234)
fit.svm<-svm(class~.,df.train)
fit.svm
## svm() cannot score cases with missing values, so na.omit() drops them from the validation set
svm.pred<-predict(fit.svm,na.omit(df.validate))
svm.perf<-table(na.omit(df.validate)$class,svm.pred,dnn = c("Actual","Predicted"))
svm.perf
##### Parameter tuning: improve the SVM by searching a grid over gamma and cost
set.seed(1234)
tuned<-tune.svm(class~.,data=df.train,gamma = 10^(-6:1),cost = 10^(-10:10))
## tune.svm() reports gamma = 0.01, cost = 1 as the best combination
tuned
fit.tuned.svm<-svm(class~.,data = df.train,gamma=0.01,cost=1)
svm.tuned.pred<-predict(fit.tuned.svm,na.omit(df.validate))
svm.tuned.perf<-table(na.omit(df.validate)$class,svm.tuned.pred,dnn = c("Actual","Predicted"))
svm.tuned.perf
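## The tuned values need not be retyped by hand; they live in the tune object
## (tuned$best.parameters and tuned$best.model are standard components of
## e1071's tune result):
tuned$best.parameters
svm.tuned.pred2 <- predict(tuned$best.model, na.omit(df.validate))   # svm.tuned.pred2 is a name introduced here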
##############################################
### Pick the best-predicting model -- measures of binary classification accuracy
performance <- function(table, n = 2){
  if(!all(dim(table) == c(2, 2)))
    stop("Must be a 2x2 table")
  tn <- table[1,1]                     # true negatives
  fp <- table[1,2]                     # false positives
  fn <- table[2,1]                     # false negatives
  tp <- table[2,2]                     # true positives
  sensitivity <- tp/(tp+fn)            # true positive rate
  specificity <- tn/(tn+fp)            # true negative rate
  ppp <- tp/(tp+fp)                    # positive predictive value
  npp <- tn/(tn+fn)                    # negative predictive value
  hitrate <- (tp+tn)/(tp+tn+fp+fn)     # overall accuracy
  result <- paste("Sensitivity=", round(sensitivity, n),
                  "\nSpecificity=", round(specificity, n),
                  "\nPositive Predictive Value=", round(ppp, n),
                  "\nNegative Predictive Value=", round(npp, n),
                  "\nAccuracy=", round(hitrate, n), "\n", sep = "")
  cat(result)
}
performance(logit.perf)
performance(dtree.perf)
performance(ctree.perf)
performance(forest.perf)
performance(svm.perf)
performance(svm.tuned.perf)
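## To rank the models at a glance, the accuracies can also be computed side by
## side (a small convenience sketch; perf.list is a name introduced here):
perf.list <- list(logit = logit.perf, dtree = dtree.perf, ctree = ctree.perf,
                  forest = forest.perf, svm = svm.perf, svm.tuned = svm.tuned.perf)
sapply(perf.list, function(tb) sum(diag(tb)) / sum(tb))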
################################################
### Data mining with the rattle package: library(rattle) -> rattle() opens the GUI
# Data preparation (repeated so rattle can start from the freshly loaded data)
loc<-"https://archive.ics.uci.edu/ml/machine-learning-databases/"
ds<-"breast-cancer-wisconsin/breast-cancer-wisconsin.data"
url<-paste(loc,ds,sep = "")
breast<-read.table(url,sep = ",",header = F,na.strings = "?")
names(breast)<-c("ID","clumpThickness","sizeUniformity","shapeUniformity","maginalAdhesion","singleEpithelialCellSize",
"bareNuclei","blandChromatin","normalNucleoli","mitosis","class")
df<-breast[-1]
df$class<-factor(df$class,levels = c(2,4),labels = c("benign","malignant"))
## Factor the class column of breast itself (the original line factored the already-recoded df$class, which would yield all NAs)
breast$class<-factor(breast$class,levels = c(2,4),labels = c("benign","malignant"))
library(rattle)
rattle()