数据与变量:
加载数据
> cup98 <- read.csv("F:/cup98lrn.txt")
> dim(cup98[,1:10])
[1] 95412 10
> n.missing<-rowSums(is.na(cup98))
#计算每一行中缺失值(NA)的个数
> sum(n.missing>0)
[1] 95412
选择变量
> varSet<-c(
#demographics
"ODATEDW","OSOURCE","STATE","ZIP","PVASTATE","DOB","RECINHSE","MDMAUD","DOMAIN","CLUSTER","AGE","HOMEOWNR","CHILD03","CHILD07","CHILD12","CHILD18","NUMCHLD","INCOME","GENDER","WEALTH1","HIT",
#donor interests
"COLLECT1","VETERANS","BIBLE","CATLG","HOMEE","PETS","CDPLAY","STEREO","PCOWNERS","PHOTO","CRAFTS","FISHER","GARDENIN","BOATS","WALKER","KIDSTUFF","CARDS","PLATES","PEPSTRFL",
#summary variables of promotion history
"CARDPROM","MAXADATE","NUMPROM","CARDPM12","NUMPRM12",
#summary variables of giving history
"RAMNTALL","NGIFTALL","CARDGIFT","MINRAMNT","MAXRAMNT","LASTGIFT","LASTDATE","FISTDATE","TIMELAG","AVGGIFT",
#ID & targets
"CONTROLN","TARGET_B","TARGET_D","HPHONE_D",
#RFA
"RFA_2F","RFA_2A","MDMAUD_R","MDMAUD_F","MDMAUD_A",
#OTHERS
"CLUSTER2","GEOCODE2")
#删除Id和TARGET_D属性
> vars <- setdiff(varSet, c("CONTROLN", "TARGET_D"))
> cup98 <- cup98[, vars]
> dim(cup98)
[1] 95412 64
随机森林:
使用randomForest包和party包创建随机森林。
randomForest包无法处理包含缺失值或者拥有超过32个等级水平的分类变量。但学习数据集中的所有记录都包含了一个或多个缺失值。即使只看varSet中的变量,也有大于93%的记录含有一个或者多个缺失值。这在真实数据中很常见。数据集中的很多分类变量都含有超过32个等级水平,例如表示国家、邮政编码、职业以及制造商的变量。这些分类变量中有一部分可以通过分组方式来减少等级水平,例如表示职业的变量:通过将只含有少量记录的水平划分为同一组(例如小国家和小制造商)来减少分类变量的等级水平数量。
#查看缺失值以及分类变量等级超过10 的数据
> library(randomForest)
> rf <- randomForest(TARGET_B~.,data=cup98)
下面查看含有缺失值及分类变量等级超过10 的数据。
> #checking missing values
> n.missing <- rowSums(is.na(cup98))
> (tab.missing <-table(n.missing))
n.missing
    0     1     2     3     4     5     6     7
 6782 36864 23841 13684 11716  2483    41     1
> #percentage of records without missing values
> round(tab.missing["0"]/nrow(cup98),digits=2)
0
0.07
> #check levels of categorial variables
> idx.cat <- which(sapply(cup98,is.factor))
> all.levels <- sapply(names(idx.cat),function(x)
+ nlevels(cup98[,x]))
> all.levels[all.levels>10]
OSOURCE STATE ZIP MDMAUD DOMAIN
896 57 19938 28 17
下面将数据划分为训练集和测试集
> trainPercentage <-80
> testPercentage <-20
> ind <- sample(2,nrow(cup98),replace=TRUE
+ ,prob=c(trainPercentage,testPercentage))
> trainData <- cup98[ind==1,]
> testData <- cup98[ind==2,]
接下来,使用party包中的函数cforest()创建随机森林。对于80%的训练集,创建一棵决策树需要花费2min ,创建一个含有50棵决策树的随机森林需要花费1.5h。
> # cforest
> library(party)
> (time1 <- Sys.time())
> cf <- cforest(TARGET_B~.,data=trainData,
+ control = cforest_unbiased(mtry=2,ntree=50))
错误: 无法分配大小为11.3 Gb的矢量
> (time2 <- Sys.time())
[1] "2017-12-26 14:20:21 CST"
> time2-time1
Time difference of 4.906754 mins
> print(object.size(cf),units = "Mb")
Error in structure(.Call(C_objectSize, x), class = "object_size") : 找不到对象'cf'
> myPrediction <-predict(cf,newdata = testData)
Error in predict(cf, newdata = testData) : 找不到对象'cf'
> (time3 <-Sys.time())
[1] "2017-12-26 14:22:46 CST"
> time3-time2
Time difference of 2.417677 mins
内存问题
用party包中的函数ctree()创建决策树。
> memory.limit(4095)
[1] 8099
Warning message:
In memory.limit(4095) : 无法减少内存极限:忽视不用
> library(party)
> ct <- ctree(TARGET_B ~.,data=trainData)
错误: 无法分配大小为11.3 Gb的矢量
函数memory.limit() 对R设置内存限制,memory.size()当前R运行所占用的内存或者最大运存。memory.profile()用来查看使用了哪些内存空间。函数object.size()返回R对象占用的内存大小。关于R内存空间分配的细节参照运行?memory.size返回的信息。
当运行以上代码创建决策树时,遇到了一个内存空间的问题。
※ 减少内存需求的一种方法是对含有多个等级水平的分类变量进行分组或者删除。
首先,使用20%数据进行训练, 其中包含了大约19200行和62列。加入ZIP运行,函数ctree()返回了一个错误“ reach total memory allocation "。删除了ZIP之后可以成功运行,但需要花费25 min。删除OSOURCE后创建一棵决策树只需要5s。同样,输入80%的已经删除了变量ZIP和OSOURCE的数据(大约76000行和60列),执行ctree()函数需要25s。
样本数据的训练模型
为了找到哪些变量将用于建模,在本节需要对创建决策树的过程重复10次。然后收集出现在所有决策树中的每一个变量,并将收集到的变量用于建立最终模型。
首先,将数据集划分为3个子集:训练数据集30%,测试数据集20%和其余的数据。
划分出一部分的数据是为了缩减训练数据和测试数据的规模,以便在内存受限的环境下成功地执行训练和测试。
> library(party)#for tree
> trainPercentage <-30
> testPercentage <-20
> restPercentage <-100 -trainPercentage-testPercentage
> filename <-paste("cup98-ctree",trainPercentage,testPercentage,sep="-")
> vars <-setdiff(varSet,c("TARGET_D","CONTROLN","ZIP","OSOURCE"))
> # partition the data into training and test datasets
> ind <- sample(3,nrow(cup98),replace=T,
+ prob = c(trainPercentage,testPercentage,restPercentage))
> trainData <-cup98[ind==1,vars]
> testData <-cup98[ind==2,vars]
检验抽样后得到的训练集和测试集中的目标变量,看其分布与原始数据中的分布是否一致。如果不一致,可以使用分层抽样。
> #check the percentage of classes
> round(prop.table(table(cup98$TARGET_B)),digits=3)
0 1
0.949 0.051
> round(prop.table(table(trainData$TARGET_B)),digits=3)
0 1
0.949 0.051
> round(prop.table(table(testData$TARGET_B)),digits=3)
0 1
0.949 0.051
此时保存工作空间
> #remove raw data to save memory
> rm(cup98,ind)
> gc()
used (Mb) gc trigger (Mb) max used (Mb)
Ncells 1610444 86.1 2637877 140.9 2637877 140.9
Vcells 4657403 35.6 647222195 4938.0 877743600 6696.7
> memory.size()
[1] 156.11
接下来,在训练数据上调用函数ctree()创建决策树。为了简化案例并增强可读性,本章使用函数ctree()默认设置来训练决策树。例如,决策树的默认设置在13.4节。在下面的代码中,函数object.size()返回一个数据对象的大小。
加载工作空间(因为在上一步rm操作)
> # build ctree
> myCtree <-NULL
> startTime <-Sys.time()
> myCtree <-ctree(TARGET_B~.,data = trainData)
> Sys.time() - startTime
Time difference of 30.86209 secs
> print(object.size(myCtree),units = "Mb")
9.6 Mb
> #print(myCtree)
> memory.size()
[1] 2571.15
> #plot the tree and save it in a .PDF file
> pdf(paste(filename,".pdf",sep=""),width=12,height=9,
+ paper="a4r",pointsize=6)
> plot(myCtree,type="simple",ip_args=list(pval=F),
+ ep_args=list(digits=0),main=filename)
> graphics.off()
构建10棵决策树
使用已选变量建立模型
建立了10棵决策树之后,选取包含的所有变量来建立最后的模型。这一次所有的数据都用于学习,80%作为训练集和20%作为测试集。
> vars.selected<- c("CARDS", "CARDGIFT", "CARDPM12", "CHILD12", "CLUSTER2", "DOMAIN", "GENDER", "GEOCODE2", "HIT", "HOMEOWNR", "INCOME", "LASTDATE", "MINRAMNT", "NGIFTALL", "PEPSTRFL", "RECINHSE", "RFA_2A", "RFA_2F", "STATE", "WALKER")
> trainPercentage <- 80
> testPercentage <- 20
> (fileName <- paste("cup98-ctree", trainPercentage, testPercentage, sep="-"))
[1] "cup98-ctree-80-20"
> vars <- c("TARGET_B", vars.selected)
> ind <- sample(2, nrow(cup98), replace=T, prob=c(trainPercentage, testPercentage))
> trainData <- cup98[ind==1, vars]
> testData <- cup98[ind==2, vars]
> #build a decision tree
> myCtree <-ctree(TARGET_B~.,data = trainData)
> print(object.size(myCtree),units="Mb")
48.1 Mb
> memory.size()
[1] 4994.07
> print(myCtree)
Conditional inference tree with 25 terminal nodes Response:
TARGET_B Inputs:
CARDS, CARDGIFT, CARDPM12, CHILD12, CLUSTER2, DOMAIN, GENDER, GEOCODE2, HIT, H;.'tatistic = 70.324 18)* weights = 207 17) CARDPM12 > 4 19)* weights = 2751 16) CARDGIFT > 3 20) LASTDATE <= 9610; criterion = 0.958, statistic = 43.626 21) CARDPM12 <= 4; criterion = 1, statistic = 40.985 22)* weights = 152 21) CARDPM12 > 4 23)* weights = 1057 20) LASTDATE > 9610 24)* weights = 47 1) RFA_2A == {F, G} 25) RFA_2F <= 1; criterion = 1, statistic = 117.069 26) PEPSTRFL == {X}; criterion = 1, statistic = 85.627 27) MINRAMNT <= 13; criterion = 0.999, statistic = 56.749 28)* weights = 8622 27) MINRAMNT > 13 29) RFA_2A == {F}; criterion = 0.987, statistic = 36.194 30)* weights = 87 29) RFA_2A == {G} 31)* weights = 274 26) PEPSTRFL == { } 32) CLUSTER2 <= 27; criterion = 1, statistic = 65.473 33)* weights = 12301 32) CLUSTER2 > 27 34) RFA_2A == {F}; criterion = 0.955, statistic = 27.535 35)* weights = 9557 34) RFA_2A == {G} 36)* weights = 3385 25) RFA_2F > 1 37) PEPSTRFL == {X}; criterion = 1, statistic = 95.078 38) LASTDATE <= 9609; criterion = 0.995, statistic = 62.087 39) GENDER == { , A}; criterion = 0.994, statistic = 54.878 40)* weights = 301 39) GENDER == {F, J, M, U} 41)* weights = 8208 38) LASTDATE > 9609 42) WALKER == {Y}; criterion = 0.969, statistic = 22.234 43)* weights = 58 42) WALKER == { } 44)* weights = 396 37) PEPSTRFL == { } 45) CARDGIFT <= 5; criterion = 1, statistic = 88.729 46) INCOME <= 3; criterion = 0.976, statistic = 84.344 47)* weights = 2770 46) INCOME > 3 48)* weights = 6668 45) CARDGIFT > 5 49)* weights = 543
将所有已建立的决策树保存为一个Rdata文件,并将决策树的图像保存在一个pdf 中。如果一棵决策树很大,其中的节点以及节点中的文本将会出现重叠。避免类似情况的一种方法是纸张的宽和高都相应调大,并使用pointsize将字体调小。此外,在绘制决策树时可以减少图像的文字。设置ip_args=list(pval=FALSE)来压缩P值,并设置ep_args=list(digists=0)缩减数值的长度。
> save(myCtree,file=paste(fileName,".Rdata",sep=""))
> pdf(paste(fileName,".pdf",sep=""),width=12,height=9,
+ paper="a4r",pointsize=6)
> plot(myCtree,type="simple",ip_args=list(pval=F),ep_args=list(digits=0),main=fileName)
> plot(myCtree,terminal_panel=node_barplot(myCtree),ip_args=list(pval=F),ep_args=list(digits=0),main=fileName)
> graphics.off()
然后使用测试数据对决策树模型进行测试。
> rm(trainData)
> myPrediction <-predict(myCtree,newdata=testData)
> # check predicted results
> testResult <-table(myPrediction,testData$TARGET_B)
> percentageOfOne <-round(100*testResult[,2]/(testResult[,1]+testResult[,2]),digits=1)
> print(testResult)
myPrediction 0 1 0.0303567702900063 4627 168 0.0382599580712788 2375 119 0.0382771164021164 2887 123 0.0418929402637704 303 11 0.0466228332337119 1202 55 0.0517171717171717 612 40 0.0538709677419355 1467 84 0.063 712 39 0.0666666666666667 142 7 0.0670955882352941 492 47 0.0716565129097766 803 59 0.0723014256619145 715 45 0.0746705710102489 151 7 0.0775269872423945 251 21 0.0782828282828283 112 8 0.0939875604699378 649 80 0.0950413223140496 71 6 0.0985507246376812 97 5 0.114043355325165 247 30 0.115384615384615 20 0 0.135135135135135 153 20 0.167844522968198 133 15 0.176795580110497 36 7 0.22 10 1
> boxplot(myPrediction~testData$TARGET_B,xlab = "TARGET_B",ylab="Prediction",ylim=c(0,0.25))
> sl <-sort(myPrediction,decreasing=TRUE,method="quick",index.return=TRUE)
> testSize <- nrow(testData)
> TotalNumOfTarget <-sum(testData$TARGET_B)
> NumOfTarget <-rep(0,testSize)
> NumOfTarget[1] <- (testData$TARGET_B)[sl$ix[1]]
> for(i in 2:testSize){
+ NumOfTarget[i] <-NumOfTarget[i-1]+testData$TARGET_B[sl$ix[i]]}
> plot(1:testSize,NumOfTarget,pty=".",type="l",lty="solid",col="red",ylab="Count Of Responses in Top k",xlab="Top k",main=fileName)
> grid(col="gray",lty="dotted")
> percentile <-100*(1:testSize)/testSize
> percentileTarget<-100*NumOfTarget/TotalNumOfTarget
> plot(percentile,percentileTarget,pty=".",type="l",lty="solid",col="red",ylab="Percentage of Predicted Donations(%)",xlab="Percentage of Pool",main=fileName)
> grid(col="grey",lty="dotted")
评分
使用一棵较大的决策树对大数据进行评分时,可能会出现内存溢出。为了减少内存消耗,将评分数据划分为多个子集,对每一个子集分别使用预测模型,然后再将所有的评分结果进行融合。
> memory.limit(4095)
> #read scoring data and training data
> cup98val <-read.csv("F://cup98VAL.txt")
> cup98 <-read.csv("F://cup98LRN.txt")
> library(party)
> treeFileName <-"cup98-ctree-80-20"
> splitNum <-10
评分之前,我们需要查看scoreData和trainData中分类变量的等级水平是否一致。如果不一致,需要根据scoreData中的因子水平对trainData中的因子水平进行设置。这对于predict()的执行十分关键。评分数据中分类变量的缺失值和新值都设置成NA(缺失值)。
> #check and set levels of categorical variables
> trainData <-cup98[,vars]
> vars2 <-setdiff(c(vars,"CONTROLN"),"TARGET_B")
> scoreData <-cup98val[,vars2]
> rm(cup98,cup98val)
> trainNames <-names(trainData)
> scoreNames <-names(scoreData)
> #cat("\n checking and setting variable values \n")
> newScoreData <-scoreData
> variableList <-intersect(trainNames,scoreNames)
下面代码一开始因为格式不正确报错,正确格式截图如下:
查看新数据之后,再加载预测模型并查看内存的使用情况。还要使用函数gc()将不再使用或者产生垃圾的对象删除。
> load(paste(treeFileName,".Rdata",sep=""))
> print(object.size(trainData),units="Mb") 8 Mb
> print(object.size(scoreData),units="Mb") 187 Mb
> print(object.size(newScoreData),units="Mb") 8.1 Mb
> print(object.size(myCtree),units="Mb") 45 Mb
> gc()
used (Mb) gc trigger (Mb) max used (Mb) Ncells 1700741 90.9 2637877 140.9 2554658 136.5 Vcells 146612276 1118.6 296119905 2259.3 295382927 2253.6
> memory.size() [1] 1268.69
> rm(trainNames,scoreNames)
> rm(variableList)
> rm(trainLevels,scoreLevels)
> rm(trainData,scoreData)
> gc()
used (Mb) gc trigger (Mb) max used (Mb) Ncells 1701286 90.9 2637877 140.9 2554658 136.5 Vcells 146618456 1118.7 296119905 2259.3 295382927 2253.6 > memory.size() [1] 1268.72
将评分数据划分为多个子集,并对每个子集建立一棵决策树以便降低内存消耗,评分结果的分布情况如图14-5所示。
> nScore <-dim(newScoreData)[1]
> (splitSize <-round(nScore/splitNum))
[1] 9637
> myPred <- NULL
> for (i in 1:splitNum) {
+ startPos <- 1+(i-1)*splitSize
+ if(i==splitNum) {
+ endPos <-nScore
+ }
+ else {
+ endPos <-i*splitSize
+ }
+ print(paste("Predicing:",startPos,"-",endPos))
+ #make prediction
+ tmpPred <- predict(myCtree,newdata=newScoreData[startPos:endPos,])
+ myPred <-c(myPred,tmpPred)
+ }
[1] "Predicing: 1 - 9637" [1] "Predicing: 9638 - 19274" [1] "Predicing: 19275 - 28911" [1] "Predicing: 28912 - 38548" [1] "Predicing: 38549 - 48185" [1] "Predicing: 48186 - 57822" [1] "Predicing: 57823 - 67459" [1] "Predicing: 67460 - 77096" [1] "Predicing: 77097 - 86733" [1] "Predicing: 86734 - 96367"
> #cumulative count and percentage
> length(myPred)
[1] 96367
> rankedLevels <-table(round(myPred,digits=4))
> # put highest rank first by reversing the vector
> rankedLevels<-rankedLevels[length(rankedLevels):1]
> levelNum<-length(rankedLevels)
> cumCnt<-rep(0,levelNum)
> cumCnt[1]<-rankedLevels[1]
> for (i in 2:levelNum) {
+ cumCnt[i]<-cumCnt[i-1]+rankedLevels[i]}
> cumPercent<-100*cumCnt/nScore
> cumPercent<-round(cumPercent,digits=1)
> percent<-100*rankedLevels/nScore
> percent<-round(percent,digits=1)
> cumRanking<-data.frame(rankedLevels,cumCnt,percent,cumPercent)
> names(cumRanking)<-c("Frequency","CumFrequency","Percentage","CumPercentage")
> print(cumRanking)
> write.csv(cumRanking,"cup98-cumulative-ranking.csv",row.names=T)
> pdf(paste("cup98-score-distribution.pdf",sep=""))
>plot(rankedLevels,x= names(rankedLevels),type="h",xlab="Score",ylab="#of Customers")
> sl <-sort(myPred,decreasing=TRUE,method="quick",index.return=TRUE)
> varToOutput <-c("CONTROLN")
> score<-round(myPred[sl$ix],digits=4)
> table(score,useNA="ifany")
score 0.0163 0.0285 0.031 0.0361 0.0414 0.0416 0.0486 0.0488 0.0534 0.0539 0.054 3624 10088 3178 18551 12758 5187 1230 5753 938 592 8383 0.0608 0.0703 0.0731 0.075 0.0884 0.125 0.1402 0.1636 0.1687 0.1833 0.1957 3856 6957 2082 6010 5618 138 745 51 213 161 183 0.2308 71
> result<-data.frame(cbind(newScoreData[sl$ix,varToOutput]),score)
> names(result) <-c(varToOutput,"score")
> write.csv(result,"cup98-predicted-score.csv",row.names=F)
##32位写入EXCEL
> library(RODBC)
> xlsFile <- odbcConnectExcel("cup98-predicted-score.xls")
> sqlSave(xlsFile,result,rownames=F)
> odbcCloseAll()
##64位(也可以安装XLconnect包)写入EXCEL(.csv)
> write.table(score,"cup98-predicted-score.csv",sep=",")
输出规则
> # Custom print method for terminal (leaf) nodes of a ctree: instead of
> # printing, it harvests each leaf's information into global accumulators
> # (note the <<- assignments): rule count, node id, number of records,
> # predicted score, and the accumulated rule text.
> print.TerminalNode<-function(x,rule=NULL,...){
+ n.rules<<-n.rules+1
+ node.ids<<-c(node.ids,x$nodeID)
+ n.records<<-c(n.records,sum(x$weights))
+ scores<<-c(scores,x$prediction)
+ ruleset<<-c(ruleset,rule)
+ }
> # Custom print method for splitting (internal) nodes: recursively walks the
> # tree, extending the rule string for the left and right branches.
> print.SplittingNode<-function(x,rule=NULL,...){
+ if(!is.null(rule)) {
+ rule<-paste(rule,"\n")
+ }
+ # print() on x$psplit dispatches to print.orderedSplit / print.nominalSplit,
+ # which RETURN the extended rule string rather than printing it
+ rule2<-print(x$psplit,left=TRUE,rule=rule)
+ print(x$left,rule=rule2)
+ rule3<-print(x$psplit,left=FALSE,rule=rule)
+ print(x$right,rule=rule3)
+ }
> # Custom print method for ordered (numeric) splits: returns the rule string
> # extended with "<= value" (left branch) or "> value" (right branch).
> # (Reformatted: the transcript had several statements merged onto one line
> # and missing "+" continuation prompts.)
> print.orderedSplit<-function(x,left=TRUE,rule=NULL,...){
+ # map the split point back to its level label when one is attached
+ if(!is.null(attr(x$splitpoint,"levels"))){
+ sp<-attr(x$splitpoint,"levels")[x$splitpoint]
+ }else{
+ sp<-x$splitpoint
+ }
+ # pad the variable name to align the comparison operators in the output
+ n.pad<-20-nchar(x$variableName)
+ pad<-paste(rep(" ",n.pad),collapse=" ")
+ # x$toleft may invert which branch counts as "left"
+ if(!is.null(x$toleft)) {
+ left<-as.logical(x$toleft)==left
+ }
+ if(left) {
+ rule2<-paste(rule,x$variableName,pad,"<=",sp,sep="")
+ } else{
+ rule2<-paste(rule,x$variableName,pad,">",sp,sep="")
+ }
+ rule2
+ }
> # Custom print method for nominal (factor) splits: returns the rule string
> # extended with the set of levels sent to the given branch.
> print.nominalSplit<-function(x,left=TRUE,rule=NULL,...){
+ levels<-attr(x$splitpoint,"levels")
+ ### tab > 0 keeps only levels actually present in this node's data
+ tab<-x$table
+ if(left) {
+ lev<-levels[as.logical(x$splitpoint)&(tab>0)]
+ }else{
+ lev<-levels[!as.logical(x$splitpoint)&(tab>0)]
+ }
+ # quote the levels: 'a','b','c'
+ txt<-paste("'",paste(lev,collapse="','"),"'",sep="")
+ # pad with spaces for aligned output; the original rep("",n.pad) collapsed
+ # to an empty string, producing no padding at all (inconsistent with
+ # print.orderedSplit above)
+ n.pad<-20-nchar(x$variableName)
+ pad<-paste(rep(" ",n.pad),collapse="")
+ rule2<-paste(rule,x$variableName,pad,txt,sep="")
+ rule2
+ }
调用print(myCtree@tree)抽取出规则的所有相关信息并保存到5个全局变量中。接着按照规则的评分对其进行排序,并输出规则及其所包含记录的百分比和累计百分比。cumsum()计算数值型向量的累积和。为了节省空间,这里只输出5个规则。
> load(paste(treeFileName,".Rdata",sep=""))
> # extract rules from treeFileName
> n.rules<-0
> node.ids<-NULL
> n.records<-NULL
> scores<-NULL
> ruleset<-NULL
> print(myCtree@tree)
> n.rules
[1] 50
按照评分的规则进行排序
> sl <-sort(scores,decreasing=T,method="quick",index.return=T)
> percentage <-100*n.records[sl$ix]/sum(myCtree@weights)
> cumPercentage<-round(cumsum(percentage),digits=1)
>percentage<-round(percentage,digits=1)
>load(paste(treeFileName,".Rdata",sep=""))
> #print all rules
> for (i in 1:n.rules) {
+ cat("Rule",i,"\n")
+ cat("Node:",node.ids[sl$ix[i]])
+ cat(", score:",scores[sl$ix[i]],sep="")
+ cat(", Percentage:",percentage[i],"%",sep="")
+ cat(", Cumulative Percentage:",cumPercentage[i],"%",sep="")
+ cat(ruleset[sl$ix[i]],"\n\n")}
由于篇幅限制 , 只展示其中前五个规则:
Rrule 1 Node: 38,Percentage:0.0680192%,Cumulative Percentage:0.1&Rrule 2 Node: 38,Percentage:0.0680192%,Cumulative Percentage:0.1&Rrule 3 Node: 32,Percentage:0.1805125%,Cumulative Percentage:0.3&Rrule 4 Node: 32,Percentage:0.1805125%,Cumulative Percentage:0.5&Rrule 5 Node: 42,Percentage:0.1569674%,Cumulative Percentage:0.7 ……
输出SAS规则的得分
下面为四个改进的输出函数:
> #functions for printing rules in SAS statement for scoring with a DATA step
> # based on "Print.R"from package party
> # SAS-scoring variant of the terminal-node printer: same global harvesting
> # as before, but strips leading spaces from the accumulated rule first.
> # (Reformatted: in the transcript the comment on the first body line had
> # swallowed the following statement, commenting out n.rules<<-n.rules+1.)
> print.TerminalNode <-function(x,rule=NULL,...){
+ rule<-sub(' +',"",rule) #remove leading spaces
+ n.rules<<-n.rules+1
+ node.ids<<-c(node.ids,x$nodeID)
+ n.records<<-c(n.records,sum(x$weights))
+ scores<<-c(scores,x$prediction)
+ ruleset<<-c(ruleset,rule)
+ }
> # SAS-scoring variant of the splitting-node printer: joins successive split
> # conditions with "and" so the collected rule forms a valid SAS condition.
> # (Reformatted: two statements were merged onto one transcript line.)
> print.SplittingNode<-function(x,rule=NULL,...){
+ if(!is.null(rule)) {
+ rule<-paste(rule,"\n and")
+ }#endif
+ rule2<-print(x$psplit,left=TRUE,rule=rule)
+ print(x$left,rule=rule2)
+ rule3<-print(x$psplit,left=FALSE,rule=rule)
+ print(x$right,rule=rule3)
+ }
> # SAS-scoring variant of the ordered-split printer (no column padding,
> # since the output is a SAS expression rather than an aligned listing).
> # (Reformatted: several statements were merged onto single transcript lines.)
> print.orderedSplit<-function(x,left=TRUE,rule=NULL,...){
+ # map the split point back to its level label when one is attached
+ if(!is.null(attr(x$splitpoint,"levels"))){
+ sp<-attr(x$splitpoint,"levels")[x$splitpoint]
+ }else{
+ sp<-x$splitpoint
+ }
+ # x$toleft may invert which branch counts as "left"
+ if(!is.null(x$toleft)) {
+ left<-as.logical(x$toleft)==left
+ }
+ if(left) {
+ rule2<-paste(rule,x$variableName,"<=",sp,sep="")
+ } else{
+ rule2<-paste(rule,x$variableName,">",sp,sep="")
+ }
+ rule2
+ }
> # SAS-scoring variant of the nominal-split printer: emits
> # "VAR in ('a','b',...)" for the levels routed to the given branch.
> # (Reformatted: comment/code and statements were merged on transcript lines.)
> print.nominalSplit<-function(x,left=TRUE,rule=NULL,...){
+ levels<-attr(x$splitpoint,"levels")
+ ### tab > 0 keeps only levels actually present in this node's data
+ tab<-x$table
+ if(left) {
+ lev<-levels[as.logical(x$splitpoint)&(tab>0)]
+ }else{
+ lev<-levels[!as.logical(x$splitpoint)&(tab>0)]
+ }
+ # quote the levels: 'a','b','c'
+ txt<-paste("'",paste(lev,collapse="','"),"'",sep="")
+ rule2<-paste(rule, " ",x$variableName,"in (",txt,")",sep="")
+ rule2
+ }
> library(party)#for tree
> load(paste(treeFileName,".Rdata",sep=""))
> n.rules<-0
> node.ids<-NULL
> n.records<-NULL
> scores<-NULL
> ruleset<-NULL
> print(myCtree@tree)
> n.rules
[1] 48
按照得分对其排名并输出。
篇幅有限,只给出前4个。
总结:
本章介绍了一个内存受限的环境下对大数据建立预测模型的案例。通过在样本数据上建立决策树,查找并收集有用的变量来建立最终的预测模型。此方法适用于内存受限的大数据建模。
另一种方法是对变量进行抽样,每一次进行变量抽样后都建立一个模型,建立了10个或者20个模型之后,从所有的这些变量中收集有用变量,并建立最终模型。该方法类似于随机森林,随机森林中的每一棵树都由变量的随机抽样子集构建而得到的。但是,与构建随机森林比,该方法需要更少的内存空间。