2022-04-11上

library(ISLR)

library(glmnet)

library(class) # contains knn()

library(MASS) # to have lda()

library(carData)

library(car)

library(pROC)

library(caret)

library(tree)

library(randomForest)

library(gbm)

library(xgboost)

library(e1071)

library(neuralnet)

library(nnet) # implements single layer NNs

library(mlbench) # includes dataset BostonHousing

library(neuralnet)

library(NeuralNetTools)

library(leaps) # contains regsubsets()

library(fdm2id)

library(survival)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

#### Exercises Section 1: Date pre-processing####

# --------------------------------------------------------

library(glmnet)

library(survival)

library(ISLR)

###############################################################

#### Exercise 1: effect of scaling ####

#scale:标准化"(x-miu)/sigma"

###############################################################

summary(iris)

#取前100个作为训练样本

dat = iris[1:100,]

#去掉没用的level

dat$Species = droplevels(dat$Species)

##由于只取前100行数据作为训练样本，virginica 这个species没用到，所以用droplevels把它去掉

##所以在数据前期处理时，如果新选择的训练样本中有没用到的level，应该用droplevels把它删掉

#分别取自变量x 因变量y(species)

x = dat[,1:4]

y = dat$Species

# 对数据做scale 标准化

## method 1 先scale 再自变量因变量分离

dats = dat

dats[,1:4] = apply(dats[,1:4],2,scale)

##method 2 先自变量因变量分离，再对自变量做标准化

## we can apply scaling to the x data directly:

xs = apply(x,2,scale)

# (1)

##PCA

##iris数据中有四个成分Sepal.Length Sepal.Width Petal.Length Petal.Width；方差最大的为PCA中的第一主成分

##（但是我们并不能看出来第一主成分是原数据中的哪个成分），方差第n大的是PCA中的第n主成分

## 用prcomp()做PCA，summary得到的第一行，是对应主成分的标准差；第二行是Var(PCi)/sum(Var(PCi)); 第三行是第二行的累计和

pca.unscaled = prcomp(x)

pca.scaled = prcomp(x,scale=TRUE)

pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled

summary(pca.unscaled)

summary(pca.scaled)

## Proportion of Variance越大，说明该成分包含的信息越多，如果该成分信息变化，那么总体受影响更大

## summary(pca.unscaled)里，Cumulative Proportion means：

## if you take the first component(PC1), you will capture the 91%, which means it contains the most information

## if you take the first two components, you will capture the 98%, the other two dimensions in the data sets are probably redundant.

## 如果你有前两个成分的数据，那你就抓住了98%的信息, 那么剩下两个成分就是多余的，不重要的

# plot the data on its first 2 dimensions in each space:

#biplot:主成分分析散点图（双标图）

#分析biplot：

#两环境线段之间的夹角的余弦值是它们的相关系数，夹角小于90度表示正相关，说明两环境对品种排序相似，大于90度表示负相关，表示两环境对品种排序相反，等于90度说明两环境不相关。夹角较小说明试验点是重复设置的，去掉一个不影响对品种的评价。

#环境线段的长度是试验点对品种的区分能力，线段越长，区分能力越强。

par(mfrow=c(1,2))

plot(x[,1:2], pch=20, col=y, main="Data in original space")

biplot(pca.unscaled, main="Data in PCA space")

abline(v=0, col='orange')

# see the binary separation in orange along PC1?

# re-use this into biplot in original space:

pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]

plot(x[,1:2], pch=20, col=pca.cols,

main="Data in original space\n colour-coded using PC1-split")

par(mfrow=c(2,2))

plot(pca.unscaled) # scree plot

biplot(pca.unscaled) # biplot

plot(pca.scaled) # scree plot

biplot(pca.scaled) # biplot

# now analyse this plot :)

# 从图中可以看出两点：

#1、柱状图中，PL和PW对应的柱子很长，说明他们两个包含的信息多（包含主要信息）

#2、在biplot图中，PL和PW的线方向一致，说明他们两个的相关性很大

# Petal.Length和Petal.Width capture the same kind of direction of information, Sepal.Length and Sepal.Width have smaller contribution than those two.

# (2)logistic regression

#dat——unscaled，dats——scaled

logreg.unscaled = glm(Species~., data=dat) # make this work

#family='binomial' means do logistic regression

logreg.unscaled = glm(Species~., data=dat, family='binomial')

logreg.scaled = glm(Species~., data=dats, family='binomial')

#family='binomial' means do logistic regression

# discuss...

# (...unscaled and scaled 's coefficients are the fits different?)

cbind(coef(logreg.unscaled), coef(logreg.scaled))

#the signs are the same(两个数据的正负符号是一致的)

#but the contributions of each of the variables has changed, sometimes relate to each other(但是系数和影响变化了,但是有时是有联系的)

#have a swap between SP and SL, probably because there is a lot of redundancy between them.(SP和SL的大小交换了，可能因为有太多冗杂的东西在里面)

cor(dats$Petal.Length,dats$Petal.Width)

#0.9793217 means a high correlation them, 相关性很强，SP和SL可以混同(confused),他们可以相互交换(interchangeable)

# (... does this align 一致 with the PCA analysis?)

# (yes, we see a change in the role of each variable in the information space，但是PCA里PL和PW是主成分PC1 PC2，他们在逻辑回归里的系数也是很大的两个

# (... and why this align 一致 with the PCA analysis?为什么PCA和逻辑回归会一致)

# (They are both linear techniques)

# (3)

#lasso要先把数据变成matrix

x.m = model.matrix(Species~.+0, data=dat)#unscaled

lasso.cv = cv.glmnet(x.m, y, family="binomial")#用cv.glmnet去找到后面lasso要用的lambda

lasso.unscaled = glmnet(x.m, y, family="binomial", lambda=lasso.cv$lambda.min)

lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")

xs.m = model.matrix(Species~.+0, data=dats)#scaled

lasso.cv = cv.glmnet(xs.m, y, family="binomial")

lasso.scaled = glmnet(xs.m, y, family="binomial", lambda=lasso.cv$lambda.min)

lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")

cbind(coef(lasso.unscaled), coef(lasso.scaled))

#由此可见，scaled和unscaled的intercept很不一样，这是因为我们在scaled里centred all the covariance,which means the data in scaled are all central zero,所以there is very little need for an intercept parameter

table(lasso.pred, lasso.s.pred)

###############################################################

#### Exercise 2: data imputation ####

###############################################################

par(mfrow=c(1,1))

summary(lung)

boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))

# can you think of other ways of analyzing this?

# (1) lung cancer data: compare meal.cal values between male and female cohorts,

# and discuss w.r.t. gender-specific data imputation

# NB: "missing at random" vs "missing not at random"??

?lung

View(lung)

dim(lung)#228

dim(na.omit(lung))#167

#数据中有NA，处理missing value

#167/228= 0.7324561，有73%的数据，剩下的是不全（丢失）的数据

#data imputation:fill out the missing value补全丢失数据

#replacing the missing values with the mean or media of the variable for the data set.

nas = is.na(lung$meal.cal) # track missing values 看看哪些值是NA

table(nas, lung$sex) #统计男女中各自NA的个数

imales = which(lung$sex==1)#将data set男性的数据位置找出来

m.all = mean(lung$meal.cal, na.rm=TRUE)#男女总体的mean

m.males = mean(lung$meal.cal[imales], na.rm=TRUE)#男性的mean

m.females = mean(lung$meal.cal[-imales], na.rm=TRUE)#女性的mean

t.test(lung$meal.cal[imales], lung$meal.cal[-imales])#预测男女包含missing value的mean是否相等

# p= 0.01989<0.05, 意味着alternative hypothesis: true difference in means is not equal to 0, 男女包含missing value的mean是不相等的，所以要用不同的imputation值去填补他们

# significant difference, hence must use different imputation

# values for each gender

# (2) Run Cox PHMs on original, overall imputed and gender-specific imputed

# datsets, using the cohort sample mean for data imputation. Compare and discuss

#用不同的数据去填充missing value，比较他们用COX模型拟合的差别

# model fitting output.

dat1 = dat2 = lung

# impute overall mean in dat1:

dat1$meal.cal[nas] = m.all #假如都用整体的mean来填充

# impute gender=sepcific mean in dat2:

dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males#用女的均值填充女

dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females#用男的均值填充男

#Fit Cox proportional hazard models: Cox比例风险回归模型

?coxph()

cox0 = coxph(Surv(time,status)~.,data=lung) #没有填充missing value

cox1 = coxph(Surv(time,status)~.,data=dat1) #直接填充不分男女

cox2 = coxph(Surv(time,status)~.,data=dat2) #分男女填充

summary(cox0)

summary(cox1)

summary(cox2)

cbind(coef(coxph(Surv(time,status)~.,data=lung)),

coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2)))

round(cbind(coef(coxph(Surv(time,status)~.,data=lung)),

coef(coxph(Surv(time,status)~.,data=dat1)),coef(coxph(Surv(time,status)~.,data=dat2))),3)

#由比较可见，没有填充和两种填充方法三者meal.cal coefficient的结果差不多

# - dat1 and dat2 yield increased sample size (from 167 to 209, both imputed

# datasets having 209 observations) dat1和dat2产生了增加的样本大小(从167到209，都有209个观测数据集)

# - overall coefficient effects comparable between the 2 sets

# - marginal differences in covariate effect and significance between lung and {dat1;dat2}

# - no substantial difference between dat1 and dat2 outputs

###############################################################

#### Exercise 3: data imputation ####

###############################################################

library(ISLR)

dat = Hitters

View(Hitters)

# (1) (Deletion)

?na.omit()

sdat = na.omit(dat)#返回删除NA后的数据

sx = model.matrix(Salary~.+0,data=sdat)

sy = sdat$Salary

cv.l = cv.glmnet(sx,sy)

slo = glmnet(sx,sy,lambda=cv.l$lambda.min)

# (2) Simple imputation (of Y) using overall mean

ina = which(is.na(dat$Salary))#把NA的位置找出来

dat$Salary[ina] = mean(dat$Salary[-ina])

x = model.matrix(Salary~.+0,data=dat)

y = dat$Salary

cv.l = cv.glmnet(x,y)

lo = glmnet(x,y,lambda=cv.l$lambda.min)

# (3)

#删除 OR 补充缺失值；fit lasso模型；再用拟合好的lasso模型做出估计的y^；比较真实值与用lasso模型估计出来的估计值的标准差

slop = predict(slo,newx=sx)#用fit好的lasso模型

lop = predict(lo,newx=x)

sqrt(mean((slop-sy)^2))

sqrt(mean((lop-y)^2))#标准差差不多

#画图比较用delete和Simple imputation两种方法拟合的模型做出来的每一个y^的差距，如果是45度直线，说明两种方法预测的y^是差不多的

plot(slop,lop[-ina])

abline(a=0,b=1)

abline(lm(lop[-ina]~slop), col='navy')

# What could we do instead of imputing the Y?

###############################################################

#### Exercise 4: resampling ####

###############################################################

#### Exercise 5: resampling (CV vs bootstrapping)####

###############################################################

# Implement this simple analysis and discuss - think about

# (sub)sample sizes!

# 跳过shuff过程

#n = nrow(trees)#都要先洗牌，重新随机排列一下原数据

#trees = trees[sample(1:n, n), ]

x = trees$Girth # sorted in increasing order...

y = trees$Height

plot(x, y, pch=20)

summary(lm(y~x))

N = nrow(trees)

# (1) 10-fold CV on original dataset

set.seed(4060)

K = 10

slp <- numeric(K)

itc <- numeric(K)

cc <- numeric(K)

folds = cut(1:N, K, labels=FALSE)

for(k in 1:K){

i = which(folds==k)

# train:

lmo = lm(y[-i]~x[-i])

slp[k] <- lmo$coefficient[2]

itc[k] <- lmo$coefficient[1]

cc[k] = summary(lmo)$coef[2,2]

# (NB: no testing here, so not the conventional use of CV)

}

mean(cc)#0.3443734

# (2) 10-fold CV on randomized dataset

# shuffle it

set.seed(1)

mix = sample(1:nrow(trees), replace=FALSE)

xr = trees$Girth[mix]

yr = trees$Height[mix]

set.seed(4060)

K = 10

ccr = numeric(K)

folds = cut(1:N, K, labels=FALSE)

for(k in 1:K){

i = which(folds==k)

lmo = lm(yr[-i]~xr[-i])

ccr[k] = summary(lmo)$coef[2,2]

}

mean(ccr)#0.3408507，shuff后的mean更小一些

mean(cc)#0.3443734

#(3)

sd(ccr)#0.01854404，shuff后的sd更小一些

sd(cc)#0.03990858

boxplot(cc,ccr)

t.test(cc,ccr)#p>0.05，拒绝备择假设alternative hypothesis，true difference in means is equal to 0

var.test(cc,ccr)#F-test (var.test()) p<0.05,接受备择假设alternative hypothesis，true ratio of variances is not equal to 1

?var.test

#p>0.05,accept H0, rejept H1（alternative hypothesis）

# (4) Bootstrapping (additional note)用BT来做，跟CV比较

set.seed(4060)

K = 100

cb = numeric(K)

for(i in 1:K){

# bootstrapping

ib = sample(1:N,N,replace=TRUE)

lmb = lm(y[ib]~x[ib])

cb[i] = summary(lmb)$coef[2,2]

}

mean(cb)#0.3163658，不如shuff后的CV结果好

sd(cb)#0.0505108,不如shuff后的CV结果好

dev.new()

par(font=2, font.axis=2, font.lab=2)

boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))

t.test(cc,cb)

round(cc, 3)

#(5)

#we have 36.8% points out-of-BT, so using 368 points as test set, 632 points as train set

#(6)

N = 1000

x = runif(N, 2, 20)

y = 2 + 5*x + rnorm(N)

R=33

K1=3

pred.err.cv=numeric(R*K1)

###CV

for(j in 1:R){

mix=sample(1:N,replace = F)

xr=x[mix]

yr=y[mix]

folds=cut(1:N,K1,labels = F)

data53=as.data.frame(cbind(xr,yr))

mse = pred.err = numeric(K1)

for(k in 1:K1){

i.train = which(folds!=k)

o = lm(yr~xr, data=data53, subset=i.train)

i.test = which(folds==k)

yp = predict(o, newdata=data53[i.test,])

pred.err[k] = mean((yp-yr[i.test])^2)

}

pred.err.cv[c(K1*j-K1+1):c(K1*j)]=pred.err

}

mean(pred.err.cv)

###Bootstrapping command+shift+c 全选加注释

# K2 = R*K1

# pred.err.BT = numeric(K2)

# for(j in 1:K2){

# mix=sample(1:N,replace = T)

# xr=x[mix]

# yr=y[mix]

# data53=as.data.frame(cbind(xr,yr))

# o = lm(yr~xr, data=data53)

# yh = o$fitted.values

# pred.err.BT[j] = mean((yh-yr)^2)

# }

# mean(pred.err.BT)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 2: Regularization ####

# --------------------------------------------------------

###############################################################

#### Exercise 1: tuning LASSO ####

###############################################################

# Have a go at this exercise yourself...

# you can refer to ST4060 material:)

library(ISLR)

library(glmnet)

Hitters = na.omit(Hitters)

x = model.matrix(Salary~., Hitters)[,-1]

y = Hitters$Salary

cv.rd <- cv.glmnet(x,y)

ridge_mod = glmnet(x, y, alpha = 0, lambda = cv.rd$lambda.min)

pred.rd <- predict(ridge_mod, newx=x)

length(pred.rd)

length(y)

sqrt(mean((pred.rd - y)^2))

summary(ridge_mod)

coef(ridge_mod)

ridge_mod$lambda

the_grid = 10^seq(10, -2, length = 100)

n <- length(the_grid)

criterion = numeric(length(the_grid))

for(i in 1:n){

fit = glmnet(x, y, alpha = 0, lambda = the_grid[i])

pred.fit <- predict(fit, newx=x)

criterion[i] = sqrt(mean((pred.fit - y)^2))

}

which.min(criterion)

the_grid[which.min(criterion)]

###############################################################

#### Exercise 2: tuning LASSO + validation split ####

###############################################################

# Have a go at this exercise yourself too...

# you can refer to ST4060 material:)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 3: Classification Exercises ####

# --------------------------------------------------------

# install.packages("class")

# install.packages("MASS")

# install.packages("car")

# install.packages("ISLR")

# install.packages("pROC")

# install.packages("carData")

library(class) # contains knn()

library(MASS) # to have lda()

library(carData)

library(car)

library(ISLR) # contains the datasets

library(pROC)

###############################################################

#### Exercise 1: kNN on iris data ####

###############################################################

set.seed(1)

# shuffle dataset first:

z = iris[sample(1:nrow(iris)),]

# Here we focus on sepal information only

plot(z[,1:2], col=c(1,2,4)[z[,5]],

pch=20, cex=2)

x = z[,1:2] # sepal variables only

y = z$Species

# Here's how to use the knn() classifier:

K = 1

n = nrow(x)

# split the data into train+test:

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

ko = knn(x.train, x.test, y.train, K)

tb = table(ko, y.test)

1 - sum(diag(tb)) / sum(tb) # overall classification error rate, error rate越小越好

library(caret)

confusionMatrix(data=ko, reference=y.test)#显示the error matrix

#Here， Accuracy : 0.7 which is sum(diag(tb)) / sum(tb); 95%Confidence interval around its accuracy matrix；还能比较3种class的Sensitivity 和 Specificity

?confusionMatrix#创建一个给定特定边界的混淆矩阵

# Build a loop around that to find best k:

# (NB: assess effect of various k-values

# on the same data-split)

Kmax = 30

acc = numeric(Kmax)#acc越大越好

for(k in 1:Kmax){

ko = knn(x.train, x.test, y.train, k)

tb = table(ko, y.test)

acc[k] = sum(diag(tb)) / sum(tb)

}

plot(1-acc, pch=20, t='b', xlab='k')#从图中找到最低点，就是最好的k

#评价：if k is too small(overfitting), decision boundry is overly flexible, low bias+large variance

# if k is too large(underfitting), decision boundry is not flexible enough, high bias+small variance

###############################################################

#### Exercise 2: GLM(logistic regression) on 2-class iris data ####

###############################################################

n = nrow(iris)

is = sample(1:n, size=n, replace=FALSE)

dat = iris[is,-c(3,4)] # shuffled version of the original set, and hold sepal variables only

# record into 2-class problem:转化成2-class problem “是virginica”=1 OR “不是virginica ”=0

dat$is.virginica = as.numeric(dat$Species=="virginica")

dat$Species = NULL # "remove" this component

names(dat)

is = 1:100 # training set

fit = glm(is.virginica~., data=dat, subset=is, family=binomial(logit))

pred = predict(fit, newdata=dat[-is,], type="response")#返回值是estimated probabilities

hist(pred)

?predict.glm

y.test = dat$is.virginica[-is] # true test set classes

boxplot(pred~y.test, names=c("other","virginica"))#通过箱线图可以看出“virginica”和“not virginica”通过glm分开的比较彻底

abline(h=0.5, col=3)

#roughly 70% of data would be classified correctly.

# for varying cut-off (ie threshold) values, compute corresponding

# predicted labels, and corresponding confusion matrix:

#find the best cutoff

err = NULL

for(cut.off in seq(.1, .9, by=.1)){

pred.y = as.numeric(pred>cut.off)

tb = table(pred.y, y.test)

err = c(err, (1-sum(diag(tb))/sum(tb)))

}

plot(seq(.1, .9, by=.1), err, t='b')

#choose the best cutoff to classification.

cut.off=0.2

pred.y = as.numeric(pred>cut.off)

tb = table(pred.y, y.test)

err = (1-sum(diag(tb))/sum(tb))

###############################################################

#### Exercise 3: LDA assumptions ####

###############################################################

## (1) 2-class classification problem

#多类转化成2类的三种方法

#（1）转换成0 1

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

#（2）转换成带命名的（更直观）

#用0 1 区分是virginica还是非virginica并不直观，不如用virginica or other 来区分，如下：

# to recode cleanly, you could use for instance:

dat$Species = car::recode(dat$Species, "0='other'; 1='virginica'")

#（3）先转化成0 1，再命名

# or:直接让levels变成virginica or other

#dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))#先变成0 1

#levels(dat$Species) = c("other","virginica")#再对0 1 命名

##(2)粗略查看分类情况

par(mfrow=c(1,2))

plot(iris[,1:2], pch=20, col=c(1,2,4)[iris$Species], cex=2)

legend("topright",col=c(1,2,4),

legend=levels(iris$Species),

pch=20, bty='n')#3-class

plot(dat[,1:2], pch=20, col=c(1,4)[dat$Species], cex=2)

legend("topright",col=c(1,4),

legend=levels(dat$Species),

pch=20, bty='n')#2-class：virginica or not virginica

#由于LDA需要满足两个assumption：normal-distribution；equal variance，否则将不能用LDA方法分类

#First assumption: all histograms are looking like normal distribution

#Second assumption: 每个predictor的方差相同

##First，we need to explore distribution of each predictors(每一列指标）:(看是否满足正态分布)

head(dat)

#here, the first four columns are the predictors(or features/attributes/covariates/variables), the last column is class

# 1.boxplots seem relatively symmetric画箱线图看他是否对称:

#look at each column of the dataset:

par(mfrow=c(2,2))

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

ylab='predictor',

col=c('cyan','pink'),

main=names(dat)[j])

}

#here you can see there is a clear separation between the two classes, except the sepal.width

#so we decide to use the three other predictors to make a classification

#由于正态分布是对称的，如果箱线图的箱子是对称(symmetric)的，那么很可能是满足正态分布的，如果箱子不对称，那么会有skew

#看correlation

cor(dat[,1:4])

#here we can see there is 96%(超过90%就可以这么说) correlation between Petal.Width and Petal.Length, which means these two predictors can tell us the same information

#2.只看箱线图是不够的，还要看是否是histogram正态分布 but we'd rather check for Normality more specifically:

par(mfrow=c(2,4), font.lab=2, cex=1.2)

for(j in 1:4){

hist(dat[which(dat$Species=='other'),j], col='cyan',

xlab='predictor for class other',

main=names(dat)[j])

hist(dat[which(dat$Species!='other'),j], col='pink',

xlab='predictor for class virginica',

main=names(dat)[j])

}

#PL and PW for the other class are definitely not normal,其他的图都是approximately normal

#It means there may be a limitation of our analysis if we decide to use LDA here.

#这里其实就不适合用LDA了，但是为了了解这个方法，我们继续往下做

#3.除了用柱状图判断是否normal，也可以用QQ图 could also use QQ-plots:

par(mfrow=c(2,4), cex=1.2)

for(j in 1:4){

x.other = dat[which(dat$Species=='other'),j]

qqnorm(x.other, pch=20, col='cyan',

main=names(dat)[j])

abline(a=mean(x.other), b=sd(x.other))

x.virginica = dat[which(dat$Species!='other'),j]

qqnorm(x.virginica, pch=20, col='pink',

main=names(dat)[j])

abline(a=mean(x.virginica), b=sd(x.virginica))

}

# So what do you think?

#在45度线上的是normal distribution，很明显PL和PW在other上不是

## Check initial assumption of equality of variances:all predictors have same variance for each class

# Bartlett's test with H0: all variances are equal, p>0.05 接受H0，有相同的variance

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

#只有SL满足

# Shapiro's test with H0: the distribution is Normal

for(j in 1:4){

print( shapiro.test(dat[which(dat$Species=='virginica'),j])$p.value )

}#p都>0.05，Normal

for(j in 1:4){

print( shapiro.test(dat[which(dat$Species=='other'),j])$p.value )

}#PL和PW的p value 很小，不normal

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

# can we track some of the values in that summary?

table(dat$Species)/nrow(dat)

#Here tells us there are one third data belongs to virginica, and two thirds belongs to other class.

#It means we don't have balance in how each class represented in the data set.

rbind(

apply(dat[which(dat$Species=='other'),1:4], 2, mean),

apply(dat[which(dat$Species=='virginica'),1:4], 2, mean)

)#按列求每一列的均值

# what about the coefficients of linear discriminates?那么线性微分的系数呢?

x = as.matrix(dat[,1:4])

proj = x %*% lda.o$scaling #取每一列的数据，乘lda.o$scaling

plot(proj, pch=20, col=dat$Species, cex=2)

#we can see, the projections increase as we move from one class to another

# little hack to recover the fitted values quickly

predo = predict(lda.o, newdata=dat)

names(predo)

predo$class#显示每一个测试样本的分类

predo$posterior#有0.9990663400的概率第一个测试样本属于other class，有0.0009336600的概率它属于virginica

y = predo$x

??predo

plot(proj, y)#由此我们能看出，the scores used by LDA in order to generate the predictions and probabilities are the result of proj by using lda.o$scaling directly.（y=proj）

plot(y, predo$posterior[,2])

boxplot(y ~ (predo$posterior[,2]>.5))#第二列probabilities > 0.5 是virginica（TRUE）

boxplot(proj ~ (predo$posterior[,2]>.5))#两种画出来的图是一样的，因为y和proj相等

# NB: The way these coefs is calculated follows the MANOVA approach

# popular hack to recover the fitted values:

fitted.values = predict(lda.o, newdata=dat)$class

boxplot(y~dat$Species)

boxplot(proj~dat$Species)

(tb.2 = table(fitted.values, dat$Species))

sum(diag(tb.2)) / sum(tb.2)#accuracy

#LDA will be more efficient than LR when class are well separated.

## (2) 3-class classification problem

dat = iris

## Explore distribution of predictors:

# boxplots seem relatively symmetric:

par(mfrow=c(2,2))

# here's just a loop to save having to write 4 boxplot

# instructions with names by hand (being lazy often

# makes for nicer code):

for(j in 1:4){

boxplot(dat[,j]~dat$Species,

xlab = 'Species',

ylab = 'predictor',

col=c('cyan','pink'),

main=names(dat)[j])

}

# but we'd rather check for Normality more specifically:

Ls = levels(dat$Species)

par(mfcol=c(3,4))

for(j in 1:4){

hist(dat[which(dat$Species==Ls[1]),j], col='cyan',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[2]),j], col='pink',

main=names(dat)[j])

hist(dat[which(dat$Species==Ls[3]),j], col='green',

main=names(dat)[j])

}

# could also use QQ-plots:

par(mfcol=c(3,4))

for(j in 1:4){

x1 = dat[which(dat$Species==Ls[1]),j]

qqnorm(x1, pch=20, col='cyan', main=names(dat)[j])

abline(a=mean(x1), b=sd(x1))

x2 = dat[which(dat$Species==Ls[2]),j]

qqnorm(x2, pch=20, col='pink', main=names(dat)[j])

abline(a=mean(x2), b=sd(x2))

x3 = dat[which(dat$Species==Ls[3]),j]

qqnorm(x3, pch=20, col='green', main=names(dat)[j])

abline(a=mean(x3), b=sd(x3))

}

# So what do you think now?

## Check initial assumption of equality of variances:

# Bartlett's test with H0: all variances are equal

print( bartlett.test(dat[,1]~dat$Species)$p.value )

print( bartlett.test(dat[,2]~dat$Species)$p.value )

print( bartlett.test(dat[,3]~dat$Species)$p.value )

print( bartlett.test(dat[,4]~dat$Species)$p.value )

## or if in lazy mode:

for(j in 1:4){

print( bartlett.test(dat[,j]~dat$Species)$p.value )

}

## Fit LDA model to this dataset and check accuracy:

lda.o = lda(Species~., data=dat)

(lda.o)

ftted.values = predict(lda.o, newdata=dat)$class

(tb.3 = table(ftted.values, dat$Species))

sum(diag(tb.3)) / sum(tb.3)

###############################################################

### Exercise 4: LDA

###############################################################

## (1) 2-class classification problem

dat = iris

dat$Species = as.factor(ifelse(iris$Species=="virginica",1,0))

levels(dat$Species) = c("other","virginica")

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),] # shuffle dataset

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

## (2) 3-class classification problem

dat = iris

n = nrow(dat)

set.seed(4061)

dat = dat[sample(1:n),]

i.train = 1:100

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

# LDA:

lda.o = lda(Species~., data=dat.train)

lda.p = predict(lda.o, newdata=dat.test)

names(lda.p)

(tb = table(lda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

# QDA:

qda.o = qda(Species~., data=dat.train)

qda.p = predict(qda.o, newdata=dat.test)

(tb = table(qda.p$class, dat.test$Species))

sum(diag(tb))/sum(tb)

###############################################################

### Exercise 5: benchmarking

###############################################################

## (1) benchmarking on unscaled data

#ROC图像横纵坐标都越接近1越好， AOC是线与横轴包围的面积，也是越大越好

#sensitivity:纵坐标

#specificity:横坐标

#confusionMatrix：混淆矩阵

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV) (HOLD-OUT)

# tuning of the classifiers:

K.knn = 3

# perform K-fold CV:

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

glm.o = glm(default~., data=dat.train, family=binomial(logit))

lda.o = lda(default~., data=dat.train)

qda.o = qda(default~., data=dat.train)

# test classifiers:

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

# ROC/AUC analysis:AUC值越大越好

# WARNING: THIS IS NOT Pr(Y=1 | X), BUT Pr(Y = Y_hat | X):

knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

boxplot(auc.knn,auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("KNN","GLM","LDA","QDA"))

##### Taking a closer look at performance

roc(y.test, glm.p)$auc#真实值预测值

plot(roc(y.test, glm.p))

#取threshold为0.5，用caret建立混淆矩阵，计算精确度

library(caret)

(tb = table(y.test, glm.p>.5))

pred = as.factor(glm.p>.5)

pred = car::recode(pred, "FALSE='No'; TRUE='Yes'")

caret::confusionMatrix(y.test, pred)

sum(diag(tb))/sum(tb)

(683+3)/(683+3+14)#=98%

#看总体数据的真实情况，No占96.67%，与98%相差不大，说明预测有效

table(Default$default)

table(Default$default)/nrow(Default)

##### Further exercises for you to do:

## adapt code to evaluate sensitivity and specificity

## add validation analysis...

## repeat on scaled data... 会对KNN有影响

###############################################################

### Exercise 6: benchmarking, again

###############################################################

## (1) benchmarking on unscaled data

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

# get a random training sample containing 70% of original sample:

i.cv = sample(1:n, round(.7*n), replace=FALSE)

x = dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# Recover ROC curve manually from whole set:

n = nrow(x)

acc = numeric(length(thrs))

sens = spec = numeric(length(thrs))

thrs = seq(.05,.95,by=.05)

for(ithr in 1:length(thrs)){

thr = thrs[ithr]

glmo = glm(default~., data=x,

family=binomial)

tb = table(glmo$fitted.values>thr, x$default)

acc[ithr] = sum(diag(tb))/sum(tb)

# calculate sensitivity for a given threshold

sens[ithr] = tb[2,2]/sum(tb[,2])

# calculate specificity for a given threshold

spec[ithr] = tb[1,1]/sum(tb[,1])

# prediction

}

plot(1-spec,sens,xlim=c(0,1),ylim=c(0,1),t='b')

abline(h=c(0,1),v=c(0,1),col=8)

abline(a=0,b=1,col=8)

plot(acc)

plot(spec, sens)#能看出来spec增加，sens会下降

confusionMatrix(y.test,pred)

# Evaluate a cross-validated ROC curve manually:

# 手工评估交叉验证的ROC曲线:

n = nrow(x)

K = 10

train.acc = test.acc = matrix(NA, nrow=K, ncol=length(thrs))

folds = cut(1:n, K, labels=FALSE)

k = 1

thrs = seq(.05,.95,by=.05)

for(ithr in 1:length(thrs)){

thr = thrs[ithr]

for(k in 1:K){

itrain = which(folds!=k)

glmo = glm(default~., data=x,

family=binomial,

subset=itrain)

tb = table(glmo$fitted.values>thr, x$default[itrain])

train.acc[k, ithr] = sum(diag(tb))/sum(tb)

# calculate sensitivity for a given threshold

sens[ithr] = tb[2,2]/sum(tb[,2])

# calculate specificity for a given threshold

spec[ithr] = tb[1,1]/sum(tb[,1])

# prediction

p.test = predict(glmo, x[-itrain,], type='response')

tb = table(p.test>thr, x$default[-itrain])

test.acc[k,ithr] = sum(diag(tb))/sum(tb)

}

boxplot(test.acc)

mean(train.acc)

mean(test.acc)

####eric上课讲的2018CA1整体代码####

# install.packages("mlbench")

library(mlbench)

data(Sonar)

N = nrow(Sonar)

P = ncol(Sonar)-1

M = 150

set.seed(1)

mdata = Sonar[sample(1:N),]#shuffle

itrain = sample(1:N,M)#150/208作为训练样本

x = mdata[, -ncol (mdata)]

y = mdata$Class

xm = as.matrix(x)

#1)

N-M

#2)

lasso.opt = cv.glmnet(xm[itrain,],

y[itrain],

alpha=1,

family='binomial')

lasso.opt$lambda.min

#0.01103539

#3)

lasso.mod = glmnet(xm[itrain,],

y[itrain],

alpha=1,

family='binomial',

lambda=lasso.opt$lambda.min)

coef(lasso.mod)

#4)

tree.mod = tree(y~., data=x, subset=itrain)

summary(tree.mod)

sort(summary(tree.mod) $used)

#11 variables

#5)

rf.mod = randomForest(y~., data=x, subset=itrain)

varImpPlot(rf.mod)

#6)

tree.pred = predict(tree.mod, x[-itrain,], 'class')

rf.pred = predict(rf.mod, x[-itrain,], 'class')

(tb.tree = table(tree.pred,y[-itrain]))

(tb.rf = table(rf.pred,y[-itrain]))

1-sum(diag(tb.tree))/sum(tb.tree)

1-sum(diag(tb.rf))/sum(tb.rf)

#The classification error rate is smaller, means the model is more accuracy.

#7)

tree.p = predict(tree.mod, x[-itrain,], 'vector')[,2]

rf.p = predict(rf.mod, x[-itrain,], 'prob')[,2]

auc.tree = roc(response=y[-itrain], predictor=tree.p)$auc

auc.rf = roc(response=y[-itrain], predictor=rf.p)$auc

c(auc.tree, auc.rf)

#The auc is greater，means this model has more sensitive and specificity.

#The classification error rate is smaller, means the model is more accuracy.

#Since both error rate and auc improved, we can say that RF model is better rather than a tree for classification.

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 4: Tree-based methods 用树来分类####

# --------------------------------------------------------

###############################################################

#### Exercise 1: growing and pruning a tree ####

###############################################################

#decision tree：从所有变量中挑出一个最重要的来作为根结点，后面的每一阶都是从所有变量里选最重要的一个作为下一个节点，这会使得有重复的变量被选中，画出的树特别大，造成overfitting

#因此，我们需要对做出的tree做删减pruning，目的是找到一个最理想的tree size分级大小，也就是方差dev最小的那个

install.packages("tree")

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

# Create a data frame that includes both the predictors and response

# (a data frame is like an Excel spreadsheet, if you like)

CS = data.frame(Carseats, High)

CS$Sales = NULL#把sale去掉

CS$High = as.factor(CS$High) # <-- this bit was missing 必须有不然后面会warning

# Fit the tree using all predictors (except for variable Sales,

# which we have "recoded" into a cateorical response variable)

# to response variable High

tree.out = tree(High~., CS)

summary(tree.out)

# plot the tree

plot(tree.out)

text(tree.out, pretty=0)

#The tree is fully grown, there is lots of little branches, maybe have a very detailed breakdown, means there is an overfitting of the dataset

# pruning:修剪

?cv.tree

set.seed(3)

cv.CS = cv.tree(tree.out, FUN=prune.misclass)#添加prune.misclass函数以找到合适的size

names(cv.CS)

# - size:

# number of terminal nodes in each tree in the cost-complexity pruning sequence.

#two method to control the depth of the tree:

# (1)- deviance:

# total deviance of each tree in the cost-complexity pruning sequence.

# (2)- k:

# the value of the cost-complexity pruning parameter of each tree in the sequence.

cv.CS

par(mfrow=c(1,2))

plot(cv.CS$size,cv.CS$dev,t='b')#find which size has the smallest deviation 93 (偏差：观测值-真实值)

min(cv.CS$dev)

cv.CS$size[which.min(cv.CS$dev)]

abline(v=cv.CS$size[which.min(cv.CS$dev)])#marking the 最合适size的location

plot(cv.CS$k,cv.CS$dev,t='b')#还可以找到最小的dev对应的最合适的k

# use pruning:

# - use which.min(cv.CS$dev) to get the location of the optimum

# - retrieve the corresponding tree size

# - pass this information on to pruning function

opt.size = cv.CS$size[which.min(cv.CS$dev)] #Find the optimal size

# see:

plot(cv.CS$size,cv.CS$dev,t='b')

abline(v=cv.CS$size[which.min(cv.CS$dev)])

ptree = prune.misclass(tree.out, best=opt.size) #using the optimal size to pruning the tree

ptree

summary(ptree)

par(mfrow=c(1,2))

plot(tree.out)#initially

text(tree.out, pretty=0)

plot(ptree)#pruned

text(ptree, pretty=0)

#Compare with the initial treem, there is fewer branches in prune tree.

###############################################################

#### Exercise 2: apply CV and ROC analysis ####

###############################################################

# Train/test:

set.seed(2)

n = nrow(CS)

itrain = sample(1:n, 200)#一共有400组数据，随机选取其中的200个作为训练向本

CS.test = CS[-itrain,]

High.test = High[-itrain]

# argument 'subset' makes it easy to handle training/test splits:

tree.out = tree(High~., CS, subset=itrain)#总体的树，没有修剪过

summary(tree.out)

plot(tree.out)

text(tree.out, pretty=0)

# prediction from full tree:

tree.pred = predict(tree.out, CS.test, type="class")#总体的树的预测值

(tb1 = table(tree.pred,High.test))#总体的树对High的混淆矩阵

# prediction from pruned tree:

ptree.pred = predict(ptree, CS.test, type="class")#修剪过的树的预测值

(tb2 = table(ptree.pred,High.test)) # 修剪过的树的confusion matrix

sum(diag(tb1))/sum(tb1)#总体的树的准确度#classification rate

sum(diag(tb2))/sum(tb2)#修剪过的树的准确度

#预测错的值老师叫做misclassified obsevations

#预测错的概率叫misclassification rate

#准确度高的老师叫做is better suited to predict unseen data

# perform ROC analysis

library(pROC)

# here we specify 'type="vector"' to retrieve continuous scores

# as opposed to predicted labels, so that we can apply varying

# threshold values to these scores to build the ROC curve:

#在ROC中，图像越靠近左上角越好

ptree.probs = predict(ptree, CS.test, type="vector")#在画ROC的时候，必须用type="vector"

roc.p = roc(response=(High.test), predictor=ptree.probs[,1])

plot(roc.p)

#AUC

#AUC是曲线下面的面积，越大越好

#下面是两种方法，都可以

#取值范围在0.5和1之间

#越接近1.0，检测方法真实性越高;

#等于0.5时，则真实性最低，无应用价值

#AUC < 0.5，比随机猜测还差

auc(roc.p)

roc.p$auc

###############################################################

#### Exercise 3: find the tree ####

###############################################################

#Bagging

#由于decision tree有许多缺点，比如重复选择，太庞大，overfitting，用一个更为优化的分类方法叫做Bagging

#用Bootstrap，每次BT里选一个新的variable做节点往下分，high variance， low bias，用于分类和回归都可以

#但是每一层选出来的variable会与前面的有很强的相关性

# ... can you find it?

###############################################################

#### Exercise 4: grow a random forest ####

###############################################################

#更好的一种方法，each learner:可以针对某一种特性来分类random subset is considering feature Q<P，每个tree是基于一种特性分类的，许多个特性就有许多个tree，这组成了一个森林random forest

#不容易overfit，不敏感

library(tree)

library(ISLR)

#install.packages("randomForest")

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

P = ncol(CS)-1 # number of features(把High去掉)

# grow a single (unpruned) tree

tree.out = tree(High~., CS)

# fitted values for "training set"按照decision tree直接全部预测

tree.yhat = predict(tree.out, CS, type="class")

# grow a forest:建立一个森林

rf.out = randomForest(High~., CS)

# fitted values for "training set"按照randomForest直接全部预测

rf.yhat = predict(rf.out, CS, type="class")

#If you want to perform bagging instead of random forest

# compare to bagging: using all features,not include y 这种方法叫bagging(用了全部的variables)；rf可以随便选用mtry，自定义用多少feature

bag.out = randomForest(High~., CS, mtry=P)#添加mtry= P = ncol(CS)-1 # number of features(把High去掉)

# fitted values for "training set"

bag.yhat = predict(bag.out, CS, type="class")#按照bag直接全部预测

# confusion matrix for tree:

(tb.tree = table(tree.yhat, High))#按照decision tree直接全部预测的混淆矩阵

# confusion matrix for RF

(tb.rf = table(rf.yhat, High))#按照randomForest直接全部预测的混淆矩阵，可以看出分类全部正确，perfect

# confusion matrix for bagging

(tb.bag = table(bag.yhat, High))#按照bag直接全部预测的混淆矩阵

# Note this is different to the confusion of RF

# matrix for the OOB observations:OOB对建树时未使用的数据（bootstrap没用到的数据）进行误差估计

(tb.rf2 = rf.out$confusion)#rf.out是真实的数据直接放到RF里，不做预测，说明真实情况下按照用RF方法应该有部分数据是错误分类的

sum(diag(tb.tree))/sum(tb.tree)#decision tree方法精确度91%

sum(diag(tb.rf))/sum(tb.rf)#RF方法精确度1（完全正确）但不是什么数据都是1，这里是巧了

sum(diag(tb.bag))/sum(tb.bag)#accurate=1

sum(diag(tb.rf2))/sum(tb.rf2)

#原本bag做出来的精确度应该比rf低，但现在它比rf的精确度高了，1>0.8141762, 说明bag出现了over fit

# train-test split

set.seed(6041)

N = nrow(CS)

itrain = sample(1:N, 200)

CS.train = CS[itrain,]

CS.test = CS[-itrain,]

tree.out = tree(High~., CS.train)

# fitted values for "train set"

tree.yhat = predict(tree.out, CS.train, type="class")

# fitted values for "test set"

tree.pred = predict(tree.out, CS.test, type="class")

rf.out = randomForest(High~., CS.train)

# fitted values for "training set"

rf.yhat = predict(rf.out, CS.train, type="class")

# fitted values for "test set"

rf.pred = predict(rf.out, CS.test, type="class")

bag.out = randomForest(High~., CS.train, mtry=(ncol(CS)-2))

# fitted values for "training set"

bag.yhat = predict(bag.out, CS.train, type="class")

# fitted values for "test set"

bag.pred = predict(bag.out, CS.test, type="class")

# confusion matrix for tree (test data):

(tb.tree = table(tree.pred, CS.test$High))

# confusion matrix for RF (test data):

(tb.rf = table(rf.pred, CS.test$High))#better performance of rf

# confusion matrix for Bagging (test data):

(tb.bag = table(bag.pred, CS.test$High))#bagging is not as good as rf

sum(diag(tb.tree))/sum(tb.tree)

sum(diag(tb.rf))/sum(tb.rf)

sum(diag(tb.bag))/sum(tb.bag)

###############################################################

#### Exercise 5: benchmarking ####

###############################################################

###### Exercise 5: benchmarking (this exercise is left as homework)######

# bring in that code from Section 2 (below) and add to it:

library(class) # contains knn()

library(ISLR) # contains the datasets

library(pROC)

library(tree)

library(randomForest)

#### (1) benchmarking on unscaled data对未缩放数据进行基准测试####

#对一类测试对象的某项性能指标进行定量的和可对比的测试

set.seed(4061)

n = nrow(Default)

dat = Default[sample(1:n, n, replace=FALSE), ]

i.cv = sample(1:n, round(.7*n), replace=FALSE)

dat.cv = dat[i.cv,] # use this for CV (train+test)

dat.valid = dat[-i.cv,] # save this for later (after CV)

# tuning of the classifiers:

K.knn = 3

K = 10

N = length(i.cv)

folds = cut(1:N, K, labels=FALSE)

acc.knn = acc.glm = acc.lda = acc.qda = numeric(K)#prediction accuracies

auc.knn = auc.glm = auc.lda = auc.qda = numeric(K)#AUC

acc.rf = auc.rf = numeric(K)

for(k in 1:K){ # 10-fold CV loop

# split into train and test samples:

#首先要划分训练样本和测试样本

i.train = which(folds!=k)

dat.train = dat.cv[i.train, ]

dat.test = dat.cv[-i.train, ]

#作出KNN拟合

# adapt these sets for kNN:

x.train = dat.train[,-1]

y.train = dat.train[,1]

x.test = dat.test[,-1]

y.test = dat.test[,1]

x.train[,1] = as.numeric(x.train[,1])

x.test[,1] = as.numeric(x.test[,1])

# train classifiers:

knn.o = knn(x.train, x.test, y.train, K.knn)

#通训练样本作出其他拟合

glm.o = glm(default~., data=dat.train, family=binomial(logit))# 做logistic regression

lda.o = lda(default~., data=dat.train)#做LDA

qda.o = qda(default~., data=dat.train)#做QDA

rf.o = randomForest(default~., data=dat.train)#做randomForest

#用测试样本进行预测

# test classifiers:

# (notice that predict.glm() does not have a functionality to

# return categorical values, so we copmute them based on the

# scores by applying a threshold of 50%)

# 测试分类器：

# （注意 predict.glm() 没有功能

# 返回分类值，因此我们根据

# 通过应用 50% 的阈值得分）

knn.p = knn.o

glm.p = ( predict(glm.o, newdata=dat.test, type="response") > 0.5 )

lda.p = predict(lda.o, newdata=dat.test)$class

qda.p = predict(qda.o, newdata=dat.test)$class

rf.p = predict(rf.o, newdata=dat.test)

#做各个方法的测试样本的混淆矩阵

# corresponding confusion matrices:

tb.knn = table(knn.p, y.test)

tb.glm = table(glm.p, y.test)

tb.lda = table(lda.p, y.test)

tb.qda = table(qda.p, y.test)

tb.rf = table(rf.p, y.test)

#每个方法的准确度

# store prediction accuracies:

acc.knn[k] = sum(diag(tb.knn)) / sum(tb.knn)

acc.glm[k] = sum(diag(tb.glm)) / sum(tb.glm)

acc.lda[k] = sum(diag(tb.lda)) / sum(tb.lda)

acc.qda[k] = sum(diag(tb.qda)) / sum(tb.qda)

acc.rf[k] = sum(diag(tb.rf)) / sum(tb.rf)

# ROC/AUC analysis:

#FandonForest没有

# WARNING: THIS IS NOT PR(Y=1 | X), BUT Pr(Y = Y_hat | X):

knn.p = attributes(knn(x.train, x.test, y.train, K.knn, prob=TRUE))$prob

glm.p = predict(glm.o, newdata=dat.test, type="response")

lda.p = predict(lda.o, newdata=dat.test)$posterior[,2]

qda.p = predict(qda.o, newdata=dat.test)$posterior[,2]

#auc.knn[k] = roc(y.test, knn.p)$auc

auc.glm[k] = roc(y.test, glm.p)$auc

auc.lda[k] = roc(y.test, lda.p)$auc

auc.qda[k] = roc(y.test, qda.p)$auc

}

boxplot(acc.knn, acc.glm, acc.lda, acc.qda,

main="Overall CV prediction accuracy",

names=c("kNN","GLM","LDA","QDA"))

#下面两个的区别在于有没有KNN的AUC（因为KNN的AUC的图和其他方法放不在一起）

boxplot(auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("GLM","LDA","QDA"))

boxplot(auc.knn, auc.glm, auc.lda, auc.qda,

main="Overall CV AUC",

names=c("kNN","GLM","LDA","QDA"))

###############################################################

### Exercise 6: Variable importance from RF ####

###############################################################

#由RF看哪个variable是最重要的（系数越大越重要）

library(ISLR)

library(randomForest)

# ?Carseats

High = as.factor(ifelse(Carseats$Sales <= 8, 'No', 'Yes'))

CS = data.frame(Carseats, High)

CS$Sales = NULL

# grow a forest:

rf.out = randomForest(High~., CS)

# compare to bagging:

bag.out = randomForest(High~., CS, mtry=(ncol(CS)-1))

cbind(rf.out$importance, bag.out$importance)

#bag会让原本重要的variable更重要，会让不重要的variable更不重要，because essentially those strong predictors have a better chance to get selected every time.

#比如，CompPrice，用bag之后重要性增加了，后面四个variable，用bag之后重要性减少了，还可以说用rf，CompPrice的重要性是US的3倍

par(mfrow=c(1,2))

varImpPlot(rf.out, pch=15, main="Ensemble method 1")#Price最重要，urban最不重要

varImpPlot(bag.out, pch=15, main="Ensemble method 2")

#Mean decrease Gini, 纵坐标是减小的幅度，减小的幅度越大，也就是数值越大，variable越重要。

?randomForest

###############################################################

#### Exercise 7: gradient boosting ####

#baging 所有的自变量都是要用的； boosting是加权的

###############################################################

# gradient boosting model 梯度增强模型

# 之前的方法，下一阶与上一阶会有相关性，为了减小correlation

# 找到真实值与预测值的残差，下一阶基于上一阶的残差分析，每一阶都弥补了上一阶的残差，可以减小偏差

# 缺点：容易over fit，比较敏感

library(ISLR) # contains the dataset

library(tree) # contains... tree-building methods

#install.packages('gbm')

library(gbm) # contains the GB model implementation

library(pROC)

# Recode response variable so as to make it a classification problem

High = as.factor(ifelse(Carseats$Sales<=8, "No", "Yes"))

CS = data.frame(Carseats, High)

# remove Sales from data frame, just to make formulas simpler!

CS$Sales = NULL

####（1）####

set.seed(1)

itest = sample(1:nrow(CS), round(nrow(CS)/4))#题目里面要求选100个作为测试样本

CS.test = CS[itest,]#测试样本

CS = CS[-itest,]

####（2）####

set.seed(4061)

# Note:

gbm(High~., data=CS, distribution="bernoulli")

#对于分类问题，选择bernoulli或者adaboost，前者更为推荐

#对于连续因变量（回归），选择gaussian或者laplace

# so we must recode the levels...

CS$High = (as.numeric(CS$High=="Yes")) # 前面HIGH是factor，这里又变会numeric(yes=1,no=0), this hack could be useful

gb.out = gbm(High~., data=CS,

distribution="bernoulli", # use "gaussian" instead for regression

n.trees=5000, # size of the ensemble

interaction.depth=1) # depth of the trees, 1 = stumps 限制每棵树的深度

#distribution：模型计算损失函数时，需要对输出变量的数据分布做出假设。

#对于分类问题，选择bernoulli或者adaboost，前者更为推荐

#对于连续因变量，选择gaussian或者laplace

#n.trees：即number of iteration—迭代次数。迭代次数的选择与学习速率密切相关

#interaction.depth和n.minobsinnode：

#子决策树即基础学习器的深度和决策树叶节点包含的最小观测树，

#若基础学习器训练得过于复杂，将提升模型对于样本的拟合能力而导致过拟合问题，

#因此子决策树深度不宜过大，

#叶节点可包含的最小观测书不宜过小。

summary(gb.out$train.error)

# inspect output:

par(mar=c(4.5,6,1,1))

summary(gb.out, las=1)#summary函数返回自变量的相对重要性

plot(gb.out)

plot(gb.out, i="Price")# i is the index of variable or which variable you want to pick

#从图中可以看出，随着price的增加，预测的High数值的中位数越小

plot(gb.out, i="ShelveLoc")

#由于ShelveLoc是离散的，画出来是点，可以看出随着ShelveLoc变好，预测的High数值越大

gb.p = predict(gb.out, newdata=CS.test, n.trees=5000)

gb.p

roc.gb = roc(response=CS.test$High, predictor=gb.p)

plot(roc.gb)

roc.gb$auc

# compare AUC's with a Random Forest:

library(randomForest)

CS$High = as.factor(CS$High)

rf.out = randomForest(High~., CS, ntree=5000)

# fitted values for "training set"

rf.p = predict(rf.out, CS.test, type="prob")[,2]

roc.rf = roc(response=CS.test$High, predictor=rf.p)

plot(roc.rf, add=TRUE, col=2)

roc.gb$auc

roc.rf$auc

#AUC高的那个 great illustrate the model

###############################################################

#### Exercise 8: gradient boosting using caret...####

###############################################################

# Plug in the following snips of code within demo code

# Section4b_demo_using_caret.R:

############ (I) Classification example ############

### Gradient boosting (using caret) for classification

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(caret) # contains everything else

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),

"High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train,-"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = as.factor(dat.train$Salary)

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='bernoulli')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

confusionMatrix(reference=dat.validation$Salary, data=gb.pred,

mode="everything")

############ (II) Regression example ############

### Gradient boosting (using caret) for regression

rm(list=ls()) # clear the environment

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

# hist(dat$Salary)

dat$Salary = log(dat$Salary)

n = nrow(dat)

NC = ncol(dat)

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,]

x = dat.train

x$Salary = NULL

y = dat.train$Salary

ytrue = dat.validation$Salary

gb.out = train(Salary~., data=dat.train, method='gbm', distribution='gaussian')

gb.fitted = predict(gb.out) # corresponding fitted values

gb.pred = predict(gb.out, dat.validation)

mean((gb.pred-ytrue)^2)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Section 4: demo code for xgboost (Extreme GB) ####

# --------------------------------------------------------

rm(list=ls()) # clear the environment

library(ISLR) # contains the data

library(xgboost) # XGBoost... and xgb.DMatrix()

library(caret)

set.seed(4061) # for reproducibility

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change the response variable to a factor to make this a

# classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),

"High","Low"))

# Data partition

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set for later

# x = select(dat.train,-"Salary") ### if using dplyr

# training set:

x = dat.train

x$Salary = NULL

y = dat.train$Salary

# test set:

x.test = dat.validation

x.test$Salary = NULL

y.test = dat.validation$Salary

# XGBoost...

set.seed(4061)

# (a) line up the data in the required format

# train set:

xm = model.matrix(y~., data=x)[,-1]

x.xgb = xgb.DMatrix(xm)

# test set:

xm.test = model.matrix(y.test~., x.test)[,-1]

x.xgb.test = xgb.DMatrix(xm.test)

# (b) training...

# NB: one can run xgbboost() with default parameters by using:

xgb.ctrl = trainControl(method="none")

# otherwise:

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.model = train(x.xgb, y, trControl=xgb.ctrl, method="xgbTree")

# NB: use argument tuneGrid to specify custom grids of values for

# tuning parameters. Otherwise train() picks its own grids.

xgb.model$bestTune

# (c) testing...

xgb.pred = predict(xgb.model, newdata=x.xgb.test)

confusionMatrix(data=xgb.pred, reference=y.test)

# --------------------------------------------------------

# The below demo code is only for information. There is no need

# to spend time looking into it for ST4061/ST6041 tests/exam!

# There are a number of parameters to be tuned for XGBoost:

modelLookup('xgbTree')

# All or a subset of these parameters can be tuned in a sequential

# manner. For each tuning parameter, we can define a grid of

# potential values and search for an optimal value within that grid.

# Careful! Running this code will take some time...

# (1) Max number of trees (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),

eta = c(0.025, 0.05, 0.1, 0.3),

max_depth = c(2, 3, 4, 5, 6),

gamma = 0,

colsample_bytree = 1,

min_child_weight = 1,

subsample = 1)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# (2) Max tree depth and min child weight (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),

eta = xgb.tune$bestTune$eta,

max_depth = c(1:xgb.tune$bestTune$max_depth+2),

gamma = 0,

colsample_bytree = 1,

min_child_weight = c(1:3),

subsample = 1)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# (3) sampling (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),

eta = xgb.tune$bestTune$eta,

max_depth = xgb.tune$bestTune$max_depth,

gamma = 0,

colsample_bytree = seq(0.2,1,by=.2),

min_child_weight = xgb.tune$bestTune$min_child_weight,

subsample = seq(.5,1,by=.1))

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# (4) gamma (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 1000, by=100),

eta = xgb.tune$bestTune$eta,

max_depth = xgb.tune$bestTune$max_depth,

gamma = seq(0,1,by=.1),

colsample_bytree = xgb.tune$bestTune$colsample_bytree,

min_child_weight = xgb.tune$bestTune$min_child_weight,

subsample = xgb.tune$bestTune$subsample)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# (5) learning rate (just an example!):

tune.grid = expand.grid(nrounds = seq(500, 5000, by=100),

eta = c(0.01,0.02,0.05,0.075,0.1),

max_depth = xgb.tune$bestTune$max_depth,

gamma = xgb.tune$bestTune$gamma,

colsample_bytree = xgb.tune$bestTune$colsample_bytree,

min_child_weight = xgb.tune$bestTune$min_child_weight,

subsample = xgb.tune$bestTune$subsample)

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# Then fit:

xgb.ctrl = trainControl(method="cv", number=10, returnData=FALSE)

xgb.tune = train(x.xgb, y, trControl=xgb.ctrl, tuneGrid=tune.grid, method="xgbTree")

# testing:

xgb.pred = predict(xgb.model, newdata=x.xgb.test)

confusionMatrix(data=xgb.pred, reference=y.test)

2022-04-11上

推荐阅读更多精彩内容