2022-04-11下

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Section 5: SVM Support Vector Machine ####

# --------------------------------------------------------

rm(list=ls()) # clear out running environment

library(randomForest)

library(class)

library(pROC)

library(e1071)

#SVM:一种复杂的分类工具

#### Example using SVM on the "best-student" data ####

# simulate data:生成随机数

set.seed(1)

n = 100

mark = rnorm(n, m=50, sd=10)

choc = rnorm(n, m=60, sd=5)

summary(mark)

summary(choc)

#评分标准--f(x) separating hyperplane: f(x)=int+a*mark + b*choc

int = 10

a = 2

b = 4

# rating of students on basis of their marks and level of appreciation of chocolate:

# 根据学生的分数和对巧克力的欣赏程度给他们打分

#生成data set：

mod = int + a*mark + b*choc # values for true model，生成separating hyperplane，该线/平面上是理论值

z = rnorm(n,s=8) # additive noise

obs = mod + z #围绕separating hyperplane生成一系列实际观测值

#分类：

y = as.factor(ifelse(obs>350,1,0)) # classification data，y记录了每个实际观测值所对应的真实分类

plot(mod, obs, xlab="model", ylab="observations", pch=20, cex=2)

plot(mark, choc, xlab="x1 (mark)", ylab="x2 (choc)",

pch=20, col=c(2,4)[as.numeric(y)], cex=2)

legend("topright", box.col=8, bty='n',

legend=c("good student","better student"),

pch=15, col=c(2,4))

table(y)#可以看到how complicated the data set is to be classify

set.seed(1)

# split the data into train+test (50%-50%):

# 生成的dataset是由观测值mark&choc组成的x，对应分类组成的y构成的

x = data.frame(mark,choc)

i.train = sample(1:n, 50)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

class(x.train)

class(y.train)

xm = as.matrix(x.train)

# fit an SVM as follows:

# ?svm # in e1071

set.seed(1)

svmo = svm(xm, y.train, kernel='polynomial') #kernel参数可换，根据plot图像看一下大概是什么分割线

#把x和y输入,通过肉眼观察，假定kernel是多项式（曲线）的

#直线linear、曲线polynomial、圆radial basis

svmo

names(svmo)

# cf. ?svm:

# "Parameters of SVM-models usually must be tuned to yield sensible results!"

# 支持向量机模型的参数通常必须调整以产生合理的结果

#为了找到最合理的svm参数(find the maximum-margin hyperplane,离margin近的点才对其有影响)

# one can tune the model as follows:在指定范围ranges&gamma内找到最合适的Parameters

set.seed(1)

svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,

kernel='polynomial',

ranges=list(cost=10^(-2:4), gamma=c(0.25,0.5,1,1.5,2)))

# 这里列出一系列想要尝试的调节参数

svm.tune

svm.tune$best.parameters#最好的

# then fit final SVM for optimal parameters:测试样本

svmo.final = svm(xm, y.train, kernel='polynomial',

gamma=svm.tune$best.parameters$gamma,

cost=svm.tune$best.parameters$cost)

# corresponding confusion matrices:混淆矩阵看分的怎么样

table(svmo$fitted,y.train)

table(svmo.final$fitted,y.train)

# we can also use caret for easy comparison:直接看accuracy

library(caret)

caret::confusionMatrix(svmo$fitted,y.train)$overall[1]

caret::confusionMatrix(svmo.final$fitted,y.train)$overall[1]

# assessing model fit to training data评估模型是否适合训练数据

identical(fitted(svmo), svmo$fitted)#TRUE——got the right information here

# to identify support vectors: 能左右hyperplane的点

# either svmo$index (indices), or svmo$SV (coordinates)

length(svmo$index)#tuned前的

length(svmo.final$index)#tuned后的，tuned后能左右hyperplane的点变少了，说明分得更干净了，svm拟合的结果更好了

# visualize:

plot(x.train, pch=20, col=c(1,2)[y.train], cex=2)

points(svmo$SV, pch=14, col=4, cex=2) # explain why this does not work!?

# apply scaling to dataset to see SV's:一定要先归一化，才能描出用到的点（用于找到最好参数的离margin近的点，能左右hyperplane的点）

plot(apply(x.train,2,scale), pch=20, col=c(1,2)[y.train], cex=2)

points(svmo$SV, pch=14, col=4, cex=2)

points(svmo.final$SV, pch=5, col=3, cex=2)

# If you want to use predict(), use a formula-type

# expression when calling svm(). Because of this,

# we re-shape our dataset:

#如果你想使用predict()，在调用svm()时使用公式类型的表达式。因此，我们重新塑造我们的数据集:

dat.train = data.frame(x=x.train, y=y.train)

dat.test = data.frame(x=x.test)

# decision boundary visualization:

svmo = svm(y~., data=dat.train)

plot(svmo, dat.train,

svSymbol = 15, dataSymbol = 'o',

col=c('cyan','pink')) # this is plot.svm()

svmo.final = svm(y~., data=dat.train, kernel='polynomial',

gamma=svm.tune$best.parameters$gamma,

cost=svm.tune$best.parameters$cost) #最好的拟合

plot(svmo.final, dat.train,

svSymbol = 15, dataSymbol = 'o',

col=c('cyan','pink')) # this is plot.svm()

# How to generate predictions from SVM fit:

# fitting the SVM model:

svmo = svm(y~., data=dat.train,

kernel='polynomial',

gamma=svm.tune$best.parameters$gamma,

cost=svm.tune$best.parameters$cost)

# Note that if we need probabilities P(Y=1)

# (for example to calculate ROC+AUC),

# we need to set 'probability=TRUE' also in

# fitting the SVM model:

svmo = svm(y~., data=dat.train, probability=TRUE,

kernel='polynomial',

gamma=svm.tune$best.parameters$gamma,

cost=svm.tune$best.parameters$cost)

#Generate predictions from SVM fit：

svmp = predict(svmo, newdata=dat.test, probability=TRUE)

roc.svm = roc(response=y.test, predictor=attributes(svmp)$probabilities[,2])

roc.svm$auc#越接近1越好

plot(roc.svm)#very happy

# compare with RF:

rf = randomForest(y~., data=dat.train)

rfp = predict(rf, dat.test, type='prob')

roc.rf = roc(y.test, rfp[,2])

roc.rf$auc

plot(roc.svm)

par(new=TRUE)

plot(roc.rf, col='yellow')

legend("bottomright", bty='n',

legend=c("RF","SVM"),

lty=1, lwd=3, col=c('yellow',1))

#random forest 比不过svm

--------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

##### Section 5: demo code for effect of kernel on SVM ####

# Here we simulate 2D data that have a circular spatial

# distribution, to see the effect of the choice of kernel

# shape on decision boundaries

# --------------------------------------------------------

rm(list=ls())

library(e1071)

# Simulate circular data...

# simulate 2-class circular data:

set.seed(4061)

n = 100

S1 = 15; S2 = 3

x1 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))

x2 = c(rnorm(60, m=0, sd=S1), rnorm(40, m=0, sd=S2))

# corresponding 2D circular radii:

rads = sqrt(x1^2+x2^2)

# make up the 2 classes in terms of whether lower or greater than median radius:

c1 = which(rads<median(rads))

c2 = c(1:n)[-c1]

# now we apply scaling factors to further separate the

# 2 classes:

x1[c1] = x1[c1]/1.2

x2[c1] = x2[c1]/1.2

x1[c2] = x1[c2]*1.2

x2[c2] = x2[c2]*1.2

# label data according to class membership:

lab = rep(1,n)

lab[c2] = 2#lab里，c1对应的位置为1；c2对应的位置为2

par(mfrow=c(1,1))

plot(x1,x2,col=c(1,2)[lab], pch=c(15,20)[lab], cex=1.5)

# create final data frame:

x = data.frame(x1,x2)

y = as.factor(lab)

dat = cbind(x,y)

# apply SVMs with different choices of kernel shapes:

svmo.lin = svm(y~., data=dat, kernel='linear')

svmo.pol = svm(y~., data=dat, kernel='polynomial')

svmo.rad = svm(y~., data=dat, kernel='radial')

svmo.sig = svm(y~., data=dat, kernel='sigmoid')

plot(svmo.lin, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.pol, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.rad, dat, col=c("cyan","pink"), svSymbol=15)

plot(svmo.sig, dat, col=c("cyan","pink"), svSymbol=15)

#### NOTE: the code below is outside the scope of this course! 注意:下面的代码超出了本课程的范围!

#### It is used here for illustrations purposes only.

# this call format is easier when using predict():

svmo.lin = svm(x, y, kernel='linear', scale=F)

svmo.pol = svm(x, y, kernel='polynomial', scale=F)

svmo.rad = svm(x, y, kernel='radial', scale=F)

svmo.sig = svm(x, y, kernel='sigmoid', scale=F)

# evaluate the SVM boundaries on a regular 2D grid of points:

ng = 50

xrg = apply(x, 2, range)

x1g = seq(xrg[1,1], xrg[2,1], length=ng)

x2g = seq(xrg[1,2], xrg[2,2], length=ng)

xgrid = expand.grid(x1g, x2g)

plot(x, col=c(1,2)[y], pch=20)

abline(v=x1g, col=8, lty=1)

abline(h=x2g, col=8, lty=1)

ygrid.lin = predict(svmo.lin, xgrid)

ygrid.pol = predict(svmo.pol, xgrid)

ygrid.rad = predict(svmo.rad, xgrid)

ygrid.sig = predict(svmo.sig, xgrid)

par(mfrow=c(2,2), font.lab=2, font.axis=2)

CEX = .5

COLS = c(1,3)

DCOLS = c(2,4)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

# points(x[svmo.lin$index,], pch=21, cex=2)

points(svmo.lin$SV, pch=21, cex=2) # same as previous line!

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.pol$index,], pch=21, cex=2)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.rad$index,], pch=21, cex=2)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")

points(x, col=DCOLS[as.numeric(y)], pch=20)

points(x[svmo.sig$index,], pch=21, cex=2)

# Alternative plot:

par(mfrow=c(2,2), font.lab=2, font.axis=2)

CEX = .5

COLS = c(1,3)

DCOLS = c(2,4)

L1 = length(x1g)

L2 = length(x2g)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.lin)], cex=CEX, main="Linear kernel")

bnds = attributes(predict(svmo.lin, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.pol)], cex=CEX, main="Polynomial kernel")

bnds = attributes(predict(svmo.pol, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.rad)], cex=CEX, main="Radial kernel")

bnds = attributes(predict(svmo.rad, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

plot(xgrid, pch=20, col=COLS[as.numeric(ygrid.sig)], cex=CEX, main="Sigmoid kernel")

bnds = attributes(predict(svmo.sig, xgrid, decision.values=TRUE))$decision

contour(x1g, x2g, matrix(bnds, L1, L2), level=0, add=TRUE, lwd=2)

# NB: naive Bayes decision boundary is obtained with

# contour(x1g, x2g, matrix(bnds, L1, L2), level=0.5, add=TRUE, col=4, lwd=2)

# --------------------------------------------------------

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Exercises Section 5: SVM ####

# --------------------------------------------------------

rm(list=ls())

library(randomForest)

library(class)

library(pROC)

library(e1071)

library(caret)

library(ISLR)

###############################################################

#### Exercise 1: using SVM to classify (High Carseat sales dataset) ####

###############################################################

library(ISLR) # contains the dataset

# Recode response variable so as to make it a classification problem

High = ifelse(Carseats$Sales<=8, "No", "Yes")

CS = data.frame(Carseats, High)

CS$Sales = NULL

#construct dataset

x = CS

x$High = NULL

y = CS$High

# split the data into train+test (50%-50%):

n = nrow(CS)

set.seed(4061)

i.train = sample(1:n, 350)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

class(x.train)

class(y.train)

# ?svm : svm有两种形式

# svmo = svm(xm, y.train, kernel='polynomial')##svm(x, y = NULL, scale = TRUE, type = NULL, kernel ="radial")

# svmo = svm(y~., data=dat.train)##svm(formula, data = NULL, ..., subset, na.action = na.omit, scale = TRUE)

#由于x必须喂一个matrix，y必须喂一个numeric，所以x.train=as.matrix(x.train)，y.train=as.numeric(y.train)

# (3) Explain why this does not work:

svmo = svm(x.train, y.train, kernel='polynomial')

# >> The problem is the presence of categorical variables in

# the dataset. They must be "recoded" into numerical variables

# for svm() to analyse their spatial contribution.

# (4) Carry out the appropriate fix from your conclusion from (a).

# Then, fit two SVM models, one using a linear kernel, the other

# a polynomial kernel. Compare the two appropriately.

#将两个SVM分类器适合于适当的“按摩”训练集，一个使用线性核函数，另一个使用多项式核函数。

#修正的做法：

NC = ncol(x)

# x = x[,-c(NC-1,NC)] # take out the last two columns (predictors)

xm = model.matrix(y~.+0, data=x)#remove the intercept

xm.train = xm[i.train,]

xm.test = xm[-i.train,]

y.train = as.factor(y.train) # so that svm knows it's classification

svmo.lin = svm(xm.train, y.train, kernel='linear')

svmo.pol = svm(xm.train, y.train, kernel='polynomial')

svmy.lin = fitted(svmo.lin)#fitted:显示x的对应类别y

svmy.pol = fitted(svmo.pol)

table(y.train, fitted(svmo.lin))

table(y.train, fitted(svmo.pol))

# (5) Comparison...

# * visual (there are better ways of visualising!):

par(mfrow=c(1,3))

yl = as.integer(y=="Yes")+1

plot(apply(xm,2,scale), pch=c(15,20)[yl], col=c(1,4)[yl],

cex=c(1.2,2)[yl], main="The data")

plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],

cex=1, main="linear")

points(svmo.lin$SV, pch=5, col=2, cex=1.2)

plot(apply(xm.train,2,scale), pch=c(15,20)[y.train], col=c(1,4)[y.train],

cex=1, main="polynomial")

points(svmo.pol$SV, pch=5, col=2, cex=1.2)

# * in terms of training fit:

svmy.lin = fitted(svmo.lin)

svmy.pol = fitted(svmo.pol)

table(y.train, svmy.lin)

table(y.train, svmy.pol)

# * test error:

pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)

pred.pol = predict(svmo.pol, newdata=xm.test)

# ... the above does not work well:

summary(pred.lin)

# --> these are not probabilities! That's because we need to specify

# ", probability=TRUE" also when fitting the SVM, in order to enable

# probabilities to be computed and returned...

# 这些都不是概率!这是因为我们在拟合SVM时也需要指定“，probability=TRUE”，以便能够计算和返回概率……

# SO IF WE WANT TO GENERATE TEST-SET PREDICTIONS, THIS IS THE WAY:

svmo.lin = svm(xm.train, y.train, kernel='linear', probability=TRUE)

svmo.pol = svm(xm.train, y.train, kernel='polynomial', probability=TRUE)

pred.lin = predict(svmo.lin, newdata=xm.test, probability=TRUE)

pred.pol = predict(svmo.pol, newdata=xm.test, probability=TRUE)

y.test = as.factor(y.test)

confusionMatrix(y.test, pred.lin)

confusionMatrix(y.test, pred.pol)

# * AUC (we need to extract P(Y=1|X))

p.lin = attributes(pred.lin)$probabilities[,2]

p.pol = attributes(pred.pol)$probabilities[,2]

y.test = as.factor(y.test)

roc(response=y.test, predictor=p.lin)$auc

roc(response=y.test, predictor=p.pol)$auc

#sensitivity和specificity都很高

###############################################################

#### Exercise 2: 3-class problem (iris dataset) ####

###############################################################

x = iris

x$Species = NULL

y = iris$Species

set.seed(4061)

n = nrow(x)

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

# (a)

plot(x.train[,1:2], pch=20, col=c(1,2,4)[as.numeric(y.train)], cex=2)

# (b)

dat = data.frame(x.train, y=as.factor(y.train))

svmo.lin = svm(y~., data=dat, kernel='linear')

svmo.pol = svm(y~., data=dat, kernel='polynomial')

svmo.rad = svm(y~., data=dat, kernel='radial')

# number of support vectors:

summary(svmo.lin)

summary(svmo.pol)

summary(svmo.rad)

#The number of support vectors less, the more complicated controversy.

# test error:

pred.lin = predict(svmo.lin, newdata=x.test)

pred.pol = predict(svmo.pol, newdata=x.test)

pred.rad = predict(svmo.rad, newdata=x.test)

cm.lin = confusionMatrix(y.test, pred.lin)

cm.pol = confusionMatrix(y.test, pred.pol)

cm.rad = confusionMatrix(y.test, pred.rad)

c(cm.lin$overall[1], cm.pol$overall[1], cm.rad$overall[1])

#rad more accurcte.

# (c) tune the model(via cross-validation)...

set.seed(4061)

svm.tune = e1071::tune(svm, train.x=x.train, train.y=y.train,

kernel='radial',

ranges=list(cost=10^(-2:2), gamma=c(0.5,1,1.5,2)))

print(svm.tune)

names(svm.tune)

# retrieve optimal hyper-parameters

bp = svm.tune$best.parameters

# use these to obtain final SVM fit:

svmo.rad.tuned = svm(y~., data=dat, kernel='radial',

cost=bp$cost, gamma=bp$gamma)

summary(svmo.rad)

summary(svmo.rad.tuned)#Not changed much，means it's got more to do with the shape of the kernel itself

#, rather than how the model is tunes for this dataset.

# test set predictions from tuned SVM model:

pred.rad.tuned = predict(svmo.rad.tuned, newdata=x.test)

cm.rad.tuned = confusionMatrix(y.test, pred.rad.tuned)

c(cm.rad$overall[1], cm.rad.tuned$overall[1])

# so maybe not an exact science!?

# ... in fact these performances are comparable, bear in mind CV assessment is

# itself subject to variability...

#These are estimates of prediction performance, there is uncertainty related to the points where the value you have here,

#which means when you see 100% or 96%, you are really looking at the same thing. So you need to look at the confidence interval

#around these value to understand what you are really seeing a difference or not.

###############################################################

#### Exercise 3: SVM using caret ####

###############################################################

# Set up the data (take a subset of the Hitters dataset)

data(Hitters)

Hitters = na.omit(Hitters)

dat = Hitters

n = nrow(dat)

NC = ncol(dat)

# Change into a classification problem:

dat$Salary = as.factor(ifelse(Hitters$Salary>median(Hitters$Salary),

"High","Low"))

# Data partition

set.seed(4061)

itrain = sample(1:n, size=round(.7*n))

dat.train = dat[itrain,]

dat.validation = dat[-itrain,] # independent validation set

x = dat.train # training set

x$Salary = NULL

y = as.factor(dat.train$Salary)

### Random forest

rf.out = caret::train(Salary~., data=dat.train, method='rf')

rf.pred = predict(rf.out, dat.validation)

rf.cm = confusionMatrix(reference=dat.validation$Salary, data=rf.pred, mode="everything")

### SVM (linear)

svm.out = caret::train(Salary~., data=dat.train, method="svmLinear")

svm.pred = predict(svm.out, dat.validation)

svm.cm = confusionMatrix(reference=dat.validation$Salary, data=svm.pred, mode="everything")

# modelLookup('svmRadial')

### SVM (radial)

svmR.out = caret::train(Salary~., data=dat.train, method="svmRadial")

svmR.pred = predict(svmR.out, dat.validation)

svmR.cm = confusionMatrix(reference=dat.validation$Salary, data=svmR.pred, mode="everything")

perf = rbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)

row.names(perf) = c("RF","SVM.linear","SVM.radial")

round(perf, 4)

perf = cbind(rf.cm$overall, svm.cm$overall, svmR.cm$overall)

colnames(perf) = c("RF","SVM.linear","SVM.radial")

round(perf, 4)

#评价就看accuracy，结合一开始的图像，图像不像是线性的，所以rf应该比较好

###############################################################

#### Exercise 4: SVM-based regression ####

###############################################################

x = iris

x$Sepal.Length = NULL

y = iris$Sepal.Length#using Sepal.Length as response variable

pairs(iris[,1:4])

?pairs#pairs生成一个配对的散点图矩阵，矩阵由X中的每列的列变量对其他各列列变量的散点图组成

set.seed(4061)

n = nrow(x)

i.train = sample(1:n, 100)

x.train = x[i.train,]

x.test = x[-i.train,]

y.train = y[i.train]

y.test = y[-i.train]

dat.train = cbind(x.train,y=y.train)

# specify statistical training settings:

ctrl = caret::trainControl(method='cv')

# perform statistical training:

svm.o = caret::train(y~., data=dat.train, method="svmLinear",

trControl=ctrl)#trControl=ctrl

# compute test set predictions:

svm.p = predict(svm.o, newdata=x.test)

# and corresponding MSE:

mean( (y.test-svm.p)^2 )

par(pty='s') #makes or a square plot box

rr = range(c(y.test, svm.p))

plot(y.test, svm.p, pch=20,

xlab="true values", ylab="predicted values",

xlim=rr,ylim=rr)

abline(a=0,b=1)

#Here is a very good enlightenment

#### Section 6 神经网络####

####分类——归一化；回归——标准化####

#learning rate:is applied to sort of calibrate the speed of the learning process data.

#DL deep learning vs ML meachine learning

------------------------------------------------------------

#### Example 1 : iris data with neuralnet ####

#install.packages('neuralnet')

library(neuralnet)

n = nrow(iris)

dat = iris[sample(1:n), ] # shuffle initial dataset

NC = ncol(dat)

nno = neuralnet(Species~., data=dat, hidden=c(6,5))#知道我们要建几层，每层几个节点时

#hidden 第一层6个，第二层5个的神经网络

plot(nno,

information=FALSE,#不要标数据

col.entry='red',

col.out='green',

show.weights=FALSE)

plot(nno,

information=TRUE,#要标数据

col.entry='red',

col.out='green',

show.weights=TRUE)

#(1)‘blue’ bits are bias

#(2)在黑色线条上的数字是weights, 可以是positive, 也可以是negative的

#### Example 2: single layer NN - regression - effect of scaling ####

library(nnet) # implements single layer NNs

library(mlbench) # includes dataset BostonHousing

#install.packages('mlbench')

data(BostonHousing) # load the dataset

#View(BostonHousing)

# train neural net

n = nrow(BostonHousing)

itrain = sample(1:n, round(.7*n), replace=FALSE)#要70%的数据并取整作为训练样本

nno = nnet(medv~., data=BostonHousing, subset=itrain, size=5)#size 隐藏层中的单元数

#只知道几个神经元，但不知道有几层

#?nnet

summary(nno$fitted.values)

#We saw the fitted value are all 1, it because the information was not scaled, we should do scale first.

# the above output indicates the algorithm did not

# converge, probably due to explosion of the gradients...

#都是1，说明算法没有收敛，需要归一化值(50 是这个数据的最大值，除以最大值):

# We try again, this time normalizing the values

# (50 is the max value for this data, see its range):

nno = nnet(medv/50~., data=BostonHousing, subset=itrain, size=5)

summary(nno$fitted.values)# there was thus a need to normalise the response variable...

# 测试神经网络

preds = predict(nno, newdata=BostonHousing[-itrain,]) * 50 # (we multiply by 50 to fall back to original data domain)

#（由于之前除以了50，我们乘以 50 以回退到原始数据域）

summary(preds)

# RMSE:偏方误差

sqrt(mean((preds-BostonHousing$medv[-itrain])^2))

# compare with lm():

lmo = lm(medv~., data=BostonHousing, subset=itrain)

lm.preds = predict(lmo, newdata= BostonHousing[-itrain,])

# RMSE:

sqrt(mean((lm.preds-BostonHousing$medv[-itrain])^2))

#Compare with lm, we have a lower test RMSE.

#平价的时候看一下length(itrain)和dim(BostonHousing),看有没有足够的训练样本和测试样本

# Further diagnostics may highlight various aspects of the

# model fit - always run these checks!每次都要检查跟lm的比较

# 进一步的诊断可能会突出模型拟合的各个方面 - 始终运行这些检查！

par(mfrow=c(2,2))

################################################################################

#PLOT(1): residuals form LM against NN

plot(lmo$residuals, nno$residuals*50)

abline(a=0, b=1, col="limegreen")

#there are some extreme residuals from LM

################################################################################

#PLOT(2): residuals from LM against Original Train Data - no relationship

plot(BostonHousing$medv[itrain], lmo$residuals, pch=20)

################################################################################

#PLOT(3): residuals from LM against Original Train Data (in grey)

# plus(+) residuals from NN against Original Train Data

plot(BostonHousing$medv[itrain], lmo$residuals, pch=20, col=8)

points(BostonHousing$medv[itrain], nno$residuals*50, pch=20)

################################################################################

#PLOT(4): QQ plot

#In general, Noise is assumed to be normal distributed or Gaussian...

#But in NN, we do not make this assumption. Since between X(input) and Y(output), we hope to use some non-linear function.

#But QQ plot is still a useful tool.

qqnorm(nno$residuals)

abline(a=mean(nno$residuals), b=sd(nno$residuals), col=2)

#### Example 3: effect of tuning parameters (iris data) ####

rm(list=ls())

n = nrow(iris)

# 像往常一样打乱初始数据集（删除第 4 个值，减少数据量，使数据更难准确）打乱数据、重排 #移除Petal.Width数据

dat = iris[sample(1:n),-4]

NC = ncol(dat)

#data scaling 不是变成0,1分布，而是将数据缩放至为 [0,1]

####scale function: y_normalized = (y-min(y)) / (max(y)-min(y)):####

#dat离第四列character型的数据不用归一化，所以dat[,-NC]

mins = apply(dat[,-NC],2,min)

maxs = apply(dat[,-NC],2,max)

dats = dat

dats[,-NC] = scale(dats[,-NC],center=mins,scale=maxs-mins)

# 设置训练样本:

itrain = sample(1:n, round(.7*n), replace=FALSE)

nno = nnet(Species~., data=dats, subset=itrain, size=5)

# 预测:

nnop = predict(nno, dats[-itrain,])

head(nnop)#返回向量、矩阵、表格、数据框或函数的第一部分

#这是从概率中获取预测标签的一种方法：

preds = max.col(nnop) #找到矩阵每一行的最大位置在第几列，用来做混淆矩阵

#（上面的行为每一行选择概率最高的列，即每个观察值）或者我们可以直接使用它：

preds = predict(nno, dats[-itrain,], type='class')

tbp = table(preds, dats$Species[-itrain])#混淆矩阵

sum(diag(tbp))/sum(tbp) #准确率

# #nnet里size怎样影响正确率

####找到最合适的size####

# 在这里，我们尝试使用 1 到 10 的尺寸进行说明，但您可以随意使用这些值！

sizes = c(1:10)

rate = numeric(length(sizes)) # 训练集分类准确率

ratep = numeric(length(sizes)) # 测试集分类准确率

for(d in 1:length(sizes)){

nno = nnet(Species~., data=dats, subset=itrain,

size=sizes[d])

tb = table(max.col(nno$fitted.values), dats$Species[itrain])

rate[d] = sum(diag(tb))/sum(tb)

# now looking at test set predictions

nnop = predict(nno, dats[-itrain,])

tbp = table(max.col(nnop), dats$Species[-itrain])

ratep[d] = sum(diag(tbp))/sum(tbp)

}

plot(rate, pch=20, t='b', xlab="layer size", ylim=range(c(rate,ratep)))

points(ratep, pch=15, t='b', col=2)

legend('bottomright', legend=c('training','testing'),

pch=c(20,15), col=c(1,2), bty='n')

# 注意训练集和测试集的表现不一定相似......

#由此找到最好最合适的size

#nnet里decay(权重衰减参数,默认为 0) 怎样影响正确率

decays = seq(1,.0001,lengt=11)

rate = numeric(length(decays)) # train-set classification rate

ratep = numeric(length(decays)) # test-set classification rate

for(d in 1:length(decays)){

# fit NN with that particular decay value (decays[d]):

nno = nnet(Species~., data=dats, subset=itrain, size=10,

decay=decays[d])

# corresponding train set confusion matrix:

tb = table(max.col(nno$fitted.values), dats$Species[itrain])

rate[d] = sum(diag(tb))/sum(tb)

# now looking at test set predictions:

nnop = predict(nno, dats[-itrain,])

tbp = table(max.col(nnop), dats$Species[-itrain])

ratep[d] = sum(diag(tbp))/sum(tbp)

}

plot(decays, rate, pch=20, t='b', ylim=range(c(rate,ratep)))

points(decays, ratep, pch=15, t='b', col=2)

legend('topright', legend=c('training','testing'),

pch=c(20,15), col=c(1,2), bty='n')

rm(list=ls())

######Exercise ######

#### Exercise 1####

# 1. What type of neural network does this code implement? FFNN

#有两种函数能实现神经网络，一种是step function——大于某一个值是1，小于是0，非黑即白；一种是activation function——有确切的数字输出

# 2.

library(MASS)

library(neuralnet)

# --- NN with one 10-node hidden layer

nms = names(Boston)[-14]

f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))

# fit a single-layer, 10-neuron NN:

set.seed(4061)

out.nn = neuralnet(f, data=Boston, hidden=c(10), rep=5,

linear.output=FALSE)

#plot(out.nn, information=TRUE, col.entry='red', col.out='green',show.weights=TRUE)

#create single-hidden layer neural network and repeat 5 times

# without using an activation function:

set.seed(4061)

out.nn.lin = neuralnet(f, data=Boston, hidden=c(10), rep=1,

linear.output=TRUE)

# Warning message: 算法在 stepmax 内的 1 次重复中没有收敛，所以需要运行此代码两遍

# Algorithm did not converge in 1 of 1 repetition(s) within the stepmax.

#线性输出：

set.seed(4061)

out.nn.tanh = neuralnet(f, data=Boston, hidden=c(10), rep=5,

linear.output=FALSE, act.fct='tanh')

p1 = predict(out.nn, newdata=Boston)

p2 = predict(out.nn.tanh, newdata=Boston)

sqrt(mean((p1-Boston$medv)^2))

sqrt(mean((p2-Boston$medv)^2))

#参数：

#linear.output: logical，线性输出为TRUE，nonlinear为FALSE

#rep: 神经网络训练的重复次数

#act.fct: 一个可微函数，用于平滑协变量或神经元与权重的叉积的结果

#act.fct: 默认“logistic”，也可以是“tanh”

#### Exercise 2 ####

library(neuralnet)

set.seed(4061)

n = nrow(iris)

dat = iris[sample(1:n), ] # shuffle initial dataset

NC = ncol(dat)

nno = neuralnet(Species~., data=dat, hidden=c(6,5))

plot(nno)

#### Exercise 3 #####

#Load dataset MASS::Boston and perform a 70%-30% split for training and test sets

#respectively. Use set.seed(4061) when splitting the data and also every time you run a

#neural network.

#1. Compare single-layer neural network fits from the neuralnet and nnet libraries.

#Can you explain any difference you may find?

# 2. Change the "threshold" argument value to 0.001 in the call to neuralnet, and

#comment on your findings (this run might take a bit more time to converge)

#加载数据集 MASS::Boston 并分别对训练集和测试集执行 70%-30% 的拆分。

#1. 比较来自神经网络和 nnet 库的单层神经网络拟合。

#解释可能发现的任何区别吗？

#2. 在对神经网络的调用中将“阈值”参数值更改为 0.001，

#并评论发现（此运行可能需要更多时间才能收敛）

#当外界刺激达到一定的阀值时,神经元才会受刺激,影响下一个神经元。

#超过阈值,就会引起某一变化,不超过阈值,无论是多少,都不产生影响

rm(list=ls())

library(neuralnet)

library(nnet) # implements single layer NNs

library(MASS) # includes dataset BostonHousing

data(Boston) # load the dataset

# train neural nets

n = nrow(Boston)

itrain = sample(1:n, round(.7*n), replace=FALSE)

dat = Boston

dat$medv = dat$medv/50 #归一化

dat.train = dat[itrain,]

dat.test = dat[-itrain,-14]#自变量的测试样本

y.test = dat[-itrain,14]#因变量的测试样本

#nnet 单层五个神经元

nno1 = nnet(medv~., data=dat.train, size=5, linout=TRUE)

fit1 = nno1$fitted.values

mean((fit1-dat.train$medv)^2) #偏差

#neuralnet 单层五个神经元

nno2 = neuralnet(medv~., data=dat.train, hidden=c(5), linear.output=TRUE)

fit2 = predict(nno2, newdata=dat.train)[,1]

mean((fit2-dat.train$medv)^2) #偏差

##阈值threshold0.0001的neuralnet

nms = names(dat)[-14]

f = as.formula(paste("medv ~", paste(nms, collapse = " + ")))#下面所用的函数太长，所以先写出来

nno3 = neuralnet(f, data=dat.train, hidden=5, threshold=0.0001)#threshold 阈值####f跟medv有啥区别？？####

#Threshold in 'neuralnet' is originally 0.01. Now we set it to be 0.0001.

fit3 = predict(nno3, newdata=dat.train)[,1]

mean((fit3-dat.train$medv)^2)#0.005276877 #even much better!

#用mean来看模型能不能用，mean不能太大

# test neural nets predict一定要乘回去50

y.test = y.test*50

p1 = predict(nno1, newdata=dat.test)*50

p2 = predict(nno2, newdata=dat.test)*50

p3 = predict(nno3, newdata=dat.test)*50

mean((p1-y.test)^2)

mean((p2-y.test)^2)

mean((p3-y.test)^2)

#test的mean用来看哪个模型更好

# explain these differences?

names(nno1)#nnet

names(nno2)#neuralnet

# nnet:

# - activation function: logistic

# - algorithm: BFGS in optim

# - decay: 0

# - learning rate: NA

# - maxit: 100

# neuralnet:

# - activation function: logistic

# - algorithm: (some form of) backpropagation

# - decay: ?

# - learning rate: depending on algorithm

# - maxit:?

# so what is it?

#nnet里的activation function：

#不是所有的信号都要做反应，需要activation function去看需要对哪些信号作出反应

#hide层和output层都有activation function

#hide层的activation function是由act.fun决定的——tanch正切或者logistic

#output层的activation function是由linout决定的

#做regression时一般linout=T，表明output层的activation function是identical的，就是输入是啥输出就是啥，不用做改变

#linout=F output的activation function是logistic，输出值要变成逻辑变量

#默认值hide和output都是logistic

#### Exercise 4 ####

#Fit a single-layer feed-forward neural network using nnet to

#Report on fitted values.

#使用 nnet 拟合单层前馈神经网络

#报告拟合值。

rm(list=ls())

library(caret)

library(neuralnet)

library(nnet)

library(ISLR)

#set up the data (take a subset of the Hitters dataset)设置数据（获取 Hitters 数据集的子集）

dat = na.omit(Hitters) #返回删除NA后的向量a 因为该数据里有缺失值

#is.na(Hitters)

n = nrow(dat)

NC = ncol(dat)

# Then try again after normalizing the response variable to [0,1]:将响应变量归一化为 [0,1]

dats = dat

dats$Salary = (dat$Salary-min(dat$Salary)) / diff(range(dat$Salary))

# train neural net

itrain = sample(1:n, round(.7*n), replace=FALSE)

dat.train = dat[itrain,]

dats.train = dats[itrain,]

dat.test = dat[-itrain,]

dats.test = dats[-itrain,]

#data Salary which is not scaled: do not work

#归一化前 dat是归一化前，dats是归一化后

nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1))

summary(nno$fitted.values)

#data Salary is scaled, but no regularization: do not work either

#归一化后

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0))

summary(nno.s$fitted.values)

#data Salary is scaled, and also have regularization progress: works!

#归一化后

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1))

summary(nno.s$fitted.values)

#Our last attempt above was a success.

#But we should be able to get a proper fit even for decay=0...

#what's going on? Can you get it to work?

#改进！添加系数linout=1

#(A1) Well, it's one of these small details in how you call a function;

#here we have to specify 'linout=1' because we are considering a regression problem

#for regression problem: class k = 1

#data Salary which is not scaled:

set.seed(4061)

nno = nnet(Salary~., data=dat.train, size=10, decay=c(0.1), linout=1)

summary(nno$fitted.values)

#data Salary which is scaled, but with no decay:

set.seed(4061)

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0), linout=1)

summary(nno.s$fitted.values)

#data Salary which is scaled, and also with decay:

set.seed(4061)

nno.s = nnet(Salary~., data=dats.train, size=10, decay=c(0.1), linout=1)

summary(nno.s$fitted.values)

#改进！写function

# (A2) but let's do the whole thing again more cleanly...

# 重新编码和放缩数据对结果影响的比较

# re-encode and scale dataset properly 正确重新编码和缩放数据集

myrecode <- function(x){

# function recoding levels into numerical values

#函数将级别重新编码为数值

if(is.factor(x)){

levels(x)

return(as.numeric(x))

} else {

return(x)

}

myscale <- function(x){

# function applying normalization to [0,1] scale

#对 [0,1] 尺度应用归一化的函数

minx = min(x,na.rm=TRUE)

maxx = max(x,na.rm=TRUE)

return((x-minx)/(maxx-minx))

}

datss = data.frame(lapply(dat,myrecode))

datss = data.frame(lapply(datss,myscale))

# replicate same train-test split:

#复制相同的训练测试拆分：

datss.train = datss[itrain,]

datss.test = datss[-itrain,]

nno.ss.check = nnet(Salary~., data=datss.train, size=10, decay=0, linout=1)

summary(nno.ss.check$fitted.values)

# use same scaled data but with decay as before:

#使用相同的缩放数据，但与以前一样衰减：

nno.ss = nnet(Salary~., data=datss.train, size=10, decay=c(0.1), linout=1)

summary(nno.ss$fitted.values)

# evaluate on test data (with same decay for both models):

#评估测试数据（两个模型的衰减相同）：

datss.test$Salary - dats.test$Salary

pred.s = predict(nno.s, newdata=dats.test)

pred.ss = predict(nno.ss, newdata=datss.test)

mean((dats.test$Salary-pred.s)^2)

mean((datss.test$Salary-pred.ss)^2)

#Feed-forward neural network(FFNN)

#• Single or multiplelayers 单层或多层

#• Forward propagationonly 仅前向传播

#• Number of layers determines function complexity 层数决定功能复杂度

#• Typically uses a nonlinear activation function 通常使用非线性激活函数

#• Some definitions specify a unique hidden layer, others allow any number of layers一些定义指定一个唯一的隐藏层，其他定义允许任意数量的层

#Multilayer Perceptron(MLP)

#Recurrent neural network(RNN)

#Long short-term memory neural network(LSTMNN)

#Convolutional neural network(CNN)

#### Exercise 6: Olden index #####

#使用 NeuralNetTools::olden() 计算以下数据集的变量重要性，

#每次拟合一个 7 神经元单层 FFNN (nnet)：

#1. 鸢尾花数据集（使用全套）；

#2. 波士顿数据集

#。与从随机森林获得的变量重要性评估进行比较。

rm(list=ls())

library(nnet)

library(NeuralNetTools)

library(randomForest)

library(MASS)

myscale <- function(x){

minx = min(x,na.rm=TRUE)

maxx = max(x,na.rm=TRUE)

return((x-minx)/(maxx-minx))

}

# (1) Iris data

# shuffle dataset...

set.seed(4061)

n = nrow(iris)

dat = iris[sample(1:n),]

# rescale predictors...

dat[,1:4] = myscale(dat[,1:4])

# fit Feed-Forward Neural Network...

set.seed(4061)

nno = nnet(Species~., data=dat, size=c(7), linout=FALSE, entropy=TRUE)

pis = nno$fitted.values

matplot(pis, col=c(1,2,4), pch=20)

y.hat = apply(pis, 1, which.max) # fitted values

table(y.hat, dat$Species)

# compute variable importance...

#神经网络中输入变量的相对重要性作为原始输入隐藏、隐藏输出连接权重的乘积之和

vimp.setosa = olden(nno, out_var='setosa', bar_plot=FALSE)

vimp.virginica = olden(nno, out_var='virginica', bar_plot=FALSE)

vimp.versicolor = olden(nno, out_var='versicolor', bar_plot=FALSE)

names(vimp.setosa)

par(mfrow=c(1,2))

plot(iris[,3:4], pch=20, col=c(1,2,4)[iris$Species], cex=2)

plot(iris[,c(1,3)], pch=20, col=c(1,2,4)[iris$Species], cex=2)

dev.new()

plot(olden(nno, out_var='setosa'))

plot(olden(nno, out_var='virginica'))

plot(olden(nno, out_var='versicolor'))

v.imp = cbind(vimp.setosa$importance, vimp.virginica$importance, vimp.versicolor$importance)

rownames(v.imp) = names(dat)[1:4]

colnames(v.imp) = levels(dat$Species)

(v.imp)

#正负值代表positive 还是nagative effect

#绝对值越大，自变量对因变量的影响越大

# fit RF...

set.seed(4061)

rfo = randomForest(Species~., data=dat, ntrees=1000)

rfo$importance

# how can we compare variable importance assessments?

cbind(apply(v.imp, 1, sum), #所有自变量放在一起对y的影响重要性（有抵消）

apply(abs(v.imp), 1, sum), #取绝对值

rfo$importance)

#三个值都大的自变量have overall contribution

# (2) Boston data

set.seed(4061)

n = nrow(Boston)

dat = Boston[sample(1:n),]

# rescale predictors...

dats = myscale(dat)

dats$medv = dat$medv/50

set.seed(4061)

nno = nnet(medv~., data=dats, size=7, linout=1)

y.hat = nno$fitted.values

plot(y.hat*50, dat$medv)#从图中看准确度

mean((y.hat*50-dat$medv)^2)#偏差

v.imp = olden(nno, bar_plot=FALSE)

plot(v.imp)

# fit RF...里面的重要性，跟神经网络拟合得到的重要性进行比较

set.seed(4061)

rfo = randomForest(medv~., data=dat, ntrees=1000)

rfo$importance

# how can we compare variable importance assessments?我们如何比较变量重要性评估？

cbind(v.imp, rfo$importance)

round(cbind(v.imp/sum(abs(v.imp)),

rfo$importance/sum(rfo$importance)),3)*100 #重要性的百分比

#一些变量两边数都大，突出standout

#一些变量差距两边很大

# should we use absolute values of Olden's index?应该使用奥尔登指数的绝对值

par(mfrow=c(2,1))

barplot(abs(v.imp[,1]), main="importance from NN",

names=rownames(v.imp), las=2)

barplot(rfo$importance[,1], main="importance from RF",

names=rownames(v.imp), las=2)

# 作图比较

# or possibly normalize across all values for proportional contribution?

par(mfrow=c(2,1))

NNN = sum(abs(v.imp[,1]))

NRF = sum(abs(rfo$importance[,1]))

barplot(abs(v.imp[,1])/NNN, main="importance from NN",

names=rownames(v.imp), las=2)

barplot(rfo$importance[,1]/NRF, main="importance from RF",

names=rownames(v.imp), las=2)

# 把上面两个图合在一起looks alright... now make it a nicer comparative plot :)

par(font=2, font.axis=2)

imps = rbind(NN=abs(v.imp[,1])/NNN, RF=rfo$importance[,1]/NRF)

cols = c('cyan','pink')

barplot(imps, names=colnames(imps), las=2, beside=TRUE,

col=cols,

ylab="relative importance (%)",

main="Variable importance from NN and RF")

legend("topleft", legend=c('NN','RF'), col=cols, bty='n', pch=15)

####Section 7 Feature Selection ####

library(ISLR)

library(leaps) # contains regsubsets()

#查看哪些变量重要，那些变量不需要

####Exercise 1: best subset selection ####

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

# (1) 执行最佳子集选择

reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")

# method=c("exhaustive","backward", "forward", "seqrep"),

# method="exhaustive"穷举是最优的

#（2）解释标准 regsubsets() 输出。

names(summary(reg.full))

summary(reg.full) #有*表示要这个变量；没*就不要这个变量

summary(reg.full)$which # 追踪穷举过程

summary(reg.full)$outmat # 追踪穷举过程

#举例最后的结果是：

#y~x1+x5

#y~x2+x3+x4

# (3)将 RSS 绘制为变量数量的函数

RSS = summary(reg.full)$rss

plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS')#残差平方和

# 随着Number of covariates的增大，RSS依旧不能趋于稳定，那么我们要换adjusted R square调整后的 r 平方作为判断标准

R2adj = summary(reg.full)$adjr2 #调整后的 r 平方

plot(R2adj, pch=20, t='b', xlab="Number of covariates",

ylab='Adjusted R^2')

#r 相关系数

#增加自变量的个数时，判定系数就会增加，

#即随着自变量的增多，R平方会越来越大，

#会显得回归模型精度很高，有较好的拟合效果。而实际上可能并非如此，

#有些自变量与因变量（即预测）完全不相关，

#增加这些自变量，并不会提升拟合水平和预测精度。为避免这种现象

#因此需要调整后的R平方

# （4）将最终模型的所需大小增加到 19，并找出哪个子模型有：

#最小的 RSS

#最高调整后的 R2

reg.full = regsubsets(Salary~., data=Hitters, nvmax=19)

par(mfrow=c(1,2))

RSS = summary(reg.full)$rss

plot(RSS, pch=20, t='b', xlab="Number of covariates", ylab='RSS')

#到10的时候才开始趋于稳定，RSS不好

R2 = summary(reg.full)$rsq #调整前的 r 平方

R2adj = summary(reg.full)$adjr2 #调整后的 r 平方

plot(R2, pch=20, t='b', xlab="Number of covariates",

ylab='Original and adjusted R^2')

points(R2adj, col=4, pch=15, t='b')

# 找到最优值，也就是最好有几个变量

R2adj.index = which.max(R2adj)

# 把这个值画在图里

abline(v = R2adj.index)

#11个变量

#提取找到对应的模型：

summary(reg.full)$outmat[R2adj.index,]

mod = summary(reg.full)$which[R2adj.index,]#提取需要的

names(which(mod))

# 画对应的heatmap:

plot(reg.full, scale="adjr2")

summary(reg.full)$outmat

# 与上面的追踪相对应

#0.5为调整后的R平方的临界值，

#如果调整后的R平方小于0.5，则要分析我们所采用和未采用的自变量。

#如果调整后的R平方与R平方存在明显差异，

#则意味着所用的自变量不能很好的测算因变量的变化，

#或者是遗漏了一些可用的自变量

#调整后的R平方与R平方间差距越大，模型的拟合越差。

#### Exercise 2: fwd/bwd subset selection ####

#1. 使用 jumps::regsubsets() 执行向前和向后选择。

#2. 比较 RSS 和 Adjusted R2 w.r.t 的图。模型尺寸。

#3. 比较通过每种方法选择的 4 变量模型中的协变量。

#4. 以任何其他方式探索最终的模型选择

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

#（1）使用两种方法

reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")

reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")

#（2）比较

par(mfrow=c(1,2))

#画rss图

plot(summary(reg.bwd)$rss, t='b', pch=20, cex=1.5)

points(summary(reg.fwd)$rss, t='b', pch=15, col=2)

#画adjr2图

plot(summary(reg.bwd)$adjr2, t='b', pch=20, cex=1.5)

points(summary(reg.fwd)$adjr2, t='b', pch=15, col=2)

#最优值

R2adj.fwd.index = which.max(summary(reg.fwd)$adjr2)

R2adj.bwd.index = which.max(summary(reg.bwd)$adjr2)

abline(v=c(R2adj.bwd.index, R2adj.fwd.index), col=c(1,2))

#（3）找每个模型中有4个变量的模型

# 4-variable models:

#id:变量个数

coef(reg.fwd, id=4)

coef(reg.bwd, id=4)

# 但最好的还是"exhaustive"

reg.full = regsubsets(Salary~., data=Hitters, method="exhaustive")

names(which(summary(reg.full)$which[4,]==TRUE))

# 从反向消除过程中提取最优模型：

coef(reg.bwd, id=R2adj.bwd.index)

#### Exercise 3: generate predictions from regsubsets() output?####

#但是在Exercise 5里，加了两行代码就能用了

rm(list=ls())

dat = na.omit(Hitters)

n = nrow(dat)

itrain = sample(1:n, 150)

reg.fwd = regsubsets(Salary~., data=dat, nvmax=10,

method="forward", subset=itrain)

#predict(reg.fwd, newdata=dat[-itrain,])

#predict不能用，得自己写predict

beta.hat = coef(reg.fwd, id=4)

# create matrix X:

test.dat = model.matrix(Salary~., data = dat[-itrain,])

# get the test data matrix:

Xtest = test.dat[,names(beta.hat)]

# compute (X Beta^T) as in Y = (X Beta^t) + Epsilon:

pred = Xtest %*% beta.hat

pred = as.numeric(pred) # make this a vector instead

# compute prediction RMSE:

sqrt( mean((pred - dat$Salary[-itrain])^2) )

####Exercise 4: fit and predict using stats::step() ####

#使用 R 的基本包 stats 中的函数 step() 对完整的 Hitters 数据集执行逐步选择（一旦删除了缺失值的观察，如练习 2 中所示）。

#1. 比较 step() 和 regsubsets() 的后向选择输出。

#2. 使用 step() 引用通过反向选择选择的最终模型的模型拟合摘要。

#step : 在逐步算法中通过 AIC 选择模型

rm(list=ls())

Hitters = na.omit(Hitters)

dim(Hitters)

# 从 regsubsets() 中逐步选择：

reg.fwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="forward")

reg.bwd = regsubsets(Salary~., data=Hitters, nvmax=19, method="backward")

# 来自 step() 的逐步选择：

lm.out = lm(Salary~., data=Hitters) # full model

step.bth = step(lm.out, direction="both")#默认方法

step.fwd = step(lm.out, direction="forward")

step.bck = step(lm.out, direction="backward")

# compare backward selections from regsubsets() and step():

# ... from step()...

coef(step.bck)

length(coef(step.bck))

summary(step.bck)

# Nice: we get the model directly! No need to reconstruct

# fitted values by hand this time!

# ... from regsubsets()...

i.opt = which.min(summary(reg.bwd)$bic)

i.set.opt = summary(reg.bwd)$which[i.opt,]

summary(reg.bwd)$which[i.opt, i.set.opt]

# Different models, but smaller one is included in larger one.

# Difference is likely due to using different criteria

# (AIC v BIC, BIC yielding a smaller model).

# NB: we can also assess feature contributions in terms

# of magnitude of their effect:

coefs = abs(coef(step.fwd))/sum(abs(coef(step.fwd)), na.rm=TRUE)*100

coefs = coefs[-1]

coefs[order(abs(coefs), decreasing=TRUE)]

#从大到小每个变量占重要性（1）的比率，后面应该加 %

####Exercise 5: generate predictions from step() output? ####

rm(list=ls())

dat = na.omit(Hitters)

n = nrow(dat)

itrain = sample(1:n, 150)

lmo = lm(Salary~., data=dat, subset=itrain)#这是新添加的

reg.fwd = step(lmo, direction="forward")#这是新添加的

pred = predict(reg.fwd, newdata=dat[-itrain,])

sqrt(mean((pred - dat$Salary[-itrain])^2))

#### 带插入符号的逐步（逻辑）回归 #####

rm(list=ls())

dat = na.omit(Hitters)

n = nrow(dat)

dat = dat[sample(1:n,n),]

dat$y = as.factor(dat$Salary>mean(dat$Salary))#是否大于均值

dat$Salary = NULL

levels(dat$y) = c("low","hi")

# 分层拆分为训练+测试数据集

itrain = createDataPartition(dat$y, p=.75, times=1)[[1]]

dtrain = dat[itrain,]

dtest = dat[-itrain,]

trC = trainControl(method="cv", number=5,

savePredictions = TRUE,

classProbs = TRUE)

co = train(y~., data=dtrain, method='glmStepAIC',

trControl=trC, distribution='binomial')

summary(co$finalModel)

names(co)

preds <- predict(co, dtest)

probs <- predict(co, dtest, type="prob")

table(dtest$y,preds)

pd = data.frame(obs=dtest$y,pred=preds,low=probs$low)

twoClassSummary(pd, lev=levels(dtest$y))

#### on simulated data ####

rm(list=ls())

library(glmnet)

library(leaps)

library(caret)

# We start by creating a dummy data set, so that we know which

# features actually make up the observations. Here a linear

# combination of nonlinear expression of 3 features are used

# to create observations Y.

# The 3 features are:

# - education level

# - number of years of experience

# - some employee rating assessed by their company

# The response variable Y is salary.

# Another 20 features containing random noise are also created

# and added to the dataset. They should not be selected by our

# model selection method...

n = 500 # desired sample size

# level of education (1=secondary level, 2=BSc, 3=MSc, 4=PhD/MBA):

edu = sample(c(1:4), size=n, prob=c(.05,.5,.35,.1), replace=TRUE)

# nbr years experience:

yex = rpois(n=n, lambda=6)

# some obscure employee rating made up by the company:

ert = pmin(5, pmax(0, 5-rexp(n=n, rate=2)))

# employee salary (response variable):

sal = 2*exp(.15*yex) + 3.2*log(edu) + 4*ert

par(mfrow=c(2,2))

plot(factor(edu), main="education")

hist(yex, main="years experience")

hist(ert, main="employee rating")

hist(sal, main="Salaries")

par(mfrow=c(1,3), pch=20)

boxplot(sal~factor(edu), main="salary wrt\n education")

plot(yex, sal, main="salary v\n years experience")

plot(ert, sal, main="salary v\n employee rating")

# now make up some dummy features...

# we don't bother changes scales/means since we will

# be normalizing these features...

p = 20

xtra = matrix(rnorm(n*p), ncol=p, nrow=n)

colnames(xtra) = paste("X",c(1:p),sep="")

par(mfrow=c(4,5), pch=20, mar=c(1,1,1,1))

for(j in 1:p){

plot(xtra[,j], sal, main=paste(j))

}

# the data frame(s):

features = data.frame(edu,yex,ert,xtra)

dat = data.frame(sal,features) # may be more convenient sometimes

# train-test split:

i.train = sample(1:n, size=300, replace=FALSE)

# line up data in several formats for convenience:

dat.train = dat[i.train,]

dat.test = dat[-i.train,]

x.train = features[i.train,]

y.train = sal[i.train]

x.test = features[-i.train,]

y.test = sal[-i.train]

# not forgetting matrix forms for the likes of glmnet:

xm = model.matrix(sal~.,data=features)[,-1]

xm.train = xm[i.train,]

xm.test = xm[-i.train,]

#### check out what LASSO would tell us ####

lasso.cv = cv.glmnet(xm.train, y.train)

lasso = glmnet(xm.train, y.train, lambda=lasso.cv$lambda.min)

coef(lasso)

# c.lasso = caret::train(x.train, y.train, method="glmnet")

# how about cross-validating this?

K = 10

folds = cut(1:n, breaks=K, labels=FALSE)

sel = matrix(0, nrow=K, ncol=ncol(features))

colnames(sel) = names(features)

for(k in 1:K){

itr = which(folds!=k)

lasso.cv = cv.glmnet(xm[itr,], sal[itr])

lasso = glmnet(xm[itr,], sal[itr], lambda=lasso.cv$lambda.min)

isel = which(coef(lasso)[-1] != 0)

sel[k,isel] = 1

}

apply(sel,2,mean)*100

# LASSO thinks X1 and X14, for example, are important...

# We'd be better off increasing the regularization parameter,

# e.g. using lasso.cv$lambda.min*2 instead (try it!).

#### perform FS with caret::rfe based on linear regression ####

subsets <- c(1:5, 10, 15, 20, ncol(features))

ctrl <- rfeControl(functions = lmFuncs,

method = "cv",

number = 10,

# method = "repeatedcv",

# repeats = 5,

verbose = FALSE)

lm.rfe <- rfe(x.train, y.train,

sizes = subsets,

rfeControl = ctrl)

lm.rfe

# This function has picked the correct subset of features

#### compare with leaps...####

reg.bwd = regsubsets(sal~., data=dat.train,

nvmax=ncol(features))

opt.mod = which.max(summary(reg.bwd)$adjr2)

isel = which(summary(reg.bwd)$which[opt.mod,-1])

isel

# how about cross-validating this?

K = 10

folds = cut(1:n, breaks=K, labels=FALSE)

sel = matrix(0, nrow=K, ncol=ncol(features))

colnames(sel) = names(features)

for(k in 1:K){

itr = which(folds!=k)

reg.bwd = regsubsets(sal~., data=dat[itr,],

nvmax=ncol(features))

opt.mod = which.max(summary(reg.bwd)$adjr2)

isel = which(summary(reg.bwd)$which[opt.mod,-1])

sel[k,isel] = 1

}

apply(sel,2,mean)*100

# X1 and X14, again...

#### perform FS with caret::rfe based on RF ####

subsets <- c(1:5, 10, 15, 20, ncol(features))

ctrl <- rfeControl(functions = rfFuncs,

method = "cv",

number = 10,

# method = "repeatedcv",

# repeats = 5,

verbose = FALSE)

rf.rfe <- rfe(x.train, y.train,

sizes = subsets,

rfeControl = ctrl)

rf.rfe

# worth the wait!

# ST4061 / ST6041

# 2021-2022

# Eric Wolsztynski

# ...

#### Section 8: Unsupervised learning Use(s) of PCA...####

# Use(s) of PCA...

#主成分分析（Principal Component Analysis，PCA），是一种统计方法。只做分类，根据几个X决定Y是哪一类

#通过正交变换将一组可能存在相关性的变量转换为一组线性不相关的变量，转换后的这组变量叫主成分

#k均值聚类算法（k-means clustering algorithm）是一种迭代求解的聚类分析算法，

#其步骤是，预将数据分为K组，则随机选取K个对象作为初始的聚类中心，然后计算每个对象与各个种子聚类中心之间的距离，

#把每个对象分配给距离它最近的聚类中心。聚类中心以及分配给它们的对象就代表一个聚类。每分配一个样本，

#聚类的聚类中心会根据聚类中现有的对象被重新计算。这个过程将不断重复直到满足某个终止条件。

#终止条件可以是没有（或最小数目）对象被重新分配给不同的聚类，没有（或最小数目）聚类中心再发生变化，误差平方和局部最小。

# Section 8: Unsupervised learning

# Use(s) of PCA...

#PCA的原理：don't make decisions based on y but on x's in particular

#PCA的目的：filter out some variables in the original domain.

#PCA的缺点：your features become compulsive features of all input features

#PCA里为什么要scale：

#after scaling，the matrix formed by all x becomes correlation matrix instead of covariance Matrix

#redistribute the information as to maximise the variance

#rearranging data so as to extract most of the information early on.

#pca里的数，及他所占的百分比是怎么来的

#for the matrix formed by all x, every xi has its own enginvalue lambda i,

#the its value is sqrt(lambda i) / sum(sqrt(lambda i))

#因此，我们应该通过标准化来取消数据的大小对其对应的特征值和PCA的影响

#install.packages('fdm2id')

# Section 8: Unsupervised learning

# Use(s) of PCA...

rm(list=ls())

library(caret)

library(pROC)

if(1){

# library(mlbench) #also contains a version of the Ionosphere dataset

# head(Ionosphere)

library(fdm2id)

head(ionosphere)#显示数据的前几行数（不知道有几行）

dat = ionosphere

class(dat[,1])

class(dat[,ncol(dat)])

} else {

require(mlbench)

data(Sonar)

dat = Sonar

}

#要符合PCA的命名逻辑

names(dat) = c(paste("X",1:(ncol(dat)-1),sep=''), "Y")#重命名每列数据，将其变为X,Y

head(dat)

y = dat[,ncol(dat)]

x = dat[,-ncol(dat)]

n = nrow(x)

p = ncol(x)

table(y)/sum(table(y))#看数据里的每种y的占比

apply(x,2,sd)

sapply(x,sd)

sapply(x,mean)

M = cor(x)#correlation

diag(M) = 0

which(abs(M)>.9)

caret::findCorrelation(cor(x))

set.seed(6041)

# itrain = sample(1:n, round(.7*n))

# do stratified sampling instead:分层抽样

itrain = createDataPartition(y, p=.7, times=1)[[1]] #do stratified sampling #做分层抽样#结果和上面那个一样

dtrain = dat[itrain,]

dtest = dat[-itrain,]

# (1) Using logistic regression for prediction...

logo = glm(Y~., dat=dtrain, family='binomial')#想看看数据的logistic regression，没啥用

summary(logo)

logp = predict(logo, newdata=dtest, "response")

summary(logp)

logh = as.factor( levels(y)[(logp>.5)+1] )

# table(logh, dtest$Y)

caret::confusionMatrix(logh, dtest$Y)

roco = pROC::roc(predictor=logp, response=dtest$Y)

roco$auc

ci.auc(roco)# 95% CI of roco$auc

# boxplot(dat$X2~dat$Y)

# (2) Using PCA for prediction...

pca = prcomp(x, scale=TRUE)

plot(pca)

plot(x[,2:3], pch=20)#随便看看两个数据的关系

biplot(pca, col=c(1,2))#有原始数据，有向量

biplot(pca, col=c(0,2))#只有向量，没有原始数据

abline(h=0, v=0, col=8, lty=1)#画坐标轴

j = which(summary(pca)$importance[3,]>.85)[1]####只要PCA里前85%的数据####

names(pca)

xt = pca$x[,1:j]

# create reduced data frame

pca.dat = data.frame(xt,Y=y)

pca.dtrain = pca.dat[itrain,]

pca.dtest = pca.dat[-itrain,]

#把前85%的数据取出来，在做logistics regression

pca.logo = glm(Y~., dat=pca.dtrain, family='binomial')

summary(pca.logo)

pca.logp = predict(pca.logo, newdata=pca.dtest, "response")

pca.logh = as.factor( levels(y)[(pca.logp>.5)+1] )

caret::confusionMatrix(logh, dtest$Y)#PAC前的数据#Accuracy : 0.8173

caret::confusionMatrix(pca.logh, dtest$Y)#Accuracy : 0.7885 #没有变低很多

roco = pROC::roc(predictor=logp, response=dtest$Y)#0.7856

pca.roco = pROC::roc(predictor=pca.logp, response=dtest$Y)#0.8302

roco$auc

pca.roco$auc

ci.auc(roco)

ci.auc(pca.roco)

################################说用caret包，没说干啥，也不分析

set.seed(6041)

trC = trainControl(method="boot", number=10,

# method="cv", number=10,repeat(重复几遍)

savePredictions = "all",

classProbs = TRUE)#bootstrapping 也可以cv

modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo$results

pca.modo = caret::train(xt, y, family='binomial', method="glm", trControl=trC)

pca.modo$results

extract.caret.metrics <- function(co){

# co: caret output

ids = unique(co$pred$Resample)

K = length(ids)

aucs = accs = numeric(K)

for(k in 1:K){

sk = subset(co$pred,Resample==ids[k])

l = co$levels[1]

aucs[k] = roc(response=sk$obs, pred=sk[,l], quiet=TRUE)$auc

tb = table(sk$obs, sk$pred)

accs[k] = sum(diag(tb))/sum(tb)

}

return(list(aucs=aucs, accs=accs))

}

# compare CV AUCs and accuracies:

eo = extract.caret.metrics(modo)

c(mean(eo$aucs), mean(eo$accs))#PCA前AUC和accuracy

pca.eo = extract.caret.metrics(pca.modo)

c(mean(pca.eo$aucs), mean(pca.eo$accs))#PCA后AUC和accuracy

# reporting mean and SE instead:

c(mean(eo$aucs),sd(eo$aucs))#PCA前AUC和accuracy

c(mean(pca.eo$aucs),sd(pca.eo$aucs))#PCA后AUC和accuracy

################################说用caret包，没说干啥，也不分析

# (2) Using PCA to cluster features...聚类

biplot(pca, col=c(0,2))

abline(h=0, v=0, col=8, lty=1)

# reproduce plot of projeted features:

pca.biplot <- function(pca,cols=2,a=1,b=2){

plot(pca$rotation[,c(a,b)],pch='')

abline(h=0,lty=3,col=1,lwd=.5)

abline(v=0,lty=3,col=1,lwd=.5)

nms = row.names(pca$rotation)

text(pca$rotation[,c(a,b)],labels=nms,col=cols)

}

pca.biplot(pca,cols=2,a=1,b=2)#只要向量的点上有向量名字，不要向量线

#loading plot（载荷）

#里面的数表示它作为第i主成分时，其对于平均水平的相对程度，正数表示平均水平以上，负数表示平均水平以下

abline(h=0, v=0, col=8, lty=1)

#如果两个名字太近，don't need them all together，we can cluster on that.聚类

C = 5 #被选作集群的点数 center，也是k

C = 10

x.proj = pca$rotation[,1:2]

#ko = kmeans(pca$rotation[,1:2], centers=5)

# run on first two components 要前两个成分

#take five clusters 采取五个集群

ko = kmeans(x.proj, centers=C)

points(ko$centers, pch=20, cex=3)#看哪C个点被选作集群（选择是随机的）

#how they group up in the projected space, do clustering.

cbind(ko$cluster)#看每个点都被分为了哪个集群

table(ko$cluster)#每个集群都有几个点

# pay attention what you cluster though! 所有PCA都要

ko.all = kmeans(pca$rotation, centers=C)

points(ko.all$centers[,1:2], pch=15, cex=3, col=4)

cbind(ko$cluster, ko.all$cluster) # watch out, cluster numbers are random!

#see the number of clusters OK for each cluster，picking identifying which features are contained in the cluster.

pca.clust.set = numeric(C)

#looking at which picture is the closest to the center

for(ic in 1:C){

fs = which(ko$cluster==ic)

centroid = ko$centers[ic,]

dc = NULL

for(ifs in fs){

dc = c(dc, sqrt(mean((x.proj[ifs,]-centroid)^2)))

}

pca.clust.set[ic] = fs[which.min(dc)]#the closest feature from the centre for each cluster

}

x.ss = x[,pca.clust.set]

set.seed(6041)

pca.cl.modo = caret::train(x.ss, y, family='binomial', method="glm", trControl=trC)

pca.cl.modo$results

pca.cl.eo = extract.caret.metrics(pca.cl.modo)

c(mean(eo$aucs), mean(eo$accs))#auc 和 accuracy

c(mean(pca.eo$aucs), mean(pca.eo$accs))

c(mean(pca.cl.eo$aucs), mean(pca.cl.eo$accs))

# reporting mean and SE instead:

c(mean(eo$aucs),sd(eo$aucs))#AUC及其sd

c(mean(pca.eo$aucs),sd(pca.eo$aucs))

c(mean(pca.cl.eo$aucs),sd(pca.cl.eo$aucs))

#### Another example...in PCA####

library(ISLR)

dim(Hitters)

dat = na.omit(Hitters)

n = nrow(dat)

set.seed(4060)

dat = dat[sample(1:n, n, replace=FALSE),]

i.train = 1:round(.7*n)

dat.train = dat[i.train,]

dat.valid = dat[-i.train,]#test group

# (a) Scaled PCA:

#这里有很多x，这里只选取前6个

set.seed(1)

pca = prcomp(dat.train[,1:6], scale=TRUE)

summary(pca)

# 4 PCs are necessary to capture over 86% of the information

# (b) k-means:

set.seed(1)

ko = kmeans(dat.train[,1:6], 4)#4个集群

table(ko$cluster)

par(font=2, font.axis=2, font.lab=2)

dims = c(1,3) #用x1和x3作图，这里随便谁都行

plot(dat.train[,dims], pch=20, col=c(2,4,3,5)[ko$cluster], cex=2)

#对上面的集群作图

points(ko$centers[,dims], pch='*', col=1, cex=4)

#这里没解释这四个点标出来是干什么的

#根据图感觉应该是每个集群的中心点

#can see clearly that there's 4 groups

# From figure:

# Stars indicate the centroids of each cluster

# AtBat contributes more because of scale effect:

# 从图中：

# 星号表示每个簇的质心

# 由于规模效应，AtBat 贡献更大：（AtBat的sd最大）

sapply(dat.train[,1:6],sd)

# (b) k-means:

set.seed(1)

#对PCA进行k-means

pca.ko = kmeans(pca$x, 4)

table(pca.ko$cluster)

par(font=2, font.axis=2, font.lab=2)

dims = c(1,3)

plot(dat.train[,dims], pch=20, col=c(2,4,3,5)[pca.ko$cluster], cex=2)

#和之前的图差别挺大的，也没分析为啥

###1:02讲完了

#### Section 8: miz Imbalance...####

# Imbalance...

#imbalance 就是当数据里绝大部分都是A，只有很少是B时

#比如No占0.9667，而 Yes 只有0.0333

#很多Yes被识别为No

#Sensitivity和Specificity中有一个数会非常小

#处理imbalance（对数据进行加权，让yes的比例变大）以后

#准确率下降了，但Yes的识别率显著提高

#Sensitivity和Specificity都很大

#Confusion Matrix and Statistics

#Reference

#Prediction No Yes

#No TN FN

#Yes FP TP

#True/FALSE

#Positive/Negative

#FP:failed alarm

#FN:false detection

#Sensitivity:TP/(TP+FN)

#Specificity=TN/(TN+FP)

rm(list=ls())

library(ISLR)

head(Default)

tb = table(Default$default)

tb/sum(tb)

dat = Default

set.seed(4061)

itrain = createDataPartition(dat$default, p=.7, times=1)[[1]]

dtrain = dat[itrain,]

dtest = dat[-itrain,]

trC = trainControl(method="boot", number=50,

# method="cv", number=10,

savePredictions = "all",

classProbs = TRUE)

# (1) Using logistic regression for prediction...

set.seed(6041)

x = dtrain

x$default = NULL

y = dtrain$default

modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo$results

preds = predict(modo, dtest)

confusionMatrix(preds, dtest$default)#很多Yes被识别为No

#Sensitivity和Specificity中有一个数会非常小

confusionMatrix(preds, dtest$default, positive=levels(dtest$default)[2])

# positive=levels(dtest$default)[2]：从positive=yes变成positive=No

#Accuracy和matrix不变，Sensitivity和Specificity数值置换了

levels(dtest$default)

# (2) Using weighted logistic regression for prediction...

tbt = table(dtrain$default)

ws = rev( as.numeric(tbt/sum(tbt)) )

w = numeric(nrow(dtrain))

l = levels(dtrain$default)

w[which(dtrain$default==l[1])] = ws[1]

w[which(dtrain$default==l[2])] = ws[2]

#modo = caret::train(x, y, family='binomial', method="glm", trControl=trC)

modo.w = caret::train(x, y, family='binomial', method="glm", trControl=trC, weights=w)

modo.w$results

preds.w = predict(modo.w, dtest)

confusionMatrix(preds.w, dtest$default, positive=levels(dtest$default)[2])

#准确率下降了，但Yes的识别率显著提高

#Sensitivity和Specificity都很大

2022-04-11下

推荐阅读更多精彩内容