20210627 visualization review: k-means clustering
Load the required packages
Explore the GvHD data
pacman::p_load(clusterSim, mlr, mlr3, mlr3cluster, mlr3viz, GGally, mlr3tuning, mlr3learners)
library("tidyverse")
library(mlr)
library(clusterSim)
data(GvHD, package = "mclust")
gvhd1 <- as_tibble(GvHD.control)
gvhd1
gvhd1scale <- gvhd1 %>% scale()
gvhd1scale
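An optional sanity check (my addition, not part of the original notes): after scale(), every column should have a mean of roughly 0 and a standard deviation of 1.
# each scaled column should now be centred at 0 with unit variance
round(colMeans(gvhd1scale), 3)
round(apply(gvhd1scale, 2, sd), 3)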
ggpairs(GvHD.control,
        upper = list(continuous = "density"),
        lower = list(continuous = wrap("points", size = 0.4)),
        diag = list(continuous = "densityDiag"))
Choose k
gvhdTask <- makeClusterTask(data = as.data.frame(gvhd1scale))
List the available clustering learners
listLearners("cluster")$class
kMeansPar <- makeParamSet(
  makeDiscreteParam("centers", values = 3:8),
  makeDiscreteParam("algorithm", values = c("Hartigan-Wong", "Lloyd", "MacQueen")))
kMeansPar
gridSearch <- makeTuneControlGrid()
kFold <- makeResampleDesc("CV", iters = 10)
# the k-means learner itself was never defined in these notes; create it first
# (iter.max and nstart are reasonable choices here, adjust as needed)
kMeans <- makeLearner("cluster.kmeans",
                      par.vals = list(iter.max = 100, nstart = 10))
tunedK <- tuneParams(kMeans, task = gvhdTask,
                     resampling = kFold,
                     par.set = kMeansPar,
                     control = gridSearch,
                     measures = list(db, G1))
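As an optional cross-check (a sketch I added, not part of the original notes), the same internal indices can be computed directly with clusterSim for a single k-means fit; k = 4 below is only an illustrative choice.
# fit one k-means model and score it with clusterSim's indices
set.seed(123)
km4 <- kmeans(gvhd1scale, centers = 4, nstart = 10, iter.max = 100)
index.DB(gvhd1scale, km4$cluster)$DB  # Davies-Bouldin index: lower is better
index.G1(gvhd1scale, km4$cluster)     # Calinski-Harabasz pseudo F (G1): higher is better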
# Compare the algorithms
kMeansTuningData <- generateHyperParsEffectData(tunedK)
kMeansTuningData$data
# pivot the performance columns (db, G1) and exec.time into long format for plotting;
# selecting by name is safer than the original positional indices c(3, 4, 6)
longTuningData <- pivot_longer(kMeansTuningData$data,
                               -c(centers, algorithm, iteration),
                               names_to = "Metric", values_to = "Value")
ggplot(longTuningData, aes(centers, Value, col = algorithm)) +
facet_wrap(~ Metric, scales = "free_y") +
geom_line()
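Another optional way to choose k (my addition, not in the original notes) is the classic elbow plot of total within-cluster sum of squares against k, using base kmeans() directly.
# elbow sketch: total within-cluster sum of squares for k = 3..8
set.seed(123)
elbow <- tibble(k = 3:8,
                tot_withinss = map_dbl(3:8, ~ kmeans(gvhd1scale, centers = .x,
                                                     nstart = 10, iter.max = 100)$tot.withinss))
ggplot(elbow, aes(k, tot_withinss)) +
  geom_line() +
  geom_point()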
# Train the final model
tunedKMeans <- setHyperPars(kMeans, par.vals = tunedK$x)
tunedKMeansModel <- train(tunedKMeans, gvhdTask)
kMeansModelData <- getLearnerModel(tunedKMeansModel)
kMeansModelData$iter
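The object returned by getLearnerModel() is an ordinary stats::kmeans fit, so its other components can be inspected directly (a small addition to the original notes).
kMeansModelData$size          # number of cells assigned to each cluster
kMeansModelData$centers       # cluster centres on the scaled variables
kMeansModelData$tot.withinss  # total within-cluster sum of squares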
gvhd1 <- gvhd1 %>% mutate(
kMeansCluster = as.factor(kMeansModelData$cluster))
ggpairs(gvhd1, aes(col = kMeansCluster),
upper = list(continuous = "density"))
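For a single 2-D view of the clusters (my addition, a sketch rather than part of the original workflow), the scaled data can be projected onto its first two principal components and coloured by cluster.
pca <- prcomp(gvhd1scale)
as_tibble(pca$x[, 1:2]) %>%
  mutate(kMeansCluster = gvhd1$kMeansCluster) %>%
  ggplot(aes(PC1, PC2, col = kMeansCluster)) +
  geom_point(size = 0.5)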
# Predict new data
newCell <- tibble(CD4 = 510, CD8b = 26, CD3 = 500, CD8 = 122) %>%
scale(center = attr(gvhd1scale,"scaled:center"),
scale = attr(gvhd1scale, "scaled:scale")) %>%
as_tibble()
predict(tunedKMeansModel, newdata = newCell)
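A final sanity check (my addition; it assumes, as k-means does, that a new point belongs to its nearest centroid): computing the squared distance from the new cell to each centre by hand should give the same cluster as predict() above.
newPoint <- as.numeric(as.matrix(newCell)[1, ])
dists <- apply(kMeansModelData$centers, 1,
               function(centre) sum((newPoint - centre)^2))
which.min(dists)  # should match the cluster returned by predict()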