library(cluster)
library(factoextra)
# Load the built-in mtcars data set into the workspace.
data("mtcars")

# Standardize the variables (scale() defaults: centre each column and divide
# it by its standard deviation) so no variable dominates the distances.
df <- scale(x = mtcars)

# Pairwise Euclidean distances between the standardized observations.
km.dist <- get_dist(x = df, method = "euclidean")

# Visualize the distance matrix.
fviz_dist(dist.obj = km.dist)
# K-means clustering ----
# Number of clusters, defined once so the elbow-plot marker and the kmeans()
# call below cannot drift apart.
km.k <- 5

# Elbow plot (total within-cluster sum of squares vs. number of clusters) used
# to choose the cluster count. fviz_nbclust() is handed kmeans, which starts
# from random centres, so seed the RNG first to make the curve reproducible.
set.seed(123)
fviz_nbclust(df, FUNcluster = kmeans, method = "wss") +
  geom_vline(xintercept = km.k, linetype = 3, color = "red")

# Re-seed immediately before the final fit so the clustering result does not
# depend on how many random numbers the elbow plot consumed.
set.seed(123)
km.res <- kmeans(df, centers = km.k, nstart = 25)

# Number of observations in each cluster.
table(km.res$cluster)
## 1 2 3 4 5
## 4 12 7 2 7
# Mean of every original (unscaled) variable within each k-means cluster.
aggregate(
  x = mtcars,
  by = list(cluster = km.res$cluster),
  FUN = mean
)
## cluster mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.92500 5.500000 146.3250 121.50000 3.962500 2.601250 16.42000 0 1 4.500000 4.000000
## 2 15.05000 8.000000 357.6167 194.16667 3.120833 4.104083 17.14250 0 0 3.000000 3.083333
## 3 20.74286 5.142857 175.1143 102.14286 3.570000 3.194286 19.96714 1 0 3.571429 2.142857
## 4 15.40000 8.000000 326.0000 299.50000 3.880000 3.370000 14.55000 0 1 5.000000 6.000000
## 5 28.37143 4.000000 89.8000 80.57143 4.148571 2.028286 18.70000 1 1 4.142857 1.428571
# Visualize the k-means clusters on the standardized data.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
km.plot <- fviz_cluster(
  km.res,
  data = df,
  repel = TRUE,  # keep point labels from overlapping
  ggtheme = theme_minimal()
)
km.plot
# Hierarchical clustering ----
# Pairwise Euclidean distances between the standardized observations.
hc.dist <- get_dist(df, method = "euclidean")

# Agglomerative clustering with the ward.D2 linkage. The tree is built from
# hc.dist itself (rather than a second dist(df) call) so that the clustering
# and the cophenetic check below are guaranteed to use the same distances.
hc.res <- hclust(hc.dist, method = "ward.D2")

# Full, uncut dendrogram.
fviz_dend(hc.res, cex = 0.45)

# Cophenetic correlation: agreement between the original distances and the
# distances implied by the tree. Per the original author's rule of thumb,
# a value above 0.75 indicates the tree represents the data well.
hc.coph <- cophenetic(hc.res)
cor(hc.dist, hc.coph)
## [1] 0.7575063
# Cut the tree into five groups with cutree(); the number of groups is the
# analyst's own choice.
grp <- cutree(tree = hc.res, k = 5)

# Number of observations in each group.
table(grp)
## grp
## 1 2 3 4 5
## 3 8 7 12 2
# Mean of every original (unscaled) variable within each hierarchical group.
aggregate(
  x = mtcars,
  by = list(cluster = grp),
  FUN = mean
)
## cluster mpg cyl disp hp drat wt qsec vs am gear carb
## 1 20.56667 6.000000 155.0000 131.6667 3.806667 2.755000 16.32667 0.000 1 4.333333 4.666667
## 2 28.07500 4.000000 93.6125 81.8750 4.183750 2.042250 18.45000 0.875 1 4.250000 1.500000
## 3 20.74286 5.142857 175.1143 102.1429 3.570000 3.194286 19.96714 1.000 0 3.571429 2.142857
## 4 15.05000 8.000000 357.6167 194.1667 3.120833 4.104083 17.14250 0.000 0 3.000000 3.083333
## 5 15.40000 8.000000 326.0000 299.5000 3.880000 3.370000 14.55000 0.000 1 5.000000 6.000000
# Dendrogram cut into five groups, each group outlined by a filled rectangle.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
hc.plot <- fviz_dend(
  hc.res,
  k = 5,
  cex = 0.45,
  rect = TRUE,
  rect_fill = TRUE,
  rect_border = "jco"
)
hc.plot
# Same five-group tree drawn with the "phylogenic" layout.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fviz_dend(
  hc.res,
  k = 5,
  k_colors = "jco",
  type = "phylogenic",
  repel = TRUE  # keep leaf labels from overlapping
)
# Same five-group tree drawn with the "circular" layout.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# NOTE(review): rect may have no visible effect for non-rectangular layouts;
# confirm against the fviz_dend() documentation.
fviz_dend(
  hc.res,
  k = 5,
  cex = 0.5,
  k_colors = "jco",
  rect = TRUE,
  type = "circular"
)
# Cluster plot of the hierarchical grouping: the standardized data and the
# cutree() assignments are passed together as a list, and each group is
# enclosed by a convex outline.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
fviz_cluster(
  list(data = df, cluster = grp),
  repel = TRUE,  # keep point labels from overlapping
  ellipse.type = "convex",
  palette = "Set1",
  ggtheme = theme_minimal()
)