本文比较了基于欧氏距离与基于DTW(动态时间规整)的方法在时间序列聚类和分类上的可靠性。
从抽样的样本中可以发现,基于DTW距离的聚类算法远远好于基于欧氏距离的聚类算法;从分类的角度看,基于DWT(离散小波变换,注意不是DTW)特征提取的决策树分类准确率约为87%,而直接基于原始序列的决策树分类准确率约为81%。显而易见,基于DTW距离的聚类远远优于基于欧氏距离的聚类,而基于DWT特征的分类也优于基于原始序列的分类。
# Data download link:
# http://kdd.ics.uci.edu/databases/synthetic_control/
# Read the data file; it has no header row. FALSE is spelled out because F is
# an ordinary variable that can be reassigned.
sc <- read.table("synthetic_control.data.txt", header = FALSE, sep = "")
# Show one sample from each class. Row ranges per class:
#   1-100    random fluctuation
#   101-200  cyclic
#   201-300  increasing
#   301-400  decreasing
#   401-500  upward shift
#   501-600  downward shift
idx <- c(1, 101, 201, 301, 401, 501)
# Transpose so that each selected series becomes one column for plot.ts().
sample1 <- t(sc[idx, ])
plot.ts(sample1, main = "")  # visualise one series from each class
# Hierarchical clustering with Euclidean distance
set.seed(6218)
n <- 10
# Draw n row offsets inside one class, then reuse the same offsets for all six
# classes (each class occupies a consecutive block of 100 rows).
s <- sample(seq_len(100), size = n)
idx <- rep(seq(0, 500, by = 100), each = n) + s
sample2 <- sc[idx, ]
observedLabels <- rep(seq_len(6), each = n)
# Average-linkage hierarchical clustering on the default dist() metric
hc <- hclust(d = dist(sample2), method = "average")
plot(hc, labels = observedLabels, main = "")
# Cut the tree into 6 clusters and compare them with the true labels
rect.hclust(hc, k = 6)
memb <- cutree(hc, k = 6)
table(observedLabels, memb)
## memb
## observedLabels 1 2 3 4 5 6
## 1 10 0 0 0 0 0
## 2 1 6 2 1 0 0
## 3 0 0 0 0 10 0
## 4 0 0 0 0 0 10
## 5 0 0 0 0 10 0
## 6 0 0 0 0 0 10
###################################################
#基于DTW距离的层次聚类
library(dtw)
## Loading required package: proxy
##
## Attaching package: 'proxy'
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
## Loaded dtw v1.18-1. See ?dtw for help, citation("dtw") for use in publication.
# Hierarchical clustering with DTW distance.
# method = "DTW" relies on the dist() that masks stats::dist once the
# dtw/proxy packages are attached.
distMatrix <- dist(x = sample2, method = "DTW")
hc <- hclust(d = distMatrix, method = "average")
plot(hc, labels = observedLabels, main = "")
# Cut the tree into 6 clusters and compare them with the true labels
rect.hclust(tree = hc, k = 6)
memb <- cutree(tree = hc, k = 6)
table(observedLabels, memb)
## memb
## observedLabels 1 2 3 4 5 6
## 1 10 0 0 0 0 0
## 2 0 7 3 0 0 0
## 3 0 0 0 10 0 0
## 4 0 0 0 0 7 3
## 5 2 0 0 8 0 0
## 6 0 0 0 0 0 10
###################################################
# Classify the time series.
# One class label per row: six classes of 100 consecutive series each.
classId <- rep(as.character(1:6), each = 100)
# ctree() needs a factor response. Since R 4.0.0 data.frame() no longer turns
# strings into factors by default, so convert the column explicitly instead of
# relying on cbind()/data.frame() to do it. classId itself stays character.
newSc <- data.frame(classId = factor(classId), sc)
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# Fit a tree on the raw series values, using every other column of newSc as a
# predictor of classId.
ct <- ctree(
  formula = classId ~ .,
  data = newSc,
  controls = ctree_control(minsplit = 30, minbucket = 10, maxdepth = 5)
)
# No new data is supplied to predict(), only the fitted tree.
pClassId <- predict(ct)
# Cross-tabulate actual class (rows) against predicted class (columns).
table(classId, pClassId)
## pClassId
## classId 1 2 3 4 5 6
## 1 97 0 0 0 0 3
## 2 1 93 2 0 0 4
## 3 0 0 96 0 4 0
## 4 0 0 0 100 0 0
## 5 4 0 10 0 86 0
## 6 0 0 0 87 0 13
# Accuracy: fraction of series whose predicted class equals the actual class.
# mean() of the logical match vector ties the denominator to the vectors being
# compared instead of to the row count of a separate object.
mean(classId == pClassId)
## [1] 0.8083333
# Plot the fitted tree; ip_args / ep_args are passed through to plot()
# (pval = FALSE presumably hides the p-values on inner nodes - confirm).
plot(ct, ip_args = list(pval = FALSE), ep_args = list(digits = 0))
###################################################
#基于DWT(离散小波变换)特征的分类
library(wavelets)
# Extract DWT (discrete wavelet transform) features: one row of Haar
# coefficients per series in sc.
# The rows are produced with lapply() and bound once, instead of growing a
# matrix with rbind() on every loop iteration; seq_len() avoids the 1:0
# pitfall of 1:nrow(sc).
wtData <- do.call(
  rbind,
  lapply(seq_len(nrow(sc)), function(i) {
    a <- t(sc[i, ])
    wt <- dwt(a, filter = "haar", boundary = "periodic")
    # Everything in the W slot plus the V entry at the final level, flattened
    # into one vector.
    unlist(c(wt@W, wt@V[[wt@level]]))
  })
)
wtData <- as.data.frame(wtData)
# ctree() needs a factor response. Since R 4.0.0 data.frame() no longer turns
# strings into factors by default, so convert the column explicitly.
wtSc <- data.frame(classId = factor(classId), wtData)
###################################################
# Build a decision tree on the DWT coefficients, using every other column of
# wtSc as a predictor of classId.
ct <- ctree(
  formula = classId ~ .,
  data = wtSc,
  controls = ctree_control(minsplit = 30, minbucket = 10, maxdepth = 5)
)
# No new data is supplied to predict(), only the fitted tree.
pClassId <- predict(ct)
# Cross-tabulate actual class (rows) against predicted class (columns).
table(classId, pClassId)
## pClassId
## classId 1 2 3 4 5 6
## 1 97 3 0 0 0 0
## 2 1 99 0 0 0 0
## 3 0 0 81 0 19 0
## 4 0 0 0 63 0 37
## 5 0 0 16 0 84 0
## 6 0 0 0 1 0 99
# Accuracy of the tree built on DWT features: fraction of series whose
# predicted class equals the actual class. mean() of the logical match vector
# ties the denominator to the vectors being compared.
mean(classId == pClassId)
## [1] 0.8716667
# Plot the fitted tree; ip_args / ep_args are passed through to plot()
# (pval = FALSE presumably hides the p-values on inner nodes - confirm).
plot(ct, ip_args = list(pval = FALSE), ep_args = list(digits = 0))
# k-nearest-neighbour classification of a new series using DTW distance
set.seed(10)
k <- 20
# Create a new time series by adding uniform noise to time series 501.
# Draw one noise value per time point instead of a hard-coded 100, which only
# matches when each series has exactly 100 points (the UCI data set is
# described as 60 points per series - confirm). With a shorter row the surplus
# draws appear to have been dropped silently, so the values actually added
# should be unchanged.
newTS <- sc[501, ] + runif(ncol(sc)) * 15
# DTW distance from the new series to every series in sc
distances <- dist(newTS, sc, method = "DTW")
# Sort the distances and keep the permutation in s$ix
s <- sort(as.vector(distances), index.return = TRUE)
# Class IDs of the k nearest neighbours
table(classId[s$ix[seq_len(k)]])