局部异常因子算法-LOF
1、计算k-distance of p:点p的第k距离,也就距离p第k远的点的距离,不包括p。
2、计算k-distance neighborhood of p:点p的第k距离邻域,就是p的第k距离以内的所有点,包括第k距离
3、计算reach-distance:可达距离,若小于第k距离,则可达距离为第k距离,若大于第k距离,则可达距离为真实距离
4、计算local reachability density:局部可达密度
5、计算local outlier factor:局部离群因子
K-Means聚类算法
1、 随机选择初始的两个质心
2、 其他点根据离质心的距离远近分类
3、 计算分好的类的平均值,重新赋值给质心
4、 返回2,直到质心不变
#include "stdafx.h"
#include<vector>
#include<algorithm>
#include<iostream>
using namespace std;
typedef vector<vector<int>> Mat;
//定义分类后返回结果,包括两个质心和两个向量
typedef struct{
int c1;
int c2;
vector<int> vect1;
vector<int> vect2;
}classifyResult;
//求向量的最大值
int maxV(const vector<int>& obj){
int max = 0;
for (int i = 0; i < obj.size(); ++i){
if (obj[i]>max)
max = obj[i];
}
return max;
}
//返回最大值索引
int maxIndex(const vector<int>& obj){
int max = 0 , index_ = 0;
for (int i = 0; i < obj.size(); ++i){
if (obj[i]>max){
max = obj[i];
index_ = i;
}
}
return index_;
}
//返回平均值
double meanV(const vector<int>& obj){
double sum = 0;
for (int i = 0; i < obj.size(); ++i){
sum += obj[i];
}
return sum / obj.size();
}
//返回和
int sumV(const vector<int>& obj){
int sum_ = 0;
for (int i = 0; i < obj.size(); ++i){
sum_ += obj[i];
}
return sum_;
}
//计算点p的k-distance
//obj:数据集 p:点p的索引 k:
int calKDistance(const vector<int>& obj,int p, int k){
vector<int> k_near_distance(k, 2147483647);//定义k邻近的向量,存放p点的最近的k个点的距离
//计算k-distance
for (int j = 0; j < obj.size(); ++j){
if (p == j)//如果轮到的点 是p自己,直接进入下一次循环
continue;
if (int tmp = abs(obj[p] - obj[j]) < maxV(k_near_distance))
k_near_distance[maxIndex(k_near_distance)] = tmp;
}
return maxV(k_near_distance);
}
//计算点p的k-distance neghborhood of p
//obj:数据集 p:点p的索引 k:
vector<int> calKDistanceNeghborhood(const vector<int>& obj, int p, int k){
vector<int> k_neighborhood;//定义第k距离邻域,存储邻域点的索引
//由于k距离邻域是不确定的,所有又得循环一边,找出所有小于等于k-distance的点(第k距离邻域是否包含点p?)
//即计算k-distance neghborhood of p
for (int j = 0; j < obj.size(); ++j){
if (p == j)//如果轮到的点 是p自己,直接进入下一次循环
continue;
if (abs(obj[p] - obj[j]) <= calKDistance(obj, p, k))
k_neighborhood.push_back(j);
}
return k_neighborhood;
}
//计算点o到点p的可达距离reach-distance
//obj:数据集 o:点o的索引 p:点p的索引
int calReachDistance(const vector<int>& obj, int p, int o, int k){
int reach_distance = calKDistance(obj, o, k);
if (int tmp = abs(obj[p] - obj[o]) > reach_distance)
reach_distance = tmp;
return reach_distance;
}
//计算p点的局部可达密度
//obj:数据集 p:点p的索引 k:
double calLocalReachabilityDensity(const vector<int>& obj, int p, int k){
double lrdtmp = 0.0;
vector<int> k_neighborhood = calKDistanceNeghborhood(obj, p, k);
for (int i = 0; i < k_neighborhood.size(); ++i)
lrdtmp += calReachDistance(obj, p, k_neighborhood[i], k);
return 1.0 / (lrdtmp / k_neighborhood.size());
}
//数据预处理,LOF算法过滤局部异常因子
void preDealDate(vector<int>& obj,int k){
}
//数据分类
classifyResult classifyVect(vector<int>& obj){
classifyResult result;
if (obj.empty()){
cout << "empty" << endl;
return result;
}
int len = obj.size();
int dist1, dist2;//定义到两个质心的两个距离
//排序
sort(obj.begin(), obj.end());
//随机选择两个初始质心,为保证选的质心不属于同一类,目前先选最小和最大点
int c1 = obj[0], c2 = obj[len-1];
//如果最大值等于最小值,所有的数都是相同的,结果分成一类就好
if (c1 == c2){
result = { c1, c2, obj, obj };
return result;
}
while (true)
{
vector<int> vect1, vect2;//定义两个向量,分别存储分好的两类数据
int sum1 = 0, sum2 = 0;
//计算其他数到质心的L1-Distance,离谁更近归为哪类
for (int i = 0; i < len; ++i){
dist1 = abs(obj[i] - c1);
dist2 = abs(obj[i] - c2);
if (dist1 <= dist2){//距离c1更近,把该数归为c1类
vect1.push_back(obj[i]);
sum1 += obj[i];
}
else//距离c2更近,把该数归为c2类
{
vect2.push_back(obj[i]);
sum2 += obj[i];
}
}
//分别计算两类的平均值,替换c1,c2
if (c1 == sum1 / vect1.size() && c2 == sum2 / vect2.size()){
result = { c1, c2, vect1, vect2 };
return result;
}
c1 = sum1 / vect1.size();
c2 = sum2 / vect2.size();
}
return result;
}
int main(){
int a[31] = { 1, 2, 4, 7, -1, 3, 2, 5, 7, 8, 0, 5, 3, 2 ,99,87,67,56,78,99,100,97,56,89,98,97,96,95,94,93,190};
//int a[5] = { 1,1,2,0,3 };
vector<int> s(a,a+31);
classifyResult result = classifyVect(s);
vector<int> vect1 = result.vect1;
vector<int> vect2 = result.vect2;
cout << "原始数据是:\n";
for (int i = 0; i < s.size(); ++i)
cout << s[i] << " ";
cout << "\n";
cout << "分成两类后的数据分别是:\n";
cout << "vector1=\n";
for (int i = 0; i < vect1.size(); ++i)
cout << vect1[i] << " ";
cout << "\n";
cout << "vector2=\n";
for (int i = 0; i < vect2.size(); ++i)
cout << vect2[i] << " ";
cout << "\n";
cout << "质心分别是c1=" << result.c1 << " c2=" << result.c2<<"\n";
}