Import the required packages
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV
import sys
sys.path.append("/Users/zhongyaode/AnacondaProjects/sklearn")  # make tester and feature_format importable
import tester
from tester import dump_classifier_and_data
PERF_FORMAT_STRING = "\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\tRecall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
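These format strings mirror the reporting strings in tester.py, which test_classifier fills in below. A quick check of how they render, with made-up numbers:

print PERF_FORMAT_STRING.format(0.85, 0.50, 0.46, 0.48, 0.47, display_precision=5)
print RESULTS_FORMAT_STRING.format(15000, 900, 900, 1050, 12150)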
import pickle
from sklearn.cross_validation import StratifiedShuffleSplit
from feature_format import featureFormat, targetFeatureSplit
from tester import test_classifier
import numpy as np
with open("/Users/zhongyaode/AnacondaProjects/sklearn/final_project_dataset.pkl", "rb") as data_file:  # binary mode for pickle
    data_dict = pickle.load(data_file)
my_dataset = data_dict
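my_dataset maps each person's name to a dict of feature values. A quick look at its shape — a hedged sketch, assuming the standard Udacity Enron dataset layout:

print len(my_dataset)                       # number of records (146 before cleaning)
example = my_dataset[my_dataset.keys()[0]]  # one person's feature dict
for k in sorted(example)[:5]:
    print k, example[k]                     # values are ints, bools ('poi'), or the string 'NaN'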
def gen_features(dataset):
    """
    Generate the feature list.
    :param dataset: dataset
    :return: feature list, ordered so that 'poi', 'salary', 'bonus' come first
    """
    set_features = set()
    # Optionally drop the e-mail message features:
    # list_filter = ['from_poi_to_this_person', 'to_messages', 'email_address',
    #                'shared_receipt_with_poi', 'from_messages', 'from_this_person_to_poi']
    list_filter = []
    count = 0
    for _, features in dataset.items():
        if count:
            set_features = set_features.intersection(set(features.keys()))
        else:
            set_features = set(features.keys())
        count += 1
    set_features = list(set_features)
    for i in list_filter:
        if i in set_features:
            set_features.pop(set_features.index(i))
    # Move 'poi', 'salary' and 'bonus' to the front of the list
    poi = set_features.pop(set_features.index('poi'))
    salary = set_features.pop(set_features.index('salary'))
    bonus = set_features.pop(set_features.index('bonus'))
    set_features.insert(0, poi)
    set_features.insert(1, salary)
    set_features.insert(2, bonus)
    return set_features
features_list=gen_features(my_dataset)
len(features_list)
Check the number of people in the data, the number of features, and the counts of POIs and non-POIs
def data_information(my_dataset):
    people_total = []
    poi_data = []
    for x, y in my_dataset.items():
        people_total.append(x)
        for x1, y1 in y.items():
            if x1 == 'poi':
                poi_data.append(y1)
    poi_total = poi_data.count(True)
    no_poi_total = poi_data.count(False)
    print len(people_total), len(gen_features(my_dataset)) - 1, poi_total, no_poi_total
data_information(my_dataset)
Count how many 'NaN' values each person's record contains
def out_people(my_dataset):
    people = {}
    for x, y in my_dataset.items():
        people[x] = 0
        for x1, y1 in y.items():
            if y1 == 'NaN':
                people[x] += 1
    return sorted(people.items(), key=lambda x: x[1], reverse=True)
out_people(my_dataset)
Count, for each feature, how many values are neither numeric nor boolean (i.e. are the 'NaN' placeholder)
from collections import defaultdict
def check_nan(my_dataset):
    """
    Count the 'NaN' values of every feature and return the features
    sorted by that count, most NaNs first.
    :param my_dataset: dataset
    :return: list of (feature, nan_count) tuples
    """
    dict_nan = defaultdict(int)
    for _, features in my_dataset.items():
        for feature, value in features.items():
            if not isinstance(value, int) and not isinstance(value, bool):  # and feature not in ['salary', 'bonus']:
                dict_nan[feature] += 1
    return sorted(dict_nan.items(), key=lambda x: x[1], reverse=True)
check_nan(my_dataset)
Remove the three records 'THE TRAVEL AGENCY IN THE PARK', 'TOTAL' and 'LOCKHART EUGENE E' (two non-person entries plus one record that is entirely 'NaN')
dele_people = ['THE TRAVEL AGENCY IN THE PARK', 'TOTAL', 'LOCKHART EUGENE E']
for x in dele_people:
    if x in my_dataset:
        my_dataset.pop(x)
data_information(my_dataset)
def new_feature(data):
    # Add a new feature 'bns': the sum of bonus and salary.
    # Combining the two smooths out the imbalance between bonus and salary.
    for name, feature in data.items():
        if isinstance(feature['bonus'], int) and isinstance(feature['salary'], int):
            data[name]['bns'] = feature['bonus'] + feature['salary']
        elif isinstance(feature['salary'], int):
            data[name]['bns'] = feature['salary']
        elif isinstance(feature['bonus'], int):
            data[name]['bns'] = feature['bonus']
        else:
            data[name]['bns'] = 0
    return data
my_dataset=new_feature(my_dataset)
data_information(my_dataset)
def new_feature(data):
    # Add a new feature 'poi_messages': from_poi_to_this_person divided by
    # from_messages, i.e. the share of received mail that came from a POI.
    for name, feature in data.items():
        if isinstance(feature['from_poi_to_this_person'], int) and isinstance(feature['from_messages'], int):
            # float() avoids Python 2 integer division truncating the ratio to 0
            data[name]['poi_messages'] = float(feature['from_poi_to_this_person']) / feature['from_messages']
        elif isinstance(feature['from_messages'], int):
            # fall back to whichever count exists
            data[name]['poi_messages'] = feature['from_messages']
        elif isinstance(feature['from_poi_to_this_person'], int):
            data[name]['poi_messages'] = feature['from_poi_to_this_person']
        else:
            data[name]['poi_messages'] = 0
    return data
my_dataset=new_feature(my_dataset)
data_information(my_dataset)
from collections import defaultdict
# Remove the two features with the most NaN values
def check_nan(my_dataset, n=2):
    """
    Sort the features by their number of 'NaN' values and remove the
    n features with the most NaNs from the dataset.
    :param my_dataset: dataset
    :param n: how many of the most-NaN features to remove
    :return: the list of removed features
    """
    dict_nan = defaultdict(int)
    for _, features in my_dataset.items():
        for feature, value in features.items():
            if not isinstance(value, int) and not isinstance(value, bool):  # and feature not in ['salary', 'bonus']:
                dict_nan[feature] += 1
    list_sorted = sorted(dict_nan.items(), key=lambda item: item[1], reverse=True)
    list_result = [i[0] for i in list_sorted[:n]]
    for name, _ in my_dataset.items():
        for feature in list_result:
            my_dataset[name].pop(feature)
    return list_result
list_nan=check_nan(my_dataset,n=2)
data_information(my_dataset)
features_list=gen_features(my_dataset)
len(features_list)
import numpy as np
from feature_format import featureFormat, targetFeatureSplit
data=featureFormat(my_dataset,features_list)
labels,feature=targetFeatureSplit(data)
Feature scaling
from sklearn.preprocessing import MinMaxScaler, Normalizer
feature = MinMaxScaler().fit_transform(feature)
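MinMaxScaler rescales every feature column to [0, 1] via x' = (x - min) / (max - min), so a large-range feature such as bonus cannot dominate. A tiny sketch with made-up numbers:

import numpy as np
from sklearn.preprocessing import MinMaxScaler
demo = np.array([[1., 200.], [2., 400.], [4., 1000.]])
print MinMaxScaler().fit_transform(demo)
# column-wise result: [[0., 0.], [0.333, 0.25], [1., 1.]]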
Use feature_importances_ to gauge how important each feature is
def features_score(feature, labels):
    from sklearn.ensemble import ExtraTreesClassifier
    # Fit an ExtraTreesClassifier and read its feature_importances_
    tree_clf = ExtraTreesClassifier()
    tree_clf = tree_clf.fit(feature, labels)
    tree_scores = tree_clf.feature_importances_
    features_chosen = features_list[1:]  # skip the 'poi' label itself
    a = dict(zip(features_chosen, tree_scores))
    a_features = sorted(a, key=lambda x: a[x])[-4:]  # top 4 features
    b_features = sorted(a, key=lambda x: a[x])[-5:]  # top 5 features
    return a, a_features, b_features
a,a_features,b_features=features_score(feature,labels)
sorted(a.items(),key=lambda x:x[1],reverse=True)
The four highest-scoring features
four_features=sorted(a,key=lambda x:a[x])[-4:]
four_features
four_features.insert(0,'poi')
Cross-validation: because the classes are imbalanced, use StratifiedShuffleSplit to split the data into training and test sets
More_feature=[]
from sklearn.model_selection import StratifiedShuffleSplit
for x in range(1, 10):
    # 5 splits, test proportion 0.25, train proportion 0.75
    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, train_size=0.75, random_state=0)
    for train_indices, test_indices in ss.split(feature, labels):
        features_train = [feature[ii] for ii in train_indices]
        features_test = [feature[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]
        from sklearn.ensemble import ExtraTreesClassifier
        tree_clf = ExtraTreesClassifier()
        tree_clf = tree_clf.fit(features_train, labels_train)
        tree_scores = tree_clf.feature_importances_
        features_chosen = features_list[1:]
        a = dict(zip(features_chosen, tree_scores))
        # Choosing the number of best features: 2-10 were tried; 4 worked best
        tree_best_features = sorted(a, key=lambda x: a[x])[-4:]
        for name, score in a.items():
            if name in tree_best_features:
                print name, score
        # if 'bns' or 'poi_messages' not in tree_best_features:
        #     tree_best_features.insert(1, 'bns')
        #     tree_best_features.insert(2, 'poi_messages')
        tree_best_features.insert(0, 'poi')
        More_feature.append(tree_best_features)
        # print tree_best_features
# f=['poi','exercised_stock_options', 'deferred_income', 'other', 'expenses']
from sklearn import tree
from sklearn import grid_search
# parms={'max_depth':range(5,13)}
# grid=grid_search.GridSearchCV(estimator=DecisionTreeClassifier(),param_grid=parms,scoring='recall')
# grid.fit(features_train,labels_train)
# grid.best_params_
# dump_classifier_and_data(grid,my_dataset,tree_best_features)
# test_classifier(grid,my_dataset,tree_best_features)
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=10)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,tree_best_features)
test_classifier(mode,my_dataset,tree_best_features)
print tree_best_features
Best score with 3 features: Precision = 0.48, Recall = 0.43
pp=['poi', 'exercised_stock_options', 'deferred_income', 'poi_messages']
Best score with 4 features: Precision = 0.50, Recall = 0.46
p=['poi', 'other', 'from_this_person_to_poi', 'expenses', 'exercised_stock_options']
#0.44,0.47
#['poi', 'salary', 'expenses', 'bonus', 'bns', 'exercised_stock_options']
Best score with 5 features: Precision = 0.42, Recall = 0.42
pppp=['poi', 'expenses', 'total_stock_value', 'bns', 'other', 'exercised_stock_options']
d=p+pp+pppp
Here two feature sets are compared: the set p, and the set of features that appear more than once across these top-scoring sets,
to see whether those recurring features beat p.
e = {}
for i in d:
    if d.count(i) > 1:
        e[i] = d.count(i)
best_features = []
for x, y in e.items():
    best_features.append(x)
best_features  # this set's score finally broke 0.4
# Move 'poi' to the front of the list
index = best_features.index('poi')
item = best_features.pop(index)
best_features.insert(0, item)
best_features
mode = tree.DecisionTreeClassifier(criterion='gini', max_depth=10)
mode = mode.fit(features_train, labels_train)
dump_classifier_and_data(mode,my_dataset,p)
test_classifier(mode,my_dataset,p)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
dump_classifier_and_data(clf,my_dataset,p)
test_classifier(clf,my_dataset,p)
mode = tree.DecisionTreeClassifier(criterion='gini', max_depth=10)
mode = mode.fit(features_train, labels_train)
dump_classifier_and_data(mode,my_dataset,pp)
test_classifier(mode,my_dataset,pp)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
dump_classifier_and_data(clf,my_dataset,pp)
test_classifier(clf,my_dataset,pp)
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=10)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,pppp)
test_classifier(mode,my_dataset,pppp)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
dump_classifier_and_data(clf,my_dataset,pppp)
test_classifier(clf,my_dataset,pppp)
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=10)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,best_features)
test_classifier(mode,my_dataset,best_features)
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
dump_classifier_and_data(clf,my_dataset,best_features)
test_classifier(clf,my_dataset,best_features)
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=10)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,four_features)
test_classifier(mode,my_dataset,four_features)
max_depth caps the depth of the tree to guard against overfitting. # Use feature set p and let GridSearchCV tune the parameter automatically
from sklearn import tree
from sklearn import grid_search
parms={'max_depth':range(5,10)}
list_score=['recall','precision']
#scoring='precision'
#Use GridSearchCV to find the best parameters and build the best decision tree
for x in list_score:
    grid = grid_search.GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=parms, scoring=x)
    grid.fit(features_train, labels_train)
    best_param = grid.best_params_
    # Rebuild the decision tree with the tuned max_depth and evaluate it
    best_decision_tree_classifier = tree.DecisionTreeClassifier(criterion='gini', max_depth=best_param['max_depth'])
    best_decision_tree_classifier = best_decision_tree_classifier.fit(features_train, labels_train)
    dump_classifier_and_data(best_decision_tree_classifier, my_dataset, four_features)
    test_classifier(best_decision_tree_classifier, my_dataset, four_features)
Tune the criterion parameter. criterion: 'gini' computes Gini impurity, 'entropy' information gain; gini is the usual default. max_depth is the maximum depth of the tree, adjusted to fit the data better.
from sklearn import tree
#criterion='gini'
for x in range(5, 20):
    print x
    mode = tree.DecisionTreeClassifier(criterion='entropy', max_depth=x)
    mode = mode.fit(features_train, labels_train)
    dump_classifier_and_data(mode, my_dataset, p)
    test_classifier(mode, my_dataset, p)
#With entropy, Recall never got above 0.4, so entropy is dropped in favour of gini
max_depth caps the depth of the tree to guard against overfitting. # Use feature set p and let GridSearchCV tune the parameter automatically
from sklearn import tree
from sklearn import grid_search
parms={'max_depth':range(5,10)}
list_score=['recall','precision']
#scoring='precision'
grid=grid_search.GridSearchCV(estimator=DecisionTreeClassifier(),param_grid=parms)
grid.fit(features_train,labels_train)
print grid.best_params_  # the best parameters found
dump_classifier_and_data(grid,my_dataset,p)
test_classifier(grid,my_dataset,p)
The results are worse than the hand-tuned ones; manual tuning works better here
Select features with cross-validation plus feature_importances_, looping 10 times for the naive Bayes classifier, and pick the best feature set from the runs.
Because the classes are imbalanced, use StratifiedShuffleSplit to split the data into training and test sets.
StratifiedShuffleSplit produces a specified number of independent train/test splits:
the samples are shuffled first, then split into train/test pairs according to the chosen parameters; train on the train part, evaluate on the test part.
Every split preserves the class proportions, which suits imbalanced data.
More_feature=[]
from sklearn.model_selection import StratifiedShuffleSplit
# 5 splits, test proportion 0.25, train proportion 0.75
ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, train_size=0.75, random_state=0)
for x in range(1, 10):
    for train_indices, test_indices in ss.split(feature, labels):
        features_train = [feature[ii] for ii in train_indices]
        features_test = [feature[ii] for ii in test_indices]
        labels_train = [labels[ii] for ii in train_indices]
        labels_test = [labels[ii] for ii in test_indices]
        from sklearn.ensemble import ExtraTreesClassifier
        tree_clf = ExtraTreesClassifier()
        tree_clf = tree_clf.fit(features_train, labels_train)
        tree_scores = tree_clf.feature_importances_
        features_chosen = features_list[1:]
        a = dict(zip(features_chosen, tree_scores))
        tree_best_features = sorted(a, key=lambda x: a[x])[-4:]
        # if 'bns' or 'poi_messages' not in tree_best_features:
        #     tree_best_features.insert(1, 'bns')
        #     tree_best_features.insert(2, 'poi_messages')
        tree_best_features.insert(0, 'poi')
        More_feature.append(tree_best_features)
        # print tree_best_features
# f=['poi','exercised_stock_options', 'deferred_income', 'other', 'expenses']
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import grid_search
# Exploratory: grid-search an SVC for comparison (its result is not used below)
parameters = {"kernel": ("linear", "rbf"), "C": range(1, 10)}
svr = svm.SVC()
sv = grid_search.GridSearchCV(svr, parameters)
sv.fit(features_train, labels_train)
print sv.best_params_
# Evaluate GaussianNB on the selected features
clf = GaussianNB()
clf.fit(features_train, labels_train)
dump_classifier_and_data(clf, my_dataset, tree_best_features)
test_classifier(clf, my_dataset, tree_best_features)
print tree_best_features
Best score with 3 features: Precision = 0.53, Recall = 0.38
NB_P=['poi', 'total_stock_value', 'exercised_stock_options', 'deferred_income']
NB_seven=['poi', 'exercised_stock_options', 'bns', 'long_term_incentive', 'poi_messages', 'salary', 'bonus', 'deferred_income']
Best score with 6 features: Precision = 0.47, Recall = 0.40
NB_six=['poi', 'total_stock_value', 'expenses', 'deferred_income', 'exercised_stock_options', 'long_term_incentive', 'bns']
Best score with 5 features: Precision = 0.49, Recall = 0.39
NB_fives=['poi', 'bns', 'total_stock_value', 'expenses', 'deferred_income', 'exercised_stock_options']
Best score with 4 features: Precision = 0.51, Recall = 0.38
NB_three=['poi', 'exercised_stock_options', 'total_stock_value', 'deferred_income', 'shared_receipt_with_poi']
d=NB_P+NB_seven+NB_six+NB_fives+NB_three
e = {}
for i in d:
    if d.count(i) > 1:
        e[i] = d.count(i)
best_features_NB = []
for x, y in e.items():
    best_features_NB.append(x)
best_features_NB  # this set's score finally broke 0.4
# Move 'poi' to the front of the list
index = best_features_NB.index('poi')
item = best_features_NB.pop(index)
best_features_NB.insert(0, item)
best_features_NB
f=[]
f.append(NB_P)
f.append(NB_seven)
f.append(NB_six)
f.append(NB_fives)
f.append(NB_three)
f.append(d)
for i in range(len(f)):
    print f[i]
    from sklearn.naive_bayes import GaussianNB
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    dump_classifier_and_data(clf, my_dataset, f[i])
    test_classifier(clf, my_dataset, f[i])
max_depth is the maximum depth of the tree; sweep it from 5 to 20 to fit the data better
from sklearn import tree
#criterion='gini'
for x in range(5, 20):
    print x
    mode = tree.DecisionTreeClassifier(criterion='gini', max_depth=x)
    mode = mode.fit(features_train, labels_train)
    dump_classifier_and_data(mode, my_dataset, p)
    test_classifier(mode, my_dataset, p)
Decision tree classifier with 4 features
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=9)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,p)
test_classifier(mode,my_dataset,p)
print p
Insert the newly created bns feature into the feature list
p.insert(1,'bns')
Check the performance after adding the 'bns' feature
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=10)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,p)
test_classifier(mode,my_dataset,p)
print p
The results with the new feature are not as good as before
p=['poi','other', 'from_this_person_to_poi', 'expenses', 'exercised_stock_options']
Final algorithm. Because the classes are imbalanced, StratifiedShuffleSplit is used to split the data for validation:
it produces a specified number of independent train/test splits,
shuffling the samples first and then splitting them into train/test pairs according to the chosen parameters; train on the train part, evaluate on the test part.
Every split preserves the class proportions, which suits imbalanced data.
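test_classifier from tester.py performs that stratified splitting internally. A simplified sketch of roughly what it does, assuming the course's standard tester.py (the evaluate helper name is mine; the real script uses folds=1000 and random_state=42):

from sklearn.cross_validation import StratifiedShuffleSplit
def evaluate(clf, feats, labs, folds=1000):
    # Aggregate confusion counts over many stratified splits
    tp = fp = fn = tn = 0
    cv = StratifiedShuffleSplit(labs, folds, random_state=42)
    for train_idx, test_idx in cv:
        clf.fit([feats[i] for i in train_idx], [labs[i] for i in train_idx])
        preds = clf.predict([feats[i] for i in test_idx])
        for pred, truth in zip(preds, [labs[i] for i in test_idx]):
            if pred and truth: tp += 1
            elif pred: fp += 1
            elif truth: fn += 1
            else: tn += 1
    # assumes at least one positive prediction and one true positive exist
    return 1.0 * tp / (tp + fp), 1.0 * tp / (tp + fn)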
mode = tree.DecisionTreeClassifier(criterion='gini',max_depth=9)
mode= mode.fit(features_train,labels_train)
dump_classifier_and_data(mode,my_dataset,p)
test_classifier(mode,my_dataset,p)
Accuracy is the ratio of correctly classified samples to all samples.
POIs and non-POIs are heavily imbalanced here: with 18 POIs among 143 records, classifying everyone as non-POI already yields about 125/143 ≈ 87% accuracy, so accuracy alone says little.
Precision and recall are therefore used to judge the classifier.
Precision is the share of retrieved items that are correct: here, (people correctly flagged as POI) / (all people flagged as POI).
Recall is the share of relevant items that are retrieved: here, (people correctly flagged as POI) / (actual number of POIs = 18).
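A worked example with made-up counts, chosen to echo the Precision = 0.50 / Recall = 0.46 reported for feature set p:

tp, fp, fn = 500, 500, 580     # true positives, false positives, false negatives
print 1.0 * tp / (tp + fp)     # precision = 0.50
print 1.0 * tp / (tp + fn)     # recall  ~= 0.46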