The Titanic dataset is a good choice for Kaggle newcomers, and many winning Kaggle teams have published excellent analyses of it.
import numpy as np
import pandas as pd
import re
import sklearn
train_ = pd.read_csv('train.csv')
# test_ = pd.read_csv('test.csv')
print(train_.head())
PassengerId = train_['PassengerId']
print('First five PassengerId values:')
print(PassengerId[:5])
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
First five PassengerId values:
0 1
1 2
2 3
3 4
4 5
Name: PassengerId, dtype: int64
In [3]:
print (train_[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean())
print (train_[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean())
Pclass Survived
0 1 0.629630
1 2 0.472826
2 3 0.242363
Sex Survived
0 female 0.742038
1 male 0.188908
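The same groupby pattern extends to two keys at once; as a quick sketch (not part of the original notebook), the survival rate by class and sex together:
# Survival rate broken down by Pclass and Sex simultaneously (sketch)
print(train_[['Pclass', 'Sex', 'Survived']].groupby(['Pclass', 'Sex'], as_index=False).mean())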
In [4]:
train_['Age'].mean()
Out[4]:
29.69911764705882
Data cleaning
Feature engineering: construct the features we need, and add some features we derive ourselves.
# Compute the length of each name
train_['Name_length'] = train_['Name'].apply(len)
# Binarize whether the passenger has a cabin record (a missing Cabin is NaN, which is a float)
train_['Has_Cabin'] = train_["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
# Derive family size: siblings/spouses plus parents/children plus the passenger
train_['FamilySize'] = train_['SibSp'] + train_['Parch'] + 1
# Flag passengers traveling alone
train_['IsAlone'] = 0
train_.loc[train_['FamilySize'] == 1, 'IsAlone'] = 1
# Fill the missing Embarked values with the most common port, 'S'
train_['Embarked'] = train_['Embarked'].fillna('S')
train_['Fare'] = train_['Fare'].fillna(train_['Fare'].median())
train_['CategoricalFare'] = pd.qcut(train_['Fare'], 4)
# Fill missing ages with random integers drawn from [mean - std, mean + std)
age_avg = train_['Age'].mean()
age_std = train_['Age'].std()
age_null_count = train_['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
# use .loc to avoid pandas' chained-assignment warning
train_.loc[np.isnan(train_['Age']), 'Age'] = age_null_random_list
train_['Age'] = train_['Age'].astype(int)
train_['CategoricalAge'] = pd.cut(train_['Age'], 5)
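The hard-coded Fare and Age cut points used in the mappings further below (7.91, 14.454, 31, and the 16-year age steps) come from these quartiles and bins; a quick check makes that visible (a sketch, not in the original notebook):
# Inspect the bin edges behind the Fare and Age mappings below (sketch)
print(train_['CategoricalFare'].value_counts().sort_index())
print(train_['CategoricalAge'].value_counts().sort_index())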
# Helper to extract a passenger's title from the Name column
def get_title(name):
    title_search = re.search(r'([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''
train_['Title'] = train_['Name'].apply(get_title)
train_['Title'] = train_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train_['Title'] = train_['Title'].replace('Mlle', 'Miss')
train_['Title'] = train_['Title'].replace('Ms', 'Miss')
train_['Title'] = train_['Title'].replace('Mme', 'Mrs')
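Before mapping the titles to integers, it is worth sanity-checking the extraction; a quick crosstab sketch (not in the original notebook):
# Check the consolidated titles against Sex, which is still categorical at this point (sketch)
print(pd.crosstab(train_['Title'], train_['Sex']))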
# Map Sex to integers
train_['Sex'] = train_['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
# Map titles to integers
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
train_['Title'] = train_['Title'].map(title_mapping)
train_['Title'] = train_['Title'].fillna(0)
# Map Embarked to integers
train_['Embarked'] = train_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
# Map Fare into the quartile buckets found above (edges from CategoricalFare)
train_.loc[ train_['Fare'] <= 7.91, 'Fare'] = 0
train_.loc[(train_['Fare'] > 7.91) & (train_['Fare'] <= 14.454), 'Fare'] = 1
train_.loc[(train_['Fare'] > 14.454) & (train_['Fare'] <= 31), 'Fare'] = 2
train_.loc[ train_['Fare'] > 31, 'Fare'] = 3
train_['Fare'] = train_['Fare'].astype(int)
# Map Age into five bins (edges from CategoricalAge)
train_.loc[ train_['Age'] <= 16, 'Age'] = 0
train_.loc[(train_['Age'] > 16) & (train_['Age'] <= 32), 'Age'] = 1
train_.loc[(train_['Age'] > 32) & (train_['Age'] <= 48), 'Age'] = 2
train_.loc[(train_['Age'] > 48) & (train_['Age'] <= 64), 'Age'] = 3
train_.loc[ train_['Age'] > 64, 'Age'] = 4
Feature selection
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp']
train_ = train_.drop(drop_elements, axis = 1)
train_ = train_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)
# test_ = test_.drop(drop_elements, axis = 1)
In [7]:
train_.head()
Out[7]:
Survived Pclass Sex Age Parch Fare Embarked Name_length Has_Cabin FamilySize IsAlone Title
0 0 3 1 1 0 0 0 23 0 2 0 1
1 1 1 0 2 0 3 1 51 1 2 0 3
2 1 3 0 1 0 1 0 22 0 1 1 2
3 1 1 0 2 0 3 0 44 1 2 0 3
4 0 3 1 2 0 1 0 24 0 1 1 1
Visualization
Pearson Correlation Heatmap
# Visualize the correlations between features
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
colormap = plt.cm.viridis
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(train_.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
1. The plot shows that there are not many strongly correlated features. That is good news for our models: it means the training data carries little redundant information, and each feature contributes an independent signal.
2. The two most strongly correlated features are FamilySize and Parch, which is expected, since FamilySize is derived from Parch.
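To check observation 2 numerically, a small sketch (not in the original notebook) that finds the most strongly correlated feature pair:
# Locate the largest off-diagonal correlation (sketch)
corr = train_.astype(float).corr().abs()
np.fill_diagonal(corr.values, 0)  # ignore each feature's correlation with itself
print(corr.unstack().idxmax(), corr.unstack().max())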
Finally, let's generate a pairplot to observe how each feature relates to the others.
g = sns.pairplot(train_[[u'Survived', u'Pclass', u'Sex', u'Age', u'Parch', u'Fare', u'Embarked',
                         u'FamilySize', u'Title']], hue='Survived', palette='seismic', size=1.2,
                 diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
# note: newer seaborn versions use height= in place of size=
g.set(xticklabels=[])
Using models to rank feature importance
Here we use random forest (and three other tree ensembles below) to obtain each feature's importance.
In [60]:
import sklearn
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in newer scikit-learn
# Use an object-oriented approach: a small Python class lets us create multiple
# instances, which is convenient for building and training several models later.
# The SklearnHelper class below lets us extend sklearn's built-in estimators.
ntrain = train_.shape[0]
SEED = 0 # reused as the random seed for every model instance below
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED) # newer KFold API takes the fold count, not the row count
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        # prints the fitted importances; note that it does not return them
        print(self.clf.fit(x, y).feature_importances_)
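The kf defined above exists for out-of-fold prediction. As a minimal sketch of how it pairs with SklearnHelper (the standard stacking pattern, using the newer KFold.split API; not part of this section's original code):
def get_oof(clf, x_train, y_train):
    # Each row is predicted by a model trained on folds that exclude it
    oof_train = np.zeros((ntrain,))
    for train_index, test_index in kf.split(x_train):
        clf.train(x_train[train_index], y_train[train_index])
        oof_train[test_index] = clf.predict(x_train[test_index])
    return oof_train.reshape(-1, 1)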
There are 5 models here, all from the sklearn toolkit:
# Random Forest classifier
# Extra Trees classifier
# AdaBoost classifer
# Gradient Boosting classifer
# Support Vector Machine
Parameters
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': True,
    #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}
# Extra Trees parameters
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}
# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}
# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
    #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}
# Support Vector Classifier parameters
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}
And we create 5 objects to train the different models:
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
Create the NumPy arrays used for training and testing
y_train = train_['Survived'].ravel() # use Survived as the classification label
train = train_.drop(['Survived'], axis=1) # drop Survived from the features
x_train = train.values # take values from `train`, not `train_`, so the label does not leak into the features
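Before looking at importances, a cheap sanity check of one configuration with cross-validated accuracy; a sketch (not in the original notebook) using sklearn's cross_val_score:
from sklearn.model_selection import cross_val_score
# 5-fold accuracy of the random forest configuration defined above (sketch)
scores = cross_val_score(RandomForestClassifier(**rf_params), x_train, y_train, cv=5)
print(scores.mean(), scores.std())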
Get the feature importances from the different classifiers
# .feature_importances_ gives each feature's importance weight
# these helpers print their importances (and return None), so no assignment is needed
rf.feature_importances(x_train, y_train)
et.feature_importances(x_train, y_train)
ada.feature_importances(x_train, y_train)
gb.feature_importances(x_train, y_train)
Each call prints an array of 11 importances, one per feature. Because feature_importances prints the arrays rather than returning them, copy the printed values into plain lists:
rf_features = [0.10474135, 0.21837029, 0.04432652, 0.02249159, 0.05432591, 0.02854371,
               0.07570305, 0.01088129, 0.24247496, 0.13685733, 0.06128402]
et_features = [0.12165657, 0.37098307, 0.03129623, 0.01591611, 0.05525811, 0.028157,
               0.04589793, 0.02030357, 0.17289562, 0.04853517, 0.08910063]
ada_features = [0.028, 0.008, 0.012, 0.05866667, 0.032, 0.008,
                0.04666667, 0.0, 0.05733333, 0.73866667, 0.01066667]
gb_features = [0.06796144, 0.03889349, 0.07237845, 0.02628645, 0.11194395, 0.04778854,
               0.05965792, 0.02774745, 0.07462718, 0.4593142, 0.01340093]
Create a DataFrame of the feature importances for visualization
cols = train.columns.values
print(cols)
feature_dataframe = pd.DataFrame( {'features': cols,
'Random Forest feature importances': rf_features,
'Extra Trees feature importances': et_features,
'AdaBoost feature importances': ada_features,
'Gradient Boost feature importances': gb_features
})
feature_dataframe.head()
['Pclass' 'Sex' 'Age' 'Parch' 'Fare' 'Embarked' 'Name_length' 'Has_Cabin'
'FamilySize' 'IsAlone' 'Title']
Out[109]:
AdaBoost feature importances Extra Trees feature importances Gradient Boost feature importances Random Forest feature importances features
0 0.028000 0.121657 0.067961 0.104741 Pclass
1 0.008000 0.370983 0.038893 0.218370 Sex
2 0.012000 0.031296 0.072378 0.044327 Age
3 0.058667 0.015916 0.026286 0.022492 Parch
4 0.032000 0.055258 0.111944 0.054326 Fare
Plot a scatter chart of each model's feature importances. Rather than repeating the same block four times, wrap the plot in a small helper and call it once per model:
# Helper: scatter chart of one model's per-feature importances
def plot_feature_importance(column, title):
    trace = go.Scatter(
        y = feature_dataframe[column].values,
        x = feature_dataframe['features'].values,
        mode = 'markers',
        marker = dict(
            sizemode = 'diameter',
            sizeref = 1,
            size = 25,
            color = feature_dataframe[column].values,
            colorscale = 'Portland',
            showscale = True
        ),
        text = feature_dataframe['features'].values
    )
    layout = go.Layout(
        autosize = True,
        title = title,
        hovermode = 'closest',
        yaxis = dict(title = 'Feature Importance', ticklen = 5, gridwidth = 2),
        showlegend = False
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, filename='scatter2010')

plot_feature_importance('Random Forest feature importances', 'Random Forest Feature Importance')
plot_feature_importance('Extra Trees feature importances', 'Extra Trees Feature Importance')
plot_feature_importance('AdaBoost feature importances', 'AdaBoost Feature Importance')
plot_feature_importance('Gradient Boost feature importances', 'Gradient Boosting Feature Importance')
There are too many figures to include here; see the notebook on GitHub.
Compute each feature's mean importance across the four models
feature_dataframe['mean'] = feature_dataframe.mean(axis=1, numeric_only=True) # axis=1 averages across the model columns for each row; numeric_only skips the 'features' strings
feature_dataframe.head(3)
Out[111]:
AdaBoost feature importances Extra Trees feature importances Gradient Boost feature importances Random Forest feature importances features mean
0 0.028 0.121657 0.067961 0.104741 Pclass 0.080590
1 0.008 0.370983 0.038893 0.218370 Sex 0.159062
2 0.012 0.031296 0.072378 0.044327 Age 0.040000
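Sorting by the mean makes the ranking easier to read; a one-line sketch (not in the original notebook):
print(feature_dataframe[['features', 'mean']].sort_values('mean', ascending=False))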
Plot the resulting means
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
    x = x,
    y = y,
    width = 0.5,
    marker = dict(
        color = feature_dataframe['mean'].values,
        colorscale = 'Portland',
        showscale = True,
        reversescale = False
    ),
    opacity = 0.6
)]
layout = go.Layout(
    autosize = True,
    title = 'Barplots of Mean Feature Importance',
    hovermode = 'closest',
    yaxis = dict(
        title = 'Feature Importance',
        ticklen = 5,
        gridwidth = 2
    ),
    showlegend = False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')