本例使用sklearn完成Kaggle入门案例:泰坦尼克号(Titanic)乘客生还预测
环境:python3+Anaconda(Anaconda集成了实验用到的包)
源码及语料:https://gitee.com/yqmyqm/Machine_learn
下面为实现代码:
'''
Created on 2017年12月20日
@author: yqm
'''
import pandas

# Load the Titanic training set and turn the columns used later as model
# features into purely numeric data.
file_dir = "G:\\研究生\\实验\\语料\\titanic_train.csv"
titanic = pandas.read_csv(file_dir)
# print(titanic.head(5))

# "Age" has missing entries (NaN); fill them with the column median.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

# Encode "Sex" numerically: male -> 0, female -> 1.
# Series.map builds a fresh numeric column instead of writing integers into
# a string column through a boolean mask, so the result has a numeric dtype
# and does not depend on pandas allowing mixed-type in-place assignment.
# NOTE: any value that is not a key of the mapping becomes NaN.
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
# print(titanic["Sex"].unique())

# print(titanic["Embarked"].unique())  # distinct values, e.g. ['S' 'C' 'Q' nan]
# Fill missing "Embarked" values with "S" first, then encode:
# S -> 0, C -> 1, Q -> 2.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic["Embarked"] = titanic["Embarked"].map({"S": 0, "C": 1, "Q": 2})
# print(titanic.describe())  # prints count, mean and other summary statistics
from sklearn.linear_model import LinearRegression  # linear regression model
# KFold is imported from sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import KFold

# Feature columns passed to the model.
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
alg = LinearRegression()  # instantiate the model
# print(titanic.shape)

# 3-fold cross-validation: the samples (rows of `titanic`; shape[0] is the
# row count, which the original author notes as 891 for this data set) are
# split into 3 parts. Each round uses one part as the test set and the other
# two as the training set, so every sample is predicted exactly once.
# No shuffling is requested, so the folds are consecutive and deterministic;
# random_state only has meaning together with shuffle=True and is therefore
# not passed.
kf = KFold(n_splits=3)
# print(kf)
predictions = []
for train, test in kf.split(titanic):
    # Training features: the predictor columns restricted to the training rows.
    train_predictions = titanic[predictors].iloc[train, :]
    # Training labels.
    train_target = titanic["Survived"].iloc[train]
    alg.fit(train_predictions, train_target)
    # Use the fitted model to predict the held-out fold.
    test_prediction = alg.predict(titanic[predictors].iloc[test, :])
    # print(test_prediction)
    predictions.append(test_prediction)
# One array of raw regression outputs per fold.
print(predictions)