Titanic Survival Data Analysis (the code runs without errors)

(Using Python 3.7 and scikit-learn 0.20.2)

import pandas
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif

titanic = pandas.read_csv(r"D:\train.csv")
print(titanic.describe())

titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())  # fill missing ages with the median
print(titanic.describe())

print(titanic["Sex"].unique())  # returns the distinct values of the column, in order of appearance

# Replace "male" and "female" with 0 and 1
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0  # loc indexes rows by label, iloc by integer position; ix (now deprecated) mixed the two
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna("S")  # Embarked has missing values; fill with the most frequent port
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2
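As an aside, the same label encoding can be written more compactly with pandas' Series.map. A minimal equivalent sketch, using the same columns as above:

# Equivalent, more compact encoding with Series.map
titanic["Sex"] = titanic["Sex"].map({"male": 0, "female": 1})
titanic["Embarked"] = titanic["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})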

# The columns we'll use to predict the target

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize our algorithm class

alg = LinearRegression()

# Generate cross validation folds for the titanic dataset.It return the row indices corresponding to train and test

# We set random_state to ensure we get the same splits every time we run this.

kf = KFold(n_splits=3, shuffle=False, random_state=1)

predictions = []
for train, test in kf.split(titanic[predictors]):
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)

# Initialize our algorithm
alg = LogisticRegression(solver='liblinear', random_state=1)
# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
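StratifiedKFold is imported above but never used. For a binary target like Survived it keeps the survived/died proportions the same in every fold, which can give a slightly more reliable estimate. A minimal sketch:

# Stratified folds preserve the class ratio of Survived in each fold
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=skf)
print(scores.mean())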

titanic_test = pandas.read_csv(r"D:\test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place
# where a tree branch ends (the leaf nodes at the bottom of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

# The result above is not ideal, so we tune the random forest's parameters
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for all the cross-validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
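Instead of tuning these parameters by hand, GridSearchCV (imported above but otherwise unused) can search candidate values automatically. A minimal sketch, with an illustrative grid of our own choosing:

# Search a small, illustrative parameter grid with the same 3-fold CV
param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
grid = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=kf)
grid.fit(titanic[predictors], titanic["Survived"])
print(grid.best_params_, grid.best_score_)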

# Generate a FamilySize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The apply method generates a new Series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))

# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title. Titles consist of capital and
    # lowercase letters and end with a period.
    title_search = re.search(r"([A-Za-z]+)\.", name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
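For example, get_title("Braund, Mr. Owen Harris") returns "Mr", since "Mr" is the first run of letters followed by a period.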

# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())

# Map each title to an integer code. (Here every title gets its own code; very rare
# titles could also be merged into shared codes.)
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 8, "Mlle": 9, "Mme": 10, "Don": 11, "Ms": 12, "Lady": 13, "Sir": 14, "Capt": 15, "Jonkheer": 16, "Countess": 17}
for k, v in title_mapping.items():
    titles[titles == k] = v

# Verify that we converted everything
print(titles.value_counts())

# Add in the Title column.
titanic["Title"] = titles

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]

# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature and transform them into scores:
# the smaller the p-value, the larger -log(p) and the stronger the feature.
scores = -np.log(selector.pvalues_)

# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation="vertical")
plt.show()

# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
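The original listing defines this re-tuned forest but never scores it. A minimal check on the reduced feature set, reusing the fold generator from above, might look like:

# Score the re-tuned forest on the four selected features
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
print(scores.mean())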

# The algorithms we want to ensemble.
# Both classifiers here use the same feature set; the gradient boosting classifier
# can pick up non-linear interactions that the logistic regression cannot.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
    [LogisticRegression(solver='liblinear', random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]

# Initialize the cross-validation folds
kf = KFold(n_splits=3, shuffle=False)
print(titanic[predictors])

predictions = []
for train, test in kf.split(titanic[predictors]):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions with each algorithm on this fold.
    # (Note: the inner loop rebinds the outer `predictors` name, which is fine here
    # because kf.split has already captured the data above.)
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is assumed to be a 1 prediction, and at or below .5 a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
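To turn this into a Kaggle submission, the same feature engineering has to be applied to titanic_test before the ensemble can predict on it. A minimal sketch, training each model on the full training set; the output path follows the D:\ convention used above and is illustrative:

# Build the engineered features on the test set, mirroring the training set
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
titanic_test["NameLength"] = titanic_test["Name"].apply(lambda x: len(x))
# Titles unseen in training (e.g. "Dona") map to NaN, so fill them with 0
titanic_test["Title"] = titanic_test["Name"].apply(get_title).map(title_mapping).fillna(0)

full_predictions = []
for alg, predictors in algorithms:
    # Fit on the entire training set, then predict probabilities on the test set
    alg.fit(titanic[predictors], titanic["Survived"])
    full_predictions.append(alg.predict_proba(titanic_test[predictors].astype(float))[:, 1])

# Average the two models and threshold at .5, as in the CV loop above
final = (full_predictions[0] + full_predictions[1]) / 2
final = (final > .5).astype(int)

submission = pandas.DataFrame({"PassengerId": titanic_test["PassengerId"], "Survived": final})
submission.to_csv(r"D:\submission.csv", index=False)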
