(Using Python 3.7 and sklearn 0.20.2)
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier
titanic = pandas.read_csv(r"D:\train.csv")
print(titanic.describe())
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())# 缺失值用均值填充
print(titanic.describe())
print(titanic["Sex"].unique())# 返回参数中所有不同的值,并且按照从小到大的顺序排列
# 将male和female用0和1代替
titanic.loc[titanic["Sex"] =="male", "Sex"] =0 # loc通过行标签索引行数据,iloc通过行号获取行数据,ix是结合前两种的混合索引
titanic.loc[titanic["Sex"] =="female", "Sex"] =1
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna("S")# 有缺失值,我们采用谁多用谁
titanic.loc[titanic["Embarked"] =="S", "Embarked"] =0
titanic.loc[titanic["Embarked"] =="C", "Embarked"] =1
titanic.loc[titanic["Embarked"] =="Q", "Embarked"] =2
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset. It returns the row indices corresponding to train and test.
# With shuffle=False the splits are deterministic, so we get the same folds every time we run this.
kf = KFold(n_splits=3, shuffle=False)
predictions = []
for train, test in kf.split(titanic[predictors]):
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)
# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)
# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
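# An equivalent, more compact way to threshold and score, as a sketch using
# np.where and a boolean mean (commented out to avoid re-printing):
# predictions = np.where(predictions > .5, 1, 0)
# print((predictions == titanic["Survived"]).mean())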
# Initialize our algorithm
alg = LogisticRegression(solver='liblinear', random_state=1)
# Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
titanic_test = pandas.read_csv(r"D:\test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] =="male", "Sex"] =0
titanic_test.loc[titanic_test["Sex"] =="female", "Sex"] =1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] =="S", "Embarked"] =0
titanic_test.loc[titanic_test["Embarked"] =="C", "Embarked"] =1
titanic_test.loc[titanic_test["Embarked"] =="Q", "Embarked"] =2
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have
# at the place where a tree branch ends (the leaves of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
# The score above is not ideal, so we tune the random forest classifier's parameters
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
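# GridSearchCV is imported above but never used; a minimal sketch of letting
# it search these same parameters automatically (the grid values here are an
# illustrative choice, not the tutorial's):
param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
search.fit(titanic[predictors], titanic["Survived"])
print(search.best_params_, search.best_score_)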
# Generate a FamilySize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))
# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters
    # and end with a period.
    title_search = re.search(r"([A-Za-z]+)\.", name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())
# Map each title to an integer. Some titles are very rare and are compressed into the same codes as other titles.
title_mapping = {"Mr":1, "Miss":2, "Mrs":3, "Master":4, "Dr":5, "Rev":6, "Major":7, "Col":8, "Mlle":9, "Mme":10, "Don":11, "Ms":12, "Lady":13, "Sir":14, "Capt":15, "Jonkheer":16, "Countess":17}
for k, v in title_mapping.items():
    titles[titles == k] = v
# Verify that we converted everything
print(titles.value_counts())
# Add in the title column.
titanic["Title"] = titles
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform the p-values into scores
scores = -np.log(selector.pvalues_)
# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation="vertical")
plt.show()
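# The same top-k choice can also be read off the fitted selector instead of
# the plot; a sketch using get_support():
print([p for p, keep in zip(predictors, selector.get_support()) if keep])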
# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
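# To score this tuned forest on the four selected features, a quick check
# mirroring the earlier cross_val_score calls:
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())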
# The algorithms we want to ensemble.
# Both algorithms are given the same predictor set here; their predicted probabilities are averaged below.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
    [LogisticRegression(solver='liblinear', random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]
# Initialize the cross validation folds
kf = KFold(n_splits=3, shuffle=False)
print(titanic[predictors])
predictions = []
for train, test in kf.split(titanic[predictors]):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions with each algorithm on this fold
    for alg, alg_predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[alg_predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[alg_predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is assumed to be a 1 prediction, and below .5 a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
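# To produce a Kaggle submission, one would engineer the same features on the
# test set, refit each algorithm on the full training data, and average the
# probabilities; a sketch (the output filename is an arbitrary choice):
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
# Unseen titles on the test set (e.g. "Dona") fall back to 0 here.
titanic_test["Title"] = titanic_test["Name"].apply(get_title).map(title_mapping).fillna(0)
full_predictions = []
for alg, alg_predictors in algorithms:
    alg.fit(titanic[alg_predictors].astype(float), titanic["Survived"])
    full_predictions.append(alg.predict_proba(titanic_test[alg_predictors].astype(float))[:, 1])
final = np.where((full_predictions[0] + full_predictions[1]) / 2 > .5, 1, 0)
submission = pandas.DataFrame({"PassengerId": titanic_test["PassengerId"], "Survived": final})
submission.to_csv("kaggle_submission.csv", index=False)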