(Using Python 3.7 and sklearn 0.20.2)
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingClassifier
titanic = pandas.read_csv(r"D:\train.csv")
print(titanic.describe())
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())# 缺失值用均值填充
print(titanic.describe())
print(titanic["Sex"].unique())# 返回参数中所有不同的值,并且按照从小到大的顺序排列
# 将male和female用0和1代替
titanic.loc[titanic["Sex"] =="male", "Sex"] =0 # loc通过行标签索引行数据,iloc通过行号获取行数据,ix是结合前两种的混合索引
titanic.loc[titanic["Sex"] =="female", "Sex"] =1
print(titanic["Embarked"].unique())
titanic["Embarked"] = titanic["Embarked"].fillna("S")# 有缺失值,我们采用谁多用谁
titanic.loc[titanic["Embarked"] =="S", "Embarked"] =0
titanic.loc[titanic["Embarked"] =="C", "Embarked"] =1
titanic.loc[titanic["Embarked"] =="Q", "Embarked"] =2
# The columns we'll use to predict the target
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm class
alg = LinearRegression()
# Generate cross validation folds for the titanic dataset. It returns the row indices corresponding to train and test.
# With shuffle=False the splits are deterministic, so we get the same folds every time we run this.
kf = KFold(n_splits=3, shuffle=False)
predictions = []
for train, test in kf.split(titanic[predictors]):
    # The predictors we're using to train the algorithm. Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic["Survived"].iloc[train]
    # Train the algorithm using the predictors and target.
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)
# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)
# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > .5] = 1
predictions[predictions <= .5] = 0
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
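# An equivalent, more compact way to threshold and score, as a sketch using
# np.where and a boolean mean (commented out to avoid re-printing):
# predictions = np.where(predictions > .5, 1, 0)
# print((predictions == titanic["Survived"]).mean())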
# Initialize our algorithm
alg = LogisticRegression(solver='liblinear', random_state=1)
# Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
titanic_test = pandas.read_csv(r"D:\test.csv")
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test.loc[titanic_test["Sex"] =="male", "Sex"] =0
titanic_test.loc[titanic_test["Sex"] =="female", "Sex"] =1
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] =="S", "Embarked"] =0
titanic_test.loc[titanic_test["Embarked"] =="C", "Embarked"] =1
titanic_test.loc[titanic_test["Embarked"] =="Q", "Embarked"] =2
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
# Initialize our algorithm with the default parameters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have
# at the place where a tree branch ends (the leaves of the tree)
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
# The score above is not ideal, so we tune the random forest classifier's parameters
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score for all the cross validation folds (much simpler than what we did before!)
kf = KFold(n_splits=3, shuffle=False)
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())
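# GridSearchCV is imported above but never used; a minimal sketch of letting
# it search these same parameters automatically (the grid values here are an
# illustrative choice, not the tutorial's):
param_grid = {
    "n_estimators": [10, 50, 100],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4],
}
search = GridSearchCV(RandomForestClassifier(random_state=1), param_grid, cv=3)
search.fit(titanic[predictors], titanic["Survived"])
print(search.best_params_, search.best_score_)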
# Generate a FamilySize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
# The apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x:len(x))
# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters
    # and end with a period.
    title_search = re.search(r"([A-Za-z]+)\.", name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Get all the titles and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
print(titles.value_counts())
# Map each title to an integer. Some titles are very rare and are compressed into the same codes as other titles.
title_mapping = {"Mr":1, "Miss":2, "Mrs":3, "Master":4, "Dr":5, "Rev":6, "Major":7, "Col":8, "Mlle":9, "Mme":10, "Don":11, "Ms":12, "Lady":13, "Sir":14, "Capt":15, "Jonkheer":16, "Countess":17}
for k, v in title_mapping.items():
    titles[titles == k] = v
# Verify that we converted everything
print(titles.value_counts())
# Add in the title column.
titanic["Title"] = titles
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "NameLength"]
# Perform feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic["Survived"])
# Get the raw p-values for each feature, and transform the p-values into scores
scores = -np.log(selector.pvalues_)
# Plot the scores. See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation="vertical")
plt.show()
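# The same top-k choice can also be read off the fitted selector instead of
# the plot; a sketch using get_support():
print([p for p, keep in zip(predictors, selector.get_support()) if keep])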
# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]
alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)
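# To score this tuned forest on the four selected features, a quick check
# mirroring the earlier cross_val_score calls:
scores = cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())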
# The algorithms we want to ensemble.
# Both algorithms are given the same predictor set here; their predicted probabilities are averaged below.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]],
    [LogisticRegression(solver='liblinear', random_state=1),
     ["Pclass", "Sex", "Fare", "FamilySize", "Title", "Age", "Embarked"]]
]
# Initialize the cross validation folds
kf = KFold(n_splits=3, shuffle=False)
print(titanic[predictors])
predictions = []
for train, test in kf.split(titanic[predictors]):
    train_target = titanic["Survived"].iloc[train]
    full_test_predictions = []
    # Make predictions with each algorithm on this fold
    for alg, alg_predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[alg_predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.
        test_predictions = alg.predict_proba(titanic[alg_predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme -- just average the predictions to get the final classification.
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over .5 is assumed to be a 1 prediction, and below .5 a 0 prediction.
    test_predictions[test_predictions <= .5] = 0
    test_predictions[test_predictions > .5] = 1
    predictions.append(test_predictions)
# Put all the predictions together into one array.
predictions = np.concatenate(predictions, axis=0)
accuracy = sum(predictions == titanic["Survived"]) / len(predictions)
print(accuracy)
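# To produce a Kaggle submission, one would engineer the same features on the
# test set, refit each algorithm on the full training data, and average the
# probabilities; a sketch (the output filename is an arbitrary choice):
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]
# Unseen titles on the test set (e.g. "Dona") fall back to 0 here.
titanic_test["Title"] = titanic_test["Name"].apply(get_title).map(title_mapping).fillna(0)
full_predictions = []
for alg, alg_predictors in algorithms:
    alg.fit(titanic[alg_predictors].astype(float), titanic["Survived"])
    full_predictions.append(alg.predict_proba(titanic_test[alg_predictors].astype(float))[:, 1])
final = np.where((full_predictions[0] + full_predictions[1]) / 2 > .5, 1, 0)
submission = pandas.DataFrame({"PassengerId": titanic_test["PassengerId"], "Survived": final})
submission.to_csv("kaggle_submission.csv", index=False)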