计算预测的accuracy = 预测正确的数量 / 总数量
# Accuracy: the fraction of rows whose predicted label equals the actual label.
# The 'admit' column is renamed so the ground truth is exposed as 'actual_label'.
admissions = admissions.rename(columns={'admit': 'actual_label'})

# Element-wise comparison gives a boolean Series: True where the
# prediction agrees with the actual label, False otherwise.
predicted = admissions["predicted_label"]
actual = admissions["actual_label"]
matches = predicted == actual

# Keep only the rows that were predicted correctly and preview the first five.
correct_predictions = admissions.loc[matches]
print(correct_predictions.head())

accuracy = len(correct_predictions.index) / len(admissions.index)
print(accuracy)
计算sensitivity(true positive rate)= true positives / (true positives + false negatives)
# Sensitivity (true positive rate) = TP / (TP + FN), computed from the
# predicted_label and actual_label columns of admissions.
actually_positive = admissions["actual_label"] == 1

# True positives: actual label is 1 and the prediction is 1.
true_positive_filter = actually_positive & (admissions["predicted_label"] == 1)
# False negatives: actual label is 1 but the prediction is 0.
false_negative_filter = actually_positive & (admissions["predicted_label"] == 0)

true_positives = len(admissions.loc[true_positive_filter])
false_negatives = len(admissions.loc[false_negative_filter])

sensitivity = true_positives / (true_positives + false_negatives)
print(sensitivity)
计算specificity(true negative rate)= true negatives / (true negatives + false positives)
# Specificity (true negative rate) = TN / (TN + FP), computed from the
# predicted_label and actual_label columns of admissions.
actually_negative = admissions["actual_label"] == 0

# True negatives: actual label is 0 and the prediction is 0.
true_negative_filter = actually_negative & (admissions["predicted_label"] == 0)
# False positives: actual label is 0 but the prediction is 1.
false_positive_filter = actually_negative & (admissions["predicted_label"] == 1)

true_negatives = len(admissions.loc[true_negative_filter])
false_positives = len(admissions.loc[false_positive_filter])

specificity = true_negatives / (false_positives + true_negatives)
print(specificity)
Cross-validation
通过np.random.permutation将data frame的index随机打乱,然后取打乱后的前515行作为train,其余的行作为test。
import numpy as np
# Seed NumPy's global RNG so the permutation below is reproducible.
np.random.seed(8)

admissions = pd.read_csv("admissions.csv")

# Copy 'admit' into a new 'actual_label' column, then drop the original
# so the ground truth lives under a single name.
admissions["actual_label"] = admissions["admit"]
admissions = admissions.drop(columns="admit")

# Shuffle the row labels and reorder the frame to match.
shuffled_index = np.random.permutation(admissions.index)
shuffled_admissions = admissions.loc[shuffled_index]

# The first 515 shuffled rows form the training set; the rest are held out.
train_size = 515
train = shuffled_admissions.iloc[:train_size]
test = shuffled_admissions.iloc[train_size:]

print(shuffled_admissions.head())
Use the training set to fit the model, then use the test set to calculate accuracy.
import numpy as np
# Fit a logistic regression on the training rows and measure its accuracy
# on the held-out test rows.

# Seed NumPy's global RNG so the shuffle (and therefore the split) is reproducible.
np.random.seed(8)
shuffled_index = np.random.permutation(admissions.index)
shuffled_admissions = admissions.loc[shuffled_index]

# First 515 shuffled rows train the model; the remaining rows are the test set.
train = shuffled_admissions.iloc[0:515]
# Take an explicit copy: a new column is assigned to `test` below, and
# assigning into a slice of another DataFrame triggers pandas'
# SettingWithCopyWarning.
test = shuffled_admissions.iloc[515:].copy()

# Single-feature model: predict actual_label from gpa.
model = LogisticRegression()
model.fit(train[["gpa"]], train["actual_label"])

# Predict labels for the test rows and store them as test["predicted_label"].
labels = model.predict(test[["gpa"]])
test["predicted_label"] = labels

# Accuracy on the test set = correctly predicted rows / all test rows.
matches = test["predicted_label"] == test["actual_label"]
correct_predictions = test[matches]
accuracy = len(correct_predictions) / len(test)
print(accuracy)
计算ROC
import matplotlib.pyplot as plt
from sklearn import metrics
# predict_proba returns two columns per test row: column 0 is the
# probability of label 0 (not admitted), column 1 is the probability of
# label 1 (admitted).
probabilities = model.predict_proba(test[["gpa"]])

# roc_curve is given the probability of the positive class, i.e. column 1.
admit_probabilities = probabilities[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(test["actual_label"], admit_probabilities)
plt.plot(fpr, tpr)
print(probabilities)

# General pattern when only the probability of label 1 is needed.
# NOTE(review): logistic_model and data are not defined in the visible part
# of this file -- presumably placeholders for a fitted model and its
# feature frame; confirm before running.
logistic_model.predict_proba(data)[:, 1]
计算AUC score
# Note the different import style!
from sklearn.metrics import roc_auc_score
# AUC: score the test set from its actual labels and the predicted
# probability of label 1 (second column of probabilities).
actual_labels = test["actual_label"]
auc_score = roc_auc_score(actual_labels, probabilities[:, 1])
print(auc_score)