随机森林分类器
随机森林的英文是 Random Forest,英文简写是 RF。它实际上是一个包含多个决策树的分类器,每一个子分类器都是一棵 CART 分类回归树。做分类的时候,输出结果是每个子分类器的分类结果中最多的那个;做回归的时候,输出结果是每棵 CART 树的回归结果的平均值。
在 sklearn 中,使用 RandomForestClassifier() 构造随机森林模型,用 fit 函数拟合训练数据,用 predict 函数进行预测。
from sklearn.model_selection import GridSearchCV
GridSearchCV 是 sklearn 提供的参数自动搜索模块。只要告诉它想要调优的参数以及参数的取值范围,它就会把所有的参数组合都通过交叉验证跑一遍,然后选出最优的参数组合,并给出结果。
# -*- coding: utf-8 -*-
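# Classify the IRIS data set with a random forest
# Use GridSearchCV to search for the best parameters
# In the author's run n_estimators=6 scored best, i.e. a forest of 6 decision trees;
# the winning value depends on the random seed and the sklearn version, so another run may report a different one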
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
# Seed the forest so repeated runs select the same n_estimators; an unseeded
# forest can report a different "best" value on every run.
rf = RandomForestClassifier(random_state=1)
# Candidate forest sizes: 1 to 10 trees
parameters = {"n_estimators": range(1, 11)}
iris = load_iris()
# Tune the parameter with GridSearchCV (every candidate is cross-validated)
clf = GridSearchCV(estimator=rf, param_grid=parameters)
# Run the search on the iris data set
clf.fit(iris.data, iris.target)
print(" 最优分数: %.4lf" %clf.best_score_)
print(" 最优参数:", clf.best_params_)
Pipeline 管道机制进行流水线作业
# -*- coding: utf-8 -*-
# 使用 RandomForest 对 IRIS 数据集进行分类
# 利用 GridSearchCV 寻找最优参数, 使用 Pipeline 进行流水作业
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Seed the forest so repeated runs select the same n_estimators; an unseeded
# forest can report a different "best" value on every run.
rf = RandomForestClassifier(random_state=1)
# Inside a Pipeline a parameter is addressed as "<step name>__<parameter name>"
parameters = {"randomforestclassifier__n_estimators": range(1, 11)}
iris = load_iris()
# Two steps: standardize the features, then classify with the forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforestclassifier', rf),
])
# Tune the parameter with GridSearchCV (every candidate is cross-validated)
clf = GridSearchCV(estimator=pipeline, param_grid=parameters)
# Run the search on the iris data set
clf.fit(iris.data, iris.target)
print(" 最优分数: %.4lf" %clf.best_score_)
print(" 最优参数:", clf.best_params_)
信用卡违约率分析
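有个疑惑:为什么分类器的参数要写成 'adaboostclassifier__n_estimators': [10, 50, 100],而不能直接写 'n_estimators': [10, 50, 100]?猜想:是因为用了 Pipeline,所以格式固定。确实如此:GridSearchCV 调优的对象是整个 Pipeline,其中某个步骤的参数必须用"步骤名__参数名"(中间是两个下划线)的形式来指定,这里的步骤名就是构造 Pipeline 时给分类器起的名字。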
数据地址:https://github.com/cystanford/credit_default
# -*- coding: utf-8 -*-
# 信用卡违约率分析
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, learning_curve, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Load the data
data = pd.read_csv('Desktop/UCI_Credit_Card.csv')
# Explore the data
print(data.shape)  # size of the data set
print(data.describe())  # summary statistics
# Column names, non-null counts and dtypes. info() prints the report itself
# and returns None, so wrapping it in print() would emit a stray "None".
data.info()
# Distribution of next-month default vs. non-default
next_month = data['default.payment.next.month'].value_counts()
print(next_month)
df = pd.DataFrame({'default.payment.next.month': next_month.index, 'values': next_month.values})
print(df.head())
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the Chinese labels
plt.figure(figsize=(6, 6))
plt.title('信用卡违约率客户\n (违约:1,守约:0)')
sns.set_color_codes("pastel")
sns.barplot(x='default.payment.next.month', y="values", data=df)
plt.show()
# Feature selection: everything except the ID column and the label column
label_column = 'default.payment.next.month'
data.drop(columns=['ID'], inplace=True)  # the ID column carries no signal
target = data[label_column].values
columns = list(data.columns)
columns.remove(label_column)
features = data[columns].values
# Hold out 30% as the test set (stratified on the label), train on the rest
train_x, test_x, train_y, test_y = train_test_split(
    features,
    target,
    test_size=0.30,
    stratify=target,
    random_state=1,
)
# One (pipeline step name, estimator, parameter grid) entry per candidate model.
# Keeping the three pieces side by side prevents the parallel lists derived
# below from drifting out of sync. Grid keys follow the Pipeline convention
# "<step name>__<parameter name>".
_MODEL_SPECS = [
    (
        'svc',
        SVC(random_state=1, kernel='rbf'),
        {'svc__C': [1], 'svc__gamma': [0.01]},
    ),
    (
        'decisiontreeclassifier',
        DecisionTreeClassifier(random_state=1, criterion='gini'),
        {'decisiontreeclassifier__max_depth': [6, 9, 11]},
    ),
    (
        'randomforestclassifier',
        RandomForestClassifier(random_state=1, criterion='gini'),
        {'randomforestclassifier__n_estimators': [3, 5, 6]},
    ),
    (
        'kneighborsclassifier',
        KNeighborsClassifier(metric='minkowski'),
        {'kneighborsclassifier__n_neighbors': [4, 6, 8]},
    ),
    (
        'adaboostclassifier',
        # Seeded like the other estimators so that runs are reproducible
        AdaBoostClassifier(random_state=1),
        {'adaboostclassifier__n_estimators': [10, 50, 100]},
    ),
]
# Classifier instances
classifiers = [estimator for _, estimator, _ in _MODEL_SPECS]
# Classifier names (used as the pipeline step names)
classifier_names = [name for name, _, _ in _MODEL_SPECS]
# Classifier parameter grids
classifier_param_grid = [grid for _, _, grid in _MODEL_SPECS]
# Tune one classifier pipeline with GridSearchCV
def GridSearchCV_work(pipeline, train_x, train_y, test_x, test_y, param_grid, classifier_names, score = 'accuracy'):
    """Grid-search one pipeline and report its accuracy on the test set.

    Args:
        pipeline: Estimator (here a Pipeline) handed to GridSearchCV.
        train_x: Training features the search is fitted on.
        train_y: Training labels.
        test_x: Test features to predict.
        test_y: True labels for ``test_x``.
        param_grid: Parameter grid passed to GridSearchCV.
        classifier_names: Label printed next to the test accuracy; the
            caller in this file passes a single classifier name.
        score: Value passed to GridSearchCV as ``scoring``.

    Returns:
        dict with ``'predict_y'`` (predictions for ``test_x``) and
        ``'accuracy_score'`` (accuracy of those predictions against
        ``test_y``).
    """
    gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=score)
    # Find the best parameters and the best cross-validated score
    search = gridsearch.fit(train_x, train_y)
    print("GridSearch 最优参数:", search.best_params_)
    print("GridSearch 最优分数: %0.4lf" %search.best_score_)
    predict_y = gridsearch.predict(test_x)
    # Compute the test accuracy once and reuse it for the report and the result
    test_accuracy = accuracy_score(test_y, predict_y)
    print(classifier_names," 准确率 %0.4lf" %test_accuracy)
    return {'predict_y': predict_y, 'accuracy_score': test_accuracy}
# Wrap each candidate in a scaler + classifier pipeline and tune it in turn
for model, model_name, model_param_grid in zip(classifiers, classifier_names, classifier_param_grid):
    # The classifier's step name must match the prefix used in its grid keys
    pipeline = Pipeline(steps=[('scaler', StandardScaler()), (model_name, model)])
    result = GridSearchCV_work(
        pipeline,
        train_x,
        train_y,
        test_x,
        test_y,
        model_param_grid,
        model_name,
        score='accuracy',
    )