Commonly used scikit-learn classifiers
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 29 21:51:11 2016
A demo of eight classic machine learning classifiers.
@author: ckawyh
"""
import sys
import time
from sklearn import metrics
import numpy as np
import cPickle as pickle
reload(sys)
sys.setdefaultencoding('utf8')
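# Note: this script targets Python 2 with a 2016-era scikit-learn (~0.17).
# On Python 3 with scikit-learn >= 0.20 the equivalents would be (a sketch):
#   import pickle                                        # cPickle is just pickle
#   from sklearn.model_selection import train_test_split, GridSearchCV
# and the reload(sys)/setdefaultencoding hack above can simply be dropped.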
# Multinomial Naive Bayes Classifier
def naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    model.fit(train_x, train_y)
    return model
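# Note: MultinomialNB assumes non-negative, count-like features (e.g. word
# counts). The iris measurements used below are continuous, which is why NB
# only reaches ~60% accuracy in the results; GaussianNB usually fits such
# data better. A minimal alternative sketch (not part of the original benchmark):
def gaussian_naive_bayes_classifier(train_x, train_y):
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(train_x, train_y)
    return model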
# KNN Classifier
def knn_classifier(train_x, train_y):
    from sklearn.neighbors import KNeighborsClassifier
    model = KNeighborsClassifier()
    model.fit(train_x, train_y)
    return model
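# KNeighborsClassifier defaults to n_neighbors=5 with uniform weights; the
# neighbor count is the main knob to tune, e.g. (a sketch):
#   model = KNeighborsClassifier(n_neighbors=3, weights='distance')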
# Logistic Regression Classifier
def logistic_regression_classifier(train_x, train_y):
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(penalty='l2')
    model.fit(train_x, train_y)
    return model
# Random Forest Classifier
def random_forest_classifier(train_x, train_y):
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators=8)
    model.fit(train_x, train_y)
    return model
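# n_estimators=8 is a very small forest; more trees (e.g. 100) generally give
# more stable predictions at proportionally higher training cost, e.g.:
#   model = RandomForestClassifier(n_estimators=100, random_state=0)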
# Decision Tree Classifier
def decision_tree_classifier(train_x, train_y):
    from sklearn import tree
    model = tree.DecisionTreeClassifier()
    model.fit(train_x, train_y)
    return model
# GBDT(Gradient Boosting Decision Tree) Classifier
def gradient_boosting_classifier(train_x, train_y):
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    return model
# SVM Classifier
def svm_classifier(train_x, train_y):
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    model.fit(train_x, train_y)
    return model
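# probability=True enables predict_proba() via Platt scaling, which adds an
# internal cross-validation step to training; if only hard labels are needed
# it can be left off. Class probabilities can then be read as, e.g.:
#   proba = model.predict_proba(test_x)   # shape: (n_samples, n_classes)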
# SVM Classifier using cross validation
def svm_cross_validation(train_x, train_y):
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    model = SVC(kernel='rbf', probability=True)
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(model, param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    best_parameters = grid_search.best_estimator_.get_params()
    for para, val in best_parameters.items():
        print para, val
    model = SVC(kernel='rbf', C=best_parameters['C'], gamma=best_parameters['gamma'], probability=True)
    model.fit(train_x, train_y)
    return model
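# Note: GridSearchCV refits the best parameter combination on the full
# training set by default (refit=True), so the manual re-fit above is
# redundant; returning the fitted winner directly would suffice:
#   return grid_search.best_estimator_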
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn import cross_validation
    from pandas import DataFrame

    # Load the iris dataset and stack features and labels into one array
    data_dict = load_iris()
    data = data_dict.data
    label = data_dict.target
    df = DataFrame(data)
    df[4] = label
    data_array = df.as_matrix()  # .values / .to_numpy() in newer pandas

    # Hold out 30% of the samples as the test set
    split_train, split_cv = cross_validation.train_test_split(data_array, test_size=0.3, random_state=0)
    train_x = split_train[:, 0:4]
    train_y = split_train[:, 4]
    test_x = split_cv[:, 0:4]
    test_y = split_cv[:, 4]

    model_save_file = None  # set to a file path to persist the trained models
    model_save = {}
    test_classifiers = ['NB', 'KNN', 'LR', 'RF', 'DT', 'SVM', 'SVMCV', 'GBDT']
    classifiers = {'NB': naive_bayes_classifier,
                   'KNN': knn_classifier,
                   'LR': logistic_regression_classifier,
                   'RF': random_forest_classifier,
                   'DT': decision_tree_classifier,
                   'SVM': svm_classifier,
                   'SVMCV': svm_cross_validation,
                   'GBDT': gradient_boosting_classifier,
                   }

    num_train, num_feat = train_x.shape
    num_test, num_feat = test_x.shape
    is_binary_class = (len(np.unique(train_y)) == 2)
    print '******************** Data Info *********************'
    print '#training data: %d, #testing_data: %d, dimension: %d' % (num_train, num_test, num_feat)

    # Train each classifier, time the fit, and evaluate on the held-out test set
    for classifier in test_classifiers:
        print '******************* %s ********************' % classifier
        start_time = time.time()
        model = classifiers[classifier](train_x, train_y)
        print 'training took %fs!' % (time.time() - start_time)
        predict = model.predict(test_x)
        if model_save_file is not None:
            model_save[classifier] = model
        accuracy = metrics.accuracy_score(test_y, predict)
        report = metrics.classification_report(test_y, predict)
        print 'accuracy: %.2f%%' % (100 * accuracy)
        print report

    if model_save_file is not None:
        pickle.dump(model_save, open(model_save_file, 'wb'))
Run results:
******************** Data Info *********************
#training data: 105, #testing_data: 45, dimension: 4
******************* NB ********************
training took 0.001000s!
accuracy: 60.00%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       0.00      0.00      0.00        18
        2.0       0.38      1.00      0.55        11

avg / total       0.45      0.60      0.49        45
******************* KNN ********************
training took 0.000000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
******************* LR ********************
training took 0.001000s!
accuracy: 88.89%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.72      0.84        18
        2.0       0.69      1.00      0.81        11

avg / total       0.92      0.89      0.89        45
******************* RF ********************
training took 0.019000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
******************* DT ********************
training took 0.000000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
******************* SVM ********************
training took 0.001000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
******************* SVMCV ********************
Fitting 3 folds for each of 14 candidates, totalling 42 fits
kernel rbf
C 1000
verbose False
probability True
degree 3
shrinking True
max_iter -1
decision_function_shape None
random_state None
tol 0.001
cache_size 200
coef0 0.0
gamma 0.001
class_weight None
training took 0.143000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
******************* GBDT ********************
[Parallel(n_jobs=1)]: Done 42 out of 42 | elapsed: 0.0s finished
training took 0.176000s!
accuracy: 97.78%
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        16
        1.0       1.00      0.94      0.97        18
        2.0       0.92      1.00      0.96        11

avg / total       0.98      0.98      0.98        45
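To reuse the trained models, set model_save_file to a path before running; the pickled dictionary can then be restored and queried by classifier name. A minimal sketch, assuming model_save_file was set to the hypothetical path 'models.pkl':

import cPickle as pickle
model_save = pickle.load(open('models.pkl', 'rb'))   # 'models.pkl' is an example path
predict = model_save['SVM'].predict(test_x)          # reuse any trained classifier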