# Logistic回归 | 葡萄酒质量预测
# 葡萄酒质量预测
# 数据下载地址,完整代码下载地址
# -*- coding: UTF-8 -*-
"""
葡萄酒质量预测
算法:Softmax
数据:datas/winequality-red.csv,datas/winequality-white.csv
"""
import os
import warnings

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize

try:
    # Public location of the warning class.
    from sklearn.exceptions import ConvergenceWarning
except ImportError:
    # Legacy private module path, kept only as a fallback for old releases.
    from sklearn.linear_model.coordinate_descent import ConvergenceWarning
# Configure matplotlib fonts so the Chinese labels used in the plot are not
# rendered as garbled characters.
mpl.rcParams.update({
    'font.sans-serif': [u'simHei'],
    'axes.unicode_minus': False,
})
# Suppress convergence warnings and generic user warnings for the rest of
# the script.
warnings.filterwarnings('ignore', category=ConvergenceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
# Load the red and white wine CSV files (semicolon-separated) and tag every
# row with its wine type: 1 for red, 2 for white.
red_path = os.path.join('datas', 'winequality-red.csv')
white_path = os.path.join('datas', 'winequality-white.csv')
red_df = pd.read_csv(red_path, sep=';').assign(type=1)
white_df = pd.read_csv(white_path, sep=';').assign(type=2)
# Stack both tables row-wise into a single frame.
df = pd.concat([red_df, white_df])
# Names of the independent-variable (feature) columns.
names = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
    "type",
]
# Name of the dependent-variable (target) column.
quality = "quality"
# Missing-value handling: turn '?' placeholders into NaN, then drop every row
# that has at least one missing field.
# np.nan is the canonical lowercase spelling; the uppercase alias used
# previously is not available in every NumPy release.
datas = df.replace('?', np.nan).dropna(how='any')
X = datas[names]
Y = datas[quality]
# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
# Fit the min-max scaler on the training rows only, then apply that same
# fitted transform to the test rows.
ss = MinMaxScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# Build and train the model: multinomial logistic regression with an L2
# penalty and the L-BFGS solver, given 100 candidate `Cs` values
# log-spaced between 1e-5 and 10.
lr = LogisticRegressionCV(
    Cs=np.logspace(-5, 1, 100),
    fit_intercept=True,
    multi_class='multinomial',
    penalty='l2',
    solver='lbfgs',
)
lr.fit(X_train, Y_train)
# Predictions on the held-out test rows.
Y_predict = lr.predict(X_test)
# Report the training-set score and the fitted parameters.
print("train score: ", lr.score(X_train, Y_train))
print("θ: ", lr.coef_)
print("intercept: ", lr.intercept_)
# Plot the true and the predicted quality of every test sample.
# For a classifier, `score` reports mean accuracy rather than R^2, so the
# legend labels the value as accuracy (准确率) instead of $R^2$.
test_accuracy = lr.score(X_test, Y_test)
number = np.arange(len(X_test))
plt.figure(figsize=(14, 7), facecolor='w')
plt.ylim(-1, 11)
# True values are drawn as smaller red dots on top (higher zorder) of the
# larger green prediction dots, so a match shows as red inside green.
plt.plot(number, Y_test, 'ro', markersize=8, zorder=3, label=u'真实值')
plt.plot(number, Y_predict, 'go', markersize=14, zorder=2,
         label=u'预测值,准确率=%.3f' % test_accuracy)
plt.legend(loc='upper left')
plt.xlabel(u'数据编号', fontsize=18)
plt.ylabel(u'葡萄酒质量', fontsize=18)
plt.title(u'Logistic回归算法对数据进行分类', fontsize=20)
plt.show()