1. Manual implementation of simple linear regression
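The model is y = w·x + b, fit by gradient descent on the mean-squared-error loss. For reference, the gradients computed in _calc_gradient below come from differentiating that loss (the code drops the constant factor 2, which only rescales the learning rate):

L(w, b) = \frac{1}{n}\sum_{i=1}^{n}(w x_i + b - y_i)^2, \qquad
\frac{\partial L}{\partial w} = \frac{2}{n}\sum_{i=1}^{n}(w x_i + b - y_i)\,x_i, \qquad
\frac{\partial L}{\partial b} = \frac{2}{n}\sum_{i=1}^{n}(w x_i + b - y_i)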
import numpy as np
import matplotlib.pyplot as plt

class LinearRegression(object):
    def __init__(self, learning_rate=0.01, max_iter=100, seed=None):
        np.random.seed(seed)
        self.lr = learning_rate
        self.max_iter = max_iter
        self.w = np.random.normal(1, 0.1)
        self.b = np.random.normal(1, 0.1)
        self.loss_arr = []

    def fit(self, x, y):
        self.x = x
        self.y = y
        for i in range(self.max_iter):
            self._train_step()
            self.loss_arr.append(self.loss())
            # print('loss: \t{:.3}'.format(self.loss()))
            # print('w: \t{:.3}'.format(self.w))
            # print('b: \t{:.3}'.format(self.b))

    def _f(self, x, w, b):
        return x * w + b

    def predict(self, x=None):
        if x is None:
            x = self.x
        y_pred = self._f(x, self.w, self.b)
        return y_pred

    def loss(self, y_true=None, y_pred=None):
        if y_true is None or y_pred is None:
            y_true = self.y
            y_pred = self.predict(self.x)
        return np.mean((y_true - y_pred) ** 2)

    def _calc_gradient(self):
        # gradients of the MSE loss (up to the constant factor 2)
        d_w = np.mean((self.x * self.w + self.b - self.y) * self.x)
        d_b = np.mean(self.x * self.w + self.b - self.y)
        return d_w, d_b

    def _train_step(self):
        # one gradient-descent update of w and b
        d_w, d_b = self._calc_gradient()
        self.w = self.w - self.lr * d_w
        self.b = self.b - self.lr * d_b
        return self.w, self.b

def show_data(x, y, w=None, b=None):
    plt.scatter(x, y, marker='.')
    if w is not None and b is not None:
        plt.plot(x, w * x + b, c='red')
    plt.show()
# generate data
np.random.seed(272)
data_size = 100
x = np.random.uniform(low=1.0, high=10.0, size=data_size)
y = x * 20 + 10 + np.random.normal(loc=0.0, scale=10.0, size=data_size)
# plt.scatter(x, y, marker='.')
# plt.show()
# split into training and test sets
shuffled_index = np.random.permutation(data_size)
x = x[shuffled_index]
y = y[shuffled_index]
split_index = int(data_size * 0.7)
x_train = x[:split_index]
y_train = y[:split_index]
x_test = x[split_index:]
y_test = y[split_index:]
# visualize the data
# plt.scatter(x_train, y_train, marker='.')
# plt.show()
# plt.scatter(x_test, y_test, marker='.')
# plt.show()
# train the model
regr = LinearRegression(learning_rate=0.01, max_iter=10, seed=314)
regr.fit(x_train, y_train)
print('cost: \t{:.3}'.format(regr.loss()))
print('w: \t{:.3}'.format(regr.w))
print('b: \t{:.3}'.format(regr.b))
show_data(x, y, regr.w, regr.b)
# plot the loss curve
plt.scatter(np.arange(len(regr.loss_arr)), regr.loss_arr, marker='o', c='green')
plt.show()
Output:
cost: 1.06e+02
w: 20.8
b: 4.21
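As a quick sanity check (a small sketch, not part of the original script), np.polyfit gives the closed-form least-squares line for the same training data; with more iterations (e.g. max_iter=100) the gradient-descent estimates of w and b should approach it:

# closed-form least-squares fit for comparison
w_ref, b_ref = np.polyfit(x_train, y_train, 1)
print('polyfit w: \t{:.3}'.format(w_ref))
print('polyfit b: \t{:.3}'.format(b_ref))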
2. Various linear regressions with sklearn
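For orientation, the three estimator families minimize closely related objectives. In sklearn's parameterization (α is the alpha argument, n the number of training samples):

\text{OLS:}\quad \min_{w,b}\ \lVert y - Xw - b\rVert_2^2
\text{Lasso:}\quad \min_{w,b}\ \frac{1}{2n}\lVert y - Xw - b\rVert_2^2 + \alpha\lVert w\rVert_1
\text{Ridge:}\quad \min_{w,b}\ \lVert y - Xw - b\rVert_2^2 + \alpha\lVert w\rVert_2^2

LassoCV and RidgeCV fit the same objectives but select α from a list of candidates by cross-validation.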
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from terminaltables import AsciiTable
# font setup so that the Chinese plot labels used below render correctly
from matplotlib.font_manager import FontProperties
myfont = FontProperties(fname=r'C:\Windows\Fonts\simhei.ttf')  # raw string avoids backslash escapes in the Windows path
# generate data
np.random.seed(272)
data_size = 100
x = np.random.uniform(low=1.0, high=10.0, size=data_size)
y = x * 20 + 10 + np.random.normal(loc=0.0, scale=10.0, size=data_size)
# plt.scatter(x, y, marker='.')
# plt.show()
# split into training and test sets
shuffled_index = np.random.permutation(data_size)
x = x[shuffled_index]
y = y[shuffled_index]
split_index = int(data_size * 0.7)
x_train = x[:split_index].reshape(-1, 1)  # sklearn expects 2-D arrays of shape (n_samples, n_features)
y_train = y[:split_index].reshape(-1, 1)
x_test = x[split_index:].reshape(-1, 1)
y_test = y[split_index:].reshape(-1, 1)
# plot the data and, optionally, a fitted line
def show_data(x, y, w=None, b=None, title='', xlabel='', ylabel=''):
    plt.scatter(x, y, marker='.')
    if w is not None and b is not None:
        plt.plot(x, w * x + b, c='red')
    plt.title(title, fontproperties=myfont)
    plt.xlabel(xlabel, fontproperties=myfont)
    plt.ylabel(ylabel, fontproperties=myfont)
    plt.show()
# ordinary least-squares linear regression
def linear_regression():
    model = LinearRegression()
    model.fit(x_train, y_train)
    w = model.coef_.item()  # .item() extracts the scalar; int() would truncate the coefficient
    b = model.intercept_.item()
    show_data(x_train, y_train, w, b, title='线性回归-训练集', xlabel='x坐标', ylabel='y坐标')
    mse_train, mae_train, mse_test, mae_test = calc_metrics(model, x_train, x_test, y_train, y_test)
    metrics = {
        'model': 'linear',
        'alphas': '',
        'alpha': '',
        'best_alpha': '',
        'w': w,
        'b': b,
        'mse(训练集)': mse_train,
        'mae(训练集)': mae_train,
        'mse(测试集)': mse_test,
        'mae(测试集)': mae_test,
    }
    return metrics
# Lasso (L1-penalized) regression with a fixed alpha
def lasso_regression():
    alpha = 0.2
    model = Lasso(alpha=alpha)
    model.fit(x_train, y_train)
    w = model.coef_.item()
    b = model.intercept_.item()
    show_data(x_train, y_train, w, b, title='lasso回归-训练集', xlabel='x坐标', ylabel='y坐标')
    mse_train, mae_train, mse_test, mae_test = calc_metrics(model, x_train, x_test, y_train, y_test)
    metrics = {
        'model': 'lasso',
        'alphas': '',
        'alpha': alpha,
        'best_alpha': '',
        'w': w,
        'b': b,
        'mse(训练集)': mse_train,
        'mae(训练集)': mae_train,
        'mse(测试集)': mse_test,
        'mae(测试集)': mae_test,
    }
    return metrics
# Lasso with the alpha chosen by cross-validation
def lassoCV_regression():
    alphas = [i / 10 for i in range(1, 5)]  # alpha must be positive for Lasso, so start at 0.1
    model = LassoCV(alphas=alphas)
    model.fit(x_train, y_train.ravel())  # LassoCV expects a 1-d target
    w = model.coef_.item()
    b = model.intercept_.item()
    mse_train, mae_train, mse_test, mae_test = calc_metrics(model, x_train, x_test, y_train, y_test)
    show_data(x_train, y_train, w, b, title='lassoCV回归-训练集', xlabel='x坐标', ylabel='y坐标')
    metrics = {
        'model': 'lassoCV',
        'alphas': str(alphas),
        'alpha': '',
        'best_alpha': model.alpha_,
        'w': w,
        'b': b,
        'mse(训练集)': mse_train,
        'mae(训练集)': mae_train,
        'mse(测试集)': mse_test,
        'mae(测试集)': mae_test,
    }
    return metrics
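LassoCV also records the cross-validation error for every candidate alpha, which shows how clear-cut the selection was. A minimal sketch, assuming the x_train/y_train defined above (mse_path_ holds one row per alpha and one column per fold, with alphas_ sorted in decreasing order):

# refit a LassoCV and inspect the mean CV error per candidate alpha
cv_model = LassoCV(alphas=[0.1, 0.2, 0.3, 0.4])
cv_model.fit(x_train, y_train.ravel())
for a, mse in zip(cv_model.alphas_, cv_model.mse_path_.mean(axis=1)):
    print('alpha={:.1f}  mean CV MSE={:.3f}'.format(a, mse))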
# Ridge (L2-penalized) regression with a fixed alpha
def ridge_regression():
    alpha = 0.3
    model = Ridge(alpha=alpha)
    model.fit(x_train, y_train)
    w = model.coef_.item()
    b = model.intercept_.item()
    show_data(x_train, y_train, w, b, title='ridge回归-训练集', xlabel='x坐标', ylabel='y坐标')
    mse_train, mae_train, mse_test, mae_test = calc_metrics(model, x_train, x_test, y_train, y_test)
    metrics = {
        'model': 'ridge',
        'alphas': '',
        'alpha': alpha,
        'best_alpha': '',
        'w': w,
        'b': b,
        'mse(训练集)': mse_train,
        'mae(训练集)': mae_train,
        'mse(测试集)': mse_test,
        'mae(测试集)': mae_test,
    }
    return metrics
# Ridge with the alpha chosen by (leave-one-out) cross-validation
def ridgeCV_regression():
    alphas = [i / 10 for i in range(1, 5)]
    model = RidgeCV(alphas=alphas, store_cv_values=True)  # parameter renamed store_cv_results in sklearn >= 1.5
    model.fit(x_train, y_train)
    w = model.coef_.item()
    b = model.intercept_.item()
    mse_train, mae_train, mse_test, mae_test = calc_metrics(model, x_train, x_test, y_train, y_test)
    show_data(x_train, y_train, w, b, title='ridgeCV回归-训练集', xlabel='x坐标', ylabel='y坐标')
    metrics = {
        'model': 'ridgeCV',
        'alphas': str(alphas),
        'alpha': '',
        'best_alpha': model.alpha_,
        'w': w,
        'b': b,
        'mse(训练集)': mse_train,
        'mae(训练集)': mae_train,
        'mse(测试集)': mse_test,
        'mae(测试集)': mae_test,
    }
    return metrics
# format all collected metrics as an ASCII table
def show_metrics(metrics_info_list):
    metric_str = ''
    metric_table = [['model', 'alphas', 'alpha', 'best_alpha', 'w', 'b',
                     'mse(训练集)', 'mae(训练集)', 'mse(测试集)', 'mae(测试集)']]
    metric_list = metric_table[0]
    for info in metrics_info_list:
        row_metrics = ['%.20s' % info.get(key, None) for key in metric_list]  # truncate each cell to 20 chars
        metric_table += [row_metrics]
    metric_str += AsciiTable(metric_table).table
    return metric_str
# compute train/test MSE and MAE for a fitted model
def calc_metrics(model, x_train, x_test, y_train, y_test):
    y_pred_train = model.predict(x_train)
    mse_train = mean_squared_error(y_train, y_pred_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    y_pred_test = model.predict(x_test)
    mse_test = mean_squared_error(y_test, y_pred_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    return mse_train, mae_train, mse_test, mae_test
metrics_1 = linear_regression()
metrics_2 = lasso_regression()
metrics_3 = lassoCV_regression()
metrics_4 = ridge_regression()
metrics_5 = ridgeCV_regression()
metrics_info_list = [metrics_1, metrics_2, metrics_3, metrics_4, metrics_5]
metric_str = show_metrics(metrics_info_list)
print(metric_str)
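Since every model above fits a line y = w·x + b, a small follow-up sketch can overlay the five fitted lines on the test set for a visual comparison (it reuses the w/b values collected in metrics_info_list):

# overlay each model's fitted line on the test data
for m in metrics_info_list:
    plt.plot(x_test, m['w'] * x_test + m['b'], label=m['model'])
plt.scatter(x_test, y_test, marker='.')
plt.legend()
plt.show()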
Output: (metrics table omitted)
Figure output: (per-model training-set plots omitted)