5、LASSO模型选择:交叉验证-AIC-BIC
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
from sklearn import datasets
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 这是为了在执行np.log10时避免被零除
EPSILON = 1e-4
X, y = datasets.load_diabetes(return_X_y=True)
rng = np.random.RandomState(42)
X = np.c_[X, rng.randn(X.shape[0], 14)] # 增加一些不好的特性
# 按照Lars的方法对数据进行规范化,以便进行比较
X /= np.sqrt(np.sum(X ** 2, axis=0))
# LassoLarsIC: 基于BIC/AIC准则的最小角回归
model_bic = LassoLarsIC(criterion='bic')
t1 = time.time()
model_bic.fit(X, y)
t_bic = time.time() - t1
alpha_bic_ = model_bic.alpha_
model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X, y)
alpha_aic_ = model_aic.alpha_
def plot_ic_criterion(model, name, color):
criterion_ = model.criterion_
plt.semilogx(model.alphas_ + EPSILON, criterion_, '--', color=color,
linewidth=3, label='%s criterion' % name)
plt.axvline(model.alpha_ + EPSILON, color=color, linewidth=3,
label='alpha: %s estimate' % name)
plt.xlabel(r'$\alpha$')
plt.ylabel('criterion')
plt.figure()
plot_ic_criterion(model_aic, 'AIC', 'b')
plot_ic_criterion(model_bic, 'BIC', 'r')
plt.legend()
plt.title('信息-模型选择的标准 (训练时间: %.3fs)'
% t_bic)
# LassoCV: 坐标下降
# 计算路径
print("Computing regularization path using the coordinate descent lasso...")
t1 = time.time()
model = LassoCV(cv=20).fit(X, y)
t_lasso_cv = time.time() - t1
# 显示结果
plt.figure()
ymin, ymax = 2300, 3800
plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ':')
plt.plot(model.alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(model.alpha_ + EPSILON, linestyle='--', color='k',
label='alpha: CV estimate')
plt.legend()
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.title('每个折叠上的均方误差:坐标下降'
'(训练时间 : %.2fs)' % t_lasso_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
# LassoLarsCV:最小角回归
# 计算路径
print("Computing regularization path using the Lars lasso...")
t1 = time.time()
model = LassoLarsCV(cv=20).fit(X, y)
t_lasso_lars_cv = time.time() - t1
# 显示结果
plt.figure()
plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ':')
plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(model.alpha_, linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel(r'$\alpha$')
plt.ylabel('Mean square error')
plt.title('每折均方误差: Lars (训练时间 : %.2fs)'
% t_lasso_lars_cv)
plt.axis('tight')
plt.ylim(ymin, ymax)
plt.show()