一,EDA
1,观察目标变量
# Visualize the distribution of the raw target variable (SalePrice).
sns.set_style("white")
fig, ax = plt.subplots(figsize=(8, 7))
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — consider histplot/displot.
sns.distplot(train['SalePrice'], color="b")
ax.xaxis.grid(False)
ax.set(ylabel="Frequency", xlabel="SalePrice", title="SalePrice distribution")
sns.despine(trim=True, left=True)  # remove the surrounding spines for a cleaner look
plt.show()
2,求与目标变量的相关系数
# Rank every numeric feature by its correlation with the target.
target_corr = train.corr()["SalePrice"]
corr_with_SalePrice = target_corr.sort_values(ascending=False)
plt.figure(figsize=(20, 6))
# Drop the trivial self-correlation (SalePrice vs. itself == 1) before plotting.
corr_with_SalePrice.drop("SalePrice").plot.bar()
plt.show()
3,箱线图(离散)观察异常值,一般为与目标变量相关度较高的变量
# Box plot of SalePrice grouped by OverallQual (a discrete feature) to spot outliers.
data = pd.concat([train['SalePrice'], train['OverallQual']], axis=1)
# BUG FIX: original read `plt.subplots(figsize(8,6))` — `figsize` is a keyword
# argument, not a function, so that line raised NameError.
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=train['OverallQual'], y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000)
散点图(连续)观察异常值
# Scatter plot of SalePrice vs. TotalBsmtSF (a continuous feature) to spot outliers.
data = pd.concat([train['SalePrice'], train['TotalBsmtSF']], axis=1)
data.plot.scatter(
    x='TotalBsmtSF',
    y='SalePrice',
    alpha=0.3,          # translucency makes dense regions visible
    ylim=(0, 800000),
)
二,特征处理
1,使目标变量趋于正态分布
# Log-transform the target with log(1 + x) to reduce right skew toward normality.
# Remember to invert with np.expm1 on the final predictions.
train["SalePrice"]=np.log1p(train["SalePrice"])
获取正态分布拟合后的均值和标准差
from scipy.stats import skew, norm

# Fit a Gaussian to the (log-transformed) target and report its parameters.
mu, sigma = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
2,去除异常值
# Remove outliers found during EDA.
# Low-quality houses that nevertheless sold very high:
mask = (train['OverallQual'] < 5) & (train['SalePrice'] > 200000)
train.drop(train[mask].index, inplace=True)
# Very large houses that sold unusually low (the second mask is built
# AFTER the first drop so its index aligns with the filtered frame):
mask = (train['GrLivArea'] > 4500) & (train['SalePrice'] < 300000)
train.drop(train[mask].index, inplace=True)
3,缺失值
def percent_missing(df):
    """Return the percentage of missing values per column.

    Parameters
    ----------
    df : DataFrame-like
        Anything accepted by the ``pd.DataFrame`` constructor.

    Returns
    -------
    dict
        Maps each column name to its percentage of NaN values,
        rounded to two decimal places.
    """
    data = pd.DataFrame(df)
    # isnull().mean() gives the fraction of NaNs per column; scale to percent.
    return {col: round(data[col].isnull().mean() * 100, 2) for col in data.columns}
# Rank all features by percentage of missing values, most-missing first.
missing = percent_missing(all_features)
df_miss = sorted(missing.items(), key=lambda item: item[1], reverse=True)
4,处理偏度较大的变量
# Compute skewness for every numeric feature and flag the strongly skewed ones.
skew_features = all_features[numeric].apply(skew).sort_values(ascending=False)
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
print("There are {} numerical features with Skew > 0.5 :".format(len(high_skew)))
skewness = pd.DataFrame({'Skew': high_skew})  # tabular view of the flagged features
# NOTE(review): bare expression below only displays output in a notebook cell.
skew_features.head(10)
from scipy.special import boxcox1p
# FIX: boxcox_normmax was used below but never imported (L30 brings in only
# skew and norm), which would raise NameError at runtime.
from scipy.stats import boxcox_normmax

# Box-Cox(1 + x) transform each highly skewed feature; the per-column lambda
# is estimated by boxcox_normmax on the shifted (strictly positive) values.
for i in skew_index:
    all_features[i] = boxcox1p(all_features[i], boxcox_normmax(all_features[i] + 1))
5,创建新的特征
比如组合特征,log特征,平方特征
6,非数值特征转码
# One-hot encode all remaining categorical columns; reset to a clean 0..n-1 index.
all_features = pd.get_dummies(all_features).reset_index(drop=True)
三,模型
# Fixed, shuffled 12-fold split shared by all cross-validation runs.
kf = KFold(n_splits=12, random_state=42, shuffle=True)

def cv_rmse(model, X=X):
    """Return the per-fold cross-validated RMSE of *model* on X.

    Uses the module-level ``train_labels`` as the target and the shared
    12-fold splitter ``kf``. sklearn reports negated MSE, hence the sign flip.
    """
    neg_mse = cross_val_score(model, X, train_labels,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)
建立多个模型,进行模型融合
from mlxtend.regressor import StackingCVRegressor

# Stacked ensemble: the base learners' out-of-fold predictions feed an XGBoost
# meta-learner, which also sees the original features (use_features_in_secondary).
base_learners = (xgboost, lightgbm, svr, ridge, gbr, rf)
stack_gen = StackingCVRegressor(
    regressors=base_learners,
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)
def blended_predictions(X):
    """Return a weighted blend of all fitted models' predictions on X.

    Weights were hand-tuned and sum to 1.0; the stacked ensemble gets the
    largest share. The stacking regressor requires a plain ndarray input.
    """
    weighted_terms = (
        0.1 * ridge_model_full_data.predict(X),
        0.2 * svr_model_full_data.predict(X),
        0.1 * gbr_model_full_data.predict(X),
        0.1 * xgb_model_full_data.predict(X),
        0.1 * lgb_model_full_data.predict(X),
        0.05 * rf_model_full_data.predict(X),
        0.35 * stack_gen_model.predict(np.array(X)),
    )
    # sum() adds left-to-right, matching the original chained additions.
    return sum(weighted_terms)
最后输出结果记得np.expm1