1. Problem Description
Predict house sale prices from a given set of more than 80 features.
2. General Approach
Data exploration
Use visualizations to look at the distribution of sale prices, the correlation between each feature and the sale price, and so on.
Data cleaning
Handle missing data: drop features that have too many missing values or that correlate poorly with the target, then fill the remaining missing values with the mode or a similar statistic.
Convert to 0/1: when a feature is of type object, convert it to numbers (e.g. dummy variables) according to its meaning.
Standardize the numeric features.
Shuffle the data and split it into training and validation sets.
Model building
Elastic Net
Elastic Net is particularly useful when several features are correlated with one another.
ElasticNet is a linear regression model that uses both L1 and L2 priors as regularizers. This combination produces a sparse model with few non-zero weights, like Lasso, while retaining the regularization properties of Ridge. The l1_ratio parameter controls the convex combination (a special kind of linear combination) of the L1 and L2 penalties.
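For reference, this is the objective that scikit-learn's ElasticNet minimizes (the library's documented formulation; alpha is the overall regularization strength):
1 / (2 * n_samples) * ||y - Xw||_2^2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2
With l1_ratio = 1 this reduces to the Lasso penalty; with l1_ratio = 0 it becomes a pure L2 (Ridge-style) penalty.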
GBDT (Gradient Boosting Decision Tree), an iterative decision-tree model
There are currently two different descriptions of GBDT, each with its own supporters, so take care to distinguish them when reading the literature. The residual version describes GBDT as a residual-iteration tree: each new regression tree learns the residual left by the previous N-1 trees. The gradient version describes GBDT as a gradient-iteration tree solved with gradient descent: each new regression tree fits the gradient-descent step computed from the previous N-1 trees.
For the first version, see the blog post "GBDT(MART) 迭代决策树入门教程 | 简介".
For the second version, see the blog post "GBDT(Gradient Boosting Decision Tree) 没有实现只有原理".
Code:
Visualization.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#explore the dataset
sns.set(style="whitegrid",color_codes=True)
sns.set(font_scale=1)
houses=pd.read_csv('sources/train.csv')
print houses.head()
houses_test=pd.read_csv('sources/test.csv')
print houses_test.head()
print "_______________________________________________"
print houses.shape
print houses_test.shape
#info method provides information about dataset like
#total values in each column, null/not null, datatype, memory occupied etc
print "_______________________________________________"
print houses.info()
#Describe gives statistical information about numerical columns in the dataset
print "_______________________________________________"
print houses.describe()
#How many columns with different datatypes are there?
print "_______________________________________________"
print houses.get_dtype_counts()
#correlation with SalePrice
corr=houses.corr()["SalePrice"]
print "_______________________________________________"
print corr[np.argsort(corr, axis=0)[::-1]]
# plotting correlations
num_feat = houses.columns[houses.dtypes != object]
num_feat = num_feat[1:-1]
labels = []
values = []
for col in num_feat:
    labels.append(col)
    values.append(np.corrcoef(houses[col].values, houses.SalePrice.values)[0, 1])
ind = np.arange(len(labels))
width = 0.9
fig, ax = plt.subplots(figsize=(12, 40))
rects = ax.barh(ind, np.array(values), color='red')
ax.set_yticks(ind + ((width) / 2.))
ax.set_yticklabels(labels, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation Coefficients w.r.t Sale Price");
plt.show()
#Check multicollinearity: correlations among the predictor variables themselves
#Multicollinearity increases the standard errors of the coefficients.
# That means, multicollinearity makes some variables statistically insignificant
# when they should be significant.
# To avoid this we can do 3 things:
# Completely remove those variables
# Make new feature by adding them or by some other operation.
# Use PCA, which will reduce feature set to small number of non-collinear features.
correlations=houses.corr()
attrs = correlations.iloc[:-1,:-1] # all except target
threshold = 0.5
important_corrs = (attrs[abs(attrs) > threshold][attrs != 1.0]) \
    .unstack().dropna().to_dict()
unique_important_corrs = pd.DataFrame(
    list(set([(tuple(sorted(key)), important_corrs[key])
              for key in important_corrs])),
    columns=['Attribute Pair', 'Correlation'])
# sorted by absolute value
unique_important_corrs = unique_important_corrs.ix[
    abs(unique_important_corrs['Correlation']).argsort()[::-1]]
print "_______________________________________________"
print unique_important_corrs
#Heatmap
corrMatrix = houses[["SalePrice", "OverallQual", "GrLivArea", "GarageCars",
                     "GarageArea", "GarageYrBlt", "TotalBsmtSF", "1stFlrSF", "FullBath",
                     "TotRmsAbvGrd", "YearBuilt", "YearRemodAdd"]].corr()
sns.set(font_scale=1.10)
plt.figure(figsize=(10,10))
sns.heatmap(corrMatrix,vmax=.8,linewidths=0.01, square=True,annot=True,cmap='viridis',linecolor="white")
plt.title('Correlation between features')
plt.show()
dataCleaning.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn import ensemble, tree, linear_model
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
houses=pd.read_csv('sources/train.csv')
houses_test=pd.read_csv('sources/test.csv')
#Helper functions
# Prints R2 and RMSE scores
def get_score(prediction, labels):
    print('R2: {}'.format(r2_score(prediction, labels)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, labels))))
# Shows scores for train and validation sets
def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # Printing train scores
    get_score(prediction_train, y_trn)
    prediction_test = estimator.predict(x_tst)
    # Printing test scores
    print("Test")
    get_score(prediction_test, y_tst)
# checking for missing data
NAs=pd.concat([houses.isnull().sum(),houses_test.isnull().sum()],axis=1,keys=['Train','Test'])
print NAs[NAs.sum(axis=1)>0]
#splitting into features and labels and deleting variables I don't need
train_labels=houses.pop('SalePrice')
features=pd.concat([houses,houses_test],keys=['train','test'])
#get rid of features that have more than half of missing information or do not correlate to SalePrice
features.drop(
    ['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF',
     'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF',
     'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature',
     'MiscVal'],
    axis=1, inplace=True)
#Filling NAs and converting features
# MSSubClass as str
features['MSSubClass'] = features['MSSubClass'].astype(str)
# MSZoning NA in pred. filling with most popular values
features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0])
# LotFrontage NA in all. Filling with the mean value
features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean())
# Alley NA in all. NA means no access
features['Alley'] = features['Alley'].fillna('NOACCESS')
# Converting OverallCond to str
features.OverallCond = features.OverallCond.astype(str)
# MasVnrType NA in all. filling with most popular values
features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0])
# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2
# NA in all. NA means No basement
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('NoBSMT')
# TotalBsmtSF NA in pred. I suppose NA means 0
features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0)
# Electrical NA in pred. filling with most popular values
features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0])
# KitchenAbvGr to categorical
features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str)
# KitchenQual NA in pred. filling with most popular values
features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0])
# FireplaceQu NA in all. NA means No Fireplace
features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP')
# GarageType, GarageFinish, GarageQual NA in all. NA means No Garage
for col in ('GarageType', 'GarageFinish', 'GarageQual'):
    features[col] = features[col].fillna('NoGRG')
# GarageCars NA in pred. I suppose NA means 0
features['GarageCars'] = features['GarageCars'].fillna(0.0)
# SaleType NA in pred. filling with most popular values
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
# Year and Month to categorical
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)
# Adding total sqfootage feature and removing Basement, 1st and 2nd floor features
features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
#Log transformation of the target; plot the distribution of (log) sale prices
train_labels=np.log(train_labels)
sns.distplot(train_labels)
# Standardizing numeric data
numeric_features = features.loc[:,['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
numeric_features_standardized = (numeric_features - numeric_features.mean())/numeric_features.std()
ax = sns.pairplot(numeric_features_standardized)
#Converting categorical data to dummies
# Getting Dummies from Condition1 and Condition2
conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
                       index=features.index, columns=conditions)
for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
    dummies.ix[i, cond] = 1
features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
# Getting Dummies from Exterior1st and Exterior2nd
exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
                       index=features.index, columns=exteriors)
for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
    dummies.ix[i, ext] = 1
features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
# Getting Dummies from all other categorical vars
for col in features.dtypes[features.dtypes == 'object'].index:
    for_dummy = features.pop(col)
    features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
#Obtaining standardized dataset
### Copying features
features_standardized = features.copy()
### Replacing numeric features by standardized values
features_standardized.update(numeric_features_standardized)
#Splitting train and test features
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
### Splitting standardized features
train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
#Splitting to train and validation sets
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels)
### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
plt.show()
ElasticNet.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from dataCleaning import *
#Fit an Elastic Net (with cross-validated alpha and l1_ratio) on the standardized numeric features
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)  # prints R2 and RMSE (root mean squared error)
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
GradientBoosting.py
from dataCleaning import *
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                           min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
train_test(GBest, x_train, x_test, y_train, y_test)
#max_features='sqrt' to reduce overfitting of the model.
#loss='huber' because it is more tolerant to outliers
# All other hyper-parameters were chosen using GridSearchCV
# Average R2 score and standard deviation of 5-fold cross-validation
scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Ensembling.py
from GradientBoosting import GBest
from ElasticNet import ENSTest
from dataCleaning import *
from LR import clf_lr
#The final ensemble is an average of the Gradient Boosting and Elastic Net predictions
#(the linear model imported from LR is fitted below but not used in the final blend)
# Retraining models
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)
LR_model=clf_lr.fit(train_features_st, train_labels)
test=pd.read_csv('sources/test.csv')
## Getting our SalePrice estimation
Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))) / 2
## Saving to CSV
pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('result.csv', index =False)
Further notes
GBDT
How it works: http://blog.csdn.net/w28971023/article/details/8240756
The core of GBDT is that each tree learns the residual of the sum of the conclusions of all previous trees; this residual is the amount that, added to the current prediction, gives the true value. For example, suppose A's true age is 18 but the first tree predicts 12, which is 6 years off, so the residual is 6. In the second tree we therefore set A's age to 6 and learn on that; if the second tree really does put A into the 6-year-old leaf, accumulating the conclusions of the two trees gives A's true age. If the second tree instead concludes 5, A still has a residual of 1, so in the third tree A's age becomes 1, and learning continues. That is what Gradient Boosting means inside GBDT. Simple enough.
The biggest benefit of boosting is that computing the residual at each step effectively increases the weight of the instances that are still predicted wrongly, while the residuals of the instances that are already predicted correctly tend towards 0. Later trees can therefore focus more and more on the instances the earlier trees got wrong.
Applicability: this version of GBDT can be used for almost all regression problems (linear or non-linear); whereas logistic regression is limited to linear relationships, GBDT applies far more broadly. It can also be used for binary classification by setting a threshold: outputs above the threshold are positive examples, the rest negative.
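Below is a minimal sketch of this residual version under squared loss, using scikit-learn's DecisionTreeRegressor as the base learner. The function names and hyper-parameters here are illustrative only and are not taken from the project code above.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def gbdt_fit(X, y, n_trees=100, learning_rate=0.1, max_depth=3):
    """Squared-loss boosting: every new tree fits the residual left by the previous trees."""
    init = y.mean()                        # start from the mean prediction
    pred = np.full(len(y), init)
    trees = []
    for _ in range(n_trees):
        residual = y - pred                # what the ensemble has not explained yet
        tree = DecisionTreeRegressor(max_depth=max_depth).fit(X, residual)
        pred += learning_rate * tree.predict(X)
        trees.append(tree)
    return init, trees

def gbdt_predict(X, init, trees, learning_rate=0.1):
    # learning_rate must match the value used in gbdt_fit
    pred = np.full(X.shape[0], init)
    for tree in trees:
        pred += learning_rate * tree.predict(X)
    return pred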
Parameter tuning: http://www.cnblogs.com/pinard/p/6143927.html
The hyper-parameters are mainly determined step by step with a GridSearchCV grid search, as in the sketch below.
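A minimal sketch of such a search for the GradientBoostingRegressor used above, reusing the training arrays from dataCleaning.py. The parameter grid is illustrative only, not the grid that was actually used.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection.GridSearchCV in newer versions
from dataCleaning import *

# Illustrative grid; in practice tune one or two parameters at a time
# (and consider a smaller n_estimators while searching) to keep the search cheap
param_grid = {
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [5, 10, 15],
    'min_samples_split': [10, 20],
}
search = GridSearchCV(
    GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                              max_features='sqrt', loss='huber'),
    param_grid, cv=5)
search.fit(x_train, y_train)
print(search.best_params_)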