2019-01-15

Scrapy Crawling and Machine Learning, Part 3: Predicting a Home's Listing Price

Building on the 2,453 second-hand listings previously scraped from a property agency for one district, this article uses machine-learning regression models (starting from linear regression) to predict the listing price of a friend's apartment. Comparing several models on the mean absolute error (MAE) metric, the best performer was GradientBoostingRegressor, with MAE = 1837 yuan/m². Feeding the friend's apartment details into this model gives a predicted listing price of 75064.67 yuan/m², roughly 10% below last year's market listing price.
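A note on the metric: the headline "MAE" is the mean absolute error, but the notebook below actually reports sklearn's median_absolute_error under that label. A minimal sketch of the difference, on toy arrays:

import numpy as np
from sklearn.metrics import mean_absolute_error, median_absolute_error
y_true = np.array([70000, 72000, 68000])
y_pred = np.array([71000, 70500, 69000])
print(mean_absolute_error(y_true, y_pred))    # mean of the absolute errors
print(median_absolute_error(y_true, y_pred))  # median of the absolute errors, robust to outliers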


Code:


#import some necessary libraries

import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt  # Matlab-style plotting

import seaborn as sns

color = sns.color_palette()

sns.set_style('darkgrid')

import warnings

def ignore_warn(*args, **kwargs):

    pass

warnings.warn = ignore_warn #ignore annoying warnings (from sklearn and seaborn)

from scipy import stats

from scipy.stats import norm, skew #for some statistics

pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points

from subprocess import check_output

#read in the data file

dftrain = pd.read_csv(r'C:\Users\Guoli\Desktop\scrapyfolder\new_env\ajk\ajk0115.csv',encoding="gbk")

#drop the total-price column

dftrain=dftrain.drop('totalprice',axis=1)

dftrain.head()

print("keys of df dataset:\n{}".format(dftrain.keys()))

from sklearn.model_selection import train_test_split

print("type of data:{}".format(type(dftrain['unitprice'])))

fig, ax = plt.subplots()

ax.scatter(x = dftrain['floorsize'], y = dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()

#drop outliers: floor size above 400 m²

dftrain = dftrain.drop(dftrain[(dftrain['floorsize']>400)].index)

#Check the graphic again

fig, ax = plt.subplots()

ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()

#drop rows with unit price above 100,000 yuan/m²

dftrain = dftrain.drop(dftrain[(dftrain['unitprice']>100000)].index)

#plot the relationship between floor size and unit price

#Check the graphic again

fig, ax = plt.subplots()

ax.scatter(dftrain['floorsize'], dftrain['unitprice'])

plt.ylabel('unitprice', fontsize=13)

plt.xlabel('floorsize', fontsize=13)

plt.show()


#check the distribution of the unit price


sns.distplot(dftrain['unitprice'] , fit=norm);

# Get the fitted parameters used by the function

(mu, sigma) = norm.fit(dftrain['unitprice'])

print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution

plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],

            loc='best')

plt.ylabel('Frequency')

plt.title('unitprice distribution')

#Get also the QQ-plot

fig = plt.figure()

res = stats.probplot(dftrain['unitprice'], plot=plt)

plt.show()
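The norm and skew imports suggest checking whether the target needs a transform; if unitprice were strongly right-skewed, a log1p transform would be the usual next step (a sketch, not a step the author took):

#only worth applying if the skew is large
print("skew: {:.3f}".format(skew(dftrain['unitprice'])))
unitprice_log = np.log1p(dftrain['unitprice'])
sns.distplot(unitprice_log, fit=norm)
plt.show()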

#correlation coefficients

#Correlation map to see how features are correlated with unitprice

corrmat = dftrain.corr()

plt.subplots(figsize=(12,9))

sns.heatmap(corrmat, vmax=0.9, square=True)

dftrain.corr()#correlations of features

#one-hot encode the categorical features

dftraindummy = pd.get_dummies(dftrain)

#bin floorsize at the 10th/50th/90th percentiles (quantile binning)

bins = [0,0.1,0.5,0.9,1]

groupnames = ['floorsize1','floorsize2','floorsize3','floorsize4']

floorsize_binned = pd.get_dummies(pd.qcut(dftraindummy['floorsize'],bins,labels=groupnames))

dftrainbined = pd.concat([dftraindummy,floorsize_binned],axis=1)
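For context, those bin edges are quantiles, so pd.qcut cuts floorsize at the 10th, 50th and 90th percentiles; a small sketch on toy data shows the resulting group sizes:

s = pd.Series([50, 60, 70, 80, 90, 100, 120, 150, 200, 300])
print(pd.qcut(s, [0, 0.1, 0.5, 0.9, 1], labels=['q1', 'q2', 'q3', 'q4']).value_counts())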

#Correlation map to see how features are correlated with unitprice

corrmat = dftrainbined.corr()

plt.subplots(figsize=(12,9))

sns.heatmap(corrmat, vmax=0.9, square=True)

#build the feature matrix X and target y (the train/test split follows below)

X = dftrainbined.iloc[:,dftrainbined.columns!='unitprice']

y = dftrainbined['unitprice'].values

#predictive modeling

import numpy as np

import matplotlib.pyplot as plt

from sklearn import ensemble

from sklearn.utils import shuffle

from sklearn.metrics import mean_squared_error

#linear regression model

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)#test score is 0.79 with random_state=42

lr = LinearRegression().fit(X_train, y_train)

# print("Training set score:{:.2f}".format(lr.score(X_train,y_train)))

# print('Test train score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print("Train set score:{:.2f}".format(lr.score(X_train,y_train)))

print("Test set score:{:.2f}".format(lr.score(X_test,y_test)))

mse = mean_squared_error(y_test, lr.predict(X_test))

print("MSE: %.4f" % mse)

#Ridge model

from sklearn.linear_model import Ridge

ridge = Ridge().fit(X_train, y_train)#alpha=1.0 by default

print("Ridge Training set score:{:.2f}".format(ridge.score(X_train, y_train)))

print("Ridge Test set score:{:.2f}".format(ridge.score(X_test, y_test)))

from sklearn.model_selection import cross_val_score

#tune Ridge via grid search with cross-validation

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Ridge

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)#test score is 0.79 with random_state=42

ridge = Ridge()

param_grid={'alpha':[0.001,0.01,0.1,1]}

grid_search = GridSearchCV(ridge, param_grid, cv=5)

grid_search.fit(X_train,y_train)

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

print("Best parameters:{}".format(grid_search.best_params_))

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

#Lasso model

# L1 regularization, some coefficients are exactly zero.

from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=1000000)

from sklearn.model_selection import GridSearchCV

param_grid={'alpha':[0.001,0.01,0.1,1]}

grid_search = GridSearchCV(lasso, param_grid,cv=5)

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

grid_search.fit(X_train,y_train)

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

print("Best parameters:{}".format(grid_search.best_params_))

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Best cross-validation score:{:.2f}'.format(grid_search.best_score_))

lasso=grid_search.fit(X_train,y_train)

actuals = y_test

predictions = lasso.predict(X_test)

from sklearn.metrics import median_absolute_error

maelasso = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maelasso)#median absolute error

#Lasso zeroes out some features and its score is below Ridge's; let's try a combination of the two penalties

#ElasticNet model

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC

ENet = ElasticNet(l1_ratio=.9,max_iter=1000000, random_state=42)

param_grid={'alpha':[0.001,0.01,0.1,1]}

grid_search = GridSearchCV(ENet, param_grid,cv=5)

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

grid_search.fit(X_train,y_train)

print('Train set score:{:.2f}'.format(grid_search.score(X_train,y_train)))

print('Test set score:{:.2f}'.format(grid_search.score(X_test,y_test)))

enet=grid_search.fit(X_train,y_train)

actuals = y_test

predictions = enet.predict(X_test)

maeenet = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeenet)#median absolute error

# GradientBoostingRegressor model

from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

from sklearn.model_selection import KFold

from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5)

GBoost = GradientBoostingRegressor(n_estimators=500,learning_rate=0.1,max_depth=10, max_features='sqrt',

                                  min_samples_leaf=5, min_samples_split=5,

                                  loss='huber', random_state =42)

# param_grid={'n_estimators':[100,500,1000,2000,30000]}

# grid_search = GridSearchCV(GBoost, param_grid,cv=5)

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42)

GBoost.fit(X_train,y_train)

print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(GBoost,X_train,y_train,cv=kfold))))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(GBoost,X_test,y_test,cv=kfold))))

mse = mean_squared_error(y_test, GBoost.predict(X_test))

print("MSE: %.4f" % mse)

from sklearn.metrics import mean_squared_log_error

msle = mean_squared_log_error(y_test, GBoost.predict(X_test))

print("MSLE: %.4f" % msle)

from sklearn.metrics import median_absolute_error

mae = median_absolute_error(y_test, GBoost.predict(X_test))

print("MAE: %.4f" % mae)#平均绝对差

# GradientBoostingRegressor hyperparameter tuning

After tuning, the GradientBoostingRegressor reaches MAE = 1795.62 yuan/m², the best result among all the models tried.

params = {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 5,

          'learning_rate': 0.1, 'loss': 'ls'}

clfr = ensemble.GradientBoostingRegressor(**params)

clfr.fit(X_train, y_train)

print("cross-validation train scores:\n{}".format(np.mean(cross_val_score(clfr,X_train,y_train,cv=kfold))))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clfr,X_test,y_test,cv=kfold))))

mse = mean_squared_error(y_test, clfr.predict(X_test))

print("MSE: %.4f" % mse)

mae = median_absolute_error(y_test, clfr.predict(X_test))

print("MAE: %.4f" % mae)#平均绝对差

# compute test set deviance

test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clfr.staged_predict(X_test)):

    test_score[i] = clfr.loss_(y_test, y_pred)

plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)

plt.title('Deviance')

plt.plot(np.arange(params['n_estimators']) + 1, clfr.train_score_, 'b-',

        label='Training Set Deviance')

plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',

        label='Test Set Deviance')

plt.legend(loc='upper right')

plt.xlabel('Boosting Iterations')

plt.ylabel('Deviance')

# #############################################################################

# Plot feature importance

feature_importance = clfr.feature_importances_

# make importances relative to max importance

feature_importance = 100.0 * (feature_importance / feature_importance.max())

sorted_idx = np.argsort(feature_importance)

pos = np.arange(sorted_idx.shape[0]) + .5

plt.subplot(1, 2, 2)

plt.barh(pos, feature_importance[sorted_idx], align='center')

plt.yticks(pos, X.columns[sorted_idx])#use X's columns: dftrainbined still contains unitprice, which would misalign the labels

plt.xlabel('Relative Importance')

plt.title('Variable Importance')

plt.show()

# <h1>预测房屋的拟挂牌价格76293元/平方米,误差在1795.62元/平方米</h1>

# In[259]:

import numpy as np

X2 =np.array([3,2004,155,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0])#the apartment's feature values, in the same column order as X

X3=X2.reshape(1, -1)

pred_clfr=clfr.predict(X3)

print(pred_clfr)
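Hard-coding the one-hot vector is brittle; a safer (hypothetical) pattern is to build the row against X's own columns so the ordering cannot drift:

#start from an all-zero row with exactly X's columns
row = pd.DataFrame(np.zeros((1, X.shape[1])), columns=X.columns)
row.loc[0, 'floorsize'] = 155  #'floorsize' exists in X; set the remaining fields the same way
print(clfr.predict(row))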


#LightGBM model

import lightgbm as lgb

gbm1 = lgb.LGBMRegressor(objective='regression',

                        num_leaves=31,

                        learning_rate=0.05,

                        n_estimators=1000)

gbm1.fit(X_train, y_train,

        eval_set=[(X_test, y_test)],

        eval_metric='l1',

        early_stopping_rounds=5)

print('Start predicting...')

y_pred = gbm1.predict(X_test)

# eval

#print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

maelgbm = median_absolute_error(y_test, y_pred)

print("MAE: %.4f" % maelgbm)#平均绝对差

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(gbm1,X_test,y_test,cv=5))))

#XGBoost model

import pickle

import xgboost as xgb

import numpy as np

from sklearn.model_selection import KFold, train_test_split, GridSearchCV

from sklearn.metrics import confusion_matrix, mean_squared_error

from sklearn.model_selection import KFold

kf = KFold(n_splits=2, shuffle=True, random_state=42)

for train_index, test_index in kf.split(X):

    #fit on the training fold and evaluate on the held-out fold

    xgb_model = xgb.XGBRegressor().fit(X.iloc[train_index], y[train_index])

    predictions = xgb_model.predict(X.iloc[test_index])

    actuals = y[test_index]

    print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(xgb_model,X_test,y_test,cv=kfold))))

maelxgb_model = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maelxgb_model)#平均绝对差

xgb_model2 = xgb.XGBRegressor(n_estimators=1000,learning_rate=0.05,n_jobs=-1)

#early_stopping_rounds belongs in fit() together with an eval_set

clf = xgb_model2.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=5,verbose=False)

predictions = clf.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf,X_test,y_test,cv=kfold))))

maeclf = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf)#平均绝对差

xgb_model3 = xgb.XGBRegressor(n_estimators= 2000,learning_rate=0.01,max_depth=2,n_jobs=-1)#the smaller the learning rate, the higher the score

clf2 = xgb_model3.fit(X_train,y_train)

predictions = clf2.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf2,X_test,y_test,cv=kfold))))

maeclf2 = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf2)#平均绝对差

xgb_model4 = xgb.XGBRegressor(n_estimators= 1000,learning_rate=0.1,max_depth=3,gamma=0.01,n_jobs=-1)

clf3 = xgb_model4.fit(X_train,y_train)

predictions = clf3.predict(X_test)

actuals = y_test

print(mean_squared_error(actuals, predictions))

print("cross-validation test scores:\n{}".format(np.mean(cross_val_score(clf3,X_test,y_test,cv=5))))

maeclf3 = median_absolute_error(actuals, predictions)

print("MAE: %.4f" % maeclf3)#平均绝对差

#weighted blend of the GradientBoosting and LightGBM predictions for the apartment

pred_gbm1 = gbm1.predict(X3)

ensemble3 = pred_clfr*0.3 + pred_gbm1*0.7

ensemble3

#save the best-performing GBM model

from sklearn.externals import joblib

model1=clfr

filename = 'GBMclfr_finalized_model0115.sav'

joblib.dump(model1, filename)

# some time later...

# load the model from disk

# loaded_model = joblib.load(filename)

# result = loaded_model.score(X_test, Y_test)

# print(result)

# #The sklearn API models are picklable

# print("Pickling sklearn API models")

# # must open in binary format to pickle

# pickle.dump(clf3, open("XGB0918NYCTaxi.pkl", "wb"))

# # clf2 = pickle.load(open("best_boston.pkl", "rb"))

# # print(np.allclose(clf.predict(X), clf2.predict(X)))
