本课的目标是创建钻石价格的预测模型
Scatterplot Review
library(ggplot2)
ggplot(aes(x=carat,y=price),data=diamonds)+
xlim(0,quantile(diamonds$carat,0.99))+
ylim(0,quantile(diamonds$price,0.99))+
geom_point(color=I('black'),fill=I('#F79420'),shape=21)
ggplot(aes(x=carat,y=price),data=diamonds)+
scale_x_continuous(lim=c(0,quantile(diamonds$carat,0.99)))+
scale_y_continuous(lim=c(0,quantile(diamonds$price,0.99)))+
geom_point(color=I('black'),fill=I('#F79420'),shape=21)
在价格和克拉的图中,我们可以看到非线性关系(可能是指数关系或者其他)
ggpairs Function
Notes:
# install these if necessary
install.packages('GGally') #做散点图矩阵
library(GGally)
install.packages('scales')
library(scales)
install.packages('memisc') #汇总递归
library(memisc)
install.packages('lattice')
library(lattice)
install.packages('MASS') #用于各种函数
library(MASS)
install.packages('car') #重写变量代码
library(car)
install.packages('reshape')
library(reshape)
install.packages('plyr')
library(plyr)
# load the ggplot graphics package and the others
library(ggplot2)
library(GGally)
library(scales)
library(memisc)
# sample 10,000 diamonds from the data set
set.seed(20022012)
diamond_samp <- diamonds[sample(1:length(diamonds$price), 10000), ]
ggpairs(diamond_samp,
lower = list(continuous = wrap("points", shape = I('.'))),
upper = list(combo = wrap("box", outlier.shape = I('.'))))
钻石的价格和carat高度相关
钻石的重量是体积的函数,而体积是xyz,因此我们特别感兴趣的变量可能是:
克拉重量的立方根
The Demand of Diamonds
library(gridExtra)
plot1 <- ggplot(aes(x=price),data=diamonds) +
geom_histogram(fill=I("#099DD9"),binwidth=100)+
ggtitle('Price')
plot2 <- ggplot(aes(x=price),data=diamonds) +
geom_histogram(fill=I("#F79420"),binwidth=0.01)+
ggtitle('Price (log10)')+
scale_x_log10()
grid.arrange(plot1,plot2,ncol=2)
plot2中出现了两个峰值,这两个峰值符合
我们对钻石消费者特点的富人买家和穷人买家两个类别的推测:
Scatterplot Transformation
将y轴的价格转化为log10
ggplot(aes(x=carat,y=price),data=diamonds)+
geom_point()+
scale_y_continuous(trans=log10_trans())+
ggtitle("Price (log10) by Carat")
将克拉转换为立方根
cuberoot_trans = function() trans_new('cuberoot',
transform=function(x)x^(1/3),
inverse = function(x)x^3)
ggplot(aes(x=carat,y=price),data=diamonds)+
geom_point()+
scale_x_continuous(trans=cuberoot_trans(),limits=c(0.2,3),
breaks=c(0.2,0.5,1,2,3))+
scale_y_continuous(trans=log10_trans(),limits=c(350,15000),
breaks=c(350,1000,5000,10000,15000))+
ggtitle("Price (log10) by Cube-Root of Carat")
Overplotting Revisited
通过jitter和alpha,我们能在图中更好地看出关键区域数据的疏密程度
head(sort(table(diamonds$carat),decreasing=T))
head(sort(table(diamonds$price),decreasing = T))
ggplot(aes(x=carat, y=price), data = diamonds) +
geom_point(alpha=1/2,position='jitter',size=0.75) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat')
经过数据转化后的图形,看上去carat和price之间存在近乎现行的关系,但肯定还有其他因素影响钻石的价格
Price vs. Carat and Clarity
使用颜色对图形进行可视化,关系为克拉和价格,使用颜色表示不同的clarity
library(RColorBrewer)
ggplot(aes(x = carat, y = price,color=clarity), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Clarity', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Clarity')
Clarity and Price
上面生成的图中,可以看出clarity能够解释价格变化的很多方面,将carat保持常量,可以看出净度较低的钻石始终比净度较高的钻石价格低
Response:
Price vs. Carat and Cut
ggplot(aes(x = carat, y = price,color=cut), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Cut', reverse = T,
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Cut')
Cut and Price
我们在切割上看不到太多的价格差异
Price vs. Carat and Color
ggplot(aes(x = carat, y = price,color=color), data = diamonds) +
geom_point(alpha = 0.5, size = 1, position = 'jitter') +
scale_color_brewer(type = 'div',
guide = guide_legend(title = 'Color',
override.aes = list(alpha = 1, size = 2))) +
scale_x_continuous(trans = cuberoot_trans(), limits = c(0.2, 3),
breaks = c(0.2, 0.5, 1, 2, 3)) +
scale_y_continuous(trans = log10_trans(), limits = c(350, 15000),
breaks = c(350, 1000, 5000, 10000, 15000)) +
ggtitle('Price (log10) by Cube-Root of Carat and Color')
颜色能解释价格的一些变化
Linear Models in R
在R中,我们可以使用lm() function创建模型
格式:lm(y~x)
其中y是结果变量,x是解释变量
例:log(price)~carat^(1/3)
Building the Linear Model
m1 <- lm(I(log(price)) ~ I(carat^(1/3)), data = diamonds)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
我们最后的模型是
log(price)=0.415+9.144*carat^(1/3)-1.093*carat+
(..*cut+..*color+..*clarity)
Model Problems
What could be some problems when using this model?
What else should we think about when using this model?
2008-2014
-inflation
-2008 global recession
-diamond market in china heating up
-uneven recovery/price increase across different carat weight
A Bigger, Better Data Set
加载更大数据/法一:
install.packages('bitops')
install.packages('RCurl')
library(bitops)
library(RCurl)
diamondsurl = getBinaryURL("https://raw.github.com/solomonm/diamonds-data/master/BigDiamonds.Rda")
load(rawConnection(diamondsurl))
加载更大数据/法二:
load("BigDiamonds.rda")
Building a Model Using the Big Diamonds Data Set
只采用GIA认证的钻石,以及价格在1万美元一下的钻石
diamondsbig$logprice <- log(diamondsbig$price)
m1 <- lm(I(logprice) ~ I(carat^(1/3)),
data = subset(diamondsbig),cert=='GIA'&price<10000)
m2 <- update(m1, ~ . + carat)
m3 <- update(m2, ~ . + cut)
m4 <- update(m3, ~ . + color)
m5 <- update(m4, ~ . + clarity)
mtable(m1, m2, m3, m4, m5)
Predictions
由于对价格取了log10,我们需要对模型的结果取幂
thisDiamond = data.frame(carat=1.00,cut="V.Good",
color="I",clarity="VS1")
modelEstimate = predict(m5, newdata = thisDiamond,
interval="prediction", level = .95)
exp(modelEstimate)
fit
--fitted point estimate
lwr&upr
--the lower and upper bound of the 95% confidentical interval