探索两个变量

# Show the working directory and its contents so the data file can be located.
getwd()
list.files()

# Load the tab-separated data set. Both calls read the same file; the second
# assignment replaces the result of the first.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
pf <- read.delim("pseudo_facebook.tsv")

Scatterplots

library(ggplot2)

# friend_count against age, written three ways: qplot with named arguments,
# qplot with positional arguments, and the full ggplot() syntax.
qplot(x = age, y = friend_count, data = pf)
qplot(age, friend_count, data = pf)
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point()

ggplot Syntax

# Same scatterplot with the x axis limited to ages 13 through 90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)

Overplotting

当散点图中的点出现重叠时，可以用alpha参数（透明度）来缓解
geom_jitter会向图表中的点添加随机噪音（抖动）

# Jittered points drawn at 1/20 opacity, x axis limited to ages 13-90.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)

Coord_trans()

# Square-root-transformed y axis, points at 1/20 opacity, no jitter.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

# Same plot with jitter; the vertical jitter height is set to 0.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

如果要在y='sqrt'的基础上添加噪音，需要使用参数position=position_jitter(h=0)，即只在水平方向添加噪音。原因是：一些人的好友数为0，若在垂直方向添加噪音，好友数可能变成负数，而对负数取sqrt得到的是虚数（在R中会返回NaN）。

Alpha and Jitter

Notes:

# Inspect the available columns and the distribution of friendships_initiated.
names(pf)
summary(pf$friendships_initiated)

# friendships_initiated against age: 1/10 opacity, jitter with height 0,
# square-root y axis.
ggplot(pf, aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 10, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

Conditional Means

library(dplyr)

# Step-by-step version: group users by age, then compute the mean, the median
# and the number of rows of friend_count for every age, sorted by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(as.numeric(friend_count)),
  n = n()
)
pf.fc_by_age <- arrange(pf.fc_by_age, age)
head(pf.fc_by_age)

使用%>%的版本

# Piped version of the per-age summary (mean, median, row count), sorted by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  arrange(age)

head(pf.fc_by_age)

Create plots using pf.fc_by_age

# Mean friend count per age, first as points and then as a line.
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_point()
ggplot(pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
  geom_line()

将年龄和平均好友数的图添加到原来的年龄-好友数的散点图中

原图：

# Baseline scatterplot: orange points at 1/20 opacity, jitter with height 0,
# ages 13-90, square-root y axis.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

添加后的图：

# Baseline scatterplot with the mean friend count per age drawn on top.
# The summary function is supplied as `fun`: ggplot2 3.3.0 renamed the
# `fun.y` argument of stat_summary to `fun`, and a `fun.y` passed through
# geom_line() is reported as an unknown parameter on current versions.
# NOTE(review): `fun` needs ggplot2 >= 3.3.0 - confirm the installed version.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun = mean)

继续在上图中添加数据的分位数(10%,90%,中位数)

# Scatterplot with four per-age summary lines: the mean (default colour), the
# 10% and 90% quantiles (dashed blue) and the median (solid blue).
# The summary function is supplied as `fun`: ggplot2 3.3.0 renamed the
# `fun.y` argument of stat_summary to `fun`. With the old name the function is
# not forwarded on current versions, so every line would fall back to the
# default summary instead of the requested quantile.
# NOTE(review): `fun` needs ggplot2 >= 3.3.0 - confirm the installed version.
ggplot(pf, aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  )

对数据进行限制，放大数据

# Zoomed view of the scatterplot and its summary lines (mean, 10%/90%
# quantiles, median): coord_cartesian() restricts the visible window to ages
# 13-70 and friend counts 0-1000 without dropping data from the summaries.
#
# Two changes from the earlier version of this plot:
# * The summary function is supplied as `fun`; ggplot2 3.3.0 renamed the
#   `fun.y` argument of stat_summary, and the old name is not forwarded when
#   passed through geom_line() on current versions.
#   NOTE(review): `fun` needs ggplot2 >= 3.3.0 - confirm the installed version.
# * The coord_trans(y = 'sqrt') call was removed. A plot has one coordinate
#   system, so the later coord_cartesian() replaced it and it had no effect.
#   TODO(review): if a square-root axis is wanted in the zoomed view as well,
#   apply the transformation and the limits in the same coordinate system.
ggplot(pf, aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))

Correlation

# Correlation test between age and friend_count, written three ways: default
# method, method named explicitly, and via with() to avoid repeating `pf$`.
cor.test(pf$age, pf$friend_count)
cor.test(pf$age, pf$friend_count, method = "pearson")
with(pf, cor.test(age, friend_count, method = "pearson"))

Correlation on Subsets

# Same correlation test restricted to users aged 70 or younger, once with the
# default method and once with the method named explicitly.
with(subset(pf, age <= 70), cor.test(age, friend_count))
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)

Correlation Methods

单调关系的度量

# Spearman correlation test on the age <= 70 subset.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)

"likes_received"和"www_likes_received"之间的散点图

# Summaries of both like counts, then likes_received against
# www_likes_received as a plain scatterplot.
summary(pf$likes_received)
summary(pf$www_likes_received)
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point()

Strong Correlations

# Both axes are limited to the range from 0 to the 95th percentile of their
# variable, and a red linear-model fit is drawn on top.
ggplot(pf, aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")

geom_smooth(method='lm')中的lm代表linear model

Correlation between "likes_received" and "www_likes_received"

# Pearson correlation test between the two like counts.
with(pf, cor.test(likes_received, www_likes_received, method = "pearson"))

得到如此高的相关系数，是因为一个变量是另一个变量的超集

Moira on Correlation

通常会对变量做回归分析，而回归的假设之一是这些变量相互独立，所以要先研究变量之间的相关性，来决定哪些变量不应一同放入回归模型中

More Caution with Correlation


# Load the Mitchell data set (requested after attaching alr3) and open it in
# the data viewer.
library(alr3)
data(Mitchell)
View(Mitchell)

plot: Temp vs Month

# Temp against Month, once with ggplot() and once with qplot().
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point()
qplot(data = Mitchell, Month, Temp)

correlation coefficient: Temp vs Month

# Correlation test between Month and Temp, written two ways.
cor.test(Mitchell$Month, Mitchell$Temp)
with(Mitchell, cor.test(Month, Temp, method = "pearson"))

Making Sense of Data

按月将月份和温度的散点图的X轴分隔开

# Check the range of Month, then redraw the scatterplot with an x-axis break
# every 12 units from 0 up to 203.
summary(Mitchell$Month)
ggplot(Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))

拉伸该图片，使其x轴长于y轴，这时候我们发现：
图像出现了类似正弦/余弦一样的循环模式
数据的性质可以决定图像的模式

Understanding Noise: Age to Age Months

Notes:
如果选择更加精细的bins，那么图像会出现更多的噪音

# Fractional age: whole years plus (12 - dob_month) / 12.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

Age with Months Means

# Mean, median and row count of friend_count for every age_with_months value,
# ungrouped and sorted by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)

不用%>%的另一种方法

# Step-by-step version of the summary by age_with_months; the output columns
# here are named fc_mean, fc_median and n.
age_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months1 <- summarise(
  age_month_groups,
  fc_mean = mean(friend_count),
  fc_median = median(as.numeric(friend_count)),
  n = n()
)
pf.fc_by_age_months1 <- arrange(pf.fc_by_age_months1, age_with_months)
head(pf.fc_by_age_months1)

Noise in Conditional Means

# Mean friend count by age_with_months, for values below 71.
ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

Smoothing Conditional Means

年龄-好友数图：

# p1: mean friend count by whole-year age, ages below 71.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line()

月龄-好友数图

# p2: mean friend count by age_with_months, values below 71.
p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# Stack p1 and p2 in a single column for comparison.
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)

将年龄在5的倍数内的用户混在一起计算平均好友数,并与之前的两个图放置在一起

# p3: mean friend count with age rounded to the nearest multiple of 5, for
# ages below 71, stacked under p1 and p2.
# The summary function is supplied as `fun`: ggplot2 3.3.0 renamed the
# `fun.y` argument of stat_summary to `fun`, and a `fun.y` passed through
# geom_line() is reported as an unknown parameter on current versions.
# NOTE(review): `fun` needs ggplot2 >= 3.3.0 - confirm the installed version.
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)
grid.arrange(p1, p2, p3, ncol = 1)

用平滑函数进行拟合

# Rebuild p1 and p2 with a geom_smooth() layer on top of the line, rebuild p3
# (5-year bins, no smoother), and stack all three in one column.
p1 <- ggplot(
  subset(pf.fc_by_age, age < 71),
  aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

p2 <- ggplot(
  subset(pf.fc_by_age_months, age_with_months < 71),
  aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# The summary function is supplied as `fun`: ggplot2 3.3.0 renamed the
# `fun.y` argument of stat_summary to `fun`, and a `fun.y` passed through
# geom_line() is reported as an unknown parameter on current versions.
# NOTE(review): `fun` needs ggplot2 >= 3.3.0 - confirm the installed version.
p3 <- ggplot(
  subset(pf, age < 71),
  aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)
grid.arrange(p1, p2, p3, ncol = 1)

习题集
1.价格与x

# Load the diamonds data set, open it in the viewer, and plot price against x
# with points at 1/10 opacity.
data(diamonds)
View(diamonds)
ggplot(diamonds, aes(x = x, y = price)) +
  geom_point(alpha = 1 / 10)

2.相关系数

# Correlation tests between price and each of the x, y and z columns.
with(diamonds, cor.test(price, x))
with(diamonds, cor.test(price, y))
with(diamonds, cor.test(price, z))

3.价格与深度

# price against depth with jittered points at 1/50 opacity.
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_jitter(alpha = 1 / 50)

# Same variables at 1/100 opacity with an x-axis break every 2 units (0-80).
ggplot(diamonds, aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(0, 80, 2))

# Correlation test between depth and price.
with(diamonds, cor.test(depth, price))

4.价格与克拉

# price against carat at 1/50 opacity; both axes run from 0 to the 90th
# percentile of their variable.
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 50) +
  xlim(0, quantile(diamonds$carat, 0.9)) +
  ylim(0, quantile(diamonds$price, 0.9))

5.价格与体积

# Volume column v = x * y * z, then price against that volume.
diamonds$v <- with(diamonds, x * y * z)

ggplot(diamonds, aes(x = v, y = price)) +
  geom_point()

Some volumes are 0.
We can find out how many diamonds have a volume of 0 by using:

# Tabulate how many diamonds have a volume of exactly 0 (TRUE) versus not
# (FALSE). The call is namespace-qualified because dplyr is attached earlier
# in this file and plyr is not attached at this point: a bare count() would
# resolve to dplyr's count(), which expects a data frame, not a logical vector.
plyr::count(diamonds$v == 0)

The count() function comes with the plyr package.

library(plyr)

The plyr package conflicts with the dplyr package (both define functions with the same names), so we must unload plyr before using dplyr again.

# Detach and unload plyr, then attach dplyr again.
detach("package:plyr", unload = TRUE)
library(dplyr)

6.子集相关性

# Correlation test between volume and price for volumes strictly between 0
# and 800.
with(
  subset(diamonds, v > 0 & v < 800),
  cor.test(v, price)
)

7.调整 - 价格与体积

# price against volume for 0 < v < 800, points at 1/100 opacity, with a
# geom_smooth() layer.
ggplot(subset(diamonds, v > 0 & v < 800), aes(x = v, y = price)) +
  geom_point(alpha = 1 / 100) +
  geom_smooth()

8.平均价格--净度
Use functions from the dplyr package to create a new data frame containing info on diamonds by clarity.

Name the data frame diamondsByClarity
The data frame should contain the following variables in this order.
(1) mean_price
(2) median_price
(3) min_price
(4) max_price
(5) n
where n is the number of diamonds in each level of clarity.
法一

library(dplyr)

# Method 1 (step by step): per-clarity mean, median, minimum and maximum
# price plus the row count, sorted by clarity.
clarity_groups <- group_by(diamonds, clarity)
diamondsByClarity <- summarise(
  clarity_groups,
  mean_price = mean(price),
  median_price = median(as.numeric(price)),
  min_price = min(price),
  max_price = max(price),
  n = n()
)
diamondsByClarity <- arrange(diamondsByClarity, clarity)

法二：

# Method 2 (piped): the same per-clarity price summary, sorted by clarity.
diamondsByClarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price = mean(price),
    median_price = median(as.numeric(price)),
    min_price = min(price),
    max_price = max(price),
    n = n()
  ) %>%
  arrange(clarity)

9.平均价格柱状图(stat = "identity")

# Mean price per clarity level.
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(
  diamonds_by_clarity,
  mean_price = mean(price)
)

# Mean price per color level.
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(
  diamonds_by_color,
  mean_price = mean(price)
)

# Bar charts of the precomputed means; stat = "identity" uses mean_price as
# the bar height. The color chart is stacked above the clarity chart.
p1 <- ggplot(diamonds_mp_by_color, aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")
p2 <- ggplot(diamonds_mp_by_clarity, aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)