# Check the working directory and its contents before loading the data.
getwd()
list.files()

# Load the tab-separated pseudo-Facebook data set. The second call re-reads
# the same file with read.delim(), whose default separator is already a tab.
pf <- read.csv("pseudo_facebook.tsv", sep = "\t")
pf <- read.delim("pseudo_facebook.tsv")
Scatterplots
library(ggplot2)

# The same age vs. friend_count scatter plot, written three ways:
# qplot() with named arguments, qplot() with positional arguments, and the
# full ggplot() + geom_point() form.
qplot(x = age, y = friend_count, data = pf)
qplot(age, friend_count, data = pf)
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()
ggplot Syntax
# Scatter plot with the x axis limited to ages 13 through 90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
Overplotting
对于散点图出现重叠的情况,我们可以用alpha参数来解决
jitter(抖动)向图表添加噪音
# Jittered, mostly transparent points (alpha = 1/20) to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  xlim(13, 90)
Coord_trans()
# Transparent points with a square-root transformed y axis.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")

# Same plot with jitter. height = 0 switches off vertical jitter, so the
# y values are not moved before the square-root transform is applied.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
如果要在y='sqrt'的基础上添加噪音,那么要使用新的参数position=position_jitter(h=0),因为一些人的好友数为0,添加噪音后会出现负数,那么取sqrt就会出现虚数
Alpha and Jitter
Notes:
# Inspect the available columns and the distribution of friendships_initiated.
names(pf)
summary(pf$friendships_initiated)

# friendships_initiated vs. age: horizontally jittered, transparent points
# on a square-root transformed y axis.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 1 / 10,
    position = position_jitter(height = 0)
  ) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
Conditional Means
library(dplyr)

# Per-age summary of friend_count (mean, median, number of users), built
# step by step without the pipe operator.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  # friend_count is converted to numeric before taking the median.
  friend_count_median = median(as.numeric(friend_count)),
  n = n()
)
pf.fc_by_age <- arrange(pf.fc_by_age, age)

head(pf.fc_by_age)
使用%>%的版本
# Same per-age summary as a single %>% pipeline: group, summarise, sort.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  arrange(age)

head(pf.fc_by_age)
Create plots using pf.fc_by_age
# Mean friend count per age, first as points and then as a line.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_point()

ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()
将年龄和平均好友数的图添加到原来的年龄-好友数的散点图中
原图:
# Baseline plot: orange, transparent, horizontally jittered points on a
# square-root transformed y axis, ages 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt")
添加后的图:
# Baseline scatter plot with the per-age mean friend count overlaid as a line.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  # `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). Current ggplot2
  # ignores `fun.y` when it is passed through a geom, so the summary function
  # would not be applied.
  geom_line(stat = "summary", fun = mean)
继续在上图中添加数据的分位数(10%,90%,中位数)
# Baseline scatter plot with per-age summary lines: mean (default color),
# 10% and 90% quantiles (dashed blue) and median (solid blue).
# `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). Current ggplot2
# ignores `fun.y` when it is passed through a geom, so the quantile lines
# would not actually show quantiles.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  coord_trans(y = "sqrt") +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  )
对数据进行限制,放大数据
# Zoom in on ages 13-70 and friend counts 0-1000 with coord_cartesian().
# The original also added coord_trans(y = "sqrt"), but a plot has a single
# coordinate system and the later coord_cartesian() replaced it. The dead
# call is dropped here; the rendered plot is unchanged.
# `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). Current ggplot2
# ignores `fun.y` when it is passed through a geom.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  geom_line(stat = "summary", fun = mean) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    color = "blue", linetype = 2
  ) +
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    color = "blue"
  ) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000))
Correlation
# Correlation test between age and friend count, written three ways:
# default method, explicit Pearson, and via with() to avoid repeating `pf$`.
cor.test(pf$age, pf$friend_count)
cor.test(pf$age, pf$friend_count, method = "pearson")
with(pf, cor.test(age, friend_count, method = "pearson"))
Correlation on Subsets
# Same correlation test restricted to users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count)
)
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "pearson")
)
Correlation Methods
单调关系的度量
# Spearman rank correlation for users aged 70 or younger.
with(
  subset(pf, age <= 70),
  cor.test(age, friend_count, method = "spearman")
)
"likes_received"和"www_likes_received"之间的散点图
# Distributions of both like counts, then a raw scatter plot of one
# against the other.
summary(pf$likes_received)
summary(pf$www_likes_received)

ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()
Strong Correlations
# Limit both axes to the range from 0 to the 95th percentile of each
# variable and add a red linear-model fit line.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = "lm", color = "red")
geom_smooth(method='lm')中的lm代表linear model
Correlation between "likes_received" and "www_likes_received"
# Pearson correlation test between total likes and likes received via www.
cor.test(
  pf$likes_received,
  pf$www_likes_received,
  method = "pearson"
)
得到如此高的相关系数,是因为一个变量是另一个变量的超集
Moira on Correlation
通常会对变量做回归,而回归的假设是这些变量相互独立,所以要先研究变量之间的相关性,来决定哪些变量不要一同放入回归模型中
More Caution with Correlation
# Attach alr3, load its Mitchell data set and open it in the data viewer.
library(alr3)
data(Mitchell)
View(Mitchell)
plot: Temp vs Month
# Temp vs. Month scatter plot, with ggplot() and then with qplot().
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

qplot(data = Mitchell, Month, Temp)
correlation coefficient: Temp vs Month
# Correlation test between Month and Temp, in direct and with() form.
cor.test(Mitchell$Month, Mitchell$Temp)
with(Mitchell, cor.test(Month, Temp, method = "pearson"))
Making Sense of Data
按月将月份和温度的散点图的X轴分隔开
# Check the range of Month, then redraw the scatter plot with an x-axis
# break every 12 months (0, 12, ..., up to 203).
summary(Mitchell$Month)

ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
拉伸该图片,使其x轴长于y轴,这时候我们发现:
图像出现了类似正弦/余弦一样的循环模式
数据的性质可以决定图像的模式
Understanding Noise: Age to Age Months
Notes:
如果选择更加精细的bins,那么图像会出现更多的噪音
# Age at monthly resolution: the integer age plus (12 - dob_month) / 12 of
# a year.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)
Age with Months Means
# Friend-count summary (mean, median, number of users) for every distinct
# age_with_months value, sorted by that value.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(as.numeric(friend_count)),
    n = n()
  ) %>%
  ungroup() %>%
  arrange(age_with_months)

head(pf.fc_by_age_months)
不用%>%的另一种方法
# Same summary without the pipe operator. The output columns use the
# shorter names fc_mean and fc_median.
age_month_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months1 <- summarise(
  age_month_groups,
  fc_mean = mean(friend_count),
  fc_median = median(as.numeric(friend_count)),
  n = n()
)
pf.fc_by_age_months1 <- arrange(pf.fc_by_age_months1, age_with_months)

head(pf.fc_by_age_months1)
Noise in Conditional Means
# Mean friend count by age in months, for ages below 71.
ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()
Smoothing Conditional Means
年龄-好友数图:
# p1: mean friend count by age in whole years, ages below 71.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line()
月龄-好友数图
# p2: mean friend count by age in months, ages below 71.
p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line()

# Stack the yearly and monthly plots in a single column.
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)
将年龄在5的倍数内的用户混在一起计算平均好友数,并与之前的两个图放置在一起
# p3: mean friend count with age rounded to the nearest multiple of 5,
# ages below 71.
# `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). Current ggplot2
# ignores `fun.y` when it is passed through a geom.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)

# Stack all three resolutions in a single column.
grid.arrange(p1, p2, p3, ncol = 1)
用平滑函数进行拟合
# Rebuild p1 and p2 with a geom_smooth() layer (default settings) on top of
# the line, keep p3 as the 5-year binned mean, and stack all three plots.
p1 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

p2 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# `fun` replaces the deprecated `fun.y` (ggplot2 >= 3.3.0). Current ggplot2
# ignores `fun.y` when it is passed through a geom.
p3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)

grid.arrange(p1, p2, p3, ncol = 1)
习题集
1.价格与x
# Load the diamonds data set, open it in the viewer, and plot price
# against x with transparent points.
data(diamonds)
View(diamonds)

ggplot(data = diamonds, mapping = aes(x = x, y = price)) +
  geom_point(alpha = 1 / 10)
2.相关系数
# Correlation tests between price and each of the columns x, y and z.
cor.test(diamonds$price, diamonds$x)
cor.test(diamonds$price, diamonds$y)
cor.test(diamonds$price, diamonds$z)
3.价格与深度
# Price vs. depth with jittered, transparent points.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_jitter(alpha = 1 / 50)

# Same relationship with more transparent points and an x-axis break
# every 2 units.
ggplot(data = diamonds, mapping = aes(x = depth, y = price)) +
  geom_point(alpha = 1 / 100) +
  scale_x_continuous(breaks = seq(0, 80, 2))

# Correlation test between depth and price.
cor.test(diamonds$depth, diamonds$price)
4.价格与克拉
# Price vs. carat, with both axes limited to the range from 0 to the
# 90th percentile of each variable.
ggplot(data = diamonds, mapping = aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 50) +
  xlim(0, quantile(diamonds$carat, 0.9)) +
  ylim(0, quantile(diamonds$price, 0.9))
5.价格与体积
# Volume column: the product of the x, y and z columns.
diamonds$v <- with(diamonds, x * y * z)

# Price vs. volume scatter plot.
ggplot(data = diamonds, mapping = aes(x = v, y = price)) +
  geom_point()
Some volumes are 0
we can find out how many diamonds have 0 volume by using
# Tabulate how many diamonds have a volume of exactly 0 (TRUE) versus not.
# The call is namespace-qualified because plyr is not attached yet at this
# point, while dplyr is. An unqualified count() would resolve to
# dplyr::count(), which does not accept a bare logical vector.
plyr::count(diamonds$v == 0)
The count() function comes with the plyr package.
library(plyr)
The plyr package conflicts with the dplyr package, so we must unload plyr before using dplyr again.
# plyr and dplyr export clashing function names, so detach and unload plyr
# before attaching dplyr again.
detach("package:plyr", unload = TRUE)
library(dplyr)
6.子集相关性
# Correlation test between volume and price, keeping only diamonds whose
# volume is above 0 and below 800.
with(
  subset(diamonds, v > 0 & v < 800),
  cor.test(v, price)
)
7.调整 - 价格与体积
# Price vs. volume for the same subset (0 < v < 800), with transparent
# points and a geom_smooth() layer at its default settings.
ggplot(
  data = subset(diamonds, v > 0 & v < 800),
  mapping = aes(x = v, y = price)
) +
  geom_point(alpha = 1 / 100) +
  geom_smooth()
8.平均价格--净度
Use functions from the dplyr package to create a new data frame containing info on diamonds by clarity.
Name the data frame diamondsByClarity
The data frame should contain the following variables in this order.
(1) mean_price
(2) median_price
(3) min_price
(4) max_price
(5) n
where n is the number of diamonds in each level of clarity.
法一
library(dplyr)

# Price summary per clarity level (mean, median, min, max, number of
# diamonds), built step by step without the pipe operator.
clarity_groups <- group_by(diamonds, clarity)
diamondsByClarity <- summarise(
  clarity_groups,
  mean_price = mean(price),
  median_price = median(as.numeric(price)),
  min_price = min(price),
  max_price = max(price),
  n = n()
)
diamondsByClarity <- arrange(diamondsByClarity, clarity)
法二:
# Same per-clarity price summary as a single %>% pipeline.
diamondsByClarity <- diamonds %>%
  group_by(clarity) %>%
  summarise(
    mean_price = mean(price),
    median_price = median(as.numeric(price)),
    min_price = min(price),
    max_price = max(price),
    n = n()
  ) %>%
  arrange(clarity)
9.平均价格柱状图(stat = "identity")
# Mean price per clarity level and per color.
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(
  diamonds_by_clarity,
  mean_price = mean(price)
)

diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(
  diamonds_by_color,
  mean_price = mean(price)
)

# Bar charts of the precomputed means. stat = "identity" makes the bar
# height the mean_price value itself rather than a row count.
p1 <- ggplot(data = diamonds_mp_by_color, mapping = aes(x = color, y = mean_price)) +
  geom_bar(stat = "identity")
p2 <- ggplot(data = diamonds_mp_by_clarity, mapping = aes(x = clarity, y = mean_price)) +
  geom_bar(stat = "identity")

# Stack the color and clarity charts in a single column.
library(gridExtra)
grid.arrange(p1, p2, ncol = 1)