- 导入数据
大数据集的导入以及处理一些特殊的文件类型。
# Switch to the directory used for these notes (machine-specific path).
setwd("D:\\Rdata\\Books学R语言\\R语言数据分析美盖尔盖伊")

# Write the hflights data set to a CSV file without a row-name column, so
# there is a flat file to import, then show the structure of the data.
library(hflights)
write.csv(x = hflights, file = "hflights.csv", row.names = FALSE)
str(object = hflights)
导入
使用system.time测试导入文件的时间
# Baseline: time a plain read.csv() call with every other argument defaulted.
system.time(read.csv(file = "hflights.csv"))

# Collect the class of every hflights column and hand it to read.csv() as
# colClasses for the second timed read.
colClasses <- sapply(X = hflights, FUN = class)
system.time(read.csv(file = "hflights.csv", colClasses = colClasses))
对同一任务重复 n 次测试,然后再对仿真结果进行汇总
得到关于数据的多种观测结果,分析确定结果中统计的显著差异
# microbenchmark runs each expression 100 times by default; `times` below
# overrides that.
library(microbenchmark)

# f: plain import. g: import with column classes, row count and comment
# character given explicitly.
f <- function() {
  read.csv("hflights.csv")
}
g <- function() {
  read.csv(
    "hflights.csv",
    colClasses = colClasses,
    nrows = 227496,
    comment.char = ""
  )
}

# Run each reader 10 times and show the timing summary.
res <- microbenchmark(f(), g(), times = 10)
res

# Print the same summary again, passing digits = 20 to print().
print(res, digits = 20)
结果分析
# Box plot of the benchmark timings. The title is a plotmath expression
# (expression() + italic()) so that 'read.table' is rendered in italics.
boxplot(
  res,
  xlab = "",
  main = expression(paste("Benchmarking ", italic("read.table")))
)
规模大于物理内存的数据集
# Time three alternative CSV readers on the same file.

# sqldf: read.csv.sql()
library(sqldf)
system.time(read.csv.sql(file = "hflights.csv"))

# ff: read.csv.ffdf()
library(ff)
system.time(read.csv.ffdf(file = "hflights.csv"))

# bigmemory: read.big.matrix(), telling it the file has a header row
library(bigmemory)
system.time(read.big.matrix(filename = "hflights.csv", header = TRUE))
## ... or use data.table
文本文件解析器的基准测试
# Time data.table's fread() and keep the imported table in `dt`.
library(data.table)
system.time(dt <- fread(input = "hflights.csv"))

# Convert the imported object to a plain data.frame.
df <- as.data.frame(x = dt)

# Check whether the object returned by fread() itself counts as a data.frame.
inherits(dt, "data.frame")
导入文本文件的子集
数据导入时进行筛选
# Filter while importing: only rows whose Dest field equals the literal
# "BNA" (double quotes included) are loaded into `df`.
df <- read.csv.sql(
  file = "hflights.csv",
  sql = "select * from file where Dest = '\"BNA\"'"
)
注意:sql 参数默认为 "select * from file";sqldf 不会自动去掉 CSV 字段两侧的双引号,因此筛选条件中的取值要连同双引号一起写成 '"BNA"'。
在导入到R会话前筛选平面文件
从数据库中导入数据
- 数据筛选
subset
which
[
[[
sqldf
# Select the mtcars rows with am == 1 and vs == 1, first with an SQL
# statement run through sqldf ...
library(sqldf)
sqldf(x = "select * from mtcars where am = 1 and vs = 1")

# ... then with base R's subset().
subset(x = mtcars, subset = am == 1 & vs == 1)
sqldf的row.names参数默认为FALSE
# Compare the sqldf result (asked to keep row names) with the subset() result.
identical(
  x = sqldf(
    "select * from mtcars where am = 1 and vs = 1",
    row.names = TRUE
  ),
  y = subset(mtcars, am == 1 & vs == 1)
)
筛选指定列
# Row filter plus column filter: keep only the columns from hp through wt.
subset(
  x = mtcars,
  subset = am == 1 & vs == 1,
  select = hp:wt
)
较大数据集//筛选
dplyr
# Time three ways of pulling the rows whose destination is BNA.
system.time(
  sqldf("select * from hflights where Dest == 'BNA'", row.names = TRUE)
)
system.time(subset(x = hflights, subset = Dest == "BNA"))
library(dplyr)
system.time(filter(hflights, Dest == "BNA"))

# Same filter, then keep only the DepTime:ArrTime column range and show the
# structure of the result.
hflights %>%
  filter(Dest == "BNA") %>%
  select(DepTime:ArrTime) %>%
  str()
行的名称在dplyr结果中不保留
# Copy the row names into a regular column so they remain available after
# the dplyr verbs below.
mtcars[["rownames"]] <- row.names(mtcars)
head(mtcars)

# Cars with more than 300 hp, showing the name column and hp.
mtcars %>%
  filter(hp > 300) %>%
  select(c(rownames, hp))
data.table方法
library(data.table)

# Build a data.table from hflights.
hflights_dt <- data.table(hflights)

## Row filtering: the condition goes in the first position inside [ ].
system.time(hflights_dt[Dest == "BNA"])
head(x = hflights_dt)

# Column filtering: list the wanted columns in the second position, using
# the .() shorthand ...
head(x = hflights_dt[Dest == "BNA", .(Year, Dest)])

## ... or the equivalent list() spelling.
head(x = hflights_dt[Dest == "BNA", list(Year, Dest)])

# data.frame-style selection by a character vector of names built with c();
# this form is combined with with = FALSE.
head(x = hflights_dt[Dest == "BNA", c("Year", "ArrTime"), with = FALSE])
聚集
aggregate函数
# Mean of Diverted for each DayOfWeek value, via aggregate() on vectors.
aggregate(
  x = hflights$Diverted,
  by = list(hflights$DayOfWeek),
  FUN = mean
)
使用with函数
# Same aggregation, with with() supplying the columns so the hflights$
# prefix is not repeated.
with(
  hflights,
  aggregate(x = Diverted, by = list(DayOfWeek), FUN = mean)
)
公式化标记
# Formula notation: Diverted summarised by DayOfWeek, looked up in hflights.
aggregate(
  Diverted ~ DayOfWeek,
  data = hflights,
  FUN = mean
)
使用基础的R命令实现快速聚集
# apply family: tapply() applies mean to Diverted within each DayOfWeek value.
tapply(
  X = hflights$Diverted,
  INDEX = hflights$DayOfWeek,
  FUN = mean
)
plyr包的ddply函数
library(plyr)

# Split hflights by DayOfWeek and apply an anonymous function that returns
# the mean of Diverted for each piece.
ddply(
  .data = hflights,
  .variables = .(DayOfWeek),
  .fun = function(day_rows) mean(day_rows$Diverted)
)
plyr包的.函数为用户提供了一种方便的引用变量(名称)的方法。
显式指定相应的列名
# Use the summarise helper in place of the anonymous function above; the
# result column is named Diverted.
ddply(
  hflights,
  .(DayOfWeek),
  summarise,
  Diverted = mean(Diverted)
)
dplyr
# Group hflights by DayOfWeek and look at the attributes attached to the
# grouped object.
hflights_DayofWeek <- group_by(hflights, DayOfWeek)
hflights_DayofWeek %>%
  attributes() %>%
  str()

# summarise: mean of Diverted per group. The function is namespace-qualified
# so the dplyr version is used.
dplyr::summarise(hflights_DayofWeek, mean(Diverted))
使用data.table实现聚集
# Mean of Diverted per DayOfWeek; the j expression is evaluated within each
# `by` group.
hflights_dt[, mean(Diverted), by = DayOfWeek]

# Same aggregation with the result column given the name `mean`.
hflights_dt[, .(mean = mean(Diverted)), by = DayOfWeek]
测试
汇总函数
统计子分组样例数
# Count the rows in each DayOfWeek group, several ways.

# plyr: summarise with length(), or apply nrow() to every piece.
ddply(hflights, .(DayOfWeek), summarise, n = length(Diverted))
ddply(hflights, .(DayOfWeek), nrow)

# base R frequency table
table(hflights$DayOfWeek)

# plyr's count(), which takes the variable name as a string. It is
# namespace-qualified because dplyr exports a count() as well, and which one
# a bare count() resolves to depends on the order the packages were attached.
plyr::count(hflights, "DayOfWeek")

# dplyr: n() gives the size of the current group.
dplyr::summarise(hflights_DayofWeek, n())

# Group sizes of the grouped object. group_size() is the exported accessor;
# reading a 'group_sizes' attribute directly depends on dplyr internals and
# yields NULL when the grouped object does not carry that attribute.
dplyr::group_size(hflights_DayofWeek)

# data.table: .N is the number of rows in each `by` group.
hflights_dt[, .N, by = .(DayOfWeek)]
- 数据重构
# 数据重构
library(data.table)
library(dplyr)
# 1. Transposing. Build a 3-row matrix from 1:9 (the outer parentheses
# print it), then transpose it.
(m <- matrix(data = 1:9, nrow = 3))
t(x = m)

# t() also accepts a data.frame: first six rows of iris, then their transpose.
head(x = iris)
t(x = head(x = iris))
# 2. Selecting columns by matching on their names
library(dplyr)
library(hflights)

# Columns whose name ends with "delay".
hflights %>%
  select(ends_with("delay")) %>%
  str()

# ignore.case controls case sensitivity; FALSE here, so only an upper-case
# "T" counts as a match.
hflights %>%
  select(contains("T", ignore.case = FALSE)) %>%
  str()

# Regular expressions go through matches(): names made of exactly 5 or 6
# alphabetic characters ([...]{m,n} is a bounded repeat).
hflights %>%
  select(matches("^[[:alpha:]]{5,6}$")) %>%
  str()

# A leading minus sign keeps every column that does NOT match the pattern.
# First, see how the column-name lengths are distributed ...
table(nchar(names(hflights)))
names(hflights)
colnames(hflights)

# ... then drop the columns whose name is 7 or 8 alphabetic characters long.
hflights %>%
  select(-matches("^[[:alpha:]]{7,8}$")) %>%
  names()
# 3. Reordering data: sort hflights by ActualElapsedTime.
str(arrange(hflights, ActualElapsedTime))

# The same written with the pipe operator.
hflights %>%
  arrange(ActualElapsedTime) %>%
  str()

# dplyr: sort, keep two columns, drop the rows whose destination is AUS, and
# show the structure of the first rows. The code is written in upper case
# ("AUS") to match the data.table versions below; the string comparison is
# case-sensitive, so "Aus" would not exclude any "AUS" row.
hflights %>%
  arrange(ActualElapsedTime) %>%
  select(ActualElapsedTime, Dest) %>%
  subset(Dest != "AUS") %>%
  head() %>%
  str()

# data.table: build the table keyed on ActualElapsedTime, then filter rows
# and select the two columns in a single [ ] call.
str(head(
  data.table(hflights, key = "ActualElapsedTime")[
    Dest != "AUS",
    c("ActualElapsedTime", "Dest"),
    with = FALSE
  ]
))

# Note where na.omit() is applied: to the whole keyed table, before the row
# filter and the column selection.
str(head(
  na.omit(data.table(hflights, key = "ActualElapsedTime"))[
    Dest != "AUS",
    .(ActualElapsedTime, Dest)
  ]
))
# Speed comparison: add a kilometre column with `$<-` and then with
# data.table's `:=`.
system.time(hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137)
system.time(hflights_dt[, DistanceKMs := Distance / 0.62137])

# dplyr and data.table
# Memory usage analysis: memory location (pointer value) of the object.
library(pryr)
hflights_dt <- data.table(hflights)
address(hflights_dt)

# Check whether the traditional assignment operator changes the address at
# which the object is stored.
hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
address(hflights_dt)

# Same check for data.table's `:=`, starting again from a fresh table.
hflights_dt <- data.table(hflights)
address(hflights_dt)
hflights_dt[, DistanceKMs := Distance / 0.62137]
address(hflights_dt)

# within()
system.time(within(hflights_dt, {
  DistanceKMs <- Distance / 0.62137
}))

# Create several variables in one call.
hflights_dt[, `:=`(
  DistanceKMs = Distance / 0.62137,
  DistanceFeets = Distance * 5280
)]

# One numeric 0/1 indicator column per distinct carrier code, named
# carrier_<code>.
carriers <- unique(hflights_dt[["UniqueCarrier"]])
carriers
hflights_dt[, paste0("carrier_", carriers) :=
  lapply(carriers, function(code) as.numeric(UniqueCarrier == code))]
str(hflights_dt[, grep("^carrier", names(hflights_dt)), with = FALSE])

# Regular expression: iris columns whose name starts with "P".
select(iris, grep("^P", colnames(iris)))

# Creating a new variable with dplyr.
hflights <- mutate(hflights, DistanceKMs = Distance / 0.62137)
# Merging data sets
# dplyr: the join functions
# data.table: the mult argument of the [ operator
# Lookup table from day number to day name (outer parentheses print it).
# NOTE(review): this assumes code 1 in hflights$DayOfWeek means Sunday;
# confirm against the data set's documentation.
(wdays <- data.frame(
  DayOfWeek = 1:7,
  DayOfWeekString = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
))

# Time the merge on the data.frame (join columns left to merge()'s default)
# and on the data.table (joined explicitly by DayOfWeek).
system.time(merge(x = hflights, y = wdays))
system.time(merge(x = hflights_dt, y = wdays, by = "DayOfWeek"))

# Objects with the same structure: rbind / cbind
# Sparse matrices: rBind / cBind
# do.call: run rbind or cbind over all elements of a list
# rbindlist: combine data.table objects
# 4. Reshaping data
# Wide to long: melt
library(reshape2)
hflights_melted <- melt(
  hflights,
  id.vars = 0,
  measure.vars = c("ActualElapsedTime", "AirTime")
)
str(hflights_melted)

# (the long format is what ggplot2 needs for plotting)
# Long to wide: cast. Melt again, this time keeping Month as the id column.
hflights_melted <- melt(
  hflights,
  id.vars = "Month",
  measure.vars = c("ActualElapsedTime", "AirTime")
)
head(hflights_melted)

# One row per Month with the mean of each measured variable, NAs removed
# (outer parentheses print the result).
(df <- dcast(
  hflights_melted,
  Month ~ variable,
  fun.aggregate = mean,
  na.rm = TRUE
))

# Melt the monthly means back to long form and draw one line per variable.
library(ggplot2)
ggplot(data = melt(df, id.vars = "Month")) +
  geom_line(mapping = aes(x = Month, y = value, color = variable)) +
  scale_x_continuous(breaks = 1:12) +
  theme_bw() +
  theme(legend.position = "top")

# tidyr: gather and spread
library(tidyr)
str(gather(
  hflights[, c("Month", "ActualElapsedTime", "AirTime")],
  key = variable,
  value = value,
  -Month
))
- 建模