R,笔记04

数据处理

> # 抽取数据去重复
> de_dup <- function()
+ {
+   i <- which(duplicated(iris))
+   x <- iris[-i, ]
+   #print(x)
+ }
> head(de_dup())
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
#或者
iris[!duplicated(iris), ]

去掉NA

> head(airquality[complete.cases(airquality), ])
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
7    23     299  8.6   65     5   7
8    19      99 13.8   59     5   8
> # 或者na.omit(airquality)

with() identical() within()函数

> # 用with()函数计算鸢尾花,花萼与花瓣的长度比
> rwith <- with(iris, Sepal.Length / Petal.Length)
> head(rwith)
[1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
> 
> # identical()的基本作用是检测两个对象是否完全相同,相同返回TRUE,否则返回FALSE
> 
> # within函数与with类似,但主要用于列运算,将运算结果放入新列
> myiris <- iris # 不破坏内建数据集
> myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
> head(myiris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
1          5.1         3.5          1.4         0.2  setosa    3.642857
2          4.9         3.0          1.4         0.2  setosa    3.500000
3          4.7         3.2          1.3         0.2  setosa    3.615385
4          4.6         3.1          1.5         0.2  setosa    3.066667
5          5.0         3.6          1.4         0.2  setosa    3.571429
6          5.4         3.9          1.7         0.4  setosa    3.176471

分割数据

> # 分割数据
> # cut()按取值范围将数据等宽切割成若干区间(各区间宽度相等,而非各区间的数据个数相等),处理后的数据是factor数据型态
> # 将state.x77对象依人口数做分割,分成5等份
> popu <- state.x77[, "Population"]
> cutpopu <- cut(popu, 5)
> head(cutpopu)
[1] (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (1.7e+04,2.12e+04] (344,4.53e+03]    
Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]

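> #分割时,用labels参数给各区间命名为"high" "2nd" "3rd" "4th" "low"
> #注意:labels按区间从小到大(人口由少到多)依次对应,所以下面的写法中"high"实际对应人口最少的区间,"low"对应人口最多的区间;
> #若要名实相符,应写成labels = c("low", "4th", "3rd", "2nd", "high")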
> cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
 [1] high high high high low  high high high 2nd  2nd  high high 3rd  2nd  high high high high high high 2nd  3rd  high high 2nd  high high high high 2nd  high low 
[33] 2nd  high 3rd  high high 3rd  high high high high 3rd  high high 2nd  high high 2nd  high
Levels: high 2nd 3rd 4th low
> 
> #要了解每一人口数分类有多少州
> x.popu <- cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
> table(x.popu)
x.popu
high  2nd  3rd  4th  low 
  34    9    5    0    2 

合并数据

准备数据库
> mystates.x77 <- as.data.frame(state.x77)
> mystates.x77$name <- rownames(state.x77) # 给新数据增加一个字段name
> head(mystates.x77)
           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
California      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
> row.names(mystates.x77) <- NULL # 删除原来行名
> head(mystates.x77)
  Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
1       3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
2        365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
3       2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
4       2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
5      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
6       2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado

> #人口大于500万的选出来(原单位是千人数),同时新对象要有2个字段name 和 population
> mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
> mypopu.states
             name Population
5      California      21198
9         Florida       8277
13       Illinois      11197
14        Indiana       5313
21  Massachusetts       5814
22       Michigan       9111
30     New Jersey       7333
32       New York      18076
33 North Carolina       5441
35           Ohio      10735
38   Pennsylvania      11860
43          Texas      12237

> #选出人均收入大于5000美元的州(Income字段是人均收入,不是月收入)。同时新对象要有2个字段name 和 Income
> myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
> myincomes.states
           name Income
2        Alaska   6315
5    California   5114
7   Connecticut   5348
13     Illinois   5107
20     Maryland   5299
28       Nevada   5149
30   New Jersey   5237
34 North Dakota   5087

merge

> # merge()交集合并。merge(x, y, all = FALSE),默认是交集合并
> # 合并上述两个数据中人口数超500万的州和人均收入超5000美元的州
> merge(mypopu.states, myincomes.states)
        name Population Income
1 California      21198   5114
2   Illinois      11197   5107
3 New Jersey       7333   5237
> 
> # 取并集
> merge(mypopu.states, myincomes.states, all = T)
             name Population Income
1          Alaska         NA   6315
2      California      21198   5114
3     Connecticut         NA   5348
4         Florida       8277     NA
5        Illinois      11197   5107
6         Indiana       5313     NA
7        Maryland         NA   5299
8   Massachusetts       5814     NA
9        Michigan       9111     NA
10         Nevada         NA   5149
11     New Jersey       7333   5237
12       New York      18076     NA
13 North Carolina       5441     NA
14   North Dakota         NA   5087
15           Ohio      10735     NA
16   Pennsylvania      11860     NA
17          Texas      12237     NA
> 
> # merge参数all.x = T, 保证第一个对象的元素在合并中都存在,第二个如没有则NA填充
> merge(mypopu.states, myincomes.states, all.x = T)
             name Population Income
1      California      21198   5114
2         Florida       8277     NA
3        Illinois      11197   5107
4         Indiana       5313     NA
5   Massachusetts       5814     NA
6        Michigan       9111     NA
7      New Jersey       7333   5237
8        New York      18076     NA
9  North Carolina       5441     NA
10           Ohio      10735     NA
11   Pennsylvania      11860     NA
12          Texas      12237     NA

match

> # match()类似于取两个对象交集,即第一对象x的某行数据若在第二个对象y中找到符合条件的数据,则返回第二个对象中
> # 相应数据的位置,否则返回NA。所以match后会返回一个与第一个对象长度相同的向量。
> 
> # 找出符合人口数多于500万,同时人均收入超5000美元的行数据,在对象myincomes.states中的位置,返回的向量数值即是要的结果。
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index
 [1]  2 NA  4 NA NA NA  7 NA NA NA NA NA
> 
> # 提取出myincomes.states中人口数多于500万,同时人均收入超5000美元的州的数据。
> myincomes.states[na.omit(my.index), ]
         name Income
5  California   5114
13   Illinois   5107
30 New Jersey   5237
> 
> # %in%将返回与第一个对象长度相同的逻辑向量,在向量中为TRUE的元素是我们要的数据
> my.index2 <- mypopu.states$name %in% myincomes.states$name
> my.index2
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> 
> #抽出mypopu.states中人口多于500万,同时人均收入超过5000美元的州数据
> mypopu.states[my.index2, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333
> 
> # 换种做法
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index3 <- !is.na(my.index) #my.index中不是NA的赋值给my.index3
> my.index3
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> mypopu.states[my.index3, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333

排序

> # 排序sort/order
> # 数据框的排序,对state.info数据框依据Income字段执行升序排列。
> mystate.info <- data.frame(Region = state.region, state.x77)
> mystate.info
                      Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama                South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska                  West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona                 West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas               South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California              West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado                West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Connecticut        Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Delaware               South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida                South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Georgia                South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Hawaii                  West        868   4963        1.9    73.60    6.2    61.9     0   6425
Idaho                   West        813   4119        0.6    71.87    5.3    59.5   126  82677
Illinois       North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Indiana        North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa           North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Kansas         North Central       2280   4669        0.6    72.58    4.5    59.9   114  81787
Kentucky               South       3387   3712        1.6    70.10   10.6    38.5    95  39650
Louisiana              South       3806   3545        2.8    68.76   13.2    42.2    12  44930
Maine              Northeast       1058   3694        0.7    70.39    2.7    54.7   161  30920
Maryland               South       4122   5299        0.9    70.22    8.5    52.3   101   9891
Massachusetts      Northeast       5814   4755        1.1    71.83    3.3    58.5   103   7826
Michigan       North Central       9111   4751        0.9    70.63   11.1    52.8   125  56817
Minnesota      North Central       3921   4675        0.6    72.96    2.3    57.6   160  79289
Mississippi            South       2341   3098        2.4    68.09   12.5    41.0    50  47296
Missouri       North Central       4767   4254        0.8    70.69    9.3    48.8   108  68995
Montana                 West        746   4347        0.6    70.56    5.0    59.2   155 145587
Nebraska       North Central       1544   4508        0.6    72.60    2.9    59.3   139  76483
Nevada                  West        590   5149        0.5    69.03   11.5    65.2   188 109889
New Hampshire      Northeast        812   4281        0.7    71.23    3.3    57.6   174   9027
New Jersey         Northeast       7333   5237        1.1    70.93    5.2    52.5   115   7521
New Mexico              West       1144   3601        2.2    70.32    9.7    55.2   120 121412
New York           Northeast      18076   4903        1.4    70.55   10.9    52.7    82  47831
North Carolina         South       5441   3875        1.8    69.21   11.1    38.5    80  48798
North Dakota   North Central        637   5087        0.8    72.78    1.4    50.3   186  69273
Ohio           North Central      10735   4561        0.8    70.82    7.4    53.2   124  40975
Oklahoma               South       2715   3983        1.1    71.42    6.4    51.6    82  68782
Oregon                  West       2284   4660        0.6    72.13    4.2    60.0    44  96184
Pennsylvania       Northeast      11860   4449        1.0    70.43    6.1    50.2   126  44966
Rhode Island       Northeast        931   4558        1.3    71.90    2.4    46.4   127   1049
South Carolina         South       2816   3635        2.3    67.96   11.6    37.8    65  30225
South Dakota   North Central        681   4167        0.5    72.08    1.7    53.3   172  75955
Tennessee              South       4173   3821        1.7    70.11   11.0    41.8    70  41328
Texas                  South      12237   4188        2.2    70.90   12.2    47.4    35 262134
Utah                    West       1203   4022        0.6    72.90    4.5    67.3   137  82096
Vermont            Northeast        472   3907        0.6    71.64    5.5    57.1   168   9267
Virginia               South       4981   4701        1.4    70.08    9.5    47.8    85  39780
Washington              West       3559   4864        0.6    71.72    4.3    63.5    32  66570
West Virginia          South       1799   3617        1.4    69.48    6.7    41.6   100  24070
Wisconsin      North Central       4589   4468        0.7    72.48    3.0    54.5   149  54464
Wyoming                 West        376   4566        0.6    70.29    6.9    62.9   173  97203
> head(mystate.info)
           Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama     South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska       West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona      West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas    South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California   West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado     West       2541   4884        0.7    72.06    6.8    63.9   166 103766
> state.info <- mystate.info[1:15, ]
> inc.order <- order(state.info$Income) # 默认升序
> state.info[inc.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> 
> # 排序时增加次要键值,格式:order(主要键值, 次要键值, ……)
> # 以state.info 数据框为例,将Region作为主要键值,Income作为次要键值,升序排。
> inc.order2 <- order(state.info$Region, state.info$Income)
> state.info[inc.order2, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> # 在排序结果中south在northeast和north central之间,错了吗?这是由于state.region是一个因子,class()可知。
> # 对因子而言order的排序,相当于是执行levels排序,所以应该小心。
> 
> # 混合排序。部分字段升序排,部分字段降序排,用xtfrm(),可将原向量转为数值向量,当想要以相反方式(降序)排序时,在xtfrm()前加上负号"-"即可
> 
> #以state.info为例,将Region作为主要键值升序排,Income作次要键值降序排。
> mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
> state.info[mix.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677

公式符号等

> # 公式符号,指的是统计学符号,基本的如下
> # y ~ a y是a的函数
> # y ~ a + b y是a和b的函数
> # y ~ a - b y是a的函数但排除b
> 
> # 认识长格式数据(Long Format)与宽格式数据(Wide Format)
> # reshape2扩展包的melt()函数/dcast()函数

©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 200,302评论 5 470
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 84,232评论 2 377
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 147,337评论 0 332
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 53,977评论 1 272
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 62,920评论 5 360
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 48,194评论 1 277
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 37,638评论 3 390
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 36,319评论 0 254
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 40,455评论 1 294
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 35,379评论 2 317
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 37,426评论 1 329
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 33,106评论 3 315
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 38,696评论 3 303
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 29,786评论 0 19
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 30,996评论 1 255
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 42,467评论 2 346
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 42,043评论 2 341

推荐阅读更多精彩内容