对于非结构化和半结构化的数据,正则表达式可以用非常简练的语言来描述字符串中的表达模式。第一次见到正则表达式,你可能会以为这是猫咪在键盘上踩出来的,但是随着逐渐加深对他的理解后,你就会体会其中的深刻含义了。
str_length(c("a", "R for data science", NA))
[1] 1 18 NA
字符串组合
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"
str_c("x", "y", sep = ", ")
#> [1] "x, y"
x <- c("abc", NA)
str_c("|-", x, "-|")
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
#> [1] "|-abc-|" "|-NA-|"
str_c("prefix-", c("a", "b", "c"), "-suffix")
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
#> [1] "Good morning Hadley."
str_c(c("x", "y", "z"), collapse = ", ")
#> [1] "x, y, z"
提取子字符串
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)
#> [1] "ple" "ana" "ear"
str_sub("a", 1, 5)
#> [1] "a"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
#> [1] "apple" "banana" "pear"
区域设置
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
#> [1] "İ" "I"
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
#> [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw") # Hawaiian
#> [1] "apple" "eggplant" "banana"
正则表达式模式匹配
x <- c("apple", "banana", "pear")
str_view(x, "an")
banana
str_view(x, ".a.")
banana
pear
锚定
^ to match the start of the string.
$ to match the end of the string.
x <- c("apple", "banana", "pear")
str_view(x, "^a")
apple
str_view(x, "a$")
banana
字符串类和字符选项
\d: matches any digit.
\s: matches any whitespace (e.g. space, tab, newline).
[abc]: matches a, b, or c.
# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
a.c
str_view(c("grey", "gray"), "gr(e|a)y")
grey
gray
重复
?: 0 or 1
+: 1 or more
*: 0 or more
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
1888 is the longest year in Roman numerals: MDCC
CLXXXVIII
str_view(x, "CC+")
1888 is the longest year in Roman numerals: MDCCC
LXXXVIII
str_view(x, 'C[LX]+')
1888 is the longest year in Roman numerals: MDCCCLXXX
VIII
{n}: exactly n
{n,}: n or more
{,m}: at most m
{n,m}: between n and m
str_view(x, "C{2}")
分组与回溯引用
str_view(fruit, "(..)\\1", match = TRUE)
banana
coco
nut
cucu
mber
juju
be
papa
ya
salal
berry
匹配检测
x <- c("apple", "banana", "pear")
str_detect(x, "e")
#> [1] TRUE FALSE TRUE
# How many common words start with t?
sum(str_detect(words, "^t"))
#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
#> [1] 0.277
# Find all words containing at least one vowel, and negate
no_vowels_1 <- !str_detect(words, "[aeiou]")
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
#> [1] TRUE
words[str_detect(words, "x$")]
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
#> [1] "box" "sex" "six" "tax"
df <- tibble(
word = words,
i = seq_along(word)
)
df %>%
filter(str_detect(word, "x$"))
#> # A tibble: 4 x 2
#> word i
#> <chr> <int>
#> 1 box 108
#> 2 sex 747
#> 3 six 772
#> 4 tax 841
x <- c("apple", "banana", "pear")
str_count(x, "a")
#> [1] 1 3 1
# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
#> [1] 1.99
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 x 4
#> word i vowels consonants
#> <chr> <int> <int> <int>
#> 1 a 1 1 0
#> 2 able 2 2 2
#> 3 about 3 3 2
#> 4 absolute 4 4 4
#> 5 accept 5 2 4
#> 6 account 6 3 4
#> # … with 974 more rows
str_count("abababa", "aba")
#> [1] 2
str_view_all("abababa", "aba")
aba b aba
提取匹配内容
length(sentences)
#> [1] 720
head(sentences)
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
#> [1] "red|orange|yellow|green|blue|purple"
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)
#> [1] "blue" "blue" "red" "red" "red" "blue"
more <- sentences[str_count(sentences, colour_match) > 1]
str_view_all(more, colour_match)
It is hard to erase blue
or red
ink.
Thegreen
light in the brown box flickered.
The sky in the west is tinged with orange
red
.
str_extract(more, colour_match)
#> [1] "blue" "green" "orange"
分组匹配
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
#> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
#> [6] "the sun" "the huge" "the ball" "the woman" "a helps"
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
#> # A tibble: 720 x 3
#> sentence article noun
#> <chr> <chr> <chr>
#> 1 The birch canoe slid on the smooth planks. the smooth
#> 2 Glue the sheet to the dark blue background. the sheet
#> 3 It's easy to tell the depth of a well. the depth
#> 4 These days a chicken leg is a rare dish. a chicken
#> 5 Rice is often served in round bowls. <NA> <NA>
#> 6 The juice of lemons makes fine punch. <NA> <NA>
#> # … with 714 more rows
替换匹配内容
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
#> [1] "-pple" "p-ar" "b-nana"
str_replace_all(x, "[aeiou]", "-")
#> [1] "-ppl-" "p--r" "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
#> [1] "one house" "two cars" "three people"
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
#> [1] "The canoe birch slid on the smooth planks."
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."
#> [4] "These a days chicken leg is a rare dish."
#> [5] "Rice often is served in round bowls."
拆分
sentences %>%
head(5) %>%
str_split(" ")
#> [[1]]
#> [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
#> [8] "planks."
#>
#> [[2]]
#> [1] "Glue" "the" "sheet" "to" "the"
#> [6] "dark" "blue" "background."
#>
#> [[3]]
#> [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
#>
#> [[4]]
#> [1] "These" "days" "a" "chicken" "leg" "is" "a"
#> [8] "rare" "dish."
#>
#> [[5]]
#> [1] "Rice" "is" "often" "served" "in" "round" "bowls."
"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]
#> [1] "a" "b" "c" "d"
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#> [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
#> [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
#> [3,] "It's" "easy" "to" "tell" "the" "depth" "of"
#> [4,] "These" "days" "a" "chicken" "leg" "is" "a"
#> [5,] "Rice" "is" "often" "served" "in" "round" "bowls."
#> [,8] [,9]
#> [1,] "planks." ""
#> [2,] "background." ""
#> [3,] "a" "well."
#> [4,] "rare" "dish."
#> [5,] "" ""