R:在 tibble 列中使用 strsplit
R: using strsplit in a tibble column
我有一个 tibble,其中一列是字符串,内容是调查受访者表示自己演奏过的乐器名称。我想把每一种乐器单独提取出来,作为一个独立的字符串。此列中的值既有单个字符串(如 Guitar),也有更复杂的回答:Sing、Drums/Percussion、Piano/Keyboard……等等
我试过这样的事情:
# Asker's attempt, made parseable: "\," and "\/" are not valid escapes in an R
# string, and neither character needs escaping in a regex. The pattern only
# splits on ", ", on " / " (a slash surrounded by spaces) or on a single
# space, so an answer such as "Piano/Keyboards" is left unsplit.
# The column is named `instrument_list` in the dput() output of the data.
options <- strsplit(survey$instrument_list, ", | / | ")
不幸的是,输出的几个字符串之间仍然有 / 字符。
还有最后一个问题,其中一位受访者的回答令人难以置信,冗长的答案被多个空格隔开,我只想要乐器,而不是他们的生活故事。
任何建议将不胜感激,谢谢!
编辑:
dput(head(survey))
的结果
structure(list(time_submitted = c("8/27/19 20:22", "8/29/19 12:15",
"8/28/19 19:33", "8/29/19 16:25", "8/27/19 15:40", "8/27/19 22:59"
), pseudonym_generator = c("Fake rapper name generator", "Fake band name generator",
"Fake band name generator", "Fake band name generator", "Fake band name generator",
"Fake band name generator"), pseudonym = c("Lord Los Angeles",
"Heroes War", "Puppets War", "West Magic", "Eller Angel", "Trace Stripes"
), sex = c("Male", "Male", "Male", "Male", "Male", "Male"), academic_major = c("Computer Science",
"Computer Science", "Math", "Computer Science", "Computer Science",
"Computer Science"), academic_level = c("Senior", "Junior", "Senior",
"Junior", "Senior", "Senior"), year_born = c(1994, 1997, 1996,
1999, 1998, 1986), instrument_list = c("Rap", "Guitar", "Guitar",
"Trumpet", "Piano/Keyboards, Ukulele", NA), favorite_song_artist = c("40 crew",
"Arctic Monkeys", "Avatar", "Ben Folds", "blink-182", "brian jonestown massacre / sarabeth tucek"
), favorite_song = c("Not Enough", "Arabella", "The Eagle Has Landed",
"Still", "She's Out Of Her Mind", "Seer"), favorite_song_link = c("https://www.youtube.com/watch?v=uITuGZKljgQ",
"https://www.youtube.com/watch?v=Jn6-TItCazo", "https://www.youtube.com/watch?v=4p6GWewmTYQ",
"https://www.youtube.com/watch?v=ShBzUK4rnI8", "https://www.youtube.com/watch?v=krpm0v_486k",
"https://youtu.be/C-XT7DZsNP8")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -6L))
这个怎么样:
library(dplyr)
library(tidyr)
# Build one logical column per instrument: split each answer on "," or "/",
# expand to one row per (respondent, instrument), then spread to wide form.
survey %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  # Drop respondents with no answer (strsplit() keeps NA as an NA element).
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  # Splitting ", Ukulele" on "," leaves a leading space, hence trimws().
  mutate(inst = trimws(inst), plays = TRUE) %>%
  spread(inst, plays) %>%
  # spread() leaves NA where a respondent does not play the instrument;
  # Negate(is.na) turns TRUE/NA into TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 7
# pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles FALSE FALSE FALSE TRUE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE
将多种乐器合并到一个类别中并不难。我将改编您的一种乐器来演示。
一种方法是用case_when
,也许是两种方法中比较direct/literal的一种:
# Same reshaping as before, but first overwrite the answer of the respondent
# whose pseudonym contains "Lord" with "Electric Guitar" for demonstration.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  # Drop respondents with no answer (strsplit() keeps NA as an NA element).
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  mutate(inst = trimws(inst), plays = TRUE) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 7
# pseudonym `Electric Guitar` Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War FALSE TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE FALSE
# 4 Puppets War FALSE TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE
# Collapse instrument variants into general categories with case_when()
# before spreading to one logical column per category.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  mutate(
    # The word-boundary escape must be written "\\b" in an R string; a single
    # "\b" is a literal backspace character and would never match.
    inst = case_when(
      grepl("\\bPiano\\b", inst, ignore.case = TRUE) ~ "Piano",
      # Accept the common "Ukelele" misspelling and normalise the name.
      grepl("\\bUk[ue]lele\\b", inst, ignore.case = TRUE) ~ "Ukulele",
      grepl("\\bGuitar\\b", inst, ignore.case = TRUE) ~ "Guitar",
      # Anything not matched above keeps its own (trimmed) name.
      TRUE ~ trimws(inst)
    ),
    plays = TRUE
  ) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 6
# pseudonym Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE TRUE FALSE
另一种方法(如果需要归并的乐器更多)是与一张对照表(数据框)做 merge/join。这样做的一个优点是它可以非常具体,并且能涵盖差别很大的乐器(这种情况下用正则表达式处理可能会比您愿意维护的更繁琐)。一个缺点是它可能过于具体……例如,它无法匹配拼写错误或大小写不同的写法。
# Lookup table: specific instrument name (inst) -> general category (newinst).
gen_inst <- tibble::tribble(
  ~inst,             ~newinst,
  "Electric Guitar", "Guitar",
  "Electric Bass",   "Guitar",
  "Electric Piano",  "Piano",
  "Pipe Organ",      "Piano"
)
# Collapse instrument variants into general categories by joining against the
# gen_inst lookup table, then spread to one logical column per category.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  # Trim BEFORE joining: a fragment such as " Ukulele" (leading space left by
  # the split) could otherwise never match an entry in the lookup table.
  mutate(inst = trimws(inst)) %>%
  left_join(gen_inst, by = "inst") %>%
  # Use the general category when the lookup matched, else the original name.
  mutate(inst = coalesce(newinst, inst), plays = TRUE) %>%
  select(-newinst) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 6
# pseudonym Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE TRUE FALSE
我们还可以使用 splitstackshape
中的 cSplit_e
# Expand instrument_list into indicator columns; `sep` is treated as a regular
# expression (fixed = FALSE) matching either "," or "/", and cells without a
# match are filled with 0.
output <- splitstackshape::cSplit_e(
  survey,
  "instrument_list",
  type = "character",
  fill = 0,
  sep = ",|/",
  fixed = FALSE
)
# Show only columns 12 through 17 of the result.
output[12:17]
# instrument_list_Guitar instrument_list_Keyboards instrument_list_Piano
#1 0 0 0
#2 1 0 0
#3 1 0 0
#4 0 0 0
#5 0 1 1
#6 0 0 0
# instrument_list_Rap instrument_list_Trumpet instrument_list_Ukulele
#1 1 0 0
#2 0 0 0
#3 0 0 0
#4 0 1 0
#5 0 0 1
#6 0 0 0
这里列中的1代表乐器被演奏,0代表没有演奏。
与其他答案没有太大区别,但这里使用了一些 tidyr
便利。 separate_rows
拆分字符串并在一次调用中取消嵌套;如果您在正则表达式中包含可选的 \s
,则 ", "
中的空格将包含在分隔符中,因此您可以跳过修剪空格这一步。添加一个虚拟变量,为各乐器列提供用于填充的值,而 NA
则被填充为 0。
library(dplyr)
library(tidyr)
# One row per respondent and one 0/1 column per instrument.
survey_wide <- survey %>%
  select(pseudonym, instrument_list) %>%
  # Split on "," or "/" followed by an optional whitespace character, so no
  # trimming is needed afterwards. "," and "/" need no escaping, and the
  # backslash of \s must be doubled inside an R string literal.
  separate_rows(instrument_list, sep = "(,|/)\\s?") %>%
  # Drop respondents who gave no answer.
  filter(!is.na(instrument_list)) %>%
  # dummy supplies the value spread() writes into each instrument column;
  # combinations that do not occur are filled with 0.
  mutate(dummy = 1) %>%
  spread(key = instrument_list, value = dummy, fill = 0)
survey_wide
#> # A tibble: 5 x 7
#> pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Eller Angel 0 1 1 0 0 1
#> 2 Heroes War 1 0 0 0 0 0
#> 3 Lord Los Angeles 0 0 0 1 0 0
#> 4 Puppets War 1 0 0 0 0 0
#> 5 West Magic 0 0 0 0 1 0
如果您需要布尔值而不是数字,则需要额外的步骤:
# Convert every column except pseudonym from 0/1 to FALSE/TRUE.
mutate_at(survey_wide, vars(-pseudonym), as.logical)
#> # A tibble: 5 x 7
#> pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
#> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Eller Angel FALSE TRUE TRUE FALSE FALSE TRUE
#> 2 Heroes War TRUE FALSE FALSE FALSE FALSE FALSE
#> 3 Lord Los Angeles FALSE FALSE FALSE TRUE FALSE FALSE
#> 4 Puppets War TRUE FALSE FALSE FALSE FALSE FALSE
#> 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE
我有一个 tibble,其中一列是字符串,内容是调查受访者表示自己演奏过的乐器名称。我想把每一种乐器单独提取出来,作为一个独立的字符串。此列中的值既有单个字符串(如 Guitar),也有更复杂的回答:Sing、Drums/Percussion、Piano/Keyboard……等等。我试过这样的事情:
# Asker's attempt, made parseable: "\," and "\/" are not valid escapes in an R
# string, and neither character needs escaping in a regex. The pattern only
# splits on ", ", on " / " (a slash surrounded by spaces) or on a single
# space, so an answer such as "Piano/Keyboards" is left unsplit.
# The column is named `instrument_list` in the dput() output of the data.
options <- strsplit(survey$instrument_list, ", | / | ")
不幸的是,输出的几个字符串之间仍然有 / 字符。
还有最后一个问题,其中一位受访者的回答令人难以置信,冗长的答案被多个空格隔开,我只想要乐器,而不是他们的生活故事。
任何建议将不胜感激,谢谢! 编辑: dput(head(survey))
的结果structure(list(time_submitted = c("8/27/19 20:22", "8/29/19 12:15",
"8/28/19 19:33", "8/29/19 16:25", "8/27/19 15:40", "8/27/19 22:59"
), pseudonym_generator = c("Fake rapper name generator", "Fake band name generator",
"Fake band name generator", "Fake band name generator", "Fake band name generator",
"Fake band name generator"), pseudonym = c("Lord Los Angeles",
"Heroes War", "Puppets War", "West Magic", "Eller Angel", "Trace Stripes"
), sex = c("Male", "Male", "Male", "Male", "Male", "Male"), academic_major = c("Computer Science",
"Computer Science", "Math", "Computer Science", "Computer Science",
"Computer Science"), academic_level = c("Senior", "Junior", "Senior",
"Junior", "Senior", "Senior"), year_born = c(1994, 1997, 1996,
1999, 1998, 1986), instrument_list = c("Rap", "Guitar", "Guitar",
"Trumpet", "Piano/Keyboards, Ukulele", NA), favorite_song_artist = c("40 crew",
"Arctic Monkeys", "Avatar", "Ben Folds", "blink-182", "brian jonestown massacre / sarabeth tucek"
), favorite_song = c("Not Enough", "Arabella", "The Eagle Has Landed",
"Still", "She's Out Of Her Mind", "Seer"), favorite_song_link = c("https://www.youtube.com/watch?v=uITuGZKljgQ",
"https://www.youtube.com/watch?v=Jn6-TItCazo", "https://www.youtube.com/watch?v=4p6GWewmTYQ",
"https://www.youtube.com/watch?v=ShBzUK4rnI8", "https://www.youtube.com/watch?v=krpm0v_486k",
"https://youtu.be/C-XT7DZsNP8")), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -6L))
这个怎么样:
library(dplyr)
library(tidyr)
# Build one logical column per instrument: split each answer on "," or "/",
# expand to one row per (respondent, instrument), then spread to wide form.
survey %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  # Drop respondents with no answer (strsplit() keeps NA as an NA element).
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  # Splitting ", Ukulele" on "," leaves a leading space, hence trimws().
  mutate(inst = trimws(inst), plays = TRUE) %>%
  spread(inst, plays) %>%
  # spread() leaves NA where a respondent does not play the instrument;
  # Negate(is.na) turns TRUE/NA into TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 7
# pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles FALSE FALSE FALSE TRUE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE
将多种乐器合并到一个类别中并不难。我将改编您的一种乐器来演示。
一种方法是用case_when
,也许是两种方法中比较direct/literal的一种:
# Same reshaping as before, but first overwrite the answer of the respondent
# whose pseudonym contains "Lord" with "Electric Guitar" for demonstration.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  # Drop respondents with no answer (strsplit() keeps NA as an NA element).
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  mutate(inst = trimws(inst), plays = TRUE) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 7
# pseudonym `Electric Guitar` Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War FALSE TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE FALSE
# 4 Puppets War FALSE TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE
# Collapse instrument variants into general categories with case_when()
# before spreading to one logical column per category.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  mutate(
    # The word-boundary escape must be written "\\b" in an R string; a single
    # "\b" is a literal backspace character and would never match.
    inst = case_when(
      grepl("\\bPiano\\b", inst, ignore.case = TRUE) ~ "Piano",
      # Accept the common "Ukelele" misspelling and normalise the name.
      grepl("\\bUk[ue]lele\\b", inst, ignore.case = TRUE) ~ "Ukulele",
      grepl("\\bGuitar\\b", inst, ignore.case = TRUE) ~ "Guitar",
      # Anything not matched above keeps its own (trimmed) name.
      TRUE ~ trimws(inst)
    ),
    plays = TRUE
  ) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 6
# pseudonym Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE TRUE FALSE
另一种方法(如果需要归并的乐器更多)是与一张对照表(数据框)做 merge/join。这样做的一个优点是它可以非常具体,并且能涵盖差别很大的乐器(这种情况下用正则表达式处理可能会比您愿意维护的更繁琐)。一个缺点是它可能过于具体……例如,它无法匹配拼写错误或大小写不同的写法。
# Lookup table: specific instrument name (inst) -> general category (newinst).
gen_inst <- tibble::tribble(
  ~inst,             ~newinst,
  "Electric Guitar", "Guitar",
  "Electric Bass",   "Guitar",
  "Electric Piano",  "Piano",
  "Pipe Organ",      "Piano"
)
# Collapse instrument variants into general categories by joining against the
# gen_inst lookup table, then spread to one logical column per category.
survey %>%
  mutate(
    instrument_list = if_else(
      grepl("Lord", pseudonym), "Electric Guitar", instrument_list
    )
  ) %>%
  transmute(pseudonym, inst = strsplit(instrument_list, "[,/]")) %>%
  filter(!is.na(inst)) %>%
  # Name the list-column explicitly; a bare unnest() is deprecated in
  # tidyr >= 1.0 and emits a warning.
  unnest(inst) %>%
  # Trim BEFORE joining: a fragment such as " Ukulele" (leading space left by
  # the split) could otherwise never match an entry in the lookup table.
  mutate(inst = trimws(inst)) %>%
  left_join(gen_inst, by = "inst") %>%
  # Use the general category when the lookup matched, else the original name.
  mutate(inst = coalesce(newinst, inst), plays = TRUE) %>%
  select(-newinst) %>%
  spread(inst, plays) %>%
  # NA after spread() means "does not play"; convert TRUE/NA to TRUE/FALSE.
  mutate_at(vars(-pseudonym), Negate(is.na))
# # A tibble: 5 x 6
# pseudonym Guitar Keyboards Piano Trumpet Ukulele
# <chr> <lgl> <lgl> <lgl> <lgl> <lgl>
# 1 Eller Angel FALSE TRUE TRUE FALSE TRUE
# 2 Heroes War TRUE FALSE FALSE FALSE FALSE
# 3 Lord Los Angeles TRUE FALSE FALSE FALSE FALSE
# 4 Puppets War TRUE FALSE FALSE FALSE FALSE
# 5 West Magic FALSE FALSE FALSE TRUE FALSE
我们还可以使用 splitstackshape
cSplit_e
# Expand instrument_list into indicator columns; `sep` is treated as a regular
# expression (fixed = FALSE) matching either "," or "/", and cells without a
# match are filled with 0.
output <- splitstackshape::cSplit_e(
  survey,
  "instrument_list",
  type = "character",
  fill = 0,
  sep = ",|/",
  fixed = FALSE
)
# Show only columns 12 through 17 of the result.
output[12:17]
# instrument_list_Guitar instrument_list_Keyboards instrument_list_Piano
#1 0 0 0
#2 1 0 0
#3 1 0 0
#4 0 0 0
#5 0 1 1
#6 0 0 0
# instrument_list_Rap instrument_list_Trumpet instrument_list_Ukulele
#1 1 0 0
#2 0 0 0
#3 0 0 0
#4 0 1 0
#5 0 0 1
#6 0 0 0
这里列中的1代表乐器被演奏,0代表没有演奏。
与其他答案没有太大区别,但这里使用了一些 tidyr
便利。 separate_rows
拆分字符串并在一次调用中取消嵌套;如果您在正则表达式中包含可选的 \s
,则 ", "
中的空格将包含在分隔符中,因此您可以跳过修剪空格这一步。添加一个虚拟变量,为各乐器列提供用于填充的值,而 NA
则被填充为 0。
library(dplyr)
library(tidyr)
# One row per respondent and one 0/1 column per instrument.
survey_wide <- survey %>%
  select(pseudonym, instrument_list) %>%
  # Split on "," or "/" followed by an optional whitespace character, so no
  # trimming is needed afterwards. "," and "/" need no escaping, and the
  # backslash of \s must be doubled inside an R string literal.
  separate_rows(instrument_list, sep = "(,|/)\\s?") %>%
  # Drop respondents who gave no answer.
  filter(!is.na(instrument_list)) %>%
  # dummy supplies the value spread() writes into each instrument column;
  # combinations that do not occur are filled with 0.
  mutate(dummy = 1) %>%
  spread(key = instrument_list, value = dummy, fill = 0)
survey_wide
#> # A tibble: 5 x 7
#> pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Eller Angel 0 1 1 0 0 1
#> 2 Heroes War 1 0 0 0 0 0
#> 3 Lord Los Angeles 0 0 0 1 0 0
#> 4 Puppets War 1 0 0 0 0 0
#> 5 West Magic 0 0 0 0 1 0
如果您需要布尔值而不是数字,则需要额外的步骤:
# Convert every column except pseudonym from 0/1 to FALSE/TRUE.
mutate_at(survey_wide, vars(-pseudonym), as.logical)
#> # A tibble: 5 x 7
#> pseudonym Guitar Keyboards Piano Rap Trumpet Ukulele
#> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 Eller Angel FALSE TRUE TRUE FALSE FALSE TRUE
#> 2 Heroes War TRUE FALSE FALSE FALSE FALSE FALSE
#> 3 Lord Los Angeles FALSE FALSE FALSE TRUE FALSE FALSE
#> 4 Puppets War TRUE FALSE FALSE FALSE FALSE FALSE
#> 5 West Magic FALSE FALSE FALSE FALSE TRUE FALSE